Skip to content

Commit 03d60a6

Browse files
committed
Refactor and implement optional exception on non-200 response
1 parent aac2eb6 commit 03d60a6

File tree

1 file changed

+61
-25
lines changed

1 file changed

+61
-25
lines changed

htmldiff2.py

Lines changed: 61 additions & 25 deletions
Original file line number | Diff line number | Diff line change
@@ -16,8 +16,9 @@
1616

1717

1818
class Server(object):
19-
def __init__(self, base_url, protocol='http', auth=None):
19+
def __init__(self, base_url, ignore_non_200=False, protocol='http', auth=None):
2020
self.base_url = base_url
21+
self.ignore_non_200 = ignore_non_200
2122
self.protocol = protocol
2223
self.auth = tuple(auth) if auth else None
2324

@@ -26,15 +27,15 @@ def __str__(self):
2627

2728
@staticmethod
2829
def compare_pages(
29-
relative_urls, servers, html, json, threads=1, debug=False, **kwargs):
30+
relative_urls, servers, html, json, ignore_non_200=False, threads=1, debug=False, **kwargs):
3031
"""
3132
relative_urls: list of str URLs
3233
servers: list of Server objects
3334
html: Boolean for HTML response type
3435
json: Boolean for JSON response type
3536
"""
3637
_servers = [
37-
Server.factory(html=html, json=json, **server_config)
38+
Server.factory(html=html, json=json, ignore_non_200=ignore_non_200, **server_config)
3839
for server_config in servers]
3940
func = functools.partial(_servers[0].compare_page, servers=_servers, **kwargs)
4041
if debug:
@@ -60,18 +61,34 @@ def factory(html=None, json=None, **config):
6061
def get_full_url(self, relative_url):
6162
return "{}://{}{}".format(self.protocol, self.base_url, relative_url)
6263

64+
def get_base_response(self, relative_url):
65+
url = self.get_full_url(relative_url)
66+
r = requests.get(url, auth=self.auth)
67+
if r.status_code != 200:
68+
if self.ignore_non_200:
69+
return None
70+
else:
71+
raise Exception("Got status code {} for URL {}".format(r.status_code, url))
72+
return r
73+
6374

6475
class HtmlServer(Server):
65-
def __init__(self, base_url, protocol='http', auth=None):
66-
Server.__init__(self, base_url, protocol, auth)
76+
def __init__(self, base_url, ignore_non_200=False, protocol='http', auth=None):
77+
Server.__init__(self, base_url, ignore_non_200, protocol, auth)
6778

6879
@staticmethod
6980
def compare_page(relative_url, servers, selectors):
7081
differences = []
7182

7283
trees = OrderedDict()
7384
for server in servers:
74-
trees[server.get_full_url(relative_url)] = server.get_dom_tree(relative_url)
85+
response = server.get_dom_tree(relative_url)
86+
if not response:
87+
# Early out for None server response
88+
url = server.get_full_url(relative_url)
89+
return ['Failed to retreive URL: {}'.format(url)]
90+
# return []
91+
trees[server.get_full_url(relative_url)] = response
7592

7693
for selector_name, selector in selectors.iteritems():
7794
results = [HtmlServer.get_text_from_tree(tree, selector) for _, tree in trees.iteritems()]
@@ -88,7 +105,7 @@ def compare_page(relative_url, servers, selectors):
88105
def mismatched_error_message(relative_url, selector_name, selector, trees, results):
89106
msg = []
90107
msg.append("-------------------------")
91-
msg.append("Error: mismatched results")
108+
msg.append("Error - mismatched results for: {}".format(relative_url))
92109
for url, _ in trees.iteritems():
93110
msg.append(" - {}".format(url))
94111
msg.append("Selector name: {}".format(selector_name))
@@ -111,37 +128,43 @@ def get_text_from_tree(tree, selector, strip_whitespace=True):
111128
return ''
112129

113130
# get the html out of all the results
114-
data = [lxml.html.tostring(result) for result in results]
131+
data = [result.text for result in results]
115132

116133
if strip_whitespace:
117-
data = [result.strip() for result in data]
134+
data = [result.strip() if isinstance(result, basestring) else None for result in data]
118135

119136
return data[0]
120137

121138
def get_dom_tree(self, relative_url):
122139
""" Build the DOM Tree """
123-
return lxml.html.fromstring(self.get_response(relative_url))
140+
response = self.get_response(relative_url)
141+
return lxml.html.fromstring(response) if response else None
124142

125143
def get_response(self, relative_url):
126-
url = self.get_full_url(relative_url)
127-
r = requests.get(url, auth=self.auth)
128-
if r.status_code != 200:
129-
raise Exception("Got status code {} for URL {}".format(r.status_code, url))
144+
r = Server.get_base_response(self, relative_url)
145+
if not r:
146+
return None
130147
r.encoding = 'utf-8'
131148
return r.text
132149

133150

134151
class JsonServer(Server):
135-
def __init__(self, base_url, protocol='http', auth=None):
136-
Server.__init__(self, base_url, protocol, auth)
152+
def __init__(self, base_url, ignore_non_200=False, protocol='http', auth=None):
153+
Server.__init__(self, base_url, ignore_non_200, protocol, auth)
137154

138155
@staticmethod
139156
def compare_page(relative_url, servers, keys=None):
140157
differences = []
141158

142159
server_responses = OrderedDict()
143160
for server in servers:
144-
server_responses[server.get_full_url(relative_url)] = server.get_response(relative_url)
161+
response = server.get_response(relative_url)
162+
if not response:
163+
# Early out for None server response
164+
url = server.get_full_url(relative_url)
165+
return ['Failed to retreive URL: {}'.format(url)]
166+
# return []
167+
server_responses[server.get_full_url(relative_url)] = response
145168

146169
results = []
147170
for _, response in server_responses.iteritems():
@@ -172,25 +195,31 @@ def pluck(json_obj, keys=None):
172195
if temp:
173196
temp_obj = temp
174197
else:
175-
break
176-
plucked[key] = temp_obj
198+
temp_obj = None
199+
plucked[key] = temp_obj\
200+
.replace('www.nerdwallet.com', 'localnerd.nerdwallet.com')\
201+
.replace('stage.nerdwallet.biz', 'localnerd.nerdwallet.com')\
202+
.replace('<span>', '')\
203+
.replace('</span>', '')\
204+
.replace(u'<sup>\u00ae</sup>', '(R)')\
205+
.replace(u'\u00a0', ' ')\
206+
if isinstance(temp_obj, basestring) else temp_obj
177207

178208
return plucked
179209

180210
@staticmethod
181211
def mismatched_error_message(relative_url, results):
182212
msg = []
183213
msg.append("-------------------------")
184-
msg.append("Error: mismatched results")
214+
msg.append("Error - mismatched results for url: {}".format(relative_url))
185215
msg.append("")
186216
msg.append('\n'.join(difflib.ndiff(results[0].splitlines(), results[1].splitlines())))
187217
return '\n'.join(msg)
188218

189219
def get_response(self, relative_url):
190-
url = self.get_full_url(relative_url)
191-
r = requests.get(url, auth=self.auth)
192-
if r.status_code != 200:
193-
raise Exception("Got status code {} for URL {}".format(r.status_code, url))
220+
r = Server.get_base_response(self, relative_url)
221+
if not r:
222+
return None
194223
return r.json()
195224

196225

@@ -209,6 +238,7 @@ def parse_args():
209238
parser.add_argument("--show-config-format", help="show the config format", action="store_true")
210239
parser.add_argument("-t", "--threads", type=int, default=1, help="set the number of threads")
211240
parser.add_argument("--debug", help="disable threading for debug purposes", action="store_true")
241+
parser.add_argument("--ignore-non-200", help="ignore responses that aren't 200 OK", action="store_true")
212242
group = parser.add_mutually_exclusive_group(required=True)
213243
group.add_argument("--html", help="Parse responses as HTML", action="store_true")
214244
group.add_argument("--json", help="Parse responses as JSON", action="store_true")
@@ -226,7 +256,13 @@ def parse_config_file(filename):
226256
if __name__ == "__main__":
227257
args = parse_args()
228258
config = parse_config_file(args.config)
229-
differences = Server.compare_pages(threads=args.threads, debug=args.debug, html=args.html, json=args.json, **config)
259+
differences = Server.compare_pages(
260+
threads=args.threads,
261+
debug=args.debug,
262+
html=args.html,
263+
json=args.json,
264+
ignore_non_200=args.ignore_non_200,
265+
**config)
230266

231267
print "Number of differences: {}".format(len(differences))
232268
for difference in differences:

0 commit comments

Comments
 (0)