1616
1717
1818class Server (object ):
19- def __init__ (self , base_url , protocol = 'http' , auth = None ):
19+ def __init__ (self , base_url , ignore_non_200 = False , protocol = 'http' , auth = None ):
2020 self .base_url = base_url
21+ self .ignore_non_200 = ignore_non_200
2122 self .protocol = protocol
2223 self .auth = tuple (auth ) if auth else None
2324
@@ -26,15 +27,15 @@ def __str__(self):
2627
2728 @staticmethod
2829 def compare_pages (
29- relative_urls , servers , html , json , threads = 1 , debug = False , ** kwargs ):
30+ relative_urls , servers , html , json , ignore_non_200 = False , threads = 1 , debug = False , ** kwargs ):
3031 """
3132 relative_urls: list of str URLs
3233 servers: list of Server objects
3334 html: Boolean for HTML response type
3435 json: Boolean for JSON response type
3536 """
3637 _servers = [
37- Server .factory (html = html , json = json , ** server_config )
38+ Server .factory (html = html , json = json , ignore_non_200 = ignore_non_200 , ** server_config )
3839 for server_config in servers ]
3940 func = functools .partial (_servers [0 ].compare_page , servers = _servers , ** kwargs )
4041 if debug :
@@ -60,18 +61,34 @@ def factory(html=None, json=None, **config):
def get_full_url(self, relative_url):
    """Return protocol, base URL and relative_url joined into one absolute URL."""
    template = "{}://{}{}"
    parts = (self.protocol, self.base_url, relative_url)
    return template.format(*parts)
6263
def get_base_response(self, relative_url):
    """
    Issue a GET for relative_url against this server.

    Returns the requests response object when the status code is 200.
    For any other status code: returns None if self.ignore_non_200 is
    set, otherwise raises Exception naming the status code and URL.
    """
    full_url = self.get_full_url(relative_url)
    reply = requests.get(full_url, auth=self.auth)

    if reply.status_code == 200:
        return reply

    if not self.ignore_non_200:
        raise Exception("Got status code {} for URL {}".format(reply.status_code, full_url))

    # Caller asked to tolerate failures: signal "no response" with None.
    return None
73+
6374
6475class HtmlServer (Server ):
def __init__(self, base_url, ignore_non_200=False, protocol='http', auth=None):
    """Create an HTML server; every setting is handed straight to Server."""
    Server.__init__(
        self,
        base_url,
        ignore_non_200=ignore_non_200,
        protocol=protocol,
        auth=auth)
6778
6879 @staticmethod
6980 def compare_page (relative_url , servers , selectors ):
7081 differences = []
7182
7283 trees = OrderedDict ()
7384 for server in servers :
74- trees [server .get_full_url (relative_url )] = server .get_dom_tree (relative_url )
85+ response = server .get_dom_tree (relative_url )
86+ if not response :
87+ # Early out for None server response
88+ url = server .get_full_url (relative_url )
89+ return ['Failed to retreive URL: {}' .format (url )]
90+ # return []
91+ trees [server .get_full_url (relative_url )] = response
7592
7693 for selector_name , selector in selectors .iteritems ():
7794 results = [HtmlServer .get_text_from_tree (tree , selector ) for _ , tree in trees .iteritems ()]
@@ -88,7 +105,7 @@ def compare_page(relative_url, servers, selectors):
88105 def mismatched_error_message (relative_url , selector_name , selector , trees , results ):
89106 msg = []
90107 msg .append ("-------------------------" )
91- msg .append ("Error: mismatched results" )
108+ msg .append ("Error - mismatched results for: {}" . format ( relative_url ) )
92109 for url , _ in trees .iteritems ():
93110 msg .append (" - {}" .format (url ))
94111 msg .append ("Selector name: {}" .format (selector_name ))
@@ -111,37 +128,43 @@ def get_text_from_tree(tree, selector, strip_whitespace=True):
111128 return ''
112129
113130 # get the html out of all the results
114- data = [lxml . html . tostring ( result ) for result in results ]
131+ data = [result . text for result in results ]
115132
116133 if strip_whitespace :
117- data = [result .strip () for result in data ]
134+ data = [result .strip () if isinstance ( result , basestring ) else None for result in data ]
118135
119136 return data [0 ]
120137
def get_dom_tree(self, relative_url):
    """
    Build the DOM tree for relative_url.

    Returns None when get_response gives back nothing (None or an
    empty body); otherwise the tree parsed by lxml.html.fromstring.
    """
    markup = self.get_response(relative_url)
    if not markup:
        return None
    return lxml.html.fromstring(markup)
124142
def get_response(self, relative_url):
    """
    Fetch relative_url and return its body as text decoded as UTF-8.

    Returns None when Server.get_base_response yields a falsy result
    (for example None for an ignored non-200 response).
    """
    reply = Server.get_base_response(self, relative_url)
    if reply:
        # Force the decoding instead of relying on the detected charset.
        reply.encoding = 'utf-8'
        return reply.text
    return None
132149
133150
134151class JsonServer (Server ):
def __init__(self, base_url, ignore_non_200=False, protocol='http', auth=None):
    """Create a JSON server; every setting is handed straight to Server."""
    Server.__init__(
        self,
        base_url,
        ignore_non_200=ignore_non_200,
        protocol=protocol,
        auth=auth)
137154
138155 @staticmethod
139156 def compare_page (relative_url , servers , keys = None ):
140157 differences = []
141158
142159 server_responses = OrderedDict ()
143160 for server in servers :
144- server_responses [server .get_full_url (relative_url )] = server .get_response (relative_url )
161+ response = server .get_response (relative_url )
162+ if not response :
163+ # Early out for None server response
164+ url = server .get_full_url (relative_url )
165+ return ['Failed to retreive URL: {}' .format (url )]
166+ # return []
167+ server_responses [server .get_full_url (relative_url )] = response
145168
146169 results = []
147170 for _ , response in server_responses .iteritems ():
@@ -172,25 +195,31 @@ def pluck(json_obj, keys=None):
172195 if temp :
173196 temp_obj = temp
174197 else :
175- break
176- plucked [key ] = temp_obj
198+ temp_obj = None
199+ plucked [key ] = temp_obj \
200+ .replace ('www.nerdwallet.com' , 'localnerd.nerdwallet.com' )\
201+ .replace ('stage.nerdwallet.biz' , 'localnerd.nerdwallet.com' )\
202+ .replace ('<span>' , '' )\
203+ .replace ('</span>' , '' )\
204+ .replace (u'<sup>\u00ae </sup>' , '(R)' )\
205+ .replace (u'\u00a0 ' , ' ' )\
206+ if isinstance (temp_obj , basestring ) else temp_obj
177207
178208 return plucked
179209
180210 @staticmethod
181211 def mismatched_error_message (relative_url , results ):
182212 msg = []
183213 msg .append ("-------------------------" )
184- msg .append ("Error: mismatched results" )
214+ msg .append ("Error - mismatched results for url: {}" . format ( relative_url ) )
185215 msg .append ("" )
186216 msg .append ('\n ' .join (difflib .ndiff (results [0 ].splitlines (), results [1 ].splitlines ())))
187217 return '\n ' .join (msg )
188218
def get_response(self, relative_url):
    """
    Fetch relative_url and return its body decoded from JSON.

    Returns None when Server.get_base_response yields a falsy result
    (for example None for an ignored non-200 response).
    """
    reply = Server.get_base_response(self, relative_url)
    return reply.json() if reply else None
195224
196225
@@ -209,6 +238,7 @@ def parse_args():
209238 parser .add_argument ("--show-config-format" , help = "show the config format" , action = "store_true" )
210239 parser .add_argument ("-t" , "--threads" , type = int , default = 1 , help = "set the number of threads" )
211240 parser .add_argument ("--debug" , help = "disable threading for debug purposes" , action = "store_true" )
241+ parser .add_argument ("--ignore-non-200" , help = "ignore responses that aren't 200 OK" , action = "store_true" )
212242 group = parser .add_mutually_exclusive_group (required = True )
213243 group .add_argument ("--html" , help = "Parse responses as HTML" , action = "store_true" )
214244 group .add_argument ("--json" , help = "Parse responses as JSON" , action = "store_true" )
@@ -226,7 +256,13 @@ def parse_config_file(filename):
226256if __name__ == "__main__" :
227257 args = parse_args ()
228258 config = parse_config_file (args .config )
229- differences = Server .compare_pages (threads = args .threads , debug = args .debug , html = args .html , json = args .json , ** config )
259+ differences = Server .compare_pages (
260+ threads = args .threads ,
261+ debug = args .debug ,
262+ html = args .html ,
263+ json = args .json ,
264+ ignore_non_200 = args .ignore_non_200 ,
265+ ** config )
230266
231267 print "Number of differences: {}" .format (len (differences ))
232268 for difference in differences :
0 commit comments