@@ -81,26 +81,30 @@ def print_result(result_list):
8181
8282# create filelist
def query_list(url: str, range: int, mode: str):
    """Query the Wayback Machine CDX API and return the snapshots found.

    Args:
        url: Domain to look up; the query matches ``*.{url}/*``.
        range: Number of years to look back from the current year. A falsy
            value (``0`` / ``None``) sends the query without a ``from`` bound.
            The name shadows the builtin but is kept so keyword callers
            continue to work.
        mode: When ``"current"``, only the newest snapshot of each URL is
            kept; any other value returns every snapshot.

    Returns:
        A list of ``{"timestamp": ..., "url": ...}`` dicts, one per snapshot.

    Raises:
        SystemExit: If the request itself fails or the CDX server answers
            with a status code other than 200 (a message is printed first).
    """
    print("\n Querying snapshots...")

    # Translate "look back N years" into the CDX ``from`` parameter.
    from_param = f"&from={datetime.now().year - range}" if range else ""

    # NOTE(review): ``filter=!statuscode:200`` is passed through unchanged;
    # confirm against the CDX docs that this is the intended filter.
    cdx_query = (
        "https://web.archive.org/cdx/search/xd?output=json"
        f"&url=*.{url}/*{from_param}"
        "&fl=timestamp,original&filter=!statuscode:200"
    )

    # Only the network call sits inside the try, so that parsing bugs further
    # down are not mistaken for connection problems. RequestException is the
    # base class of ConnectionError and also covers timeouts, redirects, etc.
    try:
        response = requests.get(cdx_query)
    except requests.exceptions.RequestException as e:
        print(f"\n -----> ERROR: could not query snapshots:\n {e} ")
        raise SystemExit(1)

    if response.status_code != 200:
        print(f"\n -----> ERROR: could not query snapshots, status code: {response.status_code} ")
        raise SystemExit(1)

    # The first row is the field list [timestamp, original], so drop it.
    rows = response.json()[1:]
    snapshots = [{"timestamp": row[0], "url": row[1]} for row in rows]

    if mode == "current":
        # Newest first, then keep the first occurrence of every URL. A set
        # makes the membership test O(1) instead of rescanning the kept list.
        snapshots.sort(key=lambda snap: snap["timestamp"], reverse=True)
        seen_urls = set()
        newest = []
        for snap in snapshots:
            if snap["url"] not in seen_urls:
                seen_urls.add(snap["url"])
                newest.append(snap)
        snapshots = newest

    print(f"\n -----> {len(snapshots)} snapshots found")
    return snapshots
107+
104108
105109
106110
@@ -296,11 +300,18 @@ def download_url_entry(url, filename, filepath, connection, status_message):
296300 f" -> URL: { url } \n " + \
297301 f" -> FILE: { output } "
298302 return True
299- except http .client .HTTPException as e :
303+ except ConnectionRefusedError as e :
304+ status_message = f"{ status_message } \n " + \
305+ f"REFUSED -> ({ i + 1 } /{ max_retries } ), reconnect in { sleep_time } seconds...\n " + \
306+ f" -> { e } "
300307 print (status_message )
301- print (f"REFUSED -> ({ i + 1 } /{ max_retries } ), reconnect in { sleep_time } seconds..." )
302- print (f" -> { e } " )
303308 time .sleep (sleep_time )
309+ except http .client .HTTPException as e :
310+ status_message = f"{ status_message } \n " + \
311+ f"EXCEPTION -> ({ i + 1 } /{ max_retries } ), append to failed_urls: { url } \n " + \
312+ f" -> { e } "
313+ print (status_message )
314+ return False
304315 print (f"FAILED -> download, append to failed_urls: { url } " )
305316 return False
306317
0 commit comments