 from urllib.parse import urljoin
 from datetime import datetime, timezone

+import pywaybackup.SnapshotCollection as sc
+



@@ -67,13 +69,13 @@ def save_page(url: str):



-def print_result(result_list):
+def print_result(snapshots):
     print("")
-    if not result_list:
+    if not snapshots:
         print("No snapshots found")
     else:
-        __import__('pprint').pprint(result_list)
-        print(f"\n-----> {len(result_list)} snapshots listed")
+        __import__('pprint').pprint(snapshots.CDX_RESULT_LIST)
+        print(f"\n-----> {snapshots.count_list()} snapshots listed")



@@ -91,17 +93,10 @@ def query_list(url: str, range: int, mode: str):
         cdxQuery = f"https://web.archive.org/cdx/search/cdx?output=json&url=*.{url}/*{range}&fl=timestamp,original&filter=!statuscode:200"
         cdxResult = requests.get(cdxQuery)
         if cdxResult.status_code != 200: print(f"\n-----> ERROR: could not query snapshots, status code: {cdxResult.status_code}"); exit()
-        cdxResult_json = cdxResult.json()[1:] # first line is fieldlist, so remove it [timestamp, original]
-        cdxResult_list = [{"timestamp": snapshot[0], "url": snapshot[1]} for snapshot in cdxResult_json]
-        if mode == "current":
-            cdxResult_list = sorted(cdxResult_list, key=lambda k: k['timestamp'], reverse=True)
-            cdxResult_list_filtered = []
-            for snapshot in cdxResult_list:
-                if snapshot["url"] not in [snapshot["url"] for snapshot in cdxResult_list_filtered]:
-                    cdxResult_list_filtered.append(snapshot)
-            cdxResult_list = cdxResult_list_filtered
-        print(f"\n-----> {len(cdxResult_list)} snapshots found")
-        return cdxResult_list
+        snapshots = sc.SnapshotCollection(cdxResult)
+        if mode == "current": snapshots.create_current()
+        print(f"\n-----> {snapshots.count_list()} snapshots found")
+        return snapshots
     except requests.exceptions.ConnectionError as e:
         print(f"\n-----> ERROR: could not query snapshots:\n{e}"); exit()

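Note: the inline CDX parsing and per-URL deduplication removed above now live in pywaybackup.SnapshotCollection, which is not shown in this diff. Below is a minimal sketch of the query-side interface the hunk relies on (constructor, create_current, count_list, CDX_RESULT_LIST), reconstructed from the removed lines and the calls visible here; the names mirror the diff, but the implementation itself is an assumption, not the actual module.

# Illustrative sketch only -- reconstructed from this diff, not the real
# pywaybackup.SnapshotCollection implementation.
class SnapshotCollection:

    def __init__(self, cdxResult):
        # First row of the CDX JSON response is the field list, so skip it.
        self.CDX_RESULT_LIST = [
            {"timestamp": entry[0], "url": entry[1]}
            for entry in cdxResult.json()[1:]
        ]
        self.CDX_RESULT_COLLECTION = []

    def create_current(self):
        # Keep only the newest snapshot per URL, as the removed
        # cdxResult_list_filtered loop did.
        self.CDX_RESULT_LIST = sorted(
            self.CDX_RESULT_LIST, key=lambda k: k["timestamp"], reverse=True
        )
        seen, current = set(), []
        for snapshot in self.CDX_RESULT_LIST:
            if snapshot["url"] not in seen:
                seen.add(snapshot["url"])
                current.append(snapshot)
        self.CDX_RESULT_LIST = current

    def count_list(self):
        return len(self.CDX_RESULT_LIST)

The deduplication mirrors the removed cdxResult_list_filtered loop, just with a set lookup instead of a nested list comprehension.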
@@ -110,38 +105,6 @@ def query_list(url: str, range: int, mode: str):



-def split_url(url):
-    """
-    Split url into domain, subdir and file.
-    If no file is present, the filename will be index.html
-    """
-    domain = url.split("//")[-1].split("/")[0]
-    subdir = "/".join(url.split("//")[-1].split("/")[1:-1])
-    filename = url.split("/")[-1] or "index.html"
-    return domain, subdir, filename
-
-def determine_url_filetype(url):
-    """
-    Determine filetype of the archive-url by looking at the file extension.
-    """
-    image = ["jpg", "jpeg", "png", "gif", "svg", "ico"]
-    css = ["css"]
-    js = ["js"]
-    file_extension = url.split(".")[-1]
-    if file_extension in image:
-        urltype = "im_"
-    elif file_extension in css:
-        urltype = "cs_"
-    elif file_extension in js:
-        urltype = "js_"
-    else:
-        urltype = "id_"
-    return urltype
-
-
-
-
-
 def remove_empty_folders(path, remove_root=True):
     print("")
     print("Removing empty output folders...")
@@ -175,130 +138,108 @@ def remove_empty_folders(path, remove_root=True):


 # example download: http://web.archive.org/web/20190815104545id_/https://www.google.com/
-# example url: https://www.google.com/
-# example timestamp: 20190815104545
-def download_prepare_list(cdxResult_list, output, retry, worker, mode):
+def download_prepare_list(snapshots, output, retry, worker):
     """
     Download a list of urls in format: [{"timestamp": "20190815104545", "url": "https://www.google.com/"}]
     """
     print("\nDownloading latest snapshots of each file...")
-    download_list = []
-    for snapshot in cdxResult_list:
-        timestamp, url = snapshot["timestamp"], snapshot["url"]
-        type = determine_url_filetype(url)
-        download_url = f"http://web.archive.org/web/{timestamp}{type}/{url}"
-        domain, subdir, filename = split_url(url)
-        if mode == "current": download_dir = os.path.join(output, domain, subdir)
-        if mode == "full": download_dir = os.path.join(output, domain, timestamp, subdir)
-        download_list.append({"url": download_url, "filename": filename, "filepath": download_dir})
+    snapshots.create_collection(output)
+    download_list = snapshots.CDX_RESULT_COLLECTION
     if worker > 1:
         print(f"\n-----> Simultaneous downloads: {worker}")
-        batch_size = len(download_list) // worker + 1
+        batch_size = snapshots.count_collection() // worker + 1
     else:
-        batch_size = len(download_list)
+        batch_size = snapshots.count_collection()
     batch_list = [download_list[i:i + batch_size] for i in range(0, len(download_list), batch_size)]
     threads = []
     worker = 0
     for batch in batch_list:
         worker += 1
-        thread = threading.Thread(target=download_url_list, args=(batch, worker, retry))
+        thread = threading.Thread(target=download_url_list, args=(snapshots, batch, worker, retry))
         threads.append(thread)
         thread.start()
     for thread in threads:
         thread.join()
+    failed_urls = len([url for url in snapshots.CDX_RESULT_COLLECTION if url["success"] == False])
+    if failed_urls: print(f"\n-----> Failed downloads: {failed_urls}")

-def download_url_list(url_list, worker, retry):
+def download_url_list(snapshots, url_list, worker, retry, attempt=1, connection=None):
+    max_attempt = retry
     failed_urls = []
-    connection = http.client.HTTPSConnection("web.archive.org")
-    for url_entry in url_list:
-        status = f"\n-----> Snapshot [{url_list.index(url_entry) + 1}/{len(url_list)}] Worker: {worker}"
-        download_url, download_filename, download_filepath = url_entry["url"], url_entry["filename"], url_entry["filepath"]
-        download_status = download_url_entry(download_url, download_filename, download_filepath, connection, status)
-        if download_status != True: failed_urls.append({"url": download_url, "filename": download_filename, "filepath": download_filepath})
-    if retry:
-        download_retry(failed_urls, retry, connection)
-    connection.close()
-
-def download_retry(failed_urls, retry, connection):
-    """
-    Retry failed downloads.
-    failed_urls: [{"url": download_url, "filename": download_filename, "filepath": download_filepath}]
-    retry: int or None
-    """
-    attempt = 1
-    max_attempt = retry if retry is not True else "no-limit"
-    while failed_urls and (attempt <= retry or retry is True):
-        print("\n-----> Retrying...")
-        retry_urls = []
-        for failed_entry in failed_urls:
-            status = f"\n-----> RETRY attempt: [{attempt}/{max_attempt}] Snapshot [{failed_urls.index(failed_entry) + 1}/{len(failed_urls)}]"
-            download_url, download_filename, download_filepath = failed_entry["url"], failed_entry["filename"], failed_entry["filepath"]
-            retry_status = download_url_entry(download_url, download_filename, download_filepath, connection, status)
-            if retry_status != bool(1):
-                retry_urls.append({"url": download_url, "filename": download_filename, "filepath": download_filepath})
-        failed_urls = retry_urls
-        print(f"\n-----> Fail downloads: {len(failed_urls)}")
-        if retry: attempt += 1
-
-def download_url_entry(url, filename, filepath, connection, status_message):
+    if not connection:
+        connection = http.client.HTTPSConnection("web.archive.org")
+    if attempt > max_attempt:
+        connection.close()
+        print(f"\n-----> Worker: {worker} - Failed downloads: {len(url_list)}")
+        return
+    else:
+        for url_entry in url_list:
+            status = f"\n-----> Attempt: [{attempt}/{max_attempt}] Snapshot [{url_list.index(url_entry) + 1}/{len(url_list)}] Worker: {worker}"
+            download_status = download_url_entry(url_entry, connection, status)
+            if download_status != True: failed_urls.append(url_entry); url_entry["retry"] += 1
+            if download_status == True: snapshots.set_value(url_entry["index"], "success", True)
+        attempt += 1
+        if failed_urls: download_url_list(snapshots, failed_urls, worker, retry, attempt, connection)
+
+def download_url_entry(download_entry, connection, status_message):
244185 """
245186 Download a single URL and save it to the specified filepath.
246187
247188 Args:
248- url (str): The URL to download.
249- filename (str): The name of the file to save.
250- filepath (str): The path where the file will be saved.
+        download_entry (dict): The download job, holding the archive URL
+            ("url") and the target file path ("file").
         connection (http.client.HTTPConnection): The HTTP connection object.
-        status (str): The current status message.
+        status_message (str): The current status message.

     Returns:
         bool: True if the download is successful, False otherwise.
     """
-    output = os.path.join(filepath, filename)
+    download_url = download_entry["url"]
+    download_file = download_entry["file"]
     max_retries = 2
     sleep_time = 45
     headers = {'User-Agent': 'Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/119.0.0.0 Safari/537.36'}
     for i in range(max_retries):
         try:
-            connection.request("GET", url, headers=headers)
+            connection.request("GET", download_url, headers=headers)
             response = connection.getresponse()
             response_data = response.read()
             response_status = response.status
             if response_status == 302:
                 status_message = f"{status_message}\n" + \
                     f"REDIRECT -> HTTP: {response.status}"
                 while response_status == 302:
-                    connection.request("GET", url, headers=headers)
+                    connection.request("GET", download_url, headers=headers)
                     response = connection.getresponse()
                     response_data = response.read()
                     response_status = response.status
                     location = response.getheader("Location")
                     if location:
                         status_message = f"{status_message}\n" + \
                             f" -> URL: {location}"
-                        location = urljoin(url, location)
-                        url = location
+                        location = urljoin(download_url, location)
+                        download_url = location
                     else:
                         break
             if response_status != 404:
-                os.makedirs(filepath, exist_ok=True)
-                with open(output, 'wb') as file:
+                os.makedirs(os.path.dirname(download_file), exist_ok=True)
+                with open(download_file, 'wb') as file:
                     file.write(response_data)
             if response_status == 200:
                 status_message = f"{status_message}\n" + \
                     f"SUCCESS -> HTTP: {response.status}\n" + \
-                    f" -> URL: {url}\n" + \
-                    f" -> FILE: {output}"
-                print(status_message)
+                    f" -> URL: {download_url}\n" + \
+                    f" -> FILE: {download_file}"
             elif response_status == 404:
                 status_message = f"{status_message}\n" + \
                     f"NOT FOUND -> HTTP: {response.status}\n" + \
-                    f" -> URL: {url}"
+                    f" -> URL: {download_url}"
             else:
                 status_message = f"{status_message}\n" + \
                     f"UNEXPECTED -> HTTP: {response.status}\n" + \
-                    f" -> URL: {url}\n" + \
-                    f" -> FILE: {output}"
+                    f" -> URL: {download_url}\n" + \
+                    f" -> FILE: {download_file}"
+            print(status_message)
             return True
         except ConnectionRefusedError as e:
             status_message = f"{status_message}\n" + \
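Note: the hunk above delegates job construction to snapshots.create_collection(output) and records results through snapshots.set_value(index, "success", True); each entry carries "index", "url", "file", "retry" and "success" keys. Only the usage is visible in this diff, so here is a hedged sketch of the collection side (extending the sketch after the query_list hunk), folding in the removed split_url and determine_url_filetype logic and showing only the timestamped directory layout; the key names come from the diff, everything else is assumed.

# Illustrative sketch only -- the entry keys follow this diff, the rest is assumed.
import os

def determine_url_filetype(url):
    # Same extension-to-flag mapping as the removed helper (im_/cs_/js_/id_).
    extension = url.split(".")[-1]
    if extension in ("jpg", "jpeg", "png", "gif", "svg", "ico"):
        return "im_"
    if extension == "css":
        return "cs_"
    if extension == "js":
        return "js_"
    return "id_"

class SnapshotCollection:

    def __init__(self, cdx_result_list):
        self.CDX_RESULT_LIST = cdx_result_list  # [{"timestamp": ..., "url": ...}]
        self.CDX_RESULT_COLLECTION = []

    def create_collection(self, output):
        # Turn each CDX row into a download job with the keys used above.
        for index, snapshot in enumerate(self.CDX_RESULT_LIST):
            timestamp, url = snapshot["timestamp"], snapshot["url"]
            domain = url.split("//")[-1].split("/")[0]
            subdir = "/".join(url.split("//")[-1].split("/")[1:-1])
            filename = url.split("/")[-1] or "index.html"
            self.CDX_RESULT_COLLECTION.append({
                "index": index,
                "url": f"http://web.archive.org/web/{timestamp}{determine_url_filetype(url)}/{url}",
                "file": os.path.join(output, domain, timestamp, subdir, filename),
                "retry": 0,
                "success": False,
            })

    def count_collection(self):
        return len(self.CDX_RESULT_COLLECTION)

    def set_value(self, index, key, value):
        self.CDX_RESULT_COLLECTION[index][key] = value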
@@ -308,11 +249,11 @@ def download_url_entry(url, filename, filepath, connection, status_message):
             time.sleep(sleep_time)
         except http.client.HTTPException as e:
             status_message = f"{status_message}\n" + \
-                f"EXCEPTION -> ({i + 1}/{max_retries}), append to failed_urls: {url}\n" + \
+                f"EXCEPTION -> ({i + 1}/{max_retries}), append to failed_urls: {download_url}\n" + \
                 f" -> {e}"
             print(status_message)
             return False
-    print(f"FAILED -> download, append to failed_urls: {url}")
+    print(f"FAILED -> download, append to failed_urls: {download_url}")
     return False


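Note: taken together, the refactor threads a single SnapshotCollection object through query_list, download_prepare_list and download_url_list instead of passing plain lists around. A hypothetical end-to-end call using only the signatures visible in this diff; argument values are placeholders and the real CLI wiring lives outside this file.

# Hypothetical wiring, assuming the signatures shown in this diff.
snapshots = query_list("example.com", 2023, "current")      # returns the SnapshotCollection
print_result(snapshots)                                     # prints CDX_RESULT_LIST / count_list()
download_prepare_list(snapshots, "./waybackup_output", 3, 4)  # retry=3, worker=4
remove_empty_folders("./waybackup_output")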