1414
1515
1616
17+
1718# GET: store the page in the Wayback Machine and respond with a redirect to the snapshot
1819# POST: store the page in the Wayback Machine and respond with the Wayback Machine status page
1920# tag_jobid = '<script>spn.watchJob("spn2-%s", "/_static/",6000);</script>'
@@ -69,7 +70,6 @@ def save_page(url: str):
6970
7071
7172
72-
7373def print_list ():
7474 v .write ("" )
7575 count = sc .count_list ()
@@ -146,6 +146,10 @@ def download_list(output, retry, no_redirect, workers):
146146 for thread in threads :
147147 thread .join ()
148148
149+
150+
151+
152+
149153def download_loop (snapshot_batch , output , worker , retry , no_redirect , attempt = 1 , connection = None ):
150154 """
151155 Download a list of URLs in a recursive loop. If a download fails, the function will retry the download.
@@ -174,6 +178,10 @@ def download_loop(snapshot_batch, output, worker, retry, no_redirect, attempt=1,
174178 time .sleep (15 )
175179 download_loop (failed_urls , output , worker , retry , no_redirect , attempt , connection )
176180
181+
182+
183+
184+
177185def download (output , snapshot_entry , connection , status_message , no_redirect = False ):
178186 """
179187 Download a single URL and save it to the specified filepath.
@@ -202,20 +210,21 @@ def download(output, snapshot_entry, connection, status_message, no_redirect=Fal
202210 response = connection .getresponse ()
203211 response_data = response .read ()
204212 response_status = response .status
213+ response_status_message = parse_response_code (response_status )
205214 location = response .getheader ("Location" )
206215 if location :
207- status_message = f"{ status_message } \n " + \
208- f" -> URL: { location } "
209216 location = urljoin (download_url , location )
210217 download_url = location
218+ status_message = f"{ status_message } \n " + \
219+ f" -> URL: { download_url } "
211220 sc .snapshot_entry_modify (snapshot_entry , "redirect_timestamp" , sc .url_get_timestamp (location ))
212221 sc .snapshot_entry_modify (snapshot_entry , "redirect_url" , location )
213222 else :
214223 break
215224 if response_status == 200 :
216- sc . snapshot_entry_modify ( snapshot_entry , "file" , sc .snapshot_entry_create_output (snapshot_entry , output ) )
217- download_file = snapshot_entry [ "file" ]
218- os .makedirs (os . path . dirname ( download_file ) , exist_ok = True )
225+ download_file = sc .snapshot_entry_create_output (snapshot_entry , output )
226+ download_path = os . path . dirname ( download_file )
227+ os .makedirs (download_path , exist_ok = True )
219228 with open (download_file , 'wb' ) as file :
220229 if response .getheader ('Content-Encoding' ) == 'gzip' :
221230 response_data = gzip .decompress (response_data )
@@ -227,12 +236,13 @@ def download(output, snapshot_entry, connection, status_message, no_redirect=Fal
227236 f"SUCCESS -> HTTP: { response_status } - { response_status_message } \n " + \
228237 f" -> URL: { download_url } \n " + \
229238 f" -> FILE: { download_file } "
239+ sc .snapshot_entry_modify (snapshot_entry , "file" , download_file )
230240 v .write (status_message )
231241 return True
232242 else :
233243 status_message = f"{ status_message } \n " + \
234244 f"UNEXPECTED -> HTTP: { response_status } - { response_status_message } \n " + \
235- f" -> URL: { download_url } \n "
245+ f" -> URL: { download_url } "
236246 v .write (status_message )
237247 return True
238248 # exception returns false and appends the url to the failed list
@@ -262,6 +272,10 @@ def download(output, snapshot_entry, connection, status_message, no_redirect=Fal
262272 500 : "Internal Server Error" ,
263273 503 : "Service Unavailable"
264274}
275+
276+
277+
278+
265279def parse_response_code (response_code : int ):
266280 """
267281 Parse the response code of the Wayback Machine and return a human-readable message.
@@ -270,6 +284,10 @@ def parse_response_code(response_code: int):
270284 return RESPONSE_CODE_DICT [response_code ]
271285 return "Unknown response code"
272286
287+
288+
289+
290+
273291def save_csv (csv_path : str , url : str ):
274292 """
275293 Write a CSV file with the list of snapshots.
@@ -285,4 +303,4 @@ def save_csv(csv_path: str, url: str):
285303 row = csv .DictWriter (file , sc .SNAPSHOT_COLLECTION [0 ].keys ())
286304 row .writeheader ()
287305 for snapshot in sc .SNAPSHOT_COLLECTION :
288- row .writerow (snapshot )
306+ row .writerow (snapshot )
0 commit comments