33
44class SnapshotCollection :
55
# Class-level defaults. They are kept so that attribute access on the class
# itself keeps working, but the lists must not be mutated in place here:
# __init__ binds fresh per-instance lists over them.
CDX_JSON = []
CDX_LIST = []

SNAPSHOT_COLLECTION = []

# Set to 1 by create_current(); create_entry() reads it to choose the
# download directory layout.
MODE_CURRENT = 0

def __init__(self):
    """Create an empty collection that owns its own lists.

    SNAPSHOT_COLLECTION is filled with ``.append``; without a per-instance
    list that call would mutate the class-level list and every instance
    would see every other instance's entries.
    """
    self.CDX_JSON = []
    self.CDX_LIST = []
    self.SNAPSHOT_COLLECTION = []
15+
def create_full(self, cdxResult):
    """Load every snapshot of a CDX response into CDX_JSON and CDX_LIST.

    Args:
        cdxResult: Object whose ``json()`` returns a list of rows. The first
            row is skipped (presumably the CDX header row - confirm against
            the caller); in each remaining row, element 0 is used as the
            timestamp and element 1 as the url.

    CDX_LIST receives one dict per row with "id" (the row's position before
    sorting), "timestamp" and "url", ordered by timestamp, newest first.
    """
    rows = cdxResult.json()[1:]
    self.CDX_JSON = rows
    entries = []
    for position, row in enumerate(rows):
        entries.append({"id": position, "timestamp": row[0], "url": row[1]})
    self.CDX_LIST = sorted(entries, key=lambda entry: entry["timestamp"], reverse=True)
1920
def create_current(self):
    """Switch to "current" mode and keep only the newest snapshot per url.

    Sets MODE_CURRENT to 1, orders CDX_LIST by timestamp (newest first) and
    then drops every entry whose url was already seen earlier in that order.
    """
    self.MODE_CURRENT = 1
    self.CDX_LIST = sorted(self.CDX_LIST, key=lambda entry: entry["timestamp"], reverse=True)
    # A dict keeps insertion order, so setdefault() retains the first
    # (newest) entry seen for each url and ignores the later, older ones.
    newest_by_url = {}
    for entry in self.CDX_LIST:
        newest_by_url.setdefault(entry["url"], entry)
    self.CDX_LIST = list(newest_by_url.values())
2831
def create_entry(self, cdx_entry: dict, output: str) -> dict:
    """Build the download record for a single CDX list entry.

    Args:
        cdx_entry: Dict with at least "id", "timestamp" and "url".
        output: Base directory under which the download path is built.

    Returns:
        A new dict with the keys "id", "url" (the web.archive.org download
        url), "file" (local target path), "timestamp", "origin_url",
        "success" (False) and "retry" (0).
    """
    timestamp, url = cdx_entry["timestamp"], cdx_entry["url"]
    url_type = self.__get_url_filetype(url)
    # No whitespace may appear between the url parts: any space inside the
    # literal ends up in the request url.
    download_url = f"http://web.archive.org/web/{timestamp}{url_type}/{url}"
    domain, subdir, filename = self.__split_url(url)
    if self.MODE_CURRENT:
        # Current mode: no per-timestamp directory level.
        download_dir = os.path.join(output, domain, subdir)
    else:
        download_dir = os.path.join(output, domain, timestamp, subdir)
    download_file = os.path.join(download_dir, filename)
    return {
        "id": cdx_entry["id"],
        "url": download_url,
        "file": download_file,
        "timestamp": timestamp,
        "origin_url": url,
        "success": False,
        "retry": 0,
    }
4750
def count_list(self):
    """Return the number of entries currently held in CDX_LIST."""
    snapshots = self.CDX_LIST
    return len(snapshots)
5053
def snapshot_collection_write(self, query_entry: dict):
    """Append *query_entry* to SNAPSHOT_COLLECTION unless its id is already stored.

    Args:
        query_entry: Entry dict; must contain an "id" key.
    """
    entry_id = query_entry["id"]
    # Compare against the stored entries' ids. Testing the id directly
    # against the list of dicts never matches, so duplicates slipped through.
    if any(stored["id"] == entry_id for stored in self.SNAPSHOT_COLLECTION):
        return
    self.SNAPSHOT_COLLECTION.append(query_entry)
5357
def snapshot_collection_update(self, id: int, key: str, value: str):
    """Set ``key`` to ``value`` on the first stored entry whose "id" equals ``id``.

    Args:
        id: Id of the entry to change.
        key: Key to set on that entry.
        value: Value to store under ``key``.

    Nothing happens when no entry in SNAPSHOT_COLLECTION has that id.
    """
    for entry in self.SNAPSHOT_COLLECTION:
        if entry["id"] == id:
            entry[key] = value
            break
6563 def __get_url_filetype (self , url ):
66- file_extension = url . split ( "." )[ - 1 ]
64+ file_extension = os . path . splitext ( url )[ 1 ][ 1 : ]
6765 urltype_mapping = {
6866 "jpg" : "im_" ,
6967 "jpeg" : "im_" ,
@@ -80,6 +78,6 @@ def __get_url_filetype(self, url):
def __split_url(self, url):
    """Split *url* into ``(domain, subdir, filename)``.

    Args:
        url: Url to split.

    Returns:
        A tuple of the network location, the directory part of the path
        without leading or trailing slashes ("" when the path has no
        directory), and the last path segment, or "index.html" when the path
        is empty or ends with a slash.
    """
    parsed_url = urlparse(url)
    domain = parsed_url.netloc
    # Split at the last "/" so the directory and filename can never overlap:
    # "/index.html" has no directory, and "/a/b/" keeps "a/b" as directory.
    directory, _, filename = parsed_url.path.rpartition("/")
    subdir = directory.strip("/")
    return domain, subdir, filename or "index.html"
0 commit comments