@@ -195,26 +195,38 @@ def update_dataset(database, network, newer_than=None, older_than=None):
       max_issue = db.get_max_issue(logger=logger)

     older_than = (datetime.datetime.today().date() + datetime.timedelta(days=1)) if older_than is None else older_than
-    newer_than = max_issue if newer_than is None else newer_than
+    newer_than = (max_issue - datetime.timedelta(days=1)) if newer_than is None else newer_than
     daily_issues = Utils.issues_to_fetch(metadata, newer_than, older_than, logger=logger)
     if not daily_issues:
       logger.info("no new issues; nothing to do")
       return False
     for issue, revisions in daily_issues.items():
       issue_int = int(issue.strftime("%Y%m%d"))
-      # download the dataset and add it to the database
-      dataset = Utils.merge_by_key_cols([network.fetch_dataset(url, logger=logger) for url, _ in revisions],
-                                        db.KEY_COLS,
-                                        logger=logger)
-      # add metadata to the database
+      # download new dataset(s) and save associated metadata
+      dataset_list = []
       all_metadata = []
       for url, index in revisions:
-        all_metadata.append((url, metadata.loc[index].reset_index().to_json()))
+        with database.connect() as db:
+          already_in_db = db.contains_revision(url)
+        if already_in_db:
+          logger.info(f"already collected revision: {url}")
+        else:
+          dataset_list.append(network.fetch_dataset(url, logger=logger))
+          all_metadata.append((url, metadata.loc[index].reset_index().to_json()))
+      if not dataset_list:
+        # we already had all of this issue's revisions in our db, so move on to the next issue
+        continue
+      dataset = Utils.merge_by_key_cols(dataset_list,
+                                        db.KEY_COLS,
+                                        logger=logger)
       datasets.append((
         issue_int,
         dataset,
         all_metadata
       ))
+    if not datasets:
+      logger.info("all issues already collected; nothing to do")
+      return False
     with database.connect() as db:
       for issue_int, dataset, all_metadata in datasets:
         db.insert_dataset(issue_int, dataset, logger=logger)
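
For context, the new per-revision check relies on a contains_revision(url) helper on the database class, whose body is not shown in this diff. Below is a minimal, self-contained sketch of the idea only, using an in-memory sqlite3 table and hypothetical names (a "revisions" table with a "url" column); the real method presumably queries the project's own metadata tables through its database wrapper.

# Hypothetical sketch; not the project's actual contains_revision implementation.
import sqlite3

def contains_revision(connection, url):
  """Return True if a revision with this URL has already been acquired."""
  cursor = connection.execute(
    "SELECT COUNT(1) FROM revisions WHERE url = ?", (url,))
  (count,) = cursor.fetchone()
  return count > 0

# usage sketch
conn = sqlite3.connect(":memory:")
conn.execute("CREATE TABLE revisions (url TEXT)")
conn.execute("INSERT INTO revisions VALUES ('https://example.com/rev1.csv')")
print(contains_revision(conn, "https://example.com/rev1.csv"))  # True
print(contains_revision(conn, "https://example.com/rev2.csv"))  # False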