@@ -192,30 +192,38 @@ def update_dataset(database, network, newer_than=None, older_than=None):
     metadata = network.fetch_metadata(logger=logger)
     datasets = []
     # daily runs specify no bounds; patching runs specify at least one bound
-    patching = any(bound is not None for bound in (newer_than, older_than))
+    is_patch_run = any(bound is not None for bound in (newer_than, older_than))
+    if is_patch_run:
+      logger.warn('running update_dataset() as a "patch" with some specific date bound[s] specified;'
+                  ' this will include and overwrite any revisions that were already collected.',
+                  newer_than=newer_than, older_than=older_than)
     if older_than is None:
+      # by default, include days "older than tomorrow" which thus includes "today"
       older_than = (datetime.datetime.today().date() + datetime.timedelta(days=1))
     if newer_than is None:
+      # by default, include days "newer than the day before the last update"
+      # which thus includes the day of the last update (in case there are new updates
+      # that day which were published after the one we already ingested)
       with database.connect() as db:
         max_issue = db.get_max_issue(logger=logger)
       newer_than = (max_issue - datetime.timedelta(days=1))
+    logger.info("looking up issues in date range", newer_than=newer_than, older_than=older_than)
     daily_issues = Utils.issues_to_fetch(metadata, newer_than, older_than, logger=logger)
     if not daily_issues:
-      logger.info("no new issues; nothing to do")
+      logger.info("no issues found in date range; nothing to do")
       return False
     for issue, revisions in daily_issues.items():
       issue_int = int(issue.strftime("%Y%m%d"))
       # download dataset(s) and save associated metadata
       dataset_list = []
       all_metadata = []
       for url, index in revisions:
-        if not patching:
-          # for daily runs, we only want new datasets
-          with database.connect() as db:
-            already_in_db = db.contains_revision(url)
-          if already_in_db:
-            logger.info(f"already collected revision: {url}")
-        if patching or not already_in_db:
+        with database.connect() as db:
+          already_in_db = db.contains_revision(url)
+        if already_in_db:
+          logger.info(f"already collected revision: {url}")
+        if is_patch_run or not already_in_db:
+          logger.info(f"including dataset revision: {url}")
           dataset_list.append(network.fetch_dataset(url, logger=logger))
           all_metadata.append((url, metadata.loc[index].reset_index().to_json()))
       if not dataset_list:
@@ -230,8 +238,10 @@ def update_dataset(database, network, newer_than=None, older_than=None):
         dataset,
         all_metadata
       ))
+    tot_revs = sum(len(revisions) for revisions in daily_issues.values())
+    logger.info(f"{len(daily_issues)} issues checked w/ {tot_revs} revisions, resulting in {len(datasets)} datasets.")
     if not datasets:
-      logger.info(f"{len(daily_issues)} issues checked containing {sum(len(revisions) for revisions in daily_issues.values())} revisions; nothing to do")
+      logger.info("nothing to do, exiting")
       return False
     with database.connect() as db:
       for issue_int, dataset, all_metadata in datasets:
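And a rough sketch of how a caller might drive the two modes after this change, assuming update_dataset is reachable as shown in the hunk header; the database and network objects and the specific dates below are placeholders, not values from the patch.

import datetime

# daily run: no bounds; revisions already in the database are skipped
update_dataset(database, network)

# "patch" run: at least one explicit bound; revisions in the window are fetched
# and overwrite anything already collected for those days
update_dataset(
  database,
  network,
  newer_than=datetime.date(2021, 9, 1),
  older_than=datetime.date(2021, 9, 8),
)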