
Commit 3e34f5c

look for new issues from later in the same day as the last issue
1 parent 193ef7b commit 3e34f5c
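
In plain terms, the default newer_than bound now rewinds one day behind the last stored issue, so revisions published later on the same day as max_issue are re-examined instead of skipped. A minimal sketch of that window change, assuming issues_to_fetch keeps only issues strictly newer than newer_than (that comparison is an assumption for illustration; it is not shown in this diff):

import datetime

max_issue = datetime.date(2021, 11, 1)  # last issue already in the database (example value)

old_newer_than = max_issue                               # default before this commit
new_newer_than = max_issue - datetime.timedelta(days=1)  # default after this commit

same_day_issue = datetime.date(2021, 11, 1)  # an issue published later that same day

print(same_day_issue > old_newer_than)  # False: same-day issues fell outside the old window
print(same_day_issue > new_newer_than)  # True: they now fall inside the widened window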

File tree

1 file changed: +19 -7 lines changed

  • src/acquisition/covid_hosp/common/utils.py

src/acquisition/covid_hosp/common/utils.py

Lines changed: 19 additions & 7 deletions
@@ -195,26 +195,38 @@ def update_dataset(database, network, newer_than=None, older_than=None):
       max_issue = db.get_max_issue(logger=logger)
 
     older_than = (datetime.datetime.today().date() + datetime.timedelta(days=1)) if newer_than is None else older_than
-    newer_than = max_issue if newer_than is None else newer_than
+    newer_than = (max_issue - datetime.timedelta(days=1)) if newer_than is None else newer_than
     daily_issues = Utils.issues_to_fetch(metadata, newer_than, older_than, logger=logger)
     if not daily_issues:
       logger.info("no new issues; nothing to do")
       return False
     for issue, revisions in daily_issues.items():
       issue_int = int(issue.strftime("%Y%m%d"))
-      # download the dataset and add it to the database
-      dataset = Utils.merge_by_key_cols([network.fetch_dataset(url, logger=logger) for url, _ in revisions],
-                                        db.KEY_COLS,
-                                        logger=logger)
-      # add metadata to the database
+      # download new dataset(s) and save associated metadata
+      dataset_list = []
       all_metadata = []
       for url, index in revisions:
-        all_metadata.append((url, metadata.loc[index].reset_index().to_json()))
+        with database.connect() as db:
+          already_in_db = db.contains_revision(url)
+        if already_in_db:
+          logger.info(f"already collected revision: {url}")
+        else:
+          dataset_list.append(network.fetch_dataset(url, logger=logger))
+          all_metadata.append((url, metadata.loc[index].reset_index().to_json()))
+      if not dataset_list:
+        # we already had all of this issue's revisions in our db, so move on to the next issue
+        continue
+      dataset = Utils.merge_by_key_cols(dataset_list,
+                                        db.KEY_COLS,
+                                        logger=logger)
       datasets.append((
         issue_int,
         dataset,
         all_metadata
       ))
+    if not datasets:
+      logger.info("all issues already collected; nothing to do")
+      return False
     with database.connect() as db:
       for issue_int, dataset, all_metadata in datasets:
         db.insert_dataset(issue_int, dataset, logger=logger)
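
Because same-day issues are now re-fetched, the loop above relies on a per-revision db.contains_revision(url) check to avoid ingesting the same revision twice. A minimal, self-contained sketch of that dedup flow, with a plain set standing in for the real database check (the stand-in names and example URLs below are assumptions for illustration, not this repo's API):

already_ingested = {"https://example.com/revisions/2021-11-01-A.csv"}  # pretend this is the db

def contains_revision(url):
    # stand-in for Database.contains_revision(); the real check queries the database
    return url in already_ingested

revisions = [
    ("https://example.com/revisions/2021-11-01-A.csv", 0),  # already collected earlier today
    ("https://example.com/revisions/2021-11-01-B.csv", 1),  # published later the same day
]

dataset_list = []
for url, index in revisions:
    if contains_revision(url):
        print(f"already collected revision: {url}")
    else:
        dataset_list.append(url)  # the real code fetches and merges the dataset here

if not dataset_list:
    print("all issues already collected; nothing to do")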
