Skip to content

Commit 2099ab7

Browse files
committed
add/fix most tests.
unit tests:
* factor out mocks for the "it's not there yet" / "it's already there" cases
* check both cases
* tests pass

integration tests (specifically state_daily):
* move the second 3/15 entry and the 3/16 entry to a separate metadata file
* add a new dataset file for 3/16 showing a new day of data
* make sure the first and second 3/15 entries have different data
* add checks for pretend-it's-3/16
* test still broken; ran out of steam when it came time to complete the ON DUPLICATE KEY UPDATE clause
1 parent 58e9519 commit 2099ab7

File tree

8 files changed

+153
-89
lines changed

8 files changed

+153
-89
lines changed

integrations/acquisition/covid_hosp/state_daily/test_scenarios.py

Lines changed: 86 additions & 53 deletions
Original file line numberDiff line numberDiff line change
@@ -47,62 +47,95 @@ def setUp(self):
4747
cur.execute('delete from api_user')
4848
cur.execute('insert into api_user(api_key, email) values("key", "email")')
4949

50-
@freeze_time("2021-03-15")
5150
def test_acquire_dataset(self):
5251
"""Acquire a new dataset."""
5352

54-
# make sure the data does not yet exist
55-
with self.subTest(name='no data yet'):
56-
response = Epidata.covid_hosp('MA', Epidata.range(20200101, 20210101))
57-
self.assertEqual(response['result'], -2, response)
58-
59-
# acquire sample data into local database
60-
# mock out network calls to external hosts
61-
with self.subTest(name='first acquisition'), \
62-
patch.object(Network, 'fetch_metadata', return_value=self.test_utils.load_sample_metadata()) as mock_fetch_meta, \
63-
patch.object(Network, 'fetch_dataset', side_effect=[self.test_utils.load_sample_dataset("dataset0.csv"), # dataset for 3/13
64-
self.test_utils.load_sample_dataset("dataset0.csv"), # first dataset for 3/15
65-
self.test_utils.load_sample_dataset()] # second dataset for 3/15
66-
) as mock_fetch:
67-
acquired = Update.run()
68-
self.assertTrue(acquired)
69-
self.assertEqual(mock_fetch_meta.call_count, 1)
70-
71-
# make sure the data now exists
72-
with self.subTest(name='initial data checks'):
73-
response = Epidata.covid_hosp('WY', Epidata.range(20200101, 20210101))
74-
self.assertEqual(response['result'], 1)
75-
self.assertEqual(len(response['epidata']), 1)
76-
row = response['epidata'][0]
77-
self.assertEqual(row['state'], 'WY')
78-
self.assertEqual(row['date'], 20201209)
79-
self.assertEqual(row['issue'], 20210315) # include today's data by default
80-
self.assertEqual(row['critical_staffing_shortage_today_yes'], 8)
81-
self.assertEqual(row['total_patients_hospitalized_confirmed_influenza_covid_coverage'], 56)
82-
actual = row['inpatient_bed_covid_utilization']
83-
expected = 0.11729857819905214
84-
self.assertAlmostEqual(actual, expected)
85-
self.assertIsNone(row['critical_staffing_shortage_today_no'])
86-
87-
# expect 61 fields per row (63 database columns, except `id` and `record_type`)
88-
self.assertEqual(len(row), 118)
89-
90-
with self.subTest(name='all date batches acquired'):
91-
response = Epidata.covid_hosp('WY', Epidata.range(20200101, 20210101), issues=20210313)
92-
self.assertEqual(response['result'], 1)
93-
94-
# re-acquisition of the same dataset should be a no-op
95-
with self.subTest(name='second acquisition'), \
96-
patch.object(Network, 'fetch_metadata', return_value=self.test_utils.load_sample_metadata()) as mock_fetch_meta, \
97-
patch.object(Network, 'fetch_dataset', return_value=self.test_utils.load_sample_dataset()) as mock_fetch:
98-
acquired = Update.run()
99-
self.assertFalse(acquired)
100-
101-
# make sure the data still exists
102-
with self.subTest(name='final data checks'):
103-
response = Epidata.covid_hosp('WY', Epidata.range(20200101, 20210101))
104-
self.assertEqual(response['result'], 1)
105-
self.assertEqual(len(response['epidata']), 1)
53+
with freeze_time("2021-03-15"):
54+
# make sure the data does not yet exist
55+
with self.subTest(name='no data yet'):
56+
response = Epidata.covid_hosp('MA', Epidata.range(20200101, 20210101))
57+
self.assertEqual(response['result'], -2, response)
58+
59+
# acquire sample data into local database
60+
# mock out network calls to external hosts
61+
with self.subTest(name='first acquisition'), \
62+
patch.object(Network, 'fetch_metadata', return_value=self.test_utils.load_sample_metadata()) as mock_fetch_meta, \
63+
patch.object(Network, 'fetch_dataset', side_effect=[self.test_utils.load_sample_dataset("dataset0.csv"), # dataset for 3/13
64+
self.test_utils.load_sample_dataset("dataset0.csv")] # dataset for 3/15
65+
) as mock_fetch:
66+
acquired = Update.run()
67+
self.assertTrue(acquired)
68+
self.assertEqual(mock_fetch_meta.call_count, 1)
69+
70+
# make sure the data now exists
71+
with self.subTest(name='initial data checks'):
72+
response = Epidata.covid_hosp('WY', Epidata.range(20200101, 20210101))
73+
self.assertEqual(response['result'], 1)
74+
self.assertEqual(len(response['epidata']), 1)
75+
row = response['epidata'][0]
76+
self.assertEqual(row['state'], 'WY')
77+
self.assertEqual(row['date'], 20201209)
78+
self.assertEqual(row['issue'], 20210315) # include today's data by default
79+
self.assertEqual(row['critical_staffing_shortage_today_yes'], 5)
80+
self.assertEqual(row['total_patients_hospitalized_confirmed_influenza_covid_coverage'], 56)
81+
self.assertIsNone(row['critical_staffing_shortage_today_no'])
82+
83+
# expect 118 fields per row (all database columns except `id` and `record_type`)
84+
self.assertEqual(len(row), 118)
85+
86+
with self.subTest(name='all date batches acquired'):
87+
response = Epidata.covid_hosp('WY', Epidata.range(20200101, 20210101), issues=20210313)
88+
self.assertEqual(response['result'], 1)
89+
90+
# re-acquisition of the same dataset should be a no-op
91+
with self.subTest(name='second acquisition'), \
92+
patch.object(Network, 'fetch_metadata', return_value=self.test_utils.load_sample_metadata()) as mock_fetch_meta, \
93+
patch.object(Network, 'fetch_dataset', side_effect=[self.test_utils.load_sample_dataset(), # late posted dataset for 3/15
94+
self.test_utils.load_sample_dataset("dataset1.csv")] # dataset for 3/16
95+
) as mock_fetch:
96+
acquired = Update.run()
97+
self.assertFalse(acquired)
98+
99+
# make sure the data still exists
100+
response = Epidata.covid_hosp('WY', Epidata.range(20200101, 20210101))
101+
self.assertEqual(response['result'], 1)
102+
self.assertEqual(len(response['epidata']), 1)
103+
104+
with freeze_time("2021-03-16"):
105+
# simulate issue posted after yesterday's run
106+
with self.subTest(name='late issue posted'), \
107+
patch.object(Network, 'fetch_metadata', return_value=self.test_utils.load_sample_metadata("metadata2.csv")) as mock_fetch_meta, \
108+
patch.object(Network, 'fetch_dataset', return_value=self.test_utils.load_sample_dataset()) as mock_fetch:
109+
acquired = Update.run()
110+
self.assertTrue(acquired)
111+
self.assertEqual(mock_fetch_meta.call_count, 1)
112+
113+
# make sure everything was filed correctly
114+
with self.subTest(name='late issue data checks'):
115+
response = Epidata.covid_hosp('WY', Epidata.range(20200101, 20210101))
116+
self.assertEqual(response['result'], 2)
117+
self.assertEqual(len(response['epidata']), 1)
118+
row = response['epidata'][0] # data from 03-15, dataset.csv
119+
self.assertEqual(row['state'], 'WY')
120+
self.assertEqual(row['date'], 20201209)
121+
self.assertEqual(row['issue'], 20210315) # include today's data by default
122+
self.assertEqual(row['critical_staffing_shortage_today_yes'], 8)
123+
self.assertEqual(row['total_patients_hospitalized_confirmed_influenza_covid_coverage'], 56)
124+
self.assertIsNone(row['critical_staffing_shortage_today_no'])
125+
row = response['epidata'][1] # data from 03-16, dataset1.csv
126+
self.assertEqual(row['state'], 'WY')
127+
self.assertEqual(row['date'], 20201210)
128+
self.assertEqual(row['issue'], 20210315) # include today's data by default
129+
self.assertEqual(row['critical_staffing_shortage_today_yes'], 8)
130+
self.assertEqual(row['total_patients_hospitalized_confirmed_influenza_covid_coverage'], 56)
131+
self.assertIsNone(row['critical_staffing_shortage_today_no'])
132+
133+
# expect 118 fields per row (all database columns except `id` and `record_type`)
134+
self.assertEqual(len(row), 118)
135+
136+
with self.subTest(name='all date batches acquired'):
137+
response = Epidata.covid_hosp('WY', Epidata.range(20200101, 20210101), issues=20210316)
138+
self.assertEqual(response['result'], 1)
106139

107140

108141
@freeze_time("2021-03-16")

src/acquisition/covid_hosp/common/database.py

Lines changed: 1 addition & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -188,7 +188,7 @@ def nan_safe_dtype(dtype, value):
188188
value_placeholders = ', '.join(['%s'] * num_columns)
189189
columns = ', '.join(f'`{i.sql_name}`' for i in dataframe_columns_and_types + self.additional_fields)
190190
sql = f'INSERT INTO `{self.table_name}` (`id`, `{self.publication_col_name}`, {columns}) ' \
191-
f'VALUES ({value_placeholders})'
191+
f'VALUES ({value_placeholders})' # TODO: add ON DUPLICATE KEY UPDATE here for when we need to backfill yesterday's issue
192192
id_and_publication_date = (0, publication_date)
193193
if logger:
194194
logger.info('updating values', count=len(dataframe.index))

src/acquisition/covid_hosp/common/utils.py

Lines changed: 21 additions & 15 deletions
Original file line numberDiff line numberDiff line change
@@ -191,41 +191,47 @@ def update_dataset(database, network, newer_than=None, older_than=None):
191191

192192
metadata = network.fetch_metadata(logger=logger)
193193
datasets = []
194-
with database.connect() as db:
195-
max_issue = db.get_max_issue(logger=logger)
196-
197-
older_than = (datetime.datetime.today().date() + datetime.timedelta(days=1)) if newer_than is None else older_than
198-
newer_than = (max_issue - datetime.timedelta(days=1)) if newer_than is None else newer_than
194+
# daily runs specify no bounds; patching runs specify at least one bound
195+
patching = any(bound is not None for bound in (newer_than, older_than))
196+
if older_than is None:
197+
older_than = (datetime.datetime.today().date() + datetime.timedelta(days=1))
198+
if newer_than is None:
199+
with database.connect() as db:
200+
max_issue = db.get_max_issue(logger=logger)
201+
newer_than = (max_issue - datetime.timedelta(days=1))
199202
daily_issues = Utils.issues_to_fetch(metadata, newer_than, older_than, logger=logger)
200203
if not daily_issues:
201204
logger.info("no new issues; nothing to do")
202205
return False
203206
for issue, revisions in daily_issues.items():
204207
issue_int = int(issue.strftime("%Y%m%d"))
205-
# download new dataset(s) and save associated metadata
208+
# download dataset(s) and save associated metadata
206209
dataset_list = []
207210
all_metadata = []
208211
for url, index in revisions:
209-
with database.connect() as db:
210-
already_in_db = db.contains_revision(url)
211-
if already_in_db:
212-
logger.info(f"already collected revision: {url}")
213-
else:
214-
dataset_list.append( network.fetch_dataset(url, logger=logger) )
212+
if not patching:
213+
# for daily runs, we only want new datasets
214+
with database.connect() as db:
215+
already_in_db = db.contains_revision(url)
216+
if already_in_db:
217+
logger.info(f"already collected revision: {url}")
218+
if patching or not already_in_db:
219+
dataset_list.append(network.fetch_dataset(url, logger=logger))
215220
all_metadata.append((url, metadata.loc[index].reset_index().to_json()))
216221
if not dataset_list:
217-
# we already had all of this issue's revisions in our db, so move on to the next issue
222+
# we already had everything for this issue or the issue was empty:
223+
# move on to the next issue
218224
continue
219225
dataset = Utils.merge_by_key_cols(dataset_list,
220-
db.KEY_COLS,
226+
database.KEY_COLS,
221227
logger=logger)
222228
datasets.append((
223229
issue_int,
224230
dataset,
225231
all_metadata
226232
))
227233
if not datasets:
228-
logger.info("all issues already collected; nothing to do")
234+
logger.info(f"{len(daily_issues)} issues checked containing {sum(len(revisions) for revisions in daily_issues.values())} revisions; nothing to do")
229235
return False
230236
with database.connect() as db:
231237
for issue_int, dataset, all_metadata in datasets:

testdata/acquisition/covid_hosp/state_daily/dataset0.csv

Lines changed: 1 addition & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -51,4 +51,4 @@ MI,30,129,4,32,127,4,41,159,23598,163,18003,163,3812,159,376,163,162,159,9,159,9
5151
MN,21,116,2,26,111,2,63,138,10358,139,7558,139,1516,138,182,139,70,138,3,138,2,138,806,139,346,138,355,139,1490,138,1358,139,26,138,21,138,1019,139,0.7296775439273991,139,7558,10358,0.2082417582417582,138,1516,7280,0.151630326065213,138,1516,9998,0.365364308342133,138,346,947,0.7909715407262021,139,806,1019,2020-12-09,0,1,2,3,4,5,6,7,8,9,10,11,12,13,14,15,16,17,18,19,20,21,22,23,24,25,26,27,28,29,30,31,32,33,34,35,36,37,38,39,40,41,42,43,44,45,46,47,48,49,50,51,52,53,54,55,56
5252
MO,47,78,16,62,63,16,22,137,17433,141,13521,141,2611,137,315,141,239,137,5,137,18,137,1615,141,645,137,604,141,2546,137,2307,141,65,137,26,137,1931,141,0.7755980037859233,141,13521,17433,0.1964487247009254,137,2611,13291,0.1523959610109146,137,2611,17133,0.3456591639871382,137,645,1866,0.8363542206110823,141,1615,1931,2020-12-09,0,1,2,3,4,5,6,7,8,9,10,11,12,13,14,15,16,17,18,19,20,21,22,23,24,25,26,27,28,29,30,31,32,33,34,35,36,37,38,39,40,41,42,43,44,45,46,47,48,49,50,51,52,53,54,55,56
5353
MS,21,85,2,30,76,2,12,106,8799,108,5637,108,1254,106,142,108,30,106,3,106,5,106,718,108,338,106,263,108,1234,106,1066,108,20,106,5,106,881,108,0.6406409819297647,108,5637,8799,0.2250134577426879,106,1254,5573,0.143922873866636,106,1254,8713,0.3953216374269006,106,338,855,0.8149829738933031,108,718,881,2020-12-09,0,1,2,3,4,5,6,7,8,9,10,11,12,13,14,15,16,17,18,19,20,21,22,23,24,25,26,27,28,29,30,31,32,33,34,35,36,37,38,39,40,41,42,43,44,45,46,47,48,49,50,51,52,53,54,55,56
54-
WY,8,,2,7,22,2,5,29,1729,31,856,31,198,29,26,31,15,29,0,29,0,29,58,31,32,29,32,31,196,29,189,31,2,29,2,29,137,31,0.4950838635049161,31,856,1729,0.2362768496420047,29,198,838,0.2272985781990521,29,198,1688,0.2519685039370078,29,32,127,0.4233576642335766,31,58,137,2020/12/09,0,1,2,3,4,5,6,7,8,9,10,11,12,13,14,15,16,17,18,19,20,21,22,23,24,25,26,27,28,29,30,31,32,33,34,35,36,37,38,39,40,41,42,43,44,45,46,47,48,49,50,51,52,53,54,55,56
54+
WY,5,,5,7,22,2,5,29,1729,31,856,31,198,29,26,31,15,29,0,29,0,29,58,31,32,29,32,31,196,29,189,31,2,29,2,29,137,31,0.4950838635049161,31,856,1729,0.2362768496420047,29,198,838,0.2272985781990521,29,198,1688,0.2519685039370078,29,32,127,0.4233576642335766,31,58,137,2020/12/09,0,1,2,3,4,5,6,7,8,9,10,11,12,13,14,15,16,17,18,19,20,21,22,23,24,25,26,27,28,29,30,31,32,33,34,35,36,37,38,39,40,41,42,43,44,45,46,47,48,49,50,51,52,53,54,55,56
Lines changed: 2 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,2 @@
1+
state,critical_staffing_shortage_today_yes,critical_staffing_shortage_today_no,critical_staffing_shortage_today_not_reported,critical_staffing_shortage_anticipated_within_week_yes,critical_staffing_shortage_anticipated_within_week_no,critical_staffing_shortage_anticipated_within_week_not_reported,hospital_onset_covid,hospital_onset_covid_coverage,inpatient_beds,inpatient_beds_coverage,inpatient_beds_used,inpatient_beds_used_coverage,inpatient_beds_used_covid,inpatient_beds_used_covid_coverage,previous_day_admission_adult_covid_confirmed,previous_day_admission_adult_covid_confirmed_coverage,previous_day_admission_adult_covid_suspected,previous_day_admission_adult_covid_suspected_coverage,previous_day_admission_pediatric_covid_confirmed,previous_day_admission_pediatric_covid_confirmed_coverage,previous_day_admission_pediatric_covid_suspected,previous_day_admission_pediatric_covid_suspected_coverage,staffed_adult_icu_bed_occupancy,staffed_adult_icu_bed_occupancy_coverage,staffed_icu_adult_patients_confirmed_and_suspected_covid,staffed_icu_adult_patients_confirmed_and_suspected_covid_coverage,staffed_icu_adult_patients_confirmed_covid,staffed_icu_adult_patients_confirmed_covid_coverage,total_adult_patients_hospitalized_confirmed_and_suspected_covid,total_adult_patients_hospitalized_confirmed_and_suspected_covid_coverage,total_adult_patients_hospitalized_confirmed_covid,total_adult_patients_hospitalized_confirmed_covid_coverage,total_pediatric_patients_hospitalized_confirmed_and_suspected_covid,total_pediatric_patients_hospitalized_confirmed_and_suspected_covid_coverage,total_pediatric_patients_hospitalized_confirmed_covid,total_pediatric_patients_hospitalized_confirmed_covid_coverage,total_staffed_adult_icu_beds,total_staffed_adult_icu_beds_coverage,inpatient_beds_utilization,inpatient_beds_utilization_coverage,inpatient_beds_utilization_numerator,inpatient_beds_utilization_denominator,percent_of_inpatients_with_covid,percent_of_inpatients_with_covid_coverage,percent_of
_inpatients_with_covid_numerator,percent_of_inpatients_with_covid_denominator,inpatient_bed_covid_utilization,inpatient_bed_covid_utilization_coverage,inpatient_bed_covid_utilization_numerator,inpatient_bed_covid_utilization_denominator,adult_icu_bed_covid_utilization,adult_icu_bed_covid_utilization_coverage,adult_icu_bed_covid_utilization_numerator,adult_icu_bed_covid_utilization_denominator,adult_icu_bed_utilization,adult_icu_bed_utilization_coverage,adult_icu_bed_utilization_numerator,adult_icu_bed_utilization_denominator,reporting_cutoff_start,deaths_covid,deaths_covid_coverage,geocoded_state,icu_patients_confirmed_influenza,icu_patients_confirmed_influenza_coverage,on_hand_supply_therapeutic_a_casirivimab_imdevimab_courses,on_hand_supply_therapeutic_b_bamlanivimab_courses,on_hand_supply_therapeutic_c_bamlanivimab_etesevimab_courses,previous_day_admission_adult_covid_confirmed_18-19,previous_day_admission_adult_covid_confirmed_18-19_coverage,previous_day_admission_adult_covid_confirmed_20-29,previous_day_admission_adult_covid_confirmed_20-29_coverage,previous_day_admission_adult_covid_confirmed_30-39,previous_day_admission_adult_covid_confirmed_30-39_coverage,previous_day_admission_adult_covid_confirmed_40-49,previous_day_admission_adult_covid_confirmed_40-49_coverage,previous_day_admission_adult_covid_confirmed_50-59,previous_day_admission_adult_covid_confirmed_50-59_coverage,previous_day_admission_adult_covid_confirmed_60-69,previous_day_admission_adult_covid_confirmed_60-69_coverage,previous_day_admission_adult_covid_confirmed_70-79,previous_day_admission_adult_covid_confirmed_70-79_coverage,previous_day_admission_adult_covid_confirmed_80+,previous_day_admission_adult_covid_confirmed_80+_coverage,previous_day_admission_adult_covid_confirmed_unknown,previous_day_admission_adult_covid_confirmed_unknown_coverage,previous_day_admission_adult_covid_suspected_18-19,previous_day_admission_adult_covid_suspected_18-19_coverage,previous_day_admission_adult_covid_suspec
ted_20-29,previous_day_admission_adult_covid_suspected_20-29_coverage,previous_day_admission_adult_covid_suspected_30-39,previous_day_admission_adult_covid_suspected_30-39_coverage,previous_day_admission_adult_covid_suspected_40-49,previous_day_admission_adult_covid_suspected_40-49_coverage,previous_day_admission_adult_covid_suspected_50-59,previous_day_admission_adult_covid_suspected_50-59_coverage,previous_day_admission_adult_covid_suspected_60_69,previous_day_admission_adult_covid_suspected_60-69_coverage,previous_day_admission_adult_covid_suspected_70-79,previous_day_admission_adult_covid_suspected_70-79_coverage,previous_day_admission_adult_covid_suspected_80,previous_day_admission_adult_covid_suspected_80+_coverage,previous_day_admission_adult_covid_suspected_unknown,previous_day_admission_adult_covid_suspected_unknown_coverage,previous_day_admission_influenza_confirmed,previous_day_admission_influenza_confirmed_coverage,previous_day_deaths_covid_and_influenza,previous_day_deaths_covid_and_influenza_coverage,previous_day_deaths_influenza,previous_day_deaths_influenza_coverage,previous_week_therapeutic_a_casirivimab_imdevimab_courses_used,previous_week_therapeutic_b_bamlanivimab_courses_used,previous_week_therapeutic_c_bamlanivimab_etesevimab_courses_used,total_patients_hospitalized_confirmed_influenza,total_patients_hospitalized_confirmed_influenza_coverage,total_patients_hospitalized_confirmed_influenza_covid,total_patients_hospitalized_confirmed_influenza_covid_coverage
2+
WY,8,,2,7,22,2,5,29,1729,31,856,31,198,29,26,31,15,29,0,29,0,29,58,31,32,29,32,31,196,29,189,31,2,29,2,29,137,31,0.4950838635049161,31,856,1729,0.2362768496420047,29,198,838,0.1172985781990521,29,198,1688,0.2519685039370078,29,32,127,0.4233576642335766,31,58,137,2020/12/10,0,1,2,3,4,5,6,7,8,9,10,11,12,13,14,15,16,17,18,19,20,21,22,23,24,25,26,27,28,29,30,31,32,33,34,35,36,37,38,39,40,41,42,43,44,45,46,47,48,49,50,51,52,53,54,55,56

0 commit comments

Comments
 (0)