11from dataclasses import dataclass
22from typing import List
33
4+ import covidcast
5+ import numpy as np
46import pandas as pd
57
68@dataclass
@@ -27,33 +29,87 @@ def to_md(self):
2729 message = self .message , updated = self .last_updated .strftime ("%Y-%m-%d" ))
2830
2931def check_source (data_source , meta , params , grace ):
30- """Iterate over all signals from a source and check if they exceed max age."""
32+ """Iterate over all signals from a source and check for problems.
33+
34+ Possible problems:
35+
36+ - Newest available data exceeds max age.
37+ - Gap between subsequent data points exceeds max gap.
38+
39+ For example, consider a source with a max age of 5 days and max gap of 1
40+ day. If today is 2020-10-15, and the latest available data is from
41+ 2020-10-09, the max age is exceeded. If there is no data available on
42+ 2020-10-07, but there is on 2020-10-06 and 2020-10-08, there is a gap of 2
43+ days and the max gap is exceeded.
44+
45+ The gap window controls how much data we check for gaps -- a gap window of
46+ 10 days means we check the most recent 10 days of data. Defaults to 7.
47+
48+ """
3149
3250 source_config = params [data_source ]
51+ gap_window = pd .Timedelta (days = source_config .get ("gap_window" , 7 ))
52+ max_allowed_gap = source_config .get ("max_gap" , 1 )
3353
3454 signals = meta [meta .data_source == data_source ]
3555
3656 now = pd .Timestamp .now ()
3757
38- complaints = {}
58+ age_complaints = {}
59+ gap_complaints = {}
3960
4061 for _ , row in signals .iterrows ():
4162 if "retired-signals" in source_config and \
4263 row ["signal" ] in source_config ["retired-signals" ]:
4364 continue
4465
66+ # Check max age
4567 age = (now - row ["max_time" ]).days
4668
4769 if age > source_config ["max_age" ] + grace :
48- if row ["signal" ] not in complaints :
49- complaints [row ["signal" ]] = Complaint (
70+ if row ["signal" ] not in age_complaints :
71+ age_complaints [row ["signal" ]] = Complaint (
5072 "is more than {age} days old" .format (age = age ),
5173 data_source ,
5274 row ["signal" ],
5375 [row ["geo_type" ]],
5476 row ["max_time" ],
5577 source_config ["maintainers" ])
5678 else :
57- complaints [row ["signal" ]].geo_types .append (row ["geo_type" ])
79+ age_complaints [row ["signal" ]].geo_types .append (row ["geo_type" ])
80+
81+ # Check max gap
82+ if max_allowed_gap == - 1 :
83+ # No gap detection for this source
84+ continue
85+
86+ latest_data = covidcast .signal (
87+ data_source , row ["signal" ],
88+ start_day = row ["max_time" ] - gap_window ,
89+ end_day = row ["max_time" ],
90+ geo_type = row ["geo_type" ]
91+ )
92+
93+ # convert numpy datetime values to pandas datetimes and then to
94+ # datetime.date, so we can work with timedeltas after
95+ unique_dates = [pd .to_datetime (val ).date ()
96+ for val in latest_data ["time_value" ].unique ()]
97+
98+ gap_days = [(day - prev_day ).days
99+ for day , prev_day in zip (unique_dates [1 :], unique_dates [:- 1 ])]
100+ gap = max (gap_days )
101+
102+ if gap > max_allowed_gap :
103+ if row ["signal" ] not in gap_complaints :
104+ gap_complaints [row ["signal" ]] = Complaint (
105+ "has a {gap}-day gap of missing data in its most recent "
106+ "{gap_window} days of data" .format (gap = gap , gap_window = gap_window .days ),
107+ data_source ,
108+ row ["signal" ],
109+ [row ["geo_type" ]],
110+ row ["max_time" ],
111+ source_config ["maintainers" ])
112+ else :
113+ gap_complaints [row ["signal" ]].geo_types .append (row ["geo_type" ])
58114
59- return list (complaints .values ())
115+ return list (age_complaints . values ()) + list ( gap_complaints .values ())
0 commit comments