Skip to content

Commit 63633bb

Browse files
authored
Merge pull request #1006 from dshemetov/nans_hhs_facilities
Add NAN code support to HHS Facilities
2 parents d8fb3f3 + 552b4bb commit 63633bb

13 files changed

+66
-41
lines changed

hhs_facilities/delphi_hhs_facilities/constants.py

Lines changed: 6 additions & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -1,7 +1,12 @@
11
"""Registry for signals and geographies to process."""
2+
from numpy import nan
23
from .generate_signals import sum_cols
34

4-
NAN_VALUES = [None, -999999, -999999.0]
5+
# Replacement table for missing-value markers: each key is a raw marker and
# each value is what it is replaced with (passed to `DataFrame.replace`).
NAN_VALUES = {
    None: nan,
    # -999,999 represents the data range [0-3], so we use the range mean.
    # One numeric key is enough: -999999 == -999999.0 and both hash alike,
    # so an int key and a float key are the same dict entry.
    -999999: 1.5,
}
510

611
CONFIRMED_ADMISSIONS = "confirmed_admissions_7d"
712
CONFIRMED_SUSPECTED_ADMISSIONS = "sum_confirmed_suspected_admissions_7d"

hhs_facilities/delphi_hhs_facilities/generate_signals.py

Lines changed: 20 additions & 3 deletions
Original file line numberDiff line numberDiff line change
@@ -5,6 +5,20 @@
55
import pandas as pd
66
import numpy as np
77

8+
from delphi_utils import Nans
9+
10+
11+
def add_nancodes(df):
    """Attach the three missingness-code columns to a signal dataframe.

    The frame is modified in place and also returned. `missing_se` and
    `missing_sample_size` are always set to `Nans.NOT_APPLICABLE`;
    `missing_val` is `Nans.NOT_MISSING` unless the row's `val` is null,
    in which case it is `Nans.OTHER`.
    """
    default_codes = {
        "missing_val": Nans.NOT_MISSING,
        "missing_se": Nans.NOT_APPLICABLE,
        "missing_sample_size": Nans.NOT_APPLICABLE,
    }
    for column, code in default_codes.items():
        df[column] = code

    # Any value that is still null at this point is flagged as OTHER.
    df.loc[df["val"].isnull(), "missing_val"] = Nans.OTHER
    return df
822

923
def generate_signal(df: pd.DataFrame,
1024
input_cols: list,
@@ -34,10 +48,13 @@ def generate_signal(df: pd.DataFrame,
3448
df_cols = [df[i] for i in input_cols]
3549
df["val"] = signal_func(df_cols)
3650
df["timestamp"] = df["timestamp"] + pd.Timedelta(days=date_offset)
37-
df.dropna(subset=["val"], inplace=True)
38-
df = df.groupby(["timestamp", "geo_id"], as_index=False).sum()
51+
df = df.groupby(["timestamp", "geo_id"], as_index=False).sum(min_count=1)
3952
df["se"] = df["sample_size"] = np.nan
40-
return df[["timestamp", "geo_id", "val", "se", "sample_size"]]
53+
df = add_nancodes(df)
54+
export_columns = [
55+
"timestamp", "geo_id", "val", "se", "sample_size",
56+
"missing_val", "missing_se", "missing_sample_size"]
57+
return df[export_columns]
4158

4259

4360
def sum_cols(cols: list) -> pd.Series:

hhs_facilities/delphi_hhs_facilities/pull.py

Lines changed: 1 addition & 2 deletions
Original file line numberDiff line numberDiff line change
@@ -3,7 +3,6 @@
33
from datetime import date
44

55
import pandas as pd
6-
import numpy as np
76
from delphi_utils.geomap import GeoMapper
87
from delphi_epidata import Epidata
98

@@ -54,6 +53,6 @@ def pull_data() -> pd.DataFrame:
5453
past_reference_day = int(date(2020, 1, 1).strftime("%Y%m%d")) # first available date in DB
5554
all_states = GeoMapper().get_geo_values("state_id")
5655
responses = pull_data_iteratively(all_states, Epidata.range(past_reference_day, today))
57-
all_columns = pd.DataFrame(responses).replace(NAN_VALUES, np.nan)
56+
all_columns = pd.DataFrame(responses).replace(NAN_VALUES)
5857
all_columns["timestamp"] = pd.to_datetime(all_columns["collection_week"], format="%Y%m%d")
5958
return all_columns
Lines changed: 4 additions & 4 deletions
Original file line numberDiff line numberDiff line change
@@ -1,4 +1,4 @@
1-
geo_id,val,se,sample_size
2-
25013,33.0,NA,NA
3-
72001,76.56462035541196,NA,NA
4-
72141,0.4353796445880453,NA,NA
1+
geo_id,val,se,sample_size,missing_val,missing_se,missing_sample_size
2+
25013,33.00000000,NA,NA,0,1,1
3+
72001,76.56462040,NA,NA,0,1,1
4+
72141,0.43537960,NA,NA,0,1,1
Lines changed: 4 additions & 4 deletions
Original file line numberDiff line numberDiff line change
@@ -1,4 +1,4 @@
1-
geo_id,val,se,sample_size
2-
25013,98.0,NA,NA
3-
72001,161.08400646203557,NA,NA
4-
72141,0.9159935379644588,NA,NA
1+
geo_id,val,se,sample_size,missing_val,missing_se,missing_sample_size
2+
25013,98.00000000,NA,NA,0,1,1
3+
72001,161.08400650,NA,NA,0,1,1
4+
72141,0.91599350,NA,NA,0,1,1
Lines changed: 2 additions & 2 deletions
Original file line numberDiff line numberDiff line change
@@ -1,2 +1,2 @@
1-
geo_id,val,se,sample_size
2-
230,33.0,NA,NA
1+
geo_id,val,se,sample_size,missing_val,missing_se,missing_sample_size
2+
230,33.00000000,NA,NA,0,1,1
Lines changed: 2 additions & 2 deletions
Original file line numberDiff line numberDiff line change
@@ -1,2 +1,2 @@
1-
geo_id,val,se,sample_size
2-
230,98.0,NA,NA
1+
geo_id,val,se,sample_size,missing_val,missing_se,missing_sample_size
2+
230,98.00000000,NA,NA,0,1,1
Lines changed: 4 additions & 4 deletions
Original file line numberDiff line numberDiff line change
@@ -1,4 +1,4 @@
1-
geo_id,val,se,sample_size
2-
10380,0.4353796445880453,NA,NA
3-
38660,76.56462035541196,NA,NA
4-
44140,33.0,NA,NA
1+
geo_id,val,se,sample_size,missing_val,missing_se,missing_sample_size
2+
10380,0.43537960,NA,NA,0,1,1
3+
38660,76.56462040,NA,NA,0,1,1
4+
44140,33.00000000,NA,NA,0,1,1
Lines changed: 4 additions & 4 deletions
Original file line numberDiff line numberDiff line change
@@ -1,4 +1,4 @@
1-
geo_id,val,se,sample_size
2-
10380,0.9159935379644588,NA,NA
3-
38660,161.08400646203557,NA,NA
4-
44140,98.0,NA,NA
1+
geo_id,val,se,sample_size,missing_val,missing_se,missing_sample_size
2+
10380,0.91599350,NA,NA,0,1,1
3+
38660,161.08400650,NA,NA,0,1,1
4+
44140,98.00000000,NA,NA,0,1,1
Lines changed: 3 additions & 3 deletions
Original file line numberDiff line numberDiff line change
@@ -1,3 +1,3 @@
1-
geo_id,val,se,sample_size
2-
AL,33.0,NA,NA
3-
PR,33.0,NA,NA
1+
geo_id,val,se,sample_size,missing_val,missing_se,missing_sample_size
2+
AL,33.00000000,NA,NA,0,1,1
3+
PR,33.00000000,NA,NA,0,1,1

0 commit comments

Comments
 (0)