Commit 6cf2d23

add geo map to hrr and msa

Jingjing Tang authored and committed
1 parent ed05eb9 · commit 6cf2d23

4 files changed: +180 -51

google_symptoms/delphi_google_symptoms/constants.py

Lines changed: 6 additions & 2 deletions

@@ -11,8 +11,10 @@
 METRICS = ["Anosmia", "Ageusia"]
 SMOOTHERS = ["raw", "smoothed"]
 GEO_RESOLUTIONS = [
-    "county",
-    "state",
+    "state",
+    "county",
+    "msa",
+    "hrr"
 ]

 seven_day_moving_average = partial(kday_moving_average, k=7)

@@ -77,3 +79,5 @@
     'West_Virginia': 'wv',
     'Wisconsin': 'wi',
     'Wyoming': 'wy'}
+
+DC_FIPS = "11001"
google_symptoms/delphi_google_symptoms/geo.py (new file)

Lines changed: 82 additions & 0 deletions

# -*- coding: utf-8 -*-
import numpy as np
import pandas as pd
from delphi_utils import GeoMapper

from .constants import METRICS

gmpr = GeoMapper()


def generate_transition_matrix(geo_res):
    """
    Generate a population-weighted transition matrix from county to msa/hrr.

    Parameters
    ----------
    geo_res: str
        "msa" or "hrr"

    Returns
    -------
    pd.DataFrame
        One row per county, with a "geo_id" column of FIPS codes and one
        column per msa/hrr region holding that county's population share
        of the region, so each region's weights sum to 1.
    """
    map_df = gmpr._load_crosswalk("fips", geo_res)
    # Add population as weights
    map_df = gmpr.add_population_column(map_df, "fips")
    if geo_res == "hrr":
        # Counties can be split across HRRs; scale the population by the
        # crosswalk's fractional membership weight first.
        map_df["population"] = map_df["population"] * map_df["weight"]
    group_pop = map_df.groupby(geo_res).sum().reset_index()
    map_df = map_df.merge(
        group_pop, on=geo_res, how="inner", suffixes=["_raw", "_groupsum"]
    )
    map_df["weight"] = map_df["population_raw"] / map_df["population_groupsum"]

    map_df = pd.pivot_table(
        map_df, values="weight", index=["fips"], columns=[geo_res]
    ).fillna(0).reset_index().rename({"fips": "geo_id"}, axis=1)
    return map_df


def geo_map(df, geo_res):
    """
    Compute derived HRR and MSA counts as a weighted sum of the county dataset.

    Parameters
    ----------
    df: pd.DataFrame
        A data frame with columns "geo_id" and "timestamp",
        plus columns for signal values
    geo_res: str
        "msa" or "hrr"

    Returns
    -------
    pd.DataFrame
        A data frame with columns "geo_id" and "timestamp",
        plus columns for signal values.
        The geo_id has been converted from FIPS codes to HRRs/MSAs.
    """
    if geo_res in set(["county", "state"]):
        return df

    map_df = generate_transition_matrix(geo_res)
    dfList = []
    for _date in df["timestamp"].unique():
        # Align this date's county values with the rows of the transition
        # matrix, treating counties with no data as 0.
        val_lists = df[df["timestamp"] == _date].merge(
            map_df["geo_id"], how="right"
        )[METRICS + ["combined_symptoms"]].fillna(0)
        newdf = pd.DataFrame(
            np.matmul(map_df.values[:, 1:].T, val_lists.values),
            columns=list(val_lists.keys())
        )
        newdf["timestamp"] = _date
        newdf["geo_id"] = list(map_df.keys())[1:]
        # Regions where no county reported anything stay missing, not 0.
        mask = (newdf[METRICS].sum(axis=1) == 0)
        newdf.loc[mask, METRICS + ["combined_symptoms"]] = np.nan
        dfList.append(newdf)
    return pd.concat(dfList)
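
The aggregation above is a single matrix product: a (county x region) matrix of population shares, transposed and multiplied against a vector of county values, yields one population-weighted value per region. A minimal sketch of that weighting scheme, using made-up FIPS codes, MSA ids, and populations (the real crosswalk comes from delphi_utils.GeoMapper):

import numpy as np
import pandas as pd

# Toy crosswalk: three counties, two MSAs (all values invented).
crosswalk = pd.DataFrame({
    "fips": ["01001", "01002", "02001"],
    "msa": ["10180", "10180", "10420"],
    "population": [50_000, 150_000, 80_000],
})
# Each county's weight is its share of its region's population.
group_pop = crosswalk.groupby("msa")["population"].transform("sum")
crosswalk["weight"] = crosswalk["population"] / group_pop

# Pivot into a (fips x msa) transition matrix; zero where no membership.
trans = pd.pivot_table(crosswalk, values="weight",
                       index="fips", columns="msa").fillna(0)

# One day of county-level signal values, aligned to the matrix rows.
county_vals = pd.Series([1.0, 3.0, 2.0], index=trans.index)
msa_vals = trans.T @ county_vals
print(msa_vals)  # 10180 -> 0.25*1.0 + 0.75*3.0 = 2.5, 10420 -> 2.0

Materializing the crosswalk as a dense matrix lets geo_map aggregate every metric for a whole day in one np.matmul, at the cost of storing mostly-zero weights.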

google_symptoms/delphi_google_symptoms/pull.py

Lines changed: 77 additions & 40 deletions

@@ -4,7 +4,7 @@
 import numpy as np
 import pandas as pd

-from .constants import STATE_TO_ABBREV
+from .constants import STATE_TO_ABBREV, DC_FIPS, METRICS


 def get_geo_id(region_code):
     """
@@ -16,9 +16,11 @@ def get_geo_id(region_code):
         return splits[2]
     return np.nan

-def pull_gs_data(base_url, metrics, level):
-    """Pulls the latest Google COVID-19 Search Trends symptoms dataset, and
-    conforms it into a dataset
+
+def preprocess(df, level):
+    """
+    Conform the pulled Google COVID-19 Search Trends symptoms data
+    into a dataset.

     The output dataset has:
@@ -27,50 +29,30 @@ def preprocess(df, level):
     - Each row additionally has columns corresponding to sensors such as
       "Anosmia" and "Ageusia".

-    Note that we retrieve state level data from "2020_US_daily_symptoms_dataset.csv"
-    where there are state level data for 51 states including 'District of Columbia'.
-
-    We retrieve the county level data from "/subregions/state/**daily**.csv"
-    where there is county level data available except District of Columbia.
-    We filter the data such that we only keep rows with valid FIPS.
-
-    PS: No information for PR
-
     Parameters
     ----------
-    base_url: str
-        Base URL for pulling the Google COVID-19 Search Trends symptoms dataset
-    metrics: list of string
-        Symptoms to consider: "Anosmia" and "Ageusia".
+    df: pd.DataFrame
+        Data frame read from the raw URL, with a "geo_id" column holding
+        the state abbreviation or FIPS code
+    level: str
+        "county" or "state"

     Returns
     -------
     pd.DataFrame
         Dataframe as described above.
     """
-    # Constants
-    KEEP_COLUMNS = ["geo_id", "date"]
-    for metric in metrics:
-        KEEP_COLUMNS.append("symptom:" + metric)
-
-    # Read data
-    if level == "state":
-        df = pd.read_csv(base_url.format(sub_url="/", state=""),
-                         parse_dates=["date"])
-        df["geo_id"] = df["open_covid_region_code"].apply(
-            lambda x: x.split("-")[1].lower()
-        )
-
-    else:
-        dfList = []
-        for state in list(STATE_TO_ABBREV.keys()):
-            sub_url = "/subregions/" + "%20".join(state.split("_")) + "/"
-            dfList.append(pd.read_csv(base_url.format(sub_url=sub_url,
-                                                      state=state + "_"),
-                                      parse_dates=["date"]))
-        df = pd.concat(dfList)
-        df["geo_id"] = df["open_covid_region_code"].apply(get_geo_id)
-
+    KEEP_COLUMNS = ["geo_id", "date"] + METRICS + ["combined_symptoms"]
+
+    # Sum the symptom columns, treating a missing value as 0, but leave the
+    # combined value missing when every symptom is missing.
+    df["combined_symptoms"] = 0
+    for metric in METRICS:
+        df.rename({"symptom:" + metric: metric}, axis=1, inplace=True)
+        df["combined_symptoms"] += df[metric].fillna(0)
+    df.loc[
+        df["Anosmia"].isnull() & df["Ageusia"].isnull(),
+        "combined_symptoms"] = np.nan

     # Delete rows with missing FIPS
     null_mask = (df["geo_id"].isnull())
     df = df.loc[~null_mask]
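
The "combined_symptoms" rule above is easy to check on toy data: a missing symptom counts as 0 when the other is present, and the combined value is only missing when both are. A small sketch with invented values:

import numpy as np
import pandas as pd

df = pd.DataFrame({"Anosmia": [1.0, np.nan, np.nan],
                   "Ageusia": [2.0, 4.0, np.nan]})
df["combined_symptoms"] = 0.0
for metric in ["Anosmia", "Ageusia"]:
    df["combined_symptoms"] += df[metric].fillna(0)
# Rows where every symptom is missing stay missing.
df.loc[df["Anosmia"].isnull() & df["Ageusia"].isnull(),
       "combined_symptoms"] = np.nan
print(df["combined_symptoms"].tolist())  # [3.0, 4.0, nan]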
@@ -106,3 +88,58 @@ def preprocess(df, level):
     ).rename({"date": "timestamp"}, axis = 1)

     return df
+
+
+def pull_gs_data(base_url):
+    """Pull the latest Google COVID-19 Search Trends symptoms dataset and
+    conform it into a dataset as described in the preprocess function.
+
+    Note that we retrieve state level data from "2020_US_daily_symptoms_dataset.csv",
+    which covers 51 regions: the 50 states plus 'District of Columbia'.
+
+    We retrieve county level data from "/subregions/state/**daily**.csv",
+    which is available for every state except District of Columbia.
+    We filter the data so that we only keep rows with valid FIPS codes.
+
+    PS: No information for PR
+
+    Parameters
+    ----------
+    base_url: str
+        Base URL for pulling the Google COVID-19 Search Trends symptoms dataset
+
+    Returns
+    -------
+    dict: {"county": pd.DataFrame, "state": pd.DataFrame}
+    """
+    # Create dictionary for state and county level data
+    dfs = {}
+
+    # State level data
+    df = pd.read_csv(base_url.format(sub_url="/", state=""),
+                     parse_dates=["date"])
+    df["geo_id"] = df["open_covid_region_code"].apply(
+        lambda x: x.split("-")[1].lower())
+    dfs["state"] = preprocess(df, "state")
+
+    # County level data
+    dfList = []
+    for state in list(STATE_TO_ABBREV.keys()):
+        sub_url = "/subregions/" + "%20".join(state.split("_")) + "/"
+        dfList.append(pd.read_csv(base_url.format(sub_url=sub_url,
+                                                  state=state + "_"),
+                                  parse_dates=["date"]))
+    df = pd.concat(dfList)
+    df["geo_id"] = df["open_covid_region_code"].apply(get_geo_id)
+    dfs["county"] = preprocess(df, "county")
+
+    # DC has no county level file; reuse its state row under FIPS 11001.
+    try:
+        df_dc_county = dfs["state"][dfs["state"]["geo_id"] == "dc"].drop(
+            "geo_id", axis=1)
+        df_dc_county.loc[:, "geo_id"] = DC_FIPS
+        dfs["county"] = dfs["county"].append(df_dc_county)
+    except KeyError:
+        pass
+
+    return dfs
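
pull_gs_data builds one URL per file by substituting into base_url. The real base_url lives in the params file and is not shown in this diff, so the template below is hypothetical, purely to illustrate the substitution:

# Hypothetical template; only the {sub_url}/{state} placeholders are real.
base_url = ("https://example.com/search-trends{sub_url}"
            "2020_US_{state}daily_symptoms_dataset.csv")

# State level file: sub_url="/", state="" ->
# https://example.com/search-trends/2020_US_daily_symptoms_dataset.csv
print(base_url.format(sub_url="/", state=""))

# County level file for New_York; spaces in the path are %20-encoded ->
# https://example.com/search-trends/subregions/New%20York/
#     2020_US_New_York_daily_symptoms_dataset.csv
state = "New_York"
sub_url = "/subregions/" + "%20".join(state.split("_")) + "/"
print(base_url.format(sub_url=sub_url, state=state + "_"))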

google_symptoms/delphi_google_symptoms/run.py

Lines changed: 15 additions & 9 deletions

@@ -11,6 +11,7 @@
 from delphi_utils import read_params, create_export_csv

 from .pull import pull_gs_data
+from .geo import geo_map
 from .constants import METRICS, GEO_RESOLUTIONS, SMOOTHERS, SMOOTHERS_MAP


@@ -21,16 +22,22 @@ def run_module():
     export_dir = params["export_dir"]
     base_url = params["base_url"]

+    # Pull the data once; every geo resolution is derived from these two.
+    dfs = pull_gs_data(base_url)
     for geo_res in GEO_RESOLUTIONS:
-        df_pull = pull_gs_data(base_url, METRICS, geo_res)
-        for metric, smoother in product(METRICS, SMOOTHERS):
+        if geo_res == "state":
+            df_pull = dfs["state"]
+        else:
+            # msa and hrr are aggregated up from the county data in geo_map.
+            df_pull = dfs["county"]
+        df_pull = geo_map(df_pull, geo_res)
+        for metric, smoother in product(
+                METRICS + ["combined_symptoms"], SMOOTHERS):
             print(geo_res, metric, smoother)
-            # df = df_pull.copy()
-            # if smoother == "smoothed":
-            #     df = df.fillna(0)
             df = df_pull.set_index(["timestamp", "geo_id"])
-            df["val"] = df["symptom:"+metric].groupby(level=1
-                                                      ).transform(SMOOTHERS_MAP[smoother][0])
+            if smoother == "smoothed":
+                df[metric] = df[metric].fillna(0)
+            df["val"] = df[metric].groupby(level=1
+                                           ).transform(SMOOTHERS_MAP[smoother][0])
             df["se"] = np.nan
             df["sample_size"] = np.nan
             # Drop early entries where data insufficient for smoothing
@@ -43,5 +50,4 @@ def run_module():
             start_date=SMOOTHERS_MAP[smoother][1](export_start_date),
             metric=metric.lower(),
             geo_res=geo_res,
-            sensor=sensor_name,
-        )
+            sensor=sensor_name)
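
SMOOTHERS_MAP itself is not part of this diff; judging from constants.py (seven_day_moving_average = partial(kday_moving_average, k=7)), "smoothed" pairs each metric with a 7-day moving average applied per geo_id. A sketch of the groupby(level=1).transform step, with a trailing 7-day mean standing in for the real smoother:

import numpy as np
import pandas as pd

idx = pd.MultiIndex.from_product(
    [pd.date_range("2020-09-01", periods=9), ["ca", "ny"]],
    names=["timestamp", "geo_id"])
df = pd.DataFrame({"Anosmia": np.arange(18, dtype=float)}, index=idx)

def seven_day_moving_average(x):
    # Stand-in for the real smoother: trailing mean, NaN until a full
    # 7-day window is available.
    return x.rolling(7).mean()

# level=1 groups by geo_id, so each location is smoothed independently
# along its own timeline.
df["val"] = df["Anosmia"].groupby(level=1).transform(seven_day_moving_average)

Because level=1 is the geo_id level of the index, smoothing never mixes values across locations; the first six days per location come out NaN and are later dropped as "early entries where data insufficient for smoothing".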
