 import numpy as np
 import pandas as pd

-from .constants import STATE_TO_ABBREV
+from .constants import STATE_TO_ABBREV, DC_FIPS, METRICS

 def get_geo_id(region_code):
     """
@@ -16,9 +16,11 @@ def get_geo_id(region_code):
         return splits[2]
     return np.nan

-def pull_gs_data(base_url, metrics, level):
-    """Pulls the latest Google COVID-19 Search Trends symptoms dataset, and
-    conforms it into a dataset
+
+def preprocess(df, level):
+    """
+    Conforms the data pulled from the Google COVID-19 Search Trends
+    symptoms dataset.

     The output dataset has:

@@ -27,50 +29,30 @@ def pull_gs_data(base_url, metrics, level):
     - Each row additionally has columns corresponding to sensors such as
       "Anosmia" and "Ageusia".

-    Note that we retrieve state level data from "2020_US_daily_symptoms_dataset.csv"
-    where there are state level data for 51 states including 'District of Columbia'.
-
-    We retrieve the county level data from "/subregions/state/**daily**.csv"
-    where there is county level data available except District of Columbia.
-    We filter the data such that we only keep rows with valid FIPS.
-
-    PS: No information for PR
-
     Parameters
     ----------
-    base_url: str
-        Base URL for pulling the Google COVID-19 Search Trends symptoms dataset
-    metrics: list of string
-        Symptoms to consider: "Anosmia" and "Ageusia".
+    df: pd.DataFrame
+        Data read from the raw URL, with column "geo_id" giving the state
+        abbreviation or county FIPS code
+    level: str
+        "county" or "state"

     Returns
     -------
     pd.DataFrame
         Dataframe as described above.
     """
     # Constants
-    KEEP_COLUMNS = ["geo_id", "date"]
-    for metric in metrics:
-        KEEP_COLUMNS.append("symptom:" + metric)
-
-    # Read data
-    if level == "state":
-        df = pd.read_csv(base_url.format(sub_url="/", state=""),
-                         parse_dates=["date"])
-        df["geo_id"] = df["open_covid_region_code"].apply(
-            lambda x: x.split("-")[1].lower()
-        )
-
-    else:
-        dfList = []
-        for state in list(STATE_TO_ABBREV.keys()):
-            sub_url = "/subregions/" + "%20".join(state.split("_")) + "/"
-            dfList.append(pd.read_csv(base_url.format(sub_url=sub_url,
-                                                      state=state + "_"),
-                                      parse_dates=["date"]))
-        df = pd.concat(dfList)
-        df["geo_id"] = df["open_covid_region_code"].apply(get_geo_id)
-
+    KEEP_COLUMNS = ["geo_id", "date"] + METRICS + ["combined_symptoms"]
+
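+    # Sum the individual symptom columns into a combined signal, treating
+    # missing values as zero.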
+    df["combined_symptoms"] = 0
+    for metric in METRICS:
+        df.rename({"symptom:" + metric: metric}, axis=1, inplace=True)
+        df["combined_symptoms"] += df[metric].fillna(0)
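+    # A row where every metric is missing carries no signal; mark its
+    # combined value as NaN rather than zero.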
+    df.loc[df["Anosmia"].isnull() & df["Ageusia"].isnull(),
+           "combined_symptoms"] = np.nan
+
     # Delete rows with missing FIPS
     null_mask = (df["geo_id"].isnull())
     df = df.loc[~null_mask]
@@ -106,3 +88,58 @@ def pull_gs_data(base_url, metrics, level):
     ).rename({"date": "timestamp"}, axis=1)

     return df
+
+def pull_gs_data(base_url):
+    """Pulls the latest Google COVID-19 Search Trends symptoms dataset and
+    conforms it into a dataset as described in the preprocess function.
+
+    Note that we retrieve state level data from
+    "2020_US_daily_symptoms_dataset.csv", which has data for 51 regions
+    (the 50 states plus the District of Columbia).
+
+    We retrieve the county level data from "/subregions/state/**daily**.csv",
+    which has county level data for every state except the District of
+    Columbia. We filter the data so that we only keep rows with a valid
+    FIPS code.
+
+    Note: no data are available for Puerto Rico (PR).
+
+    Parameters
+    ----------
+    base_url: str
+        Base URL for pulling the Google COVID-19 Search Trends symptoms dataset
+
+    Returns
+    -------
+    dict: {"county": pd.DataFrame, "state": pd.DataFrame}
+    """
+    # Create dictionary for state and county level data
+    dfs = {}
+    # For state level data
+    df = pd.read_csv(base_url.format(sub_url="/", state=""),
+                     parse_dates=["date"])
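+    # State rows are coded like "US-CA"; keep the lowercase state abbreviation.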
+    df["geo_id"] = df["open_covid_region_code"].apply(
+        lambda x: x.split("-")[1].lower())
+    dfs["state"] = preprocess(df, "state")
+
+    # For county level data
+    dfList = []
+    for state in list(STATE_TO_ABBREV.keys()):
+        sub_url = "/subregions/" + "%20".join(state.split("_")) + "/"
+        dfList.append(pd.read_csv(base_url.format(sub_url=sub_url,
+                                                  state=state + "_"),
+                                  parse_dates=["date"]))
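+    # County rows are coded like "US-state-fips"; get_geo_id extracts the
+    # FIPS code and returns NaN for rows without one (dropped in preprocess).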
+    df = pd.concat(dfList)
+    df["geo_id"] = df["open_covid_region_code"].apply(get_geo_id)
+    dfs["county"] = preprocess(df, "county")
+
+    # Add District of Columbia County
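+    # The upstream source has no county level file for DC, so reuse the DC
+    # state level rows under the DC county FIPS code.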
+    try:
+        df_dc_county = dfs["state"][dfs["state"]["geo_id"] == "dc"].drop(
+            "geo_id", axis=1)
+        df_dc_county.loc[:, "geo_id"] = DC_FIPS
+        dfs["county"] = dfs["county"].append(df_dc_county)
+    except KeyError:
+        pass
+
+    return dfs
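
A minimal usage sketch for the new entry point. The import path and URL
template below are stand-ins, not values taken from this change; the real
base URL must contain the {sub_url} and {state} placeholders that
pull_gs_data fills in:

    from delphi_google_symptoms.pull import pull_gs_data  # hypothetical path

    # Hypothetical URL template; the real bucket address is configured elsewhere.
    base_url = ("https://example.com/covid19-search-trends{sub_url}"
                "2020_US_{state}daily_symptoms_dataset.csv")

    dfs = pull_gs_data(base_url)
    print(dfs["state"].head())   # geo_id, timestamp, Anosmia, Ageusia, combined_symptoms
    print(dfs["county"].head())  # geo_id holds county FIPS codes, now including DC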