 import numpy as np
 import pandas as pd

-from .constants import STATE_TO_ABBREV
+from .constants import STATE_TO_ABBREV, DC_FIPS, METRICS

 def get_geo_id(region_code):
     """
@@ -16,9 +16,11 @@ def get_geo_id(region_code):
         return splits[2]
     return np.nan

-def pull_gs_data(base_url, metrics, level):
-    """Pulls the latest Google COVID-19 Search Trends symptoms dataset, and
-    conforms it into a dataset
+
+def preprocess(df, level):
+    """
+    Conforms the data pulled from the Google COVID-19 Search Trends
+    symptoms dataset.

     The output dataset has:

@@ -27,50 +29,30 @@ def pull_gs_data(base_url, metrics, level):
     - Each row additionally has columns corresponding to sensors such as
       "Anosmia" and "Ageusia".

-    Note that we retrieve state level data from "2020_US_daily_symptoms_dataset.csv"
-    where there are state level data for 51 states including 'District of Columbia'.
-
-    We retrieve the county level data from "/subregions/state/**daily**.csv"
-    where there is county level data available except District of Columbia.
-    We filter the data such that we only keep rows with valid FIPS.
-
-    PS: No information for PR
-
     Parameters
     ----------
-    base_url: str
-        Base URL for pulling the Google COVID-19 Search Trends symptoms dataset
-    metrics: list of string
-        Symptoms to consider: "Anosmia" and "Ageusia".
+    df: pd.DataFrame
+        Data read from the raw URL, with column "geo_id" giving the state
+        abbreviation or county FIPS code
+    level: str
+        "county" or "state"

     Returns
     -------
     pd.DataFrame
         Dataframe as described above.
     """
     # Constants
-    KEEP_COLUMNS = ["geo_id", "date"]
-    for metric in metrics:
-        KEEP_COLUMNS.append("symptom:" + metric)
-
-    # Read data
-    if level == "state":
-        df = pd.read_csv(base_url.format(sub_url="/", state=""),
-                         parse_dates=["date"])
-        df["geo_id"] = df["open_covid_region_code"].apply(
-            lambda x: x.split("-")[1].lower()
-        )
-
-    else:
-        dfList = []
-        for state in list(STATE_TO_ABBREV.keys()):
-            sub_url = "/subregions/" + "%20".join(state.split("_")) + "/"
-            dfList.append(pd.read_csv(base_url.format(sub_url=sub_url,
-                                                      state=state + "_"),
-                                      parse_dates=["date"]))
-        df = pd.concat(dfList)
-        df["geo_id"] = df["open_covid_region_code"].apply(get_geo_id)
-
+    KEEP_COLUMNS = ["geo_id", "date"] + METRICS + ["combined_symptoms"]
+
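+    # Sum the individual symptom columns into a combined signal, treating
+    # missing values as zero.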
+    df["combined_symptoms"] = 0
+    for metric in METRICS:
+        df.rename({"symptom:" + metric: metric}, axis=1, inplace=True)
+        df["combined_symptoms"] += df[metric].fillna(0)
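+    # A row where every metric is missing carries no signal; mark its
+    # combined value as NaN rather than zero.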
+    df.loc[df["Anosmia"].isnull() & df["Ageusia"].isnull(),
+           "combined_symptoms"] = np.nan
+
     # Delete rows with missing FIPS
     null_mask = (df["geo_id"].isnull())
     df = df.loc[~null_mask]
@@ -106,3 +88,58 @@ def pull_gs_data(base_url, metrics, level):
     ).rename({"date": "timestamp"}, axis=1)

     return df
+
+def pull_gs_data(base_url):
+    """Pulls the latest Google COVID-19 Search Trends symptoms dataset and
+    conforms it into a dataset as described in the preprocess function.
+
+    Note that we retrieve state level data from
+    "2020_US_daily_symptoms_dataset.csv", which has data for 51 regions
+    (the 50 states plus the District of Columbia).
+
+    We retrieve the county level data from "/subregions/state/**daily**.csv",
+    which has county level data for every state except the District of
+    Columbia. We filter the data so that we only keep rows with a valid
+    FIPS code.
+
+    Note: no data are available for Puerto Rico (PR).
+
+    Parameters
+    ----------
+    base_url: str
+        Base URL for pulling the Google COVID-19 Search Trends symptoms dataset
+
+    Returns
+    -------
+    dict: {"county": pd.DataFrame, "state": pd.DataFrame}
+    """
+    # Create dictionary for state and county level data
+    dfs = {}
+    # For state level data
+    df = pd.read_csv(base_url.format(sub_url="/", state=""),
+                     parse_dates=["date"])
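+    # State rows are coded like "US-CA"; keep the lowercase state abbreviation.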
+    df["geo_id"] = df["open_covid_region_code"].apply(
+        lambda x: x.split("-")[1].lower())
+    dfs["state"] = preprocess(df, "state")
+
+    # For county level data
+    dfList = []
+    for state in list(STATE_TO_ABBREV.keys()):
+        sub_url = "/subregions/" + "%20".join(state.split("_")) + "/"
+        dfList.append(pd.read_csv(base_url.format(sub_url=sub_url,
+                                                  state=state + "_"),
+                                  parse_dates=["date"]))
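+    # County rows are coded like "US-state-fips"; get_geo_id extracts the
+    # FIPS code and returns NaN for rows without one (dropped in preprocess).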
+    df = pd.concat(dfList)
+    df["geo_id"] = df["open_covid_region_code"].apply(get_geo_id)
+    dfs["county"] = preprocess(df, "county")
+
+    # Add District of Columbia County
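+    # The upstream source has no county level file for DC, so reuse the DC
+    # state level rows under the DC county FIPS code.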
+    try:
+        df_dc_county = dfs["state"][dfs["state"]["geo_id"] == "dc"].drop(
+            "geo_id", axis=1)
+        df_dc_county.loc[:, "geo_id"] = DC_FIPS
+        dfs["county"] = dfs["county"].append(df_dc_county)
+    except KeyError:
+        pass
+
+    return dfs
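
A minimal usage sketch for the new entry point. The import path and URL
template below are stand-ins, not values taken from this change; the real
base URL must contain the {sub_url} and {state} placeholders that
pull_gs_data fills in:

    from delphi_google_symptoms.pull import pull_gs_data  # hypothetical path

    # Hypothetical URL template; the real bucket address is configured elsewhere.
    base_url = ("https://example.com/covid19-search-trends{sub_url}"
                "2020_US_{state}daily_symptoms_dataset.csv")

    dfs = pull_gs_data(base_url)
    print(dfs["state"].head())   # geo_id, timestamp, Anosmia, Ageusia, combined_symptoms
    print(dfs["county"].head())  # geo_id holds county FIPS codes, now including DC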