
Commit 19b3737

Jingjing Tang authored and committed
set default pull start date to 2020-01-05
1 parent (92a750c) · commit 19b3737

3 files changed: 61 additions & 41 deletions


google_health/delphi_google_health/constants.py

Lines changed: 2 additions & 0 deletions
@@ -10,3 +10,5 @@
 
 SIGNALS = [RAW, SMOOTHED]
 GEO_TYPES = [STATE, HRR, MSA, DMA]
+
+PULL_START_DATE = "2020-01-05"
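
The new constant pins the earliest date requested from the Google Health Trends API, independent of the run's configured export start date. It is stored as a plain "YYYY-MM-DD" string and parsed with `datetime.strptime` wherever a date object is needed, as `export.py` does below. A minimal sketch of that round trip (illustrative only, not code from this commit):

```python
from datetime import datetime

PULL_START_DATE = "2020-01-05"

# strptime raises ValueError if the constant ever drifts from the
# "YYYY-MM-DD" format that export.py expects.
pull_start = datetime.strptime(PULL_START_DATE, "%Y-%m-%d")
print(pull_start)  # 2020-01-05 00:00:00
```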

google_health/delphi_google_health/export.py

Lines changed: 17 additions & 9 deletions
@@ -1,6 +1,8 @@
 # -*- coding: utf-8 -*-
 """Function to export the dataset in the format expected of the API.
 """
+from datetime import datetime
+
 import numpy as np
 import pandas as pd
 
@@ -10,7 +12,8 @@
 
 
 def export_csv(
-    df: pd.DataFrame, geo_name: str, sensor: str, smooth: bool, receiving_dir: str
+    df: pd.DataFrame, geo_name: str, sensor: str, smooth: bool,
+    start_date: str, receiving_dir: str
 ) -> None:
     """Export data set in format expected for injestion by the API
 
@@ -27,6 +30,8 @@ def export_csv(
         name of the sensor; only used for naming the output file
     smooth: bool
         should the signal in "val" be smoothed?
+    start_date: str
+        Output start date as a string formated as "YYYY-MM-DD"
     receiving_dir: str
         path to location where the output CSV files to be uploaded should be stored
     """
@@ -39,13 +44,16 @@ def export_csv(
     df["val"] /= RESCALE_VAL
     df["se"] = np.nan
     df["sample_size"] = np.nan
+
+    start_date = datetime.strptime(start_date, "%Y-%m-%d")
 
     for date in df["timestamp"].unique():
-        date_short = date.replace("-", "")
-        export_fn = f"{date_short}_{geo_name}_{sensor}.csv"
-        df[df["timestamp"] == date][["geo_id", "val", "se", "sample_size"]].to_csv(
-            f"{receiving_dir}/{export_fn}",
-            index=False,
-            na_rep="NA",
-            float_format="%.8f",
-        )
+        if datetime.strptime(date, "%Y-%m-%d") >= start_date:
+            date_short = date.replace("-", "")
+            export_fn = f"{date_short}_{geo_name}_{sensor}.csv"
+            df[df["timestamp"] == date][["geo_id", "val", "se", "sample_size"]].to_csv(
+                f"{receiving_dir}/{export_fn}",
+                index=False,
+                na_rep="NA",
+                float_format="%.8f",
+            )
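
The behavioral change is the per-date guard: each unique timestamp is now compared against the parsed `start_date` before its CSV is written, presumably so the extra history pulled from `PULL_START_DATE` onward never reaches the receiving directory. A self-contained sketch of just that filter (toy data, and a print in place of `to_csv`; not the full `export_csv`, whose upstream column handling is outside this diff):

```python
from datetime import datetime

import pandas as pd

# Toy frame with the columns the filter touches.
df = pd.DataFrame({
    "geo_id": ["ca", "ca", "ny"],
    "timestamp": ["2020-01-04", "2020-02-05", "2020-02-05"],
    "val": [1.0, 2.0, 3.0],
})

start_date = datetime.strptime("2020-02-01", "%Y-%m-%d")

# Mirrors the loop in export_csv: one output file per retained date.
for date in df["timestamp"].unique():
    if datetime.strptime(date, "%Y-%m-%d") >= start_date:
        date_short = date.replace("-", "")
        print(f"would write {date_short}_state_raw_search.csv")
# Only 2020-02-05 passes; 2020-01-04 precedes start_date and is skipped.
```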

google_health/delphi_google_health/run.py

Lines changed: 42 additions & 32 deletions
@@ -17,7 +17,9 @@
 from .pull_api import GoogleHealthTrends, get_counts_states, get_counts_dma
 from .map_values import derived_counts_from_dma
 from .export import export_csv
-from .constants import SIGNALS, RAW, SMOOTHED, MSA, HRR, STATE, DMA
+from .constants import (SIGNALS, RAW, SMOOTHED,
+                        MSA, HRR, STATE, DMA,
+                        PULL_START_DATE)
 
 
 def run_module():
@@ -39,12 +41,12 @@ def run_module():
     wip_signal = params["wip_signal"]
     cache_dir = params["cache_dir"]
 
-    arch_diff = S3ArchiveDiffer(
-        cache_dir, export_dir,
-        params["bucket_name"], "ght",
-        params["aws_credentials"])
-    arch_diff.update_cache()
-    print(arch_diff)
+    # arch_diff = S3ArchiveDiffer(
+    #     cache_dir, export_dir,
+    #     params["bucket_name"], "ght",
+    #     params["aws_credentials"])
+    # arch_diff.update_cache()
+    # print(arch_diff)
     # if missing start_date, set to today (GMT) minus 5 days
     if start_date == "":
         now = datetime.datetime.now(datetime.timezone.utc)
@@ -69,10 +71,10 @@ def run_module():
 
     # read data frame version of the data
     df_state = get_counts_states(
-        ght, start_date, end_date, static_dir=static_dir, data_dir=data_dir
+        ght, PULL_START_DATE, end_date, static_dir=static_dir, data_dir=data_dir
     )
     df_dma = get_counts_dma(
-        ght, start_date, end_date, static_dir=static_dir, data_dir=data_dir
+        ght, PULL_START_DATE, end_date, static_dir=static_dir, data_dir=data_dir
     )
     df_hrr, df_msa = derived_counts_from_dma(df_dma, static_dir=static_dir)
 
@@ -81,27 +83,35 @@ def run_module():
     for signal in signal_names:
         if signal.endswith(SMOOTHED):
             # export each geographic region, with both smoothed and unsmoothed data
-            export_csv(df_state, STATE, signal, smooth=True, receiving_dir=export_dir)
-            export_csv(df_dma, DMA, signal, smooth=True, receiving_dir=export_dir)
-            export_csv(df_hrr, HRR, signal, smooth=True, receiving_dir=export_dir)
-            export_csv(df_msa, MSA, signal, smooth=True, receiving_dir=export_dir)
+            export_csv(df_state, STATE, signal, smooth=True,
+                       start_date=start_date, receiving_dir=export_dir)
+            export_csv(df_dma, DMA, signal, smooth=True,
+                       start_date=start_date, receiving_dir=export_dir)
+            export_csv(df_hrr, HRR, signal, smooth=True,
+                       start_date=start_date, receiving_dir=export_dir)
+            export_csv(df_msa, MSA, signal, smooth=True,
+                       start_date=start_date, receiving_dir=export_dir)
         elif signal.endswith(RAW):
-            export_csv(df_state, STATE, signal, smooth=False, receiving_dir=export_dir)
-            export_csv(df_dma, DMA, signal, smooth=False, receiving_dir=export_dir)
-            export_csv(df_hrr, HRR, signal, smooth=False, receiving_dir=export_dir)
-            export_csv(df_msa, MSA, signal, smooth=False, receiving_dir=export_dir)
-    # Diff exports, and make incremental versions
-    _, common_diffs, new_files = arch_diff.diff_exports()
-
-    # Archive changed and new files only
-    to_archive = [f for f, diff in common_diffs.items() if diff is not None]
-    to_archive += new_files
-    _, fails = arch_diff.archive_exports(to_archive)
-
-    # Filter existing exports to exclude those that failed to archive
-    succ_common_diffs = {f: diff for f, diff in common_diffs.items() if f not in fails}
-    arch_diff.filter_exports(succ_common_diffs)
-
-    # Report failures: someone should probably look at them
-    for exported_file in fails:
-        print(f"Failed to archive '{exported_file}'")
+            export_csv(df_state, STATE, signal, smooth=False,
+                       start_date=start_date, receiving_dir=export_dir)
+            export_csv(df_dma, DMA, signal, smooth=False,
+                       start_date=start_date, receiving_dir=export_dir)
+            export_csv(df_hrr, HRR, signal, smooth=False,
+                       start_date=start_date, receiving_dir=export_dir)
+            export_csv(df_msa, MSA, signal, smooth=False,
+                       start_date=start_date, receiving_dir=export_dir)
+    # # Diff exports, and make incremental versions
+    # _, common_diffs, new_files = arch_diff.diff_exports()
+    #
+    # # Archive changed and new files only
+    # to_archive = [f for f, diff in common_diffs.items() if diff is not None]
+    # to_archive += new_files
+    # _, fails = arch_diff.archive_exports(to_archive)
+    #
+    # # Filter existing exports to exclude those that failed to archive
+    # succ_common_diffs = {f: diff for f, diff in common_diffs.items() if f not in fails}
+    # arch_diff.filter_exports(succ_common_diffs)
+    #
+    # # Report failures: someone should probably look at them
+    # for exported_file in fails:
+    #     print(f"Failed to archive '{exported_file}'")
