Skip to content

Commit 5f76fe0

Browse files
authored
Merge pull request #1370 from cmu-delphi/release/indicators_v0.2.6_utils_v0.2.4
Release covidcast-indicators 0.2.6
2 parents a382f85 + 28db784 commit 5f76fe0

File tree

7 files changed

+94
-22
lines changed

7 files changed

+94
-22
lines changed

.bumpversion.cfg

Lines changed: 1 addition & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -1,5 +1,5 @@
11
[bumpversion]
2-
current_version = 0.2.5
2+
current_version = 0.2.6
33
commit = True
44
message = chore: bump covidcast-indicators to {new_version}
55
tag = False

changehc/delphi_changehc/config.py

Lines changed: 1 addition & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -19,7 +19,7 @@ class Config:
1919
BURN_IN_PERIOD = timedelta(days=1)
2020

2121
# shift dates forward for labeling purposes
22-
DAY_SHIFT = timedelta(days=1)
22+
DAY_SHIFT = timedelta(days=0)
2323

2424
## data columns
2525
COVID_COL = "COVID"

changehc/delphi_changehc/update_sensor.py

Lines changed: 5 additions & 3 deletions
Original file line numberDiff line numberDiff line change
@@ -155,11 +155,13 @@ def geo_reindex(self, data):
155155
Config.MIN_DEN,
156156
Config.MAX_BACKFILL_WINDOW,
157157
thr_col="den",
158-
mega_col=geo)
158+
mega_col=geo,
159+
date_col=Config.DATE_COL)
159160
elif geo == "state":
160-
data_frame = gmpr.replace_geocode(data, "fips", "state_id", new_col="state")
161+
data_frame = gmpr.replace_geocode(data, "fips", "state_id", new_col="state",
162+
date_col=Config.DATE_COL)
161163
else:
162-
data_frame = gmpr.replace_geocode(data, "fips", geo)
164+
data_frame = gmpr.replace_geocode(data, "fips", geo, date_col=Config.DATE_COL)
163165

164166
unique_geo_ids = pd.unique(data_frame[geo])
165167
data_frame.set_index([geo, Config.DATE_COL],inplace=True)

changehc/tests/test_update_sensor.py

Lines changed: 47 additions & 6 deletions
Original file line numberDiff line numberDiff line change
@@ -132,6 +132,47 @@ def test_update_sensor(self):
132132
assert outputs["20200319_hhs_smoothed_outpatient_covid.csv"].empty
133133
assert outputs["20200319_nation_smoothed_outpatient_covid.csv"].empty
134134

135+
def test_update_sensor_output_daterange(self):
136+
        """Tests that output does not change when the date range changes"""
137+
small_test_data = pd.DataFrame({
138+
"num": [0, 100, 200, 300, 400, 500, 600, 100, 200, 300, 400, 500, 600] * 2,
139+
"fips": ["01001"] * 13 + ["42003"] * 13,
140+
"den": [30, 50, 50, 10, 1, 5, 5, 50, 50, 50, 0, 0, 0] * 2,
141+
"timestamp": list(pd.date_range("20200301", "20200313")) * 2
142+
}).set_index(["fips", "timestamp"])
143+
startdates = ["2020-03-01", "2020-03-05"]
144+
outputs = {s:{} for s in startdates}
145+
for startdate in startdates:
146+
for geo in ["county", "state", "hhs", "nation"]:
147+
td = TemporaryDirectory()
148+
su_inst = CHCSensorUpdater(
149+
startdate,
150+
"03-22-2020",
151+
"03-27-2020",
152+
geo,
153+
self.parallel,
154+
self.weekday,
155+
self.numtype,
156+
self.se,
157+
"",
158+
TEST_LOGGER
159+
)
160+
su_inst.update_sensor(small_test_data.copy(), td.name)
161+
for f in os.listdir(td.name):
162+
outputs[startdate][f] = pd.read_csv(os.path.join(td.name, f))
163+
assert len(os.listdir(td.name)) == len(su_inst.sensor_dates),\
164+
f"failed {geo} update sensor test"
165+
td.cleanup()
166+
167+
def pretty(key):
168+
return "\n".join(f"{s}[{key}]: {len(outputs[s][key])}" for s in startdates)
169+
for f in outputs[startdates[-1]]:
170+
assert len(outputs[startdates[0]][f]) == len(outputs[startdates[1]][f]), \
171+
f"\n{pretty(f)}"
172+
assert np.array_equal(
173+
outputs[startdates[0]][f].val.values,
174+
outputs[startdates[1]][f].val.values
175+
), f
135176

136177
class TestWriteToCsv:
137178
"""Tests for writing output files to CSV."""
@@ -141,7 +182,7 @@ def test_write_to_csv_results(self):
141182
"val": [0.1, 0.5, 1.5] + [1, 2, 3],
142183
"se": [0.1, 1, 1.1] + [0.5, np.nan, 0.5],
143184
"sample_size": [np.nan] * 6,
144-
"timestamp": pd.to_datetime(["2020-05-01", "2020-05-02", "2020-05-04"] * 2),
185+
"timestamp": pd.to_datetime(["2020-05-02", "2020-05-03", "2020-05-05"] * 2),
145186
"include": [True, True, True] + [True, False, True],
146187
"geo_id": ["a"] * 3 + ["b"] * 3,
147188
})
@@ -197,7 +238,7 @@ def test_write_to_csv_with_se_results(self):
197238
"val": [0.1, 0.5, 1.5] + [1, 2, 3],
198239
"se": [0.1, 1, 1.1] + [0.5, np.nan, 0.5],
199240
"sample_size": [np.nan] * 6,
200-
"timestamp": pd.to_datetime(["2020-05-01", "2020-05-02", "2020-05-04"] * 2),
241+
"timestamp": pd.to_datetime(["2020-05-02", "2020-05-03", "2020-05-05"] * 2),
201242
"include": [True, True, True] + [True, False, True],
202243
"geo_id": ["a"] * 3 + ["b"] * 3,
203244
})
@@ -231,7 +272,7 @@ def test_write_to_csv_wrong_results(self):
231272
"val": [0.1, 0.5, 1.5] + [1, 2, 3],
232273
"se": [0.1, 1, 1.1] + [0.5, 0.5, 0.5],
233274
"sample_size": [np.nan] * 6,
234-
"timestamp": pd.to_datetime(["2020-05-01", "2020-05-02", "2020-05-04"] * 2),
275+
"timestamp": pd.to_datetime(["2020-05-02", "2020-05-03", "2020-05-05"] * 2),
235276
"include": [True, True, True] + [True, False, True],
236277
"geo_id": ["a"] * 3 + ["b"] * 3,
237278
}).set_index(["timestamp", "geo_id"]).sort_index()
@@ -241,7 +282,7 @@ def test_write_to_csv_wrong_results(self):
241282
# nan value for included loc-date
242283
res1 = res0.copy()
243284
res1 = res1[res1['include']]
244-
res1.loc[("2020-05-01", "a"), "val"] = np.nan
285+
res1.loc[("2020-05-02", "a"), "val"] = np.nan
245286
res1.reset_index(inplace=True)
246287
with pytest.raises(AssertionError):
247288
write_to_csv(
@@ -257,7 +298,7 @@ def test_write_to_csv_wrong_results(self):
257298
# nan se for included loc-date
258299
res2 = res0.copy()
259300
res2 = res2[res2['include']]
260-
res2.loc[("2020-05-01", "a"), "se"] = np.nan
301+
res2.loc[("2020-05-02", "a"), "se"] = np.nan
261302
res2.reset_index(inplace=True)
262303
with pytest.raises(AssertionError):
263304
write_to_csv(
@@ -273,7 +314,7 @@ def test_write_to_csv_wrong_results(self):
273314
# large se value
274315
res3 = res0.copy()
275316
res3 = res3[res3['include']]
276-
res3.loc[("2020-05-01", "a"), "se"] = 10
317+
res3.loc[("2020-05-02", "a"), "se"] = 10
277318
res3.reset_index(inplace=True)
278319
with pytest.raises(AssertionError):
279320
write_to_csv(

facebook/delphiFacebook/R/responses.R

Lines changed: 13 additions & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -18,13 +18,25 @@
1818
#' @export
1919
load_responses_all <- function(params, contingency_run = FALSE) {
2020
msg_plain(paste0("Loading ", length(params$input), " CSVs"))
21-
21+
2222
map_fn <- if (params$parallel) { mclapply } else { lapply }
2323
input_data <- map_fn(seq_along(params$input), function(i) {
2424
load_response_one(params$input[i], params, contingency_run)
2525
})
2626

2727
msg_plain(paste0("Finished loading CSVs"))
28+
29+
which_errors <- unlist(lapply(input_data, inherits, "try-error"))
30+
if (any( which_errors )) {
31+
errored_filenames <- paste(params$input[which_errors], collapse=", ")
32+
stop(
33+
"ingestion and field creation failed for at least one of input data file(s) ",
34+
errored_filenames,
35+
" with error(s)\n",
36+
unique(input_data[which_errors])
37+
)
38+
}
39+
2840
input_data <- bind_rows(input_data)
2941
msg_plain(paste0("Finished combining CSVs"))
3042
return(input_data)

facebook/delphiFacebook/integration-tests/testthat/test-integration.R

Lines changed: 7 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -385,3 +385,10 @@ test_that("testing national aggregation", {
385385
}
386386

387387
})
388+
389+
test_that("testing load_responses behavior for missing input", {
390+
params <- relativize_params(read_params(test_path("params-test.json")))
391+
params$input <- c(params$input, "file-does-not-exist.csv")
392+
params$parallel <- TRUE
393+
expect_error(load_responses_all(params), regexp="ingestion and field creation failed")
394+
})

facebook/micro/monthly-archive.sh

Lines changed: 20 additions & 10 deletions
Original file line numberDiff line numberDiff line change
@@ -8,13 +8,23 @@ else
88
fi
99
echo ${MONTH}
1010
R_MONTH=${MONTH#*_}; R_MONTH=${R_MONTH#0}
11-
BATCH="cd fb-public-results\nls -1 cvid_responses_${MONTH}*.gz"
12-
sftp -b <(echo -e "${BATCH}") -P 2222 fb-automation@ftp.delphi.cmu.edu 2>/dev/null | \
13-
grep "^cvid" | \
14-
awk -F_ 'BEGIN{print "cd fb-public-results"} {key=$3 $4 $5; if (key!=last && last!="") {print record} last=key; record=$0} END{print record}' | \
15-
sed '/^cvid/ s/^/get /' >fetch.sftp
16-
sftp -b fetch.sftp -P 2222 fb-automation@ftp.delphi.cmu.edu
17-
OUT=${MONTH/_/-}
18-
Rscript ../monthly-files.R ${MONTH%_*} ${R_MONTH} . >${OUT}.csv
19-
gzip ${OUT}.csv
20-
sftp -b <(echo -e "cd fb-public-results\nput ${OUT}.csv.gz") -P 2222 fb-automation@ftp.delphi.cmu.edu
11+
12+
perform_rollup_and_post ()
13+
{
14+
BATCH="cd $1\nls -1 cvid_responses_${MONTH}*.gz"
15+
sftp -b <(echo -e "${BATCH}") -P 2222 fb-automation@ftp.delphi.cmu.edu 2>/dev/null | \
16+
grep "^cvid" | \
17+
awk -F_ -vDIR="$1" 'BEGIN{print "cd " DIR} {key=$3 $4 $5; if (key!=last && last!="") {print record} last=key; record=$0} END{print record}' | \
18+
sed '/^cvid/ s/^/get /' >fetch.sftp
19+
sftp -b fetch.sftp -P 2222 fb-automation@ftp.delphi.cmu.edu
20+
OUT=${MONTH/_/-}$2
21+
Rscript ../monthly-files.R ${MONTH%_*} ${R_MONTH} . >${OUT}.csv
22+
gzip ${OUT}.csv
23+
sftp -b <(echo -e "cd $1\nput ${OUT}.csv.gz") -P 2222 fb-automation@ftp.delphi.cmu.edu
24+
rm -rf $1
25+
mkdir $1
26+
mv *.gz $1/
27+
}
28+
29+
perform_rollup_and_post "fb-public-results" ""
30+
perform_rollup_and_post "protected-race-ethnicity-data" "-race-ethnicity"

0 commit comments

Comments (0)