Merge pull request #1279 from cmu-delphi/release/indicators_v0.1.18_utils_v0.1.13

krivard · web-flow · commit 952a4952571a · 2021-09-30T09:51:38.000-04:00
Release covidcast-indicators 0.1.18
diff --git a/.bumpversion.cfg b/.bumpversion.cfg
@@ -1,5 +1,5 @@
 [bumpversion]
-current_version = 0.1.17
+current_version = 0.1.18
 commit = True
 message = chore: bump covidcast-indicators to {new_version}
 tag = False
diff --git a/ansible/templates/facebook-params-prod.json.j2 b/ansible/templates/facebook-params-prod.json.j2
@@ -34,6 +34,7 @@
         "Survey of COVID-Like Illness - TODEPLOY- US Expansion - With Translations": "fb-survey",
         "Survey of COVID-Like Illness - Wave 10": "fb-survey",
         "Survey of COVID-Like Illness - Wave 11": "fb-survey",
+        "Survey of COVID-Like Illness - Wave 12": "fb-survey",
         "Survey of COVID-Like Illness - Wave 4": "fb-survey",
         "Survey of COVID-Like Illness - Wave 5": "fb-survey",
         "Survey of COVID-Like Illness - Wave 6": "fb-survey",
diff --git a/facebook/delphiFacebook/R/responses.R b/facebook/delphiFacebook/R/responses.R
@@ -130,6 +130,9 @@ load_response_one <- function(input_filename, params, contingency_run) {
                            Q80 = col_integer(),
                            I5 = col_character(),
                            I7 = col_character(),
+                           V1alt = col_character(),
+                           V15c = col_character(),
+                           P6 = col_character(),
                            E2_1 = col_integer(),
                            E2_2 = col_integer()
                          ),
@@ -162,6 +165,8 @@ load_response_one <- function(input_filename, params, contingency_run) {
   assert(length(wave) == 1, "can only code one wave at a time")
   
   input_data <- module_assignment(input_data, wave)
+  input_data <- experimental_arm_assignment(input_data, wave)
+  
   input_data <- bodge_v4_translation(input_data, wave)
   input_data <- bodge_C6_C8(input_data, wave)
   input_data <- bodge_B13(input_data, wave)
@@ -364,7 +369,8 @@ filter_data_for_aggregation <- function(df, params, lead_days = 12L)
                dplyr::between(.data$hh_number_sick, 0L, 30L),
                dplyr::between(.data$hh_number_total, 1L, 30L),
                .data$hh_number_sick <= .data$hh_number_total,
-               .data$day >= (as.Date(params$start_date) - lead_days)
+               .data$day >= (as.Date(params$start_date) - lead_days),
+               .data$wave != 12.5 # Ignore experimental Wave 12 data
   )
 
   msg_plain(paste0("Finished filtering data for aggregations"))
@@ -503,6 +509,28 @@ module_assignment <- function(input_data, wave) {
   return(input_data)
 }
 
+#' Label arms of experimental Wave 12.
+#' 
+#' @param input_data data frame of responses, before subsetting to select
+#'   variables; must contain `random_number_exp` when `wave` is 12.5
+#' @param wave numeric survey version; arms are labelled only when it is 12.5
+#' 
+#' @return `input_data`, with a new `w12_treatment` column when `wave` is 12.5
+#' @importFrom dplyr case_when
+experimental_arm_assignment <- function(input_data, wave) {
+  if (wave == 12.5) { # all other waves are returned unchanged
+    assert( "random_number_exp" %in% names(input_data) )
+    input_data$w12_treatment <- case_when(
+      input_data$random_number_exp >= 0.6666 ~ 1, # demographics placed after symptom items
+      input_data$random_number_exp >= 0.3333 ~ 2, # demographics placed after vaccine items
+      input_data$random_number_exp < 0.3333 ~ 3, # alternative wording to V1
+      TRUE ~ NA_real_ # no comparison was TRUE, e.g. random_number_exp is NA
+    )
+  }
+  
+  return(input_data)
+}
+
 #' Create dataset for sharing with research partners
 #'
 #' Different survey waves may have different sets of questions. Here we report
@@ -511,11 +539,13 @@ module_assignment <- function(input_data, wave) {
 #'
 #' @param input_data data frame of responses
 #' @param county_crosswalk crosswalk mapping ZIP5 to counties
+#' @param params list containing `produce_individual_raceeth`, indicating
+#'   whether or not to issue microdata with race-ethnicity field
 #' @importFrom stringi stri_trim stri_replace_all
 #' @importFrom dplyr left_join group_by filter ungroup select rename
 #'
 #' @export
-create_complete_responses <- function(input_data, county_crosswalk)
+create_complete_responses <- function(input_data, county_crosswalk, params)
 {
   cols_to_report <- c(
     "start_dt", "end_dt", "date",
@@ -541,9 +571,10 @@ create_complete_responses <- function(input_data, county_crosswalk)
     "B10c", "B13", "C18a", "C18b", "C7a", "D12", "E4",
     "G1", "G2", "G3", "H1", "H2", "H3", "I1", "I2", "I3", "I4", "I5",
     "I6_1", "I6_2", "I6_3", "I6_4", "I6_5", "I6_6", "I6_7", "I6_8",
-    "I7", "K1", "K2", "V11a", "V12a", "V15a", "V15b", "V16", "V3a", "module", # added in Wave 11
+    "I7", "K1", "K2", "V11a", "V12a", "V15a", "V15b", "V16", "V3a", # added in Wave 11
+    "V1alt", "B13a", "V15c", "P1", "P2", "P3", "P4", "P5", "P6", # added in experimental Wave 12
     
-    "raceethnicity", "token", "wave", "UserLanguage",
+    "raceethnicity", "token", "wave", "w12_treatment", "module", "UserLanguage",
     "zip5" # temporarily; we'll filter by this column later and then drop it before writing
   )
 
@@ -617,7 +648,10 @@ surveyID_to_wave <- Vectorize(function(surveyID) {
                 "SV_ddjHkcYrrLWgM2V" = 7,
                 "SV_ewAVaX7Wz3l0UqG" = 8,
                 "SV_6PADB8DyF9SIyXk" = 10,
-                "SV_4VEaeffqQtDo33M" = 11)
+                "SV_4VEaeffqQtDo33M" = 11,
+                "SV_3TL0r243mLkDzCK" = 12.5, # experimental version of Wave 12
+                "TBD finalized version" = 12 # finalized version of Wave 12
+  )
 
   if ( any(names(waves) == surveyID) ) {
       return(waves[[surveyID]])
@@ -667,9 +701,11 @@ filter_complete_responses <- function(data_full, params)
   data_full <- select(data_full, -.data$zip5)
 
   # 9 includes StartDatetime, EndDatetime, Date, token, wave, geo_id,
-  # UserLanguage + two questions (ignore raceethnicity field which may or may
-  # not exist, depending on params)
-  valid_row_filter <- rowSums( !is.na(data_full[, names(data_full) != "raceethnicity"]) ) >= 9
+  # UserLanguage + two questions (ignore raceethnicity, module, and
+  # w12_assignment fields which may or may not exist, depending on params and
+  # survey version). NOTE(review): the arm column is `w12_treatment` elsewhere.
+  ignore_cols <- c("raceethnicity", "w12_assignment", "module")
+  valid_row_filter <- rowSums( !is.na(data_full[, !(names(data_full) %in% ignore_cols)]) ) >= 9
   data_full <- data_full[valid_row_filter, ]
 
   return(data_full)
diff --git a/facebook/delphiFacebook/R/run.R b/facebook/delphiFacebook/R/run.R
@@ -31,7 +31,7 @@ run_facebook <- function(params)
   msg_df("response data to aggregate", data_agg)
 
   # create "complete" data that will be shared with research partners
-  data_full <- create_complete_responses(input_data, cw_list$county)
+  data_full <- create_complete_responses(input_data, cw_list$county, params)
   data_full <- filter_complete_responses(data_full, params)
   data_full <- join_weights(data_full, params, weights = "full")
   msg_df("full data to share with research partners", data_full)
diff --git a/facebook/delphiFacebook/man/create_complete_responses.Rd b/facebook/delphiFacebook/man/create_complete_responses.Rd
diff --git a/facebook/delphiFacebook/man/experimental_arm_assignment.Rd b/facebook/delphiFacebook/man/experimental_arm_assignment.Rd
diff --git a/facebook/delphiFacebook/unit-tests/testthat/test-responses.R b/facebook/delphiFacebook/unit-tests/testthat/test-responses.R
@@ -124,15 +124,17 @@ test_that("filter_data_for_aggregation works correctly", {
     hh_number_sick = c(0, NA, 4, -5, 55, 5, 5, 5, 3, 3, 0, 30, 1),
     hh_number_total = c(1, 4, NA, 5, 5, -5, 100, 5, 5, 1, 1, 30, 1),
     day = c("2021-01-01", "2021-01-01", "2021-01-02", "2021-01-01", "2021-01-01", "2021-01-01", "2021-01-01", "2021-01-01", "2021-01-01", "2021-01-01", "2021-01-01", "2021-01-01", "2020-01-01"),
-    date = c("2021-01-01", "2021-01-01", "2021-01-02", "2021-01-01", "2021-01-01", "2021-01-01", "2021-01-01", "2021-01-01", "2021-01-01", "2021-01-01", "2021-01-01", "2021-01-01", "2020-01-01")
+    date = c("2021-01-01", "2021-01-01", "2021-01-02", "2021-01-01", "2021-01-01", "2021-01-01", "2021-01-01", "2021-01-01", "2021-01-01", "2021-01-01", "2021-01-01", "2021-01-01", "2020-01-01"),
+    wave = 12
   )
   
   expected <- tibble(
     zip5 = c("10001", "10001", "10001", "10001"),
     hh_number_sick = c(5, 3, 0, 30),
     hh_number_total = c(5, 5, 1, 30),
     day = c("2021-01-01", "2021-01-01", "2021-01-01", "2021-01-01"),
-    date = c("2021-01-01", "2021-01-01", "2021-01-01", "2021-01-01")
+    date = c("2021-01-01", "2021-01-01", "2021-01-01", "2021-01-01"),
+    wave = 12
   )
 
   expect_equal(filter_data_for_aggregation(input, params),