climatological-baseline submission process

dsweber2 · dsweber2 · commit a6de31ae215d · 2024-12-17T14:58:53.000-06:00
diff --git a/R/forecasters/formatters.R b/R/forecasters/formatters.R
@@ -64,7 +64,7 @@ format_flusight <- function(pred, disease = c("flu", "covid")) {
     mutate(
       reference_date = get_forecast_reference_date(forecast_date),
       target = glue::glue("wk inc {disease} hosp"),
-      horizon = floor((target_end_date - reference_date) / 7),
+      horizon = as.integer(floor((target_end_date - reference_date) / 7)),
       output_type = "quantile",
       output_type_id = quantile,
       value = value
diff --git a/R/utils.R b/R/utils.R
@@ -252,11 +252,11 @@ filter_forecast_geos <- function(forecasts, truth_data) {
 }
 
 #' Write a submission file. pred is assumed to be in the correct submission format.
-write_submission_file <- function(pred, forecast_reference_date, submission_directory) {
+write_submission_file <- function(pred, forecast_reference_date, submission_directory, file_name = "CMU-TimeSeries") {
   if (!file.exists(submission_directory)) {
     cli::cli_abort("Submission directory does not exist.", call = rlang::current_call())
   }
-  file_path <- file.path(submission_directory, sprintf("%s-CMU-TimeSeries.csv", forecast_reference_date))
+  file_path <- file.path(submission_directory, sprintf("%s-%s.csv", forecast_reference_date, file_name))
   if (file.exists(file_path)) {
     cli::cli_warn(c("Overwriting existing file in", file_path), call = rlang::current_call())
     file.remove(file_path)
diff --git a/scripts/covid_hosp_explore.R b/scripts/covid_hosp_explore.R
@@ -4,13 +4,12 @@ source("scripts/targets-exploration-common.R")
 # These globals are needed by make_forecasts_and_scores (and they need to persist
 # during the actual targets run, since the commands are frozen as expressions).
 hhs_signal <- "confirmed_admissions_covid_1d"
-ref_time_values_ <- as.Date(c("2023-11-08", "2023-11-22"))
 if (!exists("ref_time_values_")) {
   # Alternatively you can let slide_forecaster figure out ref_time_values
   start_date <- as.Date("2023-10-04")
   end_date <- as.Date("2024-04-24")
   date_step <- 7L
-  ref_time_values_ <- NULL
+  #ref_time_values_ <- as.Date(c("2023-11-08", "2023-11-22"))
 }
 time_value_adjust <- 3 # this moves the week marker from Saturday to Wednesday
 
diff --git a/scripts/covid_hosp_prod.R b/scripts/covid_hosp_prod.R
@@ -8,7 +8,7 @@ insufficient_data_geos <- c("as", "mp", "vi", "gu")
 # date to cut the truth data off at, so we don't have too much of the past
 truth_data_date <- "2023-09-01"
 # Forecast generation dates: weekly (every 7 days) from 2024-11-20 through today
-forecast_generation_date <- Sys.Date()
+forecast_generation_date <- seq.Date(as.Date("2024-11-20"), Sys.Date(), by = 7L)
 
 forecaster_fns <- list2(
   linear = function(...) {
@@ -29,12 +29,6 @@ forecaster_fns <- list2(
     )
   },
 )
-geo_forecasters_weights <- parse_prod_weights(here::here("covid_geo_exclusions.csv"), forecast_generation_date)
-geo_exclusions <- exclude_geos(geo_forecasters_weights)
-if (nrow(geo_forecasters_weights %>% filter(forecast_date == forecast_generation_date)) == 0) {
-  cli_abort("there are no weights  for the forecast date {forecast_generation_date}")
-}
-
 
 rlang::list2(
   tar_target(
@@ -56,6 +50,23 @@ rlang::list2(
       )
     ),
     names = "forecast_generation_date",
+    tar_target(
+      name = geo_forecasters_weights,
+      command = {
+        geo_forecasters_weights <- parse_prod_weights(here::here("covid_geo_exclusions.csv"), forecast_generation_date)
+        if (nrow(geo_forecasters_weights %>% filter(forecast_date == forecast_generation_date)) == 0) {
+          cli_abort("there are no weights  for the forecast date {forecast_generation_date}")
+        }
+        geo_forecasters_weights
+      },
+      cue = tar_cue(mode = "always")
+    ),
+    tar_target(
+      name = geo_exclusions,
+      command = {
+        exclude_geos(geo_forecasters_weights)
+      }
+    ),
     tar_target(
       nhsn_latest_data,
       command = {
@@ -123,6 +134,23 @@ rlang::list2(
       },
       cue = tar_cue(mode = "always")
     ),
+    tar_target(
+      name = make_climate_submission_csv,
+      command = {
+        # Average the climate forecasters; file_name must match the path validate_climate_result checks.
+        forecast_res %>%
+          filter(forecaster %in% c("climate_base", "climate_geo_agged")) %>%
+          group_by(geo_value, target_end_date, quantile) %>%
+          summarize(forecast_date = first(forecast_date), value = mean(value, na.rm = TRUE), .groups = "drop") %>%
+          format_flusight(disease = "covid") %>%
+          write_submission_file(
+            get_forecast_reference_date(as.Date(forecast_generation_date)),
+            file.path(submission_directory, "model-output/CMU-climatological-baseline"),
+            file_name = "CMU-climatological-baseline"
+          )
+      },
+      cue = tar_cue(mode = "always")
+    ),
     tar_target(
       name = validate_result,
       command = {
@@ -139,6 +167,22 @@ rlang::list2(
       },
       cue = tar_cue(mode = "always")
     ),
+    tar_target(
+      name = validate_climate_result,
+      command = {
+        make_climate_submission_csv
+        # only validate if we're saving the result to a hub
+        if (submission_directory != "cache") {
+          validation <- validate_submission(
+            submission_directory,
+            file_path = sprintf("CMU-climatological-baseline/%s-CMU-climatological-baseline.csv", get_forecast_reference_date(as.Date(forecast_generation_date))))
+        } else {
+          validation <- "not validating when there is no hub (set submission_directory)"
+        }
+        validation
+      },
+      cue = tar_cue(mode = "always")
+    ),
     tar_target(
       name = truth_data,
       command = {
diff --git a/scripts/flu_hosp_prod.R b/scripts/flu_hosp_prod.R
@@ -9,7 +9,7 @@ truth_data_date <- "2023-09-01"
 # needed to create the aux data targets
 end_date <- Sys.Date()
 # Forecast generation dates: weekly (every 7 days) from 2024-11-20 through today
-forecast_generation_date <- Sys.Date()
+forecast_generation_date <- seq.Date(as.Date("2024-11-20"), Sys.Date(), by = 7L)
 very_latent_locations <- list(list(
   c("source"),
   c("flusurv", "ILI+")
@@ -50,12 +50,6 @@ forecaster_fns <- list2(
       mutate(target_end_date = target_end_date + 3)
   },
 )
-geo_forecasters_weights <- parse_prod_weights(here::here("flu_geo_exclusions.csv"), forecast_generation_date)
-geo_exclusions <- exclude_geos(geo_forecasters_weights)
-if (nrow(geo_forecasters_weights %>% filter(forecast_date == forecast_generation_date)) == 0) {
-  geo_forecasters_weights
-  cli_abort("there are no weights  for the forecast date {forecast_generation_date}")
-}
 
 # This is needed to build the data archive
 ref_time_values_ <- seq.Date(as.Date("2023-10-04"), as.Date("2024-04-24"), by = 7L)
@@ -107,6 +101,23 @@ rlang::list2(
   tar_map(
     values = tidyr::expand_grid(tibble(forecast_generation_date = forecast_generation_date)),
     names = "forecast_generation_date",
+    tar_target(
+      name = geo_forecasters_weights,
+      command = {
+        geo_forecasters_weights <- parse_prod_weights(here::here("flu_geo_exclusions.csv"), forecast_generation_date)
+        if (nrow(geo_forecasters_weights %>% filter(forecast_date == forecast_generation_date)) == 0) {
+          cli_abort("there are no weights  for the forecast date {forecast_generation_date}")
+        }
+        geo_forecasters_weights
+      },
+      cue = tar_cue(mode = "always")
+    ),
+    tar_target(
+      name = geo_exclusions,
+      command = {
+        exclude_geos(geo_forecasters_weights)
+      }
+    ),
     tar_target(
       forecast_res,
       command = {
@@ -162,6 +173,23 @@ rlang::list2(
       },
       cue = tar_cue(mode = "always")
     ),
+    tar_target(
+      name = make_climate_submission_csv,
+      command = {
+        # Average the climate forecasters; file_name must match the path validate_climate_result checks.
+        forecast_res %>%
+          filter(forecaster %in% c("climate_base", "climate_geo_agged")) %>%
+          group_by(geo_value, target_end_date, quantile) %>%
+          summarize(forecast_date = first(forecast_date), value = mean(value, na.rm = TRUE), .groups = "drop") %>%
+          format_flusight(disease = "flu") %>%
+          write_submission_file(
+            get_forecast_reference_date(as.Date(forecast_generation_date)),
+            file.path(submission_directory, "model-output/CMU-climatological-baseline"),
+            file_name = "CMU-climatological-baseline"
+          )
+      },
+      cue = tar_cue(mode = "always")
+    ),
     tar_target(
       name = validate_result,
       command = {
@@ -179,6 +207,22 @@ rlang::list2(
       },
       cue = tar_cue(mode = "always")
     ),
+    tar_target(
+      name = validate_climate_result,
+      command = {
+        make_climate_submission_csv
+        # only validate if we're saving the result to a hub
+        if (submission_directory != "cache") {
+          validation <- validate_submission(
+            submission_directory,
+            file_path = sprintf("CMU-climatological-baseline/%s-CMU-climatological-baseline.csv", get_forecast_reference_date(as.Date(forecast_generation_date))))
+        } else {
+          validation <- "not validating when there is no hub (set submission_directory)"
+        }
+        validation
+      },
+      cue = tar_cue(mode = "always")
+    ),
     tar_target(
       name = truth_data,
       command = {

Original file line number	Diff line number	Diff line change
`@@ -252,11 +252,11 @@ filter_forecast_geos <- function(forecasts, truth_data) {`
`252`	`252`	`}`
`253`	`253`
`254`	`254`	`#' Write a submission file. pred is assumed to be in the correct submission format.`
`255`		`-write_submission_file <- function(pred, forecast_reference_date, submission_directory) {`
	`255`	`+write_submission_file <- function(pred, forecast_reference_date, submission_directory, file_name = "CMU-TimeSeries") {`
`256`	`256`	`if (!file.exists(submission_directory)) {`
`257`	`257`	`cli::cli_abort("Submission directory does not exist.", call = rlang::current_call())`
`258`	`258`	`}`
`259`		`- file_path <- file.path(submission_directory, sprintf("%s-CMU-TimeSeries.csv", forecast_reference_date))`
	`259`	`+ file_path <- file.path(submission_directory, sprintf("%s-%s.csv", forecast_reference_date, file_name))`
`260`	`260`	`if (file.exists(file_path)) {`
`261`	`261`	`cli::cli_warn(c("Overwriting existing file in", file_path), call = rlang::current_call())`
`262`	`262`	`file.remove(file_path)`