cmu-delphi
diff --git a/‎R/aux_data_utils.R‎
Lines changed: 27 additions & 14 deletions b/‎R/aux_data_utils.R‎
Lines changed: 27 additions & 14 deletions
diff --git a/‎R/forecasters/epipredict_utilities.R‎
Lines changed: 5 additions & 0 deletions b/‎R/forecasters/epipredict_utilities.R‎
Lines changed: 5 additions & 0 deletions
diff --git a/‎R/forecasters/forecaster_scaled_pop.R‎
Lines changed: 4 additions & 0 deletions b/‎R/forecasters/forecaster_scaled_pop.R‎
Lines changed: 4 additions & 0 deletions
diff --git a/‎R/imports.R‎
Lines changed: 1 addition & 0 deletions b/‎R/imports.R‎
Lines changed: 1 addition & 0 deletions
diff --git a/‎R/looping.R‎
Lines changed: 1 addition & 87 deletions b/‎R/looping.R‎
Lines changed: 1 addition & 87 deletions
diff --git a/‎R/scoring.R‎
Lines changed: 34 additions & 13 deletions b/‎R/scoring.R‎
Lines changed: 34 additions & 13 deletions
@@ -188,38 +188,56 @@ daily_to_weekly <- function(epi_df, agg_method = c("sum", "mean"), day_of_week =
     select(-epiweek, -year)
 }
 
+#' Aggregate a daily archive to a weekly archive.
+#'
+#' @param epi_arch the archive to aggregate.
+#' @param agg_columns the columns to aggregate.
+#' @param agg_method the method to use to aggregate the data, one of "sum" or "mean".
+#' @param day_of_week the day of the week to use as the reference day.
+#' @param day_of_week_end the day of the week to use as the end of the week.
 daily_to_weekly_archive <- function(epi_arch,
                                     agg_columns,
                                     agg_method = c("sum", "mean"),
                                     day_of_week = 4L,
                                     day_of_week_end = 7L) {
+  # How to aggregate the windowed data.
   agg_method <- arg_match(agg_method)
-  keys <- key_colnames(epi_arch, exclude = "time_value")
+  # The columns we will later group by when aggregating.
+  keys <- key_colnames(epi_arch, exclude = c("time_value", "version"))
+  # The versions we will slide over.
   ref_time_values <- epi_arch$DT$version %>%
     unique() %>%
     sort()
+  # Choose a fast function to use to slide and aggregate.
   if (agg_method == "sum") {
     slide_fun <- epi_slide_sum
   } else if (agg_method == "mean") {
     slide_fun <- epi_slide_mean
   }
-  too_many_tibbles <- epix_slide(
+  # Slide over the versions and aggregate.
+  epix_slide(
     epi_arch,
-    .before = 99999999L,
     .versions = ref_time_values,
-    function(x, group, ref_time) {
+    function(x, group_keys, ref_time) {
+      # The last day of the week we will slide over.
       ref_time_last_week_end <-
         floor_date(ref_time, "week", day_of_week_end - 1) # this is over by 1
+      # The most recent time_value in x.
       max_time <- max(x$time_value)
+      # The days we will slide over.
       valid_slide_days <- seq.Date(
         from = ceiling_date(min(x$time_value), "week", week_start = day_of_week_end - 1),
         to = floor_date(max(x$time_value), "week", week_start = day_of_week_end - 1),
         by = 7L
       )
+      # If the latest time_value does not fall on day_of_week_end, add it to the
+      # list of valid slide days (this will produce an incomplete slide, but
+      # that's fine for us, since it should only be 1 day, historically.)
       if (wday(max_time) != day_of_week_end) {
         valid_slide_days <- c(valid_slide_days, max_time)
       }
-      slid_result <- x %>%
+      # Slide over the days and aggregate.
+      x %>%
         group_by(across(all_of(keys))) %>%
         slide_fun(
           agg_columns,
@@ -229,18 +247,13 @@ daily_to_weekly_archive <- function(epi_arch,
         ) %>%
         select(-all_of(agg_columns)) %>%
         rename_with(~ gsub("slide_value_", "", .x)) %>%
-        # only keep 1/week
-        # group_by week, keep the largest in each week
-        # alternatively
-        # switch time_value to the designated day of the week
+        rename_with(~ gsub("_7dsum", "", .x)) %>%
+        # Round all dates to reference day of the week. These will get
+        # de-duplicated by compactify in as_epi_archive below.
         mutate(time_value = round_date(time_value, "week", day_of_week - 1)) %>%
         as_tibble()
     }
-  )
-  too_many_tibbles %>%
-    pull(time_value) %>%
-    max()
-  too_many_tibbles %>%
+  ) %>%
     as_epi_archive(compactify = TRUE)
 }
 
 
@@ -114,6 +114,11 @@ run_workflow_and_format <- function(preproc,
   if (is.null(as_of)) {
     as_of <- max(train_data$time_value)
   }
+
+  # Look at the train data (uncomment for debugging).
+  # df <- preproc %>% prep(train_data) %>% bake(train_data)
+  # browser()
+
   workflow <- epi_workflow(preproc, trainer) %>%
     fit(train_data) %>%
     add_frosting(postproc)
 
@@ -141,12 +141,16 @@ scaled_pop <- function(epi_data,
       by = c("geo_value" = "abbr")
     )
   }
+
   # with all the setup done, we execute and format
   pred <- run_workflow_and_format(preproc, postproc, trainer, season_data, epi_data)
   # now pred has the columns
   # (geo_value, forecast_date, target_end_date, quantile, value)
   # finally, any postprocessing not supported by epipredict e.g. calibration
   # reintroduce color into the value
+  # if (pred %>% distinct(forecast_date) %>% pull(forecast_date) == as.Date("2023-10-04")) {
+  #   browser()
+  # }
   pred_final <- pred %>%
     rename({{ outcome }} := value) %>%
     data_coloring(outcome, learned_params, join_cols = key_colnames(epi_data, exclude = "time_value"), nonlin_method = nonlin_method) %>%
 
@@ -29,6 +29,7 @@ library(parsnip)
 library(paws.storage)
 library(plotly)
 library(purrr)
+library(qs2)
 library(quantreg)
 library(readr)
 library(recipes)
 
@@ -1,90 +1,4 @@
-#' Generate forecaster predictions on a particular dataset
-#'
-#' A wrapper that turns a forecaster, parameters, data combination into an
-#' actual experiment that outputs a prediction for each day.
-#'
-#' @param archive the epi_df object
-#' @param outcome the name of the target column
-#' @param ahead the number of days ahead to forecast
-#' @param extra_sources any extra columns used for prediction that aren't
-#'   the target
-#' @param forecaster a function that does the actual forecasting for a given
-#'   day. See `exampleSpec.R` for an example function and its documentation
-#'   for the general parameter requirements.
-#' @param slide_training a required parameter that governs how much data to
-#'   exclude before starting the evaluation.
-#' @param n_training_pad a required parameter that determines how many extra
-#'   samples for epix_slide to hand to the forecaster to guarantee that at
-#'   least `ntraining` examples are available to the forecaster.
-#' @param forecaster_args the list of arguments to the forecaster; it must
-#'   contain `ahead`
-#' @param forecaster_args_names a bit of a hack around targets, it contains
-#'   the names of the `forecaster_args`.
-#' @param date_range_step_size the step size (in days) to use when generating
-#'   the forecast dates.
-#' @param cache_key a unique identifier for the cache file
-#'
-#' @importFrom epiprocess epix_slide
-#' @importFrom cli cli_abort
-#' @importFrom rlang !!
-#' @export
-slide_forecaster <- function(epi_archive,
-                             outcome,
-                             ahead,
-                             forecaster = scaled_pop,
-                             slide_training = 0,
-                             n_training_pad = 5,
-                             forecaster_args = list(),
-                             forecaster_args_names = list(),
-                             ref_time_values = NULL,
-                             start_date = NULL,
-                             end_date = NULL,
-                             date_range_step_size = 1L,
-                             cache_key = NULL) {
-  if (length(forecaster_args) > 0) {
-    names(forecaster_args) <- forecaster_args_names
-  }
-  forecaster_args$ahead <- ahead
-  if (!is.numeric(forecaster_args$n_training) && !is.null(forecaster_args$n_training)) {
-    n_training <- as.numeric(forecaster_args$n_training)
-    net_slide_training <- max(slide_training, n_training) + n_training_pad
-  } else {
-    n_training <- Inf
-    net_slide_training <- slide_training + n_training_pad
-  }
-  if (is.null(ref_time_values)) {
-    # restrict the dataset to areas where training is possible
-    if (is.null(start_date)) {
-      start_date <- min(epi_archive$DT$time_value) + net_slide_training
-    }
-    if (is.null(end_date)) {
-      end_date <- max(epi_archive$DT$time_value) - forecaster_args$ahead
-    }
-    ref_time_values <- seq.Date(from = start_date, to = end_date, by = date_range_step_size)
-  }
-
-  # first generate the forecasts
-  before <- n_training + n_training_pad - 1
-  forecaster_args <- rlang::dots_list(
-    !!!list(
-      outcome = outcome
-    ),
-    !!!forecaster_args,
-    .homonyms = "last"
-  )
-  forecaster_wrapper <- function(x) {
-    rlang::inject(forecaster(epi_data = x, !!!forecaster_args))
-  }
-  epix_slide_simple(
-    epi_archive,
-    forecaster_wrapper,
-    ref_time_values,
-    before,
-    cache_key = cache_key
-  )
-}
-
-epix_slide_simple <- function(epi_archive, forecaster, ref_time_values, before, cache_key = NULL) {
+epix_slide_simple <- function(epi_archive, forecaster, ref_time_values, before = Inf, cache_key = NULL) {
   # this is so that changing the object without changing the name doesn't result in pulling the wrong cache
   cache_hash <- rlang::hash(epi_archive)
   dir.create(".exploration_cache/slide_cache", showWarnings = FALSE, recursive = TRUE)
 
@@ -1,30 +1,51 @@
 # Scoring and Evaluation Functions
 
-evaluate_predictions <- function(predictions_cards, truth_data) {
-  checkmate::assert_data_frame(predictions_cards)
+evaluate_predictions <- function(forecasts, truth_data) {
+  checkmate::assert_data_frame(forecasts)
   checkmate::assert_data_frame(truth_data)
   checkmate::assert_names(
-    names(predictions_cards),
+    names(forecasts),
     must.include = c("model", "geo_value", "forecast_date", "target_end_date", "quantile", "prediction")
   )
   checkmate::assert_names(
     names(truth_data),
     must.include = c("geo_value", "target_end_date", "true_value")
   )
 
-  left_join(predictions_cards, truth_data, by = c("geo_value", "target_end_date")) %>%
-    scoringutils::score(metrics = c("interval_score", "ae_median", "coverage")) %>%
-    scoringutils::add_coverage(by = c("model", "geo_value", "forecast_date", "target_end_date"), ranges = c(80)) %>%
-    scoringutils::summarize_scores(by = c("model", "geo_value", "forecast_date", "target_end_date")) %>%
+  # joined_forecasts <- left_join(forecasts, truth_data, by = c("geo_value", "target_end_date"))
+
+  # joined_forecasts %>%
+  #   group_by(model, geo_value, forecast_date, target_end_date) %>%
+  #   summarize(increasing = all(prediction - shift(prediction, 1, 0) > 0)) %>%
+  #   ungroup() %>%
+  #   filter(!increasing)
+
+  pred_final %>%
+    group_by(geo_value, forecast_date, target_end_date) %>%
+    summarize(increasing = all(value - shift(value, 1, 0) > 0)) %>%
+    ungroup() %>%
+    filter(!increasing)
+
+  # joined_forecasts %>% filter(geo_value == "ma", forecast_date == "2023-10-07", target_end_date == "2023-10-21") %>% print(n=50)
+
+  forecast_obj <- left_join(forecasts, truth_data, by = c("geo_value", "target_end_date")) %>%
+    scoringutils::as_forecast_quantile(
+      quantile_level = "quantile",
+      observed = "true_value",
+      predicted = "prediction",
+      forecast_unit = c("model", "geo_value", "forecast_date", "target_end_date")
+    )
+
+  # browser()
+  scores <- forecast_obj %>%
+    scoringutils::score(metrics = get_metrics(.)) %>%
     as_tibble() %>%
     select(
-      model,
-      geo_value,
-      forecast_date,
-      target_end_date,
-      wis = interval_score,
+      model, geo_value, forecast_date, target_end_date,
+      wis,
       ae = ae_median,
-      coverage_80
+      coverage_50 = interval_coverage_50,
+      coverage_90 = interval_coverage_90
     ) %>%
     mutate(ahead = as.numeric(target_end_date - forecast_date))
 }