feat: add step_training_window with seasonal and use it

dshemetov · dshemetov · commit f830d1a640cf · 2024-12-12T19:21:37.000-08:00
diff --git a/R/aux_data_utils.R b/R/aux_data_utils.R
@@ -40,16 +40,11 @@ add_season_info <- function(data) {
   }
 
   data %>%
-    select(-any_of(c("season", "season_week"))) %>%
-    {
-      if ("epiweek" %nin% names(.)) {
-        . <- (.) %>% mutate(epiweek = epiweek(time_value))
-      }
-      if ("epiyear" %nin% names(.)) {
-        . <- (.) %>% mutate(epiyear = epiyear(time_value))
-      }
-      .
-    } %>%
+    select(-any_of(c("season", "season_week", "epiweek", "epiyear"))) %>%
+    mutate(
+      epiweek = epiweek(time_value),
+      epiyear = epiyear(time_value)
+    ) %>%
     left_join(
       (.) %>%
         distinct(epiweek, epiyear) %>%
diff --git a/R/default_epipredict_args.R b/R/default_epipredict_args.R
@@ -18,9 +18,9 @@ default_args_list <- function(
     check_enough_data_n = NULL,
     check_enough_data_epi_keys = NULL,
     keys_to_ignore = list(),
-    n_recent = 5 * 7,
-    n_forward = 3 * 7,
     seasonal_window = FALSE,
+    seasonal_backward_window = 5 * 7,
+    seasonal_forward_window = 3 * 7,
     ...) {
   # error checking if lags is a list
   rlang::check_dots_empty()
@@ -72,9 +72,9 @@ default_args_list <- function(
       check_enough_data_n,
       check_enough_data_epi_keys,
       keys_to_ignore,
-      n_recent,
-      n_forward,
-      seasonal_window
+      seasonal_window,
+      seasonal_backward_window,
+      seasonal_forward_window
     ),
     class = c("arx_fcast", "alist")
   )
diff --git a/R/forecasters/epipredict_utilities.R b/R/forecasters/epipredict_utilities.R
@@ -30,12 +30,12 @@ arx_preprocess <- function(preproc, outcome, predictors, args_list) {
   }
   preproc %<>%
     step_epi_ahead(!!outcome, ahead = args_list$ahead) %>%
-    # TODO: Uncomment after debugging
     step_epi_naomit() %>%
-    step_training_window(
+    step_training_window2(
       n_recent = args_list$n_training,
-      # n_forward = args_list$n_forward,
-      # seasonal = args_list$seasonal_window
+      seasonal = args_list$seasonal_window,
+      seasonal_backward_window = args_list$seasonal_backward_window,
+      seasonal_forward_window = args_list$seasonal_forward_window,
     )
   return(preproc)
 }
diff --git a/R/forecasters/forecaster_scaled_pop_seasonal.R b/R/forecasters/forecaster_scaled_pop_seasonal.R
@@ -45,8 +45,8 @@ scaled_pop_seasonal <- function(epi_data,
                                 center_method = c("median", "mean", "none"),
                                 nonlin_method = c("quart_root", "none"),
                                 seasonal_method = c("none", "flu", "covid", "indicator", "window", "climatological"),
-                                season_backward_window = 5 * 7,
-                                season_forward_window = 3 * 7,
+                                seasonal_backward_window = 5 * 7,
+                                seasonal_forward_window = 3 * 7,
                                 train_residual = FALSE,
                                 trainer = parsnip::linear_reg(),
                                 quantile_levels = covidhub_probs(),
@@ -89,9 +89,9 @@ scaled_pop_seasonal <- function(epi_data,
   args_input[["ahead"]] <- ahead
   args_input[["quantile_levels"]] <- quantile_levels
   args_input[["nonneg"]] <- scale_method == "none"
-  args_input[["n_training"]] <- season_backward_window
-  args_input[["n_forward"]] <- season_forward_window + ahead
   args_input[["seasonal_window"]] <- "window" %in% seasonal_method
+  args_input[["seasonal_backward_window"]] <- seasonal_backward_window
+  args_input[["seasonal_forward_window"]] <- seasonal_forward_window + ahead
   args_list <- inject(default_args_list(!!!args_input))
   # if you want to hardcode particular predictors in a particular forecaster
   predictors <- c(outcome, extra_sources[[1]])
@@ -142,25 +142,6 @@ scaled_pop_seasonal <- function(epi_data,
     }
   }
 
-  # TODO: Replace with step_training_window2
-  if ("window" %in% seasonal_method) {
-    last_data_season_week <- epi_data %>%
-      filter(source == "nhsn") %>%
-      filter(time_value == max(time_value)) %>%
-      pull(season_week) %>%
-      max()
-    current_season_week <- convert_epiweek_to_season_week(epiyear(epi_as_of(epi_data)), epiweek(epi_as_of(epi_data)))
-    date_ranges <- epi_data %>%
-      filter(season_week == last_data_season_week) %>%
-      pull(time_value) %>%
-      unique() %>%
-      map(~ c(.x - seq(from = 7, to = season_backward_window, by = 7), .x + seq(from = 0, to = season_forward_window, by = 7))) %>%
-      unlist() %>%
-      as.Date() %>%
-      unique()
-    epi_data <- epi_data %>% filter(time_value %in% unlist(date_ranges))
-  }
-
   if (drop_non_seasons) {
     season_data <- epi_data %>% drop_non_seasons()
   } else {
diff --git a/R/new_epipredict_steps/step_training_window.R b/R/new_epipredict_steps/step_training_window.R
@@ -0,0 +1,174 @@
+#' Limits the size of the training window to the most recent observations
+#'
+#' `step_training_window2` creates a *specification* of a recipe step that
+#'   limits the size of the training window to the `n_recent` most recent
+#'   observations in `time_value` per group, where the groups are formed
+#'   based on the remaining `epi_keys`. With `seasonal = TRUE` it additionally
+#'   keeps only dates that fall in a window around the same season week as
+#'   the most recent data.
+#'
+#' @param n_recent A positive integer that represents the number of most
+#'   recent observations that are to be kept in the training window per key.
+#'   The default value is 50. `Inf` is also accepted, in which case no
+#'   per-key truncation is performed.
+#' @param seasonal A single `TRUE` or `FALSE`, default `FALSE`. If `TRUE`, the
+#'   training window will slice through epidemic seasons. This is useful for
+#'   forecasting models that need to leverage the data in previous years, but
+#'   only limited to similar phases in the epidemic. Most useful to heavily
+#'   seasonal data, like influenza. The `n_recent` truncation (when `n_recent`
+#'   is finite) is applied before the seasonal filter.
+#' @param seasonal_forward_window A positive number that represents the number
+#'   of days after a season week to include in the training window. The
+#'   default value is 14. Only used when `seasonal` is `TRUE`.
+#' @param seasonal_backward_window A positive number that represents the
+#'   number of days before a season week to include in the training window.
+#'   The default value is 35. Only used when `seasonal` is `TRUE`.
+#' @param epi_keys An optional character vector for specifying "key" variables
+#'   to group on. The default, `NULL`, ensures that every key combination is
+#'   limited.
+#' @inheritParams step_epi_lag
+#' @template step-return
+#'
+#' @details Note that `step_epi_lead()` and `step_epi_lag()` should come
+#' after any filtering step.
+#'
+#' When `seasonal = TRUE`, the anchor dates are all `time_value`s whose season
+#' week equals the season week of the latest `time_value` in the data (and,
+#' for an `epi_df`, also the season week of its `as_of` date). Every date from
+#' `seasonal_backward_window` days before to `seasonal_forward_window` days
+#' after an anchor date (inclusive) is kept; all other rows are dropped.
+#'
+#' The step is created with `skip = TRUE`.
+#'
+#' @export
+#'
+#' @examples
+#' tib <- tibble(
+#'   x = 1:10,
+#'   y = 1:10,
+#'   time_value = rep(seq(as.Date("2020-01-01"), by = 1, length.out = 5), 2),
+#'   geo_value = rep(c("ca", "hi"), each = 5)
+#' ) %>%
+#'   as_epi_df()
+#'
+#' epi_recipe(y ~ x, data = tib) %>%
+#'   step_training_window2(n_recent = 3) %>%
+#'   prep(tib) %>%
+#'   bake(new_data = NULL)
+#'
+#' epi_recipe(y ~ x, data = tib) %>%
+#'   step_epi_naomit() %>%
+#'   step_training_window2(n_recent = 3) %>%
+#'   prep(tib) %>%
+#'   bake(new_data = NULL)
+step_training_window2 <-
+  function(recipe,
+           role = NA,
+           n_recent = 50,
+           seasonal = FALSE,
+           seasonal_forward_window = 14,
+           seasonal_backward_window = 35,
+           epi_keys = NULL,
+           id = rand_id("training_window2")) {
+    epipredict:::arg_is_scalar(
+      n_recent, id, seasonal, seasonal_forward_window, seasonal_backward_window
+    )
+    # `seasonal` is used directly as an `if ()` condition at bake time, so
+    # reject NA / non-logical values here instead of failing late.
+    epipredict:::arg_is_lgl(seasonal)
+    epipredict:::arg_is_pos(
+      n_recent, seasonal_forward_window, seasonal_backward_window
+    )
+    # `Inf` means "keep everything", so only finite values must be integers.
+    if (is.finite(n_recent)) epipredict:::arg_is_pos_int(n_recent)
+    epipredict:::arg_is_chr(id)
+    epipredict:::arg_is_chr(epi_keys, allow_null = TRUE)
+    add_step(
+      recipe,
+      step_training_window2_new(
+        role = role,
+        trained = FALSE,
+        n_recent = n_recent,
+        seasonal = seasonal,
+        seasonal_forward_window = seasonal_forward_window,
+        seasonal_backward_window = seasonal_backward_window,
+        epi_keys = epi_keys,
+        skip = TRUE,
+        id = id
+      )
+    )
+  }
+
+# Low-level constructor for the step object: forwards every field unchanged
+# to `step()` with subclass "training_window2". Performs no validation.
+step_training_window2_new <- function(role,
+                                      trained,
+                                      n_recent,
+                                      seasonal,
+                                      seasonal_forward_window,
+                                      seasonal_backward_window,
+                                      epi_keys,
+                                      skip,
+                                      id) {
+  step(
+    subclass = "training_window2",
+    role = role,
+    trained = trained,
+    n_recent = n_recent,
+    seasonal = seasonal,
+    seasonal_forward_window = seasonal_forward_window,
+    seasonal_backward_window = seasonal_backward_window,
+    epi_keys = epi_keys,
+    skip = skip,
+    id = id
+  )
+}
+
+#' @export
+prep.step_training_window2 <- function(x, training, info = NULL, ...) {
+  # Resolve the grouping keys: an explicit `epi_keys` wins, then whatever keys
+  # the training data reports, and finally no keys at all.
+  keys_from_data <- epipredict:::epi_keys_only(training)
+  group_keys <- if (!is.null(x$epi_keys)) {
+    x$epi_keys
+  } else if (!is.null(keys_from_data)) {
+    keys_from_data
+  } else {
+    character(0L)
+  }
+
+  # Fail at prep time if any of the chosen key columns is absent.
+  hardhat::validate_column_names(training, group_keys)
+
+  # Same step as before, now marked trained and carrying the resolved keys.
+  step_training_window2_new(
+    role = x$role,
+    trained = TRUE,
+    n_recent = x$n_recent,
+    seasonal = x$seasonal,
+    seasonal_forward_window = x$seasonal_forward_window,
+    seasonal_backward_window = x$seasonal_backward_window,
+    epi_keys = group_keys,
+    skip = x$skip,
+    id = x$id
+  )
+}
+
+#' @export
+bake.step_training_window2 <- function(object, new_data, ...) {
+  # Applies the trained step: first the per-key `n_recent` truncation (unless
+  # `n_recent` is `Inf`), then, when `seasonal` is TRUE, a date filter that
+  # keeps only rows near the current point of the season.
+  hardhat::validate_column_names(new_data, object$epi_keys)
+
+  if (object$n_recent < Inf) {
+    # Keep the `n_recent` latest rows (by `time_value`) within each key group.
+    new_data %<>%
+      group_by(across(all_of(object$epi_keys))) %>%
+      arrange(time_value) %>%
+      dplyr::slice_tail(n = object$n_recent) %>%
+      ungroup()
+  }
+
+  if (object$seasonal) {
+    # The code below relies on `add_season_info()` providing a `season_week`
+    # column.
+    new_data %<>% add_season_info()
+
+    # Season week of the most recent observation in the data.
+    last_data_season_week <- new_data %>%
+      filter(time_value == max(time_value)) %>%
+      pull(season_week) %>%
+      max()
+    recent_weeks <- c(last_data_season_week)
+    if (inherits(new_data, "epi_df")) {
+      # Also anchor on the season week of the data's `as_of` date.
+      as_of <- epi_as_of(new_data)
+      current_season_week <- convert_epiweek_to_season_week(
+        epiyear(as_of), epiweek(as_of)
+      )
+      recent_weeks <- c(recent_weeks, current_season_week)
+    }
+
+    # Day offsets around each anchor date: `seasonal_backward_window` days
+    # back through `seasonal_forward_window` days ahead, including day 0.
+    day_offsets <- c(
+      -(1:(object$seasonal_backward_window)),
+      0:(object$seasonal_forward_window)
+    )
+    # `unlist()` drops the Date class, leaving day counts since 1970-01-01;
+    # the explicit `origin` keeps `as.Date()` working on R < 4.3, where a
+    # bare numeric without an origin is an error.
+    date_ranges <- new_data %>%
+      filter(season_week %in% recent_weeks) %>%
+      pull(time_value) %>%
+      unique() %>%
+      map(~ .x + day_offsets) %>%
+      unlist() %>%
+      as.Date(origin = "1970-01-01") %>%
+      unique()
+    new_data %<>% filter(time_value %in% date_ranges)
+  }
+
+  new_data
+}
+
+#' @export
+print.step_training_window2 <-
+  function(x, width = max(20, options()$width - 30), ...) {
+    # Prints a one-line summary of the step and returns `x` invisibly. The
+    # seasonal variant also lists the two window sizes next to `n_recent`.
+    n_recent <- x$n_recent
+    if (x$seasonal) {
+      title <- "# of seasonal observations per key limited to:"
+      seasonal_forward_window <- x$seasonal_forward_window
+      seasonal_backward_window <- x$seasonal_backward_window
+      shown <- rlang::enquos(
+        n_recent, seasonal_forward_window, seasonal_backward_window
+      )
+    } else {
+      title <- "# of recent observations per key limited to:"
+      shown <- rlang::enquos(n_recent)
+    }
+    tr_obj <- recipes::format_selectors(shown, width)
+    recipes::print_step(tr_obj, shown, x$trained, title, width)
+    invisible(x)
+  }
diff --git a/scripts/flu_hosp_explore.R b/scripts/flu_hosp_explore.R
@@ -267,7 +267,7 @@ forecaster_parameter_combinations_ <- rlang::list2(
       filter_agg_level = "state",
       drop_non_seasons = c(TRUE, FALSE),
       n_training = Inf,
-      season_backward_window = 5,
+      seasonal_backward_window = 5,
       keys_to_ignore = very_latent_locations
     ),
     # Window-based seasonal method shouldn't drop non-seasons
@@ -284,7 +284,7 @@ forecaster_parameter_combinations_ <- rlang::list2(
       filter_agg_level = "state",
       drop_non_seasons = FALSE,
       n_training = Inf,
-      season_backward_window = 5,
+      seasonal_backward_window = 5,
       keys_to_ignore = very_latent_locations
     ),
     tidyr::expand_grid(
@@ -300,7 +300,7 @@ forecaster_parameter_combinations_ <- rlang::list2(
       filter_agg_level = "state",
       drop_non_seasons = FALSE,
       n_training = Inf,
-      season_backward_window = 8,
+      seasonal_backward_window = 8,
       keys_to_ignore = very_latent_locations
     )
     # trying various window sizes
@@ -339,8 +339,8 @@ forecaster_parameter_combinations_ <- rlang::list2(
     filter_agg_level = "state",
     drop_non_seasons = FALSE,
     n_training = Inf,
-    season_backward_window = c(3, 5, 7, 9, 52),
-    season_forward_window = c(3, 5, 7),
+    seasonal_backward_window = c(3, 5, 7, 9, 52),
+    seasonal_forward_window = c(3, 5, 7),
     keys_to_ignore = very_latent_locations
   ),
   climate_linear = expand_grid(
diff --git a/tests/testthat/test-step-training-window.R b/tests/testthat/test-step-training-window.R
@@ -0,0 +1,21 @@
+source(here::here("R", "load_all.R"))
+
+# Fixture: four "ak" rows, the same four dates for "ca" with rescaled values,
+# and a copy of the "ca" rows shifted 365 days back.
+data <- tribble(
+  ~geo_value, ~time_value, ~version, ~value,
+  "ak", "2024-11-08", "2024-11-13", 1,
+  "ak", "2024-11-07", "2024-11-13", 2,
+  "ak", "2024-10-08", "2024-11-13", 2,
+  "ak", "2024-10-07", "2024-11-13", 2,
+) %>%
+  mutate(time_value = as.Date(time_value), version = as.Date(version)) %>%
+  bind_rows((.) %>% mutate(geo_value = "ca", value = value * 3 + 1)) %>%
+  bind_rows((.) %>% filter(geo_value == "ca") %>% mutate(time_value = time_value - 365)) %>%
+  as_epi_df()
+
+test_that("seasonal step_training_window2 only filters rows", {
+  baked <- epi_recipe(data) %>%
+    step_training_window2(seasonal_backward_window = 5, seasonal_forward_window = 3, seasonal = TRUE) %>%
+    prep(data) %>%
+    bake(new_data = NULL)
+
+  expect_s3_class(baked, "data.frame")
+  # The step can drop rows but never invent new rows or dates.
+  expect_lte(nrow(baked), nrow(data))
+  expect_true(all(baked$time_value %in% data$time_value))
+  # The latest date anchors the seasonal window at offset 0, so it is kept.
+  expect_true(max(data$time_value) %in% baked$time_value)
+})

Original file line number	Diff line number	Diff line change
`@@ -30,12 +30,12 @@ arx_preprocess <- function(preproc, outcome, predictors, args_list) {`
`30`	`30`	`}`
`31`	`31`	`preproc %<>%`
`32`	`32`	`step_epi_ahead(!!outcome, ahead = args_list$ahead) %>%`
`33`		`- # TODO: Uncomment after debugging`
`34`	`33`	`step_epi_naomit() %>%`
`35`		`- step_training_window(`
	`34`	`+ step_training_window2(`
`36`	`35`	`n_recent = args_list$n_training,`
`37`		`- # n_forward = args_list$n_forward,`
`38`		`- # seasonal = args_list$seasonal_window`
	`36`	`+ seasonal = args_list$seasonal_window,`
	`37`	`+ seasonal_backward_window = args_list$seasonal_backward_window,`
	`38`	`+ seasonal_forward_window = args_list$seasonal_forward_window,`
`39`	`39`	`)`
`40`	`40`	`return(preproc)`
`41`	`41`	`}`