include climate, only calculate necessary days

dsweber2 · dsweber2 · commit ea276f352e52 · 2025-04-10T10:30:02.000-05:00
diff --git a/R/autoplot.R b/R/autoplot.R
@@ -114,8 +114,12 @@ autoplot.epi_workflow <- function(
   keys <- c("geo_value", "time_value", "key")
   mold_roles <- names(mold$extras$roles)
   # extract the relevant column names for plotting
-  old_name_y <- unlist(strsplit(names(y), "_"))
-  new_name_y <- paste(old_name_y[-c(1:2)], collapse = "_")
+  if (starts_with_impl("ahead_", names(y)) || starts_with_impl("lag_", names(y))) {
+    old_name_y <- unlist(strsplit(names(y), "_"))
+    new_name_y <- paste(old_name_y[-c(1:2)], collapse = "_")
+  } else {
+    new_name_y <- names(y)
+  }
   if (is.null(plot_data)) {
     # the outcome has shifted, so we need to shift it forward (or back)
     # by the corresponding amount
diff --git a/R/climatological_forecaster.R b/R/climatological_forecaster.R
@@ -115,10 +115,42 @@ climatological_forecaster <- function(epi_data,
     mean = function(x, w) mean(x, na.rm = TRUE),
     median = function(x, w) stats::median(x, na.rm = TRUE)
   )
-  # get the point predictions
   keys <- key_colnames(epi_data, exclude = "time_value")
-  epi_data <- epi_data %>% mutate(.idx = time_aggr(time_value), .weights = 1)
-  climate_center <- epi_data %>%
+  # Get the prediction geo and .idx for the target date(s)
+  predictions <- epi_data %>%
+    select(all_of(keys)) %>%
+    dplyr::distinct() %>%
+    mutate(forecast_date = forecast_date, .idx = time_aggr(forecast_date))
+  predictions <-
+    map(horizon, ~ {
+      predictions %>%
+        mutate(.idx = .idx + .x, target_date = forecast_date + ttype_dur(.x))
+    }) %>%
+    purrr::list_rbind() %>%
+    mutate(
+      .idx = .idx %% modulus,
+      .idx = dplyr::case_when(.idx == 0 ~ modulus, TRUE ~ .idx)
+    )
+  # get the distinct .idx for the target date(s)
+  distinct_target_idx <- predictions$.idx %>% unique()
+  # get all of the idx's within the window of the target .idxs
+  entries <- map(distinct_target_idx, \(idx) within_window(idx, window_size, modulus)) %>%
+    do.call(c, .) %>%
+    unique()
+  # for the center, we need those within twice the window, since for each point
+  # we're subtracting out the center to generate the quantiles
+  entries_double_window <- map(entries, \(idx) within_window(idx, window_size, modulus)) %>%
+    do.call(c, .) %>%
+    unique()
+
+  epi_data_target <-
+    epi_data %>%
+    mutate(.idx = time_aggr(time_value), .weights = 1)
+  # get the point predictions
+  climate_center <-
+    epi_data_target %>%
+    filter(.idx %in% entries_double_window) %>%
+    mutate(.idx = time_aggr(time_value), .weights = 1) %>%
     select(.idx, .weights, all_of(c(outcome, keys))) %>%
     dplyr::reframe(
       roll_modular_multivec(
@@ -136,7 +168,10 @@ climatological_forecaster <- function(epi_data,
       probs = args_list$quantile_levels, na.rm = TRUE, type = 8
     )))
   }
-  climate_quantiles <- epi_data %>%
+  # add on the centers and subtract them out before computing the quantiles
+  climate_quantiles <-
+    epi_data_target %>%
+    filter(.idx %in% entries) %>%
     left_join(climate_center, by = c(".idx", keys)) %>%
     mutate({{ outcome }} := !!sym_outcome - .pred) %>%
     select(.idx, .weights, all_of(c(outcome, args_list$quantile_by_key))) %>%
@@ -147,31 +182,17 @@ climatological_forecaster <- function(epi_data,
       ),
       .by = all_of(args_list$quantile_by_key)
     ) %>%
-    rename(.pred_distn = climate_pred) %>%
-    mutate(.pred_distn = hardhat::quantile_pred(do.call(rbind, .pred_distn), args_list$quantile_levels))
+    mutate(.pred_distn = hardhat::quantile_pred(do.call(rbind, climate_pred), args_list$quantile_levels)) %>%
+    select(-climate_pred)
   # combine them together
   climate_table <- climate_center %>%
-    left_join(climate_quantiles, by = c(".idx", args_list$quantile_by_key)) %>%
+    inner_join(climate_quantiles, by = c(".idx", args_list$quantile_by_key)) %>%
     mutate(.pred_distn = .pred_distn + .pred)
-  # create the predictions
-  predictions <- epi_data %>%
-    select(all_of(keys)) %>%
-    dplyr::distinct() %>%
-    mutate(forecast_date = forecast_date, .idx = time_aggr(forecast_date))
-  predictions <- map(horizon, ~ {
-    predictions %>%
-      mutate(.idx = .idx + .x, target_date = forecast_date + ttype_dur(.x))
-  }) %>%
-    purrr::list_rbind() %>%
-    mutate(
-      .idx = .idx %% modulus,
-      .idx = dplyr::case_when(.idx == 0 ~ modulus, TRUE ~ .idx)
-    ) %>%
+  predictions <- predictions %>%
     left_join(climate_table, by = c(".idx", keys)) %>%
     select(-.idx)
   if (args_list$nonneg) {
-    predictions <- mutate(
-      predictions,
+    predictions <- predictions %>% mutate(
       .pred = snap(.pred, 0, Inf),
       .pred_distn = snap(.pred_distn, 0, Inf)
     )
diff --git a/R/step_climate.R b/R/step_climate.R
@@ -338,57 +338,74 @@ print.step_climate <- function(x, width = max(20, options()$width - 30), ...) {
 }
 
 #' group col by .idx values and sum windows around each .idx value
-#' @param .idx the relevant periodic part of time value, e.g. the week number
-#' @param col the list of values indexed by `.idx`
-#' @param weights how much to weigh each particular datapoint
-#' @param aggr the aggregation function, probably Quantile, mean or median
+#' @param idx_in the relevant periodic part of time value, e.g. the week number,
+#'   limited to the relevant range
+#' @param col the list of values indexed by `idx_in`
+#' @param weights how much to weigh each particular datapoint (also indexed by
+#'   `idx_in`)
+#' @param aggr the aggregation function, probably Quantile, mean, or median
 #' @param window_size the number of .idx entries before and after to include in
 #'   the aggregation
-#' @param modulus the maximum value of `.idx`
+#' @param modulus the number of days/weeks/months in the year, not including any
+#'   leap days/weeks
 #' @importFrom lubridate %m-%
 #' @keywords internal
-roll_modular_multivec <- function(col, .idx, weights, aggr, window_size, modulus) {
-  tib <- tibble(col = col, weights = weights, .idx = .idx) |>
+roll_modular_multivec <- function(col, idx_in, weights, aggr, window_size, modulus) {
+  # make a tibble where data gives the list of all datapoints with the
+  # corresponding .idx
+  tib <- tibble(col = col, weights = weights, .idx = idx_in) |>
     arrange(.idx) |>
     tidyr::nest(data = c(col, weights), .by = .idx)
-  out <- double(modulus + 1)
-  for (iter in seq_along(out)) {
-    # +1 from 1-indexing
-    entries <- (iter - window_size):(iter + window_size) %% modulus
-    entries[entries == 0] <- modulus
-    # note that because we are 1-indexing, we're looking for indices that are 1
-    # larger than the actual day/week in the year
-    if (modulus == 365) {
-      # we need to grab just the window around the leap day on the leap day
-      if (iter == 366) {
-        # there's an extra data point in front of the leap day
-        entries <- (59 - window_size):(59 + window_size - 1) %% modulus
-        entries[entries == 0] <- modulus
-        # adding in the leap day itself
-        entries <- c(entries, 999)
-      } else if ((59 %in% entries) || (60 %in% entries)) {
-        # if we're on the Feb/March boundary for daily data, we need to add in the
-        # leap day data
-        entries <- c(entries, 999)
-      }
-    } else if (modulus == 52) {
-      # we need to grab just the window around the leap week on the leap week
-      if (iter == 53) {
-        entries <- (53 - window_size):(53 + window_size - 1) %% 52
-        entries[entries == 0] <- 52
-        entries <- c(entries, 999)
-      } else if ((52 %in% entries) || (1 %in% entries)) {
-        # if we're on the year boundary for weekly data, we need to add in the
-        # leap week data (which is the extra week at the end)
-        entries <- c(entries, 999)
-      }
-    }
-    out[iter] <- with(
+  # storage for the results, includes all possible time indexes
+  out <- tibble(.idx = c(1:modulus, 999), climate_pred = double(modulus + 1))
+  for (tib_idx in tib$.idx) {
+    entries <- within_window(tib_idx, window_size, modulus)
+    out$climate_pred[out$.idx == tib_idx] <- with(
       purrr::list_rbind(tib %>% filter(.idx %in% entries) %>% pull(data)),
       aggr(col, weights)
     )
   }
-  tibble(.idx = unique(tib$.idx), climate_pred = out[seq_len(nrow(tib))])
+  # filter to only the ones we actually computed
+  out %>% filter(.idx %in% idx_in)
+}
+
+#' Generate the idx values within `window_size` of `target_idx`, wrapping
+#' modulo `modulus`; the leap day/week is the sentinel index 999
+#' @param target_idx the time index which we're drawing the window around
+#' @param window_size the number of indices on each side of `target_idx`
+#' @param modulus days/weeks/months in the year, excluding any leap days/weeks
+#' @keywords internal
+within_window <- function(target_idx, window_size, modulus) {
+  entries <- (target_idx - window_size):(target_idx + window_size) %% modulus
+  entries[entries == 0] <- modulus
+  # indices are 1-based: a remainder of 0 was mapped to `modulus` above; the
+  # leap day/week uses the sentinel index 999 and is special-cased below
+  if (modulus == 365) {
+    # when the target is the leap day (999), rebuild the window around day 59
+    if (target_idx == 999) {
+      # there's an extra data point in front of the leap day
+      entries <- (59 - window_size):(59 + window_size - 1) %% modulus
+      entries[entries == 0] <- modulus
+      # adding in the leap day itself
+      entries <- c(entries, 999)
+    } else if ((59 %in% entries) || (60 %in% entries)) {
+      # if the window touches day 59 or 60 (the Feb/March boundary for daily
+      # data), we need to add in the leap day data
+      entries <- c(entries, 999)
+    }
+  } else if (modulus == 52) {
+    # when the target is the leap week (999), rebuild the window around week 53
+    if (target_idx == 999) {
+      entries <- (53 - window_size):(53 + window_size - 1) %% 52
+      entries[entries == 0] <- 52
+      entries <- c(entries, 999)
+    } else if ((52 %in% entries) || (1 %in% entries)) {
+      # if the window touches week 52 or 1 (the year boundary for weekly data),
+      # add in the leap week data (which is the extra week at the end)
+      entries <- c(entries, 999)
+    }
+  }
+  entries
 }
 
 
diff --git a/man/roll_modular_multivec.Rd b/man/roll_modular_multivec.Rd
diff --git a/tests/testthat/test-step_climate.R b/tests/testthat/test-step_climate.R
@@ -110,7 +110,7 @@ test_that("prep/bake steps create the correct training data with an incomplete y
   r <- epi_recipe(x) %>% step_climate(y, time_type = "epiweek")
   p <- prep(r, x)
 
-  expected_res <- tibble(.idx = c(1:44, 999), climate_y = c(2, 3, 3, 4:25, 25, 25, 25:12, 12, 11, 11, 10))
+  expected_res <- tibble(.idx = c(1:44, 999), climate_y = c(2, 3, 3, 4:25, 25, 25, 25:12, 12, 11, 11, 2))
   expect_equal(p$steps[[1]]$climate_table, expected_res)
 
   b <- bake(p, new_data = NULL)
diff --git a/vignettes/epipredict.Rmd b/vignettes/epipredict.Rmd
@@ -45,6 +45,7 @@ We currently provide the following basic forecasters:
   
   * _Flatline forecaster_: predicts as the median the most recently seen value 
     with increasingly wide quantiles.
+  * _Climatological forecaster_: predicts the median and quantiles based on the historical values around the same date in previous years.
   * _Autoregressive forecaster_: fits a model (e.g. linear regression) on
     lagged data to predict quantiles for continuous values.
   * _Autoregressive classifier_: fits a model (e.g. logistic regression) on
@@ -243,7 +244,6 @@ all_flatlines <- lapply(
       outcome = "death_rate",
       args_list = flatline_args_list(
         ahead = days_ahead,
-        quantile_levels = c(0.05, 0.5, 0.95)
       )
     )
   }
@@ -262,6 +262,43 @@ autoplot(
 Note that the `cdc_baseline_forecaster` is a slight modification of this method
 for use in [the CDC COVID19 Forecasting Hub](https://covid19forecasthub.org/).
 
+### `climatological_forecaster()`
+A different kind of baseline, `climatological_forecaster()` bases its point
+forecast and quantiles on the historical values for this time of year, rather
+than extrapolating from recent values.
+For example, on the same dataset as above:
+```{r make-climatological-forecast, warning=FALSE}
+all_climate <- climatological_forecaster(
+  covid_case_death_rates_extended |>
+    filter(time_value <= forecast_date, geo_value %in% used_locations),
+  outcome = "death_rate",
+  args_list = climate_args_list(
+    forecast_horizon = seq(0, 28),
+    window_size = 14,
+    time_type = "day",
+    forecast_date = forecast_date
+  )
+)
+workflow <- all_climate$epi_workflow
+results <- all_climate$predictions
+autoplot(
+  object = workflow,
+  predictions = results,
+  plot_data = covid_case_death_rates_extended |> filter(geo_value %in% used_locations, time_value > "2021-07-01")
+)
+```
+
+Note that we're using `covid_case_death_rates_extended` rather than
+`covid_case_death_rates`, since it starts in March of 2020 rather than December.
+Without at least a year's worth of historical data, it is impossible to do a
+climatological model.
+Even with one year of history, as we have here, the forecasts are unreliable.
+
+One feature of the climatological baseline is that it forecasts multiple aheads
+simultaneously.
+This is also possible with `arx_forecaster()`, but only when using
+`trainer = smooth_quantile_reg()`, which is built to handle multiple aheads.
+
 ### `arx_classifier()`
 
 The most complicated of the canned forecasters, `arx_classifier` first