cmu-delphi · dshemetov · Mar 17, 2025 · Mar 19, 2025 · Mar 19, 2025 · Mar 19, 2025
diff --git a/.github/workflows/tests.yaml b/.github/workflows/tests.yaml
@@ -19,6 +19,9 @@ jobs:
       fail-fast: false
       matrix:
         config:
+          # The one we use in production.
+          - { os: ubuntu-latest, r: "renv" }
+          # See if the latest release works.
           - { os: ubuntu-latest, r: "release" }
 
     env:

diff --git a/R/new_epipredict_steps/layer_yeo_johnson.R b/R/new_epipredict_steps/layer_yeo_johnson.R
@@ -0,0 +1,249 @@
+#' Unormalizing transformation
+#'
+#' Will undo a step_epi_YeoJohnson transformation.
+#'
+#' @param frosting a `frosting` postprocessor. The layer will be added to the
+#'   sequence of operations for this frosting.
+#' @param ... One or more selector functions to scale variables
+#'   for this step. See [recipes::selections()] for more details.
+#' @param df a data frame that contains the population data to be used for
+#'   inverting the existing scaling.
+#' @param by A (possibly named) character vector of variables to join by.
+#' @param id a random id string
+#'
+#' @return an updated `frosting` postprocessor
+#' @export
+#' @examples
+#' library(dplyr)
+#' jhu <- epidatasets::cases_deaths_subset %>%
+#'   filter(time_value > "2021-11-01", geo_value %in% c("ca", "ny")) %>%
+#'   select(geo_value, time_value, cases)
+#'
+#' pop_data <- data.frame(states = c("ca", "ny"), value = c(20000, 30000))
+#'
+#' r <- epi_recipe(jhu) %>%
+#'   step_epi_YeoJohnson(
+#'     df = pop_data,
+#'     df_pop_col = "value",
+#'     by = c("geo_value" = "states"),
+#'     cases, suffix = "_scaled"
+#'   ) %>%
+#'   step_epi_lag(cases_scaled, lag = c(0, 7, 14)) %>%
+#'   step_epi_ahead(cases_scaled, ahead = 7, role = "outcome") %>%
+#'   step_epi_naomit()
+#'
+#' f <- frosting() %>%
+#'   layer_predict() %>%
+#'   layer_threshold(.pred) %>%
+#'   layer_naomit(.pred) %>%
+#'   layer_epi_YeoJohnson(.pred,
+#'     df = pop_data,
+#'     by = c("geo_value" = "states"),
+#'     df_pop_col = "value"
+#'   )
+#'
+#' wf <- epi_workflow(r, linear_reg()) %>%
+#'   fit(jhu) %>%
+#'   add_frosting(f)
+#'
+#' forecast(wf)
+layer_epi_YeoJohnson <- function(frosting, ..., lambdas = NULL, by = NULL, id = rand_id("epi_YeoJohnson")) {
+  checkmate::assert_tibble(lambdas, min.rows = 1, null.ok = TRUE)
+
+  add_layer(
+    frosting,
+    layer_epi_YeoJohnson_new(
+      lambdas = lambdas,
+      by = by,
+      terms = dplyr::enquos(...),
+      id = id
+    )
+  )
+}
+
+layer_epi_YeoJohnson_new <- function(lambdas, by, terms, id) {
+  epipredict:::layer("epi_YeoJohnson", lambdas = lambdas, by = by, terms = terms, id = id)
+}
+
+#' @export
+#' @importFrom workflows extract_preprocessor
+slather.layer_epi_YeoJohnson <- function(object, components, workflow, new_data, ...) {
+  rlang::check_dots_empty()
+
+  # Get the lambdas from the layer or from the workflow.
+  lambdas <- object$lambdas %||% get_lambdas_in_layer(workflow)
+
+  # If the by is not specified, try to infer it from the lambdas.
+  if (is.null(object$by)) {
+    # Assume `layer_predict` has calculated the prediction keys and other
+    # layers don't change the prediction key colnames:
+    prediction_key_colnames <- names(components$keys)
+    lhs_potential_keys <- prediction_key_colnames
+    rhs_potential_keys <- colnames(select(lambdas, -starts_with("lambda_")))
+    object$by <- intersect(lhs_potential_keys, rhs_potential_keys)
+    suggested_min_keys <- setdiff(lhs_potential_keys, "time_value")
+    if (!all(suggested_min_keys %in% object$by)) {
+      cli_warn(
+        c(
+          "{setdiff(suggested_min_keys, object$by)} {?was an/were} epikey column{?s} in the predictions,
+          but {?wasn't/weren't} found in the population `df`.",
+          "i" = "Defaulting to join by {object$by}",
+          ">" = "Double-check whether column names on the population `df` match those expected in your predictions",
+          ">" = "Consider using population data with breakdowns by {suggested_min_keys}",
+          ">" = "Manually specify `by =` to silence"
+        ),
+        class = "epipredict__layer_population_scaling__default_by_missing_suggested_keys"
+      )
+    }
+  }
+
+  # Establish the join columns.
+  object$by <- object$by %||%
+    intersect(
+      epipredict:::epi_keys_only(components$predictions),
+      colnames(select(lambdas, -starts_with(".lambda_")))
+    )
+  joinby <- list(x = names(object$by) %||% object$by, y = object$by)
+  hardhat::validate_column_names(components$predictions, joinby$x)
+  hardhat::validate_column_names(lambdas, joinby$y)
+
+  # Join the lambdas.
+  components$predictions <- inner_join(
+    components$predictions,
+    lambdas,
+    by = object$by,
+    relationship = "many-to-one",
+    unmatched = c("error", "drop")
+  )
+
+  # TODO: There are many possibilities here:
+  # - (a) the terms can be empty, where we should probably default to
+  #   all_outcomes().
+  # - (b) explicitly giving all_outcomes(), we end here with terms being empty,
+  #   which doesn't seem right; need to make sure we pull in all the outcome
+  #   columns here. The question is what form should they have?
+  # - (c) if the user just specifies .pred, then we have to infer the outcome
+  #   from the mold, which is simple enough and the main case I have working.
+  # - (d) the user might specify outcomes of the form .pred_ahead_1_cases,
+  #   .pred_ahead_7_cases, etc. Is that the right format? Trying those out now
+  #   and getting errors downstream from forecast().
+  # Get the columns to transform.
+  exprs <- rlang::expr(c(!!!object$terms))
+  pos <- tidyselect::eval_select(exprs, components$predictions)
+  col_names <- names(pos)
+
+  # For every column, we need to use the appropriate lambda column, which differs per row.
+  # Note that yj_inverse() is vectorized in x, but not in lambda.
+  if (identical(col_names, ".pred")) {
+    # In this case, we don't get a hint for the outcome column name, so we need to
+    # infer it from the mold. `outcomes` is a vector of objects like
+    # ahead_1_cases, ahead_7_cases, etc. We want to extract the cases part.
+    outcome_cols <- names(components$mold$outcomes) %>%
+      stringr::str_match("ahead_\\d+_(.*)") %>%
+      magrittr::extract(, 2)
+
+    components$predictions <- components$predictions %>%
+      rowwise() %>%
+      mutate(.pred := yj_inverse(.pred, !!sym(paste0(".lambda_", outcome_cols))))
+  } else if (identical(col_names, character(0))) {
+    # In this case, we should assume the user wants to transform all outcomes.
+    cli::cli_abort("Not specifying columns to layer Yeo-Johnson is not implemented yet.", call = rlang::caller_env())
+  } else {
+    # In this case, we assume that the user has specified the columns they want
+    # transformed here. We then need to determine the lambda columns for each of
+    # these columns. That is, we need to convert a vector of column names like
+    # c(".pred_ahead_1_case_rate", ".pred_ahead_7_case_rate") to
+    # c("lambda_ahead_1_case_rate", "lambda_ahead_7_case_rate").
+    original_outcome_cols <- str_match(col_names, ".pred_ahead_\\d+_(.*)")[, 2]
+    outcomes_wout_ahead <- str_match(names(components$mold$outcomes), "ahead_\\d+_(.*)")[,2]
+    if (any(original_outcome_cols %nin% outcomes_wout_ahead)) {
+      cli_abort("All columns specified in `...` must be outcome columns.", call = rlang::caller_env())
+    }
+
+    for (i in seq_along(col_names)) {
+      col <- col_names[i]
+      lambda_col <- paste0(".lambda_", original_outcome_cols[i])
+      components$predictions <- components$predictions %>%
+        rowwise() %>%
+        mutate(!!sym(col) := yj_inverse(!!sym(col), !!sym(lambda_col)))
+    }
+  }
+
+  # Remove the lambda columns.
+  components$predictions <- components$predictions %>%
+    select(-any_of(starts_with(".lambda_"))) %>%
+    ungroup()
+  components
+}
+
+#' @export
+print.layer_epi_YeoJohnson <- function(x, width = max(20, options()$width - 30), ...) {
+  title <- "Yeo-Johnson transformation (see `lambdas` object for values) on "
+  epipredict:::print_layer(x$terms, title = title, width = width)
+}
+
+#' Inverse Yeo-Johnson transformation
+#'
+#' Inverse of `yj_transform` in step_yeo_johnson.R.
+#'
+#' @keywords internal
+yj_inverse <- function(x, lambda, eps = 0.001) {
+  if (is.na(lambda)) {
+    return(x)
+  }
+  if (!inherits(x, "tbl_df") || is.data.frame(x)) {
+    x <- unlist(x, use.names = FALSE)
+  } else {
+    if (!is.vector(x)) {
+      x <- as.vector(x)
+    }
+  }
+
+  dat_neg <- x < 0
+  ind_neg <- list(is = which(dat_neg), not = which(!dat_neg))
+  not_neg <- ind_neg[["not"]]
+  is_neg <- ind_neg[["is"]]
+
+  nn_inv_trans <- function(x, lambda) {
+    if (abs(lambda) < eps) {
+      # log(x + 1)
+      exp(x) - 1
+    } else {
+      # ((x + 1)^lambda - 1) / lambda
+      (lambda * x + 1)^(1 / lambda) - 1
+    }
+  }
+
+  ng_inv_trans <- function(x, lambda) {
+    if (abs(lambda - 2) < eps) {
+      # -log(-x + 1)
+      -(exp(-x) - 1)
+    } else {
+      # -((-x + 1)^(2 - lambda) - 1) / (2 - lambda)
+      -(((lambda - 2) * x + 1)^(1 / (2 - lambda)) - 1)
+    }
+  }
+
+  if (length(not_neg) > 0) {
+    x[not_neg] <- nn_inv_trans(x[not_neg], lambda)
+  }
+
+  if (length(is_neg) > 0) {
+    x[is_neg] <- ng_inv_trans(x[is_neg], lambda)
+  }
+  x
+}
+
+get_lambdas_in_layer <- function(workflow) {
+  this_recipe <- hardhat::extract_recipe(workflow)
+  if (!(this_recipe %>% recipes::detect_step("epi_YeoJohnson"))) {
+    cli_abort("`layer_epi_YeoJohnson` requires `step_epi_YeoJohnson` in the recipe.", call = rlang::caller_env())
+  }
+  for (step in this_recipe$steps) {
+    if (inherits(step, "step_epi_YeoJohnson")) {
+      lambdas <- step$lambdas
+      break
+    }
+  }
+  lambdas
+}