Skip to content
Closed
Show file tree
Hide file tree
Changes from 16 commits
Commits
File filter

Filter by extension

Filter by extension


Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
3 changes: 3 additions & 0 deletions .github/workflows/tests.yaml
Original file line number Diff line number Diff line change
Expand Up @@ -19,6 +19,9 @@ jobs:
fail-fast: false
matrix:
config:
# The one we use in production.
- { os: ubuntu-latest, r: "renv" }
# See if the latest release works.
- { os: ubuntu-latest, r: "release" }

env:
Expand Down
249 changes: 249 additions & 0 deletions R/new_epipredict_steps/layer_yeo_johnson.R
Original file line number Diff line number Diff line change
@@ -0,0 +1,249 @@
#' Unormalizing transformation
#'
#' Will undo a step_epi_YeoJohnson transformation.
#'
#' @param frosting a `frosting` postprocessor. The layer will be added to the
#' sequence of operations for this frosting.
#' @param ... One or more selector functions to scale variables
#' for this step. See [recipes::selections()] for more details.
#' @param df a data frame that contains the population data to be used for
#' inverting the existing scaling.
#' @param by A (possibly named) character vector of variables to join by.
#' @param id a random id string
#'
#' @return an updated `frosting` postprocessor
#' @export
#' @examples
#' library(dplyr)
#' jhu <- epidatasets::cases_deaths_subset %>%
#' filter(time_value > "2021-11-01", geo_value %in% c("ca", "ny")) %>%
#' select(geo_value, time_value, cases)
#'
#' pop_data <- data.frame(states = c("ca", "ny"), value = c(20000, 30000))
#'
#' r <- epi_recipe(jhu) %>%
#' step_epi_YeoJohnson(
#' df = pop_data,
#' df_pop_col = "value",
#' by = c("geo_value" = "states"),
#' cases, suffix = "_scaled"
#' ) %>%
#' step_epi_lag(cases_scaled, lag = c(0, 7, 14)) %>%
#' step_epi_ahead(cases_scaled, ahead = 7, role = "outcome") %>%
#' step_epi_naomit()
#'
#' f <- frosting() %>%
#' layer_predict() %>%
#' layer_threshold(.pred) %>%
#' layer_naomit(.pred) %>%
#' layer_epi_YeoJohnson(.pred,
#' df = pop_data,
#' by = c("geo_value" = "states"),
#' df_pop_col = "value"
#' )
#'
#' wf <- epi_workflow(r, linear_reg()) %>%
#' fit(jhu) %>%
#' add_frosting(f)
#'
#' forecast(wf)
layer_epi_YeoJohnson <- function(frosting, ..., lambdas = NULL, by = NULL, id = rand_id("epi_YeoJohnson")) {
checkmate::assert_tibble(lambdas, min.rows = 1, null.ok = TRUE)

add_layer(
frosting,
layer_epi_YeoJohnson_new(
lambdas = lambdas,
by = by,
terms = dplyr::enquos(...),
id = id
)
)
}

layer_epi_YeoJohnson_new <- function(lambdas, by, terms, id) {
epipredict:::layer("epi_YeoJohnson", lambdas = lambdas, by = by, terms = terms, id = id)
}

#' @export
#' @importFrom workflows extract_preprocessor
slather.layer_epi_YeoJohnson <- function(object, components, workflow, new_data, ...) {
rlang::check_dots_empty()

# Get the lambdas from the layer or from the workflow.
lambdas <- object$lambdas %||% get_lambdas_in_layer(workflow)

# If the by is not specified, try to infer it from the lambdas.
if (is.null(object$by)) {
# Assume `layer_predict` has calculated the prediction keys and other
# layers don't change the prediction key colnames:
prediction_key_colnames <- names(components$keys)
lhs_potential_keys <- prediction_key_colnames
rhs_potential_keys <- colnames(select(lambdas, -starts_with("lambda_")))
object$by <- intersect(lhs_potential_keys, rhs_potential_keys)
suggested_min_keys <- setdiff(lhs_potential_keys, "time_value")
if (!all(suggested_min_keys %in% object$by)) {
cli_warn(
c(
"{setdiff(suggested_min_keys, object$by)} {?was an/were} epikey column{?s} in the predictions,
but {?wasn't/weren't} found in the population `df`.",
"i" = "Defaulting to join by {object$by}",
">" = "Double-check whether column names on the population `df` match those expected in your predictions",
">" = "Consider using population data with breakdowns by {suggested_min_keys}",
">" = "Manually specify `by =` to silence"
),
class = "epipredict__layer_population_scaling__default_by_missing_suggested_keys"
)
}
}

# Establish the join columns.
object$by <- object$by %||%
intersect(
epipredict:::epi_keys_only(components$predictions),
colnames(select(lambdas, -starts_with(".lambda_")))
)
joinby <- list(x = names(object$by) %||% object$by, y = object$by)
hardhat::validate_column_names(components$predictions, joinby$x)
hardhat::validate_column_names(lambdas, joinby$y)

# Join the lambdas.
components$predictions <- inner_join(
components$predictions,
lambdas,
by = object$by,
relationship = "many-to-one",
unmatched = c("error", "drop")
)

# TODO: There are many possibilities here:
# - (a) the terms can be empty, where we should probably default to
# all_outcomes().
# - (b) explicitly giving all_outcomes(), we end here with terms being empty,
# which doesn't seem right; need to make sure we pull in all the outcome
# columns here. The question is what form should they have?
# - (c) if the user just specifies .pred, then we have to infer the outcome
# from the mold, which is simple enough and the main case I have working.
# - (d) the user might specify outcomes of the form .pred_ahead_1_cases,
# .pred_ahead_7_cases, etc. Is that the right format? Trying those out now
# and getting errors downstream from forecast().
# Get the columns to transform.
exprs <- rlang::expr(c(!!!object$terms))
pos <- tidyselect::eval_select(exprs, components$predictions)
col_names <- names(pos)

# For every column, we need to use the appropriate lambda column, which differs per row.
# Note that yj_inverse() is vectorized in x, but not in lambda.
if (identical(col_names, ".pred")) {
# In this case, we don't get a hint for the outcome column name, so we need to
# infer it from the mold. `outcomes` is a vector of objects like
# ahead_1_cases, ahead_7_cases, etc. We want to extract the cases part.
outcome_cols <- names(components$mold$outcomes) %>%
stringr::str_match("ahead_\\d+_(.*)") %>%
magrittr::extract(, 2)

components$predictions <- components$predictions %>%
rowwise() %>%
mutate(.pred := yj_inverse(.pred, !!sym(paste0(".lambda_", outcome_cols))))
} else if (identical(col_names, character(0))) {
# In this case, we should assume the user wants to transform all outcomes.
cli::cli_abort("Not specifying columns to layer Yeo-Johnson is not implemented yet.", call = rlang::caller_env())
} else {
# In this case, we assume that the user has specified the columns they want
# transformed here. We then need to determine the lambda columns for each of
# these columns. That is, we need to convert a vector of column names like
# c(".pred_ahead_1_case_rate", ".pred_ahead_7_case_rate") to
# c("lambda_ahead_1_case_rate", "lambda_ahead_7_case_rate").
original_outcome_cols <- str_match(col_names, ".pred_ahead_\\d+_(.*)")[, 2]
outcomes_wout_ahead <- str_match(names(components$mold$outcomes), "ahead_\\d+_(.*)")[,2]
if (any(original_outcome_cols %nin% outcomes_wout_ahead)) {
cli_abort("All columns specified in `...` must be outcome columns.", call = rlang::caller_env())
}

for (i in seq_along(col_names)) {
col <- col_names[i]
lambda_col <- paste0(".lambda_", original_outcome_cols[i])
components$predictions <- components$predictions %>%
rowwise() %>%
mutate(!!sym(col) := yj_inverse(!!sym(col), !!sym(lambda_col)))
}
}

# Remove the lambda columns.
components$predictions <- components$predictions %>%
select(-any_of(starts_with(".lambda_"))) %>%
ungroup()
components
}

#' @export
print.layer_epi_YeoJohnson <- function(x, width = max(20, options()$width - 30), ...) {
title <- "Yeo-Johnson transformation (see `lambdas` object for values) on "
epipredict:::print_layer(x$terms, title = title, width = width)
}

#' Inverse Yeo-Johnson transformation
#'
#' Inverse of `yj_transform` in step_yeo_johnson.R.
#'
#' @keywords internal
yj_inverse <- function(x, lambda, eps = 0.001) {
if (is.na(lambda)) {
return(x)
}
if (!inherits(x, "tbl_df") || is.data.frame(x)) {
x <- unlist(x, use.names = FALSE)
} else {
if (!is.vector(x)) {
x <- as.vector(x)
}
}

dat_neg <- x < 0
ind_neg <- list(is = which(dat_neg), not = which(!dat_neg))
not_neg <- ind_neg[["not"]]
is_neg <- ind_neg[["is"]]

nn_inv_trans <- function(x, lambda) {
if (abs(lambda) < eps) {
# log(x + 1)
exp(x) - 1
} else {
# ((x + 1)^lambda - 1) / lambda
(lambda * x + 1)^(1 / lambda) - 1
}
}

ng_inv_trans <- function(x, lambda) {
if (abs(lambda - 2) < eps) {
# -log(-x + 1)
-(exp(-x) - 1)
} else {
# -((-x + 1)^(2 - lambda) - 1) / (2 - lambda)
-(((lambda - 2) * x + 1)^(1 / (2 - lambda)) - 1)
}
}

if (length(not_neg) > 0) {
x[not_neg] <- nn_inv_trans(x[not_neg], lambda)
}

if (length(is_neg) > 0) {
x[is_neg] <- ng_inv_trans(x[is_neg], lambda)
}
x
}

get_lambdas_in_layer <- function(workflow) {
this_recipe <- hardhat::extract_recipe(workflow)
if (!(this_recipe %>% recipes::detect_step("epi_YeoJohnson"))) {
cli_abort("`layer_epi_YeoJohnson` requires `step_epi_YeoJohnson` in the recipe.", call = rlang::caller_env())
}
for (step in this_recipe$steps) {
if (inherits(step, "step_epi_YeoJohnson")) {
lambdas <- step$lambdas
break
}
}
lambdas
}
Loading
Loading