diff --git a/NAMESPACE b/NAMESPACE index 5fe9c99c..01ce353b 100644 --- a/NAMESPACE +++ b/NAMESPACE @@ -122,6 +122,7 @@ S3method(update,layer) S3method(vec_arith,quantile_pred) S3method(vec_arith.numeric,quantile_pred) S3method(vec_arith.quantile_pred,numeric) +S3method(vec_arith.quantile_pred,quantile_pred) S3method(vec_math,quantile_pred) S3method(vec_proxy_equal,quantile_pred) S3method(weighted_interval_score,quantile_pred) @@ -233,6 +234,7 @@ import(epidatasets) import(epiprocess) import(parsnip) import(recipes) +import(vctrs) importFrom(checkmate,assert_class) importFrom(checkmate,assert_numeric) importFrom(checkmate,test_character) diff --git a/R/layer_yeo_johnson.R b/R/layer_yeo_johnson.R index d399717f..8d63b7f6 100644 --- a/R/layer_yeo_johnson.R +++ b/R/layer_yeo_johnson.R @@ -1,11 +1,21 @@ #' Unormalizing transformation #' -#' Will undo a step_epi_YeoJohnson transformation. +#' Will undo a step_epi_YeoJohnson transformation. For practical reasons, if you +#' are using this step on a column that will eventually become the outcome +#' variable, you should make sure that the original name of that column is a +#' subset of the outcome variable name. `ahead_7_cases` when `cases` is +#' transformed will work well, while `ahead_7` will not. #' #' @inheritParams layer_population_scaling -#' @param yj_params Internal. A data frame of parameters to be used for -#' inverting the transformation. -#' @param by A (possibly named) character vector of variables to join by. +#' @param yj_params A data frame of parameters to be used for inverting the +#' transformation. Typically set automatically. If you have done multiple +#' transformations such that the outcome variable name no longer contains the +#' column that this step transforms, then you should manually specify this to +#' be the parameters fit in the corresponding `step_epi_YeoJohnson`. For an +#' example where you wouldn't need to set this, if your output is +#' `ahead_7_cases` and `step_epi_YeoJohnson` transformed cases (possibly with +#' other columns), then you wouldn't need to set this. However if you have +#' renamed your output column to `diff_7`, then you will need to extract the `yj_params` from the step. #' #' @return an updated `frosting` postprocessor #' @export @@ -37,22 +47,21 @@ #' # Compare to the original data. #' jhu %>% filter(time_value == "2021-12-31") #' forecast(wf) -layer_epi_YeoJohnson <- function(frosting, ..., yj_params = NULL, by = NULL, id = rand_id("epi_YeoJohnson")) { +layer_epi_YeoJohnson <- function(frosting, ..., yj_params = NULL, id = rand_id("epi_YeoJohnson")) { checkmate::assert_tibble(yj_params, min.rows = 1, null.ok = TRUE) add_layer( frosting, layer_epi_YeoJohnson_new( yj_params = yj_params, - by = by, terms = dplyr::enquos(...), id = id ) ) } -layer_epi_YeoJohnson_new <- function(yj_params, by, terms, id) { - layer("epi_YeoJohnson", yj_params = yj_params, by = by, terms = terms, id = id) +layer_epi_YeoJohnson_new <- function(yj_params, terms, id) { + layer("epi_YeoJohnson", yj_params = yj_params, terms = terms, id = id) } #' @export @@ -60,42 +69,14 @@ layer_epi_YeoJohnson_new <- function(yj_params, by, terms, id) { slather.layer_epi_YeoJohnson <- function(object, components, workflow, new_data, ...) { rlang::check_dots_empty() - # TODO: We will error if we don't have a workflow. Write a check later. - - # Get the yj_params from the layer or from the workflow. - yj_params <- object$yj_params %||% get_yj_params_in_layer(workflow) - - # If the by is not specified, try to infer it from the yj_params. - if (is.null(object$by)) { - # Assume `layer_predict` has calculated the prediction keys and other - # layers don't change the prediction key colnames: - prediction_key_colnames <- names(components$keys) - lhs_potential_keys <- prediction_key_colnames - rhs_potential_keys <- colnames(select(yj_params, -starts_with(".yj_param_"))) - object$by <- intersect(lhs_potential_keys, rhs_potential_keys) - suggested_min_keys <- setdiff(lhs_potential_keys, "time_value") - if (!all(suggested_min_keys %in% object$by)) { - cli_warn( - c( - "{setdiff(suggested_min_keys, object$by)} {?was an/were} epikey column{?s} in the predictions, - but {?wasn't/weren't} found in the population `df`.", - "i" = "Defaulting to join by {object$by}", - ">" = "Double-check whether column names on the population `df` match those expected in your predictions", - ">" = "Consider using population data with breakdowns by {suggested_min_keys}", - ">" = "Manually specify `by =` to silence" - ), - class = "epipredict__layer_population_scaling__default_by_missing_suggested_keys" - ) - } - } + # get the yj_params from the layer or from the workflow. + yj_params <- + object$yj_params %||% + get_params_in_layer(workflow, "epi_YeoJohnson", "yj_params") # Establish the join columns. - object$by <- object$by %||% - intersect( - epi_keys_only(components$predictions), - colnames(select(yj_params, -starts_with(".yj_param_"))) - ) - joinby <- list(x = names(object$by) %||% object$by, y = object$by) + join_by_columns <- key_colnames(new_data, exclude = "time_value") %>% sort() + joinby <- list(x = join_by_columns, y = join_by_columns) hardhat::validate_column_names(components$predictions, joinby$x) hardhat::validate_column_names(yj_params, joinby$y) @@ -115,55 +96,15 @@ slather.layer_epi_YeoJohnson <- function(object, components, workflow, new_data, # The `object$terms` is where the user specifies the columns they want to # untransform. We need to match the outcomes with their yj_param columns in our # parameter table and then apply the inverse transformation. - if (identical(col_names, ".pred")) { - # In this case, we don't get a hint for the outcome column name, so we need - # to infer it from the mold. - if (length(components$mold$outcomes) > 1) { - cli_abort("Only one outcome is allowed when specifying `.pred`.", call = rlang::caller_env()) - } - # `outcomes` is a vector of objects like ahead_1_cases, ahead_7_cases, etc. - # We want to extract the cases part. - outcome_cols <- names(components$mold$outcomes) %>% - stringr::str_match("ahead_\\d+_(.*)") %>% - magrittr::extract(, 2) - + if (length(col_names) == 0) { + # not specified by the user, so just modify everything starting with `.pred` components$predictions <- components$predictions %>% - mutate(.pred := yj_inverse(.pred, !!sym(paste0(".yj_param_", outcome_cols)))) - } else if (identical(col_names, character(0))) { - # Wish I could suggest `all_outcomes()` here, but currently it's the same as - # not specifying any terms. I don't want to spend time with dealing with - # this case until someone asks for it. - cli::cli_abort( - "Not specifying columns to layer Yeo-Johnson is not implemented. - If you had a single outcome, you can use `.pred` as a column name. - If you had multiple outcomes, you'll need to specify them like - `.pred_ahead_1_`, `.pred_ahead_7_`, etc. - ", - call = rlang::caller_env() - ) + mutate(across(starts_with(".pred"), \(.pred) yj_inverse(.pred, .lambda))) %>% + select(-.lambda) } else { - # In this case, we assume that the user has specified the columns they want - # transformed here. We then need to determine the yj_param columns for each of - # these columns. That is, we need to convert a vector of column names like - # c(".pred_ahead_1_case_rate", ".pred_ahead_7_case_rate") to - # c(".yj_param_ahead_1_case_rate", ".yj_param_ahead_7_case_rate"). - original_outcome_cols <- stringr::str_match(col_names, ".pred_ahead_\\d+_(.*)")[, 2] - outcomes_wout_ahead <- stringr::str_match(names(components$mold$outcomes), "ahead_\\d+_(.*)")[, 2] - if (any(original_outcome_cols %nin% outcomes_wout_ahead)) { - cli_abort( - "All columns specified in `...` must be outcome columns. - They must be of the form `.pred_ahead_1_`, `.pred_ahead_7_`, etc. - ", - call = rlang::caller_env() - ) - } - - for (i in seq_along(col_names)) { - col <- col_names[i] - yj_param_col <- paste0(".yj_param_", original_outcome_cols[i]) - components$predictions <- components$predictions %>% - mutate(!!sym(col) := yj_inverse(!!sym(col), !!sym(yj_param_col))) - } + components$predictions <- components$predictions %>% + mutate(across(all_of(col_names), \(.pred) yj_inverse(.pred, .lambda))) %>% + select(-.lambda) } # Remove the yj_param columns. @@ -182,75 +123,72 @@ print.layer_epi_YeoJohnson <- function(x, width = max(20, options()$width - 30), # Inverse Yeo-Johnson transformation # # Inverse of `yj_transform` in step_yeo_johnson.R. -yj_inverse <- function(x, lambda, eps = 0.001) { +yj_inverse <- function(x_in, lambda, eps = 0.001) { if (any(is.na(lambda))) { - return(x) - } - if (length(x) > 1 && length(lambda) == 1) { - lambda <- rep(lambda, length(x)) - } else if (length(x) != length(lambda)) { - cli::cli_abort("Length of `x` must be equal to length of `lambda`.", call = rlang::caller_fn()) - } - if (!inherits(x, "tbl_df") || is.data.frame(x)) { - x <- unlist(x, use.names = FALSE) - } else { - if (!is.vector(x)) { - x <- as.vector(x) - } - } - - nn_inv_trans <- function(x, lambda) { - out <- double(length(x)) - sm_lambdas <- abs(lambda) < eps - if (length(sm_lambdas) > 0) { - out[sm_lambdas] <- exp(x[sm_lambdas]) - 1 - } - x <- x[!sm_lambdas] - lambda <- lambda[!sm_lambdas] - if (length(x) > 0) { - out[!sm_lambdas] <- (lambda * x + 1)^(1 / lambda) - 1 - } - out + cli::cli_abort("`lambda` cannot be `NA`.", call = rlang::caller_call()) } - - ng_inv_trans <- function(x, lambda) { - out <- double(length(x)) - near2_lambdas <- abs(lambda - 2) < eps - if (length(near2_lambdas) > 0) { - out[near2_lambdas] <- -(exp(-x[near2_lambdas]) - 1) - } - x <- x[!near2_lambdas] - lambda <- lambda[!near2_lambdas] - if (length(x) > 0) { - out[!near2_lambdas] <- -(((lambda - 2) * x + 1)^(1 / (2 - lambda)) - 1) - } - out - } - - dat_neg <- x < 0 - not_neg <- which(!dat_neg) - is_neg <- which(dat_neg) - - if (length(not_neg) > 0) { - x[not_neg] <- nn_inv_trans(x[not_neg], lambda[not_neg]) - } - - if (length(is_neg) > 0) { - x[is_neg] <- ng_inv_trans(x[is_neg], lambda[is_neg]) + x_lambda <- yj_input_type_management(x_in, lambda) + x <- x_lambda[[1]] + lambda <- x_lambda[[2]] + inv_x <- ifelse( + x < 0, + # negative values we test if lambda is ~2 + ifelse( + abs(lambda - 2) < eps, + -(exp(-x) - 1), + -(((lambda - 2) * x + 1)^(1 / (2 - lambda)) - 1) + ), + # non-negative values we test if lambda is ~0 + ifelse( + abs(lambda) < eps, + (exp(x) - 1), + (lambda * x + 1)^(1 / lambda) - 1 + ) + ) + if (x_in %>% inherits("quantile_pred")) { + inv_x <- inv_x %>% quantile_pred(x_in %@% "quantile_levels") } - x + inv_x } -get_yj_params_in_layer <- function(workflow) { + +#' get the parameters used in the initial step +#' +#' @param workflow the workflow to extract the parameters from +#' @param step_name the name of the step to look for, as recognized by `detect_step` +#' @param param_name the parameter to pull out of the step +#' @keywords internal +get_params_in_layer <- function(workflow, step_name = "epi_YeoJohnson", param_name = "yj_params") { + full_step_name <- glue::glue("step_{step_name}") this_recipe <- hardhat::extract_recipe(workflow) - if (!(this_recipe %>% recipes::detect_step("epi_YeoJohnson"))) { - cli_abort("`layer_epi_YeoJohnson` requires `step_epi_YeoJohnson` in the recipe.", call = rlang::caller_env()) + if (!(this_recipe %>% recipes::detect_step(step_name))) { + cli_abort("`layer_{step_name}` requires `step_{step_name}` in the recipe.", call = rlang::caller_call()) + } + outcomes <- + workflows::extract_recipe(workflow)$term_info %>% + filter(role == "outcome") %>% + pull(variable) + if (length(outcomes) > 1) { + cli_abort( + "`layer_{step_name}` doesn't support multiple output columns. + This workflow produces {outcomes} as output columns.", + call = rlang::caller_call(), + class = "epipredict__layer_yeo_johnson_multi_outcome_error" + ) } for (step in this_recipe$steps) { - if (inherits(step, "step_epi_YeoJohnson")) { - yj_params <- step$yj_params + # if it's a `step_name` step that also transforms a column that is a subset + # of the output column name + is_outcome_subset <- map_lgl(step$columns, ~ grepl(.x, outcomes)) + if (inherits(step, full_step_name) && any(is_outcome_subset)) { + params <- step[[param_name]] %>% + select( + key_colnames(workflow$original_data, exclude = "time_value"), + contains(step$columns[is_outcome_subset]) + ) %>% + rename(.lambda = contains(step$columns)) break } } - yj_params + params } diff --git a/R/quantile_pred-methods.R b/R/quantile_pred-methods.R index 293fad90..56e8fcf0 100644 --- a/R/quantile_pred-methods.R +++ b/R/quantile_pred-methods.R @@ -111,7 +111,6 @@ vec_proxy_equal.quantile_pred <- function(x, ...) { dplyr::select(-.row) } - # quantiles by treating quantile_pred like a distribution ----------------- @@ -287,6 +286,12 @@ vec_math.quantile_pred <- function(.fn, .x, ...) { quantile_pred(.fn(.x), quantile_levels) } +#' Internal vctrs methods +#' +#' @import vctrs +#' @keywords internal +#' @name epipredict-vctrs + #' @importFrom vctrs vec_arith vec_arith.numeric #' @export #' @method vec_arith quantile_pred @@ -294,6 +299,19 @@ vec_arith.quantile_pred <- function(op, x, y, ...) { UseMethod("vec_arith.quantile_pred", y) } + +#' @export +#' @method vec_arith.quantile_pred quantile_pred +vec_arith.quantile_pred.quantile_pred <- function(op, x, y, ...) { + all_quantiles <- unique(c(x %@% "quantile_levels", y %@% "quantile_levels")) + op_fn <- getExportedValue("base", op) + # Interpolate/extrapolate to the same quantiles + x <- quantile.quantile_pred(x, all_quantiles) + y <- quantile.quantile_pred(y, all_quantiles) + out <- op_fn(x, y, ...) + quantile_pred(out, all_quantiles) +} + #' @export #' @method vec_arith.quantile_pred numeric vec_arith.quantile_pred.numeric <- function(op, x, y, ...) { diff --git a/R/step_yeo_johnson.R b/R/step_yeo_johnson.R index 4e429528..ff4016cd 100644 --- a/R/step_yeo_johnson.R +++ b/R/step_yeo_johnson.R @@ -188,8 +188,7 @@ bake.step_epi_YeoJohnson <- function(object, new_data, ...) { # Check that the columns for transformation are present in new_data. if (!all(object$columns %in% colnames(new_data))) { cli::cli_abort( - "The columns for transformation are not present in the new data.", - call = rlang::caller_fn() + "The columns for transformation are not present in the new data." ) } # Check that the columns for transformation are present in new_data. @@ -202,7 +201,7 @@ bake.step_epi_YeoJohnson <- function(object, new_data, ...) { cli_abort(c( "Some variables used for training are not available in {.arg x}.", i = "The following required columns are missing: {check$missing_names}" - ), call = rlang::caller_fn()) + )) } # Transform each column, using the appropriate yj_param column per row. new_data <- left_join(new_data, object$yj_params, by = key_colnames(new_data, exclude = "time_value")) @@ -243,7 +242,7 @@ compute_yj_params <- function(training, col_names, limits, num_unique, na_fill, # x = "Yeo-Johnson parameter could not be estimated for some geos for {col}.", # i = "Using parameter={x$na_fill} in these cases." # ), - # call = rlang::caller_fn() + # call = rlang::caller_call() # ) # } # } @@ -254,75 +253,71 @@ compute_yj_params <- function(training, col_names, limits, num_unique, na_fill, } -### Code below taken from recipes::step_YeoJohnson. -### We keep "lambda" here, but above we renamed it to "yj_param". -### Modified yj_transform() to be vectorized in lambda. -### https://github.com/tidymodels/recipes/blob/v1.1.1/R/YeoJohnson.R#L172 - -# Yeo-Johnson transformation -yj_transform <- function(x, lambda, ind_neg = NULL, eps = 0.001) { - if (any(is.na(lambda))) { - return(x) - } - if (length(x) > 1 && length(lambda) == 1) { - lambda <- rep(lambda, length(x)) - } else if (length(x) != length(lambda)) { - cli::cli_abort( - "Length of `x` must be equal to length of `lambda` or lambda must be a scalar.", - call = rlang::caller_fn() - ) - } - if (!inherits(x, "tbl_df") || is.data.frame(x)) { - x <- unlist(x, use.names = FALSE) +yj_input_type_management <- function(x_in, lambda) { + if (x_in %>% inherits("quantile_pred")) { + x <- as.matrix(x_in) + if (length(lambda) == 1) { + lambda <- lambda %>% + rep(prod(dim(x))) %>% + matrix(dim(x)) + } else if (length(x_in) == length(lambda)) { + lambda <- lambda %>% + rep(dim(x)[[2]]) %>% + matrix(dim(x)) + } else if (length(x) != length(lambda)) { + cli::cli_abort("Length of `x` must be equal to length of `lambda`.", call = rlang::caller_call(n = 2)) + } + } else if (!inherits(x_in, "tbl_df") || is.data.frame(x_in)) { + x <- unlist(x_in, use.names = FALSE) } else { - if (!is.vector(x)) { - x <- as.vector(x) + if (!is.vector(x_in)) { + x <- as.vector(x_in) + } else { + x <- x_in } } - # TODO case weights: can we use weights here? - if (is.null(ind_neg)) { - dat_neg <- x < 0 - ind_neg <- list(is = which(dat_neg), not = which(!dat_neg)) - } - not_neg <- ind_neg[["not"]] - is_neg <- ind_neg[["is"]] - nn_trans <- function(x, lambda) { - out <- double(length(x)) - sm_lambdas <- abs(lambda) < eps - if (length(sm_lambdas) > 0) { - out[sm_lambdas] <- log(x[sm_lambdas] + 1) - } - x <- x[!sm_lambdas] - lambda <- lambda[!sm_lambdas] - if (length(x) > 0) { - out[!sm_lambdas] <- ((x + 1)^lambda - 1) / lambda - } - out + # these only apply if x_in isn't a quantile distribution + if (length(x) > 1 && length(lambda) == 1) { + lambda <- rep(lambda, length(x)) + } else if (length(x) != length(lambda)) { + cli::cli_abort("Length of `x` must be equal to length of `lambda`.", call = rlang::caller_call(n = 2)) } - - ng_trans <- function(x, lambda) { - out <- double(length(x)) - near2_lambdas <- abs(lambda - 2) < eps - if (length(near2_lambdas) > 0) { - out[near2_lambdas] <- -log(-x[near2_lambdas] + 1) - } - x <- x[!near2_lambdas] - lambda <- lambda[!near2_lambdas] - if (length(x) > 0) { - out[!near2_lambdas] <- -((-x + 1)^(2 - lambda) - 1) / (2 - lambda) - } - out + list(x, lambda) +} +### Code below taken from recipes::step_YeoJohnson. +### We keep "lambda" here, but above we renamed it to "yj_param". +### Modified yj_transform() to be vectorized in lambda. Also modified to work on distributions. +### https://github.com/tidymodels/recipes/blob/v1.1.1/R/YeoJohnson.R#L172 +# Yeo-Johnson transformation +yj_transform <- function(x_in, lambda, ind_neg = NULL, eps = 0.001) { + if (any(is.na(lambda))) { + cli::cli_abort("`lambda` cannot be `NA`.", call = rlang::caller_call()) } + x_lambda <- yj_input_type_management(x_in, lambda) + x <- x_lambda[[1]] + lambda <- x_lambda[[2]] - if (length(not_neg) > 0) { - x[not_neg] <- nn_trans(x[not_neg], lambda[not_neg]) - } + transformed <- ifelse( + x < 0, + # for negative values we test if lambda is ~2 + ifelse( + abs(lambda - 2) < eps, + -log(abs(x) + 1), + -((abs(x) + 1)^(2 - lambda) - 1) / (2 - lambda) + ), + # for non-negative values we test if lambda is ~0 + ifelse( + abs(lambda) < eps, + log(abs(x) + 1), + ((abs(x) + 1)^lambda - 1) / lambda + ) + ) - if (length(is_neg) > 0) { - x[is_neg] <- ng_trans(x[is_neg], lambda[is_neg]) + if (x_in %>% inherits("quantile_pred")) { + transformed <- transformed %>% quantile_pred(x_in %@% "quantile_levels") } - x + transformed } ## Helper for the log-likelihood calc for eq 3.1 of Yeo, I. K., @@ -344,7 +339,7 @@ yj_obj <- function(lam, dat, ind_neg, const) { } ## estimates the values -estimate_yj <- function(dat, limits = c(-5, 5), num_unique = 5, na_rm = TRUE, call = caller_env(2)) { +estimate_yj <- function(dat, limits = c(-5, 5), num_unique = 5, na_rm = TRUE) { na_rows <- which(is.na(dat)) if (length(na_rows) > 0) { if (na_rm) { @@ -355,7 +350,7 @@ estimate_yj <- function(dat, limits = c(-5, 5), num_unique = 5, na_rm = TRUE, ca x = "Missing values are not allowed for the YJ transformation.", i = "See {.arg na_rm} option." ), - call = call + call = rlang::caller_call(n = 2) ) } } diff --git a/man/epipredict-vctrs.Rd b/man/epipredict-vctrs.Rd new file mode 100644 index 00000000..a4dabbfa --- /dev/null +++ b/man/epipredict-vctrs.Rd @@ -0,0 +1,13 @@ +% Generated by roxygen2: do not edit by hand +% Please edit documentation in R/quantile_pred-methods.R +\name{epipredict-vctrs} +\alias{epipredict-vctrs} +\alias{vec_arith.quantile_pred} +\title{Internal vctrs methods} +\usage{ +\method{vec_arith}{quantile_pred}(op, x, y, ...) +} +\description{ +Internal vctrs methods +} +\keyword{internal} diff --git a/man/get_params_in_layer.Rd b/man/get_params_in_layer.Rd new file mode 100644 index 00000000..1d6c98ef --- /dev/null +++ b/man/get_params_in_layer.Rd @@ -0,0 +1,23 @@ +% Generated by roxygen2: do not edit by hand +% Please edit documentation in R/layer_yeo_johnson.R +\name{get_params_in_layer} +\alias{get_params_in_layer} +\title{get the parameters used in the initial step} +\usage{ +get_params_in_layer( + workflow, + step_name = "epi_YeoJohnson", + param_name = "yj_params" +) +} +\arguments{ +\item{workflow}{the workflow to extract the parameters from} + +\item{step_name}{the name of the step to look for, as recognized by \code{detect_step}} + +\item{param_name}{the parameter to pull out of the step} +} +\description{ +get the parameters used in the initial step +} +\keyword{internal} diff --git a/man/layer_epi_YeoJohnson.Rd b/man/layer_epi_YeoJohnson.Rd index 53520b4e..0c35ee74 100644 --- a/man/layer_epi_YeoJohnson.Rd +++ b/man/layer_epi_YeoJohnson.Rd @@ -8,7 +8,6 @@ layer_epi_YeoJohnson( frosting, ..., yj_params = NULL, - by = NULL, id = rand_id("epi_YeoJohnson") ) } @@ -19,10 +18,15 @@ sequence of operations for this frosting.} \item{...}{One or more selector functions to scale variables for this step. See \code{\link[recipes:selections]{recipes::selections()}} for more details.} -\item{yj_params}{Internal. A data frame of parameters to be used for -inverting the transformation.} - -\item{by}{A (possibly named) character vector of variables to join by.} +\item{yj_params}{A data frame of parameters to be used for inverting the +transformation. Typically set automatically. If you have done multiple +transformations such that the outcome variable name no longer contains the +column that this step transforms, then you should manually specify this to +be the parameters fit in the corresponding \code{step_epi_YeoJohnson}. For an +example where you wouldn't need to set this, if your output is +\code{ahead_7_cases} and \code{step_epi_YeoJohnson} transformed cases (possibly with +other columns), then you wouldn't need to set this. However if you have +renamed your output column to \code{diff_7}, then you will need to extract the \code{yj_params} from the step.} \item{id}{a random id string} } @@ -30,7 +34,11 @@ inverting the transformation.} an updated \code{frosting} postprocessor } \description{ -Will undo a step_epi_YeoJohnson transformation. +Will undo a step_epi_YeoJohnson transformation. For practical reasons, if you +are using this step on a column that will eventually become the outcome +variable, you should make sure that the original name of that column is a +subset of the outcome variable name. \code{ahead_7_cases} when \code{cases} is +transformed will work well, while \code{ahead_7} will not. } \examples{ library(dplyr) diff --git a/man/step_adjust_latency.Rd b/man/step_adjust_latency.Rd index 9e1bafbd..685f806b 100644 --- a/man/step_adjust_latency.Rd +++ b/man/step_adjust_latency.Rd @@ -267,8 +267,8 @@ while this will not: \if{html}{\out{
}}\preformatted{toy_recipe <- epi_recipe(toy_df) \%>\% step_epi_lag(a, lag=0) \%>\% step_adjust_latency(a, method = "extend_lags") -#> Warning: If `method` is "extend_lags" or "locf", then the previous `step_epi_lag`s won't work with -#> modified data. +#> Warning: If `method` is "extend_lags" or "locf", then the previous `step_epi_lag`s won't work +#> with modified data. }\if{html}{\out{
}} If you create columns that you then apply lags to (such as diff --git a/tests/testthat/_snaps/yeo-johnson.md b/tests/testthat/_snaps/yeo-johnson.md new file mode 100644 index 00000000..b3a42c24 --- /dev/null +++ b/tests/testthat/_snaps/yeo-johnson.md @@ -0,0 +1,16 @@ +# Yeo-Johnson transformation inverts correctly + + Code + yj_transform(x, c(1, 2, 3)) + Condition + Error: + ! Length of `x` must be equal to length of `lambda`. + +--- + + Code + yj_transform(list(1, 2), c(1, 2, 3)) + Condition + Error: + ! Length of `x` must be equal to length of `lambda`. + diff --git a/tests/testthat/test-quantile_pred.R b/tests/testthat/test-quantile_pred.R index 70d7c71a..f5097c1d 100644 --- a/tests/testthat/test-quantile_pred.R +++ b/tests/testthat/test-quantile_pred.R @@ -81,6 +81,7 @@ test_that("unary math works on quantiles", { }) test_that("arithmetic works on quantiles", { + # Quantile and numeric arithmetic works dstn <- hardhat::quantile_pred( matrix(c(1:4, 8:11), nrow = 2, byrow = TRUE), 1:4 / 5 @@ -100,4 +101,28 @@ test_that("arithmetic works on quantiles", { expect_identical((1 / 4) * dstn, dstn2) expect_snapshot(error = TRUE, sum(dstn)) + + # Quantile and quantile arithmetic works + val <- c(1:4, 8:11) + dstn3 <- hardhat::quantile_pred( + matrix(val, nrow = 2, byrow = TRUE), + 1:4 / 5 + ) + dstn4 <- hardhat::quantile_pred( + matrix(val + 2 * val, nrow = 2, byrow = TRUE), + 1:4 / 5 + ) + expect_identical(dstn3 + (2 * dstn3), dstn4) + + # Extrapolate when quantile_levels are not the same + val <- c(1:4, 8:11) + dstn5 <- hardhat::quantile_pred( + matrix(val, nrow = 2, byrow = TRUE), + c(0.1, 0.25, 0.5, 0.75) + ) + dstn6 <- hardhat::quantile_pred( + matrix(val, nrow = 2, byrow = TRUE), + c(0.25, 0.5, 0.75, 0.9) + ) + expect_identical((dstn5 + dstn6) %@% "quantile_levels", c(0.1, 0.25, 0.5, 0.75, 0.9)) }) diff --git a/tests/testthat/test-yeo-johnson.R b/tests/testthat/test-yeo-johnson.R index 9ae82151..c0bd12bd 100644 --- a/tests/testthat/test-yeo-johnson.R +++ b/tests/testthat/test-yeo-johnson.R @@ -13,6 +13,29 @@ test_that("Yeo-Johnson transformation inverts correctly", { expect_true( sum(abs(yj_inverse(yj_transform(x, lambdas), lambdas) - x)) < 1e-5 ) + + # also works on quantile distributions + x <- quantile_pred(matrix(c(-5, 1, 3, 0, 0.1, 0.5), nrow = 2, byrow = TRUE), c(0.01, 0.5, 0.7)) + x_back <- map( + lambdas, + \(lambda) mean(abs(yj_inverse(yj_transform(x, lambda), lambda) - x)) < 1e-5 + ) + expect_true(all(unlist(x_back))) + + # Get coverage on yj_input_type_management + # Breaks on bad length of lambda + expect_snapshot(error = TRUE, + yj_transform(x, c(1, 2, 3)) + ) + expect_snapshot(error = TRUE, + yj_transform(list(1, 2), c(1, 2, 3)) + ) + expect_true( + identical( + yj_input_type_management(list(1, 2, 3), c(1, 2, 3)), + list(c(1, 2, 3), c(1, 2, 3)) + ) + ) }) test_that("Yeo-Johnson steps and layers invert each other", { @@ -36,16 +59,16 @@ test_that("Yeo-Johnson steps and layers invert each other", { # Make sure that the inverse transformation works f <- frosting() %>% layer_predict() %>% - layer_epi_YeoJohnson(.pred) + layer_epi_YeoJohnson() wf <- epi_workflow(r, linear_reg()) %>% fit(filtered_data) %>% add_frosting(f) out1 <- filtered_data %>% - slice_max(time_value, by = geo_value) + dplyr::slice_max(time_value, by = geo_value) out2 <- forecast(wf) %>% rename(cases = .pred) expect_equal(out1, out2) - # Make sure it works when there are multiple predictors and outcomes + # Make sure it works when there are multiple predictors jhu_multi <- epidatasets::covid_case_death_rates_extended %>% filter(time_value > "2021-01-01", geo_value %in% c("ca", "ny")) %>% select(geo_value, time_value, case_rate, death_rate) @@ -53,7 +76,7 @@ test_that("Yeo-Johnson steps and layers invert each other", { r <- epi_recipe(filtered_data) %>% step_epi_YeoJohnson(case_rate, death_rate) %>% step_epi_lag(case_rate, death_rate, lag = 0) %>% - step_epi_ahead(case_rate, death_rate, ahead = 0, role = "outcome") %>% + step_epi_ahead(case_rate, ahead = 0, role = "outcome") %>% step_epi_naomit() tr <- r %>% prep(filtered_data) @@ -66,13 +89,43 @@ test_that("Yeo-Johnson steps and layers invert each other", { # Make sure that the inverse transformation works f <- frosting() %>% layer_predict() %>% - layer_epi_YeoJohnson(.pred_ahead_0_case_rate, .pred_ahead_0_death_rate) + layer_epi_YeoJohnson() + wf <- epi_workflow(r, linear_reg()) %>% + fit(filtered_data) %>% + add_frosting(f) + out1 <- filtered_data %>% + select(-death_rate) %>% + dplyr::slice_max(time_value, by = geo_value) + out2 <- forecast(wf) %>% rename(case_rate = .pred) + expect_equal(out1, out2) +}) + +test_that("Yeo-Johnson layers work on quantiles", { + jhu <- epidatasets::cases_deaths_subset %>% + filter(time_value > "2021-01-01", geo_value %in% c("ca", "ny")) %>% + select(geo_value, time_value, cases) + filtered_data <- jhu + + r <- epi_recipe(filtered_data) %>% + step_epi_YeoJohnson(cases) %>% + step_epi_lag(cases, lag = 0) %>% + step_epi_ahead(cases, ahead = 0, role = "outcome") %>% + step_epi_naomit() + + f <- frosting() %>% + layer_predict() %>% + layer_residual_quantiles() %>% + layer_epi_YeoJohnson() wf <- epi_workflow(r, linear_reg()) %>% fit(filtered_data) %>% add_frosting(f) out1 <- filtered_data %>% - slice_max(time_value, by = geo_value) - out2 <- forecast(wf) %>% rename(case_rate = .pred_ahead_0_case_rate, death_rate = .pred_ahead_0_death_rate) + dplyr::slice_max(time_value, by = geo_value) %>% + rename(.pred = cases) %>% + tidyr::expand_grid(.pred_distn_quantile_level = c(0.05, 0.1, 0.25, 0.5, 0.75, 0.9, 0.95)) %>% + mutate(.pred_distn_value = .pred) %>% + select(geo_value, time_value, .pred, .pred_distn_value, .pred_distn_quantile_level) + out2 <- forecast(wf) %>% pivot_quantiles_longer(.pred_distn) %>% as_tibble() expect_equal(out1, out2) }) @@ -123,12 +176,13 @@ test_that("Yeo-Johnson steps and layers invert each other when other_keys are pr # Make sure that the inverse transformation works f <- frosting() %>% layer_predict() %>% + layer_residual_quantiles() %>% layer_epi_YeoJohnson(.pred) wf <- epi_workflow(r, linear_reg()) %>% fit(filtered_data) %>% add_frosting(f) out1 <- filtered_data %>% - slice_max(time_value, by = geo_value) %>% + dplyr::slice_max(time_value, by = geo_value) %>% select(geo_value, age_group, time_value, med_income_2y) %>% arrange(geo_value, age_group, time_value) out2 <- forecast(wf) %>%