From 20453afc8ba0b9541fd648a24b176ba861e8a6a5 Mon Sep 17 00:00:00 2001 From: dsweber2 Date: Mon, 7 Apr 2025 18:30:12 -0500 Subject: [PATCH 1/9] extend to quantile_dist, exclude multi-output --- NAMESPACE | 2 + R/layer_yeo_johnson.R | 268 +++++++++++++++++------------- R/quantile_pred-methods.R | 18 +- R/step_yeo_johnson.R | 112 ++++++------- man/epipredict-vctrs.Rd | 13 ++ man/get_params_in_layer.Rd | 23 +++ man/layer_epi_YeoJohnson.Rd | 17 +- tests/testthat/test-yeo-johnson.R | 50 +++++- 8 files changed, 319 insertions(+), 184 deletions(-) create mode 100644 man/epipredict-vctrs.Rd create mode 100644 man/get_params_in_layer.Rd diff --git a/NAMESPACE b/NAMESPACE index 5fe9c99c..01ce353b 100644 --- a/NAMESPACE +++ b/NAMESPACE @@ -122,6 +122,7 @@ S3method(update,layer) S3method(vec_arith,quantile_pred) S3method(vec_arith.numeric,quantile_pred) S3method(vec_arith.quantile_pred,numeric) +S3method(vec_arith.quantile_pred,quantile_pred) S3method(vec_math,quantile_pred) S3method(vec_proxy_equal,quantile_pred) S3method(weighted_interval_score,quantile_pred) @@ -233,6 +234,7 @@ import(epidatasets) import(epiprocess) import(parsnip) import(recipes) +import(vctrs) importFrom(checkmate,assert_class) importFrom(checkmate,assert_numeric) importFrom(checkmate,test_character) diff --git a/R/layer_yeo_johnson.R b/R/layer_yeo_johnson.R index d399717f..c1338503 100644 --- a/R/layer_yeo_johnson.R +++ b/R/layer_yeo_johnson.R @@ -1,10 +1,21 @@ #' Unormalizing transformation #' -#' Will undo a step_epi_YeoJohnson transformation. +#' Will undo a step_epi_YeoJohnson transformation. For practical reasons, if you +#' are using this step on a column that will eventually become the outcome +#' variable, you should make sure that the original name of that column is a +#' subset of the outcome variable name. `ahead_7_cases` when `cases` is +#' transformed will work well, while `ahead_7` will not. #' #' @inheritParams layer_population_scaling -#' @param yj_params Internal. A data frame of parameters to be used for -#' inverting the transformation. +#' @param yj_params A data frame of parameters to be used for inverting the +#' transformation. Typically set automatically. If you have done multiple +#' transformations such that the outcome variable name no longer contains the +#' column that this step transforms, then you should manually specify this to +#' be the parameters fit in the corresponding `step_epi_YeoJohnson`. For an +#' example where you wouldn't need to set this, if your output is +#' `ahead_7_cases` and `step_epi_YeoJohnson` transformed cases (possibly with +#' other columns), then you wouldn't need to set this. However if you have +#' renamed your output column to `diff_7`, then you will need to extract the `yj_params` from the step. #' @param by A (possibly named) character vector of variables to join by. #' #' @return an updated `frosting` postprocessor @@ -60,14 +71,17 @@ layer_epi_YeoJohnson_new <- function(yj_params, by, terms, id) { slather.layer_epi_YeoJohnson <- function(object, components, workflow, new_data, ...) { rlang::check_dots_empty() - # TODO: We will error if we don't have a workflow. Write a check later. + # get the yj_params from the layer or from the workflow. + yj_params <- + object$yj_params %||% + get_params_in_layer(workflow, "epi_YeoJohnson", "yj_params") - # Get the yj_params from the layer or from the workflow. - yj_params <- object$yj_params %||% get_yj_params_in_layer(workflow) - - # If the by is not specified, try to infer it from the yj_params. + # if the by is not specified, try to infer it from the yj_params. if (is.null(object$by)) { - # Assume `layer_predict` has calculated the prediction keys and other + # if not specified, match the keys used in the join step + object$by <- key_colnames(new_data, exclude = "time_value") + yj_params + # assume `layer_predict` has calculated the prediction keys and other # layers don't change the prediction key colnames: prediction_key_colnames <- names(components$keys) lhs_potential_keys <- prediction_key_colnames @@ -79,10 +93,10 @@ slather.layer_epi_YeoJohnson <- function(object, components, workflow, new_data, c( "{setdiff(suggested_min_keys, object$by)} {?was an/were} epikey column{?s} in the predictions, but {?wasn't/weren't} found in the population `df`.", - "i" = "Defaulting to join by {object$by}", - ">" = "Double-check whether column names on the population `df` match those expected in your predictions", - ">" = "Consider using population data with breakdowns by {suggested_min_keys}", - ">" = "Manually specify `by =` to silence" + "i" = "defaulting to join by {object$by}", + ">" = "double-check whether column names on the population `df` match those expected in your predictions", + ">" = "consider using population data with breakdowns by {suggested_min_keys}", + ">" = "manually specify `by =` to silence" ), class = "epipredict__layer_population_scaling__default_by_missing_suggested_keys" ) @@ -115,55 +129,21 @@ slather.layer_epi_YeoJohnson <- function(object, components, workflow, new_data, # The `object$terms` is where the user specifies the columns they want to # untransform. We need to match the outcomes with their yj_param columns in our # parameter table and then apply the inverse transformation. - if (identical(col_names, ".pred")) { - # In this case, we don't get a hint for the outcome column name, so we need - # to infer it from the mold. - if (length(components$mold$outcomes) > 1) { - cli_abort("Only one outcome is allowed when specifying `.pred`.", call = rlang::caller_env()) - } - # `outcomes` is a vector of objects like ahead_1_cases, ahead_7_cases, etc. - # We want to extract the cases part. - outcome_cols <- names(components$mold$outcomes) %>% - stringr::str_match("ahead_\\d+_(.*)") %>% - magrittr::extract(, 2) - + if (length(col_names) == 0) { + # not specified by the user, so just modify everything starting with `.pred` components$predictions <- components$predictions %>% - mutate(.pred := yj_inverse(.pred, !!sym(paste0(".yj_param_", outcome_cols)))) - } else if (identical(col_names, character(0))) { - # Wish I could suggest `all_outcomes()` here, but currently it's the same as - # not specifying any terms. I don't want to spend time with dealing with - # this case until someone asks for it. - cli::cli_abort( - "Not specifying columns to layer Yeo-Johnson is not implemented. - If you had a single outcome, you can use `.pred` as a column name. - If you had multiple outcomes, you'll need to specify them like - `.pred_ahead_1_`, `.pred_ahead_7_`, etc. - ", - call = rlang::caller_env() - ) + mutate(across( + starts_with(".pred"), + \(.pred) yj_inverse(.pred, .lambda) # debug(yj_inverse) + )) %>% + select(-.lambda) } else { - # In this case, we assume that the user has specified the columns they want - # transformed here. We then need to determine the yj_param columns for each of - # these columns. That is, we need to convert a vector of column names like - # c(".pred_ahead_1_case_rate", ".pred_ahead_7_case_rate") to - # c(".yj_param_ahead_1_case_rate", ".yj_param_ahead_7_case_rate"). - original_outcome_cols <- stringr::str_match(col_names, ".pred_ahead_\\d+_(.*)")[, 2] - outcomes_wout_ahead <- stringr::str_match(names(components$mold$outcomes), "ahead_\\d+_(.*)")[, 2] - if (any(original_outcome_cols %nin% outcomes_wout_ahead)) { - cli_abort( - "All columns specified in `...` must be outcome columns. - They must be of the form `.pred_ahead_1_`, `.pred_ahead_7_`, etc. - ", - call = rlang::caller_env() - ) - } - - for (i in seq_along(col_names)) { - col <- col_names[i] - yj_param_col <- paste0(".yj_param_", original_outcome_cols[i]) - components$predictions <- components$predictions %>% - mutate(!!sym(col) := yj_inverse(!!sym(col), !!sym(yj_param_col))) - } + components$predictions <- components$predictions %>% + mutate(across( + all_of(col_names), + \(.pred) yj_inverse(.pred, .lambda) + )) %>% + select(-.lambda) } # Remove the yj_param columns. @@ -182,75 +162,135 @@ print.layer_epi_YeoJohnson <- function(x, width = max(20, options()$width - 30), # Inverse Yeo-Johnson transformation # # Inverse of `yj_transform` in step_yeo_johnson.R. -yj_inverse <- function(x, lambda, eps = 0.001) { +yj_inverse <- function(x_in, lambda, eps = 0.001) { if (any(is.na(lambda))) { return(x) } - if (length(x) > 1 && length(lambda) == 1) { - lambda <- rep(lambda, length(x)) - } else if (length(x) != length(lambda)) { - cli::cli_abort("Length of `x` must be equal to length of `lambda`.", call = rlang::caller_fn()) - } - if (!inherits(x, "tbl_df") || is.data.frame(x)) { - x <- unlist(x, use.names = FALSE) - } else { - if (!is.vector(x)) { - x <- as.vector(x) - } + x_lambda <- yj_input_type_management(x_in, lambda) + x <- x_lambda[[1]] + lambda <- x_lambda[[2]] + inv_x <- ifelse( + x < 0, + # negative values we test if lambda is ~2 + ifelse( + abs(lambda - 2) < eps, + -(exp(-x) - 1), + -(((lambda - 2) * x + 1)^(1 / (2 - lambda)) - 1) + ), + # non-negative values we test if lambda is ~0 + ifelse( + abs(lambda) < eps, + (exp(x) - 1), + (lambda * x + 1)^(1 / lambda) - 1 + ) + ) + if (x_in %>% inherits("quantile_pred")) { + inv_x <- inv_x %>% quantile_pred(x_in %@% "quantile_levels") } + inv_x +} - nn_inv_trans <- function(x, lambda) { - out <- double(length(x)) - sm_lambdas <- abs(lambda) < eps - if (length(sm_lambdas) > 0) { - out[sm_lambdas] <- exp(x[sm_lambdas]) - 1 - } - x <- x[!sm_lambdas] - lambda <- lambda[!sm_lambdas] - if (length(x) > 0) { - out[!sm_lambdas] <- (lambda * x + 1)^(1 / lambda) - 1 - } - out - } +yj_inverse.quantile_pred <- function(x, lambda, eps = 0.001) { +} - ng_inv_trans <- function(x, lambda) { - out <- double(length(x)) - near2_lambdas <- abs(lambda - 2) < eps - if (length(near2_lambdas) > 0) { - out[near2_lambdas] <- -(exp(-x[near2_lambdas]) - 1) - } - x <- x[!near2_lambdas] - lambda <- lambda[!near2_lambdas] - if (length(x) > 0) { - out[!near2_lambdas] <- -(((lambda - 2) * x + 1)^(1 / (2 - lambda)) - 1) - } - out +nn_inv_trans <- function(x, lambda, eps) { + out <- double(length(x)) + sm_lambdas <- abs(lambda) < eps + if (length(sm_lambdas) > 0) { + out[sm_lambdas] <- exp(x[sm_lambdas]) - 1 } - - dat_neg <- x < 0 - not_neg <- which(!dat_neg) - is_neg <- which(dat_neg) - - if (length(not_neg) > 0) { - x[not_neg] <- nn_inv_trans(x[not_neg], lambda[not_neg]) + x <- x[!sm_lambdas] + lambda <- lambda[!sm_lambdas] + if (length(x) > 0) { + out[!sm_lambdas] <- (lambda * x + 1)^(1 / lambda) - 1 } - - if (length(is_neg) > 0) { - x[is_neg] <- ng_inv_trans(x[is_neg], lambda[is_neg]) + out +} +nn_inv_trans_new <- function(x, lambda, eps) { + out <- double(length(x)) + sm_lambdas <- abs(lambda) < eps + pos_inv_lambda_0 <- fifelse( + sm_lambdas, + (exp(x) - 1), + (lambda * x + 1)^(1 / lambda) - 1 + ) + pos_inv <- (!sm_lambdas) * ((lambda * x + 1)^(1 / lambda) - 1) + pos_inv_lambda_0 + if (length(sm_lambdas) > 0) { + out[sm_lambdas] <- exp(x[sm_lambdas]) - 1 + } + x <- x[!sm_lambdas] + lambda <- lambda[!sm_lambdas] + if (length(x) > 0) { + out[!sm_lambdas] <- (lambda * x + 1)^(1 / lambda) - 1 } - x + out } -get_yj_params_in_layer <- function(workflow) { +ng_inv_trans <- function(x, lambda, eps) { + out <- double(length(x)) + near2_lambdas <- abs(lambda - 2) < eps + if (length(near2_lambdas) > 0) { + out[near2_lambdas] <- -(exp(-x[near2_lambdas]) - 1) + } + x <- x[!near2_lambdas] + lambda <- lambda[!near2_lambdas] + if (length(x) > 0) { + out[!near2_lambdas] <- -(((lambda - 2) * x + 1)^(1 / (2 - lambda)) - 1) + } + out +} +ng_inv_trans <- function(x, lambda, eps) { + out <- double(length(x)) + near2_lambdas <- abs(lambda - 2) < eps + near2_value <- near2_lambdas * -(exp(-x) - 1) + away2_value <- -(!near2_lambdas) * (((lambda - 2) * x + 0 * im + 1)^(1 / (2 - lambda)) - 1) + 0 * (near2_value + away2_value) + x <- x[!near2_lambdas] + lambda <- lambda[!near2_lambdas] + if (length(x) > 0) { + out[!near2_lambdas] <- -(((lambda - 2) * x + 1)^(1 / (2 - lambda)) - 1) + } + out +} +#' get the parameters used in the initial step +#' +#' @param workflow the workflow to extract the parameters from +#' @param step_name the name of the step to look for, as recognized by `detect_step` +#' @param param_name the parameter to pull out of the step +#' @keywords internal +get_params_in_layer <- function(workflow, step_name = "epi_YeoJohnson", param_name = "yj_params") { + full_step_name <- glue::glue("step_{step_name}") this_recipe <- hardhat::extract_recipe(workflow) - if (!(this_recipe %>% recipes::detect_step("epi_YeoJohnson"))) { - cli_abort("`layer_epi_YeoJohnson` requires `step_epi_YeoJohnson` in the recipe.", call = rlang::caller_env()) + if (!(this_recipe %>% recipes::detect_step(step_name))) { + cli_abort("`layer_{step_name}` requires `step_{step_name}` in the recipe.", call = rlang::caller_env()) + } + outcomes <- + workflows::extract_recipe(workflow)$term_info %>% + filter(role == "outcome") %>% + pull(variable) + if (length(outcomes) > 1) { + cli_abort( + "`layer_{step_name}` doesn't support multiple output columns. This workflow produces {outcomes} as output columns.", + call = rlang::caller_env(), + class = "epipredict__layer_yeo_johnson_multi_outcome_error" + ) } for (step in this_recipe$steps) { - if (inherits(step, "step_epi_YeoJohnson")) { - yj_params <- step$yj_params + # if it's a `step_name` step that also transforms a column that is a subset + # of the output column name + is_outcome_subset <- map_lgl(step$columns, ~ grepl(.x, outcomes)) + if (inherits(step, full_step_name) && + any(is_outcome_subset) + ) { + params <- step[[param_name]] %>% + select( + key_colnames(workflow$original_data, exclude = "time_value"), + contains(step$columns[is_outcome_subset]) + ) %>% + rename(.lambda = contains(step$columns)) break } } - yj_params + params } diff --git a/R/quantile_pred-methods.R b/R/quantile_pred-methods.R index 293fad90..16290d22 100644 --- a/R/quantile_pred-methods.R +++ b/R/quantile_pred-methods.R @@ -111,7 +111,6 @@ vec_proxy_equal.quantile_pred <- function(x, ...) { dplyr::select(-.row) } - # quantiles by treating quantile_pred like a distribution ----------------- @@ -287,6 +286,12 @@ vec_math.quantile_pred <- function(.fn, .x, ...) { quantile_pred(.fn(.x), quantile_levels) } +#' Internal vctrs methods +#' +#' @import vctrs +#' @keywords internal +#' @name epipredict-vctrs + #' @importFrom vctrs vec_arith vec_arith.numeric #' @export #' @method vec_arith quantile_pred @@ -294,6 +299,17 @@ vec_arith.quantile_pred <- function(op, x, y, ...) { UseMethod("vec_arith.quantile_pred", y) } + +#' @export +#' @method vec_arith.quantile_pred quantile_pred +vec_arith.quantile_pred.quantile_pred <- function(op, x, y, ...) { + all_quantiles <- unique(c(x %@% "quantile_levels", y %@% "quantile_levels")) + op_fn <- getExportedValue("base", op) + x <- quantile_pred(as.matrix(x), all_quantiles) + y <- quantile_pred(as.matrix(y), all_quantiles) + quantile_pred(op_fn(x %>% as.matrix(), y %>% as.matrix(), ...), all_quantiles) +} + #' @export #' @method vec_arith.quantile_pred numeric vec_arith.quantile_pred.numeric <- function(op, x, y, ...) { diff --git a/R/step_yeo_johnson.R b/R/step_yeo_johnson.R index 4e429528..fe27a4fc 100644 --- a/R/step_yeo_johnson.R +++ b/R/step_yeo_johnson.R @@ -254,75 +254,71 @@ compute_yj_params <- function(training, col_names, limits, num_unique, na_fill, } +yj_input_type_management <- function(x_in, lambda) { + if (x_in %>% inherits("quantile_pred")) { + x <- as.matrix(x_in) + if (length(lambda) == 1) { + lambda <- lambda %>% + rep(prod(dim(x))) %>% + matrix(dim(x)) + } else if (length(x_in) == length(lambda)) { + lambda <- lambda %>% + rep(dim(x)[[2]]) %>% + matrix(dim(x)) + } else if (length(x) != length(lambda)) { + cli::cli_abort("Length of `x` must be equal to length of `lambda`.", call = rlang::caller_fn()) + } + } else if (!inherits(x_in, "tbl_df") || is.data.frame(x_in)) { + x <- unlist(x_in, use.names = FALSE) + } else { + if (!is.vector(x_in)) { + x <- as.vector(x_in) + } else { + x <- x_in + } + } + + # these only apply if x_in isn't a quantile distribution + if (length(x) > 1 && length(lambda) == 1) { + lambda <- rep(lambda, length(x)) + } else if (length(x) != length(lambda)) { + cli::cli_abort("Length of `x` must be equal to length of `lambda`.", call = rlang::caller_fn()) + } + list(x, lambda) +} ### Code below taken from recipes::step_YeoJohnson. ### We keep "lambda" here, but above we renamed it to "yj_param". ### Modified yj_transform() to be vectorized in lambda. ### https://github.com/tidymodels/recipes/blob/v1.1.1/R/YeoJohnson.R#L172 - # Yeo-Johnson transformation -yj_transform <- function(x, lambda, ind_neg = NULL, eps = 0.001) { +yj_transform <- function(x_in, lambda, ind_neg = NULL, eps = 0.001) { if (any(is.na(lambda))) { return(x) } - if (length(x) > 1 && length(lambda) == 1) { - lambda <- rep(lambda, length(x)) - } else if (length(x) != length(lambda)) { - cli::cli_abort( - "Length of `x` must be equal to length of `lambda` or lambda must be a scalar.", - call = rlang::caller_fn() - ) - } - if (!inherits(x, "tbl_df") || is.data.frame(x)) { - x <- unlist(x, use.names = FALSE) - } else { - if (!is.vector(x)) { - x <- as.vector(x) - } - } - # TODO case weights: can we use weights here? - if (is.null(ind_neg)) { - dat_neg <- x < 0 - ind_neg <- list(is = which(dat_neg), not = which(!dat_neg)) - } - not_neg <- ind_neg[["not"]] - is_neg <- ind_neg[["is"]] + x_lambda <- yj_input_type_management(x_in, lambda) + x <- x_lambda[[1]] + lambda <- x_lambda[[2]] - nn_trans <- function(x, lambda) { - out <- double(length(x)) - sm_lambdas <- abs(lambda) < eps - if (length(sm_lambdas) > 0) { - out[sm_lambdas] <- log(x[sm_lambdas] + 1) - } - x <- x[!sm_lambdas] - lambda <- lambda[!sm_lambdas] - if (length(x) > 0) { - out[!sm_lambdas] <- ((x + 1)^lambda - 1) / lambda - } - out - } - - ng_trans <- function(x, lambda) { - out <- double(length(x)) - near2_lambdas <- abs(lambda - 2) < eps - if (length(near2_lambdas) > 0) { - out[near2_lambdas] <- -log(-x[near2_lambdas] + 1) - } - x <- x[!near2_lambdas] - lambda <- lambda[!near2_lambdas] - if (length(x) > 0) { - out[!near2_lambdas] <- -((-x + 1)^(2 - lambda) - 1) / (2 - lambda) - } - out - } - - if (length(not_neg) > 0) { - x[not_neg] <- nn_trans(x[not_neg], lambda[not_neg]) - } + transformed <- ifelse( + x < 0, + # for negative values we test if lambda is ~2 + ifelse( + abs(lambda - 2) < eps, + -log(abs(x) + 1), + -((abs(x) + 1)^(2 - lambda) - 1) / (2 - lambda) + ), + # for non-negative values we test if lambda is ~0 + ifelse( + abs(lambda) < eps, + log(abs(x) + 1), + ((abs(x) + 1)^lambda - 1) / lambda + ) + ) - if (length(is_neg) > 0) { - x[is_neg] <- ng_trans(x[is_neg], lambda[is_neg]) + if (x_in %>% inherits("quantile_pred")) { + transformed <- transformed %>% quantile_pred(x_in %@% "quantile_levels") } - x + transformed } ## Helper for the log-likelihood calc for eq 3.1 of Yeo, I. K., diff --git a/man/epipredict-vctrs.Rd b/man/epipredict-vctrs.Rd new file mode 100644 index 00000000..a4dabbfa --- /dev/null +++ b/man/epipredict-vctrs.Rd @@ -0,0 +1,13 @@ +% Generated by roxygen2: do not edit by hand +% Please edit documentation in R/quantile_pred-methods.R +\name{epipredict-vctrs} +\alias{epipredict-vctrs} +\alias{vec_arith.quantile_pred} +\title{Internal vctrs methods} +\usage{ +\method{vec_arith}{quantile_pred}(op, x, y, ...) +} +\description{ +Internal vctrs methods +} +\keyword{internal} diff --git a/man/get_params_in_layer.Rd b/man/get_params_in_layer.Rd new file mode 100644 index 00000000..1d6c98ef --- /dev/null +++ b/man/get_params_in_layer.Rd @@ -0,0 +1,23 @@ +% Generated by roxygen2: do not edit by hand +% Please edit documentation in R/layer_yeo_johnson.R +\name{get_params_in_layer} +\alias{get_params_in_layer} +\title{get the parameters used in the initial step} +\usage{ +get_params_in_layer( + workflow, + step_name = "epi_YeoJohnson", + param_name = "yj_params" +) +} +\arguments{ +\item{workflow}{the workflow to extract the parameters from} + +\item{step_name}{the name of the step to look for, as recognized by \code{detect_step}} + +\item{param_name}{the parameter to pull out of the step} +} +\description{ +get the parameters used in the initial step +} +\keyword{internal} diff --git a/man/layer_epi_YeoJohnson.Rd b/man/layer_epi_YeoJohnson.Rd index 53520b4e..a6a0e061 100644 --- a/man/layer_epi_YeoJohnson.Rd +++ b/man/layer_epi_YeoJohnson.Rd @@ -19,8 +19,15 @@ sequence of operations for this frosting.} \item{...}{One or more selector functions to scale variables for this step. See \code{\link[recipes:selections]{recipes::selections()}} for more details.} -\item{yj_params}{Internal. A data frame of parameters to be used for -inverting the transformation.} +\item{yj_params}{A data frame of parameters to be used for inverting the +transformation. Typically set automatically. If you have done multiple +transformations such that the outcome variable name no longer contains the +column that this step transforms, then you should manually specify this to +be the parameters fit in the corresponding \code{step_epi_YeoJohnson}. For an +example where you wouldn't need to set this, if your output is +\code{ahead_7_cases} and \code{step_epi_YeoJohnson} transformed cases (possibly with +other columns), then you wouldn't need to set this. However if you have +renamed your output column to \code{diff_7}, then you will need to extract the \code{yj_params} from the step.} \item{by}{A (possibly named) character vector of variables to join by.} @@ -30,7 +37,11 @@ inverting the transformation.} an updated \code{frosting} postprocessor } \description{ -Will undo a step_epi_YeoJohnson transformation. +Will undo a step_epi_YeoJohnson transformation. For practical reasons, if you +are using this step on a column that will eventually become the outcome +variable, you should make sure that the original name of that column is a +subset of the outcome variable name. \code{ahead_7_cases} when \code{cases} is +transformed will work well, while \code{ahead_7} will not. } \examples{ library(dplyr) diff --git a/tests/testthat/test-yeo-johnson.R b/tests/testthat/test-yeo-johnson.R index 9ae82151..2d5a0f97 100644 --- a/tests/testthat/test-yeo-johnson.R +++ b/tests/testthat/test-yeo-johnson.R @@ -13,6 +13,17 @@ test_that("Yeo-Johnson transformation inverts correctly", { expect_true( sum(abs(yj_inverse(yj_transform(x, lambdas), lambdas) - x)) < 1e-5 ) + + # also works on quantile distributions + x <- quantile_pred(matrix(c(-5, 1, 3, 0, 0.1, 0.5), nrow = 2, byrow = TRUE), c(0.01, 0.5, 0.7)) + yj_inverse(x, lambdas[[1]]) + map(lambdas, \(lambda) yj_transform(x, lambda)) + x_back <- + map( + lambdas, + \(lambda) mean(abs(yj_inverse(yj_transform(x, lambda), lambda) - x)) < 1e-5 + ) + expect_true(all(unlist(x_back))) }) test_that("Yeo-Johnson steps and layers invert each other", { @@ -36,16 +47,16 @@ test_that("Yeo-Johnson steps and layers invert each other", { # Make sure that the inverse transformation works f <- frosting() %>% layer_predict() %>% - layer_epi_YeoJohnson(.pred) + layer_epi_YeoJohnson() wf <- epi_workflow(r, linear_reg()) %>% fit(filtered_data) %>% add_frosting(f) out1 <- filtered_data %>% - slice_max(time_value, by = geo_value) + dplyr::slice_max(time_value, by = geo_value) out2 <- forecast(wf) %>% rename(cases = .pred) expect_equal(out1, out2) - # Make sure it works when there are multiple predictors and outcomes + # Make sure it works when there are multiple predictors jhu_multi <- epidatasets::covid_case_death_rates_extended %>% filter(time_value > "2021-01-01", geo_value %in% c("ca", "ny")) %>% select(geo_value, time_value, case_rate, death_rate) @@ -53,7 +64,7 @@ test_that("Yeo-Johnson steps and layers invert each other", { r <- epi_recipe(filtered_data) %>% step_epi_YeoJohnson(case_rate, death_rate) %>% step_epi_lag(case_rate, death_rate, lag = 0) %>% - step_epi_ahead(case_rate, death_rate, ahead = 0, role = "outcome") %>% + step_epi_ahead(case_rate, ahead = 0, role = "outcome") %>% step_epi_naomit() tr <- r %>% prep(filtered_data) @@ -66,16 +77,39 @@ test_that("Yeo-Johnson steps and layers invert each other", { # Make sure that the inverse transformation works f <- frosting() %>% layer_predict() %>% - layer_epi_YeoJohnson(.pred_ahead_0_case_rate, .pred_ahead_0_death_rate) + layer_epi_YeoJohnson() wf <- epi_workflow(r, linear_reg()) %>% fit(filtered_data) %>% add_frosting(f) out1 <- filtered_data %>% - slice_max(time_value, by = geo_value) - out2 <- forecast(wf) %>% rename(case_rate = .pred_ahead_0_case_rate, death_rate = .pred_ahead_0_death_rate) + select(-death_rate) %>% + dplyr::slice_max(time_value, by = geo_value) + out2 <- forecast(wf) %>% rename(case_rate = .pred) expect_equal(out1, out2) }) +test_that("Yeo-Johnson layers work on quantiles", { + jhu <- epidatasets::cases_deaths_subset %>% + filter(time_value > "2021-01-01", geo_value %in% c("ca", "ny")) %>% + select(geo_value, time_value, cases) + filtered_data <- jhu + + r <- epi_recipe(filtered_data) %>% + step_epi_YeoJohnson(cases) %>% + step_epi_lag(cases, lag = 0) %>% + step_epi_ahead(cases, ahead = 0, role = "outcome") %>% + step_epi_naomit() + + f <- frosting() %>% + layer_predict() %>% + layer_residual_quantiles() %>% + layer_epi_YeoJohnson() + wf <- epi_workflow(r, linear_reg()) %>% + fit(filtered_data) %>% + add_frosting(f) + out2 <- forecast(wf) +}) + test_that("Yeo-Johnson steps and layers invert each other when other_keys are present", { # Small synthetic grad_employ_dataset version. # fmt: skip @@ -128,7 +162,7 @@ test_that("Yeo-Johnson steps and layers invert each other when other_keys are pr fit(filtered_data) %>% add_frosting(f) out1 <- filtered_data %>% - slice_max(time_value, by = geo_value) %>% + dplyr::slice_max(time_value, by = geo_value) %>% select(geo_value, age_group, time_value, med_income_2y) %>% arrange(geo_value, age_group, time_value) out2 <- forecast(wf) %>% From 04261d14d4a78ada73a4b775636996a0502e792c Mon Sep 17 00:00:00 2001 From: dsweber2 Date: Mon, 7 Apr 2025 18:34:13 -0500 Subject: [PATCH 2/9] drop unneeded functions --- R/layer_yeo_johnson.R | 62 ------------------------------------------- 1 file changed, 62 deletions(-) diff --git a/R/layer_yeo_johnson.R b/R/layer_yeo_johnson.R index c1338503..dbcb10e1 100644 --- a/R/layer_yeo_johnson.R +++ b/R/layer_yeo_johnson.R @@ -190,69 +190,7 @@ yj_inverse <- function(x_in, lambda, eps = 0.001) { inv_x } -yj_inverse.quantile_pred <- function(x, lambda, eps = 0.001) { -} - -nn_inv_trans <- function(x, lambda, eps) { - out <- double(length(x)) - sm_lambdas <- abs(lambda) < eps - if (length(sm_lambdas) > 0) { - out[sm_lambdas] <- exp(x[sm_lambdas]) - 1 - } - x <- x[!sm_lambdas] - lambda <- lambda[!sm_lambdas] - if (length(x) > 0) { - out[!sm_lambdas] <- (lambda * x + 1)^(1 / lambda) - 1 - } - out -} -nn_inv_trans_new <- function(x, lambda, eps) { - out <- double(length(x)) - sm_lambdas <- abs(lambda) < eps - pos_inv_lambda_0 <- fifelse( - sm_lambdas, - (exp(x) - 1), - (lambda * x + 1)^(1 / lambda) - 1 - ) - pos_inv <- (!sm_lambdas) * ((lambda * x + 1)^(1 / lambda) - 1) - pos_inv_lambda_0 - if (length(sm_lambdas) > 0) { - out[sm_lambdas] <- exp(x[sm_lambdas]) - 1 - } - x <- x[!sm_lambdas] - lambda <- lambda[!sm_lambdas] - if (length(x) > 0) { - out[!sm_lambdas] <- (lambda * x + 1)^(1 / lambda) - 1 - } - out -} -ng_inv_trans <- function(x, lambda, eps) { - out <- double(length(x)) - near2_lambdas <- abs(lambda - 2) < eps - if (length(near2_lambdas) > 0) { - out[near2_lambdas] <- -(exp(-x[near2_lambdas]) - 1) - } - x <- x[!near2_lambdas] - lambda <- lambda[!near2_lambdas] - if (length(x) > 0) { - out[!near2_lambdas] <- -(((lambda - 2) * x + 1)^(1 / (2 - lambda)) - 1) - } - out -} -ng_inv_trans <- function(x, lambda, eps) { - out <- double(length(x)) - near2_lambdas <- abs(lambda - 2) < eps - near2_value <- near2_lambdas * -(exp(-x) - 1) - away2_value <- -(!near2_lambdas) * (((lambda - 2) * x + 0 * im + 1)^(1 / (2 - lambda)) - 1) - 0 * (near2_value + away2_value) - x <- x[!near2_lambdas] - lambda <- lambda[!near2_lambdas] - if (length(x) > 0) { - out[!near2_lambdas] <- -(((lambda - 2) * x + 1)^(1 / (2 - lambda)) - 1) - } - out -} #' get the parameters used in the initial step #' #' @param workflow the workflow to extract the parameters from From 0a4cd449233622e3d1fbb7caf7117c96413cb86f Mon Sep 17 00:00:00 2001 From: dsweber2 Date: Mon, 7 Apr 2025 18:57:27 -0500 Subject: [PATCH 3/9] Drop by specification and infer from the epi_df --- R/layer_yeo_johnson.R | 43 +++++-------------------------------------- 1 file changed, 5 insertions(+), 38 deletions(-) diff --git a/R/layer_yeo_johnson.R b/R/layer_yeo_johnson.R index dbcb10e1..22696fe3 100644 --- a/R/layer_yeo_johnson.R +++ b/R/layer_yeo_johnson.R @@ -16,7 +16,6 @@ #' `ahead_7_cases` and `step_epi_YeoJohnson` transformed cases (possibly with #' other columns), then you wouldn't need to set this. However if you have #' renamed your output column to `diff_7`, then you will need to extract the `yj_params` from the step. -#' @param by A (possibly named) character vector of variables to join by. #' #' @return an updated `frosting` postprocessor #' @export @@ -48,22 +47,21 @@ #' # Compare to the original data. #' jhu %>% filter(time_value == "2021-12-31") #' forecast(wf) -layer_epi_YeoJohnson <- function(frosting, ..., yj_params = NULL, by = NULL, id = rand_id("epi_YeoJohnson")) { +layer_epi_YeoJohnson <- function(frosting, ..., yj_params = NULL, id = rand_id("epi_YeoJohnson")) { checkmate::assert_tibble(yj_params, min.rows = 1, null.ok = TRUE) add_layer( frosting, layer_epi_YeoJohnson_new( yj_params = yj_params, - by = by, terms = dplyr::enquos(...), id = id ) ) } -layer_epi_YeoJohnson_new <- function(yj_params, by, terms, id) { - layer("epi_YeoJohnson", yj_params = yj_params, by = by, terms = terms, id = id) +layer_epi_YeoJohnson_new <- function(yj_params, terms, id) { + layer("epi_YeoJohnson", yj_params = yj_params, terms = terms, id = id) } #' @export @@ -76,40 +74,9 @@ slather.layer_epi_YeoJohnson <- function(object, components, workflow, new_data, object$yj_params %||% get_params_in_layer(workflow, "epi_YeoJohnson", "yj_params") - # if the by is not specified, try to infer it from the yj_params. - if (is.null(object$by)) { - # if not specified, match the keys used in the join step - object$by <- key_colnames(new_data, exclude = "time_value") - yj_params - # assume `layer_predict` has calculated the prediction keys and other - # layers don't change the prediction key colnames: - prediction_key_colnames <- names(components$keys) - lhs_potential_keys <- prediction_key_colnames - rhs_potential_keys <- colnames(select(yj_params, -starts_with(".yj_param_"))) - object$by <- intersect(lhs_potential_keys, rhs_potential_keys) - suggested_min_keys <- setdiff(lhs_potential_keys, "time_value") - if (!all(suggested_min_keys %in% object$by)) { - cli_warn( - c( - "{setdiff(suggested_min_keys, object$by)} {?was an/were} epikey column{?s} in the predictions, - but {?wasn't/weren't} found in the population `df`.", - "i" = "defaulting to join by {object$by}", - ">" = "double-check whether column names on the population `df` match those expected in your predictions", - ">" = "consider using population data with breakdowns by {suggested_min_keys}", - ">" = "manually specify `by =` to silence" - ), - class = "epipredict__layer_population_scaling__default_by_missing_suggested_keys" - ) - } - } - # Establish the join columns. - object$by <- object$by %||% - intersect( - epi_keys_only(components$predictions), - colnames(select(yj_params, -starts_with(".yj_param_"))) - ) - joinby <- list(x = names(object$by) %||% object$by, y = object$by) + join_by_columns <- key_colnames(new_data, exclude = "time_value") %>% sort() + joinby <- list(x = join_by_columns, y = join_by_columns) hardhat::validate_column_names(components$predictions, joinby$x) hardhat::validate_column_names(yj_params, joinby$y) From 5ffa4a97e6d65516f573b028b5f290447c1349da Mon Sep 17 00:00:00 2001 From: Dmitry Shemetov Date: Thu, 10 Apr 2025 12:52:40 -0700 Subject: [PATCH 4/9] lint+test: test coverage, handle na lambda case, lint --- R/layer_yeo_johnson.R | 19 ++++++------------- R/step_yeo_johnson.R | 4 ++-- tests/testthat/test-yeo-johnson.R | 17 +++++++++++++++-- 3 files changed, 23 insertions(+), 17 deletions(-) diff --git a/R/layer_yeo_johnson.R b/R/layer_yeo_johnson.R index 22696fe3..4590bae3 100644 --- a/R/layer_yeo_johnson.R +++ b/R/layer_yeo_johnson.R @@ -99,17 +99,11 @@ slather.layer_epi_YeoJohnson <- function(object, components, workflow, new_data, if (length(col_names) == 0) { # not specified by the user, so just modify everything starting with `.pred` components$predictions <- components$predictions %>% - mutate(across( - starts_with(".pred"), - \(.pred) yj_inverse(.pred, .lambda) # debug(yj_inverse) - )) %>% + mutate(across(starts_with(".pred"), \(.pred) yj_inverse(.pred, .lambda))) %>% select(-.lambda) } else { components$predictions <- components$predictions %>% - mutate(across( - all_of(col_names), - \(.pred) yj_inverse(.pred, .lambda) - )) %>% + mutate(across(all_of(col_names), \(.pred) yj_inverse(.pred, .lambda))) %>% select(-.lambda) } @@ -131,7 +125,7 @@ print.layer_epi_YeoJohnson <- function(x, width = max(20, options()$width - 30), # Inverse of `yj_transform` in step_yeo_johnson.R. yj_inverse <- function(x_in, lambda, eps = 0.001) { if (any(is.na(lambda))) { - return(x) + cli::cli_abort("`lambda` cannot be `NA`.", call = rlang::caller_fn()) } x_lambda <- yj_input_type_management(x_in, lambda) x <- x_lambda[[1]] @@ -176,7 +170,8 @@ get_params_in_layer <- function(workflow, step_name = "epi_YeoJohnson", param_na pull(variable) if (length(outcomes) > 1) { cli_abort( - "`layer_{step_name}` doesn't support multiple output columns. This workflow produces {outcomes} as output columns.", + "`layer_{step_name}` doesn't support multiple output columns. + This workflow produces {outcomes} as output columns.", call = rlang::caller_env(), class = "epipredict__layer_yeo_johnson_multi_outcome_error" ) @@ -185,9 +180,7 @@ get_params_in_layer <- function(workflow, step_name = "epi_YeoJohnson", param_na # if it's a `step_name` step that also transforms a column that is a subset # of the output column name is_outcome_subset <- map_lgl(step$columns, ~ grepl(.x, outcomes)) - if (inherits(step, full_step_name) && - any(is_outcome_subset) - ) { + if (inherits(step, full_step_name) && any(is_outcome_subset)) { params <- step[[param_name]] %>% select( key_colnames(workflow$original_data, exclude = "time_value"), diff --git a/R/step_yeo_johnson.R b/R/step_yeo_johnson.R index fe27a4fc..adb29478 100644 --- a/R/step_yeo_johnson.R +++ b/R/step_yeo_johnson.R @@ -288,12 +288,12 @@ yj_input_type_management <- function(x_in, lambda) { } ### Code below taken from recipes::step_YeoJohnson. ### We keep "lambda" here, but above we renamed it to "yj_param". -### Modified yj_transform() to be vectorized in lambda. +### Modified yj_transform() to be vectorized in lambda. Also modified to work on distributions. ### https://github.com/tidymodels/recipes/blob/v1.1.1/R/YeoJohnson.R#L172 # Yeo-Johnson transformation yj_transform <- function(x_in, lambda, ind_neg = NULL, eps = 0.001) { if (any(is.na(lambda))) { - return(x) + cli::cli_abort("`lambda` cannot be `NA`.", call = rlang::caller_fn()) } x_lambda <- yj_input_type_management(x_in, lambda) x <- x_lambda[[1]] diff --git a/tests/testthat/test-yeo-johnson.R b/tests/testthat/test-yeo-johnson.R index 2d5a0f97..175711eb 100644 --- a/tests/testthat/test-yeo-johnson.R +++ b/tests/testthat/test-yeo-johnson.R @@ -16,14 +16,27 @@ test_that("Yeo-Johnson transformation inverts correctly", { # also works on quantile distributions x <- quantile_pred(matrix(c(-5, 1, 3, 0, 0.1, 0.5), nrow = 2, byrow = TRUE), c(0.01, 0.5, 0.7)) - yj_inverse(x, lambdas[[1]]) - map(lambdas, \(lambda) yj_transform(x, lambda)) x_back <- map( lambdas, \(lambda) mean(abs(yj_inverse(yj_transform(x, lambda), lambda) - x)) < 1e-5 ) expect_true(all(unlist(x_back))) + + # Get coverage on yj_input_type_management + # Breaks on bad length of lambda + expect_snapshot(error = TRUE, + yj_input_type_management(x, c(1, 2, 3)) + ) + expect_snapshot(error = TRUE, + yj_input_type_management(list(1, 2), c(1, 2, 3)) + ) + expect_true( + identical( + yj_input_type_management(list(1, 2, 3), c(1, 2, 3)), + list(c(1, 2, 3), c(1, 2, 3)) + ) + ) }) test_that("Yeo-Johnson steps and layers invert each other", { From c60b6b2c672560b46e2e65eb171bb964ec5634d0 Mon Sep 17 00:00:00 2001 From: Dmitry Shemetov Date: Thu, 10 Apr 2025 14:17:28 -0700 Subject: [PATCH 5/9] test: a few more tests --- tests/testthat/test-yeo-johnson.R | 10 +++++++++- 1 file changed, 9 insertions(+), 1 deletion(-) diff --git a/tests/testthat/test-yeo-johnson.R b/tests/testthat/test-yeo-johnson.R index 175711eb..5d05a593 100644 --- a/tests/testthat/test-yeo-johnson.R +++ b/tests/testthat/test-yeo-johnson.R @@ -120,7 +120,14 @@ test_that("Yeo-Johnson layers work on quantiles", { wf <- epi_workflow(r, linear_reg()) %>% fit(filtered_data) %>% add_frosting(f) - out2 <- forecast(wf) + out1 <- filtered_data %>% + dplyr::slice_max(time_value, by = geo_value) %>% + rename(.pred = cases) %>% + tidyr::expand_grid(.pred_distn_quantile_level = c(0.05, 0.1, 0.25, 0.5, 0.75, 0.9, 0.95)) %>% + mutate(.pred_distn_value = .pred) %>% + select(geo_value, time_value, .pred, .pred_distn_value, .pred_distn_quantile_level) + out2 <- forecast(wf) %>% pivot_quantiles_longer(.pred_distn) %>% as_tibble() + expect_equal(out1, out2) }) test_that("Yeo-Johnson steps and layers invert each other when other_keys are present", { @@ -170,6 +177,7 @@ test_that("Yeo-Johnson steps and layers invert each other when other_keys are pr # Make sure that the inverse transformation works f <- frosting() %>% layer_predict() %>% + layer_residual_quantiles() %>% layer_epi_YeoJohnson(.pred) wf <- epi_workflow(r, linear_reg()) %>% fit(filtered_data) %>% From 2af8977f3f56aedf57c182f227eec409586ed23f Mon Sep 17 00:00:00 2001 From: Dmitry Shemetov Date: Thu, 10 Apr 2025 14:21:23 -0700 Subject: [PATCH 6/9] fix: move quantile_pred stuff to another branch --- NAMESPACE | 2 -- R/quantile_pred-methods.R | 16 ---------------- man/epipredict-vctrs.Rd | 13 ------------- man/layer_epi_YeoJohnson.Rd | 3 --- man/step_adjust_latency.Rd | 4 ++-- 5 files changed, 2 insertions(+), 36 deletions(-) delete mode 100644 man/epipredict-vctrs.Rd diff --git a/NAMESPACE b/NAMESPACE index 01ce353b..5fe9c99c 100644 --- a/NAMESPACE +++ b/NAMESPACE @@ -122,7 +122,6 @@ S3method(update,layer) S3method(vec_arith,quantile_pred) S3method(vec_arith.numeric,quantile_pred) S3method(vec_arith.quantile_pred,numeric) -S3method(vec_arith.quantile_pred,quantile_pred) S3method(vec_math,quantile_pred) S3method(vec_proxy_equal,quantile_pred) S3method(weighted_interval_score,quantile_pred) @@ -234,7 +233,6 @@ import(epidatasets) import(epiprocess) import(parsnip) import(recipes) -import(vctrs) importFrom(checkmate,assert_class) importFrom(checkmate,assert_numeric) importFrom(checkmate,test_character) diff --git a/R/quantile_pred-methods.R b/R/quantile_pred-methods.R index 16290d22..1ac4904d 100644 --- a/R/quantile_pred-methods.R +++ b/R/quantile_pred-methods.R @@ -286,12 +286,6 @@ vec_math.quantile_pred <- function(.fn, .x, ...) { quantile_pred(.fn(.x), quantile_levels) } -#' Internal vctrs methods -#' -#' @import vctrs -#' @keywords internal -#' @name epipredict-vctrs - #' @importFrom vctrs vec_arith vec_arith.numeric #' @export #' @method vec_arith quantile_pred @@ -300,16 +294,6 @@ vec_arith.quantile_pred <- function(op, x, y, ...) { } -#' @export -#' @method vec_arith.quantile_pred quantile_pred -vec_arith.quantile_pred.quantile_pred <- function(op, x, y, ...) { - all_quantiles <- unique(c(x %@% "quantile_levels", y %@% "quantile_levels")) - op_fn <- getExportedValue("base", op) - x <- quantile_pred(as.matrix(x), all_quantiles) - y <- quantile_pred(as.matrix(y), all_quantiles) - quantile_pred(op_fn(x %>% as.matrix(), y %>% as.matrix(), ...), all_quantiles) -} - #' @export #' @method vec_arith.quantile_pred numeric vec_arith.quantile_pred.numeric <- function(op, x, y, ...) { diff --git a/man/epipredict-vctrs.Rd b/man/epipredict-vctrs.Rd deleted file mode 100644 index a4dabbfa..00000000 --- a/man/epipredict-vctrs.Rd +++ /dev/null @@ -1,13 +0,0 @@ -% Generated by roxygen2: do not edit by hand -% Please edit documentation in R/quantile_pred-methods.R -\name{epipredict-vctrs} -\alias{epipredict-vctrs} -\alias{vec_arith.quantile_pred} -\title{Internal vctrs methods} -\usage{ -\method{vec_arith}{quantile_pred}(op, x, y, ...) -} -\description{ -Internal vctrs methods -} -\keyword{internal} diff --git a/man/layer_epi_YeoJohnson.Rd b/man/layer_epi_YeoJohnson.Rd index a6a0e061..0c35ee74 100644 --- a/man/layer_epi_YeoJohnson.Rd +++ b/man/layer_epi_YeoJohnson.Rd @@ -8,7 +8,6 @@ layer_epi_YeoJohnson( frosting, ..., yj_params = NULL, - by = NULL, id = rand_id("epi_YeoJohnson") ) } @@ -29,8 +28,6 @@ example where you wouldn't need to set this, if your output is other columns), then you wouldn't need to set this. However if you have renamed your output column to \code{diff_7}, then you will need to extract the \code{yj_params} from the step.} -\item{by}{A (possibly named) character vector of variables to join by.} - \item{id}{a random id string} } \value{ diff --git a/man/step_adjust_latency.Rd b/man/step_adjust_latency.Rd index 9e1bafbd..685f806b 100644 --- a/man/step_adjust_latency.Rd +++ b/man/step_adjust_latency.Rd @@ -267,8 +267,8 @@ while this will not: \if{html}{\out{
}}\preformatted{toy_recipe <- epi_recipe(toy_df) \%>\% step_epi_lag(a, lag=0) \%>\% step_adjust_latency(a, method = "extend_lags") -#> Warning: If `method` is "extend_lags" or "locf", then the previous `step_epi_lag`s won't work with -#> modified data. +#> Warning: If `method` is "extend_lags" or "locf", then the previous `step_epi_lag`s won't work +#> with modified data. }\if{html}{\out{
}} If you create columns that you then apply lags to (such as From a58bfb9e9077b54e1dae9293ef6171de90916aa2 Mon Sep 17 00:00:00 2001 From: Dmitry Shemetov Date: Thu, 10 Apr 2025 14:30:35 -0700 Subject: [PATCH 7/9] a --- tests/testthat/test-yeo-johnson.R | 3 +-- 1 file changed, 1 insertion(+), 2 deletions(-) diff --git a/tests/testthat/test-yeo-johnson.R b/tests/testthat/test-yeo-johnson.R index 5d05a593..93d3bcf6 100644 --- a/tests/testthat/test-yeo-johnson.R +++ b/tests/testthat/test-yeo-johnson.R @@ -16,8 +16,7 @@ test_that("Yeo-Johnson transformation inverts correctly", { # also works on quantile distributions x <- quantile_pred(matrix(c(-5, 1, 3, 0, 0.1, 0.5), nrow = 2, byrow = TRUE), c(0.01, 0.5, 0.7)) - x_back <- - map( + x_back <- map( lambdas, \(lambda) mean(abs(yj_inverse(yj_transform(x, lambda), lambda) - x)) < 1e-5 ) From 89b643fb53c23dd0ade42819113b6ba924d7e39a Mon Sep 17 00:00:00 2001 From: Dmitry Shemetov Date: Thu, 10 Apr 2025 14:16:27 -0700 Subject: [PATCH 8/9] feat: quantile_pred and quantile_pred arithmetic --- NAMESPACE | 1 + R/quantile_pred-methods.R | 18 ++++++++++++++++++ tests/testthat/test-quantile_pred.R | 25 +++++++++++++++++++++++++ 3 files changed, 44 insertions(+) diff --git a/NAMESPACE b/NAMESPACE index 5fe9c99c..efbe0981 100644 --- a/NAMESPACE +++ b/NAMESPACE @@ -233,6 +233,7 @@ import(epidatasets) import(epiprocess) import(parsnip) import(recipes) +import(vctrs) importFrom(checkmate,assert_class) importFrom(checkmate,assert_numeric) importFrom(checkmate,test_character) diff --git a/R/quantile_pred-methods.R b/R/quantile_pred-methods.R index 1ac4904d..56e8fcf0 100644 --- a/R/quantile_pred-methods.R +++ b/R/quantile_pred-methods.R @@ -286,6 +286,12 @@ vec_math.quantile_pred <- function(.fn, .x, ...) { quantile_pred(.fn(.x), quantile_levels) } +#' Internal vctrs methods +#' +#' @import vctrs +#' @keywords internal +#' @name epipredict-vctrs + #' @importFrom vctrs vec_arith vec_arith.numeric #' @export #' @method vec_arith quantile_pred @@ -294,6 +300,18 @@ vec_arith.quantile_pred <- function(op, x, y, ...) { } +#' @export +#' @method vec_arith.quantile_pred quantile_pred +vec_arith.quantile_pred.quantile_pred <- function(op, x, y, ...) { + all_quantiles <- unique(c(x %@% "quantile_levels", y %@% "quantile_levels")) + op_fn <- getExportedValue("base", op) + # Interpolate/extrapolate to the same quantiles + x <- quantile.quantile_pred(x, all_quantiles) + y <- quantile.quantile_pred(y, all_quantiles) + out <- op_fn(x, y, ...) + quantile_pred(out, all_quantiles) +} + #' @export #' @method vec_arith.quantile_pred numeric vec_arith.quantile_pred.numeric <- function(op, x, y, ...) { diff --git a/tests/testthat/test-quantile_pred.R b/tests/testthat/test-quantile_pred.R index 70d7c71a..f5097c1d 100644 --- a/tests/testthat/test-quantile_pred.R +++ b/tests/testthat/test-quantile_pred.R @@ -81,6 +81,7 @@ test_that("unary math works on quantiles", { }) test_that("arithmetic works on quantiles", { + # Quantile and numeric arithmetic works dstn <- hardhat::quantile_pred( matrix(c(1:4, 8:11), nrow = 2, byrow = TRUE), 1:4 / 5 @@ -100,4 +101,28 @@ test_that("arithmetic works on quantiles", { expect_identical((1 / 4) * dstn, dstn2) expect_snapshot(error = TRUE, sum(dstn)) + + # Quantile and quantile arithmetic works + val <- c(1:4, 8:11) + dstn3 <- hardhat::quantile_pred( + matrix(val, nrow = 2, byrow = TRUE), + 1:4 / 5 + ) + dstn4 <- hardhat::quantile_pred( + matrix(val + 2 * val, nrow = 2, byrow = TRUE), + 1:4 / 5 + ) + expect_identical(dstn3 + (2 * dstn3), dstn4) + + # Extrapolate when quantile_levels are not the same + val <- c(1:4, 8:11) + dstn5 <- hardhat::quantile_pred( + matrix(val, nrow = 2, byrow = TRUE), + c(0.1, 0.25, 0.5, 0.75) + ) + dstn6 <- hardhat::quantile_pred( + matrix(val, nrow = 2, byrow = TRUE), + c(0.25, 0.5, 0.75, 0.9) + ) + expect_identical((dstn5 + dstn6) %@% "quantile_levels", c(0.1, 0.25, 0.5, 0.75, 0.9)) }) From e226fc56bc365837ac0b095e077f632b1231fb3b Mon Sep 17 00:00:00 2001 From: Dmitry Shemetov Date: Thu, 10 Apr 2025 14:46:55 -0700 Subject: [PATCH 9/9] fix: rlang calls and vector arith --- NAMESPACE | 1 + R/layer_yeo_johnson.R | 6 +++--- R/step_yeo_johnson.R | 17 ++++++++--------- man/epipredict-vctrs.Rd | 13 +++++++++++++ tests/testthat/_snaps/yeo-johnson.md | 16 ++++++++++++++++ tests/testthat/test-yeo-johnson.R | 4 ++-- 6 files changed, 43 insertions(+), 14 deletions(-) create mode 100644 man/epipredict-vctrs.Rd create mode 100644 tests/testthat/_snaps/yeo-johnson.md diff --git a/NAMESPACE b/NAMESPACE index efbe0981..01ce353b 100644 --- a/NAMESPACE +++ b/NAMESPACE @@ -122,6 +122,7 @@ S3method(update,layer) S3method(vec_arith,quantile_pred) S3method(vec_arith.numeric,quantile_pred) S3method(vec_arith.quantile_pred,numeric) +S3method(vec_arith.quantile_pred,quantile_pred) S3method(vec_math,quantile_pred) S3method(vec_proxy_equal,quantile_pred) S3method(weighted_interval_score,quantile_pred) diff --git a/R/layer_yeo_johnson.R b/R/layer_yeo_johnson.R index 4590bae3..8d63b7f6 100644 --- a/R/layer_yeo_johnson.R +++ b/R/layer_yeo_johnson.R @@ -125,7 +125,7 @@ print.layer_epi_YeoJohnson <- function(x, width = max(20, options()$width - 30), # Inverse of `yj_transform` in step_yeo_johnson.R. yj_inverse <- function(x_in, lambda, eps = 0.001) { if (any(is.na(lambda))) { - cli::cli_abort("`lambda` cannot be `NA`.", call = rlang::caller_fn()) + cli::cli_abort("`lambda` cannot be `NA`.", call = rlang::caller_call()) } x_lambda <- yj_input_type_management(x_in, lambda) x <- x_lambda[[1]] @@ -162,7 +162,7 @@ get_params_in_layer <- function(workflow, step_name = "epi_YeoJohnson", param_na full_step_name <- glue::glue("step_{step_name}") this_recipe <- hardhat::extract_recipe(workflow) if (!(this_recipe %>% recipes::detect_step(step_name))) { - cli_abort("`layer_{step_name}` requires `step_{step_name}` in the recipe.", call = rlang::caller_env()) + cli_abort("`layer_{step_name}` requires `step_{step_name}` in the recipe.", call = rlang::caller_call()) } outcomes <- workflows::extract_recipe(workflow)$term_info %>% @@ -172,7 +172,7 @@ get_params_in_layer <- function(workflow, step_name = "epi_YeoJohnson", param_na cli_abort( "`layer_{step_name}` doesn't support multiple output columns. This workflow produces {outcomes} as output columns.", - call = rlang::caller_env(), + call = rlang::caller_call(), class = "epipredict__layer_yeo_johnson_multi_outcome_error" ) } diff --git a/R/step_yeo_johnson.R b/R/step_yeo_johnson.R index adb29478..ff4016cd 100644 --- a/R/step_yeo_johnson.R +++ b/R/step_yeo_johnson.R @@ -188,8 +188,7 @@ bake.step_epi_YeoJohnson <- function(object, new_data, ...) { # Check that the columns for transformation are present in new_data. if (!all(object$columns %in% colnames(new_data))) { cli::cli_abort( - "The columns for transformation are not present in the new data.", - call = rlang::caller_fn() + "The columns for transformation are not present in the new data." ) } # Check that the columns for transformation are present in new_data. @@ -202,7 +201,7 @@ bake.step_epi_YeoJohnson <- function(object, new_data, ...) { cli_abort(c( "Some variables used for training are not available in {.arg x}.", i = "The following required columns are missing: {check$missing_names}" - ), call = rlang::caller_fn()) + )) } # Transform each column, using the appropriate yj_param column per row. new_data <- left_join(new_data, object$yj_params, by = key_colnames(new_data, exclude = "time_value")) @@ -243,7 +242,7 @@ compute_yj_params <- function(training, col_names, limits, num_unique, na_fill, # x = "Yeo-Johnson parameter could not be estimated for some geos for {col}.", # i = "Using parameter={x$na_fill} in these cases." # ), - # call = rlang::caller_fn() + # call = rlang::caller_call() # ) # } # } @@ -266,7 +265,7 @@ yj_input_type_management <- function(x_in, lambda) { rep(dim(x)[[2]]) %>% matrix(dim(x)) } else if (length(x) != length(lambda)) { - cli::cli_abort("Length of `x` must be equal to length of `lambda`.", call = rlang::caller_fn()) + cli::cli_abort("Length of `x` must be equal to length of `lambda`.", call = rlang::caller_call(n = 2)) } } else if (!inherits(x_in, "tbl_df") || is.data.frame(x_in)) { x <- unlist(x_in, use.names = FALSE) @@ -282,7 +281,7 @@ yj_input_type_management <- function(x_in, lambda) { if (length(x) > 1 && length(lambda) == 1) { lambda <- rep(lambda, length(x)) } else if (length(x) != length(lambda)) { - cli::cli_abort("Length of `x` must be equal to length of `lambda`.", call = rlang::caller_fn()) + cli::cli_abort("Length of `x` must be equal to length of `lambda`.", call = rlang::caller_call(n = 2)) } list(x, lambda) } @@ -293,7 +292,7 @@ yj_input_type_management <- function(x_in, lambda) { # Yeo-Johnson transformation yj_transform <- function(x_in, lambda, ind_neg = NULL, eps = 0.001) { if (any(is.na(lambda))) { - cli::cli_abort("`lambda` cannot be `NA`.", call = rlang::caller_fn()) + cli::cli_abort("`lambda` cannot be `NA`.", call = rlang::caller_call()) } x_lambda <- yj_input_type_management(x_in, lambda) x <- x_lambda[[1]] @@ -340,7 +339,7 @@ yj_obj <- function(lam, dat, ind_neg, const) { } ## estimates the values -estimate_yj <- function(dat, limits = c(-5, 5), num_unique = 5, na_rm = TRUE, call = caller_env(2)) { +estimate_yj <- function(dat, limits = c(-5, 5), num_unique = 5, na_rm = TRUE) { na_rows <- which(is.na(dat)) if (length(na_rows) > 0) { if (na_rm) { @@ -351,7 +350,7 @@ estimate_yj <- function(dat, limits = c(-5, 5), num_unique = 5, na_rm = TRUE, ca x = "Missing values are not allowed for the YJ transformation.", i = "See {.arg na_rm} option." ), - call = call + call = rlang::caller_call(n = 2) ) } } diff --git a/man/epipredict-vctrs.Rd b/man/epipredict-vctrs.Rd new file mode 100644 index 00000000..a4dabbfa --- /dev/null +++ b/man/epipredict-vctrs.Rd @@ -0,0 +1,13 @@ +% Generated by roxygen2: do not edit by hand +% Please edit documentation in R/quantile_pred-methods.R +\name{epipredict-vctrs} +\alias{epipredict-vctrs} +\alias{vec_arith.quantile_pred} +\title{Internal vctrs methods} +\usage{ +\method{vec_arith}{quantile_pred}(op, x, y, ...) +} +\description{ +Internal vctrs methods +} +\keyword{internal} diff --git a/tests/testthat/_snaps/yeo-johnson.md b/tests/testthat/_snaps/yeo-johnson.md new file mode 100644 index 00000000..b3a42c24 --- /dev/null +++ b/tests/testthat/_snaps/yeo-johnson.md @@ -0,0 +1,16 @@ +# Yeo-Johnson transformation inverts correctly + + Code + yj_transform(x, c(1, 2, 3)) + Condition + Error: + ! Length of `x` must be equal to length of `lambda`. + +--- + + Code + yj_transform(list(1, 2), c(1, 2, 3)) + Condition + Error: + ! Length of `x` must be equal to length of `lambda`. + diff --git a/tests/testthat/test-yeo-johnson.R b/tests/testthat/test-yeo-johnson.R index 93d3bcf6..c0bd12bd 100644 --- a/tests/testthat/test-yeo-johnson.R +++ b/tests/testthat/test-yeo-johnson.R @@ -25,10 +25,10 @@ test_that("Yeo-Johnson transformation inverts correctly", { # Get coverage on yj_input_type_management # Breaks on bad length of lambda expect_snapshot(error = TRUE, - yj_input_type_management(x, c(1, 2, 3)) + yj_transform(x, c(1, 2, 3)) ) expect_snapshot(error = TRUE, - yj_input_type_management(list(1, 2), c(1, 2, 3)) + yj_transform(list(1, 2), c(1, 2, 3)) ) expect_true( identical(