fix: wip on layer_yj edge cases

dshemetov · dshemetov · commit 61d4e96667f0 · 2025-03-17T17:50:21.000-07:00
diff --git a/R/new_epipredict_steps/layer_yeo_johnson.R b/R/new_epipredict_steps/layer_yeo_johnson.R
@@ -70,7 +70,6 @@ layer_epi_YeoJohnson_new <- function(lambdas, by, terms, id) {
 slather.layer_epi_YeoJohnson <- function(object, components, workflow, new_data, ...) {
   rlang::check_dots_empty()
 
-
   # Get the lambdas from the layer or from the workflow.
   lambdas <- object$lambdas %||% get_lambdas_in_layer(workflow)
 
@@ -108,20 +107,6 @@ slather.layer_epi_YeoJohnson <- function(object, components, workflow, new_data,
   hardhat::validate_column_names(components$predictions, joinby$x)
   hardhat::validate_column_names(lambdas, joinby$y)
 
-  # TODO: We don't do multiple outcomes, do we? Assume not for now.
-  # Get the columns to transform. In components$predictions, the output is
-  # .pred, so col_names should just be ".pred".
-  exprs <- rlang::expr(c(!!!object$terms))
-  pos <- tidyselect::eval_select(exprs, components$predictions)
-  col_names <- names(pos)
-
-  # Get the outcome. `outcomes` is a vector of objects like ahead_1_cases,
-  # ahead_7_cases, etc. We want to extract the cases part.
-  outcome_col <- names(components$mold$outcomes) %>%
-    stringr::str_extract("(?<=_)[^_]+$") %>%
-    unique() %>%
-    extract(1)
-
   # Join the lambdas.
   components$predictions <- inner_join(
     components$predictions,
@@ -130,16 +115,57 @@ slather.layer_epi_YeoJohnson <- function(object, components, workflow, new_data,
     relationship = "many-to-one",
     unmatched = c("error", "drop")
   )
+
+  # TODO: There are many possibilities here:
+  # - (a) the terms can be empty, where we should probably default to all_outcomes()
+  # - (b) if the user explicitly gives all_outcomes(), we also end up here with terms being empty
+  # - (c) if the user just specifies .pred, then we have to infer the outcome from the mold
+  # - (d) the user might specify outcomes of the form .pred_ahead_1_cases, .pred_ahead_7_cases, etc.
+  # Get the columns to transform.
+  exprs <- rlang::expr(c(!!!object$terms))
+  pos <- tidyselect::eval_select(exprs, components$predictions)
+  col_names <- names(pos)
+
   # For every column, we need to use the appropriate lambda column, which differs per row.
   # Note that yj_inverse() is vectorized.
-  for (col in col_names) {
+  if (identical(col_names, ".pred")) {
+    # In this case, we don't get a hint for the outcome column name, so we need to
+    # infer it from the mold. The column names of `outcomes` look like
+    # ahead_1_cases, ahead_7_cases, etc. We want to extract the "cases" part.
+    outcome_cols <- names(components$mold$outcomes) %>%
+      stringr::str_match("ahead_\\d+_(.*)") %>%
+      extract(, 2)
+
     components$predictions <- components$predictions %>%
       rowwise() %>%
-      mutate(!!col := yj_inverse(!!sym(col), !!sym(paste0("lambda_", outcome_col))))
+      mutate(.pred := yj_inverse(.pred, !!sym(paste0("lambda_", outcome_cols))))
+  } else if (identical(col_names, character(0))) {
+    # No columns were selected. This should eventually transform all outcomes, but for now we abort.
+    cli::cli_abort("Not specifying columns to layer Yeo-Johnson is not implemented yet.", call = rlang::caller_env())
+  } else {
+    # In this case, we assume that the user has specified the columns they want
+    # transformed here. We then need to determine the lambda columns for each of
+    # these columns. That is, we need to map prediction column names like
+    # c(".pred_ahead_1_case_rate", ".pred_ahead_7_case_rate") to the lambda column
+    # of the underlying signal, which here is "lambda_case_rate" for both.
+    original_outcome_cols <- str_match(col_names, ".pred_ahead_\\d+_(.*)")[, 2]
+    if (all(original_outcome_cols %nin% names(components$mold$outcomes))) {
+      cli_abort("All columns specified in `...` must be outcome columns.", call = rlang::caller_env())
+    }
+
+    for (i in seq_along(col_names)) {
+      col <- col_names[i]
+      lambda_col <- paste0("lambda_", original_outcome_cols[i])
+      components$predictions <- components$predictions %>%
+        rowwise() %>%
+        mutate(!!sym(col) := yj_inverse(!!sym(col), !!sym(lambda_col)))
+    }
   }
+
   # Remove the lambda columns.
   components$predictions <- components$predictions %>%
-    select(-any_of(starts_with("lambda_")))
+    select(-any_of(starts_with("lambda_"))) %>%
+    ungroup()
   components
 }
 
diff --git a/R/new_epipredict_steps/step_yeo_johnson.R b/R/new_epipredict_steps/step_yeo_johnson.R
@@ -182,7 +182,7 @@ prep.step_epi_YeoJohnson <- function(x, training, info = NULL, ...) {
     num_unique = x$num_unique,
     na_rm = x$na_rm,
     epi_keys_checked = x$epi_keys_checked,
-    forecast_date = epipredict:::get_forecast_date(training),
+    forecast_date = attributes(training)$metadata$as_of,
     metadata = attributes(training)$metadata,
     columns = col_names,
     skip = x$skip,
diff --git a/test-yeo-johnson.Rmd b/test-yeo-johnson.Rmd
@@ -1,10 +1,34 @@
 ---
 title: "Yeo-Johnson Transformation Testing"
 output: html_document
+editor_options:
+  chunk_output_type: console
 ---
 
 ```{r setup, include=FALSE}
-knitr::opts_chunk$set(echo = TRUE)
+knitr::opts_chunk$set(
+  digits = 3,
+  comment = "#>",
+  collapse = TRUE,
+  cache = TRUE,
+  dev.args = list(bg = "transparent"),
+  dpi = 300,
+  cache.lazy = FALSE,
+  out.width = "90%",
+  fig.align = "center",
+  fig.width = 9,
+  fig.height = 6
+)
+ggplot2::theme_set(ggplot2::theme_bw())
+options(
+  dplyr.print_min = 6,
+  dplyr.print_max = 6,
+  pillar.max_footer_lines = 2,
+  pillar.min_chars = 15,
+  stringr.view_n = 6,
+  pillar.bold = TRUE,
+  width = 77
+)
 suppressPackageStartupMessages(source(here::here("R", "load_all.R")))
 ```
 
@@ -14,13 +38,9 @@ First, we'll set up the environment and load the necessary data:
 
 ```{r setup-env}
 # Simple case with keys = geo_value.
-jhu <- cases_deaths_subset %>%
+filtered_data <- cases_deaths_subset %>%
   filter(time_value > "2021-01-01", geo_value %in% c("ca", "ny")) %>%
   select(geo_value, time_value, cases)
-
-# Load and prepare the data
-data <- jhu
-filtered_data <- data %>% select(geo_value, time_value, cases)
 ```
 
 ## Yeo-Johnson Transformation
@@ -48,6 +68,7 @@ Now, let's compare the Yeo-Johnson transformation with a manual whitening approa
 out1 <- r %>% bake(filtered_data)
 out2 <- filtered_data %>%
   mutate(cases = (cases + 0.01)^(1 / 4))
+
 filtered_data %>%
   mutate(cases = log(cases)) %>%
   ggplot(aes(time_value, cases)) +
@@ -59,74 +80,4 @@ filtered_data %>%
   facet_wrap(~geo_value, scales = "free_y") +
   theme_minimal() +
   labs(title = "Yeo-Johnson transformation", x = "Time", y = "Log Cases")
-
-# Test filtered_data not an epi_df still works.
-filtered_data_not_epi_df <- filtered_data %>%
-  as_tibble()
-r %>% bake(filtered_data_not_epi_df)
-```
-
-## Workflow Testing
-
-Finally, let's test the workflow with the Yeo-Johnson transformation:
-
-```{r workflow-test}
-# debugonce(slather.layer_epi_YeoJohnson)
-data <- filtered_data
-
-# Create and fit the workflow
-r <- epi_recipe(data) %>%
-  step_epi_YeoJohnson(cases) %>%
-  step_epi_lag(cases, lag = 0) %>%
-  step_epi_ahead(cases, ahead = 0, role = "outcome") %>%
-  step_epi_naomit()
-f <- frosting() %>%
-  layer_predict() %>%
-  layer_threshold(.pred) %>%
-  layer_naomit(.pred) %>%
-  layer_epi_YeoJohnson(.pred)
-
-wf <- epi_workflow(r, linear_reg()) %>%
-  fit(data) %>%
-  add_frosting(f)
-
-forecast(wf)
-data %>% slice_max(time_value, by = geo_value)
 ```
-
-```{r}
-library(epipredict)
-debugonce(epipredict:::slather.layer_population_scaling)
-jhu <- cases_deaths_subset %>%
-  filter(time_value > "2021-11-01", geo_value %in% c("ca", "ny")) %>%
-  select(geo_value, time_value, cases)
-
-pop_data <- data.frame(states = c("ca", "ny"), value = c(20000, 30000))
-
-r <- epi_recipe(jhu) %>%
-  step_population_scaling(
-    df = pop_data,
-    df_pop_col = "value",
-    by = c("geo_value" = "states"),
-    cases, suffix = "_scaled"
-  ) %>%
-  step_epi_lag(cases_scaled, lag = c(0, 7, 14)) %>%
-  step_epi_ahead(cases_scaled, ahead = 7, role = "outcome") %>%
-  step_epi_naomit()
-
-f <- frosting() %>%
-  layer_predict() %>%
-  layer_threshold(.pred) %>%
-  layer_naomit(.pred) %>%
-  layer_population_scaling(.pred,
-    df = pop_data,
-    by = c("geo_value" = "states"),
-    df_pop_col = "value"
-  )
-
-wf <- epi_workflow(r, linear_reg()) %>%
-  fit(jhu) %>%
-  add_frosting(f)
-
-forecast(wf)
-```
diff --git a/tests/testthat/test-yeo-johnson.R b/tests/testthat/test-yeo-johnson.R
@@ -1,65 +1,103 @@
 suppressPackageStartupMessages(source(here::here("R", "load_all.R")))
 
+test_that("Yeo-Johnson transformation inverts correctly", {
+  expect_true(
+    map_lgl(seq(-5, 5, 0.1), function(lambda) {
+      map_lgl(seq(0, 10, 0.1), \(x) abs(yj_inverse(yj_transform(x, lambda), lambda) - x) < 0.00001) %>% all()
+    }) %>%
+      all()
+  )
+})
 
-# Real data test
-Sys.setenv(TAR_PROJECT = "flu_hosp_explore")
+test_that("Yeo-Johnson steps and layers invert each other", {
+  jhu <- cases_deaths_subset %>%
+    filter(time_value > "2021-01-01", geo_value %in% c("ca", "ny")) %>%
+    select(geo_value, time_value, cases)
+  filtered_data <- jhu
 
+  # Get some lambda values
+  r <- epi_recipe(filtered_data) %>%
+    step_epi_YeoJohnson(cases) %>%
+    step_epi_lag(cases, lag = 0) %>%
+    step_epi_ahead(cases, ahead = 0, role = "outcome") %>%
+    step_epi_naomit()
+  tr <- r %>% prep(filtered_data)
 
-# Transform with Yeo-Johnson
-data <- tar_read(joined_archive_data) %>%
-  epix_as_of(as.Date("2023-11-08"))
-state_geo_values <- data %>% filter(source == "nhsn") %>% pull(geo_value) %>% unique()
-filtered_data <- data %>%
-  filter(geo_value %in% state_geo_values) %>%
-  select(geo_value, source, time_value, hhs)
-r <- epi_recipe(filtered_data) %>%
-  step_epi_YeoJohnson(hhs) %>%
-  prep(filtered_data)
-r
-# Inspect the lambda values (a few states have default lambda = 0.25, because
-# they have issues)
-r$steps[[1]]$lambdas %>% print(n = 55)
-out1 <- r %>% bake(filtered_data)
+  # Check general lambda values tibble structure
+  expect_true("lambda_cases" %in% names(tr$steps[[1]]$lambdas))
+  expect_true(is.numeric(tr$steps[[1]]$lambdas$lambda_cases))
+  # Still works on a tibble
+  expect_equal(
+    tr %>% bake(filtered_data %>% as_tibble()),
+    tr %>% bake(filtered_data)
+  )
 
-# Transform with manual whitening (quarter root scaling)
-# learned_params <- calculate_whitening_params(filtered_data, "hhs", scale_method = "none", center_method = "none", nonlin_method = "quart_root")
-out2 <- filtered_data %>%
-  mutate(hhs = (hhs + 0.01)^(1 / 4))
+  # Make sure that the inverse transformation works
+  f <- frosting() %>%
+    layer_predict() %>%
+    layer_epi_YeoJohnson(.pred)
+  wf <- epi_workflow(r, linear_reg()) %>%
+    fit(filtered_data) %>%
+    add_frosting(f)
+  out1 <- filtered_data %>% as_tibble() %>% slice_max(time_value, by = geo_value)
+  out2 <- forecast(wf) %>% rename(cases = .pred)
+  expect_equal(out1, out2)
 
-out1 %>%
-  left_join(out2, by = c("geo_value", "source", "time_value")) %>%
-  mutate(hhs_diff = hhs.x - hhs.y) %>%
-  ggplot(aes(time_value, hhs_diff)) +
-  geom_line() +
-  facet_wrap(~geo_value, scales = "free_y") +
-  theme_minimal() +
-  labs(title = "Yeo-Johnson transformation", x = "Time", y = "HHS")
+  # Make sure it works when there are multiple predictors and outcomes
+  jhu_multi <- epidatasets::covid_case_death_rates_extended %>%
+    filter(time_value > "2021-01-01", geo_value %in% c("ca", "ny")) %>%
+    select(geo_value, time_value, case_rate, death_rate)
+  filtered_data <- jhu_multi
+  r <- epi_recipe(filtered_data) %>%
+    step_epi_YeoJohnson(case_rate, death_rate) %>%
+    step_epi_lag(case_rate, death_rate, lag = 0) %>%
+    step_epi_ahead(case_rate, death_rate, ahead = 0, role = "outcome") %>%
+    step_epi_naomit()
+  tr <- r %>% prep(filtered_data)
 
-# Plot the real data before and after transformation
-geo_filter <- "ca"
-filtered_data %>%
-  filter(geo_value == geo_filter, source == "nhsn") %>%
-  mutate(hhs = log(hhs)) %>%
-  ggplot(aes(time_value, hhs)) +
-  geom_line(color = "blue") +
-  geom_line(data = out1 %>% filter(geo_value == geo_filter, source == "nhsn") %>% mutate(hhs = log(hhs)), aes(time_value, hhs), color = "green") +
-  geom_line(data = out2 %>% filter(geo_value == geo_filter, source == "nhsn") %>% mutate(hhs = log(hhs)), aes(time_value, hhs), color = "red") +
-  theme_minimal() +
-  labs(title = "Yeo-Johnson transformation", x = "Time", y = "HHS")
+  # Check general lambda values tibble structure
+  expect_true("lambda_case_rate" %in% names(tr$steps[[1]]$lambdas))
+  expect_true("lambda_death_rate" %in% names(tr$steps[[1]]$lambdas))
+  expect_true(is.numeric(tr$steps[[1]]$lambdas$lambda_case_rate))
+  expect_true(is.numeric(tr$steps[[1]]$lambdas$lambda_death_rate))
 
+  # TODO: Make sure that the inverse transformation works
+  f <- frosting() %>%
+    layer_predict() %>%
+    layer_epi_YeoJohnson(.pred_ahead_0_case_rate)
+  wf <- epi_workflow(r, linear_reg()) %>%
+    fit(filtered_data) %>%
+    add_frosting(f)
+  out1 <- filtered_data %>% as_tibble() %>% slice_max(time_value, by = geo_value)
+  # debugonce(slather.layer_epi_YeoJohnson)
+  out2 <- forecast(wf) %>% rename(case_rate = .pred)
+  expect_equal(out1, out2)
+})
 
-# TODO: Test this.
-## Layer Yeo-Johnson2
-postproc <- frosting() %>%
-  layer_epi_YeoJohnson()
+test_that("Yeo-Johnson steps and layers invert each other when other_keys are present", {
+  jhu <- cases_deaths_subset %>%
+    filter(time_value > "2021-01-01", geo_value %in% c("ca", "ny")) %>%
+    select(geo_value, time_value, cases)
+  filtered_data <- jhu
 
-wf <- epi_workflow(r) %>%
-  fit(data) %>%
-  add_frosting(postproc)
+  # Get some lambda values
+  r <- epi_recipe(filtered_data) %>%
+    step_epi_YeoJohnson(cases) %>%
+    step_epi_lag(cases, lag = 0) %>%
+    step_epi_ahead(cases, ahead = 0, role = "outcome") %>%
+    step_epi_naomit()
+  tr <- r %>% prep(filtered_data)
+  # Check for fixed lambda values
+  expect_true(all(near(tr$steps[[1]]$lambdas$lambda_cases, c(0.856, 0.207), tol = 0.001)))
 
-
-# Test inverse transformation
-map_lgl(seq(-5, 5, 0.1), function(lambda) {
-  map_lgl(seq(0, 10, 0.1), \(x) abs(yj_inverse(yj_transform(x, lambda), lambda) - x) < 0.00001) %>% all()
-}) %>%
-  all()
+  # Make sure that the inverse transformation works
+  f <- frosting() %>%
+    layer_predict() %>%
+    layer_epi_YeoJohnson(.pred)
+  wf <- epi_workflow(r, linear_reg()) %>%
+    fit(filtered_data) %>%
+    add_frosting(f)
+  out1 <- filtered_data %>% as_tibble() %>% slice_max(time_value, by = geo_value)
+  out2 <- forecast(wf) %>% rename(cases = .pred)
+  expect_equal(out1, out2)
+})