Skip to content

Commit 61d4e96

Browse files
committed
fix: wip on layer_yj edge cases
1 parent ce3b19d commit 61d4e96

File tree

4 files changed

+163
-148
lines changed

4 files changed

+163
-148
lines changed

R/new_epipredict_steps/layer_yeo_johnson.R

Lines changed: 44 additions & 18 deletions
Original file line numberDiff line numberDiff line change
@@ -70,7 +70,6 @@ layer_epi_YeoJohnson_new <- function(lambdas, by, terms, id) {
7070
slather.layer_epi_YeoJohnson <- function(object, components, workflow, new_data, ...) {
7171
rlang::check_dots_empty()
7272

73-
7473
# Get the lambdas from the layer or from the workflow.
7574
lambdas <- object$lambdas %||% get_lambdas_in_layer(workflow)
7675

@@ -108,20 +107,6 @@ slather.layer_epi_YeoJohnson <- function(object, components, workflow, new_data,
108107
hardhat::validate_column_names(components$predictions, joinby$x)
109108
hardhat::validate_column_names(lambdas, joinby$y)
110109

111-
# TODO: We don't do multiple outcomes, do we? Assume not for now.
112-
# Get the columns to transform. In components$predictions, the output is
113-
# .pred, so col_names should just be ".pred".
114-
exprs <- rlang::expr(c(!!!object$terms))
115-
pos <- tidyselect::eval_select(exprs, components$predictions)
116-
col_names <- names(pos)
117-
118-
# Get the outcome. `outcomes` is a vector of objects like ahead_1_cases,
119-
# ahead_7_cases, etc. We want to extract the cases part.
120-
outcome_col <- names(components$mold$outcomes) %>%
121-
stringr::str_extract("(?<=_)[^_]+$") %>%
122-
unique() %>%
123-
extract(1)
124-
125110
# Join the lambdas.
126111
components$predictions <- inner_join(
127112
components$predictions,
@@ -130,16 +115,57 @@ slather.layer_epi_YeoJohnson <- function(object, components, workflow, new_data,
130115
relationship = "many-to-one",
131116
unmatched = c("error", "drop")
132117
)
118+
119+
# TODO: There are many possibilities here:
120+
# - (a) the terms can be empty, where we should probably default to all_outcomes()
121+
# - (b) explicitly giving all_outcomes(), we end here with terms being empty
122+
# - (c) if the user just specifies .pred, then we have to infer the outcome from the mold
123+
# - (d) the user might specify outcomes of the form .pred_ahead_1_cases, .pred_ahead_7_cases, etc.
124+
# Get the columns to transform.
125+
exprs <- rlang::expr(c(!!!object$terms))
126+
pos <- tidyselect::eval_select(exprs, components$predictions)
127+
col_names <- names(pos)
128+
133129
# For every column, we need to use the appropriate lambda column, which differs per row.
134130
# Note that yj_inverse() is vectorized.
135-
for (col in col_names) {
131+
if (identical(col_names, ".pred")) {
132+
# In this case, we don't get a hint for the outcome column name, so we need to
133+
# infer it from the mold. `outcomes` is a vector of objects like
134+
# ahead_1_cases, ahead_7_cases, etc. We want to extract the cases part.
135+
outcome_cols <- names(components$mold$outcomes) %>%
136+
stringr::str_match("ahead_\\d+_(.*)") %>%
137+
extract(, 2)
138+
136139
components$predictions <- components$predictions %>%
137140
rowwise() %>%
138-
mutate(!!col := yj_inverse(!!sym(col), !!sym(paste0("lambda_", outcome_col))))
141+
mutate(.pred := yj_inverse(.pred, !!sym(paste0("lambda_", outcome_cols))))
142+
} else if (identical(col_names, character(0))) {
143+
# In this case, we should assume the user wants to transform all outcomes.
144+
cli::cli_abort("Not specifying columns to layer Yeo-Johnson is not implemented yet.", call = rlang::caller_env())
145+
} else {
146+
# In this case, we assume that the user has specified the columns they want
147+
# transformed here. We then need to determine the lambda columns for each of
148+
# these columns. That is, we need to convert a vector of column names like
149+
# c(".pred_ahead_1_case_rate", ".pred_ahead_7_case_rate") to
150+
# c("lambda_ahead_1_case_rate", "lambda_ahead_7_case_rate").
151+
original_outcome_cols <- str_match(col_names, ".pred_ahead_\\d+_(.*)")[, 2]
152+
if (all(original_outcome_cols %nin% names(components$mold$outcomes))) {
153+
cli_abort("All columns specified in `...` must be outcome columns.", call = rlang::caller_env())
154+
}
155+
156+
for (i in seq_along(col_names)) {
157+
col <- col_names[i]
158+
lambda_col <- paste0("lambda_", original_outcome_cols[i])
159+
components$predictions <- components$predictions %>%
160+
rowwise() %>%
161+
mutate(!!sym(col) := yj_inverse(!!sym(col), !!sym(lambda_col)))
162+
}
139163
}
164+
140165
# Remove the lambda columns.
141166
components$predictions <- components$predictions %>%
142-
select(-any_of(starts_with("lambda_")))
167+
select(-any_of(starts_with("lambda_"))) %>%
168+
ungroup()
143169
components
144170
}
145171

R/new_epipredict_steps/step_yeo_johnson.R

Lines changed: 1 addition & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -182,7 +182,7 @@ prep.step_epi_YeoJohnson <- function(x, training, info = NULL, ...) {
182182
num_unique = x$num_unique,
183183
na_rm = x$na_rm,
184184
epi_keys_checked = x$epi_keys_checked,
185-
forecast_date = epipredict:::get_forecast_date(training),
185+
forecast_date = attributes(training)$metadata$as_of,
186186
metadata = attributes(training)$metadata,
187187
columns = col_names,
188188
skip = x$skip,

test-yeo-johnson.Rmd

Lines changed: 27 additions & 76 deletions
Original file line numberDiff line numberDiff line change
@@ -1,10 +1,34 @@
11
---
22
title: "Yeo-Johnson Transformation Testing"
33
output: html_document
4+
editor_options:
5+
chunk_output_type: console
46
---
57

68
```{r setup, include=FALSE}
7-
knitr::opts_chunk$set(echo = TRUE)
9+
knitr::opts_chunk$set(
10+
digits = 3,
11+
comment = "#>",
12+
collapse = TRUE,
13+
cache = TRUE,
14+
dev.args = list(bg = "transparent"),
15+
dpi = 300,
16+
cache.lazy = FALSE,
17+
out.width = "90%",
18+
fig.align = "center",
19+
fig.width = 9,
20+
fig.height = 6
21+
)
22+
ggplot2::theme_set(ggplot2::theme_bw())
23+
options(
24+
dplyr.print_min = 6,
25+
dplyr.print_max = 6,
26+
pillar.max_footer_lines = 2,
27+
pillar.min_chars = 15,
28+
stringr.view_n = 6,
29+
pillar.bold = TRUE,
30+
width = 77
31+
)
832
suppressPackageStartupMessages(source(here::here("R", "load_all.R")))
933
```
1034

@@ -14,13 +38,9 @@ First, we'll set up the environment and load the necessary data:
1438

1539
```{r setup-env}
1640
# Simple case with keys = geo_value.
17-
jhu <- cases_deaths_subset %>%
41+
filtered_data <- cases_deaths_subset %>%
1842
filter(time_value > "2021-01-01", geo_value %in% c("ca", "ny")) %>%
1943
select(geo_value, time_value, cases)
20-
21-
# Load and prepare the data
22-
data <- jhu
23-
filtered_data <- data %>% select(geo_value, time_value, cases)
2444
```
2545

2646
## Yeo-Johnson Transformation
@@ -48,6 +68,7 @@ Now, let's compare the Yeo-Johnson transformation with a manual whitening approa
4868
out1 <- r %>% bake(filtered_data)
4969
out2 <- filtered_data %>%
5070
mutate(cases = (cases + 0.01)^(1 / 4))
71+
5172
filtered_data %>%
5273
mutate(cases = log(cases)) %>%
5374
ggplot(aes(time_value, cases)) +
@@ -59,74 +80,4 @@ filtered_data %>%
5980
facet_wrap(~geo_value, scales = "free_y") +
6081
theme_minimal() +
6182
labs(title = "Yeo-Johnson transformation", x = "Time", y = "Log Cases")
62-
63-
# Test filtered_data not an epi_df still works.
64-
filtered_data_not_epi_df <- filtered_data %>%
65-
as_tibble()
66-
r %>% bake(filtered_data_not_epi_df)
67-
```
68-
69-
## Workflow Testing
70-
71-
Finally, let's test the workflow with the Yeo-Johnson transformation:
72-
73-
```{r workflow-test}
74-
# debugonce(slather.layer_epi_YeoJohnson)
75-
data <- filtered_data
76-
77-
# Create and fit the workflow
78-
r <- epi_recipe(data) %>%
79-
step_epi_YeoJohnson(cases) %>%
80-
step_epi_lag(cases, lag = 0) %>%
81-
step_epi_ahead(cases, ahead = 0, role = "outcome") %>%
82-
step_epi_naomit()
83-
f <- frosting() %>%
84-
layer_predict() %>%
85-
layer_threshold(.pred) %>%
86-
layer_naomit(.pred) %>%
87-
layer_epi_YeoJohnson(.pred)
88-
89-
wf <- epi_workflow(r, linear_reg()) %>%
90-
fit(data) %>%
91-
add_frosting(f)
92-
93-
forecast(wf)
94-
data %>% slice_max(time_value, by = geo_value)
9583
```
96-
97-
```{r}
98-
library(epipredict)
99-
debugonce(epipredict:::slather.layer_population_scaling)
100-
jhu <- cases_deaths_subset %>%
101-
filter(time_value > "2021-11-01", geo_value %in% c("ca", "ny")) %>%
102-
select(geo_value, time_value, cases)
103-
104-
pop_data <- data.frame(states = c("ca", "ny"), value = c(20000, 30000))
105-
106-
r <- epi_recipe(jhu) %>%
107-
step_population_scaling(
108-
df = pop_data,
109-
df_pop_col = "value",
110-
by = c("geo_value" = "states"),
111-
cases, suffix = "_scaled"
112-
) %>%
113-
step_epi_lag(cases_scaled, lag = c(0, 7, 14)) %>%
114-
step_epi_ahead(cases_scaled, ahead = 7, role = "outcome") %>%
115-
step_epi_naomit()
116-
117-
f <- frosting() %>%
118-
layer_predict() %>%
119-
layer_threshold(.pred) %>%
120-
layer_naomit(.pred) %>%
121-
layer_population_scaling(.pred,
122-
df = pop_data,
123-
by = c("geo_value" = "states"),
124-
df_pop_col = "value"
125-
)
126-
127-
wf <- epi_workflow(r, linear_reg()) %>%
128-
fit(jhu) %>%
129-
add_frosting(f)
130-
131-
forecast(wf)
132-
```

tests/testthat/test-yeo-johnson.R

Lines changed: 91 additions & 53 deletions
Original file line numberDiff line numberDiff line change
@@ -1,65 +1,103 @@
11
suppressPackageStartupMessages(source(here::here("R", "load_all.R")))
22

3+
test_that("Yeo-Johnson transformation inverts correctly", {
4+
expect_true(
5+
map_lgl(seq(-5, 5, 0.1), function(lambda) {
6+
map_lgl(seq(0, 10, 0.1), \(x) abs(yj_inverse(yj_transform(x, lambda), lambda) - x) < 0.00001) %>% all()
7+
}) %>%
8+
all()
9+
)
10+
})
311

4-
# Real data test
5-
Sys.setenv(TAR_PROJECT = "flu_hosp_explore")
12+
test_that("Yeo-Johnson steps and layers invert each other", {
13+
jhu <- cases_deaths_subset %>%
14+
filter(time_value > "2021-01-01", geo_value %in% c("ca", "ny")) %>%
15+
select(geo_value, time_value, cases)
16+
filtered_data <- jhu
617

18+
# Get some lambda values
19+
r <- epi_recipe(filtered_data) %>%
20+
step_epi_YeoJohnson(cases) %>%
21+
step_epi_lag(cases, lag = 0) %>%
22+
step_epi_ahead(cases, ahead = 0, role = "outcome") %>%
23+
step_epi_naomit()
24+
tr <- r %>% prep(filtered_data)
725

8-
# Transform with Yeo-Johnson
9-
data <- tar_read(joined_archive_data) %>%
10-
epix_as_of(as.Date("2023-11-08"))
11-
state_geo_values <- data %>% filter(source == "nhsn") %>% pull(geo_value) %>% unique()
12-
filtered_data <- data %>%
13-
filter(geo_value %in% state_geo_values) %>%
14-
select(geo_value, source, time_value, hhs)
15-
r <- epi_recipe(filtered_data) %>%
16-
step_epi_YeoJohnson(hhs) %>%
17-
prep(filtered_data)
18-
r
19-
# Inspect the lambda values (a few states have default lambda = 0.25, because
20-
# they have issues)
21-
r$steps[[1]]$lambdas %>% print(n = 55)
22-
out1 <- r %>% bake(filtered_data)
26+
# Check general lambda values tibble structure
27+
expect_true("lambda_cases" %in% names(tr$steps[[1]]$lambdas))
28+
expect_true(is.numeric(tr$steps[[1]]$lambdas$lambda_cases))
29+
# Still works on a tibble
30+
expect_equal(
31+
tr %>% bake(filtered_data %>% as_tibble()),
32+
tr %>% bake(filtered_data)
33+
)
2334

24-
# Transform with manual whitening (quarter root scaling)
25-
# learned_params <- calculate_whitening_params(filtered_data, "hhs", scale_method = "none", center_method = "none", nonlin_method = "quart_root")
26-
out2 <- filtered_data %>%
27-
mutate(hhs = (hhs + 0.01)^(1 / 4))
35+
# Make sure that the inverse transformation works
36+
f <- frosting() %>%
37+
layer_predict() %>%
38+
layer_epi_YeoJohnson(.pred)
39+
wf <- epi_workflow(r, linear_reg()) %>%
40+
fit(filtered_data) %>%
41+
add_frosting(f)
42+
out1 <- filtered_data %>% as_tibble() %>% slice_max(time_value, by = geo_value)
43+
out2 <- forecast(wf) %>% rename(cases = .pred)
44+
expect_equal(out1, out2)
2845

29-
out1 %>%
30-
left_join(out2, by = c("geo_value", "source", "time_value")) %>%
31-
mutate(hhs_diff = hhs.x - hhs.y) %>%
32-
ggplot(aes(time_value, hhs_diff)) +
33-
geom_line() +
34-
facet_wrap(~geo_value, scales = "free_y") +
35-
theme_minimal() +
36-
labs(title = "Yeo-Johnson transformation", x = "Time", y = "HHS")
46+
# Make sure it works when there are multiple predictors and outcomes
47+
jhu_multi <- epidatasets::covid_case_death_rates_extended %>%
48+
filter(time_value > "2021-01-01", geo_value %in% c("ca", "ny")) %>%
49+
select(geo_value, time_value, case_rate, death_rate)
50+
filtered_data <- jhu_multi
51+
r <- epi_recipe(filtered_data) %>%
52+
step_epi_YeoJohnson(case_rate, death_rate) %>%
53+
step_epi_lag(case_rate, death_rate, lag = 0) %>%
54+
step_epi_ahead(case_rate, death_rate, ahead = 0, role = "outcome") %>%
55+
step_epi_naomit()
56+
tr <- r %>% prep(filtered_data)
3757

38-
# Plot the real data before and after transformation
39-
geo_filter <- "ca"
40-
filtered_data %>%
41-
filter(geo_value == geo_filter, source == "nhsn") %>%
42-
mutate(hhs = log(hhs)) %>%
43-
ggplot(aes(time_value, hhs)) +
44-
geom_line(color = "blue") +
45-
geom_line(data = out1 %>% filter(geo_value == geo_filter, source == "nhsn") %>% mutate(hhs = log(hhs)), aes(time_value, hhs), color = "green") +
46-
geom_line(data = out2 %>% filter(geo_value == geo_filter, source == "nhsn") %>% mutate(hhs = log(hhs)), aes(time_value, hhs), color = "red") +
47-
theme_minimal() +
48-
labs(title = "Yeo-Johnson transformation", x = "Time", y = "HHS")
58+
# Check general lambda values tibble structure
59+
expect_true("lambda_case_rate" %in% names(tr$steps[[1]]$lambdas))
60+
expect_true("lambda_death_rate" %in% names(tr$steps[[1]]$lambdas))
61+
expect_true(is.numeric(tr$steps[[1]]$lambdas$lambda_case_rate))
62+
expect_true(is.numeric(tr$steps[[1]]$lambdas$lambda_death_rate))
4963

64+
# TODO: Make sure that the inverse transformation works
65+
f <- frosting() %>%
66+
layer_predict() %>%
67+
layer_epi_YeoJohnson(.pred_ahead_0_case_rate)
68+
wf <- epi_workflow(r, linear_reg()) %>%
69+
fit(filtered_data) %>%
70+
add_frosting(f)
71+
out1 <- filtered_data %>% as_tibble() %>% slice_max(time_value, by = geo_value)
72+
# debugonce(slather.layer_epi_YeoJohnson)
73+
out2 <- forecast(wf) %>% rename(case_rate = .pred)
74+
expect_equal(out1, out2)
75+
})
5076

51-
# TODO: Test this.
52-
## Layer Yeo-Johnson2
53-
postproc <- frosting() %>%
54-
layer_epi_YeoJohnson()
77+
test_that("Yeo-Johnson steps and layers invert each other when other_keys are present", {
78+
jhu <- cases_deaths_subset %>%
79+
filter(time_value > "2021-01-01", geo_value %in% c("ca", "ny")) %>%
80+
select(geo_value, time_value, cases)
81+
filtered_data <- jhu
5582

56-
wf <- epi_workflow(r) %>%
57-
fit(data) %>%
58-
add_frosting(postproc)
83+
# Get some lambda values
84+
r <- epi_recipe(filtered_data) %>%
85+
step_epi_YeoJohnson(cases) %>%
86+
step_epi_lag(cases, lag = 0) %>%
87+
step_epi_ahead(cases, ahead = 0, role = "outcome") %>%
88+
step_epi_naomit()
89+
tr <- r %>% prep(filtered_data)
90+
# Check for fixed lambda values
91+
expect_true(all(near(tr$steps[[1]]$lambdas$lambda_cases, c(0.856, 0.207), tol = 0.001)))
5992

60-
61-
# Test inverse transformation
62-
map_lgl(seq(-5, 5, 0.1), function(lambda) {
63-
map_lgl(seq(0, 10, 0.1), \(x) abs(yj_inverse(yj_transform(x, lambda), lambda) - x) < 0.00001) %>% all()
64-
}) %>%
65-
all()
93+
# Make sure that the inverse transformation works
94+
f <- frosting() %>%
95+
layer_predict() %>%
96+
layer_epi_YeoJohnson(.pred)
97+
wf <- epi_workflow(r, linear_reg()) %>%
98+
fit(filtered_data) %>%
99+
add_frosting(f)
100+
out1 <- filtered_data %>% as_tibble() %>% slice_max(time_value, by = geo_value)
101+
out2 <- forecast(wf) %>% rename(cases = .pred)
102+
expect_equal(out1, out2)
103+
})

0 commit comments

Comments
 (0)