Commit 5a5a24f

|> in backtesting, dropped a section in get started
1 parent cc1b515 commit 5a5a24f


2 files changed: +77 −41 lines

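The substantive change across `vignettes/backtesting.Rmd` is a switch from the magrittr pipe (`%>%`) to R's native pipe (`|>`). A minimal illustration of the equivalence, using `mtcars` rather than anything from the vignettes (purely illustrative, not taken from this commit):

```r
library(dplyr)

# magrittr pipe: provided via magrittr/dplyr
mtcars %>%
  filter(cyl == 4) %>%
  summarise(mean_mpg = mean(mpg))

# native pipe: built into base R (>= 4.1), no extra package needed
mtcars |>
  filter(cyl == 4) |>
  summarise(mean_mpg = mean(mpg))
```

Both forms give the same result here; for simple left-to-right chains the native pipe just drops the magrittr dependency.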

vignettes/backtesting.Rmd

Lines changed: 38 additions & 38 deletions
@@ -58,9 +58,9 @@ medical insurance claims and the number of new confirmed COVID-19 cases per
 
 ```{r grab-epi-data}
 # Select the `percent_cli` column from the data archive
-doctor_visits <- archive_cases_dv_subset$DT %>%
-  select(geo_value, time_value, version, percent_cli) %>%
-  drop_na(percent_cli) %>%
+doctor_visits <- archive_cases_dv_subset$DT |>
+  select(geo_value, time_value, version, percent_cli) |>
+  tidyr::drop_na(percent_cli) |>
   as_epi_archive(compactify = TRUE)
 ```
 
@@ -77,8 +77,8 @@ doctor_visits <- pub_covidcast(
   geo_values = "ca,fl,ny,tx",
   time_values = epirange(20200601, 20211201),
   issues = epirange(20200601, 20211201)
-) %>%
-  rename(version = issue, percent_cli = value) %>%
+) |>
+  rename(version = issue, percent_cli = value) |>
   as_epi_archive(compactify = TRUE)
 ```
 
@@ -99,20 +99,20 @@ percent_cli_data <- bind_rows(
   # Snapshotted data for the version-faithful forecasts
   map(
     forecast_dates,
-    ~ doctor_visits %>%
-      epix_as_of(.x) %>%
+    ~ doctor_visits |>
+      epix_as_of(.x) |>
       mutate(version = .x)
-  ) %>%
-    bind_rows() %>%
+  ) |>
+    bind_rows() |>
     mutate(version_faithful = TRUE),
   # Latest data for the version-faithless forecasts
-  doctor_visits %>%
-    epix_as_of(doctor_visits$versions_end) %>%
+  doctor_visits |>
+    epix_as_of(doctor_visits$versions_end) |>
     mutate(version_faithful = FALSE)
 )
 
 p0 <-
-  ggplot(data = percent_cli_data %>% filter(geo_value == geo_choose)) +
+  ggplot(data = percent_cli_data |> filter(geo_value == geo_choose)) +
   geom_vline(aes(color = factor(version), xintercept = version), lty = 2) +
   geom_line(
     aes(x = time_value, y = percent_cli, color = factor(version)),
@@ -154,9 +154,9 @@ of the red time-series to its left.
 In fact, if we take a snapshot and get the last `time_value`:
 
 ```{r}
-doctor_visits %>%
-  epix_as_of(as.Date("2020-08-01")) %>%
-  pull(time_value) %>%
+doctor_visits |>
+  epix_as_of(as.Date("2020-08-01")) |>
+  pull(time_value) |>
   max()
 ```
 
@@ -185,14 +185,14 @@ One way to do this is by setting the `.version` argument for `epix_slide()`:
 
 ```{r single_version, warn = FALSE}
 forecast_date <- as.Date("2021-04-06")
-forecasts <- doctor_visits %>%
+forecasts <- doctor_visits |>
   epix_slide(
     ~ arx_forecaster(
       .x,
       outcome = "percent_cli",
       predictors = "percent_cli",
       args_list = arx_args_list()
-    )$predictions %>%
+    )$predictions |>
       pivot_quantiles_wider(.pred_distn),
     .versions = forecast_date
   )
@@ -202,12 +202,12 @@ As truth data, we'll compare with the `epix_as_of()` to generate a snapshot of
 the archive at the last date[^1].
 
 ```{r compare_single_with_result}
-forecasts %>%
+forecasts |>
   inner_join(
-    doctor_visits %>%
+    doctor_visits |>
       epix_as_of(doctor_visits$versions_end),
     by = c("geo_value", "target_date" = "time_value")
-  ) %>%
+  ) |>
   select(geo_value, forecast_date, .pred, `0.05`, `0.95`, percent_cli)
 ```
 
@@ -227,9 +227,9 @@ This has the effect of simulating a data set that receives the final version
 updates every day.
 
 ```{r}
-archive_cases_dv_subset_faux <- doctor_visits %>%
-  epix_as_of(doctor_visits$versions_end) %>%
-  mutate(version = time_value) %>%
+archive_cases_dv_subset_faux <- doctor_visits |>
+  epix_as_of(doctor_visits$versions_end) |>
+  mutate(version = time_value) |>
   as_epi_archive()
 ```
 
@@ -251,10 +251,10 @@ forecast_wrapper <- function(
           lags = c(0:7, 14, 21),
           adjust_latency = "extend_ahead"
         )
-      )$predictions %>%
+      )$predictions |>
         pivot_quantiles_wider(.pred_distn)
     }
-  ) %>%
+  ) |>
     bind_rows()
 }
 ```
@@ -276,20 +276,20 @@ forecast_dates <- seq(
 )
 aheads <- c(1, 7, 14, 21, 28)
 
-version_faithless <- archive_cases_dv_subset_faux %>%
+version_faithless <- archive_cases_dv_subset_faux |>
   epix_slide(
     ~ forecast_wrapper(.x, aheads, "percent_cli", "percent_cli"),
     .before = 120,
     .versions = forecast_dates
-  ) %>%
+  ) |>
   mutate(version_faithful = FALSE)
 
-version_faithful <- doctor_visits %>%
+version_faithful <- doctor_visits |>
   epix_slide(
     ~ forecast_wrapper(.x, aheads, "percent_cli", "percent_cli"),
     .before = 120,
     .versions = forecast_dates
-  ) %>%
+  ) |>
   mutate(version_faithful = TRUE)
 
 forecasts <-
@@ -316,8 +316,8 @@ ny), we'll just display the results for two states, California (CA) and Florida
 
 ```{r plot_ca_forecasts, warning = FALSE}
 geo_choose <- "ca"
-forecasts_filtered <- forecasts %>%
-  filter(geo_value == geo_choose) %>%
+forecasts_filtered <- forecasts |>
+  filter(geo_value == geo_choose) |>
   mutate(time_value = version)
 
 p1 <- # first plotting the forecasts as bands, lines and points
@@ -326,10 +326,10 @@ p1 <- # first plotting the forecasts as bands, lines and points
   geom_line(aes(y = .pred, color = factor(time_value)), linetype = 2L) +
   geom_point(aes(y = .pred, color = factor(time_value)), size = 0.75) +
   # the forecast date
-  geom_vline(data = percent_cli_data %>% filter(geo_value == geo_choose) %>% select(-version_faithful), aes(color = factor(version), xintercept = version), lty = 2) +
+  geom_vline(data = percent_cli_data |> filter(geo_value == geo_choose) |> select(-version_faithful), aes(color = factor(version), xintercept = version), lty = 2) +
   # the underlying data
   geom_line(
-    data = percent_cli_data %>% filter(geo_value == geo_choose),
+    data = percent_cli_data |> filter(geo_value == geo_choose),
     aes(x = time_value, y = percent_cli, color = factor(version)),
     inherit.aes = FALSE, na.rm = TRUE
   ) +
@@ -342,8 +342,8 @@ p1 <- # first plotting the forecasts as bands, lines and points
 
 ```{r plot_fl_forecasts, warning = FALSE}
 geo_choose <- "fl"
-forecasts_filtered <- forecasts %>%
-  filter(geo_value == geo_choose) %>%
+forecasts_filtered <- forecasts |>
+  filter(geo_value == geo_choose) |>
   mutate(time_value = version)
 
 p2 <-
@@ -352,11 +352,11 @@ p2 <-
   geom_line(aes(y = .pred, color = factor(time_value)), linetype = 2L) +
   geom_point(aes(y = .pred, color = factor(time_value)), size = 0.75) +
   geom_vline(
-    data = percent_cli_data %>% filter(geo_value == geo_choose) %>% select(-version_faithful),
+    data = percent_cli_data |> filter(geo_value == geo_choose) |> select(-version_faithful),
     aes(color = factor(version), xintercept = version), lty = 2
   ) +
   geom_line(
-    data = percent_cli_data %>% filter(geo_value == geo_choose),
+    data = percent_cli_data |> filter(geo_value == geo_choose),
     aes(x = time_value, y = percent_cli, color = factor(version)),
     inherit.aes = FALSE, na.rm = TRUE
   ) +
@@ -398,7 +398,7 @@ p2
 
 
 [^1]: For forecasting a single day like this, we could have actually just used
-`doctor_visits %>% epix_as_of(forecast_date)` to get the relevant snapshot, and then fed that into `arx_forecaster()` as we did in the [landing
+`doctor_visits |> epix_as_of(forecast_date)` to get the relevant snapshot, and then fed that into `arx_forecaster()` as we did in the [landing
 page](../index.html#motivating-example).
 
 
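As a companion to footnote [^1] above, here is a minimal sketch of the single-date alternative it describes, reusing the vignette's own objects and arguments (this snippet itself is not part of the commit):

```r
# Assumes doctor_visits and forecast_date are defined as in the vignette above.
snapshot <- doctor_visits |> epix_as_of(forecast_date)

arx_forecaster(
  snapshot,
  outcome = "percent_cli",
  predictors = "percent_cli",
  args_list = arx_args_list()
)$predictions |>
  pivot_quantiles_wider(.pred_distn)
```

This produces one forecast for the single `forecast_date`; `epix_slide()` generalizes the same call across many versions.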
vignettes/epipredict.Rmd

Lines changed: 39 additions & 3 deletions
@@ -19,6 +19,7 @@ library(recipes)
 library(epidatasets)
 library(epipredict)
 library(ggplot2)
+library(purrr)
 forecast_date <- as.Date("2021-08-01")
 used_locations <- c("ca", "ma", "ny", "tx")
 library(epidatr)
@@ -331,8 +332,41 @@ autoplot(
 The 8 graphs are all pairs of the `geo_values` (`"Quebec"` and `"British Columbia"`), `edu_quals` (`"Undergraduate degree"` and `"Professional degree"`), and age brackets (`"15 to 34 years"` and `"35 to 64 years"`).
 
 ## Fitting a non-geo-pooled model
-The primary difference to avoid geo-pooling is to first `group_by(geo_value)`
-before forecasting
+
+Because our internal methods fit a single model, fitting a non-geo-pooled model
+(one with a separate fit for each geography) requires either a multi-level
+engine (which parsnip doesn't currently support) or mapping over
+geographies.
+
+```{r fit_non_geo_pooled, warning=FALSE}
+geo_values <- covid_case_death_rates |> pull(geo_value) |> unique()
+
+all_fits <-
+  purrr::map(geo_values, \(geo) {
+    covid_case_death_rates |>
+      filter(
+        geo_value == geo,
+        time_value <= forecast_date) |>
+      arx_forecaster(
+        outcome = "death_rate",
+        trainer = linear_reg(),
+        predictors = c("death_rate"),
+        args_list = arx_args_list(
+          lags = list(c(0, 7, 14)),
+          ahead = 14
+        )
+      )
+  })
+map_df(all_fits, ~ pluck(., "predictions"))
+```
+
+This approach is 56 times slower[^7] and fits each model on far less data.
+If the geographies are at all comparable, for example after normalization, we
+would get much better results by pooling.
+
+If we wanted to build a geo-aware model, such as one that sets the constant in a
+linear regression fit to be different for each geography, we would need to build a [Custom workflow](custom_epiworkflows) with geography as a factor.
+
 # Anatomy of a canned forecaster
 ## Code object
 Let's dissect the forecaster we trained back on the [landing
@@ -390,7 +424,7 @@ An `epi_workflow()` consists of 3 parts:
 5 of these as well. You can inspect the layers more closely by running
 `epipredict::extract_layers(four_week_ahead$epi_workflow)`.
 
-See the [Guts vignette](preprocessing-and-models) for recreating and then
+See the [Guts vignette](custom_epiworkflows) for recreating and then
 extending `four_week_ahead` using the custom forecaster framework.
 
 ## Mathematical description
@@ -436,3 +470,5 @@ without `NA` values is a training point to fit the coefficients $a_0,\ldots, a_6
 
 [^6]: alternatively, for an unfit version of the preprocessor, you can call
 `hardhat::extract_preprocessor(four_week_ahead$epi_workflow)`
+
+[^7]: the number of geographies
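On the geo-aware model mentioned at the end of the new section above (a per-geography intercept in a linear regression): a rough, untested sketch of the idea using an `epi_recipe()` with `geo_value` encoded as dummy variables, in the spirit of the custom-workflows vignette. The specific steps, roles, and arguments here are assumptions for illustration, not part of this commit:

```r
library(epipredict)
library(epidatasets)
library(parsnip)
library(recipes)

# Lag the predictor, set the 14-day-ahead target, and add one indicator
# column per geography so each geo_value gets its own intercept.
# (Depending on recipe roles, an update_role() call may be needed so the
# dummy columns are treated as predictors.)
rec <- epi_recipe(covid_case_death_rates) |>
  step_epi_lag(death_rate, lag = c(0, 7, 14)) |>
  step_epi_ahead(death_rate, ahead = 14) |>
  step_dummy(geo_value) |>
  step_epi_naomit()

geo_aware_fit <- epi_workflow(rec, linear_reg()) |>
  fit(covid_case_death_rates)
```

A full forecaster built this way would also need a `frosting()` postprocessor to turn predictions into the usual output format; see the custom_epiworkflows vignette for that machinery.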
