Skip to content

Commit 2a3a60a

Browse files
committed
functional, if not helpful
1 parent 0e83b1a commit 2a3a60a

File tree

3 files changed

+153
-81
lines changed

3 files changed

+153
-81
lines changed

R/aux_data_utils.R

Lines changed: 5 additions & 3 deletions
Original file line numberDiff line numberDiff line change
@@ -710,13 +710,15 @@ create_nhsn_data_archive <- function(disease_name) {
710710
}
711711

712712

713-
up_to_date_nssp_state_archive <- function() {
713+
up_to_date_nssp_state_archive <- function(disease = c("covid", "influenza")) {
714+
disease <- arg_match(disease)
714715
nssp_state <- pub_covidcast(
715716
source = "nssp",
716-
signal = "pct_ed_visits_influenza",
717+
signal = glue::glue("pct_ed_visits_{disease}"),
717718
time_type = "week",
718719
geo_type = "state",
719-
geo_values = "*"
720+
geo_values = "*",
721+
issues = "*"
720722
)
721723
nssp_state %>%
722724
select(geo_value, time_value, issue, nssp = value) %>%

scripts/covid_hosp_prod.R

Lines changed: 85 additions & 33 deletions
Original file line numberDiff line numberDiff line change
@@ -15,35 +15,83 @@ forecast_generation_date <- Sys.Date()
1515
# Usually, the forecast_date is the same as the generation date, but you can
1616
# override this. It should be a Wednesday.
1717
forecast_date <- round_date(forecast_generation_date, "weeks", week_start = 3)
18-
# If doing backfill, you can set the forecast_date to a sequence of dates.
19-
# forecast_date <- seq.Date(as.Date("2024-11-20"), Sys.Date(), by = 7L)
2018
# forecast_generation_date needs to follow suit, but it's more complicated
2119
# because sometimes we forecast on Thursday.
22-
# forecast_generation_date <- c(as.Date(c("2024-11-21", "2024-11-27", "2024-12-04", "2024-12-11", "2024-12-18", "2024-12-26", "2025-01-02")), seq.Date(as.Date("2025-01-08"), Sys.Date(), by = 7L))
20+
#forecast_generation_date <- c(as.Date(c("2024-11-20", "2024-11-27", "2024-12-04", "2024-12-11", "2024-12-18", "2024-12-26", "2025-01-02")), seq.Date(as.Date("2025-01-08"), Sys.Date(), by = 7L))
21+
# If doing backfill, you can set the forecast_date to a sequence of dates.
22+
#forecast_date <- seq.Date(as.Date("2024-11-20"), Sys.Date(), by = 7L)
2323

2424
forecaster_fns <- list2(
25-
linear = function(...) {
26-
forecaster_baseline_linear(..., residual_tail = 0.97, residual_center = 0.097, no_intercept = TRUE)
25+
linear = function(epi_data, ahead, extra_data, ...) {
26+
forecaster_baseline_linear(epi_data, ahead, ..., residual_tail = 0.97, residual_center = 0.097, no_intercept = TRUE)
2727
},
28-
# linearlog = function(...) {
28+
# linearlog = function(epi_data, ahead, extra_data, ...) {
2929
# forecaster_baseline_linear(..., log = TRUE)
3030
# },
31-
climate_base = function(...) {
31+
climate_base = function(epi_data, ahead, extra_data, ...) {
3232
climatological_model(
33-
...,
33+
epi_data, ahead, ...,
3434
)
3535
},
36-
climate_geo_agged = function(...) {
36+
climate_geo_agged = function(epi_data, ahead, extra_data, ...) {
3737
climatological_model(
38-
...,
38+
epi_data, ahead, ...,
3939
geo_agg = TRUE
4040
)
4141
},
42+
windowed_seasonal_extra_sources = function(epi_data, ahead, extra_data, ...) {
43+
fcst <-
44+
epi_data %>%
45+
left_join(extra_data, by = join_by(geo_value, time_value)) %>%
46+
scaled_pop_seasonal(
47+
outcome = "value",
48+
ahead = ahead * 7,
49+
extra_sources = "nssp",
50+
...,
51+
seasonal_method = "window",
52+
trainer = epipredict::quantile_reg(),
53+
drop_non_seasons = TRUE,
54+
pop_scaling = FALSE,
55+
lags = list(c(0, 7), c(0, 7))
56+
) %>%
57+
mutate(target_end_date = target_end_date + 3)
58+
fcst
59+
}
4260
)
4361

4462
rlang::list2(
4563
tar_target(aheads, command = -1:3),
4664
tar_target(forecasters, command = seq_along(forecaster_fns)),
65+
tar_target(
66+
download_latest,
67+
command = {
68+
if (wday(Sys.Date()) < 6 & wday(Sys.Date()) > 3) {
69+
# download from the preliminary data source from Wednesday to Friday
70+
most_recent_result <- readr::read_csv("https://data.cdc.gov/resource/mpgq-jmmr.csv?$limit=20000&$select=weekendingdate,jurisdiction,totalconfc19newadm,totalconfflunewadm")
71+
} else {
72+
most_recent_result <- readr::read_csv("https://data.cdc.gov/resource/ua7e-t2fy.csv?$limit=20000&$select=weekendingdate,jurisdiction,totalconfc19newadm,totalconfflunewadm")
73+
}
74+
most_recent_result <-
75+
most_recent_result %>%
76+
process_nhsn_data() %>%
77+
filter(disease == "nhsn_covid") %>%
78+
select(-disease) %>%
79+
filter(geo_value %nin% insufficient_data_geos)
80+
# if there's not already a result we need to save it no matter what
81+
if (file.exists(here::here(".nhsn_covid_cache.parquet"))) {
82+
previous_result <- qs::qread(here::here(".nhsn_covid_cache.parquet"))
83+
} else
84+
# if something is different, update the file
85+
if (any(previous_result != most_recent_result)) {
86+
qs::qsave(most_recent_result, here::here(".nhsn_covid_cache.parquet"))
87+
} else {
88+
qs::qsave(most_recent_result, here::here(".nhsn_covid_cache.parquet"))
89+
}
90+
NULL
91+
},
92+
description = "Download the result, and update the file only if it's actually different",
93+
priority = 1
94+
),
4795
tar_target(
4896
nhsn_latest_data,
4997
command = {
@@ -59,14 +107,19 @@ rlang::list2(
59107
select(-disease) %>%
60108
filter(geo_value %nin% insufficient_data_geos)
61109
},
62-
cue = tar_cue(mode = "always")
63110
),
64111
tar_target(
65112
name = nhsn_archive_data,
66113
command = {
67114
create_nhsn_data_archive(disease = "nhsn_covid")
68115
}
69116
),
117+
tar_target(
118+
current_nssp_archive,
119+
command = {
120+
up_to_date_nssp_state_archive("covid")
121+
}
122+
),
70123
tar_map(
71124
values = tidyr::expand_grid(
72125
tibble(
@@ -84,7 +137,6 @@ rlang::list2(
84137
}
85138
geo_forecasters_weights
86139
},
87-
cue = tar_cue(mode = "always")
88140
),
89141
tar_target(
90142
name = geo_exclusions,
@@ -109,29 +161,31 @@ rlang::list2(
109161
data_substitutions(disease = "covid") %>%
110162
as_epi_df(as_of = as.Date(forecast_date_int))
111163
}
164+
nssp <- current_nssp_archive %>% epix_as_of(min(forecast_date, current_nssp_archive$versions_end))
112165
attributes(train_data)$metadata$as_of <- as.Date(forecast_date_int)
113166
train_data %>%
114-
forecaster_fns[[forecasters]](ahead = aheads) %>%
167+
forecaster_fns[[forecasters]](ahead = aheads, extra_data = nssp) %>%
115168
mutate(
116169
forecaster = names(forecaster_fns[forecasters]),
117170
geo_value = as.factor(geo_value)
118171
)
119172
},
120-
pattern = cross(aheads, forecasters),
121-
cue = tar_cue(mode = "always")
173+
pattern = cross(aheads, forecasters)
122174
),
123175
tar_target(
124176
name = ensemble_res,
125177
command = {
126178
forecast_res %>%
127-
mutate(quantile = round(quantile, digits = 3)) %>%
128-
left_join(geo_forecasters_weights, by = join_by(forecast_date, forecaster, geo_value)) %>%
129-
mutate(value = value * weight) %>%
130-
group_by(forecast_date, geo_value, target_end_date, quantile) %>%
131-
summarize(value = sum(value, na.rm = TRUE), .groups = "drop") %>%
132-
filter(geo_value %nin% geo_exclusions)
179+
ensemble_linear_climate(
180+
aheads,
181+
other_weights = geo_forecasters_weights,
182+
max_climate_ahead_weight = 0.6,
183+
max_climate_quantile_weight = 0.6
184+
) %>%
185+
filter(geo_value %nin% geo_exclusions) %>%
186+
ungroup() %>%
187+
sort_by_quantile()
133188
},
134-
cue = tar_cue(mode = "always")
135189
),
136190
tar_target(
137191
name = ensemble_mixture_res,
@@ -144,7 +198,11 @@ rlang::list2(
144198
max_climate_quantile_weight = 0.6
145199
) %>%
146200
filter(geo_value %nin% geo_exclusions) %>%
147-
ungroup()
201+
ungroup() %>%
202+
bind_rows(forecast_res %>% filter(forecaster == "windowed_seasonal_extra_sources")) %>%
203+
group_by(geo_value, forecast_date, target_end_date, quantile) %>%
204+
summarize(value = mean(value, na.rm = TRUE), .groups = "drop") %>%
205+
sort_by_quantile()
148206
},
149207
),
150208
tar_target(
@@ -168,7 +226,6 @@ rlang::list2(
168226
format_flusight(disease = "covid") %>%
169227
write_submission_file(forecast_reference_date, file.path(submission_directory, "model-output/CMU-TimeSeries"))
170228
},
171-
cue = tar_cue(mode = "always")
172229
),
173230
tar_target(
174231
name = make_climate_submission_csv,
@@ -183,12 +240,11 @@ rlang::list2(
183240
format_flusight(disease = "covid") %>%
184241
write_submission_file(
185242
get_forecast_reference_date(forecast_date_int),
186-
submission_directory = file.path(submission_directory, "model-output/CMU-climatological-baseline"),
187-
file_name = "CMU-climatological-baseline"
243+
submission_directory = file.path(submission_directory, "model-output/CMU-climate_baseline"),
244+
file_name = "CMU-climate_baseline"
188245
)
189246
}
190247
},
191-
cue = tar_cue(mode = "always")
192248
),
193249
tar_target(
194250
name = validate_result,
@@ -205,7 +261,6 @@ rlang::list2(
205261
}
206262
validation
207263
},
208-
cue = tar_cue(mode = "always")
209264
),
210265
tar_target(
211266
name = validate_climate_result,
@@ -215,14 +270,13 @@ rlang::list2(
215270
if (submission_directory != "cache" && submit_climatological) {
216271
validation <- validate_submission(
217272
submission_directory,
218-
file_path = sprintf("CMU-climatological-baseline/%s-CMU-climatological-baseline.csv", get_forecast_reference_date(forecast_date_int))
273+
file_path = sprintf("CMU-climate_baseline/%s-CMU-climate_baseline.csv", get_forecast_reference_date(forecast_date_int))
219274
)
220275
} else {
221276
validation <- "not validating when there is no hub (set submission_directory)"
222277
}
223278
validation
224279
},
225-
cue = tar_cue(mode = "always")
226280
),
227281
tar_target(
228282
name = truth_data,
@@ -261,7 +315,6 @@ rlang::list2(
261315
select(-rel_max_value)
262316
truth_data %>% bind_rows(nssp_renormalized)
263317
},
264-
cue = tar_cue(mode = "always")
265318
),
266319
tar_target(
267320
notebook,
@@ -271,7 +324,7 @@ rlang::list2(
271324
"scripts/reports/forecast_report.Rmd",
272325
output_file = here::here(
273326
"reports",
274-
sprintf("%s_covid_prod_on_%s.html", as.Date(forecast_date_int), as.Date(forecast_generation_date_int))
327+
sprintf("%s_covid_prod_on_%s.html", as.Date(forecast_date_int), as.Date(Sys.Date()))
275328
),
276329
params = list(
277330
disease = "covid",
@@ -281,7 +334,6 @@ rlang::list2(
281334
)
282335
)
283336
},
284-
cue = tar_cue(mode = "always")
285337
)
286338
),
287339
)

0 commit comments

Comments
 (0)