Skip to content

Commit 070e8bd

Browse files
dsweber2 and dshemetov authored
various backtesting utils (#171)
* various backtesting utils
* geo_exclusions updates
* no regions, allow multi-subs, [2025-02-26] forecasts

---------

Co-authored-by: Dmitry Shemetov <dshemetov@ucdavis.edu>
1 parent 8ab8e89 commit 070e8bd

File tree

8 files changed

+419
-156
lines changed

8 files changed

+419
-156
lines changed

R/aux_data_utils.R

Lines changed: 1 addition & 0 deletions
Original file line number | Diff line number | Diff line change
@@ -689,6 +689,7 @@ delete_files_from_s3 <- function(keys, bucket, batch_size = 500, .progress = TRU
689689
# Load the NHSN data archive for a single disease.
#
# Reads "nhsn_data_archive.parquet" from the "forecasting-team-data" S3 bucket,
# keeps only the rows whose `disease` column equals `disease_name`, drops rows
# whose `geo_value` matches "region", removes the `version_timestamp` and
# `disease` columns, and converts the result with
# `as_epi_archive(compactify = TRUE)`.
get_nhsn_data_archive <- function(disease_name) {
  raw_archive <- aws.s3::s3read_using(
    nanoparquet::read_parquet,
    object = "nhsn_data_archive.parquet",
    bucket = "forecasting-team-data"
  )
  disease_rows <- filter(raw_archive, disease == disease_name)
  # Exclude region-level rows; grepl() returns FALSE for NA, so rows with a
  # missing geo_value are kept here.
  non_regional <- filter(disease_rows, !grepl("region.*", geo_value))
  trimmed <- select(non_regional, -version_timestamp, -disease)
  as_epi_archive(trimmed, compactify = TRUE)
}

R/targets/score_targets.R

Lines changed: 9 additions & 1 deletion
Original file line number | Diff line number | Diff line change
@@ -22,11 +22,19 @@ score_forecasts <- function(nhsn_latest_data, joined_forecasts_and_ensembles) {
2222
drop_na() %>%
2323
rename(location = state_code) %>%
2424
select(-geo_value)
25+
# limit the forecasts to the same set of forecasting times
26+
max_forecast_date <-
27+
joined_forecasts_and_ensembles %>%
28+
group_by(forecaster) %>%
29+
summarize(max_forecast = max(forecast_date)) %>%
30+
pull(max_forecast) %>%
31+
min()
2532
forecasts_formatted <-
2633
joined_forecasts_and_ensembles %>%
34+
filter(forecast_date <= max_forecast_date) %>%
2735
format_scoring_utils(disease = "covid")
2836
scores <- forecasts_formatted %>%
29-
filter(location != "US") %>%
37+
filter(location %nin% c("US", "60", "66", "78")) %>%
3038
hubEvals::score_model_out(
3139
truth_data,
3240
metrics = c("wis", "ae_median", "interval_coverage_50", "interval_coverage_90"),

R/utils.R

Lines changed: 9 additions & 6 deletions
Original file line number | Diff line number | Diff line change
@@ -163,13 +163,10 @@ data_substitutions <- function(dataset, substitutions_path, forecast_generation_
163163
) %>%
164164
filter(forecast_date == forecast_generation_date) %>%
165165
select(-forecast_date) %>%
166-
rename(new_value = value) %>%
167-
select(-time_value)
166+
rename(new_value = value)
168167
# Replace the most recent values in the appropriate keys with the substitutions
169168
new_values <- dataset %>%
170-
group_by(geo_value) %>%
171-
slice_max(time_value) %>%
172-
inner_join(substitutions, by = "geo_value") %>%
169+
inner_join(substitutions, by = join_by(geo_value, time_value)) %>%
173170
mutate(value = ifelse(!is.na(new_value), new_value, value)) %>%
174171
select(-new_value)
175172
# Remove keys from dataset that have been substituted
@@ -383,8 +380,14 @@ update_site <- function(sync_to_s3 = TRUE) {
383380
slice_max(generation_date)
384381
# iterating over the diseases
385382
for (row_num in seq_along(used_files$filename)) {
383+
file_name <- path_file(used_files$filename[[row_num]])
386384
scoring_index <- which(grepl("### Scoring this season", report_md_content)) + 1
387-
score_link <- sprintf("- [%s Scoring, Rendered %s](%s)", str_to_title(used_files$disease[[row_num]]), used_files$generation_date[[row_num]], used_files$filename[[row_num]])
385+
score_link <- sprintf(
386+
"- [%s Scoring, Rendered %s](%s)",
387+
str_to_title(used_files$disease[[row_num]]),
388+
used_files$generation_date[[row_num]],
389+
file_name
390+
)
388391
report_md_content <- append(report_md_content, score_link, after = scoring_index)
389392
}
390393
}

0 commit comments

Comments
 (0)