
Commit ba0ceff

feat: add utils to manage reports

1 parent 2dad893

File tree

1 file changed: +84 -5 lines changed


R/utils.R

Lines changed: 84 additions & 5 deletions
@@ -275,7 +275,14 @@ get_forecast_reference_date <- function(date) {
   MMWRweek::MMWRweek2Date(lubridate::epiyear(date), lubridate::epiweek(date)) + 6
 }

-update_site <- function() {
+#' Update the site with the latest reports.
+#'
+#' Looks at the `reports/` directory and updates `template.md` with new reports
+#' that follow a naming convention. This is translated into `report.md`, which is
+#' then converted to `index.html` with pandoc.
+#'
+#' @param sync_to_s3 Whether to sync the reports to the S3 bucket.
+update_site <- function(sync_to_s3 = TRUE) {
   library(fs)
   library(stringr)
   # Define the directories
@@ -288,7 +295,9 @@ update_site <- function() {
   }

   # Sync the reports directory with the S3 bucket
-  aws.s3::s3sync(path = reports_dir, bucket = "forecasting-team-data", prefix = "reports-2024/", verbose = FALSE)
+  if (sync_to_s3) {
+    aws.s3::s3sync(path = reports_dir, bucket = "forecasting-team-data", prefix = "reports-2024/", verbose = FALSE)
+  }

   # Read the template file
   if (!file_exists(template_path)) {
@@ -313,8 +322,7 @@ update_site <- function() {
   # forecast date
   used_reports <- report_table %>%
     group_by(forecast_date, disease) %>%
-    arrange(generation_date) %>%
-    filter(generation_date == max(generation_date)) %>%
+    slice_max(generation_date) %>%
     ungroup() %>%
     arrange(forecast_date)

@@ -324,8 +332,9 @@ update_site <- function() {
     file_parts <- str_split(fs::path_ext_remove(file_name), "_", simplify = TRUE)
     date <- file_parts[1]
     disease <- file_parts[2]
+    generation_date <- file_parts[5]

-    report_link <- sprintf("- [%s Forecasts %s](%s)", str_to_title(disease), date, file_name)
+    report_link <- sprintf("- [%s Forecasts %s, Rendered %s](%s)", str_to_title(disease), date, generation_date, file_name)

     # Insert into Production Reports section, skipping a line
     prod_reports_index <- which(grepl("## Production Reports", report_md_content)) + 1
@@ -340,6 +349,76 @@ update_site <- function() {
   system("pandoc reports/report.md -s -o reports/index.html --css=reports/style.css --mathjax='https://cdn.jsdelivr.net/npm/mathjax@3/es5/tex-mml-chtml.js' --metadata pagetitle='Delphi Reports'")
 }

+#' Delete unused reports from the S3 bucket.
+#'
+#' @param dry_run If `TRUE`, only list the files that would be deleted; if `FALSE`, delete them.
+delete_extra_s3_files <- function(dry_run = TRUE) {
+  local_path <- "reports"
+  bucket <- "forecasting-team-data"
+  prefix <- "reports-2024/"
+  # Get list of local files (relative paths)
+  local_files <- list.files(local_path, recursive = TRUE)
+
+  # Get list of S3 files
+  s3_objects <- aws.s3::get_bucket(bucket, prefix = prefix)
+  s3_files <- sapply(s3_objects, function(x) x$Key)
+
+  # Find files that exist in S3 but not locally
+  # Remove prefix from s3_files for comparison
+  s3_files_clean <- gsub(prefix, "", s3_files)
+  files_to_delete <- s3_files[!(s3_files_clean %in% local_files)]
+
+  if (dry_run) {
+    message("Would delete ", length(files_to_delete), " files from S3")
+    message("Files: ", paste(files_to_delete, collapse = ", "))
+    return(invisible(files_to_delete))
+  }
+
+  # Delete each extra file
+  if (length(files_to_delete) > 0) {
+    message("Deleting ", length(files_to_delete), " files from S3")
+    for (file in files_to_delete) {
+      message("Deleting: ", file)
+      aws.s3::delete_object(file, bucket)
+    }
+  } else {
+    message("No files to delete")
+  }
+}
+
+#' Find report files that are not referenced in index.html.
+find_unused_report_files <- function() {
+  library(rvest)
+  library(fs)
+  library(stringr)
+
+  # Read all files in the reports directory
+  all_files <- dir_ls("reports", recurse = TRUE) %>%
+    path_file() # just get filenames, not full paths
+
+  # Read index.html and extract all href links
+  index_html <- read_html("reports/index.html")
+  used_files <- index_html %>%
+    html_elements("a") %>%
+    html_attr("href") %>%
+    # Add known required files like the CSS
+    c("style.css", "template.md", "report.md", "index.html", .) %>%
+    # Remove external links like "https://..." from the list
+    keep(~ !grepl("^https?://", .))
+
+  # Find files that exist but aren't referenced
+  unused_files <- setdiff(all_files, used_files)
+
+  if (length(unused_files) > 0) {
+    cat("The following files in 'reports' are not referenced in index.html:\n")
+    cat(paste("-", unused_files), sep = "\n")
+  } else {
+    cat("All files in 'reports' are referenced in index.html\n")
+  }
+
+  return(invisible(unused_files))
+}
+
 #' Ensure that forecast values are monotonically increasing
 #' in quantile order.
 sort_by_quantile <- function(forecasts) {
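
For reviewers, a minimal usage sketch of the updated entry point. This is an illustration, not part of the commit; it assumes `R/utils.R` has been sourced, the rendered reports already live in `reports/` under the expected naming convention, and AWS credentials for `aws.s3` are configured.

# Hypothetical local run: rebuild reports/report.md and reports/index.html
# without syncing to S3, e.g. to preview the index locally.
update_site(sync_to_s3 = FALSE)

# Full run: also sync reports/ to s3://forecasting-team-data/reports-2024/.
update_site()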
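The two cleanup helpers pair naturally after a site rebuild. A hedged sketch of that workflow, under the same assumptions as above:

# Hypothetical cleanup pass after regenerating the site.
# 1. List local files under reports/ that index.html no longer links to.
unused <- find_unused_report_files()

# 2. Preview which S3 objects have no local counterpart, then delete them.
delete_extra_s3_files(dry_run = TRUE)
delete_extra_s3_files(dry_run = FALSE)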
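One note on the `slice_max()` simplification inside `update_site()`: with its defaults (`n = 1`, `with_ties = TRUE`), `slice_max(generation_date)` keeps every row tied for the latest generation date in each group, matching the old `arrange()` plus `filter(generation_date == max(generation_date))` pipeline. A small illustration with made-up data:

library(dplyr)

# Hypothetical report_table with two generations of the flu report.
report_table <- tibble::tibble(
  forecast_date   = as.Date("2024-11-20"),
  disease         = c("flu", "flu", "covid"),
  generation_date = as.Date(c("2024-11-21", "2024-11-23", "2024-11-22"))
)

report_table %>%
  group_by(forecast_date, disease) %>%
  slice_max(generation_date) %>%  # latest generation per forecast_date/disease
  ungroup() %>%
  arrange(forecast_date)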
