@@ -4,37 +4,112 @@ library(dplyr)
 library(epiprocess)
 library(readr)
 library(purrr)
+library(httr)
+library(jsonlite)
 
 
-# Steps
-# - Run <get_hist.sh> to get the data. Stored in <updates.nosync/> folder
-# - <modify_csv.R> request and construct data
-# - <proc_dat.R> make data into "reported" and "revised" format
-# - reported-revised format data can be found in <intermediate/changes.csv>
+# Look for a GitHub API token.
+# `Sys.getenv` returns an empty string "" if the env variable is not found.
+gh_token <- Sys.getenv("GITHUB_PAT")
+if (gh_token == "") {
+  # Try again with the secondary name.
+  gh_token <- Sys.getenv("GITHUB_TOKEN")
+}
+if (gh_token == "") {
+  warning("Token is not set or could not be fetched from the environment. ",
+          "Proceeding without authentication, but requests may be blocked ",
+          "due to GitHub API rate limits.")
+}
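+# (A token can be set for the session with, e.g., Sys.setenv(GITHUB_PAT = "...").)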
+
+# Construct a header to send with GET requests
+if (gh_token == "") {
+  # Empty header
+  auth_header <- httr::add_headers()
+} else {
+  auth_header <- httr::add_headers(Authorization = paste("Bearer", gh_token))
+}
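+# Note: unauthenticated requests share GitHub's IP-based rate limit (60
+# requests/hour at the time of writing), while authenticated requests get
+# 5,000/hour, so a token is strongly recommended when fetching many pages.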
+
+## Get list of new and modified files to download
+# The `path` parameter filters commits to only those that modify the listed
+# file path.
+BASE_URL <- "https://api.github.com/repos/ccodwg/Covid19Canada/commits?sha=%s&per_page=%s&path=timeseries_prov/cases_timeseries_prov.csv&until=%s&page=%s"
+ITEMS_PER_PAGE <- 100
+BRANCH <- "master"
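+# For illustration, with the constants above and the timestamp built below,
+# the first page request expands to:
+# https://api.github.com/repos/ccodwg/Covid19Canada/commits?sha=master&per_page=100&path=timeseries_prov/cases_timeseries_prov.csv&until=2022-03-13T00:00:00Z&page=1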
+
+# We want to fetch all commits made up to Mar 13 2022 (the version the
+# original dataset was created from); the API's `until` parameter returns
+# only commits before that date.
 #
-# Name of each csv file match with the last date of that
-# csv file only starts from 11-13-2021. Therefore, csv file names shouldn't be
-# used as date index, but rather, the last date of each csv file.
+# The timestamp should be in ISO 8601 format. See
+# https://docs.github.com/en/rest/reference/commits#list-commits--parameters
+# for details.
+until_date <- strftime("2022-03-13", "%Y-%m-%dT%H:%M:%SZ", tz = "UTC")
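+# This produces "2022-03-13T00:00:00Z".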
+
+page <- 0
+commit_pages <- list()
 
+# Fetch the list of commits from the API, one page at a time. Each page
+# contains up to ITEMS_PER_PAGE (100) commits. If a page is full, assume that
+# there are more results and fetch the next page.
+while (page == 0 || nrow(commit_page) == ITEMS_PER_PAGE) {
+  page <- page + 1
+  # Construct the URL
+  commits_url <- sprintf(BASE_URL, BRANCH, ITEMS_PER_PAGE, until_date, page)
+
+  response <- GET(commits_url, auth_header)
+  # Convert any HTTP errors to R errors automatically.
+  stop_for_status(response)
+
+  # Convert results from nested JSON/list to a dataframe. If no results are
+  # returned, `commit_page` will be an empty list, so check for that before
+  # transforming the columns (mutate/select would error on a bare list).
+  commit_page <- content(response, as = "text") %>%
+    fromJSON(simplifyDataFrame = TRUE, flatten = TRUE)
+
+  # No more results are being returned.
+  if (identical(commit_page, list())) {
+    break
+  }
+
+  commit_page <- commit_page %>%
+    # Trim the message down a bit.
+    mutate(message = substr(commit.message, 1, 40)) %>%
+    select(sha, url = commit.url, message)
+
+  commit_pages[[page]] <- commit_page
+}
+
+# Combine all requested pages of commits into one dataframe
+commit_pages <- bind_rows(commit_pages)
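+# `commit_pages` now has one row per matching commit, with columns `sha`,
+# `url`, and `message` (the truncated commit message).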
 
-path_to_csvs <- here::here("data-raw/updates.nosync/")
-files <- list.files(path_to_csvs)
+# Missing value `%s` to be filled in with a commit sha or branch name.
+BASE_DATA_URL <- "https://raw.githubusercontent.com/ccodwg/Covid19Canada/%s/timeseries_prov/cases_timeseries_prov.csv"
 
 fc_time_values <- seq(as.Date("2021-02-01"), as.Date("2021-12-01"),
   by = "1 month")
-ca_as_ofs <- as.Date(substr(files, 1, 10)) %>%
-  intersect(fc_time_values) %>%
-  as.Date(origin = "1970-01-01")
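+# Derive each snapshot's date from the truncated commit message; the repo's
+# messages appear to follow a fixed pattern (e.g. "Update data: 2021-02-01")
+# whose third whitespace-delimited token starts with the date. Rows whose
+# token doesn't parse as a date are dropped by `na.omit()` below.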
+commit_pages <- mutate(
+  commit_pages,
+  data_url = sprintf(BASE_DATA_URL, sha),
+  date = strsplit(message, " ") %>%
+    map_chr(~ substr(.x[3], start = 1, stop = 10)) %>%
+    as.Date()
+) %>%
+  na.omit() %>%
+  filter(date %in% fc_time_values)
 
 # From https://github.com/mountainMath/BCCovidSnippets/blob/main/data/prov_pop.csv
 ca_pop_url <- "https://raw.githubusercontent.com/mountainMath/BCCovidSnippets/main/data/prov_pop.csv"
-ca_pop <- read_csv(ca_pop_url) %>%
+ca_pop <- read_csv(
+  ca_pop_url,
+  col_types = cols(
+    Province = col_character(),
+    shortProvince = col_character(),
+    Population = col_integer()
+  )
+) %>%
   rename(province = Province, abbreviation = shortProvince, population = Population)
 abbrev_map <- setNames(ca_pop$province, ca_pop$abbreviation)
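+# e.g. abbrev_map[["BC"]] should give "British Columbia", assuming the usual
+# two-letter codes in prov_pop.csv.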
 
-can <- purrr::map(ca_as_ofs, function(.x) {
+# Read in data and convert to `epi_df`s.
+cancovid <- purrr::map2(commit_pages$data_url, commit_pages$date, function(url, date) {
   raw <- readr::read_csv(
-    here::here(path_to_csvs, paste0(.x, ".csv")),
+    url,
     col_types = cols(
       province = col_character(),
       date_report = col_character(),
@@ -60,12 +135,13 @@ can <- purrr::map(ca_as_ofs, function(.x) {
     mutate(geo_value = province,
            case_rate = cases / population * 1e5) %>%
     select(geo_value, time_value, case_rate) %>%
-    as_epi_df(geo_type = "province", as_of = .x)
+    as_epi_df(geo_type = "province", as_of = date)
 
   return(result)
 })
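+# At this point `cancovid` is a list of `epi_df` snapshots, one per as_of
+# date; the steps below stack them into a single versioned dataframe.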
-names(can) <- ca_as_ofs
-cancovid <- can %>% bind_rows(.id = "version") %>%
-  mutate(version = lubridate::ymd(version))
+names(cancovid) <- commit_pages$date
+cancovid <- cancovid %>% bind_rows(.id = "version") %>%
+  mutate(version = lubridate::ymd(version)) %>%
+  arrange(version)
 
 usethis::use_data(cancovid, overwrite = TRUE)
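+# `use_data()` saves the object to data/cancovid.rda so it ships as a
+# package dataset.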