Skip to content

Commit 18a4d34

Browse files
committed
move cancovid creation into R only
1 parent 5fa284c commit 18a4d34

File tree

4 files changed

+96
-81
lines changed

4 files changed

+96
-81
lines changed

data-raw/cancovid.R

Lines changed: 96 additions & 20 deletions
Original file line numberDiff line numberDiff line change
@@ -4,37 +4,112 @@ library(dplyr)
44
library(epiprocess)
55
library(readr)
66
library(purrr)
7+
library(httr)
8+
library(jsonlite)
79

810

9-
# Steps
10-
# - Run <get_hist.sh> to get the data. Stored in <updates.nosync/> folder
11-
# - <modify_csv.R> request and construct data
12-
# - <proc_dat.R> make data into "reported" and "revised" format
13-
# - reported-revised format data can be found in <intermediate/changes.csv>
11+
# Look for a GitHub API token.
12+
# Returns an empty string "" if env variable not found.
13+
gh_token <- Sys.getenv("GITHUB_PAT")
14+
if (gh_token == "") {
15+
# Try again with the secondary name.
16+
gh_token <- Sys.getenv("GITHUB_TOKEN")
17+
}
18+
if (gh_token == "") {
19+
warning("Token is not set or is not able to be fetched from the environment.",
20+
" Proceeding without authentication, but the requests may be blocked",
21+
" due to GitHub API rate limits.")
22+
}
23+
24+
# Construct a header to send with GET requests
25+
if (gh_token == "") {
26+
# Empty header
27+
auth_header <- httr::add_headers()
28+
} else {
29+
auth_header <- httr::add_headers(Authorization = paste("Bearer", gh_token))
30+
}
31+
32+
## Get list of new and modified files to download
33+
# The `path` parameter filters commits to only those that modify the listed file
34+
BASE_URL <- "https://api.github.com/repos/ccodwg/Covid19Canada/commits?sha=%s&per_page=%s&path=timeseries_prov/cases_timeseries_prov.csv&until=%s&page=%s"
35+
ITEMS_PER_PAGE <- 100
36+
BRANCH <- "master"
37+
38+
39+
40+
# We want to fetch all commits made up to Mar 13 2022 (the version the original
41+
# dataset was created from).
1442
#
15-
# Name of each csv file match with the last date of that
16-
# csv file only starts from 11-13-2021. Therefore, csv file names shouldn't be
17-
# used as date index, but rather, the last date of each csv file.
43+
# Timestamp should be in ISO 8601 format. See
44+
# https://docs.github.com/en/rest/reference/commits#list-commits--parameters for
45+
# details.
46+
since_date <- strftime("2022-03-13", "%Y-%m-%dT%H:%M:%SZ", tz = "UTC")
47+
48+
page <- 0
49+
commit_pages <- list()
1850

51+
# Fetch list of commits from API, one page at a time. Each page contains up to
52+
# 100 commits. If a page contains 100 commits, assume that there are more
53+
# results and fetch the next page.
54+
while (page == 0 || nrow(commit_page) == 100) {
55+
page <- page + 1
56+
# Construct the URL
57+
commits_url <- sprintf(BASE_URL, BRANCH, ITEMS_PER_PAGE, since_date, page)
58+
59+
request <- GET(commits_url, auth_header)
60+
# Convert any HTTP errors to R errors automatically.
61+
stop_for_status(request)
62+
63+
# Convert results from nested JSON/list to dataframe. If no results returned,
64+
# `commit_page` will be an empty list.
65+
commit_page <- content(request, as = "text") %>%
66+
fromJSON(simplifyDataFrame = TRUE, flatten = TRUE) %>%
67+
# Trim message down a bit.
68+
mutate(message = substr(commit.message, 1, 40)) %>%
69+
select(sha, url = commit.url, message)
70+
71+
# No more results are being returned.
72+
if (identical(commit_page, list())) {
73+
break
74+
}
75+
76+
commit_pages[[page]] <- commit_page
77+
}
78+
79+
# Combine all requested pages of commits into one dataframe
80+
commit_pages <- bind_rows(commit_pages)
1981

20-
path_to_csvs <- here::here("data-raw/updates.nosync/")
21-
files <- list.files(path_to_csvs)
82+
# Missing value `%s` to be filled in with a commit sha or branch name.
83+
BASE_DATA_URL <- "https://raw.githubusercontent.com/ccodwg/Covid19Canada/%s/timeseries_prov/cases_timeseries_prov.csv"
2284

2385
fc_time_values <- seq(as.Date("2021-02-01"), as.Date("2021-12-01"),
2486
by = "1 month")
25-
ca_as_ofs <- as.Date(substr(files, 1, 10)) %>%
26-
intersect(fc_time_values) %>%
27-
as.Date(origin = "1970-01-01")
87+
commit_pages <- mutate(
88+
commit_pages,
89+
data_url = sprintf(BASE_DATA_URL, sha),
90+
date = strsplit(message, " ") %>% map_chr(~ substr(.x[3], start=1, stop=10)) %>% as.Date()
91+
) %>%
92+
# select(data_url, date) %>%
93+
na.omit() %>%
94+
filter(date %in% fc_time_values)
2895

2996
# From https://github.com/mountainMath/BCCovidSnippets/blob/main/data/prov_pop.csv
3097
ca_pop_url <- "https://raw.githubusercontent.com/mountainMath/BCCovidSnippets/main/data/prov_pop.csv"
31-
ca_pop <- read_csv(ca_pop_url) %>%
98+
ca_pop <- read_csv(
99+
ca_pop_url,
100+
col_types = cols(
101+
Province = col_character(),
102+
shortProvince = col_character(),
103+
Population = col_integer()
104+
)
105+
) %>%
32106
rename(province = Province, abbreviation = shortProvince, population = Population)
33107
abbrev_map <- setNames(ca_pop$province, ca_pop$abbreviation)
34108

35-
can <- purrr::map(ca_as_ofs, function(.x) {
109+
# Read in data and convert to `epi_df`s.
110+
cancovid <- purrr::map2(commit_pages$data_url, commit_pages$date, function(url, date) {
36111
raw <- readr::read_csv(
37-
here::here(path_to_csvs, paste0(.x, ".csv")),
112+
url,
38113
col_types = cols(
39114
province = col_character(),
40115
date_report = col_character(),
@@ -60,12 +135,13 @@ can <- purrr::map(ca_as_ofs, function(.x) {
60135
mutate(geo_value = province,
61136
case_rate = cases / population * 1e5) %>%
62137
select(geo_value, time_value, case_rate) %>%
63-
as_epi_df(geo_type = "province", as_of = .x)
138+
as_epi_df(geo_type = "province", as_of = date)
64139

65140
return(result)
66141
})
67-
names(can) <- ca_as_ofs
68-
cancovid <- can %>% bind_rows(.id = "version") %>%
69-
mutate(version = lubridate::ymd(version))
142+
names(cancovid) <- commit_pages$date
143+
cancovid <- cancovid %>% bind_rows(.id = "version") %>%
144+
mutate(version = lubridate::ymd(version)) %>%
145+
arrange(version)
70146

71147
usethis::use_data(cancovid, overwrite = TRUE)

data-raw/get_hist.sh

Lines changed: 0 additions & 24 deletions
This file was deleted.

data-raw/modify_csv.R

Lines changed: 0 additions & 37 deletions
This file was deleted.

data/cancovid.rda

461 Bytes
Binary file not shown.

0 commit comments

Comments
 (0)