@@ -4,37 +4,112 @@ library(dplyr)
 library(epiprocess)
 library(readr)
 library(purrr)
+library(httr)
+library(jsonlite)
 
 
-# Steps
-# - Run <get_hist.sh> to get the data. Stored in <updates.nosync/> folder
-# - <modify_csv.R> request and construct data
-# - <proc_dat.R> make data into "reported" and "revised" format
-# - reported-revised format data can be found in <intermediate/changes.csv>
+# Look for a GitHub API token.
+# `Sys.getenv` returns an empty string "" if the env variable is not found.
+gh_token <- Sys.getenv("GITHUB_PAT")
+if (gh_token == "") {
+  # Try again with the secondary name.
+  gh_token <- Sys.getenv("GITHUB_TOKEN")
+}
+if (gh_token == "") {
+  warning("Token is not set or could not be fetched from the environment. ",
+          "Proceeding without authentication, but requests may be blocked ",
+          "due to GitHub API rate limits.")
+}
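+# (A token can be set for the session with, e.g., Sys.setenv(GITHUB_PAT = "...").)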
+
+# Construct a header to send with GET requests
+if (gh_token == "") {
+  # Empty header
+  auth_header <- httr::add_headers()
+} else {
+  auth_header <- httr::add_headers(Authorization = paste("Bearer", gh_token))
+}
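+# Note: unauthenticated requests share GitHub's IP-based rate limit (60
+# requests/hour at the time of writing), while authenticated requests get
+# 5,000/hour, so a token is strongly recommended when fetching many pages.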
+
+## Get list of new and modified files to download
+# The `path` parameter filters commits to only those that modify the listed
+# file path.
+BASE_URL <- "https://api.github.com/repos/ccodwg/Covid19Canada/commits?sha=%s&per_page=%s&path=timeseries_prov/cases_timeseries_prov.csv&until=%s&page=%s"
+ITEMS_PER_PAGE <- 100
+BRANCH <- "master"
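+# For illustration, with the constants above and the timestamp built below,
+# the first page request expands to:
+# https://api.github.com/repos/ccodwg/Covid19Canada/commits?sha=master&per_page=100&path=timeseries_prov/cases_timeseries_prov.csv&until=2022-03-13T00:00:00Z&page=1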
+
+# We want to fetch all commits made up to Mar 13 2022 (the version the
+# original dataset was created from); the API's `until` parameter returns
+# only commits before that date.
 #
-# Name of each csv file match with the last date of that
-# csv file only starts from 11-13-2021. Therefore, csv file names shouldn't be
-# used as date index, but rather, the last date of each csv file.
+# The timestamp should be in ISO 8601 format. See
+# https://docs.github.com/en/rest/reference/commits#list-commits--parameters
+# for details.
+until_date <- strftime("2022-03-13", "%Y-%m-%dT%H:%M:%SZ", tz = "UTC")
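+# This produces "2022-03-13T00:00:00Z".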
+
+page <- 0
+commit_pages <- list()
 
+# Fetch the list of commits from the API, one page at a time. Each page
+# contains up to ITEMS_PER_PAGE (100) commits. If a page is full, assume that
+# there are more results and fetch the next page.
+while (page == 0 || nrow(commit_page) == ITEMS_PER_PAGE) {
+  page <- page + 1
+  # Construct the URL
+  commits_url <- sprintf(BASE_URL, BRANCH, ITEMS_PER_PAGE, until_date, page)
+
+  response <- GET(commits_url, auth_header)
+  # Convert any HTTP errors to R errors automatically.
+  stop_for_status(response)
+
+  # Convert results from nested JSON/list to a dataframe. If no results are
+  # returned, `commit_page` will be an empty list, so check for that before
+  # transforming the columns (mutate/select would error on a bare list).
+  commit_page <- content(response, as = "text") %>%
+    fromJSON(simplifyDataFrame = TRUE, flatten = TRUE)
+
+  # No more results are being returned.
+  if (identical(commit_page, list())) {
+    break
+  }
+
+  commit_page <- commit_page %>%
+    # Trim the message down a bit.
+    mutate(message = substr(commit.message, 1, 40)) %>%
+    select(sha, url = commit.url, message)
+
+  commit_pages[[page]] <- commit_page
+}
+
+# Combine all requested pages of commits into one dataframe
+commit_pages <- bind_rows(commit_pages)
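+# `commit_pages` now has one row per matching commit, with columns `sha`,
+# `url`, and `message` (the truncated commit message).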
 
-path_to_csvs <- here::here("data-raw/updates.nosync/")
-files <- list.files(path_to_csvs)
+# Missing value `%s` to be filled in with a commit sha or branch name.
+BASE_DATA_URL <- "https://raw.githubusercontent.com/ccodwg/Covid19Canada/%s/timeseries_prov/cases_timeseries_prov.csv"
 
 fc_time_values <- seq(as.Date("2021-02-01"), as.Date("2021-12-01"),
   by = "1 month")
-ca_as_ofs <- as.Date(substr(files, 1, 10)) %>%
-  intersect(fc_time_values) %>%
-  as.Date(origin = "1970-01-01")
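+# Derive each snapshot's date from the truncated commit message; the repo's
+# messages appear to follow a fixed pattern (e.g. "Update data: 2021-02-01")
+# whose third whitespace-delimited token starts with the date. Rows whose
+# token doesn't parse as a date are dropped by `na.omit()` below.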
+commit_pages <- mutate(
+  commit_pages,
+  data_url = sprintf(BASE_DATA_URL, sha),
+  date = strsplit(message, " ") %>%
+    map_chr(~ substr(.x[3], start = 1, stop = 10)) %>%
+    as.Date()
+) %>%
+  na.omit() %>%
+  filter(date %in% fc_time_values)
 
 # From https://github.com/mountainMath/BCCovidSnippets/blob/main/data/prov_pop.csv
 ca_pop_url <- "https://raw.githubusercontent.com/mountainMath/BCCovidSnippets/main/data/prov_pop.csv"
-ca_pop <- read_csv(ca_pop_url) %>%
+ca_pop <- read_csv(
+  ca_pop_url,
+  col_types = cols(
+    Province = col_character(),
+    shortProvince = col_character(),
+    Population = col_integer()
+  )
+) %>%
   rename(province = Province, abbreviation = shortProvince, population = Population)
 abbrev_map <- setNames(ca_pop$province, ca_pop$abbreviation)
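+# e.g. abbrev_map[["BC"]] should give "British Columbia", assuming the usual
+# two-letter codes in prov_pop.csv.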
 
-can <- purrr::map(ca_as_ofs, function(.x) {
+# Read in data and convert to `epi_df`s.
+cancovid <- purrr::map2(commit_pages$data_url, commit_pages$date, function(url, date) {
   raw <- readr::read_csv(
-    here::here(path_to_csvs, paste0(.x, ".csv")),
+    url,
     col_types = cols(
       province = col_character(),
       date_report = col_character(),
@@ -60,12 +135,13 @@ can <- purrr::map(ca_as_ofs, function(.x) {
     mutate(geo_value = province,
            case_rate = cases / population * 1e5) %>%
     select(geo_value, time_value, case_rate) %>%
-    as_epi_df(geo_type = "province", as_of = .x)
+    as_epi_df(geo_type = "province", as_of = date)
 
   return(result)
 })
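+# At this point `cancovid` is a list of `epi_df` snapshots, one per as_of
+# date; the steps below stack them into a single versioned dataframe.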
-names(can) <- ca_as_ofs
-cancovid <- can %>% bind_rows(.id = "version") %>%
-  mutate(version = lubridate::ymd(version))
+names(cancovid) <- commit_pages$date
+cancovid <- cancovid %>% bind_rows(.id = "version") %>%
+  mutate(version = lubridate::ymd(version)) %>%
+  arrange(version)
 
 usethis::use_data(cancovid, overwrite = TRUE)
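+# `use_data()` saves the object to data/cancovid.rda so it ships as a
+# package dataset.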