Skip to content

Commit 5890c7f

Browse files
authored
Merge pull request #271 from cmu-delphi/r-csv-api
Use CSV format for API access in R package
2 parents 9bd976b + e12eb5f commit 5890c7f

File tree

16 files changed

+112
-176
lines changed

16 files changed

+112
-176
lines changed

R-packages/covidcast/NEWS.md

Lines changed: 9 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -8,6 +8,15 @@ Released TODO DATE.
88
filter data frames with multiple issues of each observation, obtaining only
99
the latest or earliest issue of each.
1010

11+
- `covidcast_signal()` now batches requests, so that many days of data can be
12+
fetched in one API call. This dramatically improves the speed of fetching
13+
state-, MSA-, and HRR-level data, since many days of data can be fetched in
14+
one API call. County-level signals, such as cases and deaths, may still
15+
require one API call per day, since the API's row limit is only slightly
16+
larger than the number of counties in the United States.
17+
18+
- `covidcast_signal()` now fetches data from the API server in CSV format,
19+
rather than JSON, which requires less bandwidth and parsing.
1120

1221
# covidcast 0.3.1
1322

R-packages/covidcast/R/covidcast.R

Lines changed: 53 additions & 38 deletions
Original file line numberDiff line numberDiff line change
@@ -429,15 +429,15 @@ covidcast_signals <- function(data_source, signal,
429429
#'
430430
#' @export
431431
covidcast_meta <- function() {
432-
meta <- .request(list(source='covidcast_meta', cached="true"))
432+
meta <- .request(
433+
list(source = "covidcast_meta",
434+
format = "csv"))
433435

434-
if (meta$message != "success") {
435-
abort(paste0("Failed to obtain metadata: ", meta$message, "."),
436-
err_msg = meta$message,
437-
class = "covidcast_meta_fetch_failed")
436+
if (nchar(meta) == 0) {
437+
abort("Failed to obtain metadata", class = "covidcast_meta_fetch_failed")
438438
}
439439

440-
meta <- meta$epidata %>%
440+
meta <- read.csv(textConnection(meta), stringsAsFactors = FALSE) %>%
441441
dplyr::mutate(min_time = api_to_date(.data$min_time),
442442
max_time = api_to_date(.data$max_time),
443443
max_issue = api_to_date(.data$max_issue))
@@ -560,14 +560,14 @@ covidcast_days <- function(data_source, signal, start_day, end_day, geo_type,
560560

561561
# The API limits the number of rows that can be returned at once, so we query
562562
# in batches.
563-
for (i in seq(1, num_batches)) {
563+
for (i in seq_len(num_batches)) {
564564
start_offset <- (i - 1) * max_days_at_time
565565
end_offset <- min(i * max_days_at_time, ndays) - 1
566566
query_start_day <- start_day + start_offset
567567
query_end_day <- start_day + end_offset
568568

569569
time_values <- date_to_string(days[(start_offset + 1):(end_offset + 1)])
570-
dat[[i]] <- covidcast(data_source = data_source,
570+
response <- covidcast(data_source = data_source,
571571
signal = signal,
572572
time_type = "day",
573573
geo_type = geo_type,
@@ -576,22 +576,37 @@ covidcast_days <- function(data_source, signal, start_day, end_day, geo_type,
576576
as_of = as_of,
577577
issues = issues,
578578
lag = lag)
579+
580+
if (is.null(response)) {
581+
warn(paste0("Fetching ", signal, " from ", data_source, " for ",
582+
query_start_day, " to ", query_end_day,
583+
" in geography '", geo_value, "': no results"),
584+
data_source = data_source,
585+
signal = signal,
586+
start_day = query_start_day,
587+
end_day = query_end_day,
588+
geo_value = geo_value,
589+
class = "covidcast_fetch_failed")
590+
591+
next
592+
}
593+
594+
dat[[i]] <- response
595+
579596
summary <- sprintf(
580-
"Fetched day %s to %s: %s, %s, num_entries = %s",
597+
"Fetched day %s to %s: num_entries = %s",
581598
query_start_day,
582599
query_end_day,
583-
dat[[i]]$result,
584-
dat[[i]]$message,
585-
nrow(dat[[i]]$epidata)
586-
)
600+
nrow(response))
601+
587602
if (length(summary) != 0) {
588603
message(summary)
589604
}
590-
if (dat[[i]]$message == "success") {
605+
606+
if (nrow(response) > 0) {
591607
desired_geos <- tolower(unique(geo_value))
592608

593-
returned_epidata <- dat[[i]]$epidata
594-
returned_geo_array <- returned_epidata %>%
609+
returned_geo_array <- response %>%
595610
dplyr::select(geo_value, time_value) %>%
596611
dplyr::group_by(time_value) %>%
597612
dplyr::summarize(geo_value = list(geo_value))
@@ -607,10 +622,10 @@ covidcast_days <- function(data_source, signal, start_day, end_day, geo_type,
607622
signal = signal,
608623
day = missing_dates,
609624
geo_value = geo_value,
610-
api_msg = dat[[i]]$message,
611-
class = "covidcast_missing_geo_values"
625+
class = "covidcast_missing_time_values"
612626
)
613627
}
628+
614629
if (!identical("*", geo_value)) {
615630
missing_geo_array <- returned_geo_array[
616631
lapply(returned_geo_array$geo_value, length) < length(desired_geos), ]
@@ -626,26 +641,13 @@ covidcast_days <- function(data_source, signal, start_day, end_day, geo_type,
626641
signal = signal,
627642
day = api_to_date(missing_geo_array$time_value),
628643
geo_value = geo_value,
629-
api_msg = dat[[i]]$message,
630644
class = "covidcast_missing_geo_values")
631645
}
632646
}
633-
} else {
634-
warn(paste0("Fetching ", signal, " from ", data_source, " for ",
635-
query_start_day, " to ", query_end_day, " in geography '",
636-
geo_value, "': ", dat[[i]]$message),
637-
data_source = data_source,
638-
signal = signal,
639-
start_day = query_start_day,
640-
end_day = query_end_day,
641-
geo_value = geo_value,
642-
api_msg = dat[[i]]$message,
643-
class = "covidcast_fetch_failed")
644647
}
645648
}
646649

647650
df <- dat %>%
648-
purrr::map("epidata") %>% # just want $epidata part
649651
purrr::map(purrr::compact) %>% # remove the list elements that are NULL
650652
dplyr::bind_rows() # make this into a data frame
651653

@@ -681,22 +683,24 @@ geo_warning_message <- function(row, desired_geos) {
681683
covidcast <- function(data_source, signal, time_type, geo_type, time_values,
682684
geo_value, as_of, issues, lag) {
683685
# Check parameters
684-
if(missing(data_source) || missing(signal) || missing(time_type) ||
686+
if (missing(data_source) || missing(signal) || missing(time_type) ||
685687
missing(geo_type) || missing(time_values) || missing(geo_value)) {
686688
stop("`data_source`, `signal`, `time_type`, `geo_type`, `time_values`, ",
687689
"and `geo_value` are all required.")
688690
}
689691

690692
# Set up request
691693
params <- list(
692-
source = 'covidcast',
694+
source = "covidcast",
693695
data_source = data_source,
694696
signal = signal,
695697
time_type = time_type,
696698
geo_type = geo_type,
697699
time_values = .list(time_values),
698-
geo_value = geo_value
700+
geo_value = geo_value,
701+
format = "csv"
699702
)
703+
700704
if (length(params$geo_value) > 1) {
701705
params$geo_values <- paste0(params$geo_value, collapse = ",") #convert to string
702706
params$geo_value <- NULL
@@ -721,8 +725,19 @@ covidcast <- function(data_source, signal, time_type, geo_type, time_values,
721725
params$lag <- lag
722726
}
723727

724-
# Make the API call
725-
return(.request(params))
728+
# Make the API call. If the API returns a non-200 status code, indicating e.g.
729+
# a database error, .request() raises an error. It returns an empty string if
730+
# there are no results for our query.
731+
response <- .request(params)
732+
if (nchar(response) == 0) {
733+
# empty if no results
734+
return(NULL)
735+
}
736+
737+
# geo_value must be read as character so FIPS codes are returned as character,
738+
# not numbers (with leading 0s potentially removed)
739+
return(read.csv(textConnection(response), stringsAsFactors = FALSE,
740+
colClasses = c("geo_value" = "character")))
726741
}
727742

728743
# Helper function to cast values and/or ranges to strings
@@ -751,8 +766,8 @@ covidcast <- function(data_source, signal, time_type, geo_type, time_values,
751766

752767
httr::stop_for_status(response, task = "fetch data from API")
753768

754-
return(jsonlite::fromJSON(httr::content(response, as = "text",
755-
encoding = "utf-8")))
769+
return(httr::content(response, as = "text",
770+
encoding = "utf-8"))
756771
}
757772

758773
# This is the date format expected by the API
Lines changed: 2 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,2 @@
1+
geo_value,signal,time_value,issue,lag,value,stderr,sample_size
2+
01000,bar-not-found,20200101,20200102,1,1.0,0.1,2.0
Lines changed: 3 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,3 @@
1+
signal,geo_value,value,time_value,issue,lag,sample_size,stderr
2+
bar,pa,1,20200101,20200101,0,1,1
3+
bar,tx,1,20200101,20200101,0,1,1

R-packages/covidcast/tests/testthat/api.covidcast.cmu.edu/epidata/api.php-64a69c.json

Lines changed: 0 additions & 26 deletions
This file was deleted.

R-packages/covidcast/tests/testthat/api.covidcast.cmu.edu/epidata/api.php-6a5814.json

Lines changed: 0 additions & 26 deletions
This file was deleted.

R-packages/covidcast/tests/testthat/api.covidcast.cmu.edu/epidata/api.php-96f6a5.json

Lines changed: 0 additions & 4 deletions
This file was deleted.

R-packages/covidcast/tests/testthat/api.covidcast.cmu.edu/epidata/api.php-b6e478.csv

Whitespace-only changes.

R-packages/covidcast/tests/testthat/api.covidcast.cmu.edu/epidata/api.php-cb89ad.json

Lines changed: 0 additions & 17 deletions
This file was deleted.

R-packages/covidcast/tests/testthat/api.covidcast.cmu.edu/epidata/api.php-d2e163.json

Lines changed: 0 additions & 30 deletions
This file was deleted.

0 commit comments

Comments
 (0)