diff --git a/R-packages/covidcast/NEWS.md b/R-packages/covidcast/NEWS.md index 76781ae7..7fcf31ad 100644 --- a/R-packages/covidcast/NEWS.md +++ b/R-packages/covidcast/NEWS.md @@ -8,6 +8,15 @@ Released TODO DATE. filter data frames with multiple issues of each observation, obtaining only the latest or earliest issue of each. +- `covidcast_signal()` now batches requests, so that many days of + data can be fetched in one API call. This dramatically improves the + speed of fetching state-, MSA-, and HRR-level data. County-level + signals, such as cases and deaths, may still require one API call + per day, since the API's row limit is only slightly larger than the + number of counties in the United States. + +- `covidcast_signal()` now fetches data from the API server in CSV format, + rather than JSON, which requires less bandwidth and parsing. # covidcast 0.3.1 diff --git a/R-packages/covidcast/R/covidcast.R b/R-packages/covidcast/R/covidcast.R index 79817c29..ebdc1e4d 100644 --- a/R-packages/covidcast/R/covidcast.R +++ b/R-packages/covidcast/R/covidcast.R @@ -429,15 +429,15 @@ covidcast_signals <- function(data_source, signal, #' #' @export covidcast_meta <- function() { - meta <- .request(list(source='covidcast_meta', cached="true")) + meta <- .request( + list(source = "covidcast_meta", + format = "csv")) - if (meta$message != "success") { - abort(paste0("Failed to obtain metadata: ", meta$message, "."), - err_msg = meta$message, - class = "covidcast_meta_fetch_failed") + if (nchar(meta) == 0) { + abort("Failed to obtain metadata", class = "covidcast_meta_fetch_failed") } - meta <- meta$epidata %>% + meta <- read.csv(textConnection(meta), stringsAsFactors = FALSE) %>% dplyr::mutate(min_time = api_to_date(.data$min_time), max_time = api_to_date(.data$max_time), max_issue = api_to_date(.data$max_issue)) @@ -560,14 +560,14 @@ covidcast_days <- function(data_source, signal, start_day, end_day, geo_type, # The API 
limits the number of rows that can be returned at once, so we query # in batches. - for (i in seq(1, num_batches)) { + for (i in seq_len(num_batches)) { start_offset <- (i - 1) * max_days_at_time end_offset <- min(i * max_days_at_time, ndays) - 1 query_start_day <- start_day + start_offset query_end_day <- start_day + end_offset time_values <- date_to_string(days[(start_offset + 1):(end_offset + 1)]) - dat[[i]] <- covidcast(data_source = data_source, + response <- covidcast(data_source = data_source, signal = signal, time_type = "day", geo_type = geo_type, @@ -576,22 +576,37 @@ covidcast_days <- function(data_source, signal, start_day, end_day, geo_type, as_of = as_of, issues = issues, lag = lag) + + if (is.null(response)) { + warn(paste0("Fetching ", signal, " from ", data_source, " for ", + query_start_day, " to ", query_end_day, + " in geography '", geo_value, "': no results"), + data_source = data_source, + signal = signal, + start_day = query_start_day, + end_day = query_end_day, + geo_value = geo_value, + class = "covidcast_fetch_failed") + + next + } + + dat[[i]] <- response + summary <- sprintf( - "Fetched day %s to %s: %s, %s, num_entries = %s", + "Fetched day %s to %s: num_entries = %s", query_start_day, query_end_day, - dat[[i]]$result, - dat[[i]]$message, - nrow(dat[[i]]$epidata) - ) + nrow(response)) + if (length(summary) != 0) { message(summary) } - if (dat[[i]]$message == "success") { + + if (nrow(response) > 0) { desired_geos <- tolower(unique(geo_value)) - returned_epidata <- dat[[i]]$epidata - returned_geo_array <- returned_epidata %>% + returned_geo_array <- response %>% dplyr::select(geo_value, time_value) %>% dplyr::group_by(time_value) %>% dplyr::summarize(geo_value = list(geo_value)) @@ -607,10 +622,10 @@ covidcast_days <- function(data_source, signal, start_day, end_day, geo_type, signal = signal, day = missing_dates, geo_value = geo_value, - api_msg = dat[[i]]$message, - class = "covidcast_missing_geo_values" + class = 
"covidcast_missing_time_values" ) } + if (!identical("*", geo_value)) { missing_geo_array <- returned_geo_array[ lapply(returned_geo_array$geo_value, length) < length(desired_geos), ] @@ -626,26 +641,13 @@ covidcast_days <- function(data_source, signal, start_day, end_day, geo_type, signal = signal, day = api_to_date(missing_geo_array$time_value), geo_value = geo_value, - api_msg = dat[[i]]$message, class = "covidcast_missing_geo_values") } } - } else { - warn(paste0("Fetching ", signal, " from ", data_source, " for ", - query_start_day, " to ", query_end_day, " in geography '", - geo_value, "': ", dat[[i]]$message), - data_source = data_source, - signal = signal, - start_day = query_start_day, - end_day = query_end_day, - geo_value = geo_value, - api_msg = dat[[i]]$message, - class = "covidcast_fetch_failed") } } df <- dat %>% - purrr::map("epidata") %>% # just want $epidata part purrr::map(purrr::compact) %>% # remove the list elements that are NULL dplyr::bind_rows() # make this into a data frame @@ -681,7 +683,7 @@ geo_warning_message <- function(row, desired_geos) { covidcast <- function(data_source, signal, time_type, geo_type, time_values, geo_value, as_of, issues, lag) { # Check parameters - if(missing(data_source) || missing(signal) || missing(time_type) || + if (missing(data_source) || missing(signal) || missing(time_type) || missing(geo_type) || missing(time_values) || missing(geo_value)) { stop("`data_source`, `signal`, `time_type`, `geo_type`, `time_values`, ", "and `geo_value` are all required.") @@ -689,14 +691,16 @@ covidcast <- function(data_source, signal, time_type, geo_type, time_values, # Set up request params <- list( - source = 'covidcast', + source = "covidcast", data_source = data_source, signal = signal, time_type = time_type, geo_type = geo_type, time_values = .list(time_values), - geo_value = geo_value + geo_value = geo_value, + format = "csv" ) + if (length(params$geo_value) > 1) { params$geo_values <- paste0(params$geo_value, collapse 
= ",") #convert to string params$geo_value <- NULL @@ -721,8 +725,19 @@ covidcast <- function(data_source, signal, time_type, geo_type, time_values, params$lag <- lag } - # Make the API call - return(.request(params)) + # Make the API call. If the API returns a non-200 status code, indicating e.g. + # a database error, .request() raises an error. It returns an empty string if + # there are no results for our query. + response <- .request(params) + if (nchar(response) == 0) { + # empty if no results + return(NULL) + } + + # geo_value must be read as character so FIPS codes are returned as character, + # not numbers (with leading 0s potentially removed) + return(read.csv(textConnection(response), stringsAsFactors = FALSE, + colClasses = c("geo_value" = "character"))) } # Helper function to cast values and/or ranges to strings @@ -751,8 +766,8 @@ covidcast <- function(data_source, signal, time_type, geo_type, time_values, httr::stop_for_status(response, task = "fetch data from API") - return(jsonlite::fromJSON(httr::content(response, as = "text", - encoding = "utf-8"))) + return(httr::content(response, as = "text", + encoding = "utf-8")) } # This is the date format expected by the API diff --git a/R-packages/covidcast/tests/testthat/api.covidcast.cmu.edu/epidata/api.php-1d9b5c.csv b/R-packages/covidcast/tests/testthat/api.covidcast.cmu.edu/epidata/api.php-1d9b5c.csv new file mode 100644 index 00000000..be57bbb1 --- /dev/null +++ b/R-packages/covidcast/tests/testthat/api.covidcast.cmu.edu/epidata/api.php-1d9b5c.csv @@ -0,0 +1,2 @@ +geo_value,signal,time_value,issue,lag,value,stderr,sample_size +01000,bar-not-found,20200101,20200102,1,1.0,0.1,2.0 diff --git a/R-packages/covidcast/tests/testthat/api.covidcast.cmu.edu/epidata/api.php-3e1dc3.csv b/R-packages/covidcast/tests/testthat/api.covidcast.cmu.edu/epidata/api.php-3e1dc3.csv new file mode 100644 index 00000000..ee6523be --- /dev/null +++ 
b/R-packages/covidcast/tests/testthat/api.covidcast.cmu.edu/epidata/api.php-3e1dc3.csv @@ -0,0 +1,3 @@ +signal,geo_value,value,time_value,issue,lag,sample_size,stderr +bar,pa,1,20200101,20200101,0,1,1 +bar,tx,1,20200101,20200101,0,1,1 diff --git a/R-packages/covidcast/tests/testthat/api.covidcast.cmu.edu/epidata/api.php-64a69c.json b/R-packages/covidcast/tests/testthat/api.covidcast.cmu.edu/epidata/api.php-64a69c.json deleted file mode 100644 index ea92a725..00000000 --- a/R-packages/covidcast/tests/testthat/api.covidcast.cmu.edu/epidata/api.php-64a69c.json +++ /dev/null @@ -1,26 +0,0 @@ -{ - "result": 1, - "message": "success", - "epidata": [ - { - "signal": "bar", - "geo_value": "pa", - "value": 1, - "time_value": "20200101", - "issue": "20200101", - "lag": 0, - "sample_size": 1, - "stderr": 1 - }, - { - "signal": "bar", - "geo_value": "tx", - "value": 1, - "time_value": "20200101", - "issue": "20200101", - "lag": 0, - "sample_size": 1, - "stderr": 1 - } - ] -} diff --git a/R-packages/covidcast/tests/testthat/api.covidcast.cmu.edu/epidata/api.php-6a5814.json b/R-packages/covidcast/tests/testthat/api.covidcast.cmu.edu/epidata/api.php-6a5814.json deleted file mode 100644 index ea92a725..00000000 --- a/R-packages/covidcast/tests/testthat/api.covidcast.cmu.edu/epidata/api.php-6a5814.json +++ /dev/null @@ -1,26 +0,0 @@ -{ - "result": 1, - "message": "success", - "epidata": [ - { - "signal": "bar", - "geo_value": "pa", - "value": 1, - "time_value": "20200101", - "issue": "20200101", - "lag": 0, - "sample_size": 1, - "stderr": 1 - }, - { - "signal": "bar", - "geo_value": "tx", - "value": 1, - "time_value": "20200101", - "issue": "20200101", - "lag": 0, - "sample_size": 1, - "stderr": 1 - } - ] -} diff --git a/R-packages/covidcast/tests/testthat/api.covidcast.cmu.edu/epidata/api.php-96f6a5.json b/R-packages/covidcast/tests/testthat/api.covidcast.cmu.edu/epidata/api.php-96f6a5.json deleted file mode 100644 index c6adc102..00000000 --- 
a/R-packages/covidcast/tests/testthat/api.covidcast.cmu.edu/epidata/api.php-96f6a5.json +++ /dev/null @@ -1,4 +0,0 @@ -{ - "result": -2, - "message": "no results" -} diff --git a/R-packages/covidcast/tests/testthat/api.covidcast.cmu.edu/epidata/api.php-b6e478.csv b/R-packages/covidcast/tests/testthat/api.covidcast.cmu.edu/epidata/api.php-b6e478.csv new file mode 100644 index 00000000..e69de29b diff --git a/R-packages/covidcast/tests/testthat/api.covidcast.cmu.edu/epidata/api.php-cb89ad.json b/R-packages/covidcast/tests/testthat/api.covidcast.cmu.edu/epidata/api.php-cb89ad.json deleted file mode 100644 index f0bf49d9..00000000 --- a/R-packages/covidcast/tests/testthat/api.covidcast.cmu.edu/epidata/api.php-cb89ad.json +++ /dev/null @@ -1,17 +0,0 @@ -{ - "message": "success", - "result": 1, - "epidata": [ - { - "geo_value": "01000", - "signal": "bar-not-found", - "time_value": 20200101, - "direction": null, - "issue": 20200102, - "lag": 1, - "value": 1.0, - "stderr": 0.1, - "sample_size": 2.0 - } - ] -} diff --git a/R-packages/covidcast/tests/testthat/api.covidcast.cmu.edu/epidata/api.php-d2e163.json b/R-packages/covidcast/tests/testthat/api.covidcast.cmu.edu/epidata/api.php-d2e163.json deleted file mode 100644 index 7805d705..00000000 --- a/R-packages/covidcast/tests/testthat/api.covidcast.cmu.edu/epidata/api.php-d2e163.json +++ /dev/null @@ -1,30 +0,0 @@ -{ - "message": "success", - "result": 1, - "epidata": [ - { - "data_source": "foo", - "signal": "bar", - "min_time": 20200101, - "max_time": 20200102, - "max_issue": 20200404, - "min_value": 0, - "max_value": 10, - "num_locations": 100, - "time_type": "day", - "geo_type": "county" - }, - { - "data_source": "foo", - "signal": "bar2", - "min_time": 20201002, - "max_time": 20201003, - "max_issue": 20201101, - "num_locations": 100, - "min_value": 0, - "max_value": 10, - "time_type": "day", - "geo_type": "county" - } - ] -} diff --git a/R-packages/covidcast/tests/testthat/api.covidcast.cmu.edu/epidata/api.php-d707dc.csv 
b/R-packages/covidcast/tests/testthat/api.covidcast.cmu.edu/epidata/api.php-d707dc.csv new file mode 100644 index 00000000..e69de29b diff --git a/R-packages/covidcast/tests/testthat/api.covidcast.cmu.edu/epidata/api.php-da6974.json b/R-packages/covidcast/tests/testthat/api.covidcast.cmu.edu/epidata/api.php-da6974.json deleted file mode 100644 index c6adc102..00000000 --- a/R-packages/covidcast/tests/testthat/api.covidcast.cmu.edu/epidata/api.php-da6974.json +++ /dev/null @@ -1,4 +0,0 @@ -{ - "result": -2, - "message": "no results" -} diff --git a/R-packages/covidcast/tests/testthat/api.covidcast.cmu.edu/epidata/api.php-dd024f.csv b/R-packages/covidcast/tests/testthat/api.covidcast.cmu.edu/epidata/api.php-dd024f.csv new file mode 100644 index 00000000..27fa487a --- /dev/null +++ b/R-packages/covidcast/tests/testthat/api.covidcast.cmu.edu/epidata/api.php-dd024f.csv @@ -0,0 +1,3 @@ +data_source,signal,time_type,geo_type,min_time,max_time,min_value,max_value,num_locations,max_issue +foo,bar,day,county,20200101,20200102,0,10,100,20200404 +foo,bar2,day,county,20201002,20201003,0,10,100,20201101 diff --git a/R-packages/covidcast/tests/testthat/api.covidcast.cmu.edu/epidata/api.php-f666a2.csv b/R-packages/covidcast/tests/testthat/api.covidcast.cmu.edu/epidata/api.php-f666a2.csv new file mode 100644 index 00000000..ee6523be --- /dev/null +++ b/R-packages/covidcast/tests/testthat/api.covidcast.cmu.edu/epidata/api.php-f666a2.csv @@ -0,0 +1,3 @@ +signal,geo_value,value,time_value,issue,lag,sample_size,stderr +bar,pa,1,20200101,20200101,0,1,1 +bar,tx,1,20200101,20200101,0,1,1 diff --git a/R-packages/covidcast/tests/testthat/test-covidcast.R b/R-packages/covidcast/tests/testthat/test-covidcast.R index 9f1961a3..46b8c70b 100644 --- a/R-packages/covidcast/tests/testthat/test-covidcast.R +++ b/R-packages/covidcast/tests/testthat/test-covidcast.R @@ -13,8 +13,10 @@ library(dplyr) # types of errors. # # 2. 
Once you've written a test, it can be difficult to find the file storing -# the JSON needed for that test. We hence store the filename in comments -# adjacent to every call. +# the JSON or CSV needed for that test. We hence store the filename in comments +# adjacent to every call. (Note that we request CSVs from the API for +# covidcast_signal, and httptest suggests filenames ending in .json by default. +# You can use .csv instead and httptest will correctly locate those files.) # # 3. covidcast_signal() calls covidcast_meta() unconditionally. We hence need a # single meta file that suffices for all tests that call covidcast_signal(). @@ -38,20 +40,20 @@ library(dplyr) with_mock_api({ test_that("covidcast_meta formats result correctly", { - # api.php-d2e163.json + # api.php-dd024f.csv expect_equal(covidcast_meta(), structure( data.frame( data_source = "foo", signal = c("bar", "bar2"), + time_type = "day", + geo_type = "county", min_time = as.Date(c("2020-01-01", "2020-10-02")), max_time = as.Date(c("2020-01-02", "2020-10-03")), - max_issue = as.Date(c("2020-04-04", "2020-11-01")), min_value = 0, max_value = 10, num_locations = 100, - time_type = "day", - geo_type = "county" + max_issue = as.Date(c("2020-04-04", "2020-11-01")) ), class = c("covidcast_meta", "data.frame") )) @@ -59,8 +61,7 @@ with_mock_api({ }) test_that("covidcast_meta raises error when API signals one", { - stub(covidcast_meta, ".request", - list(message = "argle-bargle")) + stub(covidcast_meta, ".request", "") expect_error(covidcast_meta(), class = "covidcast_meta_fetch_failed") @@ -69,13 +70,13 @@ test_that("covidcast_meta raises error when API signals one", { with_mock_api({ ## covidcast_signal() tests test_that("covidcast_signal warns when requested geo_values are unavailable", { - # api.php-6a5814.json + # api.php-3e1dc3.csv expect_warning(covidcast_signal("foo", "bar", "2020-01-01", "2020-01-01", geo_values = c("pa", "tx", "DUCKS")), class = "covidcast_missing_geo_values") # ...but not when 
they *are* available. - # api.php-64a69c.json + # api.php-f666a2.csv expect_silent(suppressMessages( covidcast_signal("foo", "bar", "2020-01-01", "2020-01-01", geo_values = c("pa", "tx")))) @@ -83,12 +84,12 @@ with_mock_api({ test_that("covidcast_signal warns when requested dates are unavailable", { # with geo_values = "*". - # api.php-96f6a5.json + # api.php-b6e478.csv expect_warning(covidcast_signal("foo", "bar", "2020-01-02", "2020-01-02"), class = "covidcast_fetch_failed") # and with geo_values = "pa" - # api.php-da6974.json + # api.php-d707dc.csv expect_warning(covidcast_signal("foo", "bar", "2020-01-02", "2020-01-02", geo_values = "pa"), class = "covidcast_fetch_failed") @@ -101,7 +102,7 @@ with_mock_api({ test_that("covidcast_signal works for signals with no meta", { # when no meta is available, we must provide start_day and end_day. - # api.php-cb89ad.json + # api.php-1d9b5c.csv expect_equal( covidcast_signal("foo", "bar-not-found", "2020-01-01", "2020-01-01"), @@ -123,7 +124,7 @@ with_mock_api({ }) test_that("covidcast_signal stops when end_day < start_day", { - # reusing api.php-da6974.json + # reusing api.php-dd024f.csv for metadata expect_error(covidcast_signal("foo", "bar", "2020-01-02", "2020-01-01")) }) @@ -139,7 +140,7 @@ with_mock_api({ test_that("covidcast_days does not treat \"*\" as a missing geo_value", { stub(covidcast_days, "covidcast", - list(message = "success", epidata = data.frame( + data.frame( geo_value = c("geoa", "geob"), signal = "signal", time_value = c(20201030, 20201031), @@ -149,7 +150,8 @@ test_that("covidcast_days does not treat \"*\" as a missing geo_value", { value = 3, stderr = NA, sample_size = NA - ), result = 1)) + )) + # Expect no warning expect_warning( covidcast_days( @@ -168,7 +170,7 @@ test_that("covidcast_days does not treat \"*\" as a missing geo_value", { test_that("covidcast_days does not raise warnings for full response", { stub(covidcast_days, "covidcast", - list(message = "success", epidata = data.frame( + 
data.frame( geo_value = c("geoa"), signal = "signal", time_value = c(20201030, 20201031), @@ -178,7 +180,8 @@ test_that("covidcast_days does not raise warnings for full response", { value = 3, stderr = NA, sample_size = NA - ), result = 1)) + )) + # Expect no warning expect_warning( covidcast_days( @@ -196,19 +199,22 @@ test_that("covidcast_days does not raise warnings for full response", { }) test_that("covidcast_days batches calls to covidcast", { - covidcast_returns <- rep(list(list(message = "success", epidata = data.frame( - geo_value = c("geoa"), - signal = "signal", - time_value = rep(NA, 3), - direction = NA, - issue = as.Date("2020-11-04"), - lag = 2, - value = 3, - stderr = NA, - sample_size = NA - ), result = 1)), 10) - covidcast_returns[[1]]$epidata$time_value <- 20101001:20101003 - covidcast_returns[[2]]$epidata$time_value <- 20101004:20101006 + covidcast_returns <- rep( + list( + data.frame( + geo_value = c("geoa"), + signal = "signal", + time_value = rep(NA, 3), + direction = NA, + issue = as.Date("2020-11-04"), + lag = 2, + value = 3, + stderr = NA, + sample_size = NA + )), + 10) + covidcast_returns[[1]]$time_value <- 20101001:20101003 + covidcast_returns[[2]]$time_value <- 20101004:20101006 m <- mock(covidcast_returns[[1]], covidcast_returns[[2]]) stub(covidcast_days, "covidcast", m) diff --git a/R-packages/covidcast/vignettes/correlation-utils.Rmd b/R-packages/covidcast/vignettes/correlation-utils.Rmd index 2ca9e3c2..80106e96 100644 --- a/R-packages/covidcast/vignettes/correlation-utils.Rmd +++ b/R-packages/covidcast/vignettes/correlation-utils.Rmd @@ -147,6 +147,8 @@ we can plot these correlations in space as a choropleth map, using ```{r, fig.width = 10, fig.height = 8} # Set a bunch of fields so that the data frame knows how to plot itself +df_cor2$data_source <- "cor" +df_cor2$signal <- "cor" df_cor2$time_value <- start_day df_cor2$issue <- start_day attributes(df_cor2)$metadata$geo_type <- "county"