Skip to content

Commit 1cb710f

Browse files
author
admin
committed
Merge branch '98-improve-documentation-experience-for-users-constructing-first-epi_archive' of https://github.com/cmu-delphi/epiprocess into 98-improve-documentation-experience-for-users-constructing-first-epi_archive
2 parents 436f581 + a7735d2 commit 1cb710f

File tree

8 files changed

+268
-53
lines changed

8 files changed

+268
-53
lines changed

R/archive.R

Lines changed: 53 additions & 48 deletions
Original file line numberDiff line numberDiff line change
@@ -16,7 +16,7 @@
1616
#' @details An `epi_archive` is an R6 class which contains a data table `DT`, of
1717
#' class `data.table` from the `data.table` package, with (at least) the
1818
#' following columns:
19-
#'
19+
#'
2020
#' * `geo_value`: the geographic value associated with each row of measurements.
2121
#' * `time_value`: the time value associated with each row of measurements.
2222
#' * `version`: the time value specifying the version for each row of
@@ -45,7 +45,7 @@
4545
#' reference semantics. A primary consequence of this is that objects are not
4646
#' copied when modified. You can read more about this in Hadley Wickham's
4747
#' [Advanced R](https://adv-r.hadley.nz/r6.html#r6-semantics) book.
48-
#'
48+
#'
4949
#' @section Metadata:
5050
#' The following pieces of metadata are included as fields in an `epi_archive`
5151
#' object:
@@ -77,7 +77,7 @@
7777
#' sliding computation at any given reference time point t is performed on
7878
#' **data that would have been available as of t**. More details on `slide()`
7979
#' are documented in the wrapper function `epix_slide()`.
80-
#'
80+
#'
8181
#' @importFrom R6 R6Class
8282
#' @export
8383
#' @examples
@@ -104,7 +104,7 @@ epi_archive =
104104
additional_metadata = NULL,
105105
#' @description Creates a new `epi_archive` object.
106106
#' @param x A data frame, data table, or tibble, with columns `geo_value`,
107-
#' `time_value`, `version`, and then any additional number of columns.
107+
#' `time_value`, `version`, and then any additional number of columns.
108108
#' @param geo_type Type for the geo values. If missing, then the function will
109109
#' attempt to infer it from the geo values present; if this fails, then it
110110
#' will be set to "custom".
@@ -124,12 +124,12 @@ epi_archive =
124124
#' Refer to the documentation for [as_epi_archive()] for more information
125125
#' and examples of parameter names.
126126
initialize = function(x, geo_type, time_type, other_keys,
127-
additional_metadata) {
127+
additional_metadata) {
128128
# Check that we have a data frame
129129
if (!is.data.frame(x)) {
130130
Abort("`x` must be a data frame.")
131131
}
132-
132+
133133
# Check that we have geo_value, time_value, version columns
134134
if (!("geo_value" %in% names(x))) {
135135
Abort("`x` must contain a `geo_value` column.")
@@ -140,7 +140,7 @@ epi_archive =
140140
if (!("version" %in% names(x))) {
141141
Abort("`x` must contain a `version` column.")
142142
}
143-
143+
144144
# If geo type is missing, then try to guess it
145145
if (missing(geo_type)) {
146146
geo_type = guess_geo_type(x$geo_value)
@@ -150,7 +150,7 @@ epi_archive =
150150
if (missing(time_type)) {
151151
time_type = guess_time_type(x$time_value)
152152
}
153-
153+
154154
# Finish off with small checks on keys variables and metadata
155155
if (missing(other_keys)) other_keys = NULL
156156
if (missing(additional_metadata)) additional_metadata = list()
@@ -164,7 +164,7 @@ epi_archive =
164164
c("geo_type", "time_type"))) {
165165
Warn("`additional_metadata` names overlap with existing metadata fields \"geo_type\", \"time_type\".")
166166
}
167-
167+
168168
# Create the data table; if x was an un-keyed data.table itself,
169169
# then the call to as.data.table() will fail to set keys, so we
170170
# need to check this, then do it manually if needed
@@ -182,8 +182,8 @@ epi_archive =
182182
cat("An `epi_archive` object, with metadata:\n")
183183
cat(sprintf("* %-9s = %s\n", "geo_type", self$geo_type))
184184
cat(sprintf("* %-9s = %s\n", "time_type", self$time_type))
185-
if (!is.null(self$additional_metadata)) {
186-
sapply(self$additional_metadata, function(m) {
185+
if (!is.null(self$additional_metadata)) {
186+
sapply(self$additional_metadata, function(m) {
187187
cat(sprintf("* %-9s = %s\n", names(m), m))
188188
})
189189
}
@@ -197,10 +197,15 @@ epi_archive =
197197
cat(sprintf("* %-14s = %s\n", "max version",
198198
max(self$DT$version)))
199199
cat("----------\n")
200-
cat(sprintf("Data archive (stored in DT field): %i x %i\n",
200+
cat(sprintf("Data archive (stored in DT field): %i x %i\n",
201201
nrow(self$DT), ncol(self$DT)))
202202
cat("----------\n")
203-
cat(sprintf("Public methods: %s",
203+
cat(sprintf("Columns in DT: %s\n", paste(ifelse(length(
204+
colnames(self$DT)) <= 4, paste(colnames(self$DT), collapse = ", "),
205+
paste(paste(colnames(self$DT)[1:4], collapse = ", "), "and",
206+
length(colnames(self$DT)[5:length(colnames(self$DT))]), "more columns")))))
207+
cat("----------\n")
208+
cat(sprintf("Public methods: %s\n",
204209
paste(names(epi_archive$public_methods),
205210
collapse = ", ")))
206211
},
@@ -214,7 +219,7 @@ epi_archive =
214219
other_keys = setdiff(key(self$DT),
215220
c("geo_value", "time_value", "version"))
216221
if (length(other_keys) == 0) other_keys = NULL
217-
222+
218223
# Check a few things on max_version
219224
if (!identical(class(max_version), class(self$DT$version))) {
220225
Abort("`max_version` and `DT$version` must have same class.")
@@ -228,23 +233,23 @@ epi_archive =
228233
if (max_version == self_max) {
229234
Warn("Getting data as of the latest version possible. For a variety of reasons, it is possible that we only have a preliminary picture of this version (e.g., the upstream source has updated it but we have not seen it due to latency in synchronization). Thus, the snapshot that we produce here might not be reproducible at a later time (e.g., when the archive has caught up in terms of synchronization).")
230235
}
231-
236+
232237
# Filter by version and return
233238
return(
234239
# Make sure to use data.table ways of filtering and selecting
235240
self$DT[time_value >= min_time_value &
236241
version <= max_version, ] %>%
237242
unique(by = c("geo_value", "time_value", other_keys),
238243
fromLast = TRUE) %>%
239-
tibble::as_tibble() %>%
244+
tibble::as_tibble() %>%
240245
dplyr::select(-.data$version) %>%
241246
as_epi_df(geo_type = self$geo_type,
242247
time_type = self$time_type,
243248
as_of = max_version,
244249
additional_metadata = c(self$additional_metadata,
245250
other_keys = other_keys))
246251
)
247-
},
252+
},
248253
#####
249254
#' @description Merges another `data.table` with the current one, and allows for
250255
#' a post-filling of `NA` values by last observation carried forward (LOCF).
@@ -253,7 +258,7 @@ epi_archive =
253258
merge = function(y, ..., locf = TRUE, nan = NA) {
254259
# Check we have a `data.table` object
255260
if (!(inherits(y, "data.table") || inherits(y, "epi_archive"))) {
256-
Abort("`y` must be of class `data.table` or `epi_archive`.")
261+
Abort("`y` must be of class `data.table` or `epi_archive`.")
257262
}
258263

259264
# Use the data.table merge function, carrying through ... args
@@ -268,42 +273,42 @@ epi_archive =
268273

269274
# Important: use nafill and not setnafill because the latter
270275
# returns the entire data frame by reference, and the former can
271-
# be set to act on particular columns by reference using :=
276+
# be set to act on particular columns by reference using :=
272277
self$DT[,
273-
(cols) := nafill(.SD, type = "locf", nan = nan),
274-
.SDcols = cols,
278+
(cols) := nafill(.SD, type = "locf", nan = nan),
279+
.SDcols = cols,
275280
by = by]
276281
}
277-
},
282+
},
278283
#####
279284
#' @description Slides a given function over variables in an `epi_archive`
280285
#' object. See the documentation for the wrapper function `epix_as_of()` for
281-
#' details.
286+
#' details.
282287
#' @importFrom data.table key
283288
#' @importFrom rlang !! !!! enquo enquos is_quosure sym syms
284-
slide = function(f, ..., n = 7, group_by, ref_time_values,
289+
slide = function(f, ..., n = 7, group_by, ref_time_values,
285290
time_step, new_col_name = "slide_value",
286291
as_list_col = FALSE, names_sep = "_",
287-
all_rows = FALSE) {
292+
all_rows = FALSE) {
288293
# If missing, then set ref time values to be everything; else make
289-
# sure we intersect with observed time values
294+
# sure we intersect with observed time values
290295
if (missing(ref_time_values)) {
291296
ref_time_values = unique(self$DT$time_value)
292297
}
293298
else {
294299
ref_time_values = ref_time_values[ref_time_values %in%
295300
unique(self$DT$time_value)]
296301
}
297-
298-
# If a custom time step is specified, then redefine units
302+
303+
# If a custom time step is specified, then redefine units
299304
before_num = n-1
300305
if (!missing(time_step)) before_num = time_step(n-1)
301-
306+
302307
# What to group by? If missing, set according to internal keys
303308
if (missing(group_by)) {
304309
group_by = setdiff(key(self$DT), c("time_value", "version"))
305310
}
306-
311+
307312
# Symbolize column name, defuse grouping variables. We have to do
308313
# the middle step here which is a bit complicated (unfortunately)
309314
# since the function epix_slide() could have called the current one,
@@ -315,20 +320,20 @@ epi_archive =
315320

316321
# Key variable names, apart from time value and version
317322
key_vars = setdiff(key(self$DT), c("time_value", "version"))
318-
323+
319324
# Computation for one group, one time value
320325
comp_one_grp = function(.data_group,
321-
f, ...,
326+
f, ...,
322327
time_value,
323328
key_vars,
324329
new_col) {
325-
# Carry out the specified computation
330+
# Carry out the specified computation
326331
comp_value = f(.data_group, ...)
327332

328333
# Count the number of appearances of the reference time value.
329334
# Note: ideally, we want to directly count occurrences of the ref
330335
# time value but due to latency, this will often not appear in the
331-
# data group. So we count the number of unique key values, outside
336+
# data group. So we count the number of unique key values, outside
332337
# of the time value column
333338
count = sum(!duplicated(.data_group[, key_vars]))
334339

@@ -362,23 +367,23 @@ epi_archive =
362367
else {
363368
Abort("The slide computation must return an atomic vector or a data frame.")
364369
}
365-
370+
366371
# Note that we've already recycled comp value to make size stable,
367372
# so tibble() will just recycle time value appropriately
368-
return(tibble::tibble(time_value = time_value,
373+
return(tibble::tibble(time_value = time_value,
369374
!!new_col := comp_value))
370375
}
371-
376+
372377
# If f is not missing, then just go ahead, slide by group
373378
if (!missing(f)) {
374379
if (rlang::is_formula(f)) f = rlang::as_function(f)
375-
380+
376381
x = purrr::map_dfr(ref_time_values, function(t) {
377382
self$as_of(t, min_time_value = t - before_num) %>%
378-
tibble::as_tibble() %>%
383+
tibble::as_tibble() %>%
379384
dplyr::group_by(!!!group_by) %>%
380385
dplyr::group_modify(comp_one_grp,
381-
f = f, ...,
386+
f = f, ...,
382387
time_value = t,
383388
key_vars = key_vars,
384389
new_col = new_col,
@@ -396,14 +401,14 @@ epi_archive =
396401
if (length(quos) > 1) {
397402
Abort("If `f` is missing then only a single computation can be specified via `...`.")
398403
}
399-
404+
400405
quo = quos[[1]]
401406
f = function(x, quo, ...) rlang::eval_tidy(quo, x)
402407
new_col = sym(names(rlang::quos_auto_name(quos)))
403408

404409
x = purrr::map_dfr(ref_time_values, function(t) {
405410
self$as_of(t, min_time_value = t - before_num) %>%
406-
tibble::as_tibble() %>%
411+
tibble::as_tibble() %>%
407412
dplyr::group_by(!!!group_by) %>%
408413
dplyr::group_modify(comp_one_grp,
409414
f = f, quo = quo,
@@ -414,12 +419,12 @@ epi_archive =
414419
dplyr::ungroup()
415420
})
416421
}
417-
422+
418423
# Unnest if we need to
419424
if (!as_list_col) {
420425
x = tidyr::unnest(x, !!new_col, names_sep = names_sep)
421426
}
422-
427+
423428
# Join to get all rows, if we need to, then return
424429
if (all_rows) {
425430
cols = c(as.character(group_by), "time_value")
@@ -430,7 +435,7 @@ epi_archive =
430435
}
431436
)
432437
)
433-
438+
434439
#' Convert to `epi_archive` format
435440
#'
436441
#' Converts a data frame, data table, or tibble into an `epi_archive`
@@ -498,15 +503,15 @@ epi_archive =
498503
#' time_type = "day",
499504
#' other_keys = "county")
500505
as_epi_archive = function(x, geo_type, time_type, other_keys,
501-
additional_metadata = list()) {
502-
epi_archive$new(x, geo_type, time_type, other_keys, additional_metadata)
506+
additional_metadata = list()) {
507+
epi_archive$new(x, geo_type, time_type, other_keys, additional_metadata)
503508
}
504509

505510
#' Test for `epi_archive` format
506511
#'
507512
#' @param x An object.
508513
#' @return `TRUE` if the object inherits from `epi_archive`.
509-
#'
514+
#'
510515
#' @export
511516
#' @examples
512517
#' is_epi_archive(jhu_csse_daily_subset) # FALSE (this is an epi_df, not epi_archive)

R/growth_rate.R

Lines changed: 11 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -100,6 +100,17 @@
100100
#' user.
101101
#'
102102
#' @export
103+
#' @examples
104+
#' # COVID cases growth rate by state using default method relative change
105+
#' jhu_csse_daily_subset %>%
106+
#' group_by(geo_value) %>%
107+
#' mutate(cases_gr = growth_rate(x = time_value, y = cases))
108+
#'
109+
#' # Log scale, degree 4 polynomial and 6-fold cross validation
110+
#' jhu_csse_daily_subset %>%
111+
#' group_by(geo_value) %>%
112+
#' mutate(gr_poly = growth_rate( x = time_value, y = cases, log_scale = TRUE, ord = 4, k = 6))
113+
103114
growth_rate = function(x = seq_along(y), y, x0 = x,
104115
method = c("rel_change", "linear_reg",
105116
"smooth_spline", "trend_filter"),

R/slide.R

Lines changed: 26 additions & 2 deletions
Original file line numberDiff line numberDiff line change
@@ -84,7 +84,27 @@
8484
#' tidy evaluation (first example, above), then the name for the new column is
8585
#' inferred from the given expression and overrides any name passed explicitly
8686
#' through the `new_col_name` argument.
87-
#'
87+
#'
88+
#' When `f` is a named function with arguments, if a tibble with an unnamed
89+
#' grouping variable is passed in as the method argument to `f`, include a
90+
#' parameter for the grouping-variable in `function()` just prior to
91+
#' specifying the method to prevent that from being overridden. For example:
92+
#' ```
93+
#' # Construct an tibble with an unnamed grouping variable
94+
#' edf = bind_rows(tibble(geo_value = "ak", time_value = as.Date("2020-01-01")
95+
#' + 1:10, x1=1:10, y=1:10 + rnorm(10L))) %>%
96+
#' as_epi_df()
97+
#'
98+
#' # Now, include a row parameter for the grouping variable in the tibble,
99+
#' # which we denote as g, just prior to method = "qr"
100+
#' # Note that if g was not included below, then the method = "qr" would be
101+
#' # overridden, as described above
102+
#' edf %>%
103+
#' group_by(geo_value) %>%
104+
#' epi_slide(function(x, g, method="qr", ...) tibble(model=list(
105+
#' lm(y ~ x1, x, method=method))), n=7L)
106+
#' ```
107+
#'
88108
#' @importFrom lubridate days weeks
89109
#' @importFrom rlang .data .env !! enquo enquos sym
90110
#' @export
@@ -125,7 +145,7 @@ epi_slide = function(x, f, ..., n = 7, ref_time_values,
125145
# intersect with observed time values
126146
if (missing(ref_time_values)) {
127147
ref_time_values = unique(x$time_value)
128-
}
148+
}
129149
else {
130150
ref_time_values = ref_time_values[ref_time_values %in%
131151
unique(x$time_value)]
@@ -168,6 +188,10 @@ epi_slide = function(x, f, ..., n = 7, ref_time_values,
168188
time_range = range(unique(x$time_value))
169189
starts = in_range(ref_time_values - before_num, time_range)
170190
stops = in_range(ref_time_values + after_num, time_range)
191+
192+
if( length(starts) == 0 || length(stops) == 0 ) {
193+
Abort("The starting and/or stopping times for sliding are out of bounds with respect to the range of times in your data. Check your settings for ref_time_values and align (and before, if specified).")
194+
}
171195

172196
# Symbolize new column name
173197
new_col = sym(new_col_name)

0 commit comments

Comments
 (0)