1616# ' @details An `epi_archive` is an R6 class which contains a data table `DT`, of
1717# ' class `data.table` from the `data.table` package, with (at least) the
1818# ' following columns:
19- # '
19+ # '
2020# ' * `geo_value`: the geographic value associated with each row of measurements.
2121# ' * `time_value`: the time value associated with each row of measurements.
2222# ' * `version`: the time value specifying the version for each row of
4545# ' reference semantics. A primary consequence of this is that objects are not
4646# ' copied when modified. You can read more about this in Hadley Wickham's
4747# ' [Advanced R](https://adv-r.hadley.nz/r6.html#r6-semantics) book.
48- # '
48+ # '
4949# ' @section Metadata:
5050# ' The following pieces of metadata are included as fields in an `epi_archive`
5151# ' object:
7777# ' sliding computation at any given reference time point t is performed on
7878# ' **data that would have been available as of t**. More details on `slide()`
7979# ' are documented in the wrapper function `epix_slide()`.
80- # '
80+ # '
8181# ' @importFrom R6 R6Class
8282# ' @export
8383# ' @examples
@@ -104,7 +104,7 @@ epi_archive =
104104 additional_metadata = NULL ,
105105# ' @description Creates a new `epi_archive` object.
106106# ' @param x A data frame, data table, or tibble, with columns `geo_value`,
107- # ' `time_value`, `version`, and then any additional number of columns.
107+ # ' `time_value`, `version`, and then any additional number of columns.
108108# ' @param geo_type Type for the geo values. If missing, then the function will
109109# ' attempt to infer it from the geo values present; if this fails, then it
110110# ' will be set to "custom".
@@ -124,12 +124,12 @@ epi_archive =
124124# ' Refer to the documentation for [as_epi_archive()] for more information
125125# ' and examples of parameter names.
126126 initialize = function (x , geo_type , time_type , other_keys ,
127- additional_metadata ) {
127+ additional_metadata ) {
128128 # Check that we have a data frame
129129 if (! is.data.frame(x )) {
130130 Abort(" `x` must be a data frame." )
131131 }
132-
132+
133133 # Check that we have geo_value, time_value, version columns
134134 if (! (" geo_value" %in% names(x ))) {
135135 Abort(" `x` must contain a `geo_value` column." )
@@ -140,7 +140,7 @@ epi_archive =
140140 if (! (" version" %in% names(x ))) {
141141 Abort(" `x` must contain a `version` column." )
142142 }
143-
143+
144144 # If geo type is missing, then try to guess it
145145 if (missing(geo_type )) {
146146 geo_type = guess_geo_type(x $ geo_value )
@@ -150,7 +150,7 @@ epi_archive =
150150 if (missing(time_type )) {
151151 time_type = guess_time_type(x $ time_value )
152152 }
153-
153+
154154 # Finish off with small checks on keys variables and metadata
155155 if (missing(other_keys )) other_keys = NULL
156156 if (missing(additional_metadata )) additional_metadata = list ()
@@ -164,7 +164,7 @@ epi_archive =
164164 c(" geo_type" , " time_type" ))) {
165165 Warn(" `additional_metadata` names overlap with existing metadata fields \" geo_type\" , \" time_type\" ." )
166166 }
167-
167+
168168 # Create the data table; if x was an un-keyed data.table itself,
169169 # then the call to as.data.table() will fail to set keys, so we
170170 # need to check this, then do it manually if needed
@@ -182,8 +182,8 @@ epi_archive =
182182 cat(" An `epi_archive` object, with metadata:\n " )
183183 cat(sprintf(" * %-9s = %s\n " , " geo_type" , self $ geo_type ))
184184 cat(sprintf(" * %-9s = %s\n " , " time_type" , self $ time_type ))
185- if (! is.null(self $ additional_metadata )) {
186- sapply(self $ additional_metadata , function (m ) {
185+ if (! is.null(self $ additional_metadata )) {
186+ sapply(self $ additional_metadata , function (m ) {
187187 cat(sprintf(" * %-9s = %s\n " , names(m ), m ))
188188 })
189189 }
@@ -197,10 +197,15 @@ epi_archive =
197197 cat(sprintf(" * %-14s = %s\n " , " max version" ,
198198 max(self $ DT $ version )))
199199 cat(" ----------\n " )
200- cat(sprintf(" Data archive (stored in DT field): %i x %i\n " ,
200+ cat(sprintf(" Data archive (stored in DT field): %i x %i\n " ,
201201 nrow(self $ DT ), ncol(self $ DT )))
202202 cat(" ----------\n " )
203- cat(sprintf(" Public methods: %s" ,
203+ cat(sprintf(" Columns in DT: %s\n " , paste(ifelse(length(
204+ colnames(self $ DT )) < = 4 , paste(colnames(self $ DT ), collapse = " , " ),
205+ paste(paste(colnames(self $ DT )[1 : 4 ], collapse = " , " ), " and" ,
206+ length(colnames(self $ DT )[5 : length(colnames(self $ DT ))]), " more columns" )))))
207+ cat(" ----------\n " )
208+ cat(sprintf(" Public methods: %s\n " ,
204209 paste(names(epi_archive $ public_methods ),
205210 collapse = " , " )))
206211 },
@@ -214,7 +219,7 @@ epi_archive =
214219 other_keys = setdiff(key(self $ DT ),
215220 c(" geo_value" , " time_value" , " version" ))
216221 if (length(other_keys ) == 0 ) other_keys = NULL
217-
222+
218223 # Check a few things on max_version
219224 if (! identical(class(max_version ), class(self $ DT $ version ))) {
220225 Abort(" `max_version` and `DT$version` must have same class." )
@@ -228,23 +233,23 @@ epi_archive =
228233 if (max_version == self_max ) {
229234 Warn(" Getting data as of the latest version possible. For a variety of reasons, it is possible that we only have a preliminary picture of this version (e.g., the upstream source has updated it but we have not seen it due to latency in synchronization). Thus, the snapshot that we produce here might not be reproducible at a later time (e.g., when the archive has caught up in terms of synchronization)." )
230235 }
231-
236+
232237 # Filter by version and return
233238 return (
234239 # Make sure to use data.table ways of filtering and selecting
235240 self $ DT [time_value > = min_time_value &
236241 version < = max_version , ] %> %
237242 unique(by = c(" geo_value" , " time_value" , other_keys ),
238243 fromLast = TRUE ) %> %
239- tibble :: as_tibble() %> %
244+ tibble :: as_tibble() %> %
240245 dplyr :: select(- .data $ version ) %> %
241246 as_epi_df(geo_type = self $ geo_type ,
242247 time_type = self $ time_type ,
243248 as_of = max_version ,
244249 additional_metadata = c(self $ additional_metadata ,
245250 other_keys = other_keys ))
246251 )
247- },
252+ },
248253 # ####
249254# ' @description Merges another `data.table` with the current one, and allows for
250255# ' a post-filling of `NA` values by last observation carried forward (LOCF).
@@ -253,7 +258,7 @@ epi_archive =
253258 merge = function (y , ... , locf = TRUE , nan = NA ) {
254259 # Check we have a `data.table` object
255260 if (! (inherits(y , " data.table" ) || inherits(y , " epi_archive" ))) {
256- Abort(" `y` must be of class `data.table` or `epi_archive`." )
261+ Abort(" `y` must be of class `data.table` or `epi_archive`." )
257262 }
258263
259264 # Use the data.table merge function, carrying through ... args
@@ -268,42 +273,42 @@ epi_archive =
268273
269274 # Important: use nafill and not setnafill because the latter
270275 # returns the entire data frame by reference, and the former can
271- # be set to act on particular columns by reference using :=
276+ # be set to act on particular columns by reference using :=
272277 self $ DT [,
273- (cols ) : = nafill(.SD , type = " locf" , nan = nan ),
274- .SDcols = cols ,
278+ (cols ) : = nafill(.SD , type = " locf" , nan = nan ),
279+ .SDcols = cols ,
275280 by = by ]
276281 }
277- },
282+ },
278283 # ####
279284# ' @description Slides a given function over variables in an `epi_archive`
280285# ' object. See the documentation for the wrapper function `epix_as_of()` for
281- # ' details.
286+ # ' details.
282287# ' @importFrom data.table key
283288# ' @importFrom rlang !! !!! enquo enquos is_quosure sym syms
284- slide = function (f , ... , n = 7 , group_by , ref_time_values ,
289+ slide = function (f , ... , n = 7 , group_by , ref_time_values ,
285290 time_step , new_col_name = " slide_value" ,
286291 as_list_col = FALSE , names_sep = " _" ,
287- all_rows = FALSE ) {
292+ all_rows = FALSE ) {
288293 # If missing, then set ref time values to be everything; else make
289- # sure we intersect with observed time values
294+ # sure we intersect with observed time values
290295 if (missing(ref_time_values )) {
291296 ref_time_values = unique(self $ DT $ time_value )
292297 }
293298 else {
294299 ref_time_values = ref_time_values [ref_time_values %in%
295300 unique(self $ DT $ time_value )]
296301 }
297-
298- # If a custom time step is specified, then redefine units
302+
303+ # If a custom time step is specified, then redefine units
299304 before_num = n - 1
300305 if (! missing(time_step )) before_num = time_step(n - 1 )
301-
306+
302307 # What to group by? If missing, set according to internal keys
303308 if (missing(group_by )) {
304309 group_by = setdiff(key(self $ DT ), c(" time_value" , " version" ))
305310 }
306-
311+
307312 # Symbolize column name, defuse grouping variables. We have to do
308313 # the middle step here which is a bit complicated (unfortunately)
309314 # since the function epix_slide() could have called the current one,
@@ -315,20 +320,20 @@ epi_archive =
315320
316321 # Key variable names, apart from time value and version
317322 key_vars = setdiff(key(self $ DT ), c(" time_value" , " version" ))
318-
323+
319324 # Computation for one group, one time value
320325 comp_one_grp = function (.data_group ,
321- f , ... ,
326+ f , ... ,
322327 time_value ,
323328 key_vars ,
324329 new_col ) {
325- # Carry out the specified computation
330+ # Carry out the specified computation
326331 comp_value = f(.data_group , ... )
327332
328333 # Count the number of appearances of the reference time value.
329334 # Note: ideally, we want to directly count occurrences of the ref
330335 # time value but due to latency, this will often not appear in the
331- # data group. So we count the number of unique key values, outside
336+ # data group. So we count the number of unique key values, outside
332337 # of the time value column
333338 count = sum(! duplicated(.data_group [, key_vars ]))
334339
@@ -362,23 +367,23 @@ epi_archive =
362367 else {
363368 Abort(" The slide computation must return an atomic vector or a data frame." )
364369 }
365-
370+
366371 # Note that we've already recycled comp value to make size stable,
367372 # so tibble() will just recycle time value appropriately
368- return (tibble :: tibble(time_value = time_value ,
373+ return (tibble :: tibble(time_value = time_value ,
369374 !! new_col : = comp_value ))
370375 }
371-
376+
372377 # If f is not missing, then just go ahead, slide by group
373378 if (! missing(f )) {
374379 if (rlang :: is_formula(f )) f = rlang :: as_function(f )
375-
380+
376381 x = purrr :: map_dfr(ref_time_values , function (t ) {
377382 self $ as_of(t , min_time_value = t - before_num ) %> %
378- tibble :: as_tibble() %> %
383+ tibble :: as_tibble() %> %
379384 dplyr :: group_by(!!! group_by ) %> %
380385 dplyr :: group_modify(comp_one_grp ,
381- f = f , ... ,
386+ f = f , ... ,
382387 time_value = t ,
383388 key_vars = key_vars ,
384389 new_col = new_col ,
@@ -396,14 +401,14 @@ epi_archive =
396401 if (length(quos ) > 1 ) {
397402 Abort(" If `f` is missing then only a single computation can be specified via `...`." )
398403 }
399-
404+
400405 quo = quos [[1 ]]
401406 f = function (x , quo , ... ) rlang :: eval_tidy(quo , x )
402407 new_col = sym(names(rlang :: quos_auto_name(quos )))
403408
404409 x = purrr :: map_dfr(ref_time_values , function (t ) {
405410 self $ as_of(t , min_time_value = t - before_num ) %> %
406- tibble :: as_tibble() %> %
411+ tibble :: as_tibble() %> %
407412 dplyr :: group_by(!!! group_by ) %> %
408413 dplyr :: group_modify(comp_one_grp ,
409414 f = f , quo = quo ,
@@ -414,12 +419,12 @@ epi_archive =
414419 dplyr :: ungroup()
415420 })
416421 }
417-
422+
418423 # Unnest if we need to
419424 if (! as_list_col ) {
420425 x = tidyr :: unnest(x , !! new_col , names_sep = names_sep )
421426 }
422-
427+
423428 # Join to get all rows, if we need to, then return
424429 if (all_rows ) {
425430 cols = c(as.character(group_by ), " time_value" )
@@ -430,7 +435,7 @@ epi_archive =
430435 }
431436 )
432437 )
433-
438+
434439# ' Convert to `epi_archive` format
435440# '
436441# ' Converts a data frame, data table, or tibble into an `epi_archive`
@@ -498,15 +503,15 @@ epi_archive =
498503# ' time_type = "day",
499504# ' other_keys = "county")
500505as_epi_archive = function (x , geo_type , time_type , other_keys ,
501- additional_metadata = list ()) {
502- epi_archive $ new(x , geo_type , time_type , other_keys , additional_metadata )
506+ additional_metadata = list ()) {
507+ epi_archive $ new(x , geo_type , time_type , other_keys , additional_metadata )
503508}
504509
505510# ' Test for `epi_archive` format
506511# '
507512# ' @param x An object.
508513# ' @return `TRUE` if the object inherits from `epi_archive`.
509- # '
514+ # '
510515# ' @export
511516# ' @examples
512517# ' is_epi_archive(jhu_csse_daily_subset) # FALSE (this is an epi_df, not epi_archive)
0 commit comments