240240# ' value of `clobberable_versions_start` does not fully trust these empty
241241# ' updates, and assumes that any version `>= max(x$version)` could be
242242# ' clobbered.) If `nrow(x) == 0`, then this argument is mandatory.
243+ # ' @param compactify_tol double. the tolerance used to detect approximate equality for compactification
243244# ' @return An `epi_archive` object.
244245# '
245246# ' @importFrom data.table as.data.table key setkeyv
@@ -295,15 +296,16 @@ new_epi_archive <- function(
295296 additional_metadata ,
296297 compactify ,
297298 clobberable_versions_start ,
298- versions_end ) {
299+ versions_end ,
300+ compactify_tol = .Machine $ double.eps ^ 0.5 ) {
299301 # Create the data table; if x was an un-keyed data.table itself,
300302 # then the call to as.data.table() will fail to set keys, so we
301303 # need to check this, then do it manually if needed
302304 key_vars <- c(" geo_value" , " time_value" , other_keys , " version" )
303- DT <- as.data.table(x , key = key_vars ) # nolint: object_name_linter
304- if (! identical(key_vars , key(DT ))) setkeyv(DT , cols = key_vars )
305+ data_table <- as.data.table(x , key = key_vars ) # nolint: object_name_linter
306+ if (! identical(key_vars , key(data_table ))) setkeyv(data_table , cols = key_vars )
305307
306- if (anyDuplicated(DT , by = key(DT )) != 0L ) {
308+ if (anyDuplicated(data_table , by = key(data_table )) != 0L ) {
307309 cli_abort(" `x` must have one row per unique combination of the key variables. If you
308310 have additional key variables other than `geo_value`, `time_value`, and
309311 `version`, such as an age group column, please specify them in `other_keys`.
@@ -313,38 +315,17 @@ new_epi_archive <- function(
313315 )
314316 }
315317
316- # Checks to see if a value in a vector is LOCF
317- is_locf <- function (vec ) { # nolint: object_usage_linter
318- dplyr :: if_else(! is.na(vec ) & ! is.na(dplyr :: lag(vec )),
319- vec == dplyr :: lag(vec ),
320- is.na(vec ) & is.na(dplyr :: lag(vec ))
321- )
322- }
323-
324- # LOCF is defined by a row where all values except for the version
325- # differ from their respective lag values
326-
327- # Checks for LOCF's in a data frame
328- rm_locf <- function (df ) {
329- dplyr :: filter(df , if_any(c(everything(), - version ), ~ ! is_locf(. ))) # nolint: object_usage_linter
330- }
331-
332- # Keeps LOCF values, such as to be printed
333- keep_locf <- function (df ) {
334- dplyr :: filter(df , if_all(c(everything(), - version ), ~ is_locf(. ))) # nolint: object_usage_linter
335- }
336-
318+ nrow_before_compactify <- nrow(data_table )
337319 # Runs compactify on data frame
338320 if (is.null(compactify ) || compactify == TRUE ) {
339- elim <- keep_locf(DT )
340- DT <- rm_locf(DT ) # nolint: object_name_linter
321+ compactified <- apply_compactify(data_table , key_vars , compactify_tol )
341322 } else {
342- # Create empty data frame for nrow(elim) to be 0
343- elim <- tibble :: tibble()
323+ compactified <- data_table
344324 }
345-
346- # Warns about redundant rows
347- if (is.null(compactify ) && nrow(elim ) > 0 ) {
325+ # Warns about redundant rows if the number of rows decreased, and we didn't
326+ # explicitly say to compactify
327+ if (is.null(compactify ) && nrow(compactified ) < nrow_before_compactify ) {
328+ elim <- removed_by_compactify(data_table , key_vars , compactify_tol )
348329 warning_intro <- cli :: format_inline(
349330 " Found rows that appear redundant based on
350331 last (version of each) observation carried forward;
@@ -366,7 +347,7 @@ new_epi_archive <- function(
366347
367348 structure(
368349 list (
369- DT = DT ,
350+ DT = compactified ,
370351 geo_type = geo_type ,
371352 time_type = time_type ,
372353 additional_metadata = additional_metadata ,
@@ -377,6 +358,63 @@ new_epi_archive <- function(
377358 )
378359}
379360
361+ # ' given a tibble as would be found in an epi_archive, remove duplicate entries.
362+ # ' @description
363+ # ' works by shifting all rows except the version, then comparing values to see
364+ # ' if they've changed. We need to arrange in descending order, but note that
365+ # ' we don't need to group, since at least one column other than version has
366+ # ' changed, and so is kept.
367+ # ' @keywords internal
368+ # ' @importFrom dplyr filter
369+ apply_compactify <- function (df , keys , tolerance = .Machine $ double.eps ^ .5 ) {
370+ df %> %
371+ arrange(!!! keys ) %> %
372+ filter(if_any(
373+ c(everything(), - version ), # all non-version columns
374+ ~ ! is_locf(. , tolerance )
375+ ))
376+ }
377+
378+ # ' get the entries that `compactify` would remove
379+ # ' @keywords internal
380+ # ' @importFrom dplyr filter if_all everything
381+ removed_by_compactify <- function (df , keys , tolerance ) {
382+ df %> %
383+ arrange(!!! keys ) %> %
384+ filter(if_all(
385+ c(everything(), - version ),
386+ ~ is_locf(. , tolerance )
387+ )) # nolint: object_usage_linter
388+ }
389+
390+ # ' Checks to see if a value in a vector is LOCF
391+ # ' @description
392+ # ' LOCF meaning last observation carried forward. lags the vector by 1, then
393+ # ' compares with itself. For doubles it uses float comparison via
394+ # ' [`dplyr::near`], otherwise it uses equality. `NA`'s and `NaN`'s are
395+ # ' considered equal to themselves and each other.
396+ # ' @importFrom dplyr lag if_else near
397+ # ' @keywords internal
398+ is_locf <- function (vec , tolerance ) { # nolint: object_usage_linter
399+ lag_vec <- dplyr :: lag(vec )
400+ if (typeof(vec ) == " double" ) {
401+ res <- if_else(
402+ ! is.na(vec ) & ! is.na(lag_vec ),
403+ near(vec , lag_vec , tol = tolerance ),
404+ is.na(vec ) & is.na(lag_vec )
405+ )
406+ return (res )
407+ } else {
408+ res <- if_else(
409+ ! is.na(vec ) & ! is.na(lag_vec ),
410+ vec == lag_vec ,
411+ is.na(vec ) & is.na(lag_vec )
412+ )
413+ return (res )
414+ }
415+ }
416+
417+
380418# ' `validate_epi_archive` ensures correctness of arguments fed to `as_epi_archive`.
381419# '
382420# ' @rdname epi_archive
0 commit comments