Skip to content

Commit ea8c2aa

Browse files
committed
perf: speed up compactification with approx_equal
1 parent 6ff743f commit ea8c2aa

File tree

5 files changed

+25
-82
lines changed

5 files changed

+25
-82
lines changed

NAMESPACE

Lines changed: 0 additions & 2 deletions
Original file line numberDiff line numberDiff line change
@@ -175,7 +175,6 @@ importFrom(dplyr,if_all)
175175
importFrom(dplyr,if_any)
176176
importFrom(dplyr,if_else)
177177
importFrom(dplyr,is_grouped_df)
178-
importFrom(dplyr,lag)
179178
importFrom(dplyr,mutate)
180179
importFrom(dplyr,pick)
181180
importFrom(dplyr,pull)
@@ -215,7 +214,6 @@ importFrom(rlang,expr_label)
215214
importFrom(rlang,f_env)
216215
importFrom(rlang,f_rhs)
217216
importFrom(rlang,is_bare_integerish)
218-
importFrom(rlang,is_bare_numeric)
219217
importFrom(rlang,is_environment)
220218
importFrom(rlang,is_formula)
221219
importFrom(rlang,is_function)

R/archive.R

Lines changed: 23 additions & 45 deletions
Original file line numberDiff line numberDiff line change
@@ -473,52 +473,30 @@ update_is_locf <- function(arranged_updates_df, ukey_names, abs_tol) {
473473
ekt_names <- ukey_names[ukey_names != "version"]
474474
val_names <- all_names[!all_names %in% ukey_names]
475475

476-
Reduce(`&`, lapply(updates_col_refs[ekt_names], is_locf, abs_tol, TRUE)) &
477-
Reduce(`&`, lapply(updates_col_refs[val_names], is_locf, abs_tol, FALSE))
478-
}
479-
480-
#' Checks to see if a value in a vector is LOCF
481-
#' @description LOCF meaning last observation carried forward (to later
482-
#' versions). Lags the vector by 1, then compares with itself. If `is_key` is
483-
#' `TRUE`, only values that are exactly the same between the lagged and
484-
#' original are considered LOCF. If `is_key` is `FALSE` and `vec` is a vector
485-
#' of numbers ([`base::is.numeric`]), then approximate equality will be used,
486-
#' checking whether the absolute difference between each pair of entries is
487-
#' `<= abs_tol`; if `vec` is something else, then exact equality is used
488-
#' instead.
489-
#'
490-
#' @details
491-
#'
492-
#' We include epikey-time columns in LOCF comparisons as part of an optimization
493-
#' to avoid slower grouped operations while still ensuring that the first
494-
#' observation for each time series will not be marked as LOCF. We test these
495-
#' key columns for exact equality to prevent chopping off consecutive
496-
#' time_values during flat periods when `abs_tol` is high.
497-
#'
498-
#' We use exact equality for non-`is.numeric` double/integer columns such as
499-
#' dates, datetimes, difftimes, `tsibble::yearmonth`s, etc., as these may be
500-
#' used as part of re-indexing or grouping procedures, and we don't want to
501-
#' change the number of groups for those operations when we remove LOCF data
502-
#' during compactification.
503-
#'
504-
#' @importFrom dplyr lag if_else
505-
#' @importFrom rlang is_bare_numeric
506-
#' @importFrom vctrs vec_equal
507-
#' @keywords internal
508-
is_locf <- function(vec, abs_tol, is_key) { # nolint: object_usage_linter
509-
lag_vec <- lag(vec)
510-
if (is.vector(vec, mode = "numeric") && !is_key) {
511-
# (integer or double vector, no class (& no dims); maybe names, which we'll
512-
# ignore like `vec_equal`); not a key column
513-
res <- unname(if_else(
514-
!is.na(vec) & !is.na(lag_vec),
515-
abs(vec - lag_vec) <= abs_tol,
516-
is.na(vec) & is.na(lag_vec)
517-
))
518-
return(res)
476+
n_updates <- nrow(arranged_updates_df)
477+
if (n_updates == 0L) {
478+
logical(0L)
479+
} else if (n_updates == 1L) {
480+
FALSE # sole observation is not LOCF
519481
} else {
520-
res <- vec_equal(vec, lag_vec, na_equal = TRUE)
521-
return(res)
482+
ekts_tbl <- new_tibble(updates_col_refs[ekt_names])
483+
vals_tbl <- new_tibble(updates_col_refs[val_names])
484+
# n_updates >= 2L so we can use `:` naturally (this is the reason for
485+
# separating out n_updates == 1L from this case):
486+
inds1 <- 2L:n_updates
487+
inds2 <- 1L:(n_updates - 1L)
488+
c(
489+
FALSE, # first observation is not LOCF
490+
approx_equal0(ekts_tbl,
491+
inds1 = inds1, ekts_tbl, inds2 = inds2,
492+
# check ekt cols without tolerance:
493+
abs_tol = 0, na_equal = TRUE
494+
) &
495+
approx_equal0(vals_tbl,
496+
inds1 = inds1, vals_tbl, inds2 = inds2,
497+
abs_tol = abs_tol, na_equal = TRUE
498+
)
499+
)
522500
}
523501
}
524502

R/patch.R

Lines changed: 0 additions & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -1,4 +1,3 @@
1-
# TODO use these in apply_compactify
21
approx_equal <- function(vec1, vec2, abs_tol, na_equal, .ptype = NULL, inds1 = NULL, inds2 = NULL) {
32
# Recycle inds if provided; vecs if not:
43
common_size <- vec_size_common(

man/is_locf.Rd

Lines changed: 0 additions & 32 deletions
This file was deleted.

tests/testthat/test-archive.R

Lines changed: 2 additions & 2 deletions
Original file line numberDiff line numberDiff line change
@@ -217,8 +217,8 @@ test_that("`epi_archive` rejects dataframes where time_value and version columns
217217
expect_error(as_epi_archive(tbl3), class = "epiprocess__time_value_version_mismatch")
218218
})
219219

220-
test_that("is_locf works as expected", {
220+
test_that("is_locf replacement works as expected", {
221221
vec <- c(1, 1, 1e-10, 1.1e-10, NA, NA, NaN, NaN)
222222
is_repeated <- c(0, 1, 0, 1, 0, 1, 1, 1)
223-
expect_equal(is_locf(vec, .Machine$double.eps^0.5, FALSE), as.logical(is_repeated))
223+
expect_equal(c(FALSE, approx_equal(head(vec, -1L), tail(vec, -1L), .Machine$double.eps^0.5, na_equal = TRUE)), as.logical(is_repeated))
224224
})

0 commit comments

Comments
 (0)