perf: speed up compactification with approx_equal

brookslogan · brookslogan · commit ea8c2aa5ca6e · 2025-03-05T10:24:05.000-08:00
diff --git a/NAMESPACE b/NAMESPACE
@@ -175,7 +175,6 @@ importFrom(dplyr,if_all)
 importFrom(dplyr,if_any)
 importFrom(dplyr,if_else)
 importFrom(dplyr,is_grouped_df)
-importFrom(dplyr,lag)
 importFrom(dplyr,mutate)
 importFrom(dplyr,pick)
 importFrom(dplyr,pull)
@@ -215,7 +214,6 @@ importFrom(rlang,expr_label)
 importFrom(rlang,f_env)
 importFrom(rlang,f_rhs)
 importFrom(rlang,is_bare_integerish)
-importFrom(rlang,is_bare_numeric)
 importFrom(rlang,is_environment)
 importFrom(rlang,is_formula)
 importFrom(rlang,is_function)
diff --git a/R/archive.R b/R/archive.R
@@ -473,52 +473,30 @@ update_is_locf <- function(arranged_updates_df, ukey_names, abs_tol) {
   ekt_names <- ukey_names[ukey_names != "version"]
   val_names <- all_names[!all_names %in% ukey_names]
 
-  Reduce(`&`, lapply(updates_col_refs[ekt_names], is_locf, abs_tol, TRUE)) &
-    Reduce(`&`, lapply(updates_col_refs[val_names], is_locf, abs_tol, FALSE))
-}
-
-#' Checks to see if a value in a vector is LOCF
-#' @description LOCF meaning last observation carried forward (to later
-#'   versions). Lags the vector by 1, then compares with itself. If `is_key` is
-#'   `TRUE`, only values that are exactly the same between the lagged and
-#'   original are considered LOCF. If `is_key` is `FALSE` and `vec` is a vector
-#'   of numbers ([`base::is.numeric`]), then approximate equality will be used,
-#'   checking whether the absolute difference between each pair of entries is
-#'   `<= abs_tol`; if `vec` is something else, then exact equality is used
-#'   instead.
-#'
-#' @details
-#'
-#' We include epikey-time columns in LOCF comparisons as part of an optimization
-#' to avoid slower grouped operations while still ensuring that the first
-#' observation for each time series will not be marked as LOCF. We test these
-#' key columns for exact equality to prevent chopping off consecutive
-#' time_values during flat periods when `abs_tol` is high.
-#'
-#' We use exact equality for non-`is.numeric` double/integer columns such as
-#' dates, datetimes, difftimes, `tsibble::yearmonth`s, etc., as these may be
-#' used as part of re-indexing or grouping procedures, and we don't want to
-#' change the number of groups for those operations when we remove LOCF data
-#' during compactification.
-#'
-#' @importFrom dplyr lag if_else
-#' @importFrom rlang is_bare_numeric
-#' @importFrom vctrs vec_equal
-#' @keywords internal
-is_locf <- function(vec, abs_tol, is_key) { # nolint: object_usage_linter
-  lag_vec <- lag(vec)
-  if (is.vector(vec, mode = "numeric") && !is_key) {
-    # (integer or double vector, no class (& no dims); maybe names, which we'll
-    # ignore like `vec_equal`); not a key column
-    res <- unname(if_else(
-      !is.na(vec) & !is.na(lag_vec),
-      abs(vec - lag_vec) <= abs_tol,
-      is.na(vec) & is.na(lag_vec)
-    ))
-    return(res)
+  n_updates <- nrow(arranged_updates_df)
+  if (n_updates == 0L) {
+    logical(0L)
+  } else if (n_updates == 1L) {
+    FALSE # sole observation is not LOCF
   } else {
-    res <- vec_equal(vec, lag_vec, na_equal = TRUE)
-    return(res)
+    ekts_tbl <- new_tibble(updates_col_refs[ekt_names])
+    vals_tbl <- new_tibble(updates_col_refs[val_names])
+    # n_updates >= 2L so we can use `:` naturally (this is the reason for
+    # separating out n_updates == 1L from this case):
+    inds1 <- 2L:n_updates
+    inds2 <- 1L:(n_updates - 1L)
+    c(
+      FALSE, # first observation is not LOCF
+      approx_equal0(ekts_tbl,
+        inds1 = inds1, ekts_tbl, inds2 = inds2,
+        # check ekt cols without tolerance:
+        abs_tol = 0, na_equal = TRUE
+      ) &
+        approx_equal0(vals_tbl,
+          inds1 = inds1, vals_tbl, inds2 = inds2,
+          abs_tol = abs_tol, na_equal = TRUE
+        )
+    )
   }
 }
 
diff --git a/R/patch.R b/R/patch.R
@@ -1,4 +1,3 @@
-# TODO use these in apply_compactify
 approx_equal <- function(vec1, vec2, abs_tol, na_equal, .ptype = NULL, inds1 = NULL, inds2 = NULL) {
   # Recycle inds if provided; vecs if not:
   common_size <- vec_size_common(
diff --git a/man/is_locf.Rd b/man/is_locf.Rd
diff --git a/tests/testthat/test-archive.R b/tests/testthat/test-archive.R
@@ -217,8 +217,8 @@ test_that("`epi_archive` rejects dataframes where time_value and version columns
   expect_error(as_epi_archive(tbl3), class = "epiprocess__time_value_version_mismatch")
 })
 
-test_that("is_locf works as expected", {
+test_that("is_locf replacement works as expected", {
   vec <- c(1, 1, 1e-10, 1.1e-10, NA, NA, NaN, NaN)
   is_repeated <- c(0, 1, 0, 1, 0, 1, 1, 1)
-  expect_equal(is_locf(vec, .Machine$double.eps^0.5, FALSE), as.logical(is_repeated))
+  expect_equal(c(FALSE, approx_equal(head(vec, -1L), tail(vec, -1L), .Machine$double.eps^0.5, na_equal = TRUE)), as.logical(is_repeated))
 })

| Original file line number | Diff line number | Diff line change |
| --- | --- | --- |
|  |  | `@@ -1,4 +1,3 @@` |
| `1` |  | `-# TODO use these in apply_compactify` |
| `2` | `1` | `approx_equal <- function(vec1, vec2, abs_tol, na_equal, .ptype = NULL, inds1 = NULL, inds2 = NULL) {` |
| `3` | `2` | `# Recycle inds if provided; vecs if not:` |
| `4` | `3` | `common_size <- vec_size_common(` |