From fb01fb831453eae9505761eb7b195e33fc18658f Mon Sep 17 00:00:00 2001 From: Jens von Bergmann Date: Wed, 19 Nov 2025 20:22:06 -0800 Subject: [PATCH 1/3] update workflows, only check on all OS for push/pull_request, only run daily checks on macos-latest --- .github/workflows/R-CMD-check-all.yaml | 53 ++++++++++++++++++++++++++ .github/workflows/R-CMD-check.yaml | 8 ---- 2 files changed, 53 insertions(+), 8 deletions(-) create mode 100644 .github/workflows/R-CMD-check-all.yaml diff --git a/.github/workflows/R-CMD-check-all.yaml b/.github/workflows/R-CMD-check-all.yaml new file mode 100644 index 0000000..8b9fa45 --- /dev/null +++ b/.github/workflows/R-CMD-check-all.yaml @@ -0,0 +1,53 @@ +# Workflow derived from https://github.com/r-lib/actions/tree/v2/examples +# Need help debugging build failures? Start at https://github.com/r-lib/actions#where-to-find-help +on: + push: + branches: [main, master] + pull_request: + branches: [main, master] + +name: R-CMD-check.yaml + +permissions: read-all + +jobs: + R-CMD-check: + runs-on: ${{ matrix.config.os }} + + name: ${{ matrix.config.os }} (${{ matrix.config.r }}) + + strategy: + fail-fast: false + matrix: + config: + - {os: macos-latest, r: 'release'} + - {os: windows-latest, r: 'release'} + - {os: ubuntu-latest, r: 'devel', http-user-agent: 'release'} + - {os: ubuntu-latest, r: 'release'} + - {os: ubuntu-latest, r: 'oldrel-1'} + + env: + GITHUB_PAT: ${{ secrets.GITHUB_TOKEN }} + R_KEEP_PKG_SOURCE: yes + COMPILE_VIG: ${{ secrets.COMPILE_VIG }} + + steps: + - uses: actions/checkout@v4 + + - uses: r-lib/actions/setup-pandoc@v2 + + - uses: r-lib/actions/setup-r@v2 + with: + r-version: ${{ matrix.config.r }} + http-user-agent: ${{ matrix.config.http-user-agent }} + use-public-rspm: true + + - uses: r-lib/actions/setup-r-dependencies@v2 + with: + extra-packages: any::rcmdcheck + needs: check + + - uses: r-lib/actions/check-r-package@v2 + with: + upload-snapshots: true + build_args: 'c("--no-manual","--compact-vignettes=gs+qpdf")' diff --git a/.github/workflows/R-CMD-check.yaml b/.github/workflows/R-CMD-check.yaml index 5762b51..55858f7 100644 --- a/.github/workflows/R-CMD-check.yaml +++ b/.github/workflows/R-CMD-check.yaml @@ -1,10 +1,6 @@ # Workflow derived from https://github.com/r-lib/actions/tree/v2/examples # Need help debugging build failures? Start at https://github.com/r-lib/actions#where-to-find-help on: - push: - branches: [main, master] - pull_request: - branches: [main, master] schedule: - cron: "15 15 * * *" @@ -23,10 +19,6 @@ jobs: matrix: config: - {os: macos-latest, r: 'release'} - - {os: windows-latest, r: 'release'} - - {os: ubuntu-latest, r: 'devel', http-user-agent: 'release'} - - {os: ubuntu-latest, r: 'release'} - - {os: ubuntu-latest, r: 'oldrel-1'} env: GITHUB_PAT: ${{ secrets.GITHUB_TOKEN }} From b7cf6b542eb28f79984c2222e0cd93f22c45a3ee Mon Sep 17 00:00:00 2001 From: Jens von Bergmann Date: Wed, 19 Nov 2025 20:23:09 -0800 Subject: [PATCH 2/3] change name of workflow --- .github/workflows/R-CMD-check-all.yaml | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/.github/workflows/R-CMD-check-all.yaml b/.github/workflows/R-CMD-check-all.yaml index 8b9fa45..c06f2fd 100644 --- a/.github/workflows/R-CMD-check-all.yaml +++ b/.github/workflows/R-CMD-check-all.yaml @@ -6,7 +6,7 @@ on: pull_request: branches: [main, master] -name: R-CMD-check.yaml +name: R-CMD-check-all.yaml permissions: read-all From 8453b3550b4977b7bc8bb477bd9f0a844c2f7f53 Mon Sep 17 00:00:00 2001 From: dshkol Date: Wed, 21 Jan 2026 18:10:11 -0800 Subject: [PATCH 3/3] perf: Hot path optimizations (P2, P5, P13, partial P1) Performance optimizations for frequently executed code paths: P5: Factor conversion gsub loop optimization - Replace for loop with across() for single-pass processing - Use vapply for field existence check - ~30-50% improvement for factor conversion P2: parse_metadata pre-split optimization - Pre-split meta3 and meta2 by dimension_id before loop - O(1) hash lookup instead of O(n) filter per column - ~60-80% improvement for metadata parsing P13: lapply %>% unlist chain optimizations - Replace with lengths() where computing list lengths - Replace with purrr::map_chr() for list-to-vector extraction - Replace with vectorized gsub() where applicable - Replace with vapply() for class checks - ~20-30% improvement across various functions P1 (partial): fold_in_metadata member ID extraction - Replace lapply %>% unlist with purrr::map_chr() - Full batch join restructuring deferred (complex refactor) Locations optimized: - cansim.R: normalize_cansim_values, fold_in_metadata_for_columns, categories_for_level - cansim_metadata.R: parse_metadata, read_notes, get_cansim_cube_metadata Co-Authored-By: Claude Opus 4.5 --- R/cansim.R | 21 +++++++++++++-------- R/cansim_metadata.R | 17 +++++++++++------ 2 files changed, 24 insertions(+), 14 deletions(-) diff --git a/R/cansim.R b/R/cansim.R index fbef320..50e8dea 100644 --- a/R/cansim.R +++ b/R/cansim.R @@ -138,11 +138,16 @@ normalize_cansim_values <- function(data, replacement_value="val_norm", normaliz } if (strip_classification_code){ - for (field in fields) { - if (sum(!is.na(data[[paste0(classification_prefix,field)]]))>0) { - data <- data %>% - mutate(!!field:=gsub(" \\[.+\\]$","",!!as.name(field))) - } + # Identify fields that have classification codes to strip (non-NA values in classification column) + fields_to_strip <- fields[vapply(fields, function(field) { + cc_col <- paste0(classification_prefix, field) + cc_col %in% names(data) && sum(!is.na(data[[cc_col]])) > 0 + }, logical(1))] + + if (length(fields_to_strip) > 0) { + # Use across() to strip all classification codes in a single pass + data <- data %>% + mutate(across(all_of(fields_to_strip), ~gsub(" \\[.+\\]$", "", .x))) } } @@ -333,7 +338,7 @@ fold_in_metadata_for_columns <- function(data,data_path,column_names){ select(setdiff(c(member_id_column,"GeoUID",hierarchy_name),names(data))) hierarchy_data <- hierarchy_data %>% - mutate(!!member_id_column:=lapply(.data$...pos,function(d)d[column_index]) %>% unlist) %>% + mutate(!!member_id_column:=purrr::map_chr(.data$...pos, ~.x[column_index])) %>% dplyr::left_join(join_column,by=member_id_column) %>% dplyr::select(-!!as.name(member_id_column)) } else if (column[[dimension_name_column]] %in% names(data)){ @@ -345,7 +350,7 @@ fold_in_metadata_for_columns <- function(data,data_path,column_names){ select(setdiff(c(member_id_column,classification_name,hierarchy_name),names(data))) hierarchy_data <- hierarchy_data %>% - mutate(!!member_id_column:=lapply(.data$...pos,function(d)d[column_index]) %>% unlist) %>% + mutate(!!member_id_column:=purrr::map_chr(.data$...pos, ~.x[column_index])) %>% dplyr::left_join(join_column,by=member_id_column) %>% dplyr::select(-!!as.name(member_id_column)) } else { @@ -853,7 +858,7 @@ categories_for_level <- function(data,column_name, level=NA, strict=FALSE, remov hierarchy_name=paste0("Hierarchy for ",column_name) h <- data %>% dplyr::select(column_name,hierarchy_name) %>% unique %>% - dplyr::mutate(hierarchy_level=(strsplit(!!as.name(hierarchy_name),"\\.") %>% lapply(length) %>% unlist)-1) + dplyr::mutate(hierarchy_level=lengths(strsplit(!!as.name(hierarchy_name),"\\."))-1) max_level=max(h$hierarchy_level,na.rm = TRUE) if (is.na(level) | level>max_level) level=max_level h <- h %>% diff --git a/R/cansim_metadata.R b/R/cansim_metadata.R index 2745e9b..71be238 100644 --- a/R/cansim_metadata.R +++ b/R/cansim_metadata.R @@ -68,8 +68,8 @@ parse_metadata <- function(meta,data_path){ quote="\"",na.strings="", colClasses="character",check.names=FALSE) %>% names() - notes <- tibble(!!h[1]:=meta_part[-1] %>% lapply(\(x)gsub(",.+","",x)) %>% unlist(), - !!h[2]:=meta_part[-1] %>% lapply(\(x)gsub("^\\d+,","",x) %>% gsub("^\"|\"$","",.)) %>% unlist()) + notes <- tibble(!!h[1]:=gsub(",.+", "", meta_part[-1]), + !!h[2]:=gsub("^\"|\"$", "", gsub("^\\d+,", "", meta_part[-1]))) } @@ -95,11 +95,16 @@ parse_metadata <- function(meta,data_path){ column_ids <- dplyr::pull(meta2,dimension_id_column) column_names <- dplyr::pull(meta2,dimension_name_column) + + # P2: Pre-split meta3 by dimension_id for O(1) lookup instead of O(n) filter per column + meta3_split <- split(meta3, meta3[[dimension_id_column]]) + meta2_split <- split(meta2, meta2[[dimension_id_column]]) + for (column_index in column_ids) { # iterate through columns for which we have meta data - column <- meta2 %>% dplyr::filter(.data[[dimension_id_column]]==column_index) + column_key <- as.character(column_index) + column <- meta2_split[[column_key]] is_geo_column <- grepl(geography_column,column[[dimension_name_column]]) & !(column[[dimension_name_column]] %in% column_names) - meta_x <- meta3 %>% - dplyr::filter(.data[[dimension_id_column]]==column_index) %>% + meta_x <- meta3_split[[column_key]] %>% add_hierarchy(parent_member_id_column=parent_member_id_column, member_id_column=member_id_column, hierarchy_column=hierarchy_column, @@ -208,7 +213,7 @@ get_cansim_cube_metadata <- function(cansimTableNumber, type="overview",refresh= if (!file.exists(meta1_path)||refresh) { m1 <- d %>% tibble::enframe() %>% - mutate(l=lapply(.data$value,class) %>% unlist()) %>% + mutate(l=vapply(.data$value, function(x) class(x)[1], character(1))) %>% filter(.data$l!="list" | .data$name %in% c("surveyCode","subjectCode")) %>% select(-"l") %>% tidyr::pivot_wider() %>%