StatFunGen · gaow · Apr 24, 2026 · Apr 22, 2026 · Apr 22, 2026
diff --git a/NAMESPACE b/NAMESPACE
@@ -20,6 +20,7 @@ export(coloc_wrapper)
 export(colocboost_analysis_pipeline)
 export(compute_LD)
 export(compute_qtl_enrichment)
+export(ctwas_bimfile_loader)
 export(dentist)
 export(dentist_single_window)
 export(dpr_weights)
@@ -35,6 +36,7 @@ export(find_data)
 export(find_duplicate_variants)
 export(fsusie_get_cs)
 export(fsusie_wrapper)
+export(get_ctwas_meta_data)
 export(get_filter_lbf_index)
 export(get_nested_element)
 export(get_ref_variant_info)

diff --git a/R/colocboost_pipeline.R b/R/colocboost_pipeline.R
@@ -457,13 +457,22 @@ qc_regional_data <- function(region_data,
                              impute_opts = list(rcond = 0.01, R2_threshold = 0.6, minimum_ld = 5, lamb = 0.01)) {
   qc_method <- match.arg(qc_method)
 
-  # Validate and recycle pip_cutoff_to_skip_ind: scalar -> recycled to n_contexts
+  # Validate and recycle pip_cutoff_to_skip_ind: scalar -> named vector for all contexts
   if (!is.null(region_data$individual_data)) {
-    n_ind_contexts <- length(region_data$individual_data$residual_Y)
-    if (length(pip_cutoff_to_skip_ind) == 1) {
-      pip_cutoff_to_skip_ind <- rep(pip_cutoff_to_skip_ind, n_ind_contexts)
-    } else if (length(pip_cutoff_to_skip_ind) != n_ind_contexts) {
-      stop("pip_cutoff_to_skip_ind must be a scalar or a vector with length equal to the number of individual contexts (", n_ind_contexts, ").")
+    ind_context_names <- names(region_data$individual_data$residual_Y)
+    n_ind_contexts <- length(ind_context_names)
+    if (length(pip_cutoff_to_skip_ind) == 1 && is.null(names(pip_cutoff_to_skip_ind))) {
+      pip_cutoff_to_skip_ind <- setNames(rep(pip_cutoff_to_skip_ind, n_ind_contexts), ind_context_names)
+    } else if (!is.null(names(pip_cutoff_to_skip_ind))) {
+      # Named vector: fill missing contexts with 0
+      missing <- setdiff(ind_context_names, names(pip_cutoff_to_skip_ind))
+      if (length(missing) > 0) {
+        pip_cutoff_to_skip_ind <- c(pip_cutoff_to_skip_ind, setNames(rep(0, length(missing)), missing))
+      }
+    } else if (length(pip_cutoff_to_skip_ind) == n_ind_contexts) {
+      names(pip_cutoff_to_skip_ind) <- ind_context_names
+    } else {
+      stop("pip_cutoff_to_skip_ind must be a scalar, a named vector, or a vector with length equal to the number of individual contexts (", n_ind_contexts, ").")
     }
   }
 
@@ -543,10 +552,14 @@ qc_regional_data <- function(region_data,
     results <- purrr::imap(X, function(resX, context) {
       resY <- Y[[context]]
       maf <- MAF[[context]]
-      i_context <- match(context, names(X))
+      pip_cutoff <- if (context %in% names(pip_cutoff_to_skip_ind)) {
+        pip_cutoff_to_skip_ind[[context]]
+      } else {
+        0
+      }
       if (is.null(resY)) return(NULL)
       resX <- filter_X(resX, missing_rate_thresh = NULL, maf_thresh = maf_cutoff, maf = maf)
-      resY <- filter_resY_pip(resX, resY, pip_cutoff = pip_cutoff_to_skip_ind[i_context], context = context)
+      resY <- filter_resY_pip(resX, resY, pip_cutoff = pip_cutoff, context = context)
       if (is.null(resY)) return(NULL)
       list(X = resX, Y = resY)
     }) %>% purrr::compact()

diff --git a/R/ctwas_wrapper.R b/R/ctwas_wrapper.R
@@ -1,5 +1,85 @@
-### File-I/O functions (ctwas_bimfile_loader, get_ctwas_meta_data) have been
-### removed. Use ld_loader() and read_bim() from the standard I/O path instead.
+#' Load a PLINK .bim file for cTWAS
+#'
+#' @description
+#' \strong{Deprecated.} Use [read_bim()] via the standard I/O path
+#' instead. This wrapper remains for backwards compatibility and calls
+#' [read_bim()] internally, mapping its output to the legacy column names.
+#'
+#' @param bim_file_path Path to a PLINK \code{.bim} file (or a \code{.bed}
+#'   file — the \code{.bim} extension is resolved automatically).
+#'
+#' @return A data.frame with columns \code{chrom}, \code{id}, \code{GD},
+#'   \code{pos}, \code{A1}, \code{A2}. Variant IDs are normalised via
+#'   [normalize_variant_id()].
+#'
+#' @export
+ctwas_bimfile_loader <- function(bim_file_path) {
+  .Deprecated("read_bim", package = "pecotmr",
+              msg = "ctwas_bimfile_loader() is deprecated. Use read_bim() instead.")
+  # read_bim() expects a .bed path and derives .bim from it.
+  # Accept either .bim or .bed and normalise to .bed.
+  bed_path <- sub("\\.bim$", ".bed", bim_file_path)
+  bim <- read_bim(bed_path)
+  # Map new column names back to legacy names
+  snp_info <- data.frame(
+    chrom = bim$chrom,
+    id    = normalize_variant_id(bim$id),
+    GD    = bim$gpos,
+    pos   = bim$pos,
+    A1    = bim$a1,
+    A2    = bim$a0,
+    stringsAsFactors = FALSE
+  )
+  return(snp_info)
+}
+
+#' Load cTWAS LD meta-data
+#'
+#' @description
+#' \strong{Deprecated.} Use [ld_loader()] with its \code{LD_info}
+#' argument instead. This wrapper remains for backwards compatibility and
+#' produces the same \code{list(LD_info, region_info)} output as the original.
+#'
+#' @param ld_meta_data_file Path to the LD meta-data TSV file.
+#' @param subset_region_ids Optional character vector of region IDs
+#'   (\code{"chrom_start_end"}) to subset to.
+#'
+#' @return A list with components:
+#' \describe{
+#'   \item{LD_info}{Data.frame with columns \code{region_id}, \code{LD_file},
+#'     \code{SNP_file}.}
+#'   \item{region_info}{Data.frame with columns \code{chrom}, \code{start},
+#'     \code{stop}, \code{region_id}.}
+#' }
+#'
+#' @importFrom vroom vroom
+#' @export
+get_ctwas_meta_data <- function(ld_meta_data_file, subset_region_ids = NULL) {
+  .Deprecated("ld_loader", package = "pecotmr",
+              msg = "get_ctwas_meta_data() is deprecated. Use ld_loader() with LD_info instead.")
+  LD_info <- as.data.frame(vroom(ld_meta_data_file))
+  colnames(LD_info)[1] <- "chrom"
+  LD_info$region_id <- paste(as.integer(strip_chr_prefix(LD_info$chrom)),
+                             LD_info$start, LD_info$end, sep = "_")
+  LD_info$LD_file <- paste0(dirname(ld_meta_data_file), "/",
+                            gsub(",.*$", "", LD_info$path))
+  LD_info$SNP_file <- paste0(LD_info$LD_file, ".bim")
+  LD_info <- LD_info[, c("region_id", "LD_file", "SNP_file")]
+  region_info <- LD_info[, "region_id", drop = FALSE]
+  region_info$chrom <- as.integer(gsub("\\_.*$", "", region_info$region_id))
+  region_info$start <- as.integer(gsub("\\_.*$", "",
+                                       sub("^.*?\\_", "", region_info$region_id)))
+  region_info$stop <- as.integer(sub("^.*?\\_", "",
+                                      sub("^.*?\\_", "", region_info$region_id)))
+  region_info$region_id <- paste0(region_info$chrom, "_",
+                                   region_info$start, "_",
+                                   region_info$stop)
+  region_info <- region_info[, c("chrom", "start", "stop", "region_id")]
+  if (!is.null(subset_region_ids)) {
+    region_info <- region_info[region_info$region_id %in% subset_region_ids, ]
+  }
+  return(list(LD_info = LD_info, region_info = region_info))
+}
 
 #' Function to select variants for ctwas weights input
 #' @param region_data A list of list containing weights list and snp_info list data for multiple genes/events within a single LD block region.

diff --git a/man/ctwas_bimfile_loader.Rd b/man/ctwas_bimfile_loader.Rd
diff --git a/man/get_ctwas_meta_data.Rd b/man/get_ctwas_meta_data.Rd
diff --git a/tests/testthat/test_colocboost_pipeline.R b/tests/testthat/test_colocboost_pipeline.R
@@ -1768,6 +1768,78 @@ test_that("qc_regional_data: mismatched pip_cutoff_to_skip_ind length errors", {
   )
 })
 
+test_that("qc_regional_data: named pip_cutoff_to_skip_ind works with context names", {
+  region_data <- make_individual_region_data(n = 20, p = 8, n_contexts = 2, n_events = 2)
+
+  # Named vector matching context names
+  result <- pecotmr:::qc_regional_data(
+    region_data,
+    pip_cutoff_to_skip_ind = c(ctx1 = 0, ctx2 = 0),
+    qc_method = "slalom"
+  )
+  expect_type(result, "list")
+})
+
+test_that("qc_regional_data: named pip_cutoff_to_skip_ind fills missing contexts with 0", {
+  region_data <- make_individual_region_data(n = 20, p = 8, n_contexts = 3, n_events = 2)
+
+  # Only specify cutoff for ctx1 — ctx2 and ctx3 should default to 0
+  result <- pecotmr:::qc_regional_data(
+    region_data,
+    pip_cutoff_to_skip_ind = c(ctx1 = 0),
+    qc_method = "slalom"
+  )
+  expect_type(result, "list")
+})
+
+test_that("qc_regional_data: scalar pip_cutoff_to_skip_ind becomes named vector", {
+  region_data <- make_individual_region_data(n = 20, p = 8, n_contexts = 2, n_events = 2)
+
+  # This exercises the scalar -> named vector recycling path
+  result <- pecotmr:::qc_regional_data(
+    region_data,
+    pip_cutoff_to_skip_ind = 0,
+    qc_method = "slalom"
+  )
+  expect_type(result, "list")
+})
+
+test_that("qc_regional_data: pip_cutoff_to_skip_ind lookup works when X and Y have different contexts", {
+  # Simulate a case where residual_X has a context not in residual_Y
+  set.seed(303)
+  n <- 20; p <- 8; n_events <- 2
+  make_ctx <- function() {
+    X <- matrix(rnorm(n * p), n, p)
+    colnames(X) <- paste0("chr1:", seq_len(p) * 100, ":A:G")
+    Y <- matrix(rnorm(n * n_events), n, n_events)
+    colnames(Y) <- paste0("event", seq_len(n_events))
+    maf <- runif(p, 0.05, 0.45)
+    list(X = X, Y = Y, maf = maf)
+  }
+  ctx1 <- make_ctx()
+  ctx2 <- make_ctx()
+  ctx3 <- make_ctx()
+
+  # residual_X has 3 contexts, residual_Y only has 2
+  # pip_cutoff_to_skip_ind is recycled from residual_Y (length 2)
+  region_data <- list(
+    individual_data = list(
+      residual_Y = list(ctx1 = ctx1$Y, ctx2 = ctx2$Y),
+      residual_X = list(ctx1 = ctx1$X, ctx2 = ctx2$X, ctx3 = ctx3$X),
+      maf = list(ctx1 = ctx1$maf, ctx2 = ctx2$maf, ctx3 = ctx3$maf)
+    ),
+    sumstat_data = NULL
+  )
+
+  # Should not error — ctx3 in X has no pip_cutoff entry, defaults to 0
+  result <- pecotmr:::qc_regional_data(
+    region_data,
+    pip_cutoff_to_skip_ind = 0,
+    qc_method = "slalom"
+  )
+  expect_type(result, "list")
+})
+
 # ===========================================================================
 # SECTION U: colocboost_analysis_pipeline - sumstat with NA z filtering (line 252-258)
 # ===========================================================================