From 2f3bdb96e77dd42507b6d4e60f1cbac2caa74f41 Mon Sep 17 00:00:00 2001 From: Kate Lawrence Date: Tue, 3 Mar 2026 12:13:23 -0800 Subject: [PATCH] Fix: share LD partition across GWAS, readr namespace, susie robustness - Pre-compute partition_LD_matrix once per LD block (shared across all GWAS) instead of recomputing for each study; avoids redundant processing - Use readr::col_* in load_genotype_region to avoid namespace conflicts - susie_rss_wrapper: handle NA in LD matrix, force symmetry, remap PIP/cs when variants are dropped Made-with: Cursor --- NAMESPACE | 3 ++ R/colocboost_pipeline.R | 11 +++++-- R/file_utils.R | 2 +- R/susie_wrapper.R | 66 ++++++++++++++++++++++++++++------------- 4 files changed, 58 insertions(+), 24 deletions(-) diff --git a/NAMESPACE b/NAMESPACE index 517a2be8..ac94ba27 100644 --- a/NAMESPACE +++ b/NAMESPACE @@ -176,6 +176,9 @@ importFrom(purrr,map2) importFrom(purrr,map_int) importFrom(purrr,map_lgl) importFrom(purrr,pmap) +importFrom(readr,col_character) +importFrom(readr,col_guess) +importFrom(readr,col_integer) importFrom(readr,cols) importFrom(readr,parse_number) importFrom(readr,read_delim) diff --git a/R/colocboost_pipeline.R b/R/colocboost_pipeline.R index e934459f..24d90636 100644 --- a/R/colocboost_pipeline.R +++ b/R/colocboost_pipeline.R @@ -611,6 +611,12 @@ qc_regional_data <- function(region_data, for (i in 1:n_LD) { LD_data <- sumstat_data$LD_info[[i]] sumstats <- sumstat_data$sumstats[[i]] + + # Pre-compute LD partition once per block (shared across all GWAS studies) + if (impute) { + LD_matrix_partitioned <- partition_LD_matrix(LD_data) + } + for (ii in 1:length(sumstats)) { sumstat <- sumstats[[ii]] if (nrow(sumstat$sumstats) == 0) next @@ -652,10 +658,9 @@ qc_regional_data <- function(region_data, sumstat$sumstats <- qc_results$sumstats LD_mat <- qc_results$LD_mat } - # Perform imputation + # Perform imputation (LD_matrix_partitioned pre-computed above per LD block) if (impute) { - LD_matrix <- partition_LD_matrix(LD_data) - impute_results <- raiss(LD_data$ref_panel, sumstat$sumstats, LD_matrix, + impute_results <- raiss(LD_data$ref_panel, sumstat$sumstats, LD_matrix_partitioned, rcond = impute_opts$rcond, R2_threshold = impute_opts$R2_threshold, minimum_ld = impute_opts$minimum_ld, lamb = impute_opts$lamb ) diff --git a/R/file_utils.R b/R/file_utils.R index 86269df8..df8ba365 100644 --- a/R/file_utils.R +++ b/R/file_utils.R @@ -162,7 +162,7 @@ load_genotype_region <- function(genotype, region = NULL, keep_indel = TRUE, kee start <- parsed_region$start end <- parsed_region$end # 6 columns for bim file - col_types <- list(col_character(), col_character(), col_guess(), col_integer(), col_guess(), col_guess()) + col_types <- list(readr::col_character(), readr::col_character(), readr::col_guess(), readr::col_integer(), readr::col_guess(), readr::col_guess()) # Read a few lines of the bim file to check for 'chr' prefix bim_sample <- vroom(paste0(genotype, ".bim"), n_max = 5, col_names = FALSE, col_types = col_types) chr_prefix_present <- any(grepl("^chr", bim_sample$X1)) diff --git a/R/susie_wrapper.R b/R/susie_wrapper.R index 0479c4cd..ff6d225c 100644 --- a/R/susie_wrapper.R +++ b/R/susie_wrapper.R @@ -152,38 +152,64 @@ susie_wrapper <- function(X, y, init_L = 5, max_L = 30, l_step = 5, ...) { #' @export susie_rss_wrapper <- function(z, R, n = NULL, var_y = NULL, L = 10, max_L = 30, l_step = 5, coverage = 0.95, ...) { + original_n <- length(z) + keep_indices <- seq_len(original_n) + + if (any(is.na(R))) { + na_variants <- rowSums(is.na(R)) > 0 | colSums(is.na(R)) > 0 + if (all(na_variants)) { + stop("All variants have NAs in LD matrix. Cannot proceed with analysis.") + } + keep_variants <- !na_variants + keep_indices <- which(keep_variants) + R <- R[keep_variants, keep_variants, drop = FALSE] + z <- z[keep_variants] + warning(paste("Removed", sum(na_variants), "variants with NAs in LD matrix. Remaining:", length(z))) + } + + if (!isSymmetric(R)) { + warning("R matrix is not symmetric; forcing symmetry with (R + t(R))/2") + R <- (R + t(R)) / 2 + } + if (L == 1) { - return(susie_rss( + result <- susie_rss( z = z, R = R, var_y = var_y, n = n, L = 1, max_iter = 1, coverage = coverage, ... - )) - } - if (L == max_L) { - return(susie_rss( - z = z, R = R, var_y = var_y, n = n, L = L, - coverage = coverage, ... - )) - } - while (TRUE) { - st <- proc.time() - susie_rss_result <- susie_rss( + ) + } else if (L == max_L) { + result <- susie_rss( z = z, R = R, var_y = var_y, n = n, L = L, coverage = coverage, ... ) - susie_rss_result$time_elapsed <- proc.time() - st - # Check for convergence and adjust L if necessary - if (!is.null(susie_rss_result$sets$cs)) { - if (length(susie_rss_result$sets$cs) >= L && L <= max_L) { - L <- L + l_step # Increase L for the next iteration + } else { + while (TRUE) { + st <- proc.time() + result <- susie_rss( + z = z, R = R, var_y = var_y, n = n, L = L, + coverage = coverage, ... + ) + result$time_elapsed <- proc.time() - st + if (!is.null(result$sets$cs) && length(result$sets$cs) >= L && L <= max_L) { + L <- L + l_step } else { break } - } else { - break # Break the loop if no credible sets are found } } - return(susie_rss_result) + # Expand PIP back to original length if variants were dropped, + # and remap credible set indices to original positions. + if (length(keep_indices) < original_n) { + full_pip <- rep(0, original_n) + full_pip[keep_indices] <- result$pip + result$pip <- full_pip + if (!is.null(result$sets$cs)) { + result$sets$cs <- lapply(result$sets$cs, function(cs) keep_indices[cs]) + } + } + + return(result) } #' Run the SuSiE RSS pipeline