StatFunGen · gaow · Mar 4, 2026 · Mar 3, 2026
diff --git a/NAMESPACE b/NAMESPACE
@@ -176,6 +176,9 @@ importFrom(purrr,map2)
 importFrom(purrr,map_int)
 importFrom(purrr,map_lgl)
 importFrom(purrr,pmap)
+importFrom(readr,col_character)
+importFrom(readr,col_guess)
+importFrom(readr,col_integer)
 importFrom(readr,cols)
 importFrom(readr,parse_number)
 importFrom(readr,read_delim)

diff --git a/R/colocboost_pipeline.R b/R/colocboost_pipeline.R
@@ -611,6 +611,12 @@ qc_regional_data <- function(region_data,
     for (i in 1:n_LD) {
       LD_data <- sumstat_data$LD_info[[i]]
       sumstats <- sumstat_data$sumstats[[i]]
+
+      # Pre-compute LD partition once per block (shared across all GWAS studies)
+      if (impute) {
+        LD_matrix_partitioned <- partition_LD_matrix(LD_data)
+      }
+
       for (ii in 1:length(sumstats)) {
         sumstat <- sumstats[[ii]]
         if (nrow(sumstat$sumstats) == 0) next
@@ -652,10 +658,9 @@ qc_regional_data <- function(region_data,
           sumstat$sumstats <- qc_results$sumstats
           LD_mat <- qc_results$LD_mat
         }
-        # Perform imputation
+        # Perform imputation (LD_matrix_partitioned pre-computed above per LD block)
         if (impute) {
-          LD_matrix <- partition_LD_matrix(LD_data)
-          impute_results <- raiss(LD_data$ref_panel, sumstat$sumstats, LD_matrix,
+          impute_results <- raiss(LD_data$ref_panel, sumstat$sumstats, LD_matrix_partitioned,
             rcond = impute_opts$rcond,
             R2_threshold = impute_opts$R2_threshold, minimum_ld = impute_opts$minimum_ld, lamb = impute_opts$lamb
           )

diff --git a/R/file_utils.R b/R/file_utils.R
@@ -162,7 +162,7 @@ load_genotype_region <- function(genotype, region = NULL, keep_indel = TRUE, kee
     start <- parsed_region$start
     end <- parsed_region$end
     # 6 columns for bim file
-    col_types <- list(col_character(), col_character(), col_guess(), col_integer(), col_guess(), col_guess())
+    col_types <- list(readr::col_character(), readr::col_character(), readr::col_guess(), readr::col_integer(), readr::col_guess(), readr::col_guess())
     # Read a few lines of the bim file to check for 'chr' prefix
     bim_sample <- vroom(paste0(genotype, ".bim"), n_max = 5, col_names = FALSE, col_types = col_types)
     chr_prefix_present <- any(grepl("^chr", bim_sample$X1))

diff --git a/R/susie_wrapper.R b/R/susie_wrapper.R
@@ -152,38 +152,64 @@ susie_wrapper <- function(X, y, init_L = 5, max_L = 30, l_step = 5, ...) {
 #' @export
 susie_rss_wrapper <- function(z, R, n = NULL, var_y = NULL, L = 10, max_L = 30, l_step = 5,
                               coverage = 0.95, ...) {
+  original_n <- length(z)
+  keep_indices <- seq_len(original_n)
+
+  if (any(is.na(R))) {
+    na_variants <- rowSums(is.na(R)) > 0 | colSums(is.na(R)) > 0
+    if (all(na_variants)) {
+      stop("All variants have NAs in LD matrix. Cannot proceed with analysis.")
+    }
+    keep_variants <- !na_variants
+    keep_indices <- which(keep_variants)
+    R <- R[keep_variants, keep_variants, drop = FALSE]
+    z <- z[keep_variants]
+    warning(paste("Removed", sum(na_variants), "variants with NAs in LD matrix. Remaining:", length(z)))
+  }
+
+  if (!isSymmetric(R)) {
+    warning("R matrix is not symmetric; forcing symmetry with (R + t(R))/2")
+    R <- (R + t(R)) / 2
+  }
+
   if (L == 1) {
-    return(susie_rss(
+    result <- susie_rss(
       z = z, R = R, var_y = var_y, n = n,
       L = 1, max_iter = 1, coverage = coverage, ...
-    ))
-  }
-  if (L == max_L) {
-    return(susie_rss(
-      z = z, R = R, var_y = var_y, n = n, L = L,
-      coverage = coverage, ...
-    ))
-  }
-  while (TRUE) {
-    st <- proc.time()
-    susie_rss_result <- susie_rss(
+    )
+  } else if (L == max_L) {
+    result <- susie_rss(
       z = z, R = R, var_y = var_y, n = n, L = L,
       coverage = coverage, ...
     )
-    susie_rss_result$time_elapsed <- proc.time() - st
-    # Check for convergence and adjust L if necessary
-    if (!is.null(susie_rss_result$sets$cs)) {
-      if (length(susie_rss_result$sets$cs) >= L && L <= max_L) {
-        L <- L + l_step # Increase L for the next iteration
+  } else {
+    while (TRUE) {
+      st <- proc.time()
+      result <- susie_rss(
+        z = z, R = R, var_y = var_y, n = n, L = L,
+        coverage = coverage, ...
+      )
+      result$time_elapsed <- proc.time() - st
+      if (!is.null(result$sets$cs) && length(result$sets$cs) >= L && L <= max_L) {
+        L <- L + l_step
       } else {
         break
       }
-    } else {
-      break # Break the loop if no credible sets are found
     }
   }
 
-  return(susie_rss_result)
+  # Expand PIP back to original length if variants were dropped,
+  # and remap credible set indices to original positions.
+  if (length(keep_indices) < original_n) {
+    full_pip <- rep(0, original_n)
+    full_pip[keep_indices] <- result$pip
+    result$pip <- full_pip
+    if (!is.null(result$sets$cs)) {
+      result$sets$cs <- lapply(result$sets$cs, function(cs) keep_indices[cs])
+    }
+  }
+
+  return(result)
 }
 
 #' Run the SuSiE RSS pipeline