Skip to content
Merged
Show file tree
Hide file tree
Changes from all commits
Commits
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
3 changes: 3 additions & 0 deletions NAMESPACE
Original file line number Diff line number Diff line change
Expand Up @@ -39,7 +39,9 @@ export(find_duplicate_variants)
export(fsusie_get_cs)
export(fsusie_wrapper)
export(gbayes_rss)
export(get_cormat)
export(get_ctwas_meta_data)
export(get_filter_lbf_index)
export(get_nested_element)
export(get_susie_result)
export(glmnet_weights)
Expand Down Expand Up @@ -170,6 +172,7 @@ importFrom(purrr,exec)
importFrom(purrr,map)
importFrom(purrr,map2)
importFrom(purrr,map_int)
importFrom(purrr,map_lgl)
importFrom(purrr,pmap)
importFrom(readr,cols)
importFrom(readr,parse_number)
Expand Down
27 changes: 14 additions & 13 deletions R/allele_qc.R
Original file line number Diff line number Diff line change
Expand Up @@ -27,7 +27,7 @@ allele_qc <- function(target_data, ref_variants, col_to_flip = NULL,
match_min_prop = 0.2, remove_dups = TRUE,
remove_indels = FALSE, remove_strand_ambiguous = TRUE,
flip_strand = FALSE, remove_unmatched = TRUE, remove_same_vars = FALSE) {
strand_flip <- function(ref) {
strand_flip <- function(ref) {
# Define a mapping for complementary bases
base_mapping <- c("A" = "T", "T" = "A", "G" = "C", "C" = "G")

Expand Down Expand Up @@ -130,25 +130,25 @@ allele_qc <- function(target_data, ref_variants, col_to_flip = NULL,
match_result[strand_flipped_indices, "A2.target"] <- strand_flip(match_result[strand_flipped_indices, "A2.target"])
}

# Remove all unnecessary columns used to determine qc status
# Finally keep those variants with FLAG keep = TRUE
result <- match_result[match_result$keep, , drop = FALSE]

# FIXME: I think this parameter is confusing. I inheritated directly from our function, whose default setting is TRUE.
# It is removing all multi-allelic alleles which is unnecessary. I suggest remove this parameter directly.
# What we are trying to avoid is the SAME allele having diferent z score. I defined one parameter remove_same_vars later, but I can re-use this
# remove_dup name
if (remove_dups) {
dups <- vec_duplicate_detect(match_result[, c("chrom", "pos", "variants_id_qced")])
dups <- vec_duplicate_detect(result[, c("chrom", "pos", "variants_id_qced")])
if (any(dups)) {
match_result <- match_result[!dups, , drop = FALSE]
message("Some duplicates were removed.")
result <- result[!dups, , drop = FALSE]
Warning("Unexpected duplicates were removed.")
}
}

# Remove all unnecessary columns used to determine qc status
# Finally keep those variants with FLAG keep = TRUE
result <- match_result[match_result$keep, , drop = FALSE]


result <- result %>%
select(-(flip1.ref:keep)) %>%
select(-variants_id_original, -A1.target, -A2.target) %>%
select(-A1.target, -A2.target) %>%
rename(A1 = A1.ref, A2 = A2.ref, variant_id = variants_id_qced)

# default FALSE, but if want to remove same variants having different z score, then set as TRUE
Expand All @@ -161,14 +161,15 @@ allele_qc <- function(target_data, ref_variants, col_to_flip = NULL,
}

if (!remove_unmatched) {
match_variant <- match_result %>% pull(variants_id_original)
match_variant <- result %>% pull(variants_id_original)
match_result <- select(match_result, -(flip1.ref:keep)) %>%
select(-variants_id_original, -A1.target, -A2.target) %>%
rename(A1 = A1.ref, A2 = A2.ref, variant_id = variants_id_qced)
target_data <- target_data %>% mutate(variant_id = paste(chrom, pos, A2, A1, sep = ":"))
if (length(setdiff(target_data %>% pull(variant_id), match_variant)) > 0) {
unmatch_data <- target_data %>% filter(!variant_id %in% match_variant)
result <- rbind(result, unmatch_data)
result <- rbind(result, unmatch_data %>% mutate(variants_id_original = variant_id))
result <- result[match(target_data$variant_id, result$variants_id_original), ] %>% select(-variants_id_original)
}
}

Expand Down Expand Up @@ -230,7 +231,7 @@ align_variant_names <- function(source, reference, remove_indels = FALSE) {
ref_variants = reference_df,
col_to_flip = NULL,
match_min_prop = 0,
remove_dups = TRUE,
remove_dups = FALSE, #otherwise case like c('14:31234238:C:A','14:31234238:C:G') -- c('14:31234238:G:C','14:31234238:G:C') would not work
flip_strand = TRUE,
remove_indels = remove_indels,
remove_strand_ambiguous = FALSE,
Expand Down
112 changes: 99 additions & 13 deletions R/encoloc.R
Original file line number Diff line number Diff line change
Expand Up @@ -112,11 +112,75 @@ calculate_cumsum <- function(coloc_results) {

#' Function to load and extract LD matrix
#' @noRd
load_and_extract_ld_matrix <- function(ld_meta_file_path, analysis_region, variants) {
# This is a placeholder for loading LD matrix, adjust as per your actual function
ld_ref <- load_LD_matrix(LD_meta_file_path = ld_meta_file_path, region = analysis_region)
ext_ld <- ld_ref$combined_LD_matrix[variants, variants]
ext_ld
# Load an LD (correlation) matrix for a set of variants, either from an LD
# reference panel or from in-sample genotypes.
# - ld_meta_file_path: LD metadata file (reference mode), a plink .bed file,
#   or a .txt per-chromosome genotype metadata file (in-sample mode).
# - analysis_region: "chr:start-end"; only the chromosome part is used, the
#   positions are re-derived from `variants`.
# - variants: character vector of variant IDs formatted "chr:pos:ref:alt".
# - ld_ref / in_sample: optional logicals selecting the mode explicitly;
#   mutually exclusive (ld_ref wins if both are TRUE). When both are NULL the
#   mode is guessed from the file path.
# Returns the LD matrix subset to `variants`.
load_and_extract_ld_matrix <- function(ld_meta_file_path, analysis_region, variants, ld_ref = NULL, in_sample = NULL) {
  # Narrow the query region to the span actually covered by the variants
  var_pos <- str_split(variants, ":", simplify = TRUE)[, 2] %>% as.numeric()
  min_var_pos <- min(var_pos)
  max_var_pos <- max(var_pos)
  chr <- str_split(analysis_region, ":", simplify = TRUE)[, 1]
  analysis_region_narrow <- paste0(chr, ":", min_var_pos, "-", max_var_pos)

  # --- Determine mode if not explicitly provided ---
  if (is.null(ld_ref) && is.null(in_sample)) {
    ld_ref <- TRUE # Default assumption
    if (grepl("plink|genotype|\\.bed$|\\.bim$|\\.fam$", ld_meta_file_path)) {
      ld_ref <- FALSE
      in_sample <- TRUE
    } else {
      in_sample <- FALSE
    }
  }

  # --- Enforce exclusivity ---
  if (isTRUE(ld_ref)) in_sample <- FALSE
  if (isTRUE(in_sample)) ld_ref <- FALSE
  # Coerce any flag still NULL (or NA) to FALSE so the `if ()` guards below
  # cannot fail with "argument is of length zero" when only one mode flag was
  # supplied (e.g. ld_ref = FALSE, in_sample = NULL); such calls now reach the
  # informative stop() at the bottom instead.
  ld_ref <- isTRUE(ld_ref)
  in_sample <- isTRUE(in_sample)

  # --- LD reference mode ---
  if (ld_ref) {
    message("Using LD reference mode")
    ld_ref_data <- load_LD_matrix(
      LD_meta_file_path = ld_meta_file_path,
      region = analysis_region_narrow
    )
    ext_ld <- ld_ref_data$combined_LD_matrix[variants, variants]
    return(ext_ld)
  }

  # --- In-sample genotype mode ---
  if (in_sample) {
    message("Using in-sample genotype mode")

    if (grepl("\\.txt$", ld_meta_file_path)) {
      # Metadata file maps chromosome id -> genotype file; the genotype path is
      # resolved relative to the metadata file's own directory.
      geno_meta <- read_tsv(ld_meta_file_path, comment = "#", col_names = c("id", "path"), show_col_types = FALSE)
      chr_num <- gsub("^chr", "", chr)
      geno_path <- geno_meta %>% filter(id == chr_num) %>% pull(path) %>% basename %>% paste0(dirname(ld_meta_file_path), "/", .)

      if (length(geno_path) != 1) stop("No matching entry found in metadata for chromosome ", chr)

      geno_prefix <- str_remove(geno_path, "\\.bed$")
    } else if (grepl("\\.bed$", ld_meta_file_path)) {
      geno_prefix <- str_remove(ld_meta_file_path, "\\.bed$")
    } else {
      stop("In in-sample mode, expected plink file or .txt genotype metadata file.")
    }

    # Load original genotype data
    ld_mat <- load_genotype_region(geno_prefix, region = analysis_region_narrow, keep_indel = TRUE, keep_variants_path = NULL)

    # Harmonize genotype column names with the requested variant IDs
    colnames(ld_mat) <- align_variant_names(colnames(ld_mat), variants)$aligned_variants
    ld_mat <- ld_mat[, variants] # subset target variants then get imputation and correlation
    if (length(variants) > 1) {
      # Mean imputation of missing genotypes before computing correlations
      ld_mat_imputed <- apply(ld_mat, 2, function(x) ifelse(is.na(x), mean(x, na.rm = TRUE), x))
      ext_ld <- get_cormat(ld_mat_imputed)
    } else {
      # A single variant is trivially perfectly correlated with itself
      ext_ld <- as.matrix(1)
    }
    return(ext_ld)
  }

  stop("Neither LD mode was activated — check inputs.")
}

#' Function to calculate purity
Expand Down Expand Up @@ -225,18 +289,30 @@ process_coloc_results <- function(coloc_result, LD_meta_file_path, analysis_regi
coloc_wrapper <- function(xqtl_file, gwas_files,
xqtl_finemapping_obj = NULL, xqtl_varname_obj = NULL, xqtl_region_obj = NULL,
gwas_finemapping_obj = NULL, gwas_varname_obj = NULL, gwas_region_obj = NULL,
filter_lbf_cs = FALSE, prior_tol = 1e-9, p1 = 1e-4, p2 = 1e-4, p12 = 5e-6, ...) {
filter_lbf_cs = FALSE, filter_lbf_cs_secondary = NULL,
prior_tol = 1e-9, p1 = 1e-4, p2 = 1e-4, p12 = 5e-6, ...) {
region <- NULL # define first to avoid element can not be found
# Load and process GWAS data
gwas_lbf_matrices <- lapply(gwas_files, function(file) {
raw_data <- readRDS(file)[[1]]
gwas_data <- if (!is.null(gwas_finemapping_obj)) get_nested_element(raw_data, gwas_finemapping_obj) else raw_data
gwas_lbf_matrix <- as.data.frame(gwas_data$lbf_variable)
if (filter_lbf_cs) {
gwas_lbf_matrix <- gwas_lbf_matrix[gwas_data$sets$cs_index, ]
# fsusie has a different structure
if(is.null(gwas_lbf_matrix) | nrow(gwas_lbf_matrix)==0){
gwas_lbf_matrix <- do.call(rbind, raw_data[[1]]$fsusie_result$lBF) %>% as.data.frame
if(nrow(gwas_lbf_matrix) > 0) message("This is a fSuSiE case")
}
if (filter_lbf_cs & is.null(filter_lbf_cs_secondary)) {
gwas_lbf_matrix <- gwas_lbf_matrix[gwas_data$sets$cs_index,, drop = F]
} else if (!is.null(filter_lbf_cs_secondary)) {
gwas_lbf_filter_index <- get_filter_lbf_index(gwas_data, coverage = filter_lbf_cs_secondary)
gwas_lbf_matrix <- gwas_lbf_matrix[gwas_lbf_filter_index,, drop = F]
} else {
gwas_lbf_matrix <- gwas_lbf_matrix[gwas_data$V > prior_tol, ]
gwas_lbf_matrix <- gwas_lbf_matrix[gwas_data$V > prior_tol,, drop = F]
}
if (!is.null(gwas_varname_obj)) colnames(gwas_lbf_matrix) <- get_nested_element(raw_data, gwas_varname_obj)
# fsusie could have NA in variant name
gwas_lbf_matrix <- gwas_lbf_matrix[,!is.na(colnames(gwas_lbf_matrix))]
return(gwas_lbf_matrix)
})

Expand Down Expand Up @@ -267,15 +343,25 @@ coloc_wrapper <- function(xqtl_file, gwas_files,
}
if (!is.null(xqtl_data)) {
xqtl_lbf_matrix <- as.data.frame(xqtl_data$lbf_variable)
# fsusie has a different structure
if(is.null(xqtl_lbf_matrix) | nrow(xqtl_lbf_matrix)==0){
xqtl_lbf_matrix <- do.call(rbind, xqtl_raw_data[[1]]$fsusie_result$lBF) %>% as.data.frame
if(nrow(xqtl_lbf_matrix) > 0) message("This is a fSuSiE case")
}
# fsusie data does not have V element in results
if (filter_lbf_cs) {
xqtl_lbf_matrix <- xqtl_lbf_matrix[xqtl_data$sets$cs_index, ]
if (filter_lbf_cs & is.null(filter_lbf_cs_secondary)) {
xqtl_lbf_matrix <- xqtl_lbf_matrix[xqtl_data$sets$cs_index, , drop = F]
} else if (!is.null(filter_lbf_cs_secondary)) {
xqtl_lbf_filter_index <- get_filter_lbf_index(xqtl_data, coverage = filter_lbf_cs_secondary)
xqtl_lbf_matrix <- xqtl_lbf_matrix[xqtl_lbf_filter_index,, drop = F]
} else {
if ("V" %in% names(xqtl_data)) xqtl_lbf_matrix <- xqtl_lbf_matrix[xqtl_data$V > prior_tol, ] else (message("No V found in orginal data."))
if ("V" %in% names(xqtl_data)) xqtl_lbf_matrix <- xqtl_lbf_matrix[xqtl_data$V > prior_tol, , drop = F] else (message("No V found in orginal data."))
}
if (nrow(combined_gwas_lbf_matrix) > 0 && nrow(xqtl_lbf_matrix) > 0) {
if (!is.null(xqtl_varname_obj)) colnames(xqtl_lbf_matrix) <- get_nested_element(xqtl_raw_data, xqtl_varname_obj)

# fsusie could have NA in variant name
xqtl_lbf_matrix <- xqtl_lbf_matrix[,!is.na(colnames(xqtl_lbf_matrix))]

colnames(xqtl_lbf_matrix) <- align_variant_names(colnames(xqtl_lbf_matrix), colnames(combined_gwas_lbf_matrix))$aligned_variants
common_colnames <- intersect(colnames(xqtl_lbf_matrix), colnames(combined_gwas_lbf_matrix))
xqtl_lbf_matrix <- xqtl_lbf_matrix[, common_colnames, drop = FALSE] %>% as.matrix()
Expand Down
40 changes: 40 additions & 0 deletions R/file_utils.R
Original file line number Diff line number Diff line change
Expand Up @@ -1010,6 +1010,46 @@ batch_load_twas_weights <- function(twas_weights_results, meta_data_df, max_memo
return(batches)
}

#' Compute a correlation matrix for genotype data
#'
#' @param X Numeric matrix with samples in rows and variables (variants) in
#'   columns.
#' @param intercepte Logical; if `TRUE` (default) each variable is centered
#'   before normalization, giving Pearson correlations. If `FALSE`, the raw
#'   columns are only scaled to unit norm (cosine similarity).
#' @return A symmetric matrix of pairwise correlations between the columns of
#'   `X`.
#' @export
get_cormat <- function(X, intercepte = TRUE) {
  # Put one variable per row so row-wise operations act per variable.
  vars <- t(X)
  if (intercepte) {
    vars <- vars - rowMeans(vars)
  }
  # Scale every variable to unit Euclidean norm, then the cross-product of
  # the rows yields the correlation (or cosine-similarity) matrix directly.
  unit_rows <- vars / sqrt(rowSums(vars * vars))
  tcrossprod(unit_rows)
}

#' Indices of credible sets concentrated enough to keep
#'
#' Re-extracts credible sets at the requested coverage (ignoring the fitted
#' prior variance `V`) and keeps only the sets that are small relative to the
#' total number of variants.
#'
#' @param susie_obj A susie fit object; must contain `alpha`. Any `V` element
#'   is dropped so that CS extraction is not filtered by the estimated prior.
#' @param coverage Coverage level passed to `susie_get_cs()`.
#' @param size_factor Fraction controlling the maximum CS size considered
#'   "concentrated": sets with `>= ncol(alpha) * coverage * size_factor`
#'   variants are dropped.
#' @return Numeric vector of credible-set indices (the "L" numbers) that pass
#'   the size filter; usable to subset rows of `lbf_variable`.
#' @importFrom susieR susie_get_cs
#' @importFrom purrr map_lgl
#' @export
get_filter_lbf_index <- function(susie_obj, coverage = 0.5, size_factor = 0.5) {
  susie_obj$V <- NULL # ensure no filtering by estimated prior

  # Get CS list with coverage
  cs_list <- susie_get_cs(susie_obj, coverage = coverage, dedup = FALSE)

  # Total number of variants
  total_variants <- ncol(susie_obj$alpha)

  # Maximum allowed CS size to be considered 'concentrated'
  max_size <- total_variants * coverage * size_factor

  # Identify which CSs are 'concentrated enough'
  keep_idx <- map_lgl(cs_list$cs, ~ length(.x) < max_size)

  # Extract the numeric CS indices from the "L<k>" names that pass the filter
  cs_index <- which(keep_idx) %>% names %>% gsub("L", "", .) %>% as.numeric

  # Return the indices of the retained credible sets
  return(cs_index)
}
#' Function to load LD reference data variants
#' @export
#' @noRd
Expand Down
1 change: 1 addition & 0 deletions R/misc.R
Original file line number Diff line number Diff line change
Expand Up @@ -326,6 +326,7 @@ get_nested_element <- function(nested_list, name_vector) {
if (is.null(name_vector)) {
return(NULL)
}
name_vector <- name_vector[name_vector!='']
current_element <- nested_list
for (name in name_vector) {
if (is.null(current_element[[name]])) {
Expand Down
1 change: 1 addition & 0 deletions man/coloc_wrapper.Rd

Some generated files are not rendered by default. Learn more about how customized files appear on GitHub.

11 changes: 11 additions & 0 deletions man/get_cormat.Rd

Some generated files are not rendered by default. Learn more about how customized files appear on GitHub.

Loading