diff --git a/NAMESPACE b/NAMESPACE index 4e873d5f..0dc45d68 100644 --- a/NAMESPACE +++ b/NAMESPACE @@ -39,7 +39,9 @@ export(find_duplicate_variants) export(fsusie_get_cs) export(fsusie_wrapper) export(gbayes_rss) +export(get_cormat) export(get_ctwas_meta_data) +export(get_filter_lbf_index) export(get_nested_element) export(get_susie_result) export(glmnet_weights) @@ -170,6 +172,7 @@ importFrom(purrr,exec) importFrom(purrr,map) importFrom(purrr,map2) importFrom(purrr,map_int) +importFrom(purrr,map_lgl) importFrom(purrr,pmap) importFrom(readr,cols) importFrom(readr,parse_number) diff --git a/R/allele_qc.R b/R/allele_qc.R index acde4c8f..18076802 100644 --- a/R/allele_qc.R +++ b/R/allele_qc.R @@ -27,7 +27,7 @@ allele_qc <- function(target_data, ref_variants, col_to_flip = NULL, match_min_prop = 0.2, remove_dups = TRUE, remove_indels = FALSE, remove_strand_ambiguous = TRUE, flip_strand = FALSE, remove_unmatched = TRUE, remove_same_vars = FALSE) { - strand_flip <- function(ref) { + strand_flip <- function(ref) { # Define a mapping for complementary bases base_mapping <- c("A" = "T", "T" = "A", "G" = "C", "C" = "G") @@ -130,25 +130,25 @@ allele_qc <- function(target_data, ref_variants, col_to_flip = NULL, match_result[strand_flipped_indices, "A2.target"] <- strand_flip(match_result[strand_flipped_indices, "A2.target"]) } + # Remove all unnecessary columns used to determine qc status + # Finally keep those variants with FLAG keep = TRUE + result <- match_result[match_result$keep, , drop = FALSE] + # FIXME: I think this parameter is confusing. I inheritated directly from our function, whose default setting is TRUE. # It is removing all multi-allelic alleles which is unnecessary. I suggest remove this parameter directly. # What we are trying to avoid is the SAME allele having diferent z score. I defined one parameter remove_same_vars later, but I can re-use this # remove_dup name if (remove_dups) { - dups <- vec_duplicate_detect(match_result[, c("chrom", "pos", "variants_id_qced")]) + dups <- vec_duplicate_detect(result[, c("chrom", "pos", "variants_id_qced")]) if (any(dups)) { - match_result <- match_result[!dups, , drop = FALSE] - message("Some duplicates were removed.") + result <- result[!dups, , drop = FALSE] + Warning("Unexpected duplicates were removed.") } } - - # Remove all unnecessary columns used to determine qc status - # Finally keep those variants with FLAG keep = TRUE - result <- match_result[match_result$keep, , drop = FALSE] - + result <- result %>% select(-(flip1.ref:keep)) %>% - select(-variants_id_original, -A1.target, -A2.target) %>% + select(-A1.target, -A2.target) %>% rename(A1 = A1.ref, A2 = A2.ref, variant_id = variants_id_qced) # default FALSE, but if want to remove same variants having different z score, then set as TRUE @@ -161,14 +161,15 @@ allele_qc <- function(target_data, ref_variants, col_to_flip = NULL, } if (!remove_unmatched) { - match_variant <- match_result %>% pull(variants_id_original) + match_variant <- result %>% pull(variants_id_original) match_result <- select(match_result, -(flip1.ref:keep)) %>% select(-variants_id_original, -A1.target, -A2.target) %>% rename(A1 = A1.ref, A2 = A2.ref, variant_id = variants_id_qced) target_data <- target_data %>% mutate(variant_id = paste(chrom, pos, A2, A1, sep = ":")) if (length(setdiff(target_data %>% pull(variant_id), match_variant)) > 0) { unmatch_data <- target_data %>% filter(!variant_id %in% match_variant) - result <- rbind(result, unmatch_data) + result <- rbind(result, unmatch_data %>% mutate(variants_id_original = variant_id)) + result <- result[match(target_data$variant_id, result$variants_id_original), ] %>% select(-variants_id_original) } } @@ -230,7 +231,7 @@ align_variant_names <- function(source, reference, remove_indels = FALSE) { ref_variants = reference_df, col_to_flip = NULL, match_min_prop = 0, - remove_dups = TRUE, + remove_dups = FALSE, #otherwise case like c('14:31234238:C:A','14:31234238:C:G') -- c('14:31234238:G:C','14:31234238:G:C') would not work flip_strand = TRUE, remove_indels = remove_indels, remove_strand_ambiguous = FALSE, diff --git a/R/encoloc.R b/R/encoloc.R index 15c6e1e0..0a1f2fb0 100644 --- a/R/encoloc.R +++ b/R/encoloc.R @@ -112,11 +112,75 @@ calculate_cumsum <- function(coloc_results) { #' Function to load and extract LD matrix #' @noRd -load_and_extract_ld_matrix <- function(ld_meta_file_path, analysis_region, variants) { - # This is a placeholder for loading LD matrix, adjust as per your actual function - ld_ref <- load_LD_matrix(LD_meta_file_path = ld_meta_file_path, region = analysis_region) - ext_ld <- ld_ref$combined_LD_matrix[variants, variants] - ext_ld +load_and_extract_ld_matrix <- function(ld_meta_file_path, analysis_region, variants, ld_ref = NULL, in_sample = NULL) { + # Extract variant positions + var_pos <- str_split(variants, ":", simplify = TRUE)[,2] %>% as.numeric() + min_var_pos <- min(var_pos) + max_var_pos <- max(var_pos) + chr <- str_split(analysis_region, ":", simplify = TRUE)[,1] + analysis_region_narrow <- paste0(chr, ":", min_var_pos, "-", max_var_pos) + + # --- Determine mode if not explicitly provided --- + if (is.null(ld_ref) && is.null(in_sample)) { + ld_ref <- TRUE # Default assumption + if (grepl("plink|genotype|\\.bed$|\\.bim$|\\.fam$", ld_meta_file_path)) { + ld_ref <- FALSE + in_sample <- TRUE + } else { + in_sample <- FALSE + } + } + + # --- Enforce exclusivity --- + if (isTRUE(ld_ref)) in_sample <- FALSE + if (isTRUE(in_sample)) ld_ref <- FALSE + + # --- LD reference mode --- + if (ld_ref) { + message("Using LD reference mode") + ld_ref_data <- load_LD_matrix( + LD_meta_file_path = ld_meta_file_path, + region = analysis_region_narrow + ) + ext_ld <- ld_ref_data$combined_LD_matrix[variants, variants] + return(ext_ld) + } + + # --- In-sample genotype mode --- + if (in_sample) { + message("Using in-sample genotype mode") + + if (grepl("\\.txt$", ld_meta_file_path)) { + geno_meta <- read_tsv(ld_meta_file_path, comment = "#", col_names = c("id", "path"), show_col_types = FALSE) + chr_num <- gsub("^chr", "", chr) + geno_path <- geno_meta %>% filter(id == chr_num) %>% pull(path) %>% basename %>% paste0(dirname(ld_meta_file_path), "/", .) + + if (length(geno_path) != 1) stop("No matching entry found in metadata for chromosome ", chr) + + geno_prefix <- str_remove(geno_path, "\\.bed$") + } else if (grepl("\\.bed$", ld_meta_file_path)) { + geno_prefix <- str_remove(ld_meta_file_path, "\\.bed$") + } else { + stop("In in-sample mode, expected plink file or .txt genotype metadata file.") + } + + # Load original genotype data + ld_mat <- load_genotype_region(geno_prefix, region = analysis_region_narrow, keep_indel = TRUE, keep_variants_path = NULL ) + + # Change colname format of genotype data + colnames(ld_mat) <- align_variant_names(colnames(ld_mat), variants)$aligned_variants + ld_mat <- ld_mat[, variants] # subset target variants then get imputation and correlation + if(length(variants) > 1) { + # Mean imputation + ld_mat_imputed <- apply(ld_mat, 2, function(x) ifelse(is.na(x), mean(x, na.rm = TRUE), x)) + ext_ld <- get_cormat(ld_mat_imputed) + } else { + ext_ld = as.matrix(1) + } + return(ext_ld) + } + + stop("Neither LD mode was activated — check inputs.") } #' Function to calculate purity @@ -225,18 +289,30 @@ process_coloc_results <- function(coloc_result, LD_meta_file_path, analysis_regi coloc_wrapper <- function(xqtl_file, gwas_files, xqtl_finemapping_obj = NULL, xqtl_varname_obj = NULL, xqtl_region_obj = NULL, gwas_finemapping_obj = NULL, gwas_varname_obj = NULL, gwas_region_obj = NULL, - filter_lbf_cs = FALSE, prior_tol = 1e-9, p1 = 1e-4, p2 = 1e-4, p12 = 5e-6, ...) { + filter_lbf_cs = FALSE, filter_lbf_cs_secondary = NULL, + prior_tol = 1e-9, p1 = 1e-4, p2 = 1e-4, p12 = 5e-6, ...) { + region <- NULL # define first to avoid element can not be found # Load and process GWAS data gwas_lbf_matrices <- lapply(gwas_files, function(file) { raw_data <- readRDS(file)[[1]] gwas_data <- if (!is.null(gwas_finemapping_obj)) get_nested_element(raw_data, gwas_finemapping_obj) else raw_data gwas_lbf_matrix <- as.data.frame(gwas_data$lbf_variable) - if (filter_lbf_cs) { - gwas_lbf_matrix <- gwas_lbf_matrix[gwas_data$sets$cs_index, ] + # fsusie has a different structure + if(is.null(gwas_lbf_matrix) | nrow(gwas_lbf_matrix)==0){ + gwas_lbf_matrix <- do.call(rbind, raw_data[[1]]$fsusie_result$lBF) %>% as.data.frame + if(nrow(gwas_lbf_matrix) > 0) message("This is a fSuSiE case") + } + if (filter_lbf_cs & is.null(filter_lbf_cs_secondary)) { + gwas_lbf_matrix <- gwas_lbf_matrix[gwas_data$sets$cs_index,, drop = F] + } else if (!is.null(filter_lbf_cs_secondary)) { + gwas_lbf_filter_index <- get_filter_lbf_index(gwas_data, coverage = filter_lbf_cs_secondary) + gwas_lbf_matrix <- gwas_lbf_matrix[gwas_lbf_filter_index,, drop = F] } else { - gwas_lbf_matrix <- gwas_lbf_matrix[gwas_data$V > prior_tol, ] + gwas_lbf_matrix <- gwas_lbf_matrix[gwas_data$V > prior_tol,, drop = F] } if (!is.null(gwas_varname_obj)) colnames(gwas_lbf_matrix) <- get_nested_element(raw_data, gwas_varname_obj) + # fsusie could have NA in variant name + gwas_lbf_matrix <- gwas_lbf_matrix[,!is.na(colnames(gwas_lbf_matrix))] return(gwas_lbf_matrix) }) @@ -267,15 +343,25 @@ coloc_wrapper <- function(xqtl_file, gwas_files, } if (!is.null(xqtl_data)) { xqtl_lbf_matrix <- as.data.frame(xqtl_data$lbf_variable) + # fsusie has a different structure + if(is.null(xqtl_lbf_matrix) | nrow(xqtl_lbf_matrix)==0){ + xqtl_lbf_matrix <- do.call(rbind, xqtl_raw_data[[1]]$fsusie_result$lBF) %>% as.data.frame + if(nrow(xqtl_lbf_matrix) > 0) message("This is a fSuSiE case") + } # fsusie data does not have V element in results - if (filter_lbf_cs) { - xqtl_lbf_matrix <- xqtl_lbf_matrix[xqtl_data$sets$cs_index, ] + if (filter_lbf_cs & is.null(filter_lbf_cs_secondary)) { + xqtl_lbf_matrix <- xqtl_lbf_matrix[xqtl_data$sets$cs_index, , drop = F] + } else if (!is.null(filter_lbf_cs_secondary)) { + xqtl_lbf_filter_index <- get_filter_lbf_index(xqtl_data, coverage = filter_lbf_cs_secondary) + xqtl_lbf_matrix <- xqtl_lbf_matrix[xqtl_lbf_filter_index,, drop = F] } else { - if ("V" %in% names(xqtl_data)) xqtl_lbf_matrix <- xqtl_lbf_matrix[xqtl_data$V > prior_tol, ] else (message("No V found in orginal data.")) + if ("V" %in% names(xqtl_data)) xqtl_lbf_matrix <- xqtl_lbf_matrix[xqtl_data$V > prior_tol, , drop = F] else (message("No V found in orginal data.")) } if (nrow(combined_gwas_lbf_matrix) > 0 && nrow(xqtl_lbf_matrix) > 0) { if (!is.null(xqtl_varname_obj)) colnames(xqtl_lbf_matrix) <- get_nested_element(xqtl_raw_data, xqtl_varname_obj) - + # fsusie could have NA in variant name + xqtl_lbf_matrix <- xqtl_lbf_matrix[,!is.na(colnames(xqtl_lbf_matrix))] + colnames(xqtl_lbf_matrix) <- align_variant_names(colnames(xqtl_lbf_matrix), colnames(combined_gwas_lbf_matrix))$aligned_variants common_colnames <- intersect(colnames(xqtl_lbf_matrix), colnames(combined_gwas_lbf_matrix)) xqtl_lbf_matrix <- xqtl_lbf_matrix[, common_colnames, drop = FALSE] %>% as.matrix() diff --git a/R/file_utils.R b/R/file_utils.R index e8ebb87d..23662fe1 100644 --- a/R/file_utils.R +++ b/R/file_utils.R @@ -1010,6 +1010,46 @@ batch_load_twas_weights <- function(twas_weights_results, meta_data_df, max_memo return(batches) } +#' Compute genotype correlation +#' @export +get_cormat <- function(X, intercepte = TRUE) { + X <- t(X) + # Center each variable + if (intercepte) { + X <- X - rowMeans(X) + } + # Standardize each variable + X <- X / sqrt(rowSums(X^2)) + # Calculate correlations + cr <- tcrossprod(X) + return(cr) +} + +# Function to filter a single credible set based on coverage and purity +#' @importFrom susieR susie_get_cs +#' @importFrom purrr map_lgl +#' @export +get_filter_lbf_index <- function(susie_obj, coverage = 0.5, size_factor = 0.5) { + susie_obj$V <- NULL # ensure no filtering by estimated prior + + # Get CS list with coverage + cs_list <- susie_get_cs(susie_obj, coverage = coverage, dedup = FALSE) + + # Total number of variants + total_variants <- ncol(susie_obj$alpha) + + # Maximum allowed CS size to be considered 'concentrated' + max_size <- total_variants * coverage * size_factor + + # Identify which CSs are 'concentrated enough' + keep_idx <- map_lgl(cs_list$cs, ~ length(.x) < max_size) + + # Extract the CS indices that pass the filter + cs_index <- which(keep_idx) %>% names %>% gsub("L","", .) %>% as.numeric + + # Return filtered lbf_variable rows (one per CS) + return(cs_index) + #' Function to load LD reference data variants #' @export #' @noRd diff --git a/R/misc.R b/R/misc.R index 8f9e5a7f..f4f18477 100644 --- a/R/misc.R +++ b/R/misc.R @@ -326,6 +326,7 @@ get_nested_element <- function(nested_list, name_vector) { if (is.null(name_vector)) { return(NULL) } + name_vector <- name_vector[name_vector!=''] current_element <- nested_list for (name in name_vector) { if (is.null(current_element[[name]])) { diff --git a/man/coloc_wrapper.Rd b/man/coloc_wrapper.Rd index 36b6ae0f..0cefde7c 100644 --- a/man/coloc_wrapper.Rd +++ b/man/coloc_wrapper.Rd @@ -14,6 +14,7 @@ coloc_wrapper( gwas_varname_obj = NULL, gwas_region_obj = NULL, filter_lbf_cs = FALSE, + filter_lbf_cs_secondary = NULL, prior_tol = 1e-09, p1 = 1e-04, p2 = 1e-04, diff --git a/man/get_cormat.Rd b/man/get_cormat.Rd new file mode 100644 index 00000000..3d9f93ea --- /dev/null +++ b/man/get_cormat.Rd @@ -0,0 +1,11 @@ +% Generated by roxygen2: do not edit by hand +% Please edit documentation in R/file_utils.R +\name{get_cormat} +\alias{get_cormat} +\title{Compute genotype correlation} +\usage{ +get_cormat(X, intercepte = TRUE) +} +\description{ +Compute genotype correlation +}