Skip to content
Merged
Show file tree
Hide file tree
Changes from all commits
Commits
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
3 changes: 3 additions & 0 deletions NAMESPACE
Original file line number Diff line number Diff line change
Expand Up @@ -39,7 +39,9 @@ export(find_duplicate_variants)
export(fsusie_get_cs)
export(fsusie_wrapper)
export(gbayes_rss)
export(get_cormat)
export(get_ctwas_meta_data)
export(get_filter_lbf_index)
export(get_nested_element)
export(get_susie_result)
export(glmnet_weights)
Expand Down Expand Up @@ -170,6 +172,7 @@ importFrom(purrr,exec)
importFrom(purrr,map)
importFrom(purrr,map2)
importFrom(purrr,map_int)
importFrom(purrr,map_lgl)
importFrom(purrr,pmap)
importFrom(readr,cols)
importFrom(readr,parse_number)
Expand Down
27 changes: 14 additions & 13 deletions R/allele_qc.R
Original file line number Diff line number Diff line change
Expand Up @@ -27,7 +27,7 @@ allele_qc <- function(target_data, ref_variants, col_to_flip = NULL,
match_min_prop = 0.2, remove_dups = TRUE,
remove_indels = FALSE, remove_strand_ambiguous = TRUE,
flip_strand = FALSE, remove_unmatched = TRUE, remove_same_vars = FALSE) {
strand_flip <- function(ref) {
strand_flip <- function(ref) {
# Define a mapping for complementary bases
base_mapping <- c("A" = "T", "T" = "A", "G" = "C", "C" = "G")

Expand Down Expand Up @@ -130,25 +130,25 @@ allele_qc <- function(target_data, ref_variants, col_to_flip = NULL,
match_result[strand_flipped_indices, "A2.target"] <- strand_flip(match_result[strand_flipped_indices, "A2.target"])
}

# Remove all unnecessary columns used to determine qc status
# Finally keep those variants with FLAG keep = TRUE
result <- match_result[match_result$keep, , drop = FALSE]

# FIXME: I think this parameter is confusing. I inheritated directly from our function, whose default setting is TRUE.
# It is removing all multi-allelic alleles which is unnecessary. I suggest remove this parameter directly.
# What we are trying to avoid is the SAME allele having diferent z score. I defined one parameter remove_same_vars later, but I can re-use this
# remove_dup name
if (remove_dups) {
dups <- vec_duplicate_detect(match_result[, c("chrom", "pos", "variants_id_qced")])
dups <- vec_duplicate_detect(result[, c("chrom", "pos", "variants_id_qced")])
if (any(dups)) {
match_result <- match_result[!dups, , drop = FALSE]
message("Some duplicates were removed.")
result <- result[!dups, , drop = FALSE]
Warning("Unexpected duplicates were removed.")
}
}

# Remove all unnecessary columns used to determine qc status
# Finally keep those variants with FLAG keep = TRUE
result <- match_result[match_result$keep, , drop = FALSE]


result <- result %>%
select(-(flip1.ref:keep)) %>%
select(-variants_id_original, -A1.target, -A2.target) %>%
select(-A1.target, -A2.target) %>%
rename(A1 = A1.ref, A2 = A2.ref, variant_id = variants_id_qced)

# default FALSE, but if want to remove same variants having different z score, then set as TRUE
Expand All @@ -161,14 +161,15 @@ allele_qc <- function(target_data, ref_variants, col_to_flip = NULL,
}

if (!remove_unmatched) {
match_variant <- match_result %>% pull(variants_id_original)
match_variant <- result %>% pull(variants_id_original)
match_result <- select(match_result, -(flip1.ref:keep)) %>%
select(-variants_id_original, -A1.target, -A2.target) %>%
rename(A1 = A1.ref, A2 = A2.ref, variant_id = variants_id_qced)
target_data <- target_data %>% mutate(variant_id = paste(chrom, pos, A2, A1, sep = ":"))
if (length(setdiff(target_data %>% pull(variant_id), match_variant)) > 0) {
unmatch_data <- target_data %>% filter(!variant_id %in% match_variant)
result <- rbind(result, unmatch_data)
result <- rbind(result, unmatch_data %>% mutate(variants_id_original = variant_id))
result <- result[match(target_data$variant_id, result$variants_id_original), ] %>% select(-variants_id_original)
}
}

Expand Down Expand Up @@ -230,7 +231,7 @@ align_variant_names <- function(source, reference, remove_indels = FALSE) {
ref_variants = reference_df,
col_to_flip = NULL,
match_min_prop = 0,
remove_dups = TRUE,
remove_dups = FALSE, #otherwise case like c('14:31234238:C:A','14:31234238:C:G') -- c('14:31234238:G:C','14:31234238:G:C') would not work
flip_strand = TRUE,
remove_indels = remove_indels,
remove_strand_ambiguous = FALSE,
Expand Down
112 changes: 99 additions & 13 deletions R/encoloc.R
Original file line number Diff line number Diff line change
Expand Up @@ -112,11 +112,75 @@ calculate_cumsum <- function(coloc_results) {

#' Function to load and extract LD matrix
#' @noRd
load_and_extract_ld_matrix <- function(ld_meta_file_path, analysis_region, variants) {
# This is a placeholder for loading LD matrix, adjust as per your actual function
ld_ref <- load_LD_matrix(LD_meta_file_path = ld_meta_file_path, region = analysis_region)
ext_ld <- ld_ref$combined_LD_matrix[variants, variants]
ext_ld
# Load an LD (correlation) matrix for a set of variants, either from an LD
# reference panel or from in-sample genotypes.
# - ld_meta_file_path: LD metadata file (reference mode), a plink .bed file,
#   or a .txt per-chromosome genotype metadata file (in-sample mode).
# - analysis_region: "chr:start-end"; only the chromosome part is used, the
#   positions are re-derived from `variants`.
# - variants: character vector of variant IDs formatted "chr:pos:ref:alt".
# - ld_ref / in_sample: optional logicals selecting the mode explicitly;
#   mutually exclusive (ld_ref wins if both are TRUE). When both are NULL the
#   mode is guessed from the file path.
# Returns the LD matrix subset to `variants`.
load_and_extract_ld_matrix <- function(ld_meta_file_path, analysis_region, variants, ld_ref = NULL, in_sample = NULL) {
  # Narrow the query region to the span actually covered by the variants
  var_pos <- str_split(variants, ":", simplify = TRUE)[, 2] %>% as.numeric()
  min_var_pos <- min(var_pos)
  max_var_pos <- max(var_pos)
  chr <- str_split(analysis_region, ":", simplify = TRUE)[, 1]
  analysis_region_narrow <- paste0(chr, ":", min_var_pos, "-", max_var_pos)

  # --- Determine mode if not explicitly provided ---
  if (is.null(ld_ref) && is.null(in_sample)) {
    ld_ref <- TRUE # Default assumption
    if (grepl("plink|genotype|\\.bed$|\\.bim$|\\.fam$", ld_meta_file_path)) {
      ld_ref <- FALSE
      in_sample <- TRUE
    } else {
      in_sample <- FALSE
    }
  }

  # --- Enforce exclusivity ---
  if (isTRUE(ld_ref)) in_sample <- FALSE
  if (isTRUE(in_sample)) ld_ref <- FALSE
  # Coerce any flag still NULL (or NA) to FALSE so the `if ()` guards below
  # cannot fail with "argument is of length zero" when only one mode flag was
  # supplied (e.g. ld_ref = FALSE, in_sample = NULL); such calls now reach the
  # informative stop() at the bottom instead.
  ld_ref <- isTRUE(ld_ref)
  in_sample <- isTRUE(in_sample)

  # --- LD reference mode ---
  if (ld_ref) {
    message("Using LD reference mode")
    ld_ref_data <- load_LD_matrix(
      LD_meta_file_path = ld_meta_file_path,
      region = analysis_region_narrow
    )
    ext_ld <- ld_ref_data$combined_LD_matrix[variants, variants]
    return(ext_ld)
  }

  # --- In-sample genotype mode ---
  if (in_sample) {
    message("Using in-sample genotype mode")

    if (grepl("\\.txt$", ld_meta_file_path)) {
      # Metadata file maps chromosome id -> genotype file; the genotype path is
      # resolved relative to the metadata file's own directory.
      geno_meta <- read_tsv(ld_meta_file_path, comment = "#", col_names = c("id", "path"), show_col_types = FALSE)
      chr_num <- gsub("^chr", "", chr)
      geno_path <- geno_meta %>% filter(id == chr_num) %>% pull(path) %>% basename %>% paste0(dirname(ld_meta_file_path), "/", .)

      if (length(geno_path) != 1) stop("No matching entry found in metadata for chromosome ", chr)

      geno_prefix <- str_remove(geno_path, "\\.bed$")
    } else if (grepl("\\.bed$", ld_meta_file_path)) {
      geno_prefix <- str_remove(ld_meta_file_path, "\\.bed$")
    } else {
      stop("In in-sample mode, expected plink file or .txt genotype metadata file.")
    }

    # Load original genotype data
    ld_mat <- load_genotype_region(geno_prefix, region = analysis_region_narrow, keep_indel = TRUE, keep_variants_path = NULL)

    # Harmonize genotype column names with the requested variant IDs
    colnames(ld_mat) <- align_variant_names(colnames(ld_mat), variants)$aligned_variants
    ld_mat <- ld_mat[, variants] # subset target variants then get imputation and correlation
    if (length(variants) > 1) {
      # Mean imputation of missing genotypes before computing correlations
      ld_mat_imputed <- apply(ld_mat, 2, function(x) ifelse(is.na(x), mean(x, na.rm = TRUE), x))
      ext_ld <- get_cormat(ld_mat_imputed)
    } else {
      # A single variant is trivially perfectly correlated with itself
      ext_ld <- as.matrix(1)
    }
    return(ext_ld)
  }

  stop("Neither LD mode was activated — check inputs.")
}

#' Function to calculate purity
Expand Down Expand Up @@ -225,18 +289,30 @@ process_coloc_results <- function(coloc_result, LD_meta_file_path, analysis_regi
coloc_wrapper <- function(xqtl_file, gwas_files,
xqtl_finemapping_obj = NULL, xqtl_varname_obj = NULL, xqtl_region_obj = NULL,
gwas_finemapping_obj = NULL, gwas_varname_obj = NULL, gwas_region_obj = NULL,
filter_lbf_cs = FALSE, prior_tol = 1e-9, p1 = 1e-4, p2 = 1e-4, p12 = 5e-6, ...) {
filter_lbf_cs = FALSE, filter_lbf_cs_secondary = NULL,
prior_tol = 1e-9, p1 = 1e-4, p2 = 1e-4, p12 = 5e-6, ...) {
region <- NULL # define first to avoid element can not be found
# Load and process GWAS data
gwas_lbf_matrices <- lapply(gwas_files, function(file) {
raw_data <- readRDS(file)[[1]]
gwas_data <- if (!is.null(gwas_finemapping_obj)) get_nested_element(raw_data, gwas_finemapping_obj) else raw_data
gwas_lbf_matrix <- as.data.frame(gwas_data$lbf_variable)
if (filter_lbf_cs) {
gwas_lbf_matrix <- gwas_lbf_matrix[gwas_data$sets$cs_index, ]
# fsusie has a different structure
if(is.null(gwas_lbf_matrix) | nrow(gwas_lbf_matrix)==0){
gwas_lbf_matrix <- do.call(rbind, raw_data[[1]]$fsusie_result$lBF) %>% as.data.frame
if(nrow(gwas_lbf_matrix) > 0) message("This is a fSuSiE case")
}
if (filter_lbf_cs & is.null(filter_lbf_cs_secondary)) {
gwas_lbf_matrix <- gwas_lbf_matrix[gwas_data$sets$cs_index,, drop = F]
} else if (!is.null(filter_lbf_cs_secondary)) {
gwas_lbf_filter_index <- get_filter_lbf_index(gwas_data, coverage = filter_lbf_cs_secondary)
gwas_lbf_matrix <- gwas_lbf_matrix[gwas_lbf_filter_index,, drop = F]
} else {
gwas_lbf_matrix <- gwas_lbf_matrix[gwas_data$V > prior_tol, ]
gwas_lbf_matrix <- gwas_lbf_matrix[gwas_data$V > prior_tol,, drop = F]
}
if (!is.null(gwas_varname_obj)) colnames(gwas_lbf_matrix) <- get_nested_element(raw_data, gwas_varname_obj)
# fsusie could have NA in variant name
gwas_lbf_matrix <- gwas_lbf_matrix[,!is.na(colnames(gwas_lbf_matrix))]
return(gwas_lbf_matrix)
})

Expand Down Expand Up @@ -267,15 +343,25 @@ coloc_wrapper <- function(xqtl_file, gwas_files,
}
if (!is.null(xqtl_data)) {
xqtl_lbf_matrix <- as.data.frame(xqtl_data$lbf_variable)
# fsusie has a different structure
if(is.null(xqtl_lbf_matrix) | nrow(xqtl_lbf_matrix)==0){
xqtl_lbf_matrix <- do.call(rbind, xqtl_raw_data[[1]]$fsusie_result$lBF) %>% as.data.frame
if(nrow(xqtl_lbf_matrix) > 0) message("This is a fSuSiE case")
}
# fsusie data does not have V element in results
if (filter_lbf_cs) {
xqtl_lbf_matrix <- xqtl_lbf_matrix[xqtl_data$sets$cs_index, ]
if (filter_lbf_cs & is.null(filter_lbf_cs_secondary)) {
xqtl_lbf_matrix <- xqtl_lbf_matrix[xqtl_data$sets$cs_index, , drop = F]
} else if (!is.null(filter_lbf_cs_secondary)) {
xqtl_lbf_filter_index <- get_filter_lbf_index(xqtl_data, coverage = filter_lbf_cs_secondary)
xqtl_lbf_matrix <- xqtl_lbf_matrix[xqtl_lbf_filter_index,, drop = F]
} else {
if ("V" %in% names(xqtl_data)) xqtl_lbf_matrix <- xqtl_lbf_matrix[xqtl_data$V > prior_tol, ] else (message("No V found in orginal data."))
if ("V" %in% names(xqtl_data)) xqtl_lbf_matrix <- xqtl_lbf_matrix[xqtl_data$V > prior_tol, , drop = F] else (message("No V found in orginal data."))
}
if (nrow(combined_gwas_lbf_matrix) > 0 && nrow(xqtl_lbf_matrix) > 0) {
if (!is.null(xqtl_varname_obj)) colnames(xqtl_lbf_matrix) <- get_nested_element(xqtl_raw_data, xqtl_varname_obj)

# fsusie could have NA in variant name
xqtl_lbf_matrix <- xqtl_lbf_matrix[,!is.na(colnames(xqtl_lbf_matrix))]

colnames(xqtl_lbf_matrix) <- align_variant_names(colnames(xqtl_lbf_matrix), colnames(combined_gwas_lbf_matrix))$aligned_variants
common_colnames <- intersect(colnames(xqtl_lbf_matrix), colnames(combined_gwas_lbf_matrix))
xqtl_lbf_matrix <- xqtl_lbf_matrix[, common_colnames, drop = FALSE] %>% as.matrix()
Expand Down
40 changes: 40 additions & 0 deletions R/file_utils.R
Original file line number Diff line number Diff line change
Expand Up @@ -1010,6 +1010,46 @@ batch_load_twas_weights <- function(twas_weights_results, meta_data_df, max_memo
return(batches)
}

#' Compute a correlation matrix for genotype data
#'
#' @param X Numeric matrix with samples in rows and variables (variants) in
#'   columns.
#' @param intercepte Logical; if `TRUE` (default) each variable is centered
#'   before normalization, giving Pearson correlations. If `FALSE`, the raw
#'   columns are only scaled to unit norm (cosine similarity).
#' @return A symmetric matrix of pairwise correlations between the columns of
#'   `X`.
#' @export
get_cormat <- function(X, intercepte = TRUE) {
  # Put one variable per row so row-wise operations act per variable.
  vars <- t(X)
  if (intercepte) {
    vars <- vars - rowMeans(vars)
  }
  # Scale every variable to unit Euclidean norm, then the cross-product of
  # the rows yields the correlation (or cosine-similarity) matrix directly.
  unit_rows <- vars / sqrt(rowSums(vars * vars))
  tcrossprod(unit_rows)
}

#' Indices of credible sets concentrated enough to keep
#'
#' Re-extracts credible sets at the requested coverage (ignoring the fitted
#' prior variance `V`) and keeps only the sets that are small relative to the
#' total number of variants.
#'
#' @param susie_obj A susie fit object; must contain `alpha`. Any `V` element
#'   is dropped so that CS extraction is not filtered by the estimated prior.
#' @param coverage Coverage level passed to `susie_get_cs()`.
#' @param size_factor Fraction controlling the maximum CS size considered
#'   "concentrated": sets with `>= ncol(alpha) * coverage * size_factor`
#'   variants are dropped.
#' @return Numeric vector of credible-set indices (the "L" numbers) that pass
#'   the size filter; usable to subset rows of `lbf_variable`.
#' @importFrom susieR susie_get_cs
#' @importFrom purrr map_lgl
#' @export
get_filter_lbf_index <- function(susie_obj, coverage = 0.5, size_factor = 0.5) {
  susie_obj$V <- NULL # ensure no filtering by estimated prior

  # Get CS list with coverage
  cs_list <- susie_get_cs(susie_obj, coverage = coverage, dedup = FALSE)

  # Total number of variants
  total_variants <- ncol(susie_obj$alpha)

  # Maximum allowed CS size to be considered 'concentrated'
  max_size <- total_variants * coverage * size_factor

  # Identify which CSs are 'concentrated enough'
  keep_idx <- map_lgl(cs_list$cs, ~ length(.x) < max_size)

  # Extract the numeric CS indices from the "L<k>" names that pass the filter
  cs_index <- which(keep_idx) %>% names %>% gsub("L", "", .) %>% as.numeric

  # Return the indices of the retained credible sets
  return(cs_index)
}
#' Function to load LD reference data variants
#' @export
#' @noRd
Expand Down
1 change: 1 addition & 0 deletions R/misc.R
Original file line number Diff line number Diff line change
Expand Up @@ -326,6 +326,7 @@ get_nested_element <- function(nested_list, name_vector) {
if (is.null(name_vector)) {
return(NULL)
}
name_vector <- name_vector[name_vector!='']
current_element <- nested_list
for (name in name_vector) {
if (is.null(current_element[[name]])) {
Expand Down
1 change: 1 addition & 0 deletions man/coloc_wrapper.Rd

Some generated files are not rendered by default. Learn more about how customized files appear on GitHub.

11 changes: 11 additions & 0 deletions man/get_cormat.Rd

Some generated files are not rendered by default. Learn more about how customized files appear on GitHub.

Loading