diff --git a/R/colocboost_assemble.R b/R/colocboost_assemble.R index e8de97a..b7b5986 100644 --- a/R/colocboost_assemble.R +++ b/R/colocboost_assemble.R @@ -63,7 +63,7 @@ colocboost_assemble <- function(cb_obj, cb_output <- cb_output[c("cos_summary", "vcp", "cos_details", "data_info", "model_info", "ucos_details", "diagnostic_details")] } if (data_info$n_outcome == 1){ - cb_output <- list("ucos_summary", "pip" = NULL, "ucos_details" = NULL, "data_info" = data_info) + cb_output <- list("ucos_summary" = NULL, "vpa" = NULL, "ucos_details" = NULL, "data_info" = data_info) } } @@ -223,14 +223,14 @@ colocboost_assemble <- function(cb_obj, cb_output <- cb_output[-match(c("cos_summary","vcp","cos_details"), names(cb_output))] remain_obj <- names(cb_output) if (!is.null(cb_output$ucos_details$ucos)){ - cb_output$pip <- apply(do.call(cbind,cb_output$ucos_details$ucos_weight), 1, function(w0) 1-prod(1-w0)) - names(cb_output$pip) <- data_info$variables + cb_output$vpa <- apply(do.call(cbind,cb_output$ucos_details$ucos_weight), 1, function(w0) 1-prod(1-w0)) + names(cb_output$vpa) <- data_info$variables cb_output$ucos_summary <- get_ucos_summary(cb_output) } else { - tmp <- list("pip" = NULL, "ucos_summary" = NULL) + tmp <- list("vpa" = NULL, "ucos_summary" = NULL) cb_output <- c(cb_output, tmp) } - cb_output <- cb_output[c("ucos_summary", "pip", remain_obj)] + cb_output <- cb_output[c("ucos_summary", "vpa", remain_obj)] } } } diff --git a/R/colocboost_inference.R b/R/colocboost_inference.R index 3dae7eb..7406989 100644 --- a/R/colocboost_inference.R +++ b/R/colocboost_inference.R @@ -6,7 +6,14 @@ #' #' @details #' The following functions are included in this set: -#' `get_abc` get the colocalization summary table with or without the specific outcomes. +#' `get_cormat` a fast function to calculate correlation matrix from individual level data. +#' `check_null_post` a function to remove the spurious signals. +#' `get_purity` a function to calculate within-CoS purity. 
+#' `get_modularity` a function to calculate modularity for a given number of clusters. +#' `get_n_cluster` a function to get the number of clusters based on modularity-based hierarchical clustering method. +#' `get_between_purity` a function to calculate purity between two CoS. +#' `get_cos_evidence` a function to get the evidence of colocalization. +#' `w_purity` a function to calculate within-CoS purity for each single weight from one SEC. #' #' These functions are not exported individually and are accessed via `colocboost_post_inference`. #' @@ -33,7 +40,10 @@ get_cormat <- function(X, intercepte = FALSE){ } +#' Function to remove the spurious signals #' @importFrom utils head tail +#' @keywords cb_post_inference +#' @noRd check_null_post <- function(cb_obj, coloc_sets_temp, coloc_outcomes, @@ -181,6 +191,9 @@ check_null_post <- function(cb_obj, return(ll) } +#' Function to calculate within-CoS purity +#' @keywords cb_post_inference +#' @noRd #' @importFrom stats na.omit get_purity <- function(pos, X=NULL, Xcorr=NULL, N = NULL, n = 100) { get_upper_tri = Rfast::upper_tri @@ -215,6 +228,12 @@ get_purity <- function(pos, X=NULL, Xcorr=NULL, N = NULL, n = 100) { } # ------ Calculate modularity ---------- +#' Function to calculate modularity for a given number of clusters +#' @param Weight A matrix of weights +#' @param B A matrix of binary values indicating the clusters +#' @return The modularity value +#' @keywords cb_post_inference +#' @noRd get_modularity <- function(Weight, B){ if (dim(Weight)[1] == 1){ Q <- 0 @@ -246,6 +265,14 @@ get_modularity <- function(Weight, B){ } } +#' Function to get the number of clusters based on modularity-based hierarchical clustering method +#' @param hc A hierarchical clustering object +#' @param Sigma A matrix of weights +#' @param m The number of clusters +#' @param min_cluster_corr The minimum correlation threshold for clustering within the same cluster +#' @return A list containing the number of clusters and modularity 
values +#' @keywords cb_post_inference +#' @noRd #' @importFrom stats cutree get_n_cluster <- function(hc, Sigma, m=ncol(Sigma), min_cluster_corr = 0.8){ if (min(Sigma) > min_cluster_corr){ @@ -269,6 +296,9 @@ get_n_cluster <- function(hc, Sigma, m=ncol(Sigma), min_cluster_corr = 0.8){ "Qmodularity" = Q)) } +#' Function to calculate within-CoS purity for each single weight from one SEC +#' @keywords cb_post_inference +#' @noRd w_purity <- function(weights, X=NULL, Xcorr=NULL, N = NULL, n = 100, coverage = 0.95, min_abs_corr = 0.5, median_abs_corr = NULL, miss_idx = NULL){ @@ -289,9 +319,10 @@ w_purity <- function(weights, X=NULL, Xcorr=NULL, N = NULL, n = 100, coverage = return(is_pure) } - +#' Calculate purity between two CoS +#' @keywords cb_post_inference +#' @noRd #' @importFrom stats na.omit -# - Calculate purity between two confidence sets get_between_purity <- function(pos1, pos2, X=NULL, Xcorr=NULL, N = NULL, miss_idx = NULL, P = NULL){ get_matrix_mult <- function(X_sub1, X_sub2){ @@ -329,6 +360,9 @@ get_between_purity <- function(pos1, pos2, X=NULL, Xcorr=NULL, N = NULL, miss_id return(c(min(value), max(value), get_median(value))) } +#' Function to get the evidence of colocalization +#' @keywords cb_post_inference +#' @noRd #' @importFrom stats var #' @importFrom utils tail get_cos_evidence <- function(cb_obj, coloc_out, data_info){ @@ -369,7 +403,6 @@ get_cos_evidence <- function(cb_obj, coloc_out, data_info){ cos_profile } - # - get_outcome_profile_change <- function(outcome_idx, cos, cb_obj, check_null_max){ extract_last <- function(lst) { tail(lst, n = 1)} cb_data <- cb_obj$cb_data @@ -386,7 +419,7 @@ get_cos_evidence <- function(cb_obj, coloc_out, data_info){ ifelse(max_profile < cos_profile, 0, max_profile - cos_profile) } - # - calculate best configuration likelihood explained by minimal configuration + # - Calculate best configuration likelihood explained by minimal configuration get_normalization_evidence <- function(profile_change, null_max, 
outcomes, outcome_names) { # Define the exponential likelihood ratio normalization (ELRN) logLR_normalization <- function(ratio) { 1 - exp( - 2*ratio ) } diff --git a/R/colocboost_one_causal.R b/R/colocboost_one_causal.R index d15c1ee..7d95795 100644 --- a/R/colocboost_one_causal.R +++ b/R/colocboost_one_causal.R @@ -1,5 +1,18 @@ - - +#' @title Set of functions for ColocBoost based on per-trait-one-causal assumption, including LD-free mode and one iteration mode. +#' +#' @description +#' The `colocboost_one_causal` functions access the set of functions for ColocBoost based on per-trait-one-causal assumption. +#' +#' +#' @details +#' The following functions are included in this set: +#' `colocboost_one_iteration` run one iteration of ColocBoost under the per-trait-one-causal assumption. +#' +#' These functions are not exported individually and are accessed via `colocboost_one_causal`. +#' +#' @rdname colocboost_one_causal +#' @keywords cb_one_causal +#' @noRd colocboost_one_causal <- function(cb_model, cb_model_para, cb_data, jk_equiv_corr = 0.8, jk_equiv_loglik = 1, @@ -37,8 +50,20 @@ colocboost_one_causal <- function(cb_model, cb_model_para, cb_data, - -# under one causal per trait assumption with one iteration +#' Under one causal per trait assumption with one iteration mode +#' +#' @description +#' Executes one iteration of the ColocBoost algorithm under the one-causal-variant-per-trait +#' assumption. This function provides a simplified approach to examine colocalization patterns +#' for traits with strong marginal associations. +#' +#' @details +#' This version implements an LD-dependent mode that can handle scenarios where LD structures +#' may not perfectly match between datasets but still contain informative signals. The function +#' identifies the most strongly associated variants and evaluates their colocalization evidence. 
+#' +#' @keywords cb_one_causal +#' @noRd colocboost_one_iteration <- function(cb_model, cb_model_para, cb_data, jk_equiv_corr = 0.8, jk_equiv_loglik = 1, @@ -91,7 +116,14 @@ colocboost_one_iteration <- function(cb_model, cb_model_para, cb_data, } - +#' Identify and best update trait group for one causal variant - one iteration mode +#' +#' @details +#' The function operates by partitioning traits into different equivalence groups based on +#' correlation patterns and log-likelihood differences. +#' +#' @keywords cb_one_causal +#' @noRd boost_check_update_jk_one_causal <- function(cb_model, cb_model_para, cb_data, prioritize_jkstar = TRUE, jk_equiv_corr = 0.8, @@ -166,7 +198,14 @@ boost_check_update_jk_one_causal <- function(cb_model, cb_model_para, cb_data, -# under one causal per trait assumption with no LD +#' ColocBoost under one causal per trait assumption - LD-free mode +#' @description +#' Executes one iteration of the ColocBoost algorithm under the one-causal-variant-per-trait +#' assumption. This function provides a simplified approach to examine colocalization patterns +#' for traits without using any LD information. +#' +#' @keywords cb_one_causal +#' @noRd colocboost_diagLD <- function(cb_model, cb_model_para, cb_data, jk_equiv_corr = 0, jk_equiv_loglik = 0.1, @@ -300,6 +339,14 @@ colocboost_diagLD <- function(cb_model, cb_model_para, cb_data, } +#' Check for overlapping causal effects between trait pairs +#' +#' @description +#' This function evaluates pairs of traits to determine if they share overlapping +#' causal effects. It will be used to create the trait group when LD information is not available. 
+#' +#' @keywords cb_one_causal +#' @noRd check_pair_overlap <- function(weights, coverage = 0.95){ overlap_pair <- matrix(NA, nrow = nrow(weights), ncol = nrow(weights)) diff --git a/R/colocboost_output.R b/R/colocboost_output.R index 742596f..ad44032 100644 --- a/R/colocboost_output.R +++ b/R/colocboost_output.R @@ -396,12 +396,37 @@ get_cos_different_coverage <- function(cb_output, coverage = 0.95){ cos_vcp <- cb_output$cos_details$cos_vcp cos_diff_coverage <- lapply(cos_vcp, function(w){ - temp <- order(w, decreasing=T) - temp[1:min(which(cumsum(w[temp]) > coverage))] # 95% + unlist(get_in_cos(w, coverage = coverage)) }) return(cos_diff_coverage) } +#' Get integrated weight from different outcomes +#' @keywords cb_get_functions +#' @noRd +get_integrated_weight <- function(weights, weight_fudge_factor = 1.5){ + av <- apply(weights, 1, function(w) prod(w^(weight_fudge_factor/ncol(weights)))) + return(av / sum(av)) +} + +#' Extract CoS based on coverage +#' @keywords cb_get_functions +#' @noRd +get_in_cos <- function(weights, coverage = 0.95){ + # Get indices in decreasing weight order + temp <- order(weights, decreasing=TRUE) + weights_ordered <- weights[temp] + # Find position where cumsum exceeds coverage + pos <- min(which(cumsum(weights_ordered) >= coverage)) + # Get the actual threshold weight value + weight_thresh <- weights_ordered[pos] + # Find all indices with weights >= threshold + indices <- which(weights >= weight_thresh) + # Order these indices by their weights in decreasing order + csets <- indices[order(weights[indices], decreasing=TRUE)] + return(list(csets)) +} + #' @title Set of internal functions to re-organize ColocBoost output format #' @@ -669,16 +694,8 @@ get_full_output <- function(cb_obj, past_out = NULL, variables = NULL, cb_output cb_model_para <- cb_obj$cb_model_para ## - obtain the order of variables based on the variables names if it has position information - if (is.null(variables)){ - variables <- cb_obj$cb_model_para$variables - 
if (all(grepl("chr", variables))){ - # - if variables_name has position informtaion, re-order all_info based on positions - position <- as.numeric(gsub(".*:(\\d+).*", "\\1", variables)) - # position <- as.numeric(sapply(variables_name, function(tmp) strsplit(tmp, ":")[[1]][2])) - ordered <- order(position, decreasing = FALSE) - } else { - ordered <- 1:length(variables) - } + if (is.null(variables)){ + ordered <- 1:length(cb_obj$cb_model_para$variables) } else { ordered <- match(variables, cb_obj$cb_model_para$variables) } diff --git a/R/colocboost_utils.R b/R/colocboost_utils.R index d496555..21e55cb 100644 --- a/R/colocboost_utils.R +++ b/R/colocboost_utils.R @@ -1,3 +1,18 @@ +#' @title Set of internal functions to refine colocalization confidence sets +#' +#' @description +#' The `colocboost_refine_cos` functions serve as a summary for the following two refine functions. +#' +#' @details +#' The following functions are included in this set: +#' `merge_cos_ucos` merge a trait-specific confidence set CS into a colocalization set CoS. +#' `merge_ucos` merge two trait-specific confidence sets. +#' +#' These functions are not exported individually and are accessed via `colocboost_refine_cos`. +#' +#' @rdname colocboost_refine_cos +#' @keywords cb_refine_cos +#' @noRd merge_cos_ucos <- function(cb_obj, out_cos, out_ucos, coverage = 0.95, min_abs_corr = 0.5, tol = 1e-9, median_cos_abs_corr = 0.8){ @@ -255,7 +270,21 @@ merge_ucos <- function(cb_obj, past_out, return(ll) } - +#' @title Set of internal utils functions +#' +#' @description +#' The `colocboost_utils` functions serve as a summary for the following utility functions. +#' +#' @details +#' The following functions are included in this set: +#' `get_vcp` calculate the variant colocalization probability (VCP) for each variant. +#' `w_cs` extract the confidence set indicator from a single weight vector. +#' +#' These functions are not exported individually and are accessed via `colocboost_utils`. 
+#' +#' @rdname colocboost_utils +#' @keywords cb_utils +#' @noRd get_vcp <- function(past_out, P){ if (!is.null(past_out$cos$cos$cos)){ avW_coloc_vcp <- sapply(past_out$cos$cos$avWeight, get_integrated_weight) @@ -315,7 +344,6 @@ check_two_overlap_sets <- function(total, i, j){ } } - # Function to merge overlapping sets merge_sets <- function(vec) { split_lists <- lapply(vec, function(x) as.numeric(unlist(strsplit(x, ";")))) @@ -344,7 +372,6 @@ merge_sets <- function(vec) { } - get_avWeigth <- function(cb_model, coloc_outcomes, update, pos.coloc, name_weight=FALSE){ avWeight <- lapply(coloc_outcomes, function(i){ @@ -360,7 +387,6 @@ get_avWeigth <- function(cb_model, coloc_outcomes, update, pos.coloc, name_weigh } - get_max_profile <- function(cb_obj, check_null_max=0.02, check_null_method = "profile"){ for (i in 1:cb_obj$cb_model_para$L){ cb <- cb_obj$cb_model[[i]] @@ -377,27 +403,13 @@ get_max_profile <- function(cb_obj, check_null_max=0.02, check_null_method = "pr ### Function for check cs for each weight -w_cs <- function(w, coverage = 0.95){ - n <- sum(cumsum(sort(w,decreasing = TRUE)) < coverage) + 1 - o <- order(w,decreasing = TRUE) - result = rep(0,length(w)) - result[o[1:n]] = 1 +w_cs <- function(weights, coverage = 0.95){ + indices <- unlist(get_in_cos(weights, coverage = coverage)) + result = rep(0,length(weights)) + result[indices] = 1 return(result) } -get_integrated_weight <- function(avWeight, weight_fudge_factor = 1.5){ - av <- apply(avWeight, 1, function(w) prod(w^(weight_fudge_factor/ncol(avWeight)))) - return(av / sum(av)) -} - -get_in_cos <- function(weights, coverage = 0.95){ - - temp <- order(weights, decreasing=T) - csets <- temp[1:min(which(cumsum(weights[temp]) >= coverage))] # 95% - return(list(csets)) - -} - #' Pure R implementation (fallback) #' @noRd diff --git a/R/data.R b/R/data.R index 13f910a..5713291 100644 --- a/R/data.R +++ b/R/data.R @@ -9,7 +9,7 @@ #' \item{Y}{List of outcomes} #' \item{TrueCausalVariants}{List of causal 
variants} #' } -#' @source FINISH +#' @source See Cao et al. 2025 for details. TO-DO-LIST "Ind_5traits" #' Summary level data for 5 traits @@ -17,11 +17,59 @@ #' An example dataset with simulated statistics and LD for 5 traits #' #' @format ## `Sumstat_5traits` -#' A list with 3 elements +#' A list with 2 elements #' \describe{ #' \item{sumstat}{Summary statistics for 5 traits} -#' \item{LD}{Matrix of LD between variants} #' \item{TrueCausalVariants}{List of causal variants} #' } -#' @source FINISH +#' @source See Cao et al. 2025 for details. TO-DO-LIST "Sumstat_5traits" + + +#' Individual level data with heterogeneous causal effects +#' +#' An example dataset with simulated genotypes and outcomes +#' +#' @format ## `Heterogeneous_Effect` +#' A list with 3 elements +#' \describe{ +#' \item{X}{List of genotype matrices} +#' \item{Y}{List of outcomes} +#' \item{TrueCausalVariants}{List of causal variants} +#' } +#' @source See Cao et al. 2025 for details. TO-DO-LIST +"Heterogeneous_Effect" + + +#' Individual level data with a weaker GWAS effect +#' +#' An example dataset with simulated genotypes and outcomes +#' +#' @format ## `Weaker_GWAS_Effect` +#' A list with 3 elements +#' \describe{ +#' \item{X}{List of genotype matrices} +#' \item{Y}{List of outcomes} +#' \item{TrueCausalVariants}{List of causal variants} +#' } +#' @source See Cao et al. 2025 for details. TO-DO-LIST +"Weaker_GWAS_Effect" + + +#' Individual level data where the strongest marginal association is not causal +#' +#' An example dataset with simulated genotypes and outcomes +#' +#' @format ## `Non_Causal_Strongest_Marginal` +#' A list with 3 elements +#' \describe{ +#' \item{X}{List of genotype matrices} +#' \item{Y}{List of outcomes} +#' \item{TrueCausalVariants}{List of causal variants} +#' } +#' @source See Cao et al. 2025 for details. 
TO-DO-LIST +"Non_Causal_Strongest_Marginal" + + + + diff --git a/README.md b/README.md index 01c3075..0206b49 100644 --- a/README.md +++ b/README.md @@ -41,12 +41,6 @@ conda install -c dnachun r-colocboost ``` ## Usage -### Single-trait Fine-mapping (FineBoost) -Run FineBoost for single-trait fine-mapping (similar interface to SuSiE) -```r -result <- colocboost(X=X, Y=y) -``` - ### Multi-trait Colocalization ```r # Basic multi-trait analysis @@ -67,6 +61,12 @@ filtered <- get_strong_colocalization(result, cos_npc_cutoff = 0.5) For more complex analyses involving multiple datasets mixing individual level and summary statistics data, we recommend using [this pipeline wrapper](https://github.com/StatFunGen/pecotmr/blob/main/R/colocboost_pipeline.R) from the `pecotmr` package. The `pecotmr` package can be installed either from source or from our conda package at https://anaconda.org/dnachun/r-pecotmr. +### Single-trait Fine-mapping (FineBoost) - Special Case +Run FineBoost for single-trait fine-mapping (similar interface to SuSiE) +```r +result <- colocboost(X=X, Y=y) +``` + ## Citation If you use ColocBoost in your research, please cite: diff --git a/data/Heterogeneous_Effect.rda b/data/Heterogeneous_Effect.rda new file mode 100644 index 0000000..00e2b1d Binary files /dev/null and b/data/Heterogeneous_Effect.rda differ diff --git a/data/Non_Causal_Strongest_Marginal.rda b/data/Non_Causal_Strongest_Marginal.rda new file mode 100644 index 0000000..06756e8 Binary files /dev/null and b/data/Non_Causal_Strongest_Marginal.rda differ diff --git a/data/Sumstat_5traits.rda b/data/Sumstat_5traits.rda index b55b42e..f6e654a 100644 Binary files a/data/Sumstat_5traits.rda and b/data/Sumstat_5traits.rda differ diff --git a/data/Weaker_GWAS_Effect.rda b/data/Weaker_GWAS_Effect.rda new file mode 100644 index 0000000..98b4f46 Binary files /dev/null and b/data/Weaker_GWAS_Effect.rda differ