From 845df217d82f45e850251a4152e266b9b7c701e2 Mon Sep 17 00:00:00 2001 From: xuewei cao <36172337+xueweic@users.noreply.github.com> Date: Wed, 23 Apr 2025 16:40:59 -0400 Subject: [PATCH 1/2] minor fix for CRAN --- R/colocboost.R | 6 +++--- R/colocboost_assemble_ucos.R | 2 +- R/colocboost_init.R | 8 ++++++-- R/colocboost_output.R | 2 +- R/colocboost_update.R | 9 ++++----- R/colocboost_workhorse.R | 5 ++--- man/colocboost.Rd | 4 ++-- 7 files changed, 19 insertions(+), 17 deletions(-) diff --git a/R/colocboost.R b/R/colocboost.R index 25fc181..ca6ae8e 100644 --- a/R/colocboost.R +++ b/R/colocboost.R @@ -57,8 +57,8 @@ #' @param func_multi_test The alternative method to check the stop criteria. When \code{func_multi_test = "lfdr"}, boosting iterations will be stopped #' if the local FDR for all variables are greater than \code{lfsr_max}. #' @param stop_null The cutoff of nominal p-value when \code{func_multi_test = "Z"}. -#' @param multi_test_max The cutoff of the smallest FDR for pre-filtering the outcomes when \code{func_multi_test = "lfdr"} or \code{func_multi_test = "lfsr"}. -#' @param multi_test_thresh The cutoff of the smallest FDR for stop criteria when \code{func_multi_test = "lfdr"} or \code{func_multi_test = "lfsr"}. +#' @param multi_test_max The cutoff of the smallest FDR for stop criteria when \code{func_multi_test = "lfdr"} or \code{func_multi_test = "lfsr"}. +#' @param multi_test_thresh The cutoff of the smallest FDR for pre-filtering the outcomes when \code{func_multi_test = "lfdr"} or \code{func_multi_test = "lfsr"}. #' @param ash_prior The prior distribution for calculating lfsr when \code{func_multi_test = "lfsr"}. #' @param p.adjust.methods The adjusted pvalue method in stats:p.adj when \code{func_multi_test = "fdr"} #' @param residual_correlation The residual correlation based on the sample overlap, it is diagonal if it is NULL. 
@@ -502,7 +502,7 @@ colocboost <- function(X = NULL, Y = NULL, # individual data z <- summstat_tmp[, "z"] } if (anyNA(z)) { - warning(paste("summary statistic dataset", i.sumstat, "contains NA values that are replaced with 0")) + warning(paste("summary statistic dataset", i.summstat, "contains NA values that are replaced with 0")) z[is.na(z)] <- 0 } diff --git a/R/colocboost_assemble_ucos.R b/R/colocboost_assemble_ucos.R index db695a6..3336f19 100644 --- a/R/colocboost_assemble_ucos.R +++ b/R/colocboost_assemble_ucos.R @@ -29,7 +29,7 @@ colocboost_assemble_ucos <- function(cb_obj_single, "evidence_strength" = NULL, "requested_coverage" = coverage ) - pip_av <- rep(0, cc$P) + pip_av <- rep(0, cb_model_para$P) } else if (ncol(temp) == 1 | nrow(temp) == 1) { weights <- as.vector(weights) avWeight <- weights diff --git a/R/colocboost_init.R b/R/colocboost_init.R index dc1b21d..c6e9f85 100644 --- a/R/colocboost_init.R +++ b/R/colocboost_init.R @@ -219,6 +219,7 @@ colocboost_init_para <- function(cb_data, cb_model, tau = 0.01, lambda_focal_outcome = 1, learning_rate_decay = 1, multi_test_thresh = 1, + multi_test_max = 1, func_multi_test = "lfdr", LD_free = FALSE, outcome_names = NULL, @@ -296,7 +297,10 @@ colocboost_init_para <- function(cb_data, cb_model, tau = 0.01, "coveraged" = TRUE, "num_updates" = 0, "coveraged_outcome" = coveraged_outcome, - "num_updates_outcome" = num_updates_outcome + "num_updates_outcome" = num_updates_outcome, + "func_multi_test" = func_multi_test, + "multi_test_thresh" = multi_test_thresh, + "multi_test_max" = multi_test_max ) class(cb_model_para) <- "colocboost" @@ -599,7 +603,7 @@ process_sumstat <- function(Z, N, Var_y, SeBhat, ld_matrices, variant_lists, dic # var_y, shat (and bhat) are provided, so the effects are on the # *original scale*. 
adj <- 1 / (Z_extend^2 + current_n - 2) - if (!is.null(LD_tmp)) { + if (!is.null(ld_submatrix)) { XtXdiag <- Var_y[[i]] * adj / (SeBhat[[i]]^2) xtx <- t(ld_submatrix * sqrt(XtXdiag)) * sqrt(XtXdiag) tmp$XtX <- (xtx + t(xtx)) / 2 diff --git a/R/colocboost_output.R b/R/colocboost_output.R index fb9bd92..ba42098 100644 --- a/R/colocboost_output.R +++ b/R/colocboost_output.R @@ -625,7 +625,7 @@ get_cos_summary <- function(cb_output, coloc_outcome <- lapply(cb_output$cos_details$cos_outcomes$outcome_index, function(idx) analysis_outcome[idx]) coloc_sets <- cb_output$cos_details$cos$cos_index if (!is.null(cb_output$cos_warnings)) { - cos_warnings + message(cb_output$cos_warnings$warning_message) } vcp <- as.numeric(cb_output$vcp) diff --git a/R/colocboost_update.R b/R/colocboost_update.R index 5ae8f3e..21b1bb5 100644 --- a/R/colocboost_update.R +++ b/R/colocboost_update.R @@ -175,13 +175,12 @@ boost_KL_delta <- function(z, ld_feature, adj_dep, } -boost_check_stop <- function(cb_model, cb_model_para, pos_stop, stop_no_coverage, - multi_test_max = 1) { +boost_check_stop <- function(cb_model, cb_model_para, pos_stop, stop_no_coverage) { # - check the iteration for the stop outcome (pos_stop has the same jk with original data) iter_each <- sapply(pos_stop, function(i) { length(cb_model[[i]]$obj_path) - 1 }) - lfsr_each <- sapply(pos_stop, function(i) cb_model[[i]]$stop_null < multi_test_max) + lfsr_each <- sapply(pos_stop, function(i) cb_model[[i]]$stop_null < cb_model_para$multi_test_max) pos_need_more <- which(iter_each <= 10 & lfsr_each) # pos_need_more <- which(iter_each <= 10) @@ -197,9 +196,9 @@ boost_check_stop <- function(cb_model, cb_model_para, pos_stop, stop_no_coverage cb_model_para$need_more <- update_need_more for (i in update_need_more) { cb_model[[i]]$stop_thresh <- cb_model[[i]]$stop_thresh * 0.5 - if (stop_method == "Z") { + if (cb_model_para$func_multi_test == "Z") { cb_model[[i]]$stop_null <- cb_model[[i]]$stop_null - 0.1 - } else if (stop_method == 
"lfsr" | stop_method == "lfdr") { + } else if (cb_model_para$func_multi_test == "lfsr" | cb_model_para$func_multi_test == "lfdr") { cb_model[[i]]$stop_null <- cb_model[[i]]$stop_null + 0.05 } cb_model[[i]]$learning_rate_init <- cb_model[[i]]$learning_rate_init * 0.5 diff --git a/R/colocboost_workhorse.R b/R/colocboost_workhorse.R index da0fa0f..7583eee 100644 --- a/R/colocboost_workhorse.R +++ b/R/colocboost_workhorse.R @@ -58,6 +58,7 @@ colocboost_workhorse <- function(cb_data, lambda_focal_outcome = lambda_focal_outcome, learning_rate_decay = learning_rate_decay, multi_test_thresh = multi_test_thresh, + multi_test_max = multi_test_max, func_multi_test = func_multi_test, LD_free = LD_free, outcome_names = outcome_names, @@ -194,9 +195,7 @@ colocboost_workhorse <- function(cb_data, cb_model_para$update_y <- cb_model_para$update_y } else { pos_stop <- which(stop) # which outcome reach the stop criterion - ttmp <- boost_check_stop(cb_model, cb_model_para, pos_stop, stop_no_coverage, - multi_test_max = multi_test_max - ) + ttmp <- boost_check_stop(cb_model, cb_model_para, pos_stop, stop_no_coverage) cb_model_para <- ttmp$cb_model_para cb_model <- ttmp$cb_model # - if there is some outcomes need stop diff --git a/man/colocboost.Rd b/man/colocboost.Rd index 5cfbacc..b241b2a 100644 --- a/man/colocboost.Rd +++ b/man/colocboost.Rd @@ -139,9 +139,9 @@ if the local FDR for all variables are greater than \code{lfsr_max}.} \item{stop_null}{The cutoff of nominal p-value when \code{func_multi_test = "Z"}.} -\item{multi_test_max}{The cutoff of the smallest FDR for pre-filtering the outcomes when \code{func_multi_test = "lfdr"} or \code{func_multi_test = "lfsr"}.} +\item{multi_test_max}{The cutoff of the smallest FDR for stop criteria when \code{func_multi_test = "lfdr"} or \code{func_multi_test = "lfsr"}.} -\item{multi_test_thresh}{The cutoff of the smallest FDR for stop criteria when \code{func_multi_test = "lfdr"} or \code{func_multi_test = "lfsr"}.} 
+\item{multi_test_thresh}{The cutoff of the smallest FDR for pre-filtering the outcomes when \code{func_multi_test = "lfdr"} or \code{func_multi_test = "lfsr"}.} \item{ash_prior}{The prior distribution for calculating lfsr when \code{func_multi_test = "lfsr"}.} From 79ae025ce51941106bdf5ec52d00fe1e891d45eb Mon Sep 17 00:00:00 2001 From: xuewei cao <36172337+xueweic@users.noreply.github.com> Date: Wed, 23 Apr 2025 17:14:58 -0400 Subject: [PATCH 2/2] minor fix spelling --- R/colocboost.R | 12 ++++---- R/colocboost_output.R | 8 +++--- R/colocboost_plot.R | 14 +++++----- R/data.R | 18 ++++++------ man/Heterogeneous_Effect.Rd | 4 +-- man/Ind_5traits.Rd | 2 +- man/Non_Causal_Strongest_Marginal.Rd | 6 ++-- man/Sumstat_5traits.Rd | 2 +- man/Weaker_GWAS_Effect.Rd | 4 +-- man/colocboost.Rd | 12 ++++---- man/colocboost_plot.Rd | 2 +- man/get_colocboost_summary.Rd | 2 +- man/get_cos.Rd | 2 +- man/get_robust_colocalization.Rd | 4 +-- vignettes/Ambiguous_Colocalization.Rmd | 28 +++++++++---------- vignettes/ColocBoost_Wrapper_Pipeline.Rmd | 3 +- .../Disease_Prioritized_Colocalization.Rmd | 2 +- vignettes/Input_Data_Format.Rmd | 16 +++++------ vignettes/Interpret_ColocBoost_Output.Rmd | 4 +-- .../Summary_Statistics_Colocalization.Rmd | 4 +-- vignettes/announcements.Rmd | 2 +- 21 files changed, 76 insertions(+), 75 deletions(-) diff --git a/R/colocboost.R b/R/colocboost.R index ca6ae8e..0d0fed1 100644 --- a/R/colocboost.R +++ b/R/colocboost.R @@ -3,7 +3,7 @@ #' @title ColocBoost: A gradient boosting informed multi-omics xQTL colocalization method #' #' @description `colocboost` implements a proximity adaptive smoothing gradient boosting approach for multi-trait colocalization at gene loci, -#' accommodating multiple causal variants. This method, introduced by Cao et al. (2025), is particularly suited for scaling +#' accommodating multiple causal variants. This method, introduced by Cao et al.
(2025), is particularly suited for scaling #' to large datasets involving numerous molecular quantitative traits and disease traits. #' In brief, this function fits a multiple linear regression model \eqn{Y = XB + E} in matrix form. #' ColocBoost can be generally used in multi-task variable selection regression problem. @@ -17,12 +17,12 @@ #' Each matrix should have column names, if sample sizes and variables possibly differing across matrices. #' @param Y A list of vectors of outcomes or an N by L matrix if it is considered for the same X and multiple outcomes. #' @param sumstat A list of data.frames of summary statistics. -#' The coloumns of data.frame should include either \code{z} or \code{beta}/\code{sebeta}. +#' The columns of data.frame should include either \code{z} or \code{beta}/\code{sebeta}. #' \code{n} is the sample size for the summary statistics, it is highly recommendation to provide. #' \code{variant} is required if sumstat for different outcomes do not have the same number of variables. -#' \code{var_y} is the variance of phenotype (default is 1 meaning that the Y is in the \dQuote{standarized} scale). +#' \code{var_y} is the variance of phenotype (default is 1 meaning that the Y is in the \dQuote{standardized} scale). #' @param LD A list of correlation matrix indicating the LD matrix for each genotype. It also could be a single matrix if all sumstats were -#' obtained from the same gentoypes. +#' obtained from the same genotypes. #' @param dict_YX A L by 2 matrix of dictionary for \code{X} and \code{Y} if there exist subsets of outcomes corresponding to the same X matrix. #' The first column should be 1:L for L outcomes. The second column should be the index of \code{X} corresponding to the outcome. #' The innovation: do not provide the same matrix in \code{X} to reduce the computational burden. 
@@ -51,7 +51,7 @@ #' @param jk_equiv_corr The LD cutoff between overall best update jk-star and marginal best update jk-l for lth outcome #' @param jk_equiv_loglik The change of loglikelihood cutoff between overall best update jk-star and marginal best update jk-l for lth outcome #' @param coloc_thresh The cutoff of checking if the best update jk-star is the potential causal variable for outcome l if jk-l is not similar to jk-star (used in Delayed SEC). -#' @param lambda The ratio \[0,1\] for z^2 and z in fun_prior simplex, defult is 0.5 +#' @param lambda The ratio \[0,1\] for z^2 and z in fun_prior simplex, default is 0.5 #' @param lambda_focal_outcome The ratio for z^2 and z in fun_prior simplex for the focal outcome, default is 1 #' @param func_simplex The data-driven local association simplex \eqn{\delta} for smoothing the weights. Default is "LD_z2z" is the elastic net for z-score and also weighted by LD. #' @param func_multi_test The alternative method to check the stop criteria. When \code{func_multi_test = "lfdr"}, boosting iterations will be stopped @@ -79,7 +79,7 @@ #' @param tol A small, non-negative number specifying the convergence tolerance for checking the overlap of the variables in different sets. #' @param merge_cos When \code{merge_cos = TRUE}, the sets for only one outcome will be merged if passed the \code{median_cos_abs_corr}. #' @param sec_coverage_thresh A number between 0 and 1 specifying the weight in each SEC (default is 0.8). -#' @param weight_fudge_factor The strenght to integrate weight from differnt outcomes, default is 1.5 +#' @param weight_fudge_factor The strength to integrate weight from different outcomes, default is 1.5 #' @param check_null The cut off value for change conditional objective function. Default is 0.1. #' @param check_null_method The metric to check the null sets. Default is "profile" #' @param check_null_max The smallest value of change of profile loglikelihood for each outcome. 
diff --git a/R/colocboost_output.R b/R/colocboost_output.R index ba42098..d22b6e0 100644 --- a/R/colocboost_output.R +++ b/R/colocboost_output.R @@ -6,7 +6,7 @@ #' with or without the outcomes of interest. #' #' @param cb_output Output object from `colocboost` analysis -#' @param summary_level When \code{summary_level = 1}, return basic sumamry table for colocalization results. See details in `get_ucos_summary` function when \code{summary_level = 2}. +#' @param summary_level When \code{summary_level = 1}, return basic summary table for colocalization results. See details in `get_ucos_summary` function when \code{summary_level = 2}. #' @param outcome_names Optional vector of names of outcomes, which has the same order as Y in the original analysis. #' @param interest_outcome Optional vector specifying a subset of outcomes from \code{outcome_names} to focus on. When provided, only colocalization events that include at least one of these outcomes will be returned. #' @param region_name Optional character string. When provided, adds a column with this gene name to the output table for easier filtering in downstream analyses. @@ -123,8 +123,8 @@ get_colocboost_summary <- function(cb_output, #' @param cb_output Output object from `colocboost` analysis #' @param cos_npc_cutoff Minimum threshold of normalized probability of colocalization (NPC) for CoS. #' @param npc_outcome_cutoff Minimum threshold of normalized probability of colocalized traits in each CoS. -#' @param pvalue_cutoff Maximum threshold of margianl p-values of colocalized variants on colocalized traits in each CoS. -#' @param weight_fudge_factor The strenght to integrate weight from differnt outcomes, default is 1.5 +#' @param pvalue_cutoff Maximum threshold of marginal p-values of colocalized variants on colocalized traits in each CoS. 
+#' @param weight_fudge_factor The strength to integrate weight from different outcomes, default is 1.5 #' @param coverage A number between 0 and 1 specifying the \dQuote{coverage} of the estimated colocalization confidence sets (CoS) (default is 0.95). #' #' @return A \code{"colocboost"} object with some or all of the following elements: @@ -857,7 +857,7 @@ get_ucos_summary <- function(cb_output, outcome_names = NULL, region_name = NULL return(output_summary) } -#' Extract CoS at different coverages +#' Extract CoS at different coverage #' #' @description `get_cos` extracts colocalization confidence sets (CoS) at different coverage levels #' from ColocBoost results. When genotype data (X) or correlation matrix (Xcorr) is provided, it diff --git a/R/colocboost_plot.R b/R/colocboost_plot.R index a0150e7..0af168b 100644 --- a/R/colocboost_plot.R +++ b/R/colocboost_plot.R @@ -36,7 +36,7 @@ #' @param title_style Vector of two numbers for title style (size, boldness), default is c(2.5, 2) #' @param ... Additional parameters passed to `plot` functions #' -#' @return Visualization plot for each colcoalization event. +#' @return Visualization plot for each colocalization event. 
#' #' @examples #' # colocboost example @@ -434,9 +434,9 @@ get_input_plot <- function(cb_output, plot_cos_idx = NULL, coloc_hits <- coloc_hits[select_cs] } else { if (cb_output$data_info$n_outcomes == 1) { - warnings("No fine-mapped causal effects in this region!") + warning("No fine-mapped causal effects in this region!") } else { - warnings("No colocalized effects in this region!") + warning("No colocalized effects in this region!") } ncos <- 0 coloc_index <- select_cs <- NULL @@ -536,12 +536,12 @@ get_input_plot <- function(cb_output, plot_cos_idx = NULL, cos_to_uncoloc <- coloc_cos cos_idx_to_uncoloc <- 1:length(coloc_index) if (is.null(show_cos_to_uncoloc_outcome)) { - warning("Show all CoSs to uncolocalized outcomes.") + message("Show all CoSs to uncolocalized outcomes.") outcome_to_uncoloc <- lapply(coloc_index, function(cidx) { setdiff(1:length(analysis_outcome), cidx) }) } else { - warning("Show all CoSs to uncolocalized outcomes ", paste(show_cos_to_uncoloc_outcome, collapse = ",")) + message("Show all CoSs to uncolocalized outcomes ", paste(show_cos_to_uncoloc_outcome, collapse = ",")) outcome_to_uncoloc <- lapply(coloc_index, function(cidx) { setdiff(show_cos_to_uncoloc_outcome, cidx) }) @@ -558,7 +558,7 @@ get_input_plot <- function(cb_output, plot_cos_idx = NULL, cos_idx_to_uncoloc <- show_cos_to_uncoloc_idx cos_to_uncoloc <- coloc_cos[show_cos_to_uncoloc_idx] if (is.null(show_cos_to_uncoloc_outcome)) { - warning( + message( "Show the ordered ", paste(cos_idx_to_uncoloc, collapse = ","), " CoS for all uncolocalized outcomes." 
) @@ -568,7 +568,7 @@ get_input_plot <- function(cb_output, plot_cos_idx = NULL, return(l) }) } else { - warning( + message( "Show the ordered ", paste(cos_idx_to_uncoloc, collapse = ","), " CoS for outcomes ", paste(show_cos_to_uncoloc_outcome, collapse = ",") ) diff --git a/R/data.R b/R/data.R index eb49650..4724c18 100644 --- a/R/data.R +++ b/R/data.R @@ -11,7 +11,7 @@ #' } #' @source The Ind_5traits dataset contains 5 simulated phenotypes alongside corresponding genotype matrices. #' The dataset is specifically designed for evaluating and demonstrating the capabilities of ColocBoost in multi-trait colocalization analysis -#' with individual-level data. See Cao et. al. 2025 for details. +#' with individual-level data. See Cao et al. 2025 for details. #' #' @family colocboost_data "Ind_5traits" @@ -29,7 +29,7 @@ #' @source The Sumstat_5traits dataset contains 5 simulated summary statistics, #' where it is directly derived from the Ind_5traits dataset using marginal association. #' The dataset is specifically designed for evaluating and demonstrating the capabilities of ColocBoost -#' in multi-trait colocalization analysis with summary association data. See Cao et. al. 2025 for details. +#' in multi-trait colocalization analysis with summary association data. See Cao et al. 2025 for details. #' #' @family colocboost_data "Sumstat_5traits" @@ -44,11 +44,11 @@ #' \describe{ #' \item{X}{List of genotype matrices} #' \item{Y}{List of traits} -#' \item{variant}{Incides of two causal variants} +#' \item{variant}{Indices of two causal variants} #' } #' @source The Heterogeneous_Effect dataset contains 2 simulated phenotypes alongside corresponding genotype matrices. #' There are two causal variants, both of which have heterogeneous effects on two traits. -#' See Figure 2b in Cao et. al. 2025 for details. +#' See Figure 2b in Cao et al. 2025 for details.
#' #' @family colocboost_data "Heterogeneous_Effect" @@ -63,17 +63,17 @@ #' \describe{ #' \item{X}{List of genotype matrices} #' \item{Y}{List of traits} -#' \item{variant}{Incides of two causal variants} +#' \item{variant}{Indices of two causal variants} #' } #' @source The Weaker_GWAS_Effect dataset contains 2 simulated phenotypes alongside corresponding genotype matrices. #' There are two causal variants, one of which has a weaker effect on the focal trait compared to the other trait. -#' See Figure 2b in Cao et. al. 2025 for details. +#' See Figure 2b in Cao et al. 2025 for details. #' #' @family colocboost_data "Weaker_GWAS_Effect" -#' Individual level data for 2 traits and 2 causal variants, but the strongest margianl association is not causal +#' Individual level data for 2 traits and 2 causal variants, but the strongest marginal association is not causal #' #' An example dataset with simulated genotypes and traits for 2 traits and 2 common causal variants, but the strongest marginal association is not causal variant. #' @@ -82,11 +82,11 @@ #' \describe{ #' \item{X}{List of genotype matrices} #' \item{Y}{List of traits} -#' \item{variant}{Incides of two causal variants} +#' \item{variant}{Indices of two causal variants} #' } #' @source The Non_Causal_Strongest_Marginal dataset contains 2 simulated phenotypes alongside corresponding genotype matrices. #' There are two causal variants, but the strongest marginal association is not a causal variant. -#' See Figure 2b in Cao et. al. 2025 for details. +#' See Figure 2b in Cao et al. 2025 for details.
#' #' @family colocboost_data "Non_Causal_Strongest_Marginal" diff --git a/man/Heterogeneous_Effect.Rd b/man/Heterogeneous_Effect.Rd index 4708a7c..a8de914 100644 --- a/man/Heterogeneous_Effect.Rd +++ b/man/Heterogeneous_Effect.Rd @@ -11,14 +11,14 @@ A list with 3 elements \describe{ \item{X}{List of genotype matrices} \item{Y}{List of traits} -\item{variant}{Incides of two causal variants} +\item{variant}{Indices of two causal variants} } } } \source{ The Heterogeneous_Effect dataset contains 2 simulated phenotypes alongside corresponding genotype matrices. There are two causal variants, both of which have heterogeneous effects on two traits. -See Figure 2b in Cao et. al. 2025 for details. +See Figure 2b in Cao et al. 2025 for details. } \usage{ Heterogeneous_Effect diff --git a/man/Ind_5traits.Rd b/man/Ind_5traits.Rd index 6347f4a..4026c98 100644 --- a/man/Ind_5traits.Rd +++ b/man/Ind_5traits.Rd @@ -18,7 +18,7 @@ A list with 3 elements \source{ The Ind_5traits dataset contains 5 simulated phenotypes alongside corresponding genotype matrices. The dataset is specifically designed for evaluating and demonstrating the capabilities of ColocBoost in multi-trait colocalization analysis -with individual-level data. See Cao et. al. 2025 for details. +with individual-level data. See Cao et al. 2025 for details.
} \usage{ Ind_5traits diff --git a/man/Non_Causal_Strongest_Marginal.Rd b/man/Non_Causal_Strongest_Marginal.Rd index 8fba152..53f654b 100644 --- a/man/Non_Causal_Strongest_Marginal.Rd +++ b/man/Non_Causal_Strongest_Marginal.Rd @@ -3,7 +3,7 @@ \docType{data} \name{Non_Causal_Strongest_Marginal} \alias{Non_Causal_Strongest_Marginal} -\title{Individual level data for 2 traits and 2 causal variants, but the strongest margianl association is not causal} +\title{Individual level data for 2 traits and 2 causal variants, but the strongest marginal association is not causal} \format{ \subsection{\code{Non_Causal_Strongest_Marginal}}{ @@ -11,14 +11,14 @@ A list with 3 elements \describe{ \item{X}{List of genotype matrices} \item{Y}{List of traits} -\item{variant}{Incides of two causal variants} +\item{variant}{Indices of two causal variants} } } } \source{ The Non_Causal_Strongest_Marginal dataset contains 2 simulated phenotypes alongside corresponding genotype matrices. There are two causal variants, but the strongest marginal association is not a causal variant. -See Figure 2b in Cao et. al. 2025 for details. +See Figure 2b in Cao et al. 2025 for details. } \usage{ Non_Causal_Strongest_Marginal diff --git a/man/Sumstat_5traits.Rd b/man/Sumstat_5traits.Rd index c024bc9..3ba28c8 100644 --- a/man/Sumstat_5traits.Rd +++ b/man/Sumstat_5traits.Rd @@ -18,7 +18,7 @@ A list with 2 elements The Sumstat_5traits dataset contains 5 simulated summary statistics, where it is directly derived from the Ind_5traits dataset using marginal association. The dataset is specifically designed for evaluating and demonstrating the capabilities of ColocBoost -in multi-trait colocalization analysis with summary association data. See Cao et. al. 2025 for details. +in multi-trait colocalization analysis with summary association data. See Cao et al. 2025 for details.
} \usage{ Sumstat_5traits diff --git a/man/Weaker_GWAS_Effect.Rd b/man/Weaker_GWAS_Effect.Rd index 14db5a3..87ba36b 100644 --- a/man/Weaker_GWAS_Effect.Rd +++ b/man/Weaker_GWAS_Effect.Rd @@ -11,14 +11,14 @@ A list with 3 elements \describe{ \item{X}{List of genotype matrices} \item{Y}{List of traits} -\item{variant}{Incides of two causal variants} +\item{variant}{Indices of two causal variants} } } } \source{ The Weaker_GWAS_Effect dataset contains 2 simulated phenotypes alongside corresponding genotype matrices. There are two causal variants, one of which has a weaker effect on the focal trait compared to the other trait. -See Figure 2b in Cao et. al. 2025 for details. +See Figure 2b in Cao et al. 2025 for details. } \usage{ Weaker_GWAS_Effect diff --git a/man/colocboost.Rd b/man/colocboost.Rd index b241b2a..e5a7307 100644 --- a/man/colocboost.Rd +++ b/man/colocboost.Rd @@ -71,13 +71,13 @@ Each matrix should have column names, if sample sizes and variables possibly dif \item{Y}{A list of vectors of outcomes or an N by L matrix if it is considered for the same X and multiple outcomes.} \item{sumstat}{A list of data.frames of summary statistics. -The coloumns of data.frame should include either \code{z} or \code{beta}/\code{sebeta}. +The columns of data.frame should include either \code{z} or \code{beta}/\code{sebeta}. \code{n} is the sample size for the summary statistics, it is highly recommendation to provide. \code{variant} is required if sumstat for different outcomes do not have the same number of variables. -\code{var_y} is the variance of phenotype (default is 1 meaning that the Y is in the \dQuote{standarized} scale).} +\code{var_y} is the variance of phenotype (default is 1 meaning that the Y is in the \dQuote{standardized} scale).} \item{LD}{A list of correlation matrix indicating the LD matrix for each genotype.
It also could be a single matrix if all sumstats were -obtained from the same gentoypes.} +obtained from the same genotypes.} \item{dict_YX}{A L by 2 matrix of dictionary for \code{X} and \code{Y} if there exist subsets of outcomes corresponding to the same X matrix. The first column should be 1:L for L outcomes. The second column should be the index of \code{X} corresponding to the outcome. @@ -128,7 +128,7 @@ we need to have the higher learning rate to improve the computational efficiency \item{coloc_thresh}{The cutoff of checking if the best update jk-star is the potential causal variable for outcome l if jk-l is not similar to jk-star (used in Delayed SEC).} -\item{lambda}{The ratio [0,1] for z^2 and z in fun_prior simplex, defult is 0.5} +\item{lambda}{The ratio [0,1] for z^2 and z in fun_prior simplex, default is 0.5} \item{lambda_focal_outcome}{The ratio for z^2 and z in fun_prior simplex for the focal outcome, default is 1} @@ -176,7 +176,7 @@ to merge colocalized sets, which may resulting in a huge set.} \item{sec_coverage_thresh}{A number between 0 and 1 specifying the weight in each SEC (default is 0.8).} -\item{weight_fudge_factor}{The strenght to integrate weight from differnt outcomes, default is 1.5} +\item{weight_fudge_factor}{The strength to integrate weight from different outcomes, default is 1.5} \item{check_null}{The cut off value for change conditional objective function. Default is 0.1.} @@ -205,7 +205,7 @@ A \code{"colocboost"} object with some or all of the following elements: } \description{ \code{colocboost} implements a proximity adaptive smoothing gradient boosting approach for multi-trait colocalization at gene loci, -accommodating multiple causal variants. This method, introduced by Cao et al. (2025), is particularly suited for scaling +accommodating multiple causal variants. This method, introduced by Cao et al.
(2025), is particularly suited for scaling to large datasets involving numerous molecular quantitative traits and disease traits. In brief, this function fits a multiple linear regression model \eqn{Y = XB + E} in matrix form. ColocBoost can be generally used in multi-task variable selection regression problem. diff --git a/man/colocboost_plot.Rd b/man/colocboost_plot.Rd index 7636864..0fdd21c 100644 --- a/man/colocboost_plot.Rd +++ b/man/colocboost_plot.Rd @@ -105,7 +105,7 @@ colocboost_plot( \item{...}{Additional parameters passed to \code{plot} functions} } \value{ -Visualization plot for each colcoalization event. +Visualization plot for each colocalization event. } \description{ \code{colocboost_plot} generates visualization plots for colocalization events from a ColocBoost analysis. diff --git a/man/get_colocboost_summary.Rd b/man/get_colocboost_summary.Rd index 7453d8e..2f7c9da 100644 --- a/man/get_colocboost_summary.Rd +++ b/man/get_colocboost_summary.Rd @@ -21,7 +21,7 @@ get_colocboost_summary( \arguments{ \item{cb_output}{Output object from \code{colocboost} analysis} -\item{summary_level}{When \code{summary_level = 1}, return basic sumamry table for colocalization results. See details in \code{get_ucos_summary} function when \code{summary_level = 2}.} +\item{summary_level}{When \code{summary_level = 1}, return basic summary table for colocalization results. 
See details in \code{get_ucos_summary} function when \code{summary_level = 2}.} \item{outcome_names}{Optional vector of names of outcomes, which has the same order as Y in the original analysis.} diff --git a/man/get_cos.Rd b/man/get_cos.Rd index f064a46..b3f1854 100644 --- a/man/get_cos.Rd +++ b/man/get_cos.Rd @@ -2,7 +2,7 @@ % Please edit documentation in R/colocboost_output.R \name{get_cos} \alias{get_cos} -\title{Extract CoS at different coverages} +\title{Extract CoS at different coverage} \usage{ get_cos( cb_output, diff --git a/man/get_robust_colocalization.Rd b/man/get_robust_colocalization.Rd index b898f54..d4acbce 100644 --- a/man/get_robust_colocalization.Rd +++ b/man/get_robust_colocalization.Rd @@ -24,9 +24,9 @@ get_robust_colocalization( \item{npc_outcome_cutoff}{Minimum threshold of normalized probability of colocalized traits in each CoS.} -\item{pvalue_cutoff}{Maximum threshold of margianl p-values of colocalized variants on colocalized traits in each CoS.} +\item{pvalue_cutoff}{Maximum threshold of marginal p-values of colocalized variants on colocalized traits in each CoS.} -\item{weight_fudge_factor}{The strenght to integrate weight from differnt outcomes, default is 1.5} +\item{weight_fudge_factor}{The strength to integrate weight from different outcomes, default is 1.5} \item{coverage}{A number between 0 and 1 specifying the \dQuote{coverage} of the estimated colocalization confidence sets (CoS) (default is 0.95).} } diff --git a/vignettes/Ambiguous_Colocalization.Rmd b/vignettes/Ambiguous_Colocalization.Rmd index 2bebfa3..0c45594 100644 --- a/vignettes/Ambiguous_Colocalization.Rmd +++ b/vignettes/Ambiguous_Colocalization.Rmd @@ -15,7 +15,7 @@ knitr::opts_chunk$set( ``` This vignette demonstrates an example of ambiguous colocalization from trait-specific effects using the `colocboost`. 
-Specifically, we will use the `Ambiguous_Colocalization`, which is output from `colocboost` analyzing GTEx relsease v8 and UK Biobank summary statistics +Specifically, we will use the `Ambiguous_Colocalization`, which is output from `colocboost` analyzing GTEx release v8 and UK Biobank summary statistics (see more details of the original data source in Acknowledgement section). ```{r setup} @@ -44,7 +44,7 @@ This dataset is structured as a list with two main components: # 2. ColocBoost results -In this example, there are two trait-specifc effects for the eQTL and GWAS signals, respectively. But two uCoS have overlapping variants, +In this example, there are two trait-specific effects for the eQTL and GWAS signals, respectively. But two uCoS have overlapping variants, which indicates that the two uCoS are not independent. ColocBoost identifies two uCoS: - `ucos1:y1`: eQTL trait-specific effect has 6 variants. @@ -106,7 +106,7 @@ susie_GWAS$sets$cs$L1 intersect(susie_eQTL$sets$cs$L1, susie_GWAS$sets$cs$L1) ``` -To visulize the fine-mapping results, +To visualize the fine-mapping results, ```{r plot-susie} susieR::susie_plot(susie_eQTL, y = "PIP", pos = 2000:n_variables) @@ -135,7 +135,7 @@ res$summary # 3. Get the ambiguous colocalization results and summary -ColocBoost provides a function to get the ambiguous colocalization results and summary from trait-specifc effects, by considering the correlation of variants between the two uCoS. +ColocBoost provides a function to get the ambiguous colocalization results and summary from trait-specific effects, by considering the correlation of variants between the two uCoS. ## 3.1. Get the ambiguous colocalization results @@ -160,12 +160,12 @@ names(res$ambiguous_cos[[1]]) **Explanation of results** For each ambiguous colocalization, the following information is provided: -- `ambigous_cos`: Contains variants indices and names of the original trait-specific uCoS used to constuct this ambiguouse colocalization. 
-- `ambigouse_cos_overlap`: Contains the overlapping variants information across the uCoS used to constuct this ambiguouse colocalization. -- `ambigouse_cos_union`: Contains the union of variants information across the uCoS used to constuct this ambiguouse colocalization. -- `ambigouse_cos_outcomes`: Contains the outcomes indicies and names for uCoS used to constuct this ambiguouse colocalization. -- `ambigous_cos_weight`: Contains the trait-specific weights of the uCoS used to constuct this ambiguouse colocalization. -- `ambigous_cos_puriry`: Contains the purity of across uCoS used to constuct this ambiguouse colocalization. +- `ambiguous_cos`: Contains variants indices and names of the original trait-specific uCoS used to construct this ambiguous colocalization. +- `ambiguous_cos_overlap`: Contains the overlapping variants information across the uCoS used to construct this ambiguous colocalization. +- `ambiguous_cos_union`: Contains the union of variants information across the uCoS used to construct this ambiguous colocalization. +- `ambiguous_cos_outcomes`: Contains the outcomes indices and names for uCoS used to construct this ambiguous colocalization. +- `ambiguous_cos_weight`: Contains the trait-specific weights of the uCoS used to construct this ambiguous colocalization. +- `ambiguous_cos_purity`: Contains the purity across the uCoS used to construct this ambiguous colocalization. - `recalibrated_cos_vcp`: Contains the recalibrated integrative weight to analogous to variant colocalization probability (VCP) from the ambiguous colocalization results. - `recalibrated_cos`: Contains the recalibrated 95% colocalization confidence set (CoS) from the ambiguous colocalization results. @@ -188,7 +188,7 @@ summary_ambiguous <- full_summary$ambiguous_cos_summary colnames(summary_ambiguous) ``` -- `recalibrated_*`: giving the recalibrated weigths and recalibrated 95% colocalization confidece sets (CoS) from the trait-specific effects.
+- `recalibrated_*`: giving the recalibrated weights and recalibrated 95% colocalization confidence sets (CoS) from the trait-specific effects. See details of function usage in the [Functions](https://statfungen.github.io/colocboost/reference/index.html). @@ -201,12 +201,12 @@ where trait-specific effects share highly correlated and overlapping variants. when merging two uCoS, which in this example will furnish a colocalization event. Still, we mark such colocalization events as `ambigous_cos`. We recommend users not to lower these thresholds further without strong justification. - We suggest treating this scenario with caution and distinct such merged CoS from typical colocalization events as a result of coupled updates between traits, -if researcher decides to investigate these ambigous colocalization events. +if a researcher decides to investigate these ambiguous colocalization events. - As such, - While we provide recalibrated weights as a suggested approach for interpreting ambiguous results, users can still choose between recalibrated weights and trait-specific weights based on their research context. - - The `colocboost_plot` function will not consider it as colocalized but still showing them as uncolocalized events, with overlapping variants color labelled. + - The `colocboost_plot` function will not consider them as colocalized but will still show them as uncolocalized events, with overlapping variants color labeled. -# Acknowledgements +# Acknowledgement - The eQTL data used for the analyses described in this example results were obtained from GTEx release v8 from [GTEx Portal](https://gtexportal.org/home/downloads/adult-gtex/qtl).
- The GWAS summary statistics used for the analyses described in this example results were obtained from UK Biobank (UKBB) diff --git a/vignettes/ColocBoost_Wrapper_Pipeline.Rmd b/vignettes/ColocBoost_Wrapper_Pipeline.Rmd index fdfa020..d9b8cee 100644 --- a/vignettes/ColocBoost_Wrapper_Pipeline.Rmd +++ b/vignettes/ColocBoost_Wrapper_Pipeline.Rmd @@ -81,7 +81,8 @@ imiss_cutoff = 0.9 - **`sumstat_path_list`**: A vector of file paths to the summary statistics. - **`column_file_path_list`**: A vector of file paths to the column mapping files. -- **`LD_meta_file_path_list`**: A vector of paths to LD metadata files. LD metadata is a data frame specifying LD blocks with columns "chrom", "start", "end", and "path". "start" and "end" denote the positions of LD blocks, and "path" is the path of each LD block, optionally including bim file paths. +- **`LD_meta_file_path_list`**: A vector of paths to LD metadata files. LD metadata is a data frame specifying LD blocks with columns "chrom", "start", "end", and "path". +"start" and "end" denote the positions of LD blocks, and "path" is the path of each LD block, optionally including `bim` file paths. - **`match_LD_sumstat`**: Logical indicating whether to match LD blocks with summary statistics. - **`conditions_list_sumstat`**: A vector of strings representing different sumstats. - **`n_samples`**: User-specified sample size. If unknown, set as 0 to retrieve from the sumstat file. 
diff --git a/vignettes/Disease_Prioritized_Colocalization.Rmd b/vignettes/Disease_Prioritized_Colocalization.Rmd index a721a57..d39cf7f 100644 --- a/vignettes/Disease_Prioritized_Colocalization.Rmd +++ b/vignettes/Disease_Prioritized_Colocalization.Rmd @@ -130,7 +130,7 @@ colocboost_plot(res) ### Results Interpretation -For comprehensive tutorials on result interpretation and advanced visualization techniques, please visit our tutotials portal +For comprehensive tutorials on result interpretation and advanced visualization techniques, please visit our tutorials portal at [Visualization of ColocBoost Results](https://statfungen.github.io/colocboost/articles/Visualization_ColocBoost_Output.html) and [Interpret ColocBoost Output](https://statfungen.github.io/colocboost/articles/Interpret_ColocBoost_Output.html). diff --git a/vignettes/Input_Data_Format.Rmd b/vignettes/Input_Data_Format.Rmd index c5f1dd3..d1deba3 100644 --- a/vignettes/Input_Data_Format.Rmd +++ b/vignettes/Input_Data_Format.Rmd @@ -69,11 +69,11 @@ refer to [Summary Statistics Colocalization](https://statfungen.github.io/colocb -# 3. Optional: mapping between arbitary input $X$ and $Y$ +# 3. Optional: mapping between arbitrary input $X$ and $Y$ -For analysis when including multiple genotype matrices `X` with unmatched arbitary phenotype vectors `Y`, +For analysis when including multiple genotype matrices `X` with unmatched arbitrary phenotype vectors `Y`, a mapping dictionary `dict_YX` is required to indicate the relationship between `X` and `Y`. -Similiarly, when multiple LD matrices with unmatched arbitrary multiple summary statistics `sumstat` are used, +Similarly, when multiple LD matrices with unmatched arbitrary multiple summary statistics `sumstat` are used, a mapping dictionary `dict_sumstatLD` is required to indicate the relationship between `sumstat` and `LD`. 
For example, considering three genotype matrices `X = list(X1, X2, X3)` and 6 phenotype vectors `Y = list(Y1, Y2, Y3, Y4, Y5, Y6)`, where @@ -95,10 +95,10 @@ dict_YX ``` -# 4. Hyprcoloc compatible format: effect size and standard error matrices +# 4. HyPrColoc compatible format: effect size and standard error matrices -ColocBoost also provides a flexibility to use Hyprcoloc compatible format for summary statistics with and without LD matrix. -For example, when anaylze $L$ traits for the same $P$ variants with the specified effect size and standard error matrices: +ColocBoost also provides the flexibility to use HyPrColoc compatible format for summary statistics with and without LD matrix. +For example, when analyzing $L$ traits for the same $P$ variants with the specified effect size and standard error matrices: - `effect_est` (required) is $P \times L$ matrix of variable regression coefficients (i.e. regression beta values) in the genomic region. - `effect_se` (required) is $P \times L$ matrix of standard errors for the regression coefficients. @@ -106,7 +106,7 @@ For example, when anaylze $L$ traits for the same $P$ variants with the specifie - `LD` (optional) is LD matrix for the $P$ variants. If it is not provided, it will apply LD-free ColocBoost. -See more details about Hyprcoloc compatible format in [Summary Statistics Colocalization](https://statfungen.github.io/colocboost/articles/Summary_Level_Colocalization.html)). +See more details about HyPrColoc compatible format in [Summary Statistics Colocalization](https://statfungen.github.io/colocboost/articles/Summary_Level_Colocalization.html). -See more details about data format to implement LD-free ColocBoost and LD-mistmatch diagnosis in [LD mismatch and LD-free Colocalization](https://statfungen.github.io/colocboost/articles/LD_Free_Colocalization.html)).
+See more details about data format to implement LD-free ColocBoost and LD-mismatch diagnosis in [LD mismatch and LD-free Colocalization](https://statfungen.github.io/colocboost/articles/LD_Free_Colocalization.html). diff --git a/vignettes/Interpret_ColocBoost_Output.Rmd b/vignettes/Interpret_ColocBoost_Output.Rmd index 085cec7..ee9352a 100644 --- a/vignettes/Interpret_ColocBoost_Output.Rmd +++ b/vignettes/Interpret_ColocBoost_Output.Rmd @@ -192,7 +192,7 @@ res$cos_details$cos_purity ``` -- **`cos_top_variables`**: indicies and names of the top variant for each CoS, which is the variant with the highest VCP. +- **`cos_top_variables`**: indices and names of the top variant for each CoS, which is the variant with the highest VCP. - Note that there may exist multiple variants in perfect LD with the same highest VCP. ```{r cos-top} @@ -321,7 +321,7 @@ Y <- Ind_5traits$Y res <- colocboost(X = X, Y = Y, output_level = 3) ``` -- **`cb_model`**: trait-specific proximity gradient boosting model, including proximity weight at each iteration, residual after gradient boosting, et al. +- **`cb_model`**: trait-specific proximity gradient boosting model, including proximity weight at each iteration, residual after gradient boosting, etc. - **`weights_paths`**: individual trait-specific weights for each iteration. ```{r cb-model} diff --git a/vignettes/Summary_Statistics_Colocalization.Rmd b/vignettes/Summary_Statistics_Colocalization.Rmd index 4e4f8da..6af293b 100644 --- a/vignettes/Summary_Statistics_Colocalization.Rmd +++ b/vignettes/Summary_Statistics_Colocalization.Rmd @@ -51,7 +51,7 @@ Sumstat_5traits$true_effect_variants `sumstat` must include the following columns: - `z` or (`beta`, `sebeta`): either z-score or (effect size and standard error) -- `n`: sample size for the summary statistics. **Higly recommended**: Provding the sample size, or even a rough estimate of `n`, +- `n`: sample size for the summary statistics.
**Highly recommended**: Providing the sample size, or even a rough estimate of `n`, is highly recommended. Without `n`, the implicit assumption is `n` is large (Inf) and the effect sizes are small (close to zero). - `variant`: required if `sumstat` for different outcomes do not have the same number of variables (multiple `sumstat` and multiple `LD`). @@ -208,6 +208,6 @@ res$cos_details$cos$cos_index colocboost_plot(res) ``` -See more details about data format to implement LD-free ColocBoost and LD-mistmatch diagnosis in [LD mismatch and LD-free Colocalization](https://statfungen.github.io/colocboost/articles/LD_Free_Colocalization.html)). +See more details about data format to implement LD-free ColocBoost and LD-mismatch diagnosis in [LD mismatch and LD-free Colocalization](https://statfungen.github.io/colocboost/articles/LD_Free_Colocalization.html). diff --git a/vignettes/announcements.Rmd b/vignettes/announcements.Rmd index 1e7bf8c..f31a850 100644 --- a/vignettes/announcements.Rmd +++ b/vignettes/announcements.Rmd @@ -14,4 +14,4 @@ vignette: > ## Software updates -- `v1.0.0` Initial public release. Pre-release versions `0.1.*` for reproducing our earlier work, including the first version of manuscript on biorxiv, are still available on [GitHub](https://github.com/StatFunGen/colocboost/tags). +- `v1.0.0` Initial public release. Pre-release versions `0.1.*` for reproducing our earlier work, including the first version of the manuscript on medRxiv, are still available on [GitHub](https://github.com/StatFunGen/colocboost/tags).