StatFunGen · xueweic · Apr 23, 2025 · Apr 23, 2025 · Apr 23, 2025
diff --git a/R/colocboost.R b/R/colocboost.R
@@ -3,7 +3,7 @@
 #' @title ColocBoost: A gradient boosting informed multi-omics xQTL colocalization method
 #'
 #' @description `colocboost` implements a proximity adaptive smoothing gradient boosting approach for multi-trait colocalization at gene loci,
-#'              accommodating multiple causal variants. This method, introduced by Cao et al. (2025), is particularly suited for scaling
+#'              accommodating multiple causal variants. This method, introduced by Cao etc. (2025), is particularly suited for scaling
 #'              to large datasets involving numerous molecular quantitative traits and disease traits.
 #'              In brief, this function fits a multiple linear regression model \eqn{Y = XB + E} in matrix form.
 #'              ColocBoost can be generally used in multi-task variable selection regression problem.
@@ -17,12 +17,12 @@
 #'          Each matrix should have column names, if sample sizes and variables possibly differing across matrices.
 #' @param Y A list of vectors of outcomes or an N by L matrix if it is considered for the same X and multiple outcomes.
 #' @param sumstat A list of data.frames of summary statistics.
-#'                  The coloumns of data.frame should include either \code{z} or \code{beta}/\code{sebeta}.
+#'                  The columns of data.frame should include either \code{z} or \code{beta}/\code{sebeta}.
 #'                  \code{n} is the sample size for the summary statistics, it is highly recommendation to provide.
 #'                  \code{variant} is required if sumstat for different outcomes do not have the same number of variables.
-#'                  \code{var_y} is the variance of phenotype (default is 1 meaning that the Y is in the \dQuote{standarized} scale).
+#'                  \code{var_y} is the variance of phenotype (default is 1 meaning that the Y is in the \dQuote{standardized} scale).
 #' @param LD A list of correlation matrix indicating the LD matrix for each genotype. It also could be a single matrix if all sumstats were
-#'           obtained from the same gentoypes.
+#'           obtained from the same genotypes.
 #' @param dict_YX A L by 2 matrix of dictionary for \code{X} and \code{Y} if there exist subsets of outcomes corresponding to the same X matrix.
 #'                  The first column should be 1:L for L outcomes. The second column should be the index of \code{X} corresponding to the outcome.
 #'                  The innovation: do not provide the same matrix in \code{X} to reduce the computational burden.
@@ -51,14 +51,14 @@
 #' @param jk_equiv_corr The LD cutoff between overall best update jk-star and marginal best update jk-l for lth outcome
 #' @param jk_equiv_loglik The change of loglikelihood cutoff between overall best update jk-star and marginal best update jk-l for lth outcome
 #' @param coloc_thresh The cutoff of checking if the best update jk-star is the potential causal variable for outcome l if jk-l is not similar to jk-star (used in Delayed SEC).
-#' @param lambda The ratio \[0,1\] for z^2 and z in fun_prior simplex, defult is 0.5
+#' @param lambda The ratio \[0,1\] for z^2 and z in fun_prior simplex, default is 0.5
 #' @param lambda_focal_outcome The ratio for z^2 and z in fun_prior simplex for the focal outcome, default is 1
 #' @param func_simplex The data-driven local association simplex \eqn{\delta} for smoothing the weights. Default is "LD_z2z" is the elastic net for z-score and also weighted by LD.
 #' @param func_multi_test The alternative method to check the stop criteria. When \code{func_multi_test = "lfdr"}, boosting iterations will be stopped
 #'                      if the local FDR for all variables are greater than \code{lfsr_max}.
 #' @param stop_null The cutoff of nominal p-value when \code{func_multi_test = "Z"}.
-#' @param multi_test_max The cutoff of the smallest FDR for pre-filtering the outcomes when \code{func_multi_test = "lfdr"} or \code{func_multi_test = "lfsr"}.
-#' @param multi_test_thresh The cutoff of the smallest FDR for stop criteria when \code{func_multi_test = "lfdr"} or \code{func_multi_test = "lfsr"}.
+#' @param multi_test_max The cutoff of the smallest FDR for stop criteria when \code{func_multi_test = "lfdr"} or \code{func_multi_test = "lfsr"}.
+#' @param multi_test_thresh The cutoff of the smallest FDR for pre-filtering the outcomes when \code{func_multi_test = "lfdr"} or \code{func_multi_test = "lfsr"}.
 #' @param ash_prior The prior distribution for calculating lfsr when \code{func_multi_test = "lfsr"}.
 #' @param p.adjust.methods The adjusted pvalue method in stats:p.adj  when \code{func_multi_test = "fdr"}
 #' @param residual_correlation The residual correlation based on the sample overlap, it is diagonal if it is NULL.
@@ -79,7 +79,7 @@
 #' @param tol A small, non-negative number specifying the convergence tolerance for checking the overlap of the variables in different sets.
 #' @param merge_cos When \code{merge_cos = TRUE}, the sets for only one outcome will be merged if passed the \code{median_cos_abs_corr}.
 #' @param sec_coverage_thresh A number between 0 and 1 specifying the weight in each SEC (default is 0.8).
-#' @param weight_fudge_factor The strenght to integrate weight from differnt outcomes, default is 1.5
+#' @param weight_fudge_factor The strength to integrate weight from different outcomes, default is 1.5
 #' @param check_null The cut off value for change conditional objective function. Default is 0.1.
 #' @param check_null_method The metric to check the null sets. Default is "profile"
 #' @param check_null_max The smallest value of change of profile loglikelihood for each outcome.
@@ -502,7 +502,7 @@ colocboost <- function(X = NULL, Y = NULL, # individual data
         z <- summstat_tmp[, "z"]
       }
       if (anyNA(z)) {
-        warning(paste("summary statistic dataset", i.sumstat, "contains NA values that are replaced with 0"))
+        warning(paste("summary statistic dataset", i.summstat, "contains NA values that are replaced with 0"))
         z[is.na(z)] <- 0
       }
 

diff --git a/R/colocboost_assemble_ucos.R b/R/colocboost_assemble_ucos.R
@@ -29,7 +29,7 @@ colocboost_assemble_ucos <- function(cb_obj_single,
       "evidence_strength" = NULL,
       "requested_coverage" = coverage
     )
-    pip_av <- rep(0, cc$P)
+    pip_av <- rep(0, cb_model_para$P)
   } else if (ncol(temp) == 1 | nrow(temp) == 1) {
     weights <- as.vector(weights)
     avWeight <- weights

diff --git a/R/colocboost_init.R b/R/colocboost_init.R
@@ -219,6 +219,7 @@ colocboost_init_para <- function(cb_data, cb_model, tau = 0.01,
                                  lambda_focal_outcome = 1,
                                  learning_rate_decay = 1,
                                  multi_test_thresh = 1,
+                                 multi_test_max = 1,
                                  func_multi_test = "lfdr",
                                  LD_free = FALSE,
                                  outcome_names = NULL,
@@ -296,7 +297,10 @@ colocboost_init_para <- function(cb_data, cb_model, tau = 0.01,
     "coveraged" = TRUE,
     "num_updates" = 0,
     "coveraged_outcome" = coveraged_outcome,
-    "num_updates_outcome" = num_updates_outcome
+    "num_updates_outcome" = num_updates_outcome,
+    "func_multi_test" = func_multi_test,
+    "multi_test_thresh" = multi_test_thresh,
+    "multi_test_max" = multi_test_max
   )
   class(cb_model_para) <- "colocboost"
 
@@ -599,7 +603,7 @@ process_sumstat <- function(Z, N, Var_y, SeBhat, ld_matrices, variant_lists, dic
         # var_y, shat (and bhat) are provided, so the effects are on the
         # *original scale*.
         adj <- 1 / (Z_extend^2 + current_n - 2)
-        if (!is.null(LD_tmp)) {
+        if (!is.null(ld_submatrix)) {
           XtXdiag <- Var_y[[i]] * adj / (SeBhat[[i]]^2)
           xtx <- t(ld_submatrix * sqrt(XtXdiag)) * sqrt(XtXdiag)
           tmp$XtX <- (xtx + t(xtx)) / 2

diff --git a/R/colocboost_output.R b/R/colocboost_output.R
@@ -6,7 +6,7 @@
 #' with or without the outcomes of interest.
 #'
 #' @param cb_output Output object from `colocboost` analysis
-#' @param summary_level When \code{summary_level = 1}, return basic sumamry table for colocalization results. See details in `get_ucos_summary` function when \code{summary_level = 2}.
+#' @param summary_level When \code{summary_level = 1}, return basic summary table for colocalization results. See details in `get_ucos_summary` function when \code{summary_level = 2}.
 #' @param outcome_names Optional vector of names of outcomes, which has the same order as Y in the original analysis.
 #' @param interest_outcome Optional vector specifying a subset of outcomes from \code{outcome_names} to focus on. When provided, only colocalization events that include at least one of these outcomes will be returned.
 #' @param region_name Optional character string. When provided, adds a column with this gene name to the output table for easier filtering in downstream analyses.
@@ -123,8 +123,8 @@ get_colocboost_summary <- function(cb_output,
 #' @param cb_output Output object from `colocboost` analysis
 #' @param cos_npc_cutoff Minimum threshold of normalized probability of colocalization (NPC) for CoS.
 #' @param npc_outcome_cutoff Minimum threshold of normalized probability of colocalized traits in each CoS.
-#' @param pvalue_cutoff Maximum threshold of margianl p-values of colocalized variants on colocalized traits in each CoS.
-#' @param weight_fudge_factor The strenght to integrate weight from differnt outcomes, default is 1.5
+#' @param pvalue_cutoff Maximum threshold of marginal p-values of colocalized variants on colocalized traits in each CoS.
+#' @param weight_fudge_factor The strength to integrate weight from different outcomes, default is 1.5
 #' @param coverage A number between 0 and 1 specifying the \dQuote{coverage} of the estimated colocalization confidence sets (CoS) (default is 0.95).
 #'
 #' @return A \code{"colocboost"} object with some or all of the following elements:
@@ -625,7 +625,7 @@ get_cos_summary <- function(cb_output,
     coloc_outcome <- lapply(cb_output$cos_details$cos_outcomes$outcome_index, function(idx) analysis_outcome[idx])
     coloc_sets <- cb_output$cos_details$cos$cos_index
     if (!is.null(cb_output$cos_warnings)) {
-      cos_warnings
+      message(cb_output$cos_warnings$warning_message)
     }
     vcp <- as.numeric(cb_output$vcp)
 
@@ -857,7 +857,7 @@ get_ucos_summary <- function(cb_output, outcome_names = NULL, region_name = NULL
   return(output_summary)
 }
 
-#' Extract CoS at different coverages
+#' Extract CoS at different coverage
 #' 
 #' @description `get_cos` extracts colocalization confidence sets (CoS) at different coverage levels 
 #' from ColocBoost results. When genotype data (X) or correlation matrix (Xcorr) is provided, it 

diff --git a/R/colocboost_plot.R b/R/colocboost_plot.R
@@ -36,7 +36,7 @@
 #' @param title_style Vector of two numbers for title style (size, boldness), default is c(2.5, 2)
 #' @param ... Additional parameters passed to `plot` functions
 #'
-#' @return Visualization plot for each colcoalization event.
+#' @return Visualization plot for each colocalization event.
 #'
 #' @examples
 #' # colocboost example
@@ -434,9 +434,9 @@ get_input_plot <- function(cb_output, plot_cos_idx = NULL,
     coloc_hits <- coloc_hits[select_cs]
   } else {
     if (cb_output$data_info$n_outcomes == 1) {
-      warnings("No fine-mapped causal effects in this region!")
+      warning("No fine-mapped causal effects in this region!")
     } else {
-      warnings("No colocalized effects in this region!")
+      warning("No colocalized effects in this region!")
     }
     ncos <- 0
     coloc_index <- select_cs <- NULL
@@ -536,12 +536,12 @@ get_input_plot <- function(cb_output, plot_cos_idx = NULL,
       cos_to_uncoloc <- coloc_cos
       cos_idx_to_uncoloc <- 1:length(coloc_index)
       if (is.null(show_cos_to_uncoloc_outcome)) {
-        warning("Show all CoSs to uncolocalized outcomes.")
+        message("Show all CoSs to uncolocalized outcomes.")
         outcome_to_uncoloc <- lapply(coloc_index, function(cidx) {
           setdiff(1:length(analysis_outcome), cidx)
         })
       } else {
-        warning("Show all CoSs to uncolocalized outcomes ", paste(show_cos_to_uncoloc_outcome, collapse = ","))
+        message("Show all CoSs to uncolocalized outcomes ", paste(show_cos_to_uncoloc_outcome, collapse = ","))
         outcome_to_uncoloc <- lapply(coloc_index, function(cidx) {
           setdiff(show_cos_to_uncoloc_outcome, cidx)
         })
@@ -558,7 +558,7 @@ get_input_plot <- function(cb_output, plot_cos_idx = NULL,
         cos_idx_to_uncoloc <- show_cos_to_uncoloc_idx
         cos_to_uncoloc <- coloc_cos[show_cos_to_uncoloc_idx]
         if (is.null(show_cos_to_uncoloc_outcome)) {
-          warning(
+          message(
             "Show the ordered ", paste(cos_idx_to_uncoloc, collapse = ","),
             " CoS for all uncolocalized outcomes."
           )
@@ -568,7 +568,7 @@ get_input_plot <- function(cb_output, plot_cos_idx = NULL,
             return(l)
           })
         } else {
-          warning(
+          message(
             "Show the ordered ", paste(cos_idx_to_uncoloc, collapse = ","),
             " CoS for outcomes ", paste(show_cos_to_uncoloc_outcome, collapse = ",")
           )

diff --git a/R/colocboost_update.R b/R/colocboost_update.R
@@ -175,13 +175,12 @@ boost_KL_delta <- function(z, ld_feature, adj_dep,
 }
 
 
-boost_check_stop <- function(cb_model, cb_model_para, pos_stop, stop_no_coverage,
-                             multi_test_max = 1) {
+boost_check_stop <- function(cb_model, cb_model_para, pos_stop, stop_no_coverage) {
   # - check the iteration for the stop outcome (pos_stop has the same jk with original data)
   iter_each <- sapply(pos_stop, function(i) {
     length(cb_model[[i]]$obj_path) - 1
   })
-  lfsr_each <- sapply(pos_stop, function(i) cb_model[[i]]$stop_null < multi_test_max)
+  lfsr_each <- sapply(pos_stop, function(i) cb_model[[i]]$stop_null < cb_model_para$multi_test_max)
   pos_need_more <- which(iter_each <= 10 & lfsr_each)
 
   # pos_need_more <- which(iter_each <= 10)
@@ -197,9 +196,9 @@ boost_check_stop <- function(cb_model, cb_model_para, pos_stop, stop_no_coverage
     cb_model_para$need_more <- update_need_more
     for (i in update_need_more) {
       cb_model[[i]]$stop_thresh <- cb_model[[i]]$stop_thresh * 0.5
-      if (stop_method == "Z") {
+      if (cb_model_para$func_multi_test == "Z") {
         cb_model[[i]]$stop_null <- cb_model[[i]]$stop_null - 0.1
-      } else if (stop_method == "lfsr" | stop_method == "lfdr") {
+      } else if (cb_model_para$func_multi_test == "lfsr" | cb_model_para$func_multi_test == "lfdr") {
         cb_model[[i]]$stop_null <- cb_model[[i]]$stop_null + 0.05
       }
       cb_model[[i]]$learning_rate_init <- cb_model[[i]]$learning_rate_init * 0.5

diff --git a/R/colocboost_workhorse.R b/R/colocboost_workhorse.R
@@ -58,6 +58,7 @@ colocboost_workhorse <- function(cb_data,
     lambda_focal_outcome = lambda_focal_outcome,
     learning_rate_decay = learning_rate_decay,
     multi_test_thresh = multi_test_thresh,
+    multi_test_max = multi_test_max,
     func_multi_test = func_multi_test,
     LD_free = LD_free,
     outcome_names = outcome_names,
@@ -194,9 +195,7 @@ colocboost_workhorse <- function(cb_data,
             cb_model_para$update_y <- cb_model_para$update_y
           } else {
             pos_stop <- which(stop) # which outcome reach the stop criterion
-            ttmp <- boost_check_stop(cb_model, cb_model_para, pos_stop, stop_no_coverage,
-              multi_test_max = multi_test_max
-            )
+            ttmp <- boost_check_stop(cb_model, cb_model_para, pos_stop, stop_no_coverage)
             cb_model_para <- ttmp$cb_model_para
             cb_model <- ttmp$cb_model
             # - if there is some outcomes need stop

diff --git a/R/data.R b/R/data.R
@@ -11,7 +11,7 @@
 #' }
 #' @source The Ind_5traits dataset contains 5 simulated phenotypes alongside corresponding genotype matrices.
 #' The dataset is specifically designed for evaluating and demonstrating the capabilities of ColocBoost in multi-trait colocalization analysis 
-#' with individual-level data. See Cao et. al. 2025 for details.
+#' with individual-level data. See Cao etc. 2025 for details.
 #' 
 #' @family colocboost_data
 "Ind_5traits"
@@ -29,7 +29,7 @@
 #' @source The Sumstat_5traits dataset contains 5 simulated summary statistics, 
 #' where it is directly derived from the Ind_5traits dataset using marginal association. 
 #' The dataset is specifically designed for evaluating and demonstrating the capabilities of ColocBoost 
-#' in multi-trait colocalization analysis with summary association data. See Cao et. al. 2025 for details. 
+#' in multi-trait colocalization analysis with summary association data. See Cao etc. 2025 for details. 
 #' 
 #' @family colocboost_data
 "Sumstat_5traits"
@@ -44,11 +44,11 @@
 #' \describe{
 #'   \item{X}{List of genotype matrices}
 #'   \item{Y}{List of traits}
-#'   \item{variant}{Incides of two causal variants}
+#'   \item{variant}{indices of two causal variants}
 #' }
 #' @source The Heterogeneous_Effect dataset contains 2 simulated phenotypes alongside corresponding genotype matrices.
 #' There are two causal variants, both of which have heterogeneous effects on two traits.
-#' See Figure 2b in Cao et. al. 2025 for details. 
+#' See Figure 2b in Cao etc. 2025 for details. 
 #' 
 #' @family colocboost_data
 "Heterogeneous_Effect"
@@ -63,17 +63,17 @@
 #' \describe{
 #'   \item{X}{List of genotype matrices}
 #'   \item{Y}{List of traits}
-#'   \item{variant}{Incides of two causal variants}
+#'   \item{variant}{indices of two causal variants}
 #' }
 #' @source The Weaker_GWAS_Effect dataset contains 2 simulated phenotypes alongside corresponding genotype matrices.
 #' There are two causal variants, one of which has a weaker effect on the focal trait compared to the other trait.
-#' See Figure 2b in Cao et. al. 2025 for details. 
+#' See Figure 2b in Cao etc. 2025 for details. 
 #' 
 #' @family colocboost_data
 "Weaker_GWAS_Effect"
 
 
-#' Individual level data for 2 traits and 2 causal variants, but the strongest margianl association is not causal
+#' Individual level data for 2 traits and 2 causal variants, but the strongest marginal association is not causal
 #'
 #' An example dataset with simulated genotypes and traits for 2 traits and 2 common causal variants, but the strongest marginal association is not causal variant.
 #'
@@ -82,11 +82,11 @@
 #' \describe{
 #'   \item{X}{List of genotype matrices}
 #'   \item{Y}{List of traits}
-#'   \item{variant}{Incides of two causal variants}
+#'   \item{variant}{indices of two causal variants}
 #' }
 #' @source The Non_Causal_Strongest_Marginal dataset contains 2 simulated phenotypes alongside corresponding genotype matrices.
 #' There are two causal variants, but the strongest marginal association is not a causal variant.
-#' See Figure 2b in Cao et. al. 2025 for details. 
+#' See Figure 2b in Cao etc. 2025 for details. 
 #' 
 #' @family colocboost_data
 "Non_Causal_Strongest_Marginal"

diff --git a/man/Heterogeneous_Effect.Rd b/man/Heterogeneous_Effect.Rd
diff --git a/man/Ind_5traits.Rd b/man/Ind_5traits.Rd