diff --git a/R/colocboost_output.R b/R/colocboost_output.R index 9ee1221..eda9e2a 100644 --- a/R/colocboost_output.R +++ b/R/colocboost_output.R @@ -794,9 +794,9 @@ get_model_info <- function(cb_obj, outcome_names = NULL) { model_coveraged <- cb_obj$cb_model_para$coveraged jk_update <- cb_obj$cb_model_para$real_update_jk outcome_proximity_obj <- lapply(cb_obj$cb_model, function(cb) cb$obj_path) - outcome_coupled_obj <- lapply(cb_obj$cb_model, function(cb) cb$obj_single) + outcome_coupled_best_update_obj <- lapply(cb_obj$cb_model, function(cb) cb$obj_single) outcome_profile_loglik <- lapply(cb_obj$cb_model, function(cb) cb$profile_loglike_each) - names(outcome_proximity_obj) <- names(outcome_coupled_obj) <- + names(outcome_proximity_obj) <- names(outcome_coupled_best_update_obj) <- names(outcome_profile_loglik) <- outcome_names ll <- list( "model_coveraged" = model_coveraged, @@ -804,7 +804,7 @@ get_model_info <- function(cb_obj, outcome_names = NULL) { "profile_loglik" = profile_loglik, "outcome_profile_loglik" = outcome_profile_loglik, "outcome_proximity_obj" = outcome_proximity_obj, - "outcome_coupled_obj" = outcome_coupled_obj, + "outcome_coupled_best_update_obj" = outcome_coupled_best_update_obj, "jk_update" = jk_update ) return(ll) diff --git a/R/colocboost_plot.R b/R/colocboost_plot.R index 716c1da..1087cdb 100644 --- a/R/colocboost_plot.R +++ b/R/colocboost_plot.R @@ -144,13 +144,18 @@ colocboost_plot <- function(cb_output, y = "log10p", # - begin plotting coloc_cos <- cb_plot_input$cos outcomes <- cb_plot_input$outcomes + if (length(y)==1) outcome_idx <- 1 if (is.null(outcome_idx)) { if (is.null(coloc_cos)) { # - no colocalized effects, draw all outcomes in this region if (length(cb_plot_input$outcomes) == 1) { message("There is no fine-mapped causal effect in this region!. Showing margianl for this outcome!") } else { - message("There is no colocalization in this region!. Showing margianl for all outcomes!") + if (length(y) == 1){ + message("There is no colocalization in this region!. Showing VCP = 0!") + } else { + message("There is no colocalization in this region!. Showing margianl for all outcomes!") + } } outcome_idx <- 1:length(y) } else { @@ -195,10 +200,12 @@ colocboost_plot <- function(cb_output, y = "log10p", } else { do.call(plot, args) } - mtext(outcomes[iy], - side = cb_plot_init$outcome_legend_pos, line = 0.2, adj = 0.5, - cex = cb_plot_init$outcome_legend_size, font = 1 - ) + if (length(y)!=1){ + mtext(outcomes[iy], + side = cb_plot_init$outcome_legend_pos, line = 0.2, adj = 0.5, + cex = cb_plot_init$outcome_legend_size, font = 1 + ) + } if (add_vertical) { for (iii in 1:length(add_vertical_idx)) { abline(v = add_vertical_idx[iii], col = "#E31A1C", lwd = 1.5, lty = "dashed") @@ -209,6 +216,7 @@ colocboost_plot <- function(cb_output, y = "log10p", if (!is.null(coloc_cos)) { n.coloc <- length(coloc_cos) coloc_index <- cb_plot_input$coloc_index + if (length(y)==1){coloc_index = lapply(coloc_index, function(i) 1 )} legend_text <- list(col = vector()) legend_text$col <- head(cb_plot_init$col, n.coloc) @@ -312,6 +320,7 @@ get_input_plot <- function(cb_output, plot_cos_idx = NULL, variables <- cb_output$data_info$variables Z <- cb_output$data_info$z coef <- cb_output$data_info$coef + vcp <- list(as.numeric(cb_output$vcp)) # if finemapping if (cb_output$data_info$n_outcomes == 1) { @@ -336,7 +345,7 @@ get_input_plot <- function(cb_output, plot_cos_idx = NULL, names(coloc_hits) <- names(coloc_cos) if (!is.null(coloc_cos)) { # extract vcp each outcome - vcp <- lapply(1:length(analysis_outcome), function(iy) { + cos_vcp <- lapply(1:length(analysis_outcome), function(iy) { pos <- which(sapply(coloc_index, function(idx) iy %in% idx)) if (length(pos) != 0) { w <- do.call(cbind, cb_output$cos_details$cos_vcp[pos]) @@ -372,8 +381,9 @@ get_input_plot <- function(cb_output, plot_cos_idx = NULL, } else { warnings("No colocalized effects in this region!") } - coloc_index <- NULL - vcp <- rep(0, cb_output$data_info$n_variables) + coloc_index <- select_cs <- NULL + vcp <- list(rep(0, cb_output$data_info$n_variables)) + cos_vcp <- lapply(1:length(analysis_outcome), function(iy) rep(0, cb_output$data_info$n_variables) ) } # extract x axis if (variant_coord) { @@ -398,6 +408,7 @@ get_input_plot <- function(cb_output, plot_cos_idx = NULL, "x" = x, "Zscores" = Z, "vcp" = vcp, + "cos_vcp" = cos_vcp, "coef" = coef, "cos" = coloc_cos, "cos_hits" = coloc_hits, @@ -528,18 +539,22 @@ plot_initial <- function(cb_plot_input, y = "log10p", } else if (y == "z_original") { plot_data <- cb_plot_input$Zscores ylab <- "Z score" + } else if (y == "cos_vcp") { + plot_data <- cb_plot_input$cos_vcp + ylab <- "CoS-specific VCP" + args$ylim <- c(0, 1) } else if (y == "vcp") { plot_data <- cb_plot_input$vcp - ylab <- "VCP" if (length(cb_plot_input$outcomes) == 1) { ylab <- "VPA" } + ylab <- "VCP" args$ylim <- c(0, 1) - } else if (y == "coef") { + }else if (y == "coef") { plot_data <- cb_plot_input$coef ylab <- "Coefficients" } else { - stop("Invalid y value! Choose from 'z' or 'z_original'") + stop("Invalid y value! Choose from 'log10p', 'z_original', 'vcp', 'coef', or 'cos_vcp'!") } if (!exists("xlab", args)) args$xlab <- "variables" if (!exists("ylab", args)) args$ylab <- ylab @@ -607,3 +622,4 @@ plot_initial <- function(cb_plot_input, y = "log10p", return(args) } + diff --git a/R/colocboost_workhorse.R b/R/colocboost_workhorse.R index e81928b..8be177e 100644 --- a/R/colocboost_workhorse.R +++ b/R/colocboost_workhorse.R @@ -80,7 +80,7 @@ colocboost_workhorse <- function(cb_data, # - if all outcomes do not have signals, STOP message(paste0( "Using multiple testing correction method: ", func_multi_test, - ". Stop ColocBoost since no outcomes ", focal_outcome_idx, " have association signals." + ". Stop ColocBoost since no outcomes ", focal_outcome_idx, "have association signals." )) } else { message(paste0( diff --git a/R/data.R b/R/data.R index a094387..8be1ab2 100644 --- a/R/data.R +++ b/R/data.R @@ -1,76 +1,92 @@ #' Individual level data for 5 traits #' -#' An example dataset with simulated genotypes and outcomes for 5 traits +#' An example dataset with simulated genotypes and traits for 5 traits #' #' @format ## `Ind_5traits` #' A list with 3 elements #' \describe{ #' \item{X}{List of genotype matrices} -#' \item{Y}{List of outcomes} -#' \item{TrueCausalVariants}{List of causal variants} +#' \item{Y}{List of traits} +#' \item{true_effect_variants}{List of causal variants} #' } -#' @source See Cao et. al. 2025 for details. TO-DO-LIST +#' @source The Ind_5traits dataset contains 5 simulated phenotypes alongside corresponding genotype matrices. +#' The dataset is specifically designed for evaluating and demonstrating the capabilities of ColocBoost in multi-trait colocalization analysis +#' with individual-level data. See Cao et. al. 2025 for details. +#' #' @family colocboost_data "Ind_5traits" #' Summary level data for 5 traits #' -#' An example dataset with simulated statistics and LD for 5 traits +#' An example dataset with simulated statistics for 5 traits #' #' @format ## `Sumstat_5traits` #' A list with 2 elements #' \describe{ #' \item{sumstat}{Summary statistics for 5 traits} -#' \item{TrueCausalVariants}{List of causal variants} +#' \item{true_effect_variants}{List of causal variants} #' } -#' @source See Cao et. al. 2025 for details. TO-DO-LIST +#' @source The Sumstat_5traits dataset contains 5 simulated summary statistics, +#' where it is directly derived from the Ind_5traits dataset using marginal association. +#' The dataset is specifically designed for evaluating and demonstrating the capabilities of ColocBoost +#' in multi-trait colocalization analysis with summary association data. See Cao et. al. 2025 for details. +#' #' @family colocboost_data "Sumstat_5traits" -#' Summary level data for 5 traits +#' Individual level data for 2 traits and 2 causal variants with heterogeneous effects #' -#' An example dataset with simulated statistics and LD for 5 traits +#' An example dataset with simulated genotypes and traits for 2 traits and 2 common causal variants with heterogeneous effects #' #' @format ## `Heterogeneous_Effect` #' A list with 3 elements #' \describe{ #' \item{X}{List of genotype matrices} -#' \item{Y}{List of outcomes} -#' \item{TrueCausalVariants}{List of causal variants} +#' \item{Y}{List of traits} +#' \item{variant}{Incides of two causal variants} #' } -#' @source See Cao et. al. 2025 for details. TO-DO-LIST +#' @source The Heterogeneous_Effect dataset contains 2 simulated phenotypes alongside corresponding genotype matrices. +#' There are two causal variants, both of which have heterogeneous effects on two traits. +#' See Figure 2b in Cao et. al. 2025 for details. +#' #' @family colocboost_data "Heterogeneous_Effect" -#' Summary level data for 5 traits +#' Individual level data for 2 traits and 2 causal variants with weaker effects for focal trait #' -#' An example dataset with simulated statistics and LD for 5 traits +#' An example dataset with simulated genotypes and traits for 2 traits and 2 common causal variants with heterogeneous effects #' #' @format ## `Weaker_GWAS_Effect` #' A list with 3 elements #' \describe{ #' \item{X}{List of genotype matrices} -#' \item{Y}{List of outcomes} -#' \item{TrueCausalVariants}{List of causal variants} +#' \item{Y}{List of traits} +#' \item{variant}{Incides of two causal variants} #' } -#' @source See Cao et. al. 2025 for details. TO-DO-LIST +#' @source The Weaker_GWAS_Effect dataset contains 2 simulated phenotypes alongside corresponding genotype matrices. +#' There are two causal variants, one of which has a weaker effect on the focal trait compared to the other trait. +#' See Figure 2b in Cao et. al. 2025 for details. +#' #' @family colocboost_data "Weaker_GWAS_Effect" -#' Summary level data for 5 traits +#' Individual level data for 2 traits and 2 causal variants, but the strongest margianl association is not causal #' -#' An example dataset with simulated statistics and LD for 5 traits +#' An example dataset with simulated genotypes and traits for 2 traits and 2 common causal variants, but the strongest marginal association is not causal variant. #' #' @format ## `Non_Causal_Strongest_Marginal` #' A list with 3 elements #' \describe{ #' \item{X}{List of genotype matrices} -#' \item{Y}{List of outcomes} -#' \item{TrueCausalVariants}{List of causal variants} +#' \item{Y}{List of traits} +#' \item{variant}{Incides of two causal variants} #' } -#' @source See Cao et. al. 2025 for details. TO-DO-LIST +#' @source The Non_Causal_Strongest_Marginal dataset contains 2 simulated phenotypes alongside corresponding genotype matrices. +#' There are two causal variants, but the strongest marginal association is not a causal variant. +#' See Figure 2b in Cao et. al. 2025 for details. +#' #' @family colocboost_data "Non_Causal_Strongest_Marginal" diff --git a/README.md b/README.md index 700796a..6669620 100644 --- a/README.md +++ b/README.md @@ -35,7 +35,7 @@ Learn how to perform colocalization analysis with step-by-step examples. For det If you use ColocBoost in your research, please cite: -> Cao X, Sun H, Feng R, Mazumder R, Najar CFB, Li YI, de Jager PL, Bennett D, The Alzheimer's Disease Functional Genomics Consortium, Dey KK, Wang G. (2025+). Integrative multi-omics QTL colocalization maps regulatory architecture in aging human brain. bioRxiv. [https://doi.org/](https://doi.org/) +> Cao X, Sun H, Feng R, Mazumder R, Najar CFB, Li YI, de Jager PL, Bennett D, The Alzheimer's Disease Functional Genomics Consortium, Dey KK, Wang G. (2025+). Integrative multi-omics QTL colocalization maps regulatory architecture in aging human brain. medRxiv. [https://doi.org/](https://doi.org/) ## License diff --git a/inst/CITATION b/inst/CITATION new file mode 100644 index 0000000..acdf746 --- /dev/null +++ b/inst/CITATION @@ -0,0 +1,27 @@ +citHeader("To cite colocboost in publications use:") + +citEntry( + entry = "Article", + title = "Integrative multi-omics QTL colocalization maps regulatory architecture in aging human brain", + author = c(person("Xuewei", "Cao"), + person("Haochen", "Sun"), + person("Ru", "Feng"), + person("Rajib", "Mazumder"), + person("Cristian F. B.", "Najar"), + person("Yang I.", "Li"), + person("Philip L.", "de Jager"), + person("David", "Bennett"), + person("The Alzheimer's Disease Functional Genomics Consortium"), + person("Kushal K.", "Dey"), + person("Guanghao", "Wang")), + journal = "medRxiv", + year = "2025", + note = "Preprint", + url = "https://doi.org/", + textVersion = paste( + "Cao X, Sun H, Feng R, Mazumder R, Najar CFB, Li YI, de Jager PL, Bennett D, The Alzheimer's Disease Functional Genomics Consortium, Dey KK, Wang G. (2025).", + "Integrative multi-omics QTL colocalization maps regulatory architecture in aging human brain.", + "medRxiv.", + "https://doi.org/[YOUR_DOI_HERE]" + ) +) \ No newline at end of file diff --git a/man/Heterogeneous_Effect.Rd b/man/Heterogeneous_Effect.Rd index 45ae8b2..d4b7d29 100644 --- a/man/Heterogeneous_Effect.Rd +++ b/man/Heterogeneous_Effect.Rd @@ -3,26 +3,28 @@ \docType{data} \name{Heterogeneous_Effect} \alias{Heterogeneous_Effect} -\title{Summary level data for 5 traits} +\title{Individual level data for 2 traits and 2 causal variants with heterogeneous effects} \format{ \subsection{\code{Heterogeneous_Effect}}{ A list with 3 elements \describe{ \item{X}{List of genotype matrices} -\item{Y}{List of outcomes} -\item{TrueCausalVariants}{List of causal variants} +\item{Y}{List of traits} +\item{variant}{Incides of two causal variants} } } } \source{ -See Cao et. al. 2025 for details. TO-DO-LIST +The Heterogeneous_Effect dataset contains 2 simulated phenotypes alongside corresponding genotype matrices. +There are two causal variants, both of which have heterogeneous effects on two traits. +See Figure 2b in Cao et. al. 2025 for details. } \usage{ Heterogeneous_Effect } \description{ -An example dataset with simulated statistics and LD for 5 traits +An example dataset with simulated genotypes and traits for 2 traits and 2 common causal variants with heterogeneous effects } \seealso{ Other colocboost_data: diff --git a/man/Ind_5traits.Rd b/man/Ind_5traits.Rd index 5d1e6f1..a1cc697 100644 --- a/man/Ind_5traits.Rd +++ b/man/Ind_5traits.Rd @@ -10,19 +10,21 @@ A list with 3 elements \describe{ \item{X}{List of genotype matrices} -\item{Y}{List of outcomes} -\item{TrueCausalVariants}{List of causal variants} +\item{Y}{List of traits} +\item{true_effect_variants}{List of causal variants} } } } \source{ -See Cao et. al. 2025 for details. TO-DO-LIST +The Ind_5traits dataset contains 5 simulated phenotypes alongside corresponding genotype matrices. +The dataset is specifically designed for evaluating and demonstrating the capabilities of ColocBoost in multi-trait colocalization analysis +with individual-level data. See Cao et. al. 2025 for details. } \usage{ Ind_5traits } \description{ -An example dataset with simulated genotypes and outcomes for 5 traits +An example dataset with simulated genotypes and traits for 5 traits } \seealso{ Other colocboost_data: diff --git a/man/Non_Causal_Strongest_Marginal.Rd b/man/Non_Causal_Strongest_Marginal.Rd index 8f6b848..7ecffa9 100644 --- a/man/Non_Causal_Strongest_Marginal.Rd +++ b/man/Non_Causal_Strongest_Marginal.Rd @@ -3,26 +3,28 @@ \docType{data} \name{Non_Causal_Strongest_Marginal} \alias{Non_Causal_Strongest_Marginal} -\title{Summary level data for 5 traits} +\title{Individual level data for 2 traits and 2 causal variants, but the strongest margianl association is not causal} \format{ \subsection{\code{Non_Causal_Strongest_Marginal}}{ A list with 3 elements \describe{ \item{X}{List of genotype matrices} -\item{Y}{List of outcomes} -\item{TrueCausalVariants}{List of causal variants} +\item{Y}{List of traits} +\item{variant}{Incides of two causal variants} } } } \source{ -See Cao et. al. 2025 for details. TO-DO-LIST +The Non_Causal_Strongest_Marginal dataset contains 2 simulated phenotypes alongside corresponding genotype matrices. +There are two causal variants, but the strongest marginal association is not a causal variant. +See Figure 2b in Cao et. al. 2025 for details. } \usage{ Non_Causal_Strongest_Marginal } \description{ -An example dataset with simulated statistics and LD for 5 traits +An example dataset with simulated genotypes and traits for 2 traits and 2 common causal variants, but the strongest marginal association is not causal variant. } \seealso{ Other colocboost_data: diff --git a/man/Sumstat_5traits.Rd b/man/Sumstat_5traits.Rd index 068b26e..bdf8ee5 100644 --- a/man/Sumstat_5traits.Rd +++ b/man/Sumstat_5traits.Rd @@ -10,18 +10,21 @@ A list with 2 elements \describe{ \item{sumstat}{Summary statistics for 5 traits} -\item{TrueCausalVariants}{List of causal variants} +\item{true_effect_variants}{List of causal variants} } } } \source{ -See Cao et. al. 2025 for details. TO-DO-LIST +The Sumstat_5traits dataset contains 5 simulated summary statistics, +where it is directly derived from the Ind_5traits dataset using marginal association. +The dataset is specifically designed for evaluating and demonstrating the capabilities of ColocBoost +in multi-trait colocalization analysis with summary association data. See Cao et. al. 2025 for details. } \usage{ Sumstat_5traits } \description{ -An example dataset with simulated statistics and LD for 5 traits +An example dataset with simulated statistics for 5 traits } \seealso{ Other colocboost_data: diff --git a/man/Weaker_GWAS_Effect.Rd b/man/Weaker_GWAS_Effect.Rd index 734a889..5819ada 100644 --- a/man/Weaker_GWAS_Effect.Rd +++ b/man/Weaker_GWAS_Effect.Rd @@ -3,26 +3,28 @@ \docType{data} \name{Weaker_GWAS_Effect} \alias{Weaker_GWAS_Effect} -\title{Summary level data for 5 traits} +\title{Individual level data for 2 traits and 2 causal variants with weaker effects for focal trait} \format{ \subsection{\code{Weaker_GWAS_Effect}}{ A list with 3 elements \describe{ \item{X}{List of genotype matrices} -\item{Y}{List of outcomes} -\item{TrueCausalVariants}{List of causal variants} +\item{Y}{List of traits} +\item{variant}{Incides of two causal variants} } } } \source{ -See Cao et. al. 2025 for details. TO-DO-LIST +The Weaker_GWAS_Effect dataset contains 2 simulated phenotypes alongside corresponding genotype matrices. +There are two causal variants, one of which has a weaker effect on the focal trait compared to the other trait. +See Figure 2b in Cao et. al. 2025 for details. } \usage{ Weaker_GWAS_Effect } \description{ -An example dataset with simulated statistics and LD for 5 traits +An example dataset with simulated genotypes and traits for 2 traits and 2 common causal variants with heterogeneous effects } \seealso{ Other colocboost_data: diff --git a/vignettes/.gitignore b/vignettes/.gitignore deleted file mode 100644 index 097b241..0000000 --- a/vignettes/.gitignore +++ /dev/null @@ -1,2 +0,0 @@ -*.html -*.R diff --git a/vignettes/Disease_Prioritized_Colocalization.Rmd b/vignettes/Disease_Prioritized_Colocalization.Rmd index 61bb78b..a97e29a 100644 --- a/vignettes/Disease_Prioritized_Colocalization.Rmd +++ b/vignettes/Disease_Prioritized_Colocalization.Rmd @@ -85,30 +85,51 @@ colocboost_plot(res) ### Results Interpretation -For comprehensive tutorials on result interpretation and advanced visualization techniques, please visit our documentation portal at FIXME (link). +For comprehensive tutorials on result interpretation and advanced visualization techniques, please visit our documentation portal +at [Visualization of ColocBoost Results](https://statfungen.github.io/colocboost/articles/Visualization_ColocBoost_Output.html) and +[Interpret ColocBoost Output](https://statfungen.github.io/colocboost/articles/Interpret_ColocBoost_Output.html). # 3. ColocBoost in disease-prioritized mode +Disease-prioritized mode of ColocBoost can -TO-DO-LIST +When integrating with GWAS data, to mitigate the risk of biasing early updates toward stronger xQTL signals in LD proximity—and +should in fact colocalize with—weaker signals from the disease trait of interest, +we also implement a disease-prioritized mode of ColocBoost as a soft prioritization strategy to colocalize variants with putative causal effects on the focal trait. + +To run the disease-prioritized mode, you need to specify the `focal_outcome_idx` argument in the `colocboost()` function. + +- `focal_outcome_idx` indicates the index of the focal trait in all traits of interest. +- Note that the `focal_outcome_idx` are counted from *individual level data* to *summmary statistics*. +Therefore, in this example, `focal_outcome_idx = 5` is used to indicate the index of the focal trait from (4 individual level traits) and (1 summary statistcs). ```{r disease-basic} # Run colocboost -res <- colocboost(X = X, Y = Y, sumstat = sumstat, LD = LD, focal_outcome_idx = 5) +res <- colocboost(X = X, Y = Y, + sumstat = sumstat, LD = LD, + focal_outcome_idx = 5) + +# Plotting the focal only results colocalization results +colocboost_plot(res, plot_focal_only = TRUE) +``` +Unlike the other existing methods, the disease-prioritized mode of ColocBoost only used a soft prioritization strategy. +Therefore, it does not only identify the colocalization of the focal trait, but also the colocalization across the other traits without the focal trait. +To extract all CoS and visualization of all colocalization results, you can use the following code: + +```{r all-basic} # Identified CoS res$cos_details$cos$cos_index -# Plotting results +# Plotting all results colocboost_plot(res) ``` +### Results Interpretation - -```{r disease-basic-focal} -# Plotting the focal only results -colocboost_plot(res, plot_focal_only = TRUE) -``` \ No newline at end of file +For comprehensive tutorials on result interpretation and advanced visualization techniques, please visit our documentation portal +at [Visualization of ColocBoost Results](https://statfungen.github.io/colocboost/articles/Visualization_ColocBoost_Output.html) and +[Interpret ColocBoost Output](https://statfungen.github.io/colocboost/articles/Interpret_ColocBoost_Output.html). \ No newline at end of file diff --git a/vignettes/Input_Data_Format.Rmd b/vignettes/Input_Data_Format.Rmd index 370fb80..c6c254b 100644 --- a/vignettes/Input_Data_Format.Rmd +++ b/vignettes/Input_Data_Format.Rmd @@ -104,5 +104,5 @@ For example, when anaylze L traits for the same P variants with the specified ef - `effect_n` (highly recommended) is either a scalar or a vector of sample sizes for estimating regression coefficients. - `LD` (optional) is LD matrix for the $P$ variants. If it is not provided, it will apply LD-free ColocBoost. -See more details about data format to implement LD-free ColocBoost and LD-mistmatch diagnosis in FIXME. +See more details about data format to implement LD-free ColocBoost and LD-mistmatch diagnosis in (LD mismatch diagnosis and LD-free analysis (avaiable soon)). diff --git a/vignettes/Interpret_ColocBoost_Output.Rmd b/vignettes/Interpret_ColocBoost_Output.Rmd index 9c0119e..925f73f 100644 --- a/vignettes/Interpret_ColocBoost_Output.Rmd +++ b/vignettes/Interpret_ColocBoost_Output.Rmd @@ -88,16 +88,147 @@ The output from `get_strong_colocalization` is the same as output from `colocboo ## 3. More details on ColocBoost output -TO-DO-LIST - The entire colocalization output from `colocboost` is stored in the `colocboost` object, which contains several components: -- `cos_summary`: A summary table for colocalization events. -- `vcp`: The variable colocalized probability for each variable. -- `cos_details`: A object with all information for colocalization results. -- `data_info`: A object with detailed information from input data. -- `model_info`: A object with detailed information for colocboost model. +- **`cos_summary`**: A summary table for colocalization events (see details in above Section 1). +- **`vcp`**: The variable colocalized probability for each variable. +- **`cos_details`**: A object with all information for colocalization results. +- **`data_info`**: A object with detailed information from input data. +- **`model_info`**: A object with detailed information for colocboost model. + +In this section, we will provide a detailed explanation of the components for deepening into ColocBoost result using a mixed dataset. + +```{r load-mixed-data} +# Load example data +# Load example data +data(Ind_5traits) +data(Sumstat_5traits) +# Create a mixed dataset +X <- Ind_5traits$X[1:4] +Y <- Ind_5traits$Y[1:4] +sumstat <- Sumstat_5traits$sumstat[5] +LD <- get_cormat(Ind_5traits$X[[1]]) +# Run colocboost +res <- colocboost(X = X, Y = Y, sumstat = sumstat, LD = LD) +``` + +### 3.1. Variant colocalization probability (**`vcp`**) + +- **`vcp`** is the probability of a variant being colocalized with at least one traits, serving as analogs of posterior inclusion probabilities (PIPs) in single-trait fine-mapping. +To plot the VCP for the variants with in at least one CoS, you can use the `colocboost_plot` function with the `y` argument set to `"vcp"`. + + +```{r vcp-plot} +colocboost_plot(res, y = "vcp") +``` + +Please visit our documentation portal +at [Visualization of ColocBoost Results](https://statfungen.github.io/colocboost/articles/Visualization_ColocBoost_Output.html) for more details on the `colocboost_plot` function + +### 3.2. Analyzed data information (**`data_info`**) + +- **`n_variables`**: number of variants being included. +- **`variables`**: vector of variant names across all traits being included in colocalization analysis. +- **`coef`**: regression coefficients estimated from the colocboost model for each trait. +- **`z`**: z-scores from marginal associations for each trait. +- **`n_outcomes`**: the number of traits being included in colocalization analysis. +- **`outcome_info`** contains information of analyzed data, including sample size and data type. + +```{r analyzed-data-info} +res$data_info$outcome_info +``` + + +### 3.3. Colocalization details (**`cos_details`**) + +**`cos_details`** provides detailed information for colocalization evetns identified by `colocboost`. +This section will provide a detailed explanation of the components in `cos_details`. + +#### 3.3.1. Colocalized variants for each CoS (**`cos`**) + +- **`cos`**: A list with detailed information of colocalized variants for each CoS. + - **`cos_index`**: Indices of colocalized variables with unique identifier for each CoS. + - **`cos_variables`**: Names of colocalized variables with unique identifier for each CoS. +- Note that variants are ordered by their VCP in descending order. + +```{r cos} +res$cos_details$cos +``` + +#### 3.3.2. Colocalized traits for each 95% CoS (**`cos_outcomes`**) +- **`cos_outcomes`**: A list with detailed information of colocalized traits for each CoS. + - **`outcome_index`**: Indices of colocalized traits with unique identifier for each CoS. + - **`outcome_name`**: Names of colocalized traits with unique identifier for each CoS. + +```{r cos-outcome} +res$cos_details$cos_outcomes +``` + +- **`cos_npc`**: normalized probability of colocalization for CoS, providing empirical evidence in favor of colocalization over a trait-specific configuration. +- **`cos_outcomes_npc`**: normalized probability for each colocalized trait in order with evidence strength. +- These two metrices could be used to filter the colocalization events by relative strength of evidence, see details in Section 2. + + +```{r cos-npc} +res$cos_details$cos_npc +res$cos_details$cos_outcomes_npc +``` +- **`cos_purity`**: includes three lists, for each list, it contains $S \times S$ matrix, where $S$ is the number of CoS. + - `min_abs_cor`: the minimum absolute correlation of variants within (diagonal) CoS or in-between (off-diagonal) different CoS. + - `median_abs_cor`: the median absolute correlation of variants within (diagonal) CoS or in-between (off-diagonal) different CoS. + - `max_abs_cor`: the maximum absolute correlation of variants within (diagonal) CoS or in-between (off-diagonal) different CoS. + +```{r cos-purity} +res$cos_details$cos_purity +``` + + +- **`cos_top_variables`**: indicies and names of the top variant for each CoS, which is the variant with the highest VCP. +- Note that there may exist multiple variants in perfect LD with the same highest VCP. + +```{r cos-top} +res$cos_details$cos_top_variables +``` + +- **`cos_weights`**: the integrative weights for each colocalized trait in the CoS. This is used to re-calibrate CoS when some traits are filtered out.. +- **`cos_vcp`**: the single-effect VCP for each CoS. + + +### 3.4. Model information (**`model_info`**) + +- **`model_coveraged`**: if the model is converged. +- **`n_updates`**: number of boosting rounds +- **`jk_update`**: indices of the variants being updated in the model at each boosting round. + +```{r jk_update} +# Pick arbitrary SEC updates, see entire update in advance +res$model_info$jk_update[c(5:10,36:38), ] +``` + +- **`profile_loglik`**: joint profile log-likelihood changes over boosting rounds. +- **`outcome_profile_loglik`**: trait-specifc profile log-likelihood changes over boosting rounds. + +```{r profile_loglik} +par(mfrow=c(2,3),mar=c(4,4,2,1)) +plot(res$model_info$profile_loglik, type="p", col="#3366CC", lwd=2, xlab="", ylab="Joint Profile") +for(i in 1:5){ +plot(res$model_info$outcome_profile_loglik[[i]], type="p", col="#CC3333", lwd=2, xlab="", ylab=paste0("Profile (Trait ", i, ")")) +} +``` + +- **`outcome_proximity_obj`**: trait-specific proximity smoothed objective for each trait. +- **`outcome_coupled_best_update_obj`**: objective at the (coupled) best update variant for each outcome. + +```{r objetive} +par(mfrow=c(2,5), mar=c(4,4,2,1)) +plot(res$model_info$outcome_proximity_obj[[1]], type="p", col="#3366CC", lwd=2, xlab="", ylab="Trait-specific Objective", main = paste0("Trait ", 1)) +for(i in 2:5){ +plot(res$model_info$outcome_proximity_obj[[i]], type="p", col="#3366CC", lwd=2, xlab="", ylab="", main = paste0("Trait ", i)) +} +plot(res$model_info$outcome_coupled_best_update_obj[[1]], type="p", col="#CC3333", lwd=2, xlab="", ylab=paste0("Objetive at best update variant")) +for(i in 2:5){ plot(res$model_info$outcome_coupled_best_update_obj[[i]], type="p", col="#CC3333", lwd=2, xlab="", ylab="") } +``` diff --git a/vignettes/Partial_Overlap_Variants.Rmd b/vignettes/Partial_Overlap_Variants.Rmd index e41b95c..91c2346 100644 --- a/vignettes/Partial_Overlap_Variants.Rmd +++ b/vignettes/Partial_Overlap_Variants.Rmd @@ -24,4 +24,64 @@ library(colocboost) ![](../man/figures/missing_representation.png) -TO-DO-LIST \ No newline at end of file + +### Causal variant structure + +We create an example data from `Ind_5traits` with two causal variants, 644 and 2289, but each of them are only partially overlapping across traits. + +- Causal variant 644 is associated with traits 1, 3, and 4, but is missing in trait 2. +- Causal variant 2289 is associated with traits 2 and 3, but is missing in trait 5. + +This structure creates a realistic scenario where multiple traits from different datasets are not fully overlapping, and the causal variants are not shared across all traits. + +```{r make-data} +# Load example data +data(Ind_5traits) +X <- Ind_5traits$X +Y <- Ind_5traits$Y + +# Creeate causal variants with potentially LD proxies +causal_1 <- c(200:1000) +causal_2 <- c(2000:2600) + +# Create missing data +X[[2]] <- X[[2]][, -causal_1, drop = FALSE] +X[[3]] <- X[[3]][, -causal_2, drop = FALSE] + +# Show format +X[[2]][1:2, 1:10] +X[[3]][1:2, 1:10] +``` + + +To run ColocBoost to the different genotypes with different causal variants, the variant names should be provided as the column names of the `X` matrices. +Otherwise, the `colocboost` function will not be able to identify the variants correctly from different genotype matrices, +and the analysis will fail with an error message `Please verify the variable names across different outcomes.`. + +```{r run-code} +# Run colocboost +res <- colocboost(X = X, Y = Y) + +# The number of variants in the analysis +res$data_info$n_variables + +# Plotting the results +colocboost_plot(res) +``` + + +If we perform colocalization analysis using only overlapping variables, we may fail to detect any colocalization events. +This is because the causal variants, which are only partially overlapping across traits, are excluded during the preprocessing step. +As a result, even though these variants are associated with some traits, they are removed from the analysis, leading to a loss of critical information. +This highlights the importance of handling partial overlaps effectively to ensure that meaningful colocalization signals are not missed. + +```{r run-code-overlap} +# Run colocboost with only overlapping variables +res <- colocboost(X = X, Y = Y, overlap_variables = TRUE) + +# The number of variants in the analysis +res$data_info$n_variables + +# Plotting the results +colocboost_plot(res) +``` diff --git a/vignettes/Summary_Statistics_Colocalization.Rmd b/vignettes/Summary_Statistics_Colocalization.Rmd index 4d1dbe1..7da663a 100644 --- a/vignettes/Summary_Statistics_Colocalization.Rmd +++ b/vignettes/Summary_Statistics_Colocalization.Rmd @@ -63,7 +63,7 @@ head(Sumstat_5traits$sumstat[[1]]) The preferred format for colocalization analysis in ColocBoost using summary statistics data is where one LD matrix is provided for all traits, -and the summary statistics are organized in a list. The **Basic format** us +and the summary statistics are organized in a list. The **Basic format** is - `sumstat` is organized as a list of data.frames for all traits - `LD` is a matrix of linkage disequilibrium (LD) information for all variants across all traits. diff --git a/vignettes/Visualization_ColocBoost_Output.Rmd b/vignettes/Visualization_ColocBoost_Output.Rmd index 3ddd617..779e534 100644 --- a/vignettes/Visualization_ColocBoost_Output.Rmd +++ b/vignettes/Visualization_ColocBoost_Output.Rmd @@ -50,8 +50,9 @@ colocboost_plot(res) - `plot_cols = 2` (default) indicates the number of columns in the plot. - `y = "log10p"` (default) with optional - `y = "z_original"` for z-scores - - `y = "vcp"` for variant colocalization probabilities, + - `y = "vcp"` for variant colocalization probabilities (single plot for all variants), - `y = "coef"` for regression coefficients estimated from the colocboost model. + - `y = "cos_vcp"` for variant colocalization probabilities (multiple plots for each CoS - only draw VCP for variants in CoS to the colocalized traits). - `plot_cos_idx = NULL` (default) indicates all colocalization events are plotted. `plot_cos_idx = 1` can be specified to plot the 1st colocalization event, and so on. - `outcome_idx = NULL` (default) indicates only the traits with colocalization are plotted. `outcome_idx = c(1,2,5)` can be specified to plot the traits 1, 2, and 5. - `cos_color = NULL` (default) indicates the colors of the colocalization events. Specify a vector of colors to customize the plot. diff --git a/vignettes/announcements.Rmd b/vignettes/announcements.Rmd index aea06a7..e5671ae 100644 --- a/vignettes/announcements.Rmd +++ b/vignettes/announcements.Rmd @@ -10,7 +10,7 @@ vignette: > ## ColocBoost on the media -- *April 17, 2025*: Manuscript describing ColocBoost methods with applications is posted on [biorxiv](). +- *April 17, 2025*: Manuscript describing ColocBoost methods with applications is posted on [medRxiv](). ## Software updates