diff --git a/R/colocboost_plot.R b/R/colocboost_plot.R index 4ee7f3b..962d12f 100644 --- a/R/colocboost_plot.R +++ b/R/colocboost_plot.R @@ -11,7 +11,7 @@ #' @param outcome_idx Optional indices of outcomes to include in the plot. \code{outcome_idx=NULL} to plot only the outcomes having colocalization. #' @param plot_all_outcome Optional to plot all outcome in the same figure. #' @param plot_focal_only Logical, if TRUE only plots colocalization with focal outcome, default is FALSE. -#' @param plot_focal_cos_outocme_only Logical, if TRUE only plots colocalization including at least on colocalized outcome with focal outcome, default is FALSE. +#' @param plot_focal_cos_outcome_only Logical, if TRUE only plots colocalization including at least on colocalized outcome with focal outcome, default is FALSE. #' @param points_color Background color for non-colocalized variables, default is "grey80". #' @param cos_color Optional custom colors for CoS. #' @param add_vertical Logical, if TRUE adds vertical lines at specified positions, default is FALSE @@ -25,7 +25,7 @@ #' @param show_cos_to_uncoloc_outcome Optional outcomes for showing CoS to uncolocalized outcomes #' @param plot_ucos Logical, if TRUE plots also trait-specific (uncolocalized) sets , default is FALSE #' @param plot_ucos_idx Optional indices of trait-specific (uncolocalized) sets to plot when included -#' @param gene_name Optional gene name to display in plot title +#' @param title_specific Optional specific title to display in plot title #' @param ylim_each Logical, if TRUE uses separate y-axis limits for each plot, default is TRUE #' @param outcome_legend_pos Position for outcome legend, default is "top" #' @param outcome_legend_size Size for outcome legend text, default is 1.2 @@ -75,7 +75,7 @@ colocboost_plot <- function(cb_output, y = "log10p", outcome_idx = NULL, plot_all_outcome = FALSE, plot_focal_only = FALSE, - plot_focal_cos_outocme_only = FALSE, + plot_focal_cos_outcome_only = FALSE, points_color = "grey80", cos_color = NULL, add_vertical = FALSE, @@ -89,7 +89,7 @@ colocboost_plot <- function(cb_output, y = "log10p", show_cos_to_uncoloc_outcome = NULL, plot_ucos = FALSE, plot_ucos_idx = NULL, - gene_name = NULL, + title_specific = NULL, ylim_each = TRUE, outcome_legend_pos = "top", outcome_legend_size = 1.8, @@ -109,7 +109,7 @@ colocboost_plot <- function(cb_output, y = "log10p", variant_coord = variant_coord, outcome_names = outcome_names, plot_focal_only = plot_focal_only, - plot_focal_cos_outocme_only = plot_focal_cos_outocme_only, + plot_focal_cos_outcome_only = plot_focal_cos_outcome_only, show_cos_to_uncoloc = show_cos_to_uncoloc, show_cos_to_uncoloc_idx = show_cos_to_uncoloc_idx, show_cos_to_uncoloc_outcome = show_cos_to_uncoloc_outcome, @@ -118,7 +118,7 @@ colocboost_plot <- function(cb_output, y = "log10p", # get initial set up of plot cb_plot_init <- plot_initial(cb_plot_input, y = y, points_color = points_color, cos_color = cos_color, - ylim_each = ylim_each, gene_name = gene_name, + ylim_each = ylim_each, title_specific = title_specific, outcome_legend_pos = outcome_legend_pos, outcome_legend_size = outcome_legend_size, cos_legend_pos = cos_legend_pos, show_variable = show_variable, lab_style = lab_style, axis_style = axis_style, @@ -330,7 +330,7 @@ get_input_plot <- function(cb_output, plot_cos_idx = NULL, variant_coord = FALSE, outcome_names = NULL, plot_focal_only = FALSE, - plot_focal_cos_outocme_only = FALSE, + plot_focal_cos_outcome_only = FALSE, show_cos_to_uncoloc = FALSE, show_cos_to_uncoloc_idx = NULL, show_cos_to_uncoloc_outcome = NULL, @@ -411,12 +411,12 @@ get_input_plot <- function(cb_output, plot_cos_idx = NULL, } select_cs <- plot_cos_idx } else { - if (plot_focal_only || plot_focal_cos_outocme_only) { + if (plot_focal_only || plot_focal_cos_outcome_only) { if (sum(if_focal) == 0) { message("No focal CoS, draw all CoS.") } else if (plot_focal_only) { select_cs <- which(if_focal) - } else { # plot_focal_cos_outocme_only is true here + } else { # plot_focal_cos_outcome_only is true here # Get all outcomes colocalized with focal CoS focal_outcomes <- unique(unlist(coloc_index[if_focal])) # Find CoS that include at least one of these focal outcomes @@ -488,7 +488,7 @@ get_input_plot <- function(cb_output, plot_cos_idx = NULL, stop("Please check plot_ucos_idx!") } select_ucos <- plot_ucos_idx - } else if (plot_focal_cos_outocme_only && sum(if_focal) != 0) { + } else if (plot_focal_cos_outcome_only && sum(if_focal) != 0) { # Get all outcomes colocalized with focal CoS focal_outcomes <- unique(unlist(plot_input$coloc_index)) # Find uCoS that include at least one of these focal outcomes @@ -597,7 +597,7 @@ get_input_plot <- function(cb_output, plot_cos_idx = NULL, #' @importFrom stats pnorm plot_initial <- function(cb_plot_input, y = "log10p", points_color = "grey80", cos_color = NULL, - ylim_each = TRUE, gene_name = NULL, + ylim_each = TRUE, title_specific = NULL, outcome_legend_size = 1.5, outcome_legend_pos = "right", cos_legend_pos = "bottomleft", @@ -664,7 +664,7 @@ plot_initial <- function(cb_plot_input, y = "log10p", args$lab_face <- lab_style[2] # - set title format - args$title <- gene_name + args$title <- title_specific args$title_size <- as.numeric(title_style[1]) args$title_face <- title_style[2] diff --git a/README.md b/README.md index 6669620..8c441c3 100644 --- a/README.md +++ b/README.md @@ -35,7 +35,7 @@ Learn how to perform colocalization analysis with step-by-step examples. For det If you use ColocBoost in your research, please cite: -> Cao X, Sun H, Feng R, Mazumder R, Najar CFB, Li YI, de Jager PL, Bennett D, The Alzheimer's Disease Functional Genomics Consortium, Dey KK, Wang G. (2025+). Integrative multi-omics QTL colocalization maps regulatory architecture in aging human brain. medRxiv. [https://doi.org/](https://doi.org/) +> Cao X, Sun H, Feng R, Mazumder R, Najar CFB, Li YI, de Jager PL, Bennett D, The Alzheimer's Disease Functional Genomics Consortium, Dey KK, Wang G. (2025+). Integrative multi-omics QTL colocalization maps regulatory architecture in aging human brain. medRxiv. [https://doi.org/10.1101/2025.04.17.25326042](https://doi.org/10.1101/2025.04.17.25326042) ## License diff --git a/_pkgdown.yml b/_pkgdown.yml index d4a3c28..5354be6 100644 --- a/_pkgdown.yml +++ b/_pkgdown.yml @@ -18,18 +18,26 @@ navbar: href: https://github.com/StatFunGen/colocboost articles: - - title: Vignettes + - title: Performing Colocalization using ColocBoost + desc: "Tutorials on how to perform multi-trait colocalization analysis using ColocBoost with flexible input data formats." contents: - Input_Data_Format - Individual_Level_Colocalization - Summary_Statistics_Colocalization - Disease_Prioritized_Colocalization + + - title: Interpretation and Visualization + desc: "Tutorials on how to interpret and visualize the output from ColocBoost." + contents: - Interpret_ColocBoost_Output - Visualization_ColocBoost_Output + + - title: Advanced Topics + desc: "Advanced topics and special cases in colocalization and fine-mapping analysis." + contents: - Partial_Overlap_Variants - ColocBoost_Wrapper_Pipeline - LD_Free_Colocalization - - ColocBoost_Diagnostics - FineBoost_Special_Case - title: internal @@ -37,25 +45,26 @@ articles: - announcements - installation - Pairwise_Colocalization + - ColocBoost_Diagnostics reference: - title: "Example Data" - desc: "Example datasets for demonstration and testing" + desc: "Example datasets for demonstration and testing." contents: - has_concept("colocboost_data") - title: "Model fitting" - desc: "Functions for fitting colocalization models" + desc: "Main interface function for fitting multi-trait colocalization model." contents: - has_concept("colocboost") - title: "Inference and summary" - desc: "Functions for inference and summary from fitted models" + desc: "Functions for inference and summary from fitted model." contents: - has_concept("colocboost_inference") - title: "Visualization" - desc: "Functions for visualizing ColocBoost results" + desc: "Functions for visualizing ColocBoost result." contents: - has_concept("colocboost_plot") diff --git a/inst/CITATION b/inst/CITATION index acdf746..34719dc 100644 --- a/inst/CITATION +++ b/inst/CITATION @@ -17,11 +17,11 @@ citEntry( journal = "medRxiv", year = "2025", note = "Preprint", - url = "https://doi.org/", + url = "https://doi.org/10.1101/2025.04.17.25326042", textVersion = paste( "Cao X, Sun H, Feng R, Mazumder R, Najar CFB, Li YI, de Jager PL, Bennett D, The Alzheimer's Disease Functional Genomics Consortium, Dey KK, Wang G. (2025).", "Integrative multi-omics QTL colocalization maps regulatory architecture in aging human brain.", "medRxiv.", - "https://doi.org/[YOUR_DOI_HERE]" + "https://doi.org/10.1101/2025.04.17.25326042" ) ) \ No newline at end of file diff --git a/man/colocboost_plot.Rd b/man/colocboost_plot.Rd index 156178b..7636864 100644 --- a/man/colocboost_plot.Rd +++ b/man/colocboost_plot.Rd @@ -15,7 +15,7 @@ colocboost_plot( outcome_idx = NULL, plot_all_outcome = FALSE, plot_focal_only = FALSE, - plot_focal_cos_outocme_only = FALSE, + plot_focal_cos_outcome_only = FALSE, points_color = "grey80", cos_color = NULL, add_vertical = FALSE, @@ -29,7 +29,7 @@ colocboost_plot( show_cos_to_uncoloc_outcome = NULL, plot_ucos = FALSE, plot_ucos_idx = NULL, - gene_name = NULL, + title_specific = NULL, ylim_each = TRUE, outcome_legend_pos = "top", outcome_legend_size = 1.8, @@ -56,7 +56,7 @@ colocboost_plot( \item{plot_focal_only}{Logical, if TRUE only plots colocalization with focal outcome, default is FALSE.} -\item{plot_focal_cos_outocme_only}{Logical, if TRUE only plots colocalization including at least on colocalized outcome with focal outcome, default is FALSE.} +\item{plot_focal_cos_outcome_only}{Logical, if TRUE only plots colocalization including at least on colocalized outcome with focal outcome, default is FALSE.} \item{points_color}{Background color for non-colocalized variables, default is "grey80".} @@ -84,7 +84,7 @@ colocboost_plot( \item{plot_ucos_idx}{Optional indices of trait-specific (uncolocalized) sets to plot when included} -\item{gene_name}{Optional gene name to display in plot title} +\item{title_specific}{Optional specific title to display in plot title} \item{ylim_each}{Logical, if TRUE uses separate y-axis limits for each plot, default is TRUE} diff --git a/tests/testthat/test_plot.R b/tests/testthat/test_plot.R index 0331c3f..396d610 100644 --- a/tests/testthat/test_plot.R +++ b/tests/testthat/test_plot.R @@ -177,8 +177,8 @@ test_that("colocboost_plot handles layout options", { # When ylim_each is FALSE, we need to provide a ylim parameter expect_error(suppressWarnings(colocboost_plot(cb_res, ylim_each = FALSE, ylim = c(0, 10))), NA) - # Test with gene_name option - expect_error(suppressWarnings(colocboost_plot(cb_res, gene_name = "BRCA1")), NA) + # Test with title_specific option + expect_error(suppressWarnings(colocboost_plot(cb_res, title_specific = "BRCA1")), NA) # Test with variant_coord option expect_error(suppressWarnings(colocboost_plot(cb_res, variant_coord = FALSE)), NA) @@ -305,8 +305,8 @@ test_that("colocboost_plot handles focal outcome in complex cases", { # Test plot_focal_only option expect_error(suppressWarnings(colocboost_plot(cb_res_focal, plot_focal_only = TRUE)), NA) - # Test plot_focal_cos_outocme_only option - expect_error(suppressWarnings(colocboost_plot(cb_res_focal, plot_focal_cos_outocme_only = TRUE)), NA) + # Test plot_focal_cos_outcome_only option + expect_error(suppressWarnings(colocboost_plot(cb_res_focal, plot_focal_cos_outcome_only = TRUE)), NA) # Combine focal outcome filtering with other options expect_error(suppressWarnings(colocboost_plot(cb_res_focal, @@ -314,12 +314,12 @@ test_that("colocboost_plot handles focal outcome in complex cases", { y = "cos_vcp")), NA) expect_error(suppressWarnings(colocboost_plot(cb_res_focal, - plot_focal_cos_outocme_only = TRUE, + plot_focal_cos_outcome_only = TRUE, plot_ucos = TRUE)), NA) # Test focusing only on outcomes colocalized with focal outcome expect_error(suppressWarnings(colocboost_plot(cb_res_focal, - plot_focal_cos_outocme_only = TRUE, + plot_focal_cos_outcome_only = TRUE, outcome_idx = 1:3)), NA) }) diff --git a/vignettes/ColocBoost_Diagnostics.Rmd b/vignettes/ColocBoost_Diagnostics.Rmd index da9d886..0344b32 100644 --- a/vignettes/ColocBoost_Diagnostics.Rmd +++ b/vignettes/ColocBoost_Diagnostics.Rmd @@ -32,7 +32,7 @@ names(res) - **`cb_model_para`**: parameters used in fitting ColocBoost model. -```{r cb-model} +```{r cb-model-para} names(res$diagnostic_details$cb_model_para) ``` @@ -40,9 +40,9 @@ names(res$diagnostic_details$cb_model_para) # 2. Diagnostic details of the model fitting - **`cb_model`**: trait-specific proximity gradient boosting model, including proximity weight at each iteration, residual after gradient boosting, et al. -- **`weights_paths``**: individual trait-specific weights for each iteration. +- **`weights_paths`**: individual trait-specific weights for each iteration. -```{r cb-model-para} +```{r cb-model} names(res$diagnostic_details$cb_model) names(res$diagnostic_details$cb_model$ind_outcome_1) ``` diff --git a/vignettes/ColocBoost_Wrapper_Pipeline.Rmd b/vignettes/ColocBoost_Wrapper_Pipeline.Rmd index 412641c..4d42e14 100644 --- a/vignettes/ColocBoost_Wrapper_Pipeline.Rmd +++ b/vignettes/ColocBoost_Wrapper_Pipeline.Rmd @@ -15,30 +15,31 @@ knitr::opts_chunk$set( ``` -This vignette demonstrates how to use the ColocBoost wrapper pipeline to perform colocalization analysis using `colocboost`. -See more details about functions in package `pecotmr` with [link](https://github.com/StatFunGen/pecotmr/tree/main) and +This vignette demonstrates how to use the ColocBoost wrapper pipeline to perform colocalization analysis with `colocboost`. + +- See more details about functions in the package `pecotmr` with [link](https://github.com/StatFunGen/pecotmr/tree/main) and `colocboost_pipeline` with [link](https://github.com/StatFunGen/pecotmr/blob/main/R/colocboost_pipeline.R). -See more etails about input data preparation in `xqtl_protocal` with [https://statfungen.github.io/xqtl-protocol/code/mnm_analysis/mnm_methods/colocboost.html]. +- See more details about input data preparation in `xqtl_protocol` with [link](https://statfungen.github.io/xqtl-protocol/code/mnm_analysis/mnm_methods/colocboost.html). -# 1. Loading Data use `colocboost_analysis_pipeline` function +# 1. Loading Data using `colocboost_analysis_pipeline` function This function harmonizes the input data and prepares it for colocalization analysis. In this section, we introduce how to load the regional data required for the ColocBoost analysis using the `load_multitask_regional_data` function. -This function loads a mixture data sets for a specific region, including individual-level data (genotype, phenotype, covariate data) -or summary statistics (sumstats, LD). Run \code{load_regional_univariate_data} and \code{load_rss_data} multiple times for different datasets +This function loads mixed datasets for a specific region, including individual-level data (genotype, phenotype, covariate data) +or summary statistics (sumstats, LD). Run `load_regional_univariate_data` and `load_rss_data` multiple times for different datasets. Below are the input parameters for this function: -## 1.1. Loading individual level data from multiple corhorts +## 1.1. Loading individual-level data from multiple cohorts -- **`region`** (required): A string of `chr:start-end` for the phenotype region you want to analyzed. -- **`genotype_list`**: A vector of path for PLINK bed files containing genotype data. -- **`phenotype_list`**: A vector of path for phenotype file names. -- **`covariate_list`**: A vector of path for covariate file names corresponding to the phenotype file vector. +- **`region`** (required): A string of `chr:start-end` for the phenotype region you want to analyze. +- **`genotype_list`**: A vector of paths for PLINK bed files containing genotype data. +- **`phenotype_list`**: A vector of paths for phenotype file names. +- **`covariate_list`**: A vector of paths for covariate file names corresponding to the phenotype file vector. - **`conditions_list_individual`**: A vector of strings representing different conditions or groups. - **`match_geno_pheno`**: A vector of indices of phenotypes matched to genotype if multiple genotype PLINK files are used. - **`maf_cutoff`**: Minimum minor allele frequency (MAF) cutoff. Default is 0. @@ -50,14 +51,10 @@ Below are the input parameters for this function: - **`region_name_col`**: Column name containing the region name. Default is `NULL`. - **`keep_indel`**: Logical indicating whether to keep insertions/deletions (INDELs). Default is `TRUE`. - **`keep_samples`**: A vector of sample names to keep. Default is `NULL`. -- **`phenotype_header`**: Number of rows to skip at the beginning of the transposed phenotype file (default is 4 for `chr`, `start`, `end`, and `ID`). -- **`scale_residuals`**: Logical indicating whether to scale residuals. Default is `FALSE`. -- **`tabix_header`**: Logical indicating whether the tabix file has a header. Default is `TRUE`. - **Illustrated example** -The following example demonstrates how to set up an input with 3 phenotypes and 2 cohorts, where the first cohort has 2 phenotypes and the second cohort has 1 phenotype. +The following example demonstrates how to set up input data with 3 phenotypes and 2 cohorts, where the first cohort has 2 phenotypes and the second cohort has 1 phenotype. ```{r, data-loader-individual} # Example of loading individual-level data @@ -80,7 +77,7 @@ imiss_cutoff = 0.9 -## 1.2. Loading summary statistics from multiple corhorts or data set +## 1.2. Loading summary statistics from multiple cohorts or datasets - **`sumstat_path_list`**: A vector of file paths to the summary statistics. - **`column_file_path_list`**: A vector of file paths to the column mapping files. @@ -98,7 +95,7 @@ imiss_cutoff = 0.9 **Illustrated example** -The following example demonstrates how to set up an input with 2 summary and one LD reference. +The following example demonstrates how to set up input data with 2 summary statistics and one LD reference. ```{r, data-loader-sumstat} # Example of loading summary statistics @@ -118,13 +115,13 @@ n_controls = c(20000, 40000) -# Perform ColocBoost using `colocboost_analysis_pipeline` function +# 2. Perform ColocBoost using `colocboost_analysis_pipeline` function In this section, we perform the colocalization analysis using the `colocboost_analysis_pipeline` function. Below are the input parameters for this function: -- **`region_data`**: The output of the `load_multitask_regional_data` function, which contains harmonized summary statistics and LD matrices for the region of interest. -- **`focal_trait`**: Name of trait if performing disease prioritized ColocBoost. +- **`region_data`**: The output of the `load_multitask_regional_data` function. +- **`focal_trait`**: Name of the trait if performing disease-prioritized ColocBoost. - **`event_filters`**: A list of patterns for filtering events based on context names. Example: for sQTL, list(type_pattern = ".*clu_(\\d+_[+-?]).*", valid_pattern = "clu_(\\d+_[+-?]):PR:", exclude_pattern = "clu_(\\d+_[+-?]):IN:"). - **`maf_cutoff`**: A scalar to remove variants with maf < maf_cutoff, default is 0.005. @@ -135,4 +132,7 @@ Example: for sQTL, list(type_pattern = ".*clu_(\\d+_[+-?]).*", valid_pattern = " - **`impute_opts`**: A list of imputation options including rcond, R2_threshold, and minimum_ld (default: list(rcond = 0.01, R2_threshold = 0.6, minimum_ld = 5)). - +```{r, colocboost-analysis} +# region_data <- load_multitask_regional_data(...) +# res <- colocboost_analysis_pipeline(region_data) +``` \ No newline at end of file diff --git a/vignettes/Disease_Prioritized_Colocalization.Rmd b/vignettes/Disease_Prioritized_Colocalization.Rmd index 844ac51..a721a57 100644 --- a/vignettes/Disease_Prioritized_Colocalization.Rmd +++ b/vignettes/Disease_Prioritized_Colocalization.Rmd @@ -17,7 +17,7 @@ knitr::opts_chunk$set( This vignette demonstrates how to perform multi-trait colocalization analysis using a mixed-type dataset, including both individual level data and summary statistics. ColocBoost provides a flexible framework to integrate data both at the individual level or at the summary statistic level, -allowing to handle scenarios where the individual data is available for some traits (like xQTLs) and the summary data is available for other traits (disease/trait GWAS). +allowing the handling of scenarios where the individual data is available for some traits (like xQTLs) and the summary data is available for other traits (disease/trait GWAS). ```{r setup} @@ -26,9 +26,9 @@ library(colocboost) # 1. Loading individual and summary statistics data -To get started, load both Ind_5traits and Sumstat_5traits datasets into your R session. Once loaded, create a mixed dataset as follows: +To get started, load both `Ind_5traits` and `Sumstat_5traits` datasets into your R session. Once loaded, create a mixed dataset as follows: -- For traits 1, 2, 3, 4: use individual-level gentype and phenotype data. +- For traits 1, 2, 3, 4: use individual level genotype and phenotype data. - For trait 5: use summary statistics data. - Note that `LD` could be calculated from the `X` data in the `Ind_5traits` dataset, but it is not included in the `Sumstat_5traits` dataset. @@ -50,7 +50,7 @@ sumstat <- Sumstat_5traits$sumstat[5] LD <- get_cormat(Ind_5traits$X[[1]]) ``` -For analyze the specific one type of data, you can refer to the following +For analyze a specific one type of data, you can refer to the following tutorials [Individual Level Data Colocalization](https://statfungen.github.io/colocboost/articles/Individual_Level_Colocalization.html) and [Summary Level Data Colocalization](https://statfungen.github.io/colocboost/articles/Summary_Level_Colocalization.html). @@ -61,8 +61,8 @@ tutorials [Individual Level Data Colocalization](https://statfungen.github.io/co The preferred format for colocalization analysis in ColocBoost using mixed-type dataset: - **Individual level data**: `X` and `Y` are organized as lists, matched by trait index, - - `(X[1], Y[1])` contains individuals for trait 1, - - `(X[2], Y[2])` contains individuals for trait 2, + - `(X[1], Y[1])` contains individual level data for trait 1, + - `(X[2], Y[2])` contains individual level data for trait 2, - And so on for each trait under analysis. - **Summary level data**: @@ -85,7 +85,7 @@ colocboost_plot(res) ### Results Interpretation -For comprehensive tutorials on result interpretation and advanced visualization techniques, please visit our documentation portal +For comprehensive tutorials on result interpretation and advanced visualization techniques, please visit our tutorials portal at [Visualization of ColocBoost Results](https://statfungen.github.io/colocboost/articles/Visualization_ColocBoost_Output.html) and [Interpret ColocBoost Output](https://statfungen.github.io/colocboost/articles/Interpret_ColocBoost_Output.html). @@ -93,17 +93,15 @@ at [Visualization of ColocBoost Results](https://statfungen.github.io/colocboost # 3. ColocBoost in disease-prioritized mode -Disease-prioritized mode of ColocBoost can - -When integrating with GWAS data, to mitigate the risk of biasing early updates toward stronger xQTL signals in LD proximity—and -should in fact colocalize with—weaker signals from the disease trait of interest, +When integrating with GWAS data, to mitigate the risk of biasing early updates toward stronger xQTL signals in LD proximity-and +should in fact colocalize with-weaker signals from the disease trait of interest, we also implement a disease-prioritized mode of ColocBoost as a soft prioritization strategy to colocalize variants with putative causal effects on the focal trait. To run the disease-prioritized mode, you need to specify the `focal_outcome_idx` argument in the `colocboost()` function. - `focal_outcome_idx` indicates the index of the focal trait in all traits of interest. -- Note that the `focal_outcome_idx` are counted from *individual level data* to *summmary statistics*. -Therefore, in this example, `focal_outcome_idx = 5` is used to indicate the index of the focal trait from (4 individual level traits) and (1 summary statistcs). +- Note that the `focal_outcome_idx` are counted from *individual level data* to *summary statistics*. +Therefore, in this example, `focal_outcome_idx = 5` is used to indicate the index of the focal trait from (4 individual level traits) and (1 summary statistics). ```{r disease-basic} @@ -118,7 +116,7 @@ colocboost_plot(res, plot_focal_only = TRUE) Unlike the other existing methods, the disease-prioritized mode of ColocBoost only used a soft prioritization strategy. -Therefore, it does not only identify the colocalization of the focal trait, but also the colocalization across the other traits without the focal trait. +Therefore, it not only identify the colocalization of the focal trait, but also the colocalization across the other traits without the focal trait. To extract all CoS and visualization of all colocalization results, you can use the following code: ```{r all-basic} @@ -132,7 +130,7 @@ colocboost_plot(res) ### Results Interpretation -For comprehensive tutorials on result interpretation and advanced visualization techniques, please visit our documentation portal +For comprehensive tutorials on result interpretation and advanced visualization techniques, please visit our tutotials portal at [Visualization of ColocBoost Results](https://statfungen.github.io/colocboost/articles/Visualization_ColocBoost_Output.html) and [Interpret ColocBoost Output](https://statfungen.github.io/colocboost/articles/Interpret_ColocBoost_Output.html). diff --git a/vignettes/FineBoost_Special_Case.Rmd b/vignettes/FineBoost_Special_Case.Rmd index 7efa73b..8c19276 100644 --- a/vignettes/FineBoost_Special_Case.Rmd +++ b/vignettes/FineBoost_Special_Case.Rmd @@ -15,9 +15,7 @@ knitr::opts_chunk$set( ``` -This vignette demonstrates how to perform single-trait fine-mapping analysis using FineBoost, a specialized single-trait version of ColocBoost, -from both individual-level data and summary statistics. Specifically focusing on the 2nd trait with 2 causal variants (644 and 2289) from -`Ind_5traits` and `Sumstat_5traits` dataset included in the package. +This vignette demonstrates how to perform single-trait fine-mapping analysis using FineBoost, a specialized single-trait version of ColocBoost, with both individual-level data and summary statistics. Specifically focusing on the 2nd trait with 2 causal variants (644 and 2289) from the `Ind_5traits` and `Sumstat_5traits` datasets included in the package. ```{r setup} @@ -28,10 +26,10 @@ library(colocboost) # 1. Fine-mapping with individual-level data In this section, we demonstrate how to perform fine-mapping using individual-level genotype (`X`) and phenotype (`Y`) data. -This approach directly uses the raw data to identify causal variants. +This approach uses raw data directly to identify causal variants. -```{r load-example-indiviudal} +```{r load-example-individual} # Load example data data(Ind_5traits) X <- Ind_5traits$X[[2]] @@ -44,7 +42,7 @@ colocboost_plot(res) # 2. Fine-mapping with summary statistics -This section demonstrates fine-mapping analysis using summary statistics and a proper LD matrix. +This section demonstrates fine-mapping analysis using summary statistics along with a proper LD matrix. ```{r load-example-sumstat} # Load example data @@ -56,10 +54,11 @@ res <- colocboost(sumstat = sumstat, LD = LD) colocboost_plot(res) ``` + # 3. LD-free fine-mapping with one causal variant assumption -In scenarios where LD information is unavailable, FineBoost can still perform fine-mapping under the assumption of a single causal variant. -This approach is less computationally intensive but assumes that only one variant in a region is causal. +In scenarios where LD information is unavailable, FineBoost can still perform fine-mapping under the assumption that there is a single causal variant. +This approach is less computationally intensive but assumes that only one variant within a region is causal. ```{r ld-free} # Load example data diff --git a/vignettes/Individual_Level_Colocalization.Rmd b/vignettes/Individual_Level_Colocalization.Rmd index 4a5a59e..864c58a 100644 --- a/vignettes/Individual_Level_Colocalization.Rmd +++ b/vignettes/Individual_Level_Colocalization.Rmd @@ -15,10 +15,10 @@ knitr::opts_chunk$set( ``` -ColocBoost provides flexible interface for individual-level colocalization analysis across multiple formats. +ColocBoost provides a flexible interface for individual-level colocalization analysis across multiple formats. We recommend using individual level genotype and phenotype data when available, to gain both sensitivity and precision compared to summary statistics-based approaches. -This vignette demonstrates how to perform multi-trait colocalization analysis using individual-level data in ColocBoost, +This vignette demonstrates how to perform multi-trait colocalization analysis using individual level data in ColocBoost, specifically focusing on the `Ind_5traits` dataset included in the package. @@ -30,8 +30,8 @@ library(colocboost) # 1. The `Ind_5traits` Dataset -The `Ind_5traits` dataset contains 5 simulated phenotypes alongside corresponding genotype matrices. T -he dataset is specifically designed for evaluating and demonstrating the capabilities of ColocBoost in multiple trait colocalization analysis with individual-level data. +The `Ind_5traits` dataset contains 5 simulated phenotypes alongside corresponding genotype matrices. +The dataset is specifically designed to evaluate and demonstrate the capabilities of ColocBoost in multi-trait colocalization analysis with individual-level data. - `X`: A list of genotype matrices for different outcomes. - `Y`: A list of phenotype vectors for different outcomes. @@ -54,14 +54,14 @@ Ind_5traits$true_effect_variants ``` -# 2. Matched individual level inputs $X$ and $Y$ +# 2. Matched individual level input $X$ and $Y$ The preferred format for colocalization analysis in ColocBoost using individual level data is where genotype ($X$) and phenotype ($Y$) data are properly matched. - **Basic format**: `X` and `Y` are organized as lists, matched by trait index, - - `(X[1], Y[1])` contains individuals for trait 1, - - `(X[2], Y[2])` contains individuals for trait 2, + - `(X[1], Y[1])` contains individual level data for trait 1, + - `(X[2], Y[2])` contains individual level data for trait 2, - And so on for each trait under analysis. - **Cross-trait flexibility**: - There is no requirement for the same individuals across different traits. This allows for the analysis of traits with different sample sizes. @@ -87,7 +87,7 @@ colocboost_plot(res) ### Results Interpretation -For comprehensive tutorials on result interpretation and advanced visualization techniques, please visit our documentation portal +For comprehensive tutorials on result interpretation and advanced visualization techniques, please visit our tutorials portal at [Visualization of ColocBoost Results](https://statfungen.github.io/colocboost/articles/Visualization_ColocBoost_Output.html) and [Interpret ColocBoost Output](https://statfungen.github.io/colocboost/articles/Interpret_ColocBoost_Output.html). @@ -117,15 +117,15 @@ res$cos_details$cos$cos_index ``` -## 3.2. Genotype matrix is a superset of samples across different phenotypes +## 3.2. Genotype matrix is a superset of individuals across different phenotypes -When the genotype matrix includes a superset of samples across different phenotypes, with **Input Format**: +When the genotype matrix includes a superset of individuals across different phenotypes, with **Input Format**: - `X` is a matrix of genotype data for all individuals. -- `Y` is a list of phenotype vectors for different outcomes. +- `Y` is a list of phenotype vectors for different traits. - Row names of both `X` and `Y` should be provided to match individuals - same format of individual id. -- It is bettern if `X` contain all samples present in the phenotype vectors (optional). -- This is particularly useful when you have a large genotype matrix and want to use it for multiple phenotypes with different samples. +- It is better if `X` contain all individuals present in the phenotype vectors (optional). +- This is particularly useful when you have a large genotype matrix and want to use it for multiple phenotypes with different individuals. It allows for efficient analysis without redundancy. @@ -146,7 +146,7 @@ res$cos_details$cos$cos_index -## 3.3. Arbitrary input matrices with mapping provided +## 3.3. Arbitrary input matrices with mapping dictionary provided When studying multiple traits with arbitrary genotype matrices for different traits, we also provide the interface for arbitrary genotype matrices with multiple phenotypes. @@ -154,14 +154,15 @@ This particularly benefits meta-analysis across heterogeneous datasets where, fo genotype data comes from different genotyping platforms or sequencing technologies. - **Input Format**: - - `X` is a list of genotype matrices. - - `Y` is a list of phenotype vectors. + - `X = list(X1, X3)` is a list of genotype matrices. + - `Y = list(Y1, Y2, Y3, Y4, Y5)` is a list of phenotype vectors, where traits 1 and 2 matched to the 1st genotype matrix `X1`; + traits 3,4,5 matched to 2nd genotype matrix `X3`. - `dict_YX` is a dictionary matrix that index of Y to index of X. ```{r dictionary-mapped} # Create a simple dictionary for demonstration purposes -X_arbitrary <- X[c(1,3)] # traits 1 and 2 matched to the first genotype matrix; traits 3,4,5 matched to the third genotype matrix. +X_arbitrary <- X[c(1,3)] dict_YX = cbind(c(1:5), c(1,1,2,2,2)) # Display the dictionary diff --git a/vignettes/Input_Data_Format.Rmd b/vignettes/Input_Data_Format.Rmd index 63ae925..c5f1dd3 100644 --- a/vignettes/Input_Data_Format.Rmd +++ b/vignettes/Input_Data_Format.Rmd @@ -27,7 +27,8 @@ For analyses using individual-level data, the basic format for single trait is a - `X` is an $N \times P$ matrix with $N$ individuals and $P$ variants. Including variant names as column names is highly recommended, especially when working with multiple $X$ matrices and $Y$ vectors. - `Y` is a length $N$ vector containing phenotype values for the same $N$ individuals as $X$. -The input format for multiple traits is similar, but `X` matrix should be a list of matrices, each corresponding to a different trait. `Y` vector should also be a list of vectors. +The input format for multiple traits is similar, but `X` should be a list of genotype matrices, each corresponding to a different trait. +`Y` should also be a list of phenotype vectors. For example: - `X = list(X1, X2, X3, X4, X5)` where each `Xi` is a matrix for trait `i` - with the dimension of $N_i \times P_i$, where $N_i$ and $P_i$ do not need to be the same for different traits. @@ -35,10 +36,10 @@ For example: `colocboost` also offers flexible input options (see detailed usage with different input formats, -refer to [Individual Level Data Colocalization](https://statfungen.github.io/colocboost/articles/Individual_Level_Colocalization.html).): +refer to [Individual Level Data Colocalization](https://statfungen.github.io/colocboost/articles/Individual_Level_Colocalization.html)): -- Single $X$ matrix with $N \times P$ with $Y$ matrix with $N \times L$ for $L$ traits. -- Multiple $X$ matrices and unmatched $Y$ vectors with a mapping dictionary. +- Single $X$ matrix with $N \times P$, and $Y$ matrix with $N \times L$ for $L$ traits. +- Multiple $X$ matrices and unmatched $Y$ vectors with a mapping dictionary (example shown in section 3 below). # 2. Summary Statistics @@ -60,11 +61,11 @@ head(Sumstat_5traits$sumstat[[1]]) The input format for multiple traits is similar, but `sumstat` should be a list of data frames `sumstat = list(sumstat1, sumstat2, sumstat3)`. The flexibility of input format for multiple traits is as follows (see detailed usage with different input formats, -refer to [Summary Statistics Colocalization](https://statfungen.github.io/colocboost/articles/Summary_Level_Colocalization.html).): +refer to [Summary Statistics Colocalization](https://statfungen.github.io/colocboost/articles/Summary_Level_Colocalization.html)): - One LD matrix with a superset of variants in `sumstat` for all traits is allowed. - Multiple LD matrices, each corresponding to a different trait, are also allowed for the trait-specific LD structure. -- Multiple LD matrices and unmatched `sumstat` data frames with a mapping dictionary are also allowed. +- Multiple LD matrices and unmatched `sumstat` data frames with a mapping dictionary are also allowed (example shown in section 3 below). @@ -104,5 +105,8 @@ For example, when anaylze $L$ traits for the same $P$ variants with the specifie - `effect_n` (highly recommended) is either a scalar or a vector of sample sizes for estimating regression coefficients. - `LD` (optional) is LD matrix for the $P$ variants. If it is not provided, it will apply LD-free ColocBoost. -See more details about data format to implement LD-free ColocBoost and LD-mistmatch diagnosis in (LD mismatch diagnosis and LD-free analysis (avaiable soon)). + +See more details about Hyprcoloc compatible format in [Summary Statistics Colocalization](https://statfungen.github.io/colocboost/articles/Summary_Level_Colocalization.html)). + +See more details about data format to implement LD-free ColocBoost and LD-mistmatch diagnosis in [LD mismatch and LD-free Colocalization](https://statfungen.github.io/colocboost/articles/LD_Free_Colocalization.html)). diff --git a/vignettes/Interpret_ColocBoost_Output.Rmd b/vignettes/Interpret_ColocBoost_Output.Rmd index c7315fb..67aab7e 100644 --- a/vignettes/Interpret_ColocBoost_Output.Rmd +++ b/vignettes/Interpret_ColocBoost_Output.Rmd @@ -15,7 +15,7 @@ knitr::opts_chunk$set( ``` -This vignette demonstrates how to interpret the output of ColocBoost, specifically to get the summary of colocalization and filtering our the weak signals. +This vignette demonstrates how to interpret the output of ColocBoost, specifically to get the summary of colocalization and focusing only on strong colocalization events. ```{r setup} library(colocboost) @@ -57,7 +57,7 @@ The summary includes the following columns: To obtain the summary of colocalization with a specific focus on traits of interest, -you can use the `get_cos_summary`, see detailed usage of this function in [link](https://statfungen.github.io/colocboost/reference/get_cos_summary.html). +you can use the `get_cos_summary`, see the detailed usage of this function in [link](https://statfungen.github.io/colocboost/reference/get_cos_summary.html). This function allows you to filter the colocalization summary based on a particular outcome of interest, making it easier to interpret the results for specific traits. For example, if you are interested in the colocalization events involving the traits `Y1` and `Y2`, you can use the following code: @@ -74,9 +74,9 @@ cos_interest_outcome <- get_cos_summary(res, interest_outcome = c("Y1", "Y2")) In `cos_summary`, for each 95% CoS, the `cos_npc` column provides a normalized probability of colocalization and `min_npc_outcome` column provides the minimum normalized probability among colocalized traits. -Those two metrices are measured as an empirical evidence of colocalization both in CoS-level and in trait-level. -To obtain the best minimal colocalization configuration can be defined by using both `cos_npc` and `npc_outcome` -See detailed usage of this function in [link](https://statfungen.github.io/colocboost/reference/get_robust_colocalization.html). +Those two metrics are measured as an empirical evidence of colocalization both in CoS-level and in trait-level. +To obtain the best minimal colocalization configuration can be defined by using both `cos_npc` and `npc_outcome`. +See the detailed usage of this function in [link](https://statfungen.github.io/colocboost/reference/get_robust_colocalization.html). ```{r run-strong-colocalization} @@ -85,7 +85,7 @@ filter_res <- get_robust_colocalization(res, cos_npc_cutoff = 0.5, npc_outcome_c - The output from `get_robust_colocalization` is the same as output from `colocboost`, which can be directly used in any post inference and visualization. - `npc=0.5` or `npc_outcome = 0.2` maintains robust colocalization signals for cases when many traits are evaluated. -Higher thresholds can be specified if users only want to focus on strong colocalization events. +Higher thresholds can be specified if users want to focus only on strong colocalization events. ## 3. More details on ColocBoost output @@ -95,8 +95,8 @@ The entire colocalization output from `colocboost` is stored in the `colocboost` - **`cos_summary`**: A summary table for colocalization events (see details in above Section 1). - **`vcp`**: The variable colocalized probability for each variable. - **`cos_details`**: A object with all information for colocalization results. -- **`data_info`**: A object with detailed information from input data. -- **`model_info`**: A object with detailed information for colocboost model. +- **`data_info`**: A object with the detailed information from input data. +- **`model_info`**: A object with the detailed information for colocboost model. In this section, we will provide a detailed explanation of the components for deepening into ColocBoost result using a mixed dataset. @@ -116,7 +116,7 @@ res <- colocboost(X = X, Y = Y, sumstat = sumstat, LD = LD) ### 3.1. Variant colocalization probability (**`vcp`**) - **`vcp`** is the probability of a variant being colocalized with at least one traits, serving as analogs of posterior inclusion probabilities (PIPs) in single-trait fine-mapping. -To plot the VCP for the variants with in at least one CoS, you can use the `colocboost_plot` function with the `y` argument set to `"vcp"`. +To plot the VCP for the variants within at least one CoS, you can use the `colocboost_plot` function with the `y` argument set to `"vcp"`. ```{r vcp-plot} @@ -142,12 +142,12 @@ res$data_info$outcome_info ### 3.3. Colocalization details (**`cos_details`**) -**`cos_details`** provides detailed information for colocalization evetns identified by `colocboost`. +**`cos_details`** provides a detailed information for colocalization events identified by `colocboost`. This section will provide a detailed explanation of the components in `cos_details`. #### 3.3.1. Colocalized variants for each CoS (**`cos`**) -- **`cos`**: A list with detailed information of colocalized variants for each CoS. +- **`cos`**: A list with a detailed information of colocalized variants for each CoS. - **`cos_index`**: Indices of colocalized variables with unique identifier for each CoS. - **`cos_variables`**: Names of colocalized variables with unique identifier for each CoS. - Note that variants are ordered by their VCP in descending order. @@ -158,7 +158,7 @@ res$cos_details$cos #### 3.3.2. Colocalized traits for each 95% CoS (**`cos_outcomes`**) -- **`cos_outcomes`**: A list with detailed information of colocalized traits for each CoS. +- **`cos_outcomes`**: A list with a detailed information of colocalized traits for each CoS. - **`outcome_index`**: Indices of colocalized traits with unique identifier for each CoS. - **`outcome_name`**: Names of colocalized traits with unique identifier for each CoS. @@ -168,7 +168,7 @@ res$cos_details$cos_outcomes - **`cos_npc`**: normalized probability of colocalization for CoS, providing empirical evidence in favor of colocalization over a trait-specific configuration. - **`cos_outcomes_npc`**: normalized probability for each colocalized trait in order with evidence strength. -- These two metrices could be used to filter the colocalization events by relative strength of evidence, see details in Section 2. +- These two metrics could be used to filter the colocalization events by relative strength of evidence, see details in Section 2. ```{r cos-npc} @@ -194,7 +194,7 @@ res$cos_details$cos_purity res$cos_details$cos_top_variables ``` -- **`cos_weights`**: the integrative weights for each colocalized trait in the CoS. This is used to re-calibrate CoS when some traits are filtered out.. +- **`cos_weights`**: the integrative weights for each colocalized trait in the CoS. This is used to recalibrate CoS when some traits are filtered out.. - **`cos_vcp`**: the single-effect VCP for each CoS. @@ -210,7 +210,7 @@ res$model_info$jk_star[c(5:10,36:38), ] ``` - **`profile_loglik`**: joint profile log-likelihood changes over boosting rounds. -- **`outcome_profile_loglik`**: trait-specifc profile log-likelihood changes over boosting rounds. +- **`outcome_profile_loglik`**: trait-specific profile log-likelihood changes over boosting rounds. ```{r profile_loglik} # Plotting joint profile log-likelihood (blue) and trait-specific profile log-likelihood (red). @@ -225,7 +225,7 @@ plot(res$model_info$outcome_profile_loglik[[i]], type="p", col="#CC3333", lwd=2, - **`outcome_coupled_best_update_obj`**: objective at the (coupled) best update variant for each outcome. -```{r objetive-proximity} +```{r objective-proximity} # Plotting trait-specific proximity objective par(mfrow=c(2,3), mar=c(4,4,2,1)) for(i in 1:5){ @@ -233,11 +233,11 @@ plot(res$model_info$outcome_proximity_obj[[i]], type="p", col="#3366CC", lwd=2, } ``` -```{r objetive-best} +```{r objective-best} # Plotting trait-specific objective at the best update variant par(mfrow=c(2,3), mar=c(4,4,2,1)) for(i in 1:5){ - plot(res$model_info$outcome_coupled_best_update_obj[[i]], type="p", col="#CC3333", lwd=2, xlab="", ylab=paste0("Objetive at best update variant"), main = paste0("Trait ", i)) + plot(res$model_info$outcome_coupled_best_update_obj[[i]], type="p", col="#CC3333", lwd=2, xlab="", ylab=paste0("Objective at best update variant"), main = paste0("Trait ", i)) } ``` @@ -256,11 +256,11 @@ Y1 <- Heterogeneous_Effect$Y[,1,drop=F] res <- colocboost(X = c(X, list(X1)), Y = c(Y, list(Y1)), output_level = 2) ``` -#### 3.5.1. Trait-specific (uncolocalized) traits (**`ucos`**) +#### 3.5.1. Trait-specific (uncolocalized) confidence sets (**`ucos`**) -- **`ucos`**: A list containing detailed information about trait-specific (uncolocalized) traits for each uCoS. - - **`ucos_index`**: Indices of trait-specific (uncolocalized) variables. - - **`ucos_variables`**: Names of trait-specific (uncolocalized) variables. +- **`ucos`**: A list containing a detailed information about trait-specific (uncolocalized) variants for each uCoS. + - **`ucos_index`**: Indices of trait-specific (uncolocalized) variants. + - **`ucos_variables`**: Names of trait-specific (uncolocalized) variants. ```{r ucos} res$ucos_details$ucos @@ -268,7 +268,7 @@ res$ucos_details$ucos #### 3.5.2. Trait-specific (uncolocalized) outcomes (**`ucos_outcomes`**) -- **`ucos_outcomes`**: A list with detailed information about trait-specific (uncolocalized) outcomes for each uCoS. +- **`ucos_outcomes`**: A list with a detailed information about trait-specific (uncolocalized) outcomes for each uCoS. - **`outcome_index`**: Indices of trait-specific (uncolocalized) outcomes. - **`outcome_name`**: Names of trait-specific (uncolocalized) outcomes. @@ -291,21 +291,40 @@ res$ucos_details$cos_ucos_purity #### 3.5.4. Other components - **`ucos_weight`**: Integrative weights for each trait-specific (uncolocalized) trait, used to recalibrate UCoS when traits are filtered out. -- **`ucos_top_variables`**: Indices and names of the top variable for each UCoS, which is the variable with the highest VCP. +- **`ucos_top_variables`**: Indices and names of the top variable for each uCoS, which is the variable with the highest VCP. - **`ucos_purity`**: Includes three lists, each containing an $uS \times uS$ matrix, where $uS$ is the number of uCoS: - `min_abs_cor`: Minimum absolute correlation of variables within (diagonal) UCoS or between (off-diagonal) different uCoS. - `median_abs_cor`: Median absolute correlation of variables within or between UCoS. - `max_abs_cor`: Maximum absolute correlation of variables within or between UCoS. By analyzing these components, you can gain a deeper understanding of trait-specific (uncolocalized) effects that are not colocalized, -providing additional insights into the dataset. +providing additional insights into the data. ### 3.6. Diagnostic details (**`diagnostic_details`**) There is `diagnostic_details` in ColocBoost output when setting `output_level = 3`: +```{r diagnostic-details} +# Loading the dataset +data(Ind_5traits) +X <- Ind_5traits$X +Y <- Ind_5traits$Y +res <- colocboost(X = X, Y = Y, output_level = 3) +``` + - **`cb_model`**: trait-specific proximity gradient boosting model, including proximity weight at each iteration, residual after gradient boosting, et al. +- **`weights_paths`**: individual trait-specific weights for each iteration. + +```{r cb-model} +names(res$diagnostic_details$cb_model) +names(res$diagnostic_details$cb_model$ind_outcome_1) +``` + + - **`cb_model_para`**: parameters used in fitting ColocBoost model. -For more diagnosis of ColocBoost model fitting, please refer to our Tutorial (TO-DO-LIST). + +```{r cb-model-para} +names(res$diagnostic_details$cb_model_para) +``` diff --git a/vignettes/LD_Free_Colocalization.Rmd b/vignettes/LD_Free_Colocalization.Rmd index 6f8d330..aa58ee4 100644 --- a/vignettes/LD_Free_Colocalization.Rmd +++ b/vignettes/LD_Free_Colocalization.Rmd @@ -15,23 +15,40 @@ knitr::opts_chunk$set( ``` -This vignette demonstrates LD mismatch diagnosis in the `colocboost` package and how to perform LD-free colocalization analysis, +This vignette demonstrates LD mismatch diagnosis in the `colocboost` package and how to perform LD-mismatch and LD-free colocalization analysis, when some traits completely lack LD information or share only partial variant coverage with other traits. + ```{r setup} library(colocboost) ``` # 1. LD mismatch diagnosis -ColocBoost provides the diagnostic warnings for LD matrix is not compatible with the summary statistics. +The `colocboost` assumes that the LD matrix accurately estimates the correlations among variants from the original GWAS genotype data. +Typically, the LD matrix comes from some public databases of genotypes in a suitable reference population. +An inaccurate LD matrix may lead to unreliable colocalization results, especially if the LD matrix is significantly different from the one estimated from the original genotype data. + + +### Why LD Mismatch Matters + +An inaccurate LD matrix can cause inconsistencies between the summary statistics and the reference LD matrix, leading to: +- Biased estimates of causal variants. +- Increased computational time due to slower algorithm convergence. +- Potentially misleading colocalization results. + + +ColocBoost provides diagnostic warnings for assessing the consistency of the summary statistics with the reference LD matrix. - Estimated residual variance of the model is negative or greater than phenotypic variance (`rtr < 0` or `rtr > var_y`; see details in Supplementary Note S3.5.2). - Change in trait-specific profile log-likelihood according to a CoS is negative (see details in Supplementary Note S3.5.3). -In this example, we create a simulated dataset with LD mismatch by changing the sign of Z-scores for 1% variants of each trait. + +### Example of including LD mismatch + +In this example, we create a simulated dataset with LD mismatch by changing the sign of Z-scores for 1% of variants for each trait. ```{r LD-mismatch} # Create a simulated dataset with LD mismatch @@ -39,52 +56,90 @@ data("Sumstat_5traits") data("Ind_5traits") LD <- get_cormat(Ind_5traits$X[[1]]) -# Change sign of Z-score for 1% variants of each trait by including mismatched with LD +# Change sign of Z-score for 1% of variants for each trait by including mismatched LD set.seed(1) miss_prop <- 0.01 sumstat <- lapply(Sumstat_5traits$sumstat, function(ss){ p <- nrow(ss) - pos_miss <- sample(1:p, ceiling(miss_prop*p)) + pos_miss <- sample(1:p, ceiling(miss_prop * p)) ss$z[pos_miss] <- -ss$z[pos_miss] return(ss) }) ``` -There may two types of warnings when running `colocboost` with LD mismatch: + +### Running ColocBoost with LD Mismatch + +When running `colocboost` with an LD mismatch, you may encounter diagnostic warnings. +These warnings are not errors, and the analysis will still proceed. +However, the results may be less reliable due to the mismatch, and the computational time may increase as the algorithm takes longer to converge. ```{r LD-mismatch-runcode} res <- colocboost(sumstat = sumstat, LD = LD) ``` - -Note that the warning messages are not errors, and the analysis will still be performed. But the results may not be reliable due to the LD mismatch. -Additional, the computational time may be longer than expected, as the algorithm may take longer to converge due to the LD mismatch. +These warnings serve as diagnostic tools to alert users about potential inconsistencies in the input data. # 2. LD-free and LD-mismatch colocalization analysis -ColocBoost can perform LD-free colocalization analysis when some traits completely lack LD information with one causal per trait per region assumption. -There are two ways to perform LD-free and LD-mismatch colocalization analysis: +When there is substantial discordance between the LD matrix and summary statistics, +the reliability of colocalization analysis may be compromised. +Such discordance can arise when the LD matrix and summary statistics are derived from different populations +or when the LD matrix is estimated from a smaller or less representative reference sample. +This can lead to unexpected results, such as biased causal variant identification or reduced accuracy in the analysis. + +To address these challenges, ColocBoost provides two alternative approaches for colocalization analysis +with the assumption of one causal variant per trait per region: -- **LD-mismatch** (recommended): when LD matrix is esimated from out-of sample LD reference (smaller sample size or different population), -summary statistics are not compatible with the LD matrix but the LD matrix is still informative for indicating the correlations among different variants. -We recommended to perform colocalization analysis with LD matrix and only 1 iteration of gradient boosting. -In this case, the LD matrix is only used in checking the equivalence among the trait-specific best update variants. -Specially, using the LD matrix for colocalization analysis can help to improve the accuracy of the results. +- **One iteration approach** (recommended): performing only 1 iteration of gradient boosting with the LD matrix ensures that: + - The LD matrix is only used to check the equivalence among trait-specific best update variants. + - The accuracy of the results is improved compared to completely ignoring the LD matrix. + +This method is particularly useful when the LD matrix is mismatched but still provides valuable insights into variant correlations. ```{r LD-mismatch-one-iter} -res_mismatch <- colocboost(sumstat = sumstat, LD = LD, M=1) +# Perform only 1 iteration of gradient boosting with LD matrix +res_mismatch <- colocboost(sumstat = sumstat, LD = LD, M = 1) ``` -- **LD-free**: when some traits completely lack LD information, the LD matrix is not required. +- **LD-free**: when the mismatch between the LD matrix and summary statistics is too large to be useful +or when LD information is completely unavailable, ColocBoost provides an LD-free approach. ```{r LD-free} res_free <- colocboost(sumstat = sumstat) ``` +While this method is computationally efficient, it has limitations due to the strong assumption of a single causal variant per trait per region. +Users should interpret the results with caution, especially in regions with complex LD structures or multiple causal variants. + +ColocBoost also provides a flexibility to use Hyprcoloc compatible format for summary statistics without LD matrix. +```{r hyprcoloc-compatible} +# Loading the Dataset +data(Ind_5traits) +X <- Ind_5traits$X +Y <- Ind_5traits$Y +# Coverting to Hyprcoloc compatible format +effect_est <- effect_se <- effect_n <- c() +for (i in 1:length(X)){ + x <- X[[i]] + y <- Y[[i]] + effect_n[i] <- length(y) + rr <- susieR::univariate_regression(X = x, y = y) + effect_est <- cbind(effect_est, rr$betahat) + effect_se <- cbind(effect_se, rr$sebetahat) +} +colnames(effect_est) <- colnames(effect_se) <- c("Y1", "Y2", "Y3", "Y4", "Y5") +rownames(effect_est) <- rownames(effect_se) <- colnames(X[[1]]) +# Run colocboost +res <- colocboost(effect_est = effect_est, effect_se = effect_se, effect_n = effect_n) + +# Identified CoS +res$cos_details$cos$cos_index +``` diff --git a/vignettes/Partial_Overlap_Variants.Rmd b/vignettes/Partial_Overlap_Variants.Rmd index a730497..ea06466 100644 --- a/vignettes/Partial_Overlap_Variants.Rmd +++ b/vignettes/Partial_Overlap_Variants.Rmd @@ -27,12 +27,12 @@ library(colocboost) ### Causal variant structure -We create an example data from `Ind_5traits` with two causal variants, 644 and 2289, but each of them are only partially overlapping across traits. +We create an example data from `Ind_5traits` with two causal variants, 644 and 2289, but each of them is only partially overlapping across traits. - Causal variant 644 is associated with traits 1, 3, and 4, but is missing in trait 2. - Causal variant 2289 is associated with traits 2 and 3, but is missing in trait 5. -This structure creates a realistic scenario where multiple traits from different datasets are not fully overlapping, and the causal variants are not shared across all traits. +This structure creates a realistic scenario in which multiple traits from different datasets are not fully overlapping, and the causal variants are not shared across all traits. ```{r make-data} # Load example data @@ -40,7 +40,7 @@ data(Ind_5traits) X <- Ind_5traits$X Y <- Ind_5traits$Y -# Creeate causal variants with potentially LD proxies +# Create causal variants with potentially LD proxies causal_1 <- c(200:1000) causal_2 <- c(2000:2600) @@ -49,15 +49,15 @@ X[[2]] <- X[[2]][, -causal_1, drop = FALSE] X[[3]] <- X[[3]][, -causal_2, drop = FALSE] # Show format -X[[2]][1:2, 1:10] -X[[3]][1:2, 1:10] +X[[2]][1:2, 1:6] +X[[3]][1:2, 1:6] ``` ## 1. Run ColocBoost with partial overlapping variants -To run ColocBoost to the different genotypes with different causal variants, the variant names should be provided as the column names of the `X` matrices. +To run ColocBoost on different genotypes with different causal variants, the variant names should be provided as the column names of the `X` matrices. Otherwise, the `colocboost` function will not be able to identify the variants correctly from different genotype matrices, -and the analysis will fail with an error message `Please verify the variable names across different outcomes.`. +and the analysis will fail with the error message `Please verify the variable names across different outcomes.` ```{r run-code} # Run colocboost @@ -72,7 +72,7 @@ colocboost_plot(res) ## 2. Limitations of using only overlapping variables -If we perform colocalization analysis using only overlapping variables, we may fail to detect any colocalization events. +If we perform a colocalization analysis using only overlapping variables, we may fail to detect any colocalization events. This is because the causal variants, which are only partially overlapping across traits, are excluded during the preprocessing step. As a result, even though these variants are associated with some traits, they are removed from the analysis, leading to a loss of critical information. This highlights the importance of handling partial overlaps effectively to ensure that meaningful colocalization signals are not missed. @@ -93,8 +93,8 @@ colocboost_plot(res) In disease-prioritized colocalization analysis with a focal trait, `ColocBoost` recommends prioritizing variants in the focal trait as the default setting. For the example above, if we consider trait 3 as the focal trait, only variants present in trait 3 will be included in the analysis. -This ensures that the analysis focuses on variants relevant to the focal trait while accounting for partial overlaps across other traits. -If you want to include all variants across traits, you can set `focal_outcome_variables = FALSE` to disable this default behavior. +This ensures that the analysis focuses on variants relevant to the focal trait while also accounting for partial overlaps across other traits. +If you want to include all variants across traits, you can set `focal_outcome_variables = FALSE` to override this default behavior. ```{r run-code-focal} # Run colocboost diff --git a/vignettes/Summary_Statistics_Colocalization.Rmd b/vignettes/Summary_Statistics_Colocalization.Rmd index 7da663a..17dcaba 100644 --- a/vignettes/Summary_Statistics_Colocalization.Rmd +++ b/vignettes/Summary_Statistics_Colocalization.Rmd @@ -26,7 +26,7 @@ library(colocboost) # 1. The `Sumstat_5traits` Dataset The `Sumstat_5traits` dataset contains 5 simulated summary statistics, where it is directly derived from the `Ind_5traits` dataset using marginal association. -The dataset is specifically designed for evaluating and demonstrating the capabilities of ColocBoost in multiple trait colocalization analysis with summary association data. +The dataset is specifically designed to evaluate and demonstrate the capabilities of ColocBoost in multi-trait colocalization analysis with summary association data. - `sumstat`: A list of data.frames of summary statistics for different traits. - `true_effect_variants`: True effect variants indices for each trait. @@ -38,7 +38,7 @@ The dataset features two causal variants with indices 644 and 2289. - Causal variant 644 is associated with traits 1, 2, 3, and 4. - Causal variant 2289 is associated with traits 2, 3, and 5. -This structure creates a realistic scenario where multiple traits are influenced by different but overlapping sets of genetic variants. +This structure creates a realistic scenario in which multiple traits are influenced by different but overlapping sets of genetic variants. ```{r load-summary-data} # Loading the Dataset @@ -48,10 +48,11 @@ Sumstat_5traits$true_effect_variants ``` ### Important data format for summary data -Must include the following columns: +`sumstat` must include the following columns: + - `z` or (`beta`, `sebeta`): either z-score or (effect size and standard error) -- `n`: sample size for the summary statistics, it is highly recommendation to provide. -- `variant`: required if sumstat for different outcomes do not have the same number of variables (multiple sumstat and multiple LD). +- `n`: sample size for the summary statistics, it is highly recommended to provide. +- `variant`: required if `sumstat` for different outcomes do not have the same number of variables (multiple `sumstat` and multiple `LD`). ```{r summary-data-format} @@ -88,7 +89,7 @@ colocboost_plot(res) ### Results Interpretation -For comprehensive tutorials on result interpretation and advanced visualization techniques, please visit our documentation portal +For comprehensive tutorials on result interpretation and advanced visualization techniques, please visit our tutorials portal at [Visualization of ColocBoost Results](https://statfungen.github.io/colocboost/articles/Visualization_ColocBoost_Output.html) and [Interpret ColocBoost Output](https://statfungen.github.io/colocboost/articles/Interpret_ColocBoost_Output.html). @@ -97,14 +98,14 @@ at [Visualization of ColocBoost Results](https://statfungen.github.io/colocboost ## 3.1. Matched LD with multiple sumstat (Trait-specific LD) -When studying multiple traits with its own trait-specific LD matrix, you could provide a list of LD matrices matched with a list of summary statistics. +When studying multiple traits with their own trait-specific LD matrices, you could provide a list of LD matrices matched with a list of summary statistics. - **Basic format**: `sumstat` and `LD` are organized as lists, matched by trait index, - `(sumstat[1], LD[1])` contains information for trait 1, - `(sumstat[2], LD[2])` contains information for trait 2, - And so on for each trait under analysis. - **Cross-trait flexibility**: - - There is no requirement for the same variants across different traits. This allows for the analysis of traits with variants avaiable. + - There is no requirement for the same variants across different traits. This allows for the analysis of traits with available variants. - This is particularly useful when you have a large dataset with many traits and want to focus on specific variants and trait-specific LD. ```{r matched-LD} @@ -125,7 +126,7 @@ When the LD matrix includes a superset of variants across different summary stat - `sumstat` is a list of data.frames for all traits - `LD` is a matrix of linkage disequilibrium (LD) information for all variants across all traits. -- The LD matrix should contain all variants present in the summary statistics data frames. +- The LD matrix should contain superset of variants presented in the summary statistics data frames. - This is particularly useful when you have a large LD matrix from a reference panel and want to use it for multiple summary statistics datasets. It allows for efficient analysis without redundancy. @@ -148,11 +149,11 @@ res$cos_details$cos$cos_index When studying multiple traits with arbitrary LD matrices for different summary statistics, we also provide the interface for arbitrary LD matrices with multiple sumstat. This particularly benefits meta-analysis across heterogeneous datasets where, -for different subsets of summary statistics, LD comes from different population. +for different subsets of summary statistics, LD comes from different populations. - **Input Format**: - - `sumstat` is a list of data.frames for all traits. - - `LD` is a list of LD matrices. + - `sumstat = list(sumstat1, sumstat2, sumstat3, sumstat4, sumstat5)` is a list of data.frames for all traits. + - `LD = list(LD1, LD2)` is a list of LD matrices. - `dict_sumstatLD` is a dictionary matrix that index of sumstat to index of LD. @@ -171,3 +172,41 @@ res <- colocboost(sumstat = Sumstat_5traits$sumstat, LD = LD_arbitrary, dict_sum res$cos_details$cos$cos_index ``` + +## 3.4. Hyprcoloc compatible format: effect size and standard error matrices + +ColocBoost also provides a flexibility to use Hyprcoloc compatible format for summary statistics with and without LD matrix. + +```{r hyprcoloc-compatible} +# Loading the Dataset +data(Ind_5traits) +X <- Ind_5traits$X +Y <- Ind_5traits$Y + +# Coverting to Hyprcoloc compatible format +effect_est <- effect_se <- effect_n <- c() +for (i in 1:length(X)){ + x <- X[[i]] + y <- Y[[i]] + effect_n[i] <- length(y) + rr <- susieR::univariate_regression(X = x, y = y) + effect_est <- cbind(effect_est, rr$betahat) + effect_se <- cbind(effect_se, rr$sebetahat) +} +colnames(effect_est) <- colnames(effect_se) <- c("Y1", "Y2", "Y3", "Y4", "Y5") +rownames(effect_est) <- rownames(effect_se) <- colnames(X[[1]]) + +# Run colocboost +LD <- get_cormat(Ind_5traits$X[[1]]) +res <- colocboost(effect_est = effect_est, effect_se = effect_se, effect_n = effect_n, LD = LD) + +# Identified CoS +res$cos_details$cos$cos_index + +# Plotting the results +colocboost_plot(res) +``` + +See more details about data format to implement LD-free ColocBoost and LD-mistmatch diagnosis in [LD mismatch and LD-free Colocalization](https://statfungen.github.io/colocboost/articles/LD_Free_Colocalization.html)). + + diff --git a/vignettes/Visualization_ColocBoost_Output.Rmd b/vignettes/Visualization_ColocBoost_Output.Rmd index e05e04b..41b1495 100644 --- a/vignettes/Visualization_ColocBoost_Output.Rmd +++ b/vignettes/Visualization_ColocBoost_Output.Rmd @@ -38,7 +38,7 @@ res <- colocboost(X = Ind_5traits$X, Y = Ind_5traits$Y) The default plot of the colocboost results provides a visual representation of the colocalization events. -- The x-axis indicates the indices of variants and y-axis indicates the -log10(p-value) from marginal associations. +- The x-axis indicates the indices of variants, and the y-axis indicates the -log10(p-value) from marginal associations. - The colors of the points represent the colocalization events, with different colors indicating different colocalization events. ```{r basic-plot} @@ -51,11 +51,11 @@ colocboost_plot(res) - `y = "log10p"` (default) with optional - `y = "z_original"` for z-scores - `y = "vcp"` for variant colocalization probabilities (single plot for all variants), - - `y = "coef"` for regression coefficients estimated from the colocboost model. + - `y = "coef"` for regression coefficients estimated from the ColocBoost model. - `y = "cos_vcp"` for variant colocalization probabilities (multiple plots for each CoS - only draw VCP for variants in CoS to the colocalized traits). - `plot_cos_idx = NULL` (default) indicates all colocalization events are plotted. `plot_cos_idx = 1` can be specified to plot the 1st colocalization event, and so on. - `outcome_idx = NULL` (default) indicates only the traits with colocalization are plotted. `outcome_idx = c(1,2,5)` can be specified to plot the traits 1, 2, and 5. -- `plot_all_outcome = FALSE` (default) indicates only the traits with colocalization are plotted. If `TRUE` can plot all traits. +- `plot_all_outcome = FALSE` (default) indicates only the traits with colocalization are plotted. If `TRUE`, it will plot all traits. - `cos_color = NULL` (default) indicates the colors of the colocalization events. Specify a vector of colors to customize the plot. @@ -65,7 +65,7 @@ There are several advanced options available for customizing the plot by deepeni ### 2.1. Plot with a zoom-in region -You can specify a zoom-in region by providing a `grange` argument, which is a vector of length 2 indicating the start and end indices of the region to be zoomed in. +You can specify a zoom-in region by providing a `grange` argument, which is a vector indicating the indices of the region to be zoomed in. ```{r zoomin-plot} colocboost_plot(res, grange = c(1:1000)) @@ -111,10 +111,10 @@ colocboost_plot( ### 2.5. Plot with trait-specific sets if exists -There are two options avaiable for plotting the trait-specific (uncolocalized) variants: +There are two options available for plotting the trait-specific (uncolocalized) variants: - `plot_ucos = FALSE` (default), if `TRUE` will plot all trait-specific (uncolocalized) sets. -- `plot_ucos_idx = NULL` (default) indicates all confidence sets are plotted. `plot_cos_idx = 1` can be specified to plot the 1st confidence sets, and so on. +- `plot_ucos_idx = NULL` (default) indicates all confidence sets are plotted. `plot_ucos_idx = 1` can be specified to plot the 1st uncolocalized confidence sets, and so on. *Important Note*: You should use `colocboost(..., output_level = 2)` to obtain the trait-specific (uncolocalized) information. @@ -134,17 +134,17 @@ colocboost_plot(res, plot_ucos = TRUE) In this example, there are two colocalized sets (blue and orange) and two trait-specific sets for trait 4 only (green and purple). -For comprehensive tutorials on result interpretation, please please visit our documentation portal +For comprehensive tutorials on result interpretation, please visit our tutorials portal at [Interpret ColocBoost Output](https://statfungen.github.io/colocboost/articles/Interpret_ColocBoost_Output.html). ### 2.6 Plot with focal trait for disease prioritized colocalization -There are three options avaiable for plotting the results from disease prioritized colocalization, considering a focal trait: +There are three options available for plotting the results from disease prioritized colocalization, considering a focal trait: - `plot_focal_only = FALSE` (default), if `TRUE` will only plot CoS with focal trait and ignoring other CoS. -- `plot_focal_cos_outocme_only = FALSE` (default) and **recommend** for visulization for disease prioritized colocalization. +- `plot_focal_cos_outcome_only = FALSE` (default) and **recommend** for visualization for disease prioritized colocalization. If `TRUE` will plot all CoS colocalized with at least on traits within CoS of focal traits. ```{r focal-colocalization} @@ -164,7 +164,7 @@ res <- colocboost(X = X, Y = Y, # Only plot CoS with focal trait colocboost_plot(res, plot_focal_only = TRUE) # Plot all CoS including at least one traits colocalized with focal trait -colocboost_plot(res, plot_focal_cos_outocme_only = TRUE) +colocboost_plot(res, plot_focal_cos_outcome_only = TRUE) ``` diff --git a/vignettes/announcements.Rmd b/vignettes/announcements.Rmd index e5671ae..1e7bf8c 100644 --- a/vignettes/announcements.Rmd +++ b/vignettes/announcements.Rmd @@ -10,7 +10,7 @@ vignette: > ## ColocBoost on the media -- *April 17, 2025*: Manuscript describing ColocBoost methods with applications is posted on [medRxiv](). +- *April 20, 2025*: Manuscript describing ColocBoost methods with applications is posted on [medRxiv](https://www.medrxiv.org/content/10.1101/2025.04.17.25326042v1). ## Software updates