From 85290cc4cc850a3bb04808177b7e50fb31de2703 Mon Sep 17 00:00:00 2001 From: xuewei cao <36172337+xueweic@users.noreply.github.com> Date: Thu, 17 Apr 2025 08:14:10 -0400 Subject: [PATCH 1/3] cran --- README.md | 3 ++- vignettes/installation.Rmd | 2 +- 2 files changed, 3 insertions(+), 2 deletions(-) diff --git a/README.md b/README.md index e36bbf4..700796a 100644 --- a/README.md +++ b/README.md @@ -8,9 +8,10 @@ This R package implements ColocBoost---motivated and designed for colocalization ## Quick Start -### CRAN Installation +### CRAN Installation (available soon) Install released versions from CRAN (Linux, macOS and Windows) + ```r install.packages("colocboost") ``` diff --git a/vignettes/installation.Rmd b/vignettes/installation.Rmd index 81ba77c..d83085b 100644 --- a/vignettes/installation.Rmd +++ b/vignettes/installation.Rmd @@ -16,7 +16,7 @@ knitr::opts_chunk$set( ## Installation -### CRAN +### CRAN (available soon) Install released versions from CRAN (Linux, macOS and Windows) ```r From ac2ae47d6c92d28036c1738d11d10826c1e67905 Mon Sep 17 00:00:00 2001 From: xuewei cao <36172337+xueweic@users.noreply.github.com> Date: Thu, 17 Apr 2025 13:21:45 -0400 Subject: [PATCH 2/3] add one feature for superset X --- DESCRIPTION | 3 +- R/colocboost.R | 24 +-- R/colocboost_init.R | 181 ++++++++++++------ man/colocboost.Rd | 14 +- man/colocboost_plot.Rd | 15 +- man/get_cormat.Rd | 4 +- man/get_cos.Rd | 14 +- man/get_cos_summary.Rd | 14 +- man/get_strong_colocalization.Rd | 14 +- vignettes/Individual_Level_Colocalization.Rmd | 41 +++- vignettes/Input_Data_Format.Rmd | 49 ++++- .../Summary_Statistics_Colocalization.Rmd | 4 +- 12 files changed, 250 insertions(+), 127 deletions(-) diff --git a/DESCRIPTION b/DESCRIPTION index 72ab83e..b5787c3 100644 --- a/DESCRIPTION +++ b/DESCRIPTION @@ -28,9 +28,10 @@ Suggests: testthat (>= 3.0.0), knitr, rmarkdown, + bookdown, ashr, MASS -VignetteBuilder: knitr +VignetteBuilder: knitr, bookdown Roxygen: list(markdown = TRUE) Config/testthat/edition: 3 License: MIT + file LICENSE diff --git a/R/colocboost.R b/R/colocboost.R index e9a746b..e88ba5d 100644 --- a/R/colocboost.R +++ b/R/colocboost.R @@ -191,15 +191,9 @@ colocboost <- function(X = NULL, Y = NULL, # individual data # - check individual level data if (!is.null(X) & !is.null(Y)) { # --- check input - if (is.data.frame(X)) { - X <- as.matrix(X) - } - if (is.data.frame(Y)) { - Y <- as.matrix(Y) - } - if (is.matrix(X)) { - X <- list(X) - } + if (is.data.frame(X)) X <- as.matrix(X) + if (is.data.frame(Y)) Y <- as.matrix(Y) + if (is.matrix(X)) X <- list(X) if (is.atomic(Y) && !is.list(Y)) { Y <- as.matrix(Y) if (ncol(Y) == 1) { @@ -216,23 +210,17 @@ colocboost <- function(X = NULL, Y = NULL, # individual data } } else { Y <- lapply(1:length(Y), function(ii) { - if (is.null(dict_YX)) { - idx <- ii - } else { - idx <- dict_YX[ii, 2] - } - n <- nrow(X[[idx]]) y <- Y[[ii]] y <- as.matrix(y) + n <- length(y) if (nrow(y) == n) { return(y) } else if (ncol(y) == n) { return(t(y)) - } else { - stop("X and Y do not have the same sample size!") - } + } }) } + # --- check if variables in individual data p.ind <- unique(sapply(X, ncol)) if (length(p.ind) != 1) { diff --git a/R/colocboost_init.R b/R/colocboost_init.R index 0ee9887..6368dfd 100644 --- a/R/colocboost_init.R +++ b/R/colocboost_init.R @@ -68,69 +68,18 @@ colocboost_init_data <- function(X, Y, dict_YX, flag <- 1 # if individual: X, Y if (!is.null(X) & !is.null(Y)) { - drop_lowfreq <- c() - dict_YX_final <- dict_YX - for (ij in 1:length(X)) { - index <- which(dict_YX == ij) - dict_YX_final[index] <- index[1] - } + + ind_formated <- process_individual_data( + X, Y, dict_YX, target_variants = keep_variable_names, + intercept = intercept, standardize = standardize + ) for (i in 1:length(Y)) { - tmp <- list( - "X" = NULL, - "Y" = scale(Y[[i]], center = intercept, scale = standardize), - "N" = length(Y[[i]]), - "variable_miss" = NULL - ) - x_tmp <- X[[dict_YX[i]]] - change_x <- if (dict_YX_final[i] == i) TRUE else FALSE - # - if sample different - if (nrow(x_tmp) != length(Y[[i]])) { - change_x <- TRUE - ind_id_Y <- rownames(Y[[i]]) - ind_id_X <- rownames(x_tmp) - if (is.null(ind_id_X) | is.null(ind_id_Y)) { - stop("Please provide the sample index of X and Y, since they do not have the same samples!") - } else { - pos <- match(ind_id_Y, ind_id_X) - x_tmp <- x_tmp[pos, , drop = FALSE] - if (sum(is.na(pos)) != 0) { - tmp$Y <- tmp$Y[-which(is.na(pos))] - } - } - } - # - if missing X - variable.name <- keep_variables[[dict_YX[i]]] - if (length(variable.name) != length(keep_variable_names)) { - x_extend <- matrix(0, - nrow = nrow(x_tmp), ncol = length(keep_variable_names), - dimnames = list(rownames(x_tmp), keep_variable_names) - ) - variable.tmp <- intersect(keep_variable_names, variable.name) - pos <- match(variable.tmp, keep_variable_names) - tmp$variable_miss <- setdiff(1:length(keep_variable_names), pos) - poss <- match(variable.tmp, variable.name) - x_extend[, pos] <- x_tmp[, poss] - x_tmp <- x_extend - } - if (change_x) { - dict_YX_final[i] == i - if (!intercept & !standardize) { - x_stand <- x_tmp - } else { - x_stand <- Rfast::standardise(x_tmp, center = intercept, scale = standardize) - } - x_stand[which(is.na(x_stand))] <- 0 - tmp$X <- x_stand - } - cb_data$data[[flag]] <- tmp + cb_data$data[[flag]] <- ind_formated$result[[i]] names(cb_data$data)[flag] <- paste0("ind_outcome_", i) flag <- flag + 1 } - cb_data$dict <- c(dict_YX_final) - ind_idx <- max(dict_YX) - } else { - ind_idx <- 0 - } + cb_data$dict <- c(ind_formated$new_dict) + } n_ind <- flag - 1 # if summary: XtX XtY, YtY if (!is.null(Z) & !is.null(LD)) { @@ -649,3 +598,117 @@ process_sumstat <- function(Z, N, Var_y, SeBhat, ld_matrices, variant_lists, dic original_dict = dict )) } + +#' process individual level input format +#' @noRd +process_individual_data <- function(X, Y, dict_YX, target_variants, + intercept = TRUE, + standardize = TRUE) { + + # Step 0: Check if sample IDs match between Y and corresponding X + for (i in 1:length(Y)) { + current_matrix_type <- dict_YX[i] + # If row counts match, we assume samples are in the same order + if (nrow(Y[[i]]) != nrow(X[[current_matrix_type]])) { + # Row counts don't match, so check rownames + ind_id_Y <- rownames(Y[[i]]) + ind_id_X <- rownames(X[[current_matrix_type]]) + if (is.null(ind_id_X) || is.null(ind_id_Y)) { + stop("Please provide the sample index of X and Y, since they do not have the same samples!") + } + # Find matching samples + pos <- match(ind_id_Y, ind_id_X) + if (sum(!is.na(pos)) == 0) { + stop("No samples in Y match any samples in the corresponding X matrix!") + } + } + } + + # Step 1: Update dictionary to handle duplicates samples + unique_count <- 0 + new_dict <- integer(length(dict_YX)) + sample_lists <- lapply(Y, rownames) + for (i in 1:length(sample_lists)) { + # Check if we've seen this list before with the same matrix type + found <- FALSE + current_matrix_type <- dict_YX[i] + # Only check previous lists if i > 1 + if (i > 1) { + for (j in 1:(i-1)) { + # Only compare if they're from the same matrix + if (dict_YX[j] == current_matrix_type && + identical(sample_lists[[i]], sample_lists[[j]])) { + new_dict[i] <- new_dict[j] + found <- TRUE + break + } + } + } + # If not seen before, assign new unique index + if (!found) { + unique_count <- unique_count + 1 + new_dict[i] <- unique_count + } + } + + # Step 2: Create result list + result <- list() + + for (i in 1:length(Y)) { + tmp <- list( + "X" = NULL, + "Y" = NULL, + "N" = NULL, + "variable_miss" = NULL + ) + matrix_type <- dict_YX[i] + # Get the appropriate matrix from X list + current_X <- X[[matrix_type]] + current_Y <- Y[[i]] + # Check if we need to match samples or if we can use as-is + if (nrow(current_Y) == nrow(current_X)) { + # Same number of rows, assume same order + matched_X <- current_X + matched_Y <- scale(current_Y, center = intercept, scale = standardize) + } else { + # Different number of rows, find matching samples + overlap_samples <- intersect(rownames(current_Y), rownames(current_X)) + matched_X <- current_X[match(overlap_samples, rownames(current_X)), , drop = FALSE] + matched_Y <- current_Y[match(overlap_samples, rownames(current_Y)), , drop = FALSE] + matched_Y <- scale(matched_Y, center = intercept, scale = standardize) + } + tmp$Y <- matched_Y + tmp$N <- length(matched_Y) + + # - if missing X + variable.name <- colnames(matched_X) + if (length(variable.name) != length(target_variants)) { + x_extend <- matrix(0, nrow = nrow(matched_X), ncol = length(target_variants), + dimnames = list(rownames(matched_X), target_variants) ) + variable.tmp <- intersect(target_variants, variable.name) + pos <- match(variable.tmp, target_variants) + tmp$variable_miss <- setdiff(1:length(target_variants), pos) + poss <- match(variable.tmp, variable.name) + x_extend[, pos] <- matched_X[, poss] + matched_X <- x_extend + } + if (new_dict[i] == i) { + if (!intercept & !standardize) { + x_stand <- matched_X + } else { + x_stand <- Rfast::standardise(matched_X, center = intercept, scale = standardize) + } + x_stand[which(is.na(x_stand))] <- 0 + tmp$X <- x_stand + } + + # Create components for each list + result[[i]] <- tmp + } + # Return results with the unified dictionary + return(list( + result = result, + new_dict = new_dict, + original_dict = dict_YX + )) +} diff --git a/man/colocboost.Rd b/man/colocboost.Rd index 11ca213..28c9d4f 100644 --- a/man/colocboost.Rd +++ b/man/colocboost.Rd @@ -213,20 +213,22 @@ that may arise from small sample sizes or discrepancies in minor allele frequenc \examples{ # colocboost example set.seed(1) -N = 1000 -P = 100 +N <- 1000 +P <- 100 # Generate X with LD structure sigma <- 0.9^abs(outer(1:P, 1:P, "-")) X <- MASS::mvrnorm(N, rep(0, P), sigma) colnames(X) <- paste0("SNP", 1:P) -L = 3 +L <- 3 true_beta <- matrix(0, P, L) -true_beta[10, 1] <- 0.5 # SNP10 affects trait 1 -true_beta[10, 2] <- 0.4 # SNP10 also affects trait 2 (colocalized) +true_beta[10, 1] <- 0.5 # SNP10 affects trait 1 +true_beta[10, 2] <- 0.4 # SNP10 also affects trait 2 (colocalized) true_beta[50, 2] <- 0.3 # SNP50 only affects trait 2 true_beta[80, 3] <- 0.6 # SNP80 only affects trait 3 Y <- matrix(0, N, L) -for (l in 1:L){ Y[, l] <- X \%*\% true_beta[, l] + rnorm(N, 0, 1) } +for (l in 1:L) { + Y[, l] <- X \%*\% true_beta[, l] + rnorm(N, 0, 1) +} res <- colocboost(X = X, Y = Y) res$cos_details$cos$cos_index diff --git a/man/colocboost_plot.Rd b/man/colocboost_plot.Rd index 0c74a54..bff458a 100644 --- a/man/colocboost_plot.Rd +++ b/man/colocboost_plot.Rd @@ -98,24 +98,25 @@ Visualization plot for each colcoalization event. \examples{ # colocboost example set.seed(1) -N = 1000 -P = 100 +N <- 1000 +P <- 100 # Generate X with LD structure sigma <- 0.9^abs(outer(1:P, 1:P, "-")) X <- MASS::mvrnorm(N, rep(0, P), sigma) colnames(X) <- paste0("SNP", 1:P) -L = 3 +L <- 3 true_beta <- matrix(0, P, L) -true_beta[10, 1] <- 0.5 # SNP10 affects trait 1 -true_beta[10, 2] <- 0.4 # SNP10 also affects trait 2 (colocalized) +true_beta[10, 1] <- 0.5 # SNP10 affects trait 1 +true_beta[10, 2] <- 0.4 # SNP10 also affects trait 2 (colocalized) true_beta[50, 2] <- 0.3 # SNP50 only affects trait 2 true_beta[80, 3] <- 0.6 # SNP80 only affects trait 3 Y <- matrix(0, N, L) -for (l in 1:L){ Y[, l] <- X \%*\% true_beta[, l] + rnorm(N, 0, 1) } +for (l in 1:L) { + Y[, l] <- X \%*\% true_beta[, l] + rnorm(N, 0, 1) +} res <- colocboost(X = X, Y = Y) colocboost_plot(res, plot_cols = 1) colocboost_plot(res, plot_cols = 1, outcome_idx = 1:3) - } \concept{colocboost_plot} diff --git a/man/get_cormat.Rd b/man/get_cormat.Rd index f01aed9..e089c1c 100644 --- a/man/get_cormat.Rd +++ b/man/get_cormat.Rd @@ -20,8 +20,8 @@ This function calculates the correlation matrix (LD matrix) from individual leve \examples{ # colocboost example set.seed(1) -N = 1000 -P = 100 +N <- 1000 +P <- 100 # Generate X with LD structure sigma <- 0.9^abs(outer(1:P, 1:P, "-")) X <- MASS::mvrnorm(N, rep(0, P), sigma) diff --git a/man/get_cos.Rd b/man/get_cos.Rd index 69f3764..b4a94d0 100644 --- a/man/get_cos.Rd +++ b/man/get_cos.Rd @@ -20,20 +20,22 @@ A list of indices of variables in each CoS. \examples{ # colocboost example set.seed(1) -N = 1000 -P = 100 +N <- 1000 +P <- 100 # Generate X with LD structure sigma <- 0.9^abs(outer(1:P, 1:P, "-")) X <- MASS::mvrnorm(N, rep(0, P), sigma) colnames(X) <- paste0("SNP", 1:P) -L = 3 +L <- 3 true_beta <- matrix(0, P, L) -true_beta[10, 1] <- 0.5 # SNP10 affects trait 1 -true_beta[10, 2] <- 0.4 # SNP10 also affects trait 2 (colocalized) +true_beta[10, 1] <- 0.5 # SNP10 affects trait 1 +true_beta[10, 2] <- 0.4 # SNP10 also affects trait 2 (colocalized) true_beta[50, 2] <- 0.3 # SNP50 only affects trait 2 true_beta[80, 3] <- 0.6 # SNP80 only affects trait 3 Y <- matrix(0, N, L) -for (l in 1:L){ Y[, l] <- X \%*\% true_beta[, l] + rnorm(N, 0, 1) } +for (l in 1:L) { + Y[, l] <- X \%*\% true_beta[, l] + rnorm(N, 0, 1) +} res <- colocboost(X = X, Y = Y) get_cos(res, coverage = 0.75) diff --git a/man/get_cos_summary.Rd b/man/get_cos_summary.Rd index 4e4197b..73e4cf9 100644 --- a/man/get_cos_summary.Rd +++ b/man/get_cos_summary.Rd @@ -41,20 +41,22 @@ A summary table for colocalization events with the following columns: \examples{ # colocboost example set.seed(1) -N = 1000 -P = 100 +N <- 1000 +P <- 100 # Generate X with LD structure sigma <- 0.9^abs(outer(1:P, 1:P, "-")) X <- MASS::mvrnorm(N, rep(0, P), sigma) colnames(X) <- paste0("SNP", 1:P) -L = 3 +L <- 3 true_beta <- matrix(0, P, L) -true_beta[10, 1] <- 0.5 # SNP10 affects trait 1 -true_beta[10, 2] <- 0.4 # SNP10 also affects trait 2 (colocalized) +true_beta[10, 1] <- 0.5 # SNP10 affects trait 1 +true_beta[10, 2] <- 0.4 # SNP10 also affects trait 2 (colocalized) true_beta[50, 2] <- 0.3 # SNP50 only affects trait 2 true_beta[80, 3] <- 0.6 # SNP80 only affects trait 3 Y <- matrix(0, N, L) -for (l in 1:L){ Y[, l] <- X \%*\% true_beta[, l] + rnorm(N, 0, 1) } +for (l in 1:L) { + Y[, l] <- X \%*\% true_beta[, l] + rnorm(N, 0, 1) +} res <- colocboost(X = X, Y = Y) get_cos_summary(res) diff --git a/man/get_strong_colocalization.Rd b/man/get_strong_colocalization.Rd index 0d43f37..5bc2c93 100644 --- a/man/get_strong_colocalization.Rd +++ b/man/get_strong_colocalization.Rd @@ -41,20 +41,22 @@ A \code{"colocboost"} object with some or all of the following elements: \examples{ # colocboost example set.seed(1) -N = 1000 -P = 100 +N <- 1000 +P <- 100 # Generate X with LD structure sigma <- 0.9^abs(outer(1:P, 1:P, "-")) X <- MASS::mvrnorm(N, rep(0, P), sigma) colnames(X) <- paste0("SNP", 1:P) -L = 3 +L <- 3 true_beta <- matrix(0, P, L) -true_beta[10, 1] <- 0.5 # SNP10 affects trait 1 -true_beta[10, 2] <- 0.4 # SNP10 also affects trait 2 (colocalized) +true_beta[10, 1] <- 0.5 # SNP10 affects trait 1 +true_beta[10, 2] <- 0.4 # SNP10 also affects trait 2 (colocalized) true_beta[50, 2] <- 0.3 # SNP50 only affects trait 2 true_beta[80, 3] <- 0.6 # SNP80 only affects trait 3 Y <- matrix(0, N, L) -for (l in 1:L){ Y[, l] <- X \%*\% true_beta[, l] + rnorm(N, 0, 1) } +for (l in 1:L) { + Y[, l] <- X \%*\% true_beta[, l] + rnorm(N, 0, 1) +} res <- colocboost(X = X, Y = Y) res$cos_details$cos$cos_index filter_res <- get_strong_colocalization(res, cos_npc_cutoff = 0.5, npc_outcome_cutoff = 0.2) diff --git a/vignettes/Individual_Level_Colocalization.Rmd b/vignettes/Individual_Level_Colocalization.Rmd index 2f38a96..74a89a4 100644 --- a/vignettes/Individual_Level_Colocalization.Rmd +++ b/vignettes/Individual_Level_Colocalization.Rmd @@ -87,7 +87,9 @@ colocboost_plot(res) ### Results Interpretation -For comprehensive tutorials on result interpretation and advanced visualization techniques, please visit our documentation portal at FIXME (link). +For comprehensive tutorials on result interpretation and advanced visualization techniques, please visit our documentation portal +at [Visualization of ColocBoost Results](https://statfungen.github.io/colocboost/articles/Visualization_ColocBoost_Output.html) and +[Interpret ColocBoost Output](https://statfungen.github.io/colocboost/articles/Interpret_ColocBoost_Output.html). # 3. Other structures of individual level data @@ -100,21 +102,48 @@ This is particularly useful when the same individuals are used for different tra - **Input Format**: - `X` is a single matrix containing genotype data for all individuals. - - `Y` can be i) a matrix with N * L dimension; ii) a list of phenotype vectors for L traits. + - `Y` can be i) a matrix with $N \times L$ dimension; ii) a list of phenotype vectors for $L$ traits. ```{r single-x} # Extract a single SNP (as a vector) -X_single <- X[[1]] # First SNP for all individuals +# X_single <- X[[1]] # First SNP for all individuals # Run colocboost -res <- colocboost(X = X, Y = Y) +# res <- colocboost(X = X_single, Y = Y) # Identified CoS -res$cos_details$cos$cos_index +# res$cos_details$cos$cos_index +``` + + +## 3.2. Genotype matrix is a superset of samples across different phenotypes + +When the LD matrix includes a superset of variants across different summary statistics, with **Input Format**: + +- `sumstat` is a list of data.frames for all traits +- `LD` is a matrix of linkage disequilibrium (LD) information for all variants across all traits. +- The LD matrix should contain all variants present in the summary statistics data frames. +- This is particularly useful when you have a large LD matrix from a reference panel and want to use it for multiple summary statistics datasets. +It allows for efficient analysis without redundancy. + + +```{r superset-X} +# Create sumstat with different number of variants - remove 100 variants in each sumstat +# LD_superset <- LD +# sumstat <- lapply(Sumstat_5traits$sumstat, function(x) x[-sample(1:nrow(x), 100), , drop = FALSE]) + +# Run colocboost +# res <- colocboost(sumstat = sumstat, LD = LD_superset) + +# Identified CoS +# res$cos_details$cos$cos_index ``` -## 3.2. Arbitrary input matrices with mapping provided + + + +## 3.3. Arbitrary input matrices with mapping provided When studying multiple traits with arbitrary genotype matrices for different traits, we also provide the interface for arbitrary genotype matrices with multiple phenotypes. diff --git a/vignettes/Input_Data_Format.Rmd b/vignettes/Input_Data_Format.Rmd index 39ba734..e652281 100644 --- a/vignettes/Input_Data_Format.Rmd +++ b/vignettes/Input_Data_Format.Rmd @@ -2,7 +2,7 @@ title: "Input Data Format" output: bookdown::html_document2: - base_format: rmarkdown::html_vignette + # base_format: rmarkdown::html_vignette toc: true toc_depth: 2 number_sections: true @@ -26,7 +26,7 @@ library(colocboost) This vignette documents the standard input data formats of `colocboost`. -## 1. Individual Level Data +# 1. Individual Level Data For analyses using individual-level data, the basic format for single trait is as follows: @@ -43,11 +43,11 @@ For example: `colocboost` also offers flexible input options (see detailed usage with different input formats, refer to [Individual Level Data Colocalization](https://statfungen.github.io/colocboost/articles/Individual_Level_Colocalization.html).): -- Single X matrix with N * P with Y matrix with N * L for L traits. +- Single X matrix with $N \times P$ with Y matrix with $N \times L$ for $L$ traits. - Multiple X matrices and unmatched Y vectors with a mapping dictionary. -## 2. Summary Statistics +# 2. Summary Statistics For analyses using summary statistics, the basic format for single trait is as follows: @@ -66,7 +66,7 @@ head(Sumstat_5traits$sumstat[[1]]) The input format for multiple traits is similar, but `sumstat` should be a list of data frames `sumstat = list(sumstat1, sumstat2, sumstat3)`. The flexibility of input format for multiple traits is as follows (see detailed usage with different input formats, -refer to [Summary Statistics Colocalization](https://statfungen.github.io/colocboost/articles/Summary_Level_Colocalization.html).):: +refer to [Summary Statistics Colocalization](https://statfungen.github.io/colocboost/articles/Summary_Level_Colocalization.html).): - One LD matrix with a superset of variants in `sumstat` for all traits is allowed. - Multiple LD matrices, each corresponding to a different trait, are also allowed for the trait-specific LD structure. @@ -74,10 +74,41 @@ refer to [Summary Statistics Colocalization](https://statfungen.github.io/colocb -## 3. Optional: mapping between arbitary input $X$ and $Y$ +# 3. Optional: mapping between arbitary input $X$ and $Y$ -TO-DO-LIST +For analysis when including multiple genotype matrices `X` with unmatched arbitary phenotype vectors `Y`, +a mapping dictionary `dict_YX` is required to indicate the relationship between `X` and `Y`. +Similiarly, when multiple LD matrices with unmatched arbitrary multiple summary statistics `sumstat` are used, +a mapping dictionary `dict_sumstatLD` is required to indicate the relationship between `sumstat` and `LD`. -## 4. Hyprcoloc compatible format: effect size and standard error matrices +For example, considering three genotype matrices `X = list(X1, X2, X3)` and 6 phenotype vectors `Y = list(Y1, Y2, Y3, Y4, Y5, Y6)`, where + +- `X1` is for trait 1, trait 2, trait 3 +- `X2` is for trait 4, trait 5 +- `X3` is for trait 6 + +Then, you need to define a 6 by 2 matrix mapping dictionary `dict_YX` as follows: + +- The first column should be `c(1,2,3,4,5,6)` for 6 traits. +- The second column should be `c(1,1,1,2,2,3)` for 3 genotype matrices. + +Here, each row indicates the trait index and the corresponding genotype matrix index. + +```{r dict_YX} +dict_YX <- cbind(c(1,2,3,4,5,6), c(1,1,1,2,2,3)) +dict_YX +``` + + +# 4. Hyprcoloc compatible format: effect size and standard error matrices + +ColocBoost also provides a flexibility to use Hyprcoloc compatible format for summary statistics with and without LD matrix. +For example, when anaylze L traits for the same P variants with the specified effect size and standard error matrices: + +- `effect_est` (required) is $P \times L$ matrix of variable regression coefficients (i.e. regression beta values) in the genomic region. +- `effect_se` (required) is $P \times L$ matrix of standard errors for the regression coefficients. +- `effect_n` (highly recommended) is either a scalar or a vector of sample sizes for estimating regression coefficients. +- `LD` (optional) is LD matrix for the $P$ variants. If it is not provided, it will apply LD-free ColocBoost. + +See more details about data format to implement LD-free ColocBoost and LD-mistmatch diagnosis in FIXME. -TO-DO-LIST diff --git a/vignettes/Summary_Statistics_Colocalization.Rmd b/vignettes/Summary_Statistics_Colocalization.Rmd index a55e6f9..4d1dbe1 100644 --- a/vignettes/Summary_Statistics_Colocalization.Rmd +++ b/vignettes/Summary_Statistics_Colocalization.Rmd @@ -88,7 +88,9 @@ colocboost_plot(res) ### Results Interpretation -For comprehensive tutorials on result interpretation and advanced visualization techniques, please visit our documentation portal at FIXME (link). +For comprehensive tutorials on result interpretation and advanced visualization techniques, please visit our documentation portal +at [Visualization of ColocBoost Results](https://statfungen.github.io/colocboost/articles/Visualization_ColocBoost_Output.html) and +[Interpret ColocBoost Output](https://statfungen.github.io/colocboost/articles/Interpret_ColocBoost_Output.html). # 3. Other summary statistics and LD input combinations From 531c6e9af37910e5460f3598e54e99af149c766f Mon Sep 17 00:00:00 2001 From: xuewei cao <36172337+xueweic@users.noreply.github.com> Date: Thu, 17 Apr 2025 13:50:21 -0400 Subject: [PATCH 3/3] minor fix --- R/colocboost_init.R | 29 +++++-------------- vignettes/Individual_Level_Colocalization.Rmd | 29 ++++++++++--------- 2 files changed, 24 insertions(+), 34 deletions(-) diff --git a/R/colocboost_init.R b/R/colocboost_init.R index 6368dfd..1223ecd 100644 --- a/R/colocboost_init.R +++ b/R/colocboost_init.R @@ -625,30 +625,17 @@ process_individual_data <- function(X, Y, dict_YX, target_variants, } # Step 1: Update dictionary to handle duplicates samples - unique_count <- 0 - new_dict <- integer(length(dict_YX)) sample_lists <- lapply(Y, rownames) - for (i in 1:length(sample_lists)) { - # Check if we've seen this list before with the same matrix type - found <- FALSE - current_matrix_type <- dict_YX[i] - # Only check previous lists if i > 1 - if (i > 1) { - for (j in 1:(i-1)) { - # Only compare if they're from the same matrix - if (dict_YX[j] == current_matrix_type && - identical(sample_lists[[i]], sample_lists[[j]])) { - new_dict[i] <- new_dict[j] - found <- TRUE - break - } + new_dict <- 1:length(dict_YX) + # For each pair of Y indices + for (i in 1:(length(dict_YX)-1)) { + for (j in (i+1):length(dict_YX)) { + # Check if they map to the same X matrix AND have the same samples + if (dict_YX[i] == dict_YX[j] && identical(sample_lists[[i]], sample_lists[[j]])) { + # If same matrix and same samples, use the smaller index + new_dict[j] <- new_dict[i] } } - # If not seen before, assign new unique index - if (!found) { - unique_count <- unique_count + 1 - new_dict[i] <- unique_count - } } # Step 2: Create result list diff --git a/vignettes/Individual_Level_Colocalization.Rmd b/vignettes/Individual_Level_Colocalization.Rmd index 74a89a4..b2a12f7 100644 --- a/vignettes/Individual_Level_Colocalization.Rmd +++ b/vignettes/Individual_Level_Colocalization.Rmd @@ -107,37 +107,40 @@ This is particularly useful when the same individuals are used for different tra ```{r single-x} # Extract a single SNP (as a vector) -# X_single <- X[[1]] # First SNP for all individuals +X_single <- X[[1]] # First SNP for all individuals # Run colocboost -# res <- colocboost(X = X_single, Y = Y) +res <- colocboost(X = X_single, Y = Y) # Identified CoS -# res$cos_details$cos$cos_index +res$cos_details$cos$cos_index ``` ## 3.2. Genotype matrix is a superset of samples across different phenotypes -When the LD matrix includes a superset of variants across different summary statistics, with **Input Format**: +When the genotype matrix includes a superset of samples across different phenotypes, with **Input Format**: -- `sumstat` is a list of data.frames for all traits -- `LD` is a matrix of linkage disequilibrium (LD) information for all variants across all traits. -- The LD matrix should contain all variants present in the summary statistics data frames. -- This is particularly useful when you have a large LD matrix from a reference panel and want to use it for multiple summary statistics datasets. +- `X` is a matrix of genotype data for all individuals. +- `Y` is a list of phenotype vectors for different outcomes. +- Row names of both `X` and `Y` should be provided to match individuals - same format of individual id. +- It is bettern if `X` contain all samples present in the phenotype vectors (optional). +- This is particularly useful when you have a large genotype matrix and want to use it for multiple phenotypes with different samples. It allows for efficient analysis without redundancy. ```{r superset-X} -# Create sumstat with different number of variants - remove 100 variants in each sumstat -# LD_superset <- LD -# sumstat <- lapply(Sumstat_5traits$sumstat, function(x) x[-sample(1:nrow(x), 100), , drop = FALSE]) +# Create phenotype with different samples - remove 50 samples trait 1 and trait 3. +X_superset <- X[[1]] +Y_remove <- Y +Y_remove[[1]] <- Y[[1]][-sample(1:length(Y[[1]]),50), , drop=F] +Y_remove[[3]] <- Y[[3]][-sample(1:length(Y[[3]]),50), , drop=F] # Run colocboost -# res <- colocboost(sumstat = sumstat, LD = LD_superset) +res <- colocboost(X = X_superset, Y = Y_remove) # Identified CoS -# res$cos_details$cos$cos_index +res$cos_details$cos$cos_index ```