From e81b12c2a726c7144cd5e19c25ffaa62058e01fe Mon Sep 17 00:00:00 2001 From: Daniel Nachun Date: Fri, 17 Apr 2026 01:27:26 -0700 Subject: [PATCH 1/5] remove quantile QTL code --- DESCRIPTION | 3 - NAMESPACE | 8 - R/quail_rank_score.R | 154 ---- R/quail_vqtl.R | 170 ---- R/quantile_twas.R | 451 ----------- R/quantile_twas_weight.R | 1073 -------------------------- R/twas.R | 229 +----- man/QUAIL_pipeline.Rd | 36 - man/QUAIL_rank_score_pipeline.Rd | 31 - man/calculate_xi_correlation.Rd | 27 - man/load_quantile_twas_weights.Rd | 38 - man/multicontext_ld_clumping.Rd | 31 - man/perform_qr_analysis.Rd | 23 - man/qr_screen.Rd | 50 -- man/quantile_twas_weight_pipeline.Rd | 78 -- tests/testthat/test_ctwas.R | 160 ++++ tests/testthat/test_quail_ctwas.R | 374 --------- tests/testthat/test_quail_vqtl.R | 338 -------- tests/testthat/test_quantile_twas.R | 717 ----------------- tests/testthat/test_twas.R | 206 +---- 20 files changed, 186 insertions(+), 4011 deletions(-) delete mode 100644 R/quail_rank_score.R delete mode 100644 R/quail_vqtl.R delete mode 100644 R/quantile_twas.R delete mode 100644 R/quantile_twas_weight.R delete mode 100644 man/QUAIL_pipeline.Rd delete mode 100644 man/QUAIL_rank_score_pipeline.Rd delete mode 100644 man/calculate_xi_correlation.Rd delete mode 100644 man/load_quantile_twas_weights.Rd delete mode 100644 man/multicontext_ld_clumping.Rd delete mode 100644 man/perform_qr_analysis.Rd delete mode 100644 man/qr_screen.Rd delete mode 100644 man/quantile_twas_weight_pipeline.Rd create mode 100644 tests/testthat/test_ctwas.R delete mode 100644 tests/testthat/test_quail_ctwas.R delete mode 100644 tests/testthat/test_quail_vqtl.R delete mode 100644 tests/testthat/test_quantile_twas.R diff --git a/DESCRIPTION b/DESCRIPTION index cae09866..5b28239a 100644 --- a/DESCRIPTION +++ b/DESCRIPTION @@ -55,10 +55,7 @@ Suggests: ncvreg, pgenlibr, qgg, - quadprog, - quantreg, udr, - XICOR, qvalue, rmarkdown, snpStats, diff --git a/NAMESPACE b/NAMESPACE index 3f7f4994..1dc4b07f 100644 --- a/NAMESPACE +++ b/NAMESPACE @@ -1,7 +1,5 @@ # Generated by roxygen2: do not edit by hand -export(QUAIL_pipeline) -export(QUAIL_rank_score_pipeline) export(adjust_susie_weights) export(align_variant_names) export(allele_qc) @@ -15,7 +13,6 @@ export(bayes_c_weights) export(bayes_l_weights) export(bayes_n_weights) export(bayes_r_weights) -export(calculate_xi_correlation) export(check_ld) export(clean_context_names) export(coloc_post_processor) @@ -58,7 +55,6 @@ export(load_multicontext_sumstats) export(load_multitask_regional_data) export(load_multitrait_R_sumstat) export(load_multitrait_tensorqtl_sumstat) -export(load_quantile_twas_weights) export(load_regional_association_data) export(load_regional_functional_data) export(load_regional_multivariate_data) @@ -79,7 +75,6 @@ export(mr_format) export(mrash_weights) export(mrmash_weights) export(mrmash_wrapper) -export(multicontext_ld_clumping) export(multigene_udr) export(multivariate_analysis_pipeline) export(mvsusie_weights) @@ -89,11 +84,8 @@ export(otters_weights) export(parse_cs_corr) export(parse_region) export(parse_variant_id) -export(perform_qr_analysis) export(prs_cs) export(prs_cs_weights) -export(qr_screen) -export(quantile_twas_weight_pipeline) export(raiss) export(read_afreq) export(region_to_df) diff --git a/R/quail_rank_score.R b/R/quail_rank_score.R deleted file mode 100644 index ce72fa90..00000000 --- a/R/quail_rank_score.R +++ /dev/null @@ -1,154 +0,0 @@ -#' Calculate Quantile Rank Score -#' @param pheno numeric vector of phenotype values -#' @param covariates matrix/data.frame of covariates -#' @param tau quantile level -#' @return vector of rank scores -#' @importFrom stats rnorm -#' @noRd -calculate_rank_score <- function(pheno, covariates, tau, seed = 123) { - # Make sure quantreg is installed - if (!requireNamespace("quantreg", quietly = TRUE)) { - stop("To use this function, please install quantreg: https://cran.r-project.org/web/packages/quantreg/index.html") - } - set.seed(seed) - # Add random variable to covariates - covar_data <- data.frame(covariates) - covar_data$d_rv <- rnorm(nrow(covar_data)) - - # Fit quantile regression - Qreg <- suppressWarnings(quantreg::rq(pheno ~ ., data = covar_data, tau = tau, method = "fn")) - - # Get rank score - coeff <- summary(Qreg, se = "ker")$coefficients - SE_d_rv <- coeff[nrow(coeff), 2] - a_i_tau <- (tau - ifelse(residuals(Qreg) < 0, 1, 0)) * SE_d_rv / sqrt(-tau^2 + tau) - - return(a_i_tau) -} - -#' Fit Rank Scores for All Quantile Levels -#' @param pheno numeric vector of phenotype values -#' @param covariates matrix/data.frame of covariates -#' @param num_tau_levels integer number of quantile levels -#' @param num_cores integer number of cores for parallel processing -#' @return list of rank scores for each quantile level -#' @importFrom parallel mclapply -#' @noRd -fit_rank_scores <- function(pheno, covariates, num_tau_levels, num_cores = 1) { - if (num_cores > 1 && requireNamespace("parallel", quietly = TRUE)) { - mclapply(1:num_tau_levels, function(i) { - tau <- i / (num_tau_levels + 1) - calculate_rank_score(pheno, covariates, tau) - }, mc.cores = num_cores) - } else { - lapply(1:num_tau_levels, function(i) { - tau <- i / (num_tau_levels + 1) - calculate_rank_score(pheno, covariates, tau) - }) - } -} - -#' Calculate Integrated Rank Score -#' @param rank_scores list of rank scores -#' @param method character "equal" or "ivw" -#' @param num_tau_levels integer number of quantile levels -#' @return vector of integrated rank scores -#' @importFrom stats cov -#' @noRd -### calculate_integrated_score considering odds and even number tau -calculate_integrated_score <- function(rank_scores, method = "equal", num_tau_levels) { - if (!requireNamespace("quadprog", quietly = TRUE)) { - stop("To use this function, please install quadprog: https://cran.r-project.org/web/packages/quadprog/index.html") - } - if (num_tau_levels %% 2 == 0) { # even - mid_point <- num_tau_levels / 2 - lower_half <- 1:mid_point - upper_half <- (mid_point + 1):num_tau_levels - n_pairs <- mid_point - } else { # odds - mid_point <- ceiling(num_tau_levels / 2) - lower_half <- 1:(mid_point - 1) - middle <- mid_point - upper_half <- (mid_point + 1):num_tau_levels - n_pairs <- length(lower_half) - } - - if (method == "equal") { - int_rank_score <- 0 - if (num_tau_levels %% 2 == 0) { - # even number tau - for (i in 1:num_tau_levels) { - weight <- ifelse(i > mid_point, 1, -1) - int_rank_score <- int_rank_score + weight * rank_scores[[i]] - } - } else { - # odds number tau - for (i in lower_half) int_rank_score <- int_rank_score - rank_scores[[i]] - # int_rank_score <- int_rank_score + 0 * rank_scores[[middle]] - for (i in upper_half) int_rank_score <- int_rank_score + rank_scores[[i]] - } - } else if (method == "ivw") { - a_i_tau_diff_matrix <- matrix(0, nrow = length(rank_scores[[1]]), ncol = n_pairs) - - for (i in 1:n_pairs) { - if (num_tau_levels %% 2 == 0) { - a_i_tau_diff_matrix[, i] <- rank_scores[[upper_half[i]]] - rank_scores[[lower_half[i]]] - } else { - a_i_tau_diff_matrix[, i] <- rank_scores[[upper_half[i]]] - rank_scores[[lower_half[i]]] - } - } - - # Optimization steps - Y_QI_Var_Cov <- cov(a_i_tau_diff_matrix) - Dmat <- Y_QI_Var_Cov - dvec <- rep(0, n_pairs) - Amat <- cbind(rep(1, n_pairs), diag(n_pairs)) - bvec <- c(1, rep(0, n_pairs)) - - qp <- quadprog::solve.QP(Dmat, dvec, t(Amat), bvec, meq = 1) - weights_vec <- qp$solution - - int_rank_score <- 0 - for (i in 1:n_pairs) { - int_rank_score <- int_rank_score + weights_vec[i] * - (rank_scores[[upper_half[i]]] - rank_scores[[lower_half[i]]]) - } - } - - return(int_rank_score / n_pairs) -} - -#' Main QUAIL Rank Score Pipeline -#' @param phenotype numeric vector of phenotype values -#' @param covariates matrix/data.frame of covariates -#' @param num_tau_levels integer number of quantile levels -#' @param method character "equal" or "ivw" -#' @param num_cores integer number of cores for parallel processing -#' @return data.frame with integrated rank scores -#' @export -QUAIL_rank_score_pipeline <- function(phenotype, covariates, - num_tau_levels = 19, - method = "equal", - num_cores = 1) { - # Input validation - start_time <- Sys.time() - # Calculate rank scores for all quantile levels - if (!is.null(phenotype)) { - if (!is.numeric(phenotype)) { - if (is.data.frame(phenotype) || is.matrix(phenotype)) { - phenotype <- as.numeric(phenotype[[1]]) - } else { - stop("phenotype must be a numeric vector.") - } - } - } - - rank_scores <- fit_rank_scores(phenotype, covariates, num_tau_levels, num_cores) - - # Calculate integrated rank score - int_rank_score <- calculate_integrated_score(rank_scores, method, num_tau_levels) - end_time <- Sys.time() - cat("\nTotal vQTL score runtime:", round(difftime(end_time, start_time, units = "secs"), 2), "seconds\n") - - return(int_rank_score) -} diff --git a/R/quail_vqtl.R b/R/quail_vqtl.R deleted file mode 100644 index 8f27d413..00000000 --- a/R/quail_vqtl.R +++ /dev/null @@ -1,170 +0,0 @@ -#' Perform Univariate Regression for Genotype Data -#' -#' @param X Numeric matrix of genotypes (n x p), where n is the number of samples and p is the number of SNPs. -#' @param y Numeric vector of phenotypes (length n). -#' @param Z Optional numeric matrix of covariates (n x k), where k is the number of covariates. -#' @param center Logical, whether to center the data (default: TRUE). -#' @param scale Logical, whether to scale the data (default: FALSE). -#' @param return_residuals Logical, whether to return residuals (default: FALSE). -#' @return A list containing regression results: \code{betahat}, \code{sebetahat}, \code{z_scores}, \code{p_values}, and \code{q_values}. -#' @examples -#' X <- matrix(rnorm(1000), 100, 10) -#' y <- rnorm(100) -#' results <- univariate_regression(X, y) -#' @noRd -univariate_regression <- function(X, y, Z = NULL, center = TRUE, - scale = FALSE, return_residuals = FALSE) { - y_na <- which(is.na(y)) - if (length(y_na)) { - X <- X[-y_na, ] - y <- y[-y_na] - } - - if (center) { - y <- y - mean(y) - X <- scale(X, center = TRUE, scale = scale) - } else { - X <- scale(X, center = FALSE, scale = scale) - } - - X[is.nan(X)] <- 0 - - if (!is.null(Z)) { - if (center) { - Z <- scale(Z, center = TRUE, scale = scale) - } - residual_x <- X # Store X before modifying y - y <- .lm.fit(Z, y)$residuals - } else { - residual_x <- X - } - - output <- try(do.call( - rbind, - lapply(1:ncol(X), function(i) { - g <- .lm.fit(cbind(1, X[, i]), y) - return(c(coef(g)[2], calc_stderr(cbind(1, X[, i]), g$residuals)[2])) - }) - ), silent = TRUE) - - # Exception handling - if (inherits(output, "try-error")) { - output <- matrix(0, ncol(X), 2) - for (i in 1:ncol(X)) { - fit <- summary(lm(y ~ X[, i]))$coef - if (nrow(fit) == 2) { - output[i, ] <- as.vector(summary(lm(y ~ X[, i]))$coef[2, 1:2]) - } else { - output[i, ] <- c(0, 0) - } - } - } - - # Calculate z-scores (t-statistics) - z_scores <- output[, 1] / output[, 2] - - # Calculate p-values from z-scores - p_values <- 2 * pnorm(abs(z_scores), lower.tail = FALSE) - - # Calculate q-values using the provided function - q_values <- compute_qvalues(p_values) - - # Use residual_x's column names as the column names for the results - rownames(output) <- colnames(residual_x) - - result_list <- list( - betahat = setNames(output[, 1], rownames(output)), - sebetahat = setNames(output[, 2], rownames(output)), - z_scores = setNames(z_scores, rownames(output)), - p_values = setNames(p_values, rownames(output)), - q_values = setNames(q_values, rownames(output)) - ) - - if (return_residuals && !is.null(Z)) { - result_list$residuals <- y - } - - return(result_list) -} - - -#' Perform Linear Regression for GWAS -#' -#' @param genotype Numeric matrix of genotypes (n x p), where n is the number of samples and p is the number of SNPs. -#' @param phenotype Numeric vector of phenotypes (length n). -#' @param covariates Optional numeric matrix of covariates (n x k), where k is the number of covariates. -#' @return A data frame containing regression results for each SNP, including \code{BETA}, \code{SE}, \code{Z}, \code{P}, and \code{Q}. -#' @examples -#' genotype <- matrix(rbinom(1000 * 10, 2, 0.3), 1000, 10) -#' phenotype <- rnorm(1000) -#' results <- run_linear_regression1(genotype, phenotype) -#' @noRd -run_linear_regression <- function(genotype, phenotype, covariates = NULL, phenotype_id = NULL) { - if (!is.null(covariates)) { - covariates <- as.data.frame(lapply(covariates, as.numeric)) - } - - reg_results <- univariate_regression( - X = genotype, - y = phenotype, - Z = covariates, - center = TRUE, - scale = FALSE - ) - - snp_info <- parse_variant_id(colnames(genotype)) - - data.frame( - phenotype_id = if (!is.null(phenotype_id)) phenotype_id else NA, - chr = snp_info$chrom, - pos = snp_info$pos, - A1 = snp_info$A1, - A2 = snp_info$A2, - variant_id = colnames(genotype), - beta = reg_results$betahat, - se = reg_results$sebetahat, - z = reg_results$z_scores, - p = reg_results$p_values, - q = reg_results$q_values, - N = nrow(genotype) - ) -} - -#' Main QUAIL pipeline -#' QUAIL vQTL Analysis Pipeline -#' -#' @param genotype numeric matrix (n x p) of genotypes. -#' @param rank_score numeric vector (n x 1) of rank scores from Step 1. -#' @param phenotype optional numeric vector (n x 1) of original phenotype values. -#' @param covariates optional numeric matrix (n x k) of covariates. -#' @return A data frame containing vQTL results. -#' @export -#' @examples -#' \dontrun{ -#' results <- QUAIL_pipeline(genotype, rank_score, covariates = covariates) -#' } -QUAIL_pipeline <- function(genotype, rank_score, phenotype = NULL, - covariates = NULL, phenotype_id = NULL) { - start_time <- Sys.time() - - # Validate rank_score - if (!is.numeric(rank_score)) { - stop("rank_score must be a numeric vector.") - } - - # Validate covariates - if (!is.null(covariates)) { - if (!is.data.frame(covariates)) { - covariates <- as.data.frame(covariates) - } - covariates <- as.data.frame(lapply(covariates, as.numeric)) - } - - # Perform vQTL analysis - vqtl_results <- run_linear_regression(genotype, rank_score, covariates, phenotype_id = phenotype_id) - - end_time <- Sys.time() - cat("\nTotal vQTL runtime:", round(difftime(end_time, start_time, units = "secs"), 2), " seconds\n") - - return(vqtl_results) -} diff --git a/R/quantile_twas.R b/R/quantile_twas.R deleted file mode 100644 index 96f2287f..00000000 --- a/R/quantile_twas.R +++ /dev/null @@ -1,451 +0,0 @@ -#' Get Hierarchical Clusters with Modularity Optimization -#' -#' This function performs hierarchical clustering on a correlation matrix and identifies the optimal number of clusters using modularity optimization. It ensures continuous quantiles for each cluster and computes a modularity score. -#' FIXME: # The initial modularity (Q_modularity_initial) is calculated based on the clustering before we enforce continuous ranges in the clusters. After adjusting the cluster -#' assignments with `ensure_continuous_clusters`, we are not recalculating modularity for the new cluster structure. If required, you should compute the modularity -#' again based on the new cluster structure after ensuring continuous ranges. -#' -#' @param cormat A P x P correlation matrix (numeric matrix) where P is the number of items to be clustered. -#' @param between_cluster A numeric value specifying the correlation threshold for clustering. If all correlations are greater than this threshold, the function will form a single cluster. Default is 0.8. -#' -#' @return A list with the following elements: -#' \item{cluster}{A P x K binary matrix where K is the number of clusters. Each row represents an item, and a value of 1 in column k means that the item is assigned to cluster k.} -#' \item{Q_modularity_initial}{The modularity score based on the initial clustering, before enforcing continuous clusters.} -#' \item{cluster_ranges}{A list of ranges (quantiles) for each cluster.} -#' -#' @examples -#' # Example usage -#' cormat <- matrix(runif(100), nrow = 10) -#' cormat <- cormat %*% t(cormat) # Make it symmetric -#' res <- get_hierarchical_clusters(cormat, between_cluster = 0.8) -#' B <- res$cluster # The binary cluster matrix -#' Q <- res$Q_modularity_initial # The initial modularity score -#' ranges <- res$cluster_ranges # The quantile ranges for each cluster -#' -#' @importFrom stats hclust as.dist cutree -#' -#' @noRd -get_hierarchical_clusters <- function(cormat, between_cluster = 0.8) { - # Perform hierarchical clustering on the correlation matrix - hc <- hclust(as.dist(1 - cormat)) - - # Get the optimal number of clusters - opt_cluster <- get_n_cluster(hc, cormat, between_cluster = between_cluster) - n_cluster <- opt_cluster$n_cluster - Q_modularity_initial <- opt_cluster$Qmodularity # Save the initial modularity score - - # Obtain the initial clusters - index <- cutree(hc, n_cluster) - - # Adjust the clusters to ensure each cluster has continuous quantiles - index <- ensure_continuous_clusters(index) - - # Create the adjacency matrix B based on the adjusted continuous clusters - B <- sapply(1:max(index), function(t) as.numeric(index == t)) - B <- as.matrix(B) - - # Get the range of quantiles for each cluster - cluster_ranges <- get_cluster_ranges(index) - - # Return the cluster structure, initial modularity, and quantile ranges - return(list( - "cluster" = B, - "Q_modularity_initial" = Q_modularity_initial, - "cluster_ranges" = cluster_ranges - )) -} - -#' Ensure Clusters Have Continuous Quantiles -#' -#' This helper function ensures that clusters produced by hierarchical clustering are continuous in terms of quantiles. -#' -#' @param index A numeric vector representing the cluster assignments. -#' @return A numeric vector with adjusted cluster assignments ensuring continuity. -#' @examples -#' index <- c(1, 1, 2, 2, 3) -#' new_index <- ensure_continuous_clusters(index) -#' @noRd -ensure_continuous_clusters <- function(index) { - n <- length(index) - if (n <= 1) return(index) - new_index <- integer(n) - cluster_counter <- 1 - new_index[1] <- cluster_counter - - # Traverse the index to ensure continuous clusters - for (i in 2:n) { - # If the current quantile does not follow the previous one, start a new cluster - if (index[i] != index[i - 1]) { - cluster_counter <- cluster_counter + 1 - } - new_index[i] <- cluster_counter - } - - return(new_index) -} - -#' Get Range of Quantiles for Each Cluster -#' -#' This function returns the range of quantiles for each identified cluster. -#' -#' @param index A numeric vector representing the cluster assignments. -#' @return A list of quantile ranges for each cluster. -#' @examples -#' index <- c(1, 1, 2, 2, 3) -#' cluster_ranges <- get_cluster_ranges(index) -#' @noRd -get_cluster_ranges <- function(index) { - ranges <- list() - cluster_numbers <- unique(index) - - # For each cluster, find the range of quantiles - for (cluster in cluster_numbers) { - members <- which(index == cluster) - ranges[[cluster]] <- paste(min(members), "-", max(members)) - } - - return(ranges) -} - -#' Get Optimal Number of Clusters Using Modularity -#' -#' This function computes the optimal number of clusters using modularity optimization. It iterates over possible numbers of clusters and evaluates each using a modularity score. -#' -#' @param hc An object of class \code{hclust}, resulting from hierarchical clustering. -#' @param Sigma A P x P correlation matrix. -#' @param m The maximum number of clusters to evaluate. Default is the number of columns in \code{Sigma}. -#' @param between_cluster A numeric value specifying the correlation threshold. Default is 0.8. -#' @return A list containing the number of clusters and the modularity score for each. -#' @examples -#' cormat <- matrix(runif(100), nrow = 10) -#' cormat <- cormat %*% t(cormat) # Make it symmetric -#' hc <- hclust(as.dist(1 - cormat)) -#' result <- get_n_cluster(hc, cormat, between_cluster = 0.8) -#' n_cluster <- result$n_cluster -#' @noRd -get_n_cluster <- function(hc, Sigma, m = ncol(Sigma), between_cluster = 0.8) { - # If the minimum correlation is greater than the threshold, use one cluster - if (min(Sigma) > between_cluster) { - IND <- 1 - Q <- 1 - } else { - Q <- c() - if (ncol(Sigma) < 10) { - m <- ncol(Sigma) - } - # Iterate over possible numbers of clusters and compute modularity - for (i in 1:m) { - index <- cutree(hc, i) - B <- sapply(1:i, function(t) as.numeric(index == t)) - Q[i] <- get_modularity(Sigma, B) - } - # Find the number of clusters with the maximum modularity - IND <- which(Q == max(Q)) - L <- length(IND) - if (L > 1) IND <- IND[1] # If multiple solutions, choose the first one - } - return(list("n_cluster" = IND, "Qmodularity" = Q)) -} - -#' Calculate Modularity -#' -#' This function computes the modularity score for a given cluster structure and weight matrix. -#' -#' @param Weight A P x P weight matrix. -#' @param B A binary matrix representing the cluster structure. -#' @return The modularity score (numeric). -#' @examples -#' W <- matrix(runif(100), nrow = 10) -#' B <- matrix(sample(0:1, 100, replace = TRUE), nrow = 10) -#' Q <- get_modularity(W, B) -#' @noRd -get_modularity <- function(Weight, B) { - if (dim(Weight)[1] == 1) { - Q <- 0 - } else { - W_pos <- Weight * (Weight > 0) - W_neg <- Weight * (Weight < 0) - N <- dim(Weight)[1] - K_pos <- colSums(W_pos) - K_neg <- colSums(W_neg) - m_pos <- sum(K_pos) - m_neg <- sum(K_neg) - m <- m_pos + m_neg - cate <- B %*% t(B) - if (m_pos == 0 & m_neg == 0) { - Q <- 0 - } else { - if (m_pos == 0) { - Q_positive <- 0 - Q_negative <- sum((W_neg - K_neg %*% t(K_neg) / m_neg) * cate) / m_neg - } else if (m_neg == 0) { - Q_positive <- sum((W_pos - K_pos %*% t(K_pos) / m_pos) * cate) / m_pos - Q_negative <- 0 - } else { - Q_positive <- sum((W_pos - K_pos %*% t(K_pos) / m_pos) * cate) / m_pos - Q_negative <- sum((W_neg - K_neg %*% t(K_neg) / m_neg) * cate) / m_neg - } - } - Q <- m_pos / m * Q_positive - m_neg / m * Q_negative - } - return(Q) -} - - -#' Perform Grouped Integration for TWAS Weights -#' -#' This function integrates TWAS weights using two methods: correlation-based clustering and fixed tau intervals. The weights are clustered and averaged within each cluster, and the corresponding performance metrics (e.g., pseudo R2) are calculated. -#' -#' @param twas_weight A matrix where rows represent variants and columns represent tau values. Each entry represents the TWAS weight for a specific variant and tau. -#' @param tau_values A numeric vector representing the tau values (quantiles) corresponding to the columns of \code{twas_weight}. Default is a sequence from 0.01 to 0.99. -#' @param pseudo_R2 A numeric vector of pseudo R2 values corresponding to the tau values. -#' @param between_cluster A numeric value specifying the correlation threshold for clustering. If all correlations are greater than this threshold, the function will form a single cluster. Default is 0.8. -#' @param num_intervals The number of fixed non-overlapping intervals to divide the tau values. Default is 3. -#' -#' @return A list containing: -#' \item{weights}{A matrix of integrated TWAS weights for each cluster or interval.} -#' \item{twas_weight_performance}{A list of performance metrics, including quantile ranges and average pseudo R2 values for each cluster/interval.} -#' -#' @examples -#' # Example usage -#' twas_weight <- matrix(runif(100), nrow = 10) -#' tau_values <- seq(0.01, 0.99, length.out = 10) -#' pseudo_R2 <- runif(10) -#' result <- perform_grouped_integration(twas_weight, tau_values, pseudo_R2) -#' weights <- result$weights -#' performance <- result$twas_weight_performance -#' -#' @noRd -perform_grouped_integration <- function(twas_weight, tau_values, pseudo_R2, between_cluster = 0.8, num_intervals = 3) { - variant_id <- rownames(twas_weight) - - if (length(variant_id) > 1) { - # Method 1: Correlation-based clustering (C1, C2, ..., Cn) - cormat <- cor(twas_weight) - res <- get_hierarchical_clusters(cormat, between_cluster = between_cluster) - cluster_C <- res$cluster - n_clusters_C <- ncol(cluster_C) - - integrated_twas_weight_C <- list() - tau_ranges_C <- list() - pseudo_R2_avg_C <- list() - - for (k in 1:n_clusters_C) { - tau_indices_C <- which(cluster_C[, k] == 1) - beta_values_C <- matrix(twas_weight[, tau_indices_C], nrow = nrow(twas_weight)) - rownames(beta_values_C) <- variant_id - tau_subset_C <- tau_values[tau_indices_C] - - integrated_twas_weight_C[[k]] <- integrate_tau(tau_subset_C, beta_values_C) - tau_ranges_C[[k]] <- c(min(tau_subset_C), max(tau_subset_C)) - pseudo_R2_avg_C[[k]] <- mean(pseudo_R2[tau_indices_C]) - } - } else { - # If only one variant_id, skip Method 1 - integrated_twas_weight_C <- list() - tau_ranges_C <- list() - pseudo_R2_avg_C <- list() - n_clusters_C <- 0 # No clusters from Method 1 - } - - # Method 2: Fixed tau intervals (A1, A2, A3) with non-overlapping divisions - tau_cuts <- cut(tau_values, breaks = num_intervals, labels = FALSE, include.lowest = TRUE) - fixed_intervals <- tapply(tau_values, tau_cuts, range) - - integrated_twas_weight_A <- list() - tau_ranges_A <- list() - pseudo_R2_avg_A <- list() - - for (k in 1:num_intervals) { - tau_indices_A <- which(tau_cuts == k) - beta_values_A <- matrix(twas_weight[, tau_indices_A], nrow = nrow(twas_weight)) - rownames(beta_values_A) <- variant_id - tau_subset_A <- tau_values[tau_indices_A] - - integrated_twas_weight_A[[k]] <- integrate_tau(tau_subset_A, beta_values_A) - tau_ranges_A[[k]] <- c(fixed_intervals[[k]][1], fixed_intervals[[k]][2]) - pseudo_R2_avg_A[[k]] <- mean(pseudo_R2[tau_indices_A]) - } - - # Combine the results of Method 2 and Method 1 if Method 1 was applied - combined_weights <- do.call(cbind, c(integrated_twas_weight_A, integrated_twas_weight_C)) - - # Ensure the correct number of column names is provided - total_columns <- ncol(combined_weights) - expected_columns <- num_intervals + n_clusters_C - - if (total_columns == expected_columns) { - colnames(combined_weights) <- c( - paste0("A", 1:num_intervals, "_weights"), - if (n_clusters_C > 0) paste0("C", 1:n_clusters_C, "_weights") else NULL - ) - } else { - stop("Mismatch between the number of columns and the number of intervals/clusters") - } - - rownames(combined_weights) <- variant_id - - twas_weight_performance <- list() - - # Store results for Method 2 (A1, A2, A3) as data frames - for (k in 1:num_intervals) { - twas_weight_performance[[paste0("A", k, "_performance")]] <- data.frame( - "quantile_start" = round(fixed_intervals[[k]][1], 2), - "quantile_end" = round(fixed_intervals[[k]][2], 2), - "pseudo_R2_avg" = pseudo_R2_avg_A[[k]] - ) - } - - # Store results for Method 1 (C1, C2, ..., Cn) as data frames if it was applied - if (n_clusters_C > 0) { - for (k in 1:n_clusters_C) { - twas_weight_performance[[paste0("C", k, "_performance")]] <- data.frame( - "quantile_start" = round(tau_ranges_C[[k]][1], 2), - "quantile_end" = round(tau_ranges_C[[k]][2], 2), - "pseudo_R2_avg" = pseudo_R2_avg_C[[k]] - ) - } - } - - return(list("weights" = combined_weights, "twas_weight_performance" = twas_weight_performance)) -} - - -#' Integrate Tau Across Quantiles -#' -#' This function integrates TWAS weights across a set of tau values (quantiles) using trapezoidal integration. -#' -#' @param tau.temp A numeric vector of tau values. -#' @param a_tau A numeric vector or matrix of TWAS weights corresponding to the tau values. -#' -#' @return A numeric vector of integrated TWAS weights. -#' -#' @examples -#' tau_values <- seq(0.01, 0.99, length.out = 10) -#' a_tau <- matrix(runif(100), nrow = 10) -#' integrated_weights <- integrate_tau(tau_values, a_tau) -#' -#' @noRd -integrate_tau <- function(tau.temp, a_tau) { - tau.temp1 <- c(min(tau.temp) - 0.01, tau.temp, max(tau.temp) + - 0.01) - if (is.matrix(a_tau)) { - a_tau <- cbind(rep(0, nrow(a_tau)), a_tau, rep(0, nrow(a_tau))) - out <- (0.5 * (a_tau[, -ncol(a_tau)] + a_tau[, -1]) %*% - diff(tau.temp1)) - } else { - a_tau <- c(0, a_tau, 0) - out <- (0.5 * (a_tau[-length(a_tau)] + a_tau[-1]) %*% - diff(tau.temp1)) - } - return(out) -} - -#' Load Quantile TWAS Weights -#' -#' This function loads TWAS weights from RDS files and performs grouped integration using both correlation-based clustering and fixed tau intervals. -#' -#' @param weight_db_files A character vector of file paths to the RDS files containing TWAS weights. -#' @param tau_values A numeric vector representing the tau values (quantiles) to use. Default is \code{seq(0.01, 0.99, 0.01)}. -#' @param between_cluster A numeric value specifying the correlation threshold for clustering. Default is 0.8. -#' @param num_intervals The number of fixed non-overlapping intervals to divide the tau values. Default is 3. -#' -#' @return A list containing: -#' \item{weights}{A list of integrated TWAS weights for each context.} -#' \item{twas_cv_performance}{A list of TWAS cross-validation performance metrics for each context.} -#' -#' @examples -#' weight_db_files <- c("file1.rds", "file2.rds") -#' tau_values <- seq(0.01, 0.99, by = 0.01) -#' result <- load_quantile_twas_weights(weight_db_files, tau_values) -#' weights <- result$weights -#' performance <- result$twas_cv_performance -#' -#' @export -load_quantile_twas_weights <- function(weight_db_files, tau_values = seq(0.01, 0.99, 0.01), - between_cluster = 0.8, num_intervals = 3) { - # Internal function to load and validate data from RDS files - load_and_validate_data <- function(weight_db_files) { - all_data <- lapply(weight_db_files, function(rds_file) { - db <- readRDS(rds_file) - - # Extract gene data and check for valid entries - lapply(names(db), function(gene) { - gene_data <- db[[gene]] - lapply(names(gene_data), function(context) { - context_data <- gene_data[[context]] - if (!is.null(context_data$twas_weight) && !is.null(context_data$pseudo_R2)) { - list( - gene = gene, - context = clean_context_names(context, gene), - data = context_data - ) - } else { - message(paste("Warning: Missing twas_weight or pseudo_R2 for", gene, "-", context)) - NULL - } - }) - }) - }) - - result <- do.call(c, unlist(all_data, recursive = FALSE)) - result <- Filter(Negate(is.null), result) # Remove NULL elements - return(result) - } - - # Load and validate the data - combined_data <- load_and_validate_data(weight_db_files) - if (length(combined_data) == 0) { - stop("No valid data found in the provided files.") - } - - # Merge contexts for the same gene from different RDS files - merged_data <- list() - for (item in combined_data) { - gene <- item$gene - context <- item$context - if (is.null(merged_data[[gene]])) { - merged_data[[gene]] <- list() - } - # Ensure we are not duplicating gene entries unnecessarily - if (!is.null(item$data)) { - merged_data[[gene]][[context]] <- item$data - } - } - - # Perform grouped integration for each context and restructure results - results <- list(weights = list(), twas_cv_performance = list()) - - for (gene in names(merged_data)) { - message(paste("Processing gene:", gene)) - - for (context in names(merged_data[[gene]])) { - context_data <- merged_data[[gene]][[context]] - - if (!is.null(context_data$twas_weight) && !is.null(context_data$pseudo_R2)) { - tryCatch( - { - result <- perform_grouped_integration( - context_data$twas_weight, tau_values, - context_data$pseudo_R2, between_cluster, num_intervals - ) - results$weights[[context]] <- result$weights - results$twas_cv_performance[[context]] <- result$twas_weight_performance - }, - error = function(e) { - message(paste("Error in perform_grouped_integration for", gene, "-", context, ":", e$message)) - } - ) - } - } - } - - # Output the final structure - if (length(results$weights) == 0) { - message("Warning: No valid results were generated.") - } else { - message(paste("Number of contexts with valid results:", length(results$weights))) - } - - return(results) -} diff --git a/R/quantile_twas_weight.R b/R/quantile_twas_weight.R deleted file mode 100644 index 537d3d08..00000000 --- a/R/quantile_twas_weight.R +++ /dev/null @@ -1,1073 +0,0 @@ -#' @title Quantile TWAS Weight Calculation and QTL Analysis -#' -#' @description -#' This file contains functions for performing Quantile Transcriptome-Wide -#' Association Studies (TWAS) weight calculations and Quantile QTL analysis. -#' It provides tools for screening quantile regression results, performing -#' LD clumping and pruning, and calculating TWAS weights. -#' -#' @details -#' The main function in this file is `quantile_twas_weight_pipeline`, which -#' orchestrates the entire analysis process. Other functions are helper -#' functions used within the main pipeline. -#' - -#' Qrank Score Test Screen -#' @param X Matrix of predictors -#' @param Y Matrix or vector of response variables -#' @param Z Matrix of covariates (optional) -#' @param tau.list Vector of quantiles to be analyzed -#' @param screen_threshold Significance threshold for adjusted p-values -#' @param screen_method Method for p-value adjustment ('fdr' or 'qvalue') -#' @param top_count Number of top SNPs to select -#' @param top_percent Percentage of top SNPs to select -#' @return A list containing various results from the QR screen -#' @importFrom tidyr separate -#' @importFrom dplyr %>% mutate select -#' @export -qr_screen <- function( - X, Y, Z = NULL, tau.list = seq(0.05, 0.95, by = 0.05), - screen_threshold = 0.05, screen_method = "qvalue", top_count = 10, top_percent = 15) { - # Make sure quantreg is installed - if (!requireNamespace("quantreg", quietly = TRUE)) { - stop("To use this function, please install quantreg: https://cran.r-project.org/web/packages/quantreg/index.html") - } - p <- ncol(X) - pvec <- rep(NA, p) - ltau <- length(tau.list) - quantile.pvalue <- matrix(NA, nrow = p, ncol = ltau, dimnames = list(colnames(X), paste("p_qr", tau.list, sep = "_"))) - quantile.zscore <- matrix(NA, nrow = p, ncol = ltau, dimnames = list(colnames(X), paste("zscore_qr", tau.list, sep = "_"))) - y <- as.matrix(Y) - - if (!is.null(Z)) { - zz <- cbind(rep(1, nrow(y)), Z) - } else { - zz <- matrix(1, nrow = nrow(y), ncol = 1) - } - - ranks_list <- lapply(tau.list, function(tau) { - suppressWarnings(quantreg::rq.fit.br(zz, y, tau = tau)$dual - (1 - tau)) - }) - - for (ip in 1:p) { - x <- as.matrix(X[, ip]) - VN <- matrix(0, nrow = ltau, ncol = ltau) - for (i in 1:ltau) { - for (j in 1:ltau) { - VN[i, j] <- min(tau.list[i], tau.list[j]) - tau.list[i] * tau.list[j] - } - } - - if (!is.null(Z)) { - # xstar = lm(x ~ zz - 1)$residual - xstar <- .lm.fit(zz, x)$residual - } else { - xstar <- x - } - - SN <- NULL - for (itau in 1:ltau) { - Sn <- as.matrix(t(xstar) %*% ranks_list[[itau]]) - SN <- c(SN, Sn) - } - VN2 <- matrix(outer(VN, t(xstar) %*% xstar, "*"), nrow = ltau) - z_score <- SN / sqrt(diag(VN2)) - pvalue1 <- pchisq(SN^2 / diag(VN2), 1, lower.tail = FALSE) - names(pvalue1) <- tau.list - quantile.pvalue[ip, ] <- pvalue1 - quantile.zscore[ip, ] <- z_score - e <- solve(chol(VN2)) - SN2 <- t(e) %*% SN - pvalue <- pchisq(sum(SN2^2), ltau, lower.tail = FALSE) - pvec[ip] <- pvalue - } - - pvec <- apply(quantile.pvalue, 1, pval_cauchy) - - if (screen_method == "fdr") { - adjusted_pvalues <- p.adjust(pvec) - method_col_name <- "fdr_p_qr" - method_quantile_names <- paste0("fdr_p_qr_", tau.list) - quantile_adjusted_pvalues <- apply(quantile.pvalue, 2, p.adjust) - } else if (screen_method == "qvalue") { - adjusted_pvalues <- compute_qvalues(pvec) - method_col_name <- "qvalue_qr" - method_quantile_names <- paste0("qvalue_qr_", tau.list) - quantile_adjusted_pvalues <- apply(quantile.pvalue, 2, compute_qvalues) - } else { - stop("Invalid screen_method. Choose 'fdr' or 'qvalue'.") - } - - # Ensure quantile_adjusted_pvalues is always a matrix (handles single-variant case) - if (!is.matrix(quantile_adjusted_pvalues)) { - quantile_adjusted_pvalues <- matrix(quantile_adjusted_pvalues, nrow = 1, - dimnames = list(colnames(X), names(quantile_adjusted_pvalues))) - } - - sig_SNP_threshold <- which(adjusted_pvalues < screen_threshold) - sig_SNP_top_count <- order(adjusted_pvalues)[1:top_count] - sig_SNP_top_percent <- order(adjusted_pvalues)[1:max(1, round(length(adjusted_pvalues) * top_percent / 100))] - - sig.SNPs_names <- colnames(X)[sig_SNP_threshold] - sig.SNPs_names_top_count <- colnames(X)[sig_SNP_top_count] - sig.SNPs_names_top_percent <- colnames(X)[sig_SNP_top_percent] - phenotype_id <- colnames(y)[1] - - df_result <- data.frame( - phenotype_id = phenotype_id, - variant_id = colnames(X), - p_qr = pvec - ) - - # Add quantile-specific p-values - for (tau in tau.list) { - df_result[[paste0("p_qr_", tau)]] <- quantile.pvalue[, paste0("p_qr_", tau)] - } - - # Add overall q-value - df_result[[method_col_name]] <- adjusted_pvalues - - # Add quantile-specific q-values - for (tau in tau.list) { - df_result[[paste0(method_col_name, "_", tau)]] <- quantile_adjusted_pvalues[, paste0("p_qr_", tau)] - } - - # Add quantile-specific z-scores - for (tau in tau.list) { - df_result[[paste0("zscore_qr_", tau)]] <- quantile.zscore[, paste0("zscore_qr_", tau)] - } - - # Split variant_id and reorder columns - parsed <- parse_variant_id(df_result$variant_id) - df_result <- df_result %>% - mutate(chr = parsed$chrom, pos = parsed$pos, A2 = parsed$A2, A1 = parsed$A1) - - # Define the column order - col_order <- c("chr", "pos", "A2", "A1", "phenotype_id", "variant_id", "p_qr") - col_order <- c(col_order, paste0("p_qr_", tau.list)) - col_order <- c(col_order, method_col_name) - col_order <- c(col_order, paste0(method_col_name, "_", tau.list)) - col_order <- c(col_order, paste0("zscore_qr_", tau.list)) - - # Reorder the columns - df_result <- df_result %>% select(all_of(col_order)) - - return(list( - df_result = df_result, - tau_list = tau.list, - quantile_pvalue = quantile.pvalue, - quantile_zscore = quantile.zscore, - pvec = pvec, - adjusted_pvalues = adjusted_pvalues, - sig_SNP_threshold = sig_SNP_threshold, - sig.SNPs_names = sig.SNPs_names, - sig_SNP_top_count = sig_SNP_top_count, - sig_SNP_top_percent = sig_SNP_top_percent - )) -} - -#' Perform Clumping and Pruning -#' @param X Matrix of genotypes -#' @param qr_results Results from QR_screen -#' @param maf_list List of minor allele frequencies (optional) -#' @param ld_clump_r2 R-squared threshold for initial LD clumping based on pvalue -#' @param final_clump_r2 R-squared threshold for final LD clumping based on MAF -#' @return A list containing final SNPs and clumped SNPs -#' @importFrom bigstatsr FBM.code256 -#' @importFrom bigsnpr snp_clumping -#' @export -multicontext_ld_clumping <- function(X, qr_results, maf_list = NULL, ld_clump_r2 = 0.2, final_clump_r2 = 0.8) { - # Extract significant SNP names - sig_SNPs_names <- qr_results$sig.SNPs_names - - # If no significant SNPs, return empty result - if (length(sig_SNPs_names) == 0) { - return(list(final_SNPs = NULL, clumped_SNPs = NULL, message = "No significant SNPs found")) - } - - # Check if X only contains one SNP (one column) - if (ncol(X) == 1) { - print("Only one SNP in X. Skipping LD clumping.") - final_SNPs <- colnames(X) - return(list(final_SNPs = final_SNPs, clumped_SNPs = final_SNPs, message = "Only one SNP, no LD clumping performed")) - } - - # Extract genotype matrix for significant SNPs - G_all <- X # Only retain columns for significant SNPs - - # Convert the genotype matrix into FBM format - code_vec <- c(0, 1, 2, rep(NA, 256 - 3)) - G_all <- FBM.code256( - nrow = nrow(G_all), - ncol = ncol(G_all), - init = G_all, - code = code_vec - ) - - # Parse SNP names to extract chromosome and position information - parsed_snp_info <- do.call(rbind, strsplit(sig_SNPs_names, ":")) - chr <- as.numeric(strip_chr_prefix(parsed_snp_info[, 1])) - pos <- as.numeric(parsed_snp_info[, 2]) # Extract position - - # Step 1: Perform LD clumping for each tau based on p-values - clumped_snp_list <- list() - - for (itau in seq_along(qr_results$tau_list)) { - tau_name <- paste0("p_qr_", qr_results$tau_list[itau]) - - # Extract p-values for the given quantile - p_values_quantile <- qr_results$quantile_pvalue[, tau_name][qr_results$sig_SNP_threshold] - log_p_values <- -log10(p_values_quantile) # Calculate log10 p-values - - # Perform LD clumping using p-values as S - ind_clumped <- snp_clumping( - G = G_all, - infos.chr = chr, - infos.pos = pos, - S = log_p_values, # Use log10 p-values for each quantile - thr.r2 = ld_clump_r2, - size = 100 / ld_clump_r2 - ) # Window size of 500kb - - # Store clumping results for each quantile - clumped_snp_list[[tau_name]] <- ind_clumped - } - - # Step 2: Take the union of clumping results across all quantiles - clumped_snp_union <- unique(unlist(clumped_snp_list)) # This is the SNP index, not the name - clumped_SNPs_name <- sig_SNPs_names[clumped_snp_union] - print(paste("Number of SNPs after union of clumping:", length(clumped_snp_union))) - - if (length(clumped_snp_union) == 1) { - message("Only one SNP found in the union. Skipping LD pruning and returning the single SNP directly.") - final_SNPs <- sig_SNPs_names[clumped_snp_union] - return(list(final_SNPs = final_SNPs, clumped_SNPs = clumped_SNPs_name)) - } - - - # Step 3: Sort results from union - sorted_indices <- order(chr[clumped_snp_union], pos[clumped_snp_union]) - chr_sorted <- chr[clumped_snp_union][sorted_indices] - pos_sorted <- pos[clumped_snp_union][sorted_indices] - - # Step 4: Initialize maf_values to NULL, and update only if maf_list is provided - maf_values <- NULL - if (!is.null(maf_list)) { - maf_values <- maf_list[qr_results$sig_SNP_threshold][clumped_snp_union][sorted_indices] - } - G_union <- X[, clumped_snp_union, drop = FALSE][, sorted_indices] - if (!inherits(G_union, "FBM")) { - G_union <- FBM.code256( - nrow = nrow(G_union), - ncol = ncol(G_union), - init = G_union, - code = code_vec - ) - } - # Step 5: Perform final clumping using maf_values (if available), otherwise proceed with NULL S - final_clumped <- snp_clumping( - G = G_union, - infos.chr = chr_sorted, - infos.pos = pos_sorted, - S = maf_values, # Use MAF values if provided, otherwise NULL - thr.r2 = final_clump_r2, - size = 100 / final_clump_r2 - ) # Final clumping - - # Restrict the genotype matrix to only the final clumped SNPs - G_final_clumped <- G_union[, final_clumped, drop = FALSE] # Limit G to the final clumped SNPs - # Get the final SNP names - final_SNPs <- sig_SNPs_names[clumped_snp_union][sorted_indices][final_clumped] - print(paste("Number of final SNPs after MAF-based clumping (if applied):", length(final_SNPs))) - - return(list(final_SNPs = final_SNPs, clumped_SNPs = clumped_SNPs_name)) -} - -#' Perform Quantile Regression Analysis to get beta -#' @param X Matrix of predictors -#' @param Y Matrix or vector of response variables -#' @param Z Matrix of covariates (optional) -#' @param tau_values Vector of quantiles to be analyzed -#' @return A data frame with QR coefficients for each quantile -#' @importFrom tidyr pivot_wider separate -#' @importFrom dplyr %>% mutate select -#' @export -perform_qr_analysis <- function(X, Y, Z = NULL, tau_values = seq(0.05, 0.95, by = 0.05)) { - # Make sure quantreg is installed - if (!requireNamespace("quantreg", quietly = TRUE)) { - stop("To use this function, please install quantreg: https://cran.r-project.org/web/packages/quantreg/index.html") - } - # Convert Y and X to matrices if they aren't already - pheno.mat <- as.matrix(Y) - geno.mat <- as.matrix(X) - - # Initialize an empty result table to store results - result_table <- data.frame( - phenotype_id = character(), - variant_id = character(), - tau = numeric(), - predictor_coef = numeric(), - stringsAsFactors = FALSE - ) - - # Loop over each tau value to perform quantile regression - for (tau in tau_values) { - # Loop over each SNP/variant in geno.mat (X) - for (n in 1:ncol(geno.mat)) { - response <- pheno.mat # Y - predictor <- geno.mat[, n] # X - phenotype_id <- colnames(pheno.mat) - variant_id <- colnames(geno.mat)[n] - - # Construct the design matrix based on whether Z is provided - if (is.null(Z)) { - # If no covariates, include intercept and predictor - X_design <- cbind(1, predictor) - } else { - # If covariates are provided, include them in the design matrix - X_design <- cbind(1, predictor, as.matrix(Z)) - } - - # Fit the quantile regression model using rq.fit.br - mod <- suppressWarnings(quantreg::rq.fit.br(X_design, response, tau = tau)) - - # Extract the coefficient for the predictor (second coefficient) - predictor_coef <- mod$coefficients[2] # Coefficient for predictor - - # Create a row with the results and append to the result table - row <- data.frame( - phenotype_id = phenotype_id, - variant_id = variant_id, - tau = tau, - predictor_coef = predictor_coef, - stringsAsFactors = FALSE - ) - result_table <- rbind(result_table, row) - } - } - - # Reshape result_table to a wide format, so each tau's results are in separate columns - result_table_wide <- result_table %>% - pivot_wider( - id_cols = c(phenotype_id, variant_id), - names_from = tau, - values_from = predictor_coef, - names_prefix = "coef_qr_" - ) - parsed_ids <- parse_variant_id(result_table_wide$variant_id) - result_table_wide <- result_table_wide %>% - mutate(chr = parsed_ids$chrom, pos = parsed_ids$pos, A2 = parsed_ids$A2, A1 = parsed_ids$A1) %>% - select(chr, pos, A2, A1, everything()) - - # Return the wide format result table - return(result_table_wide) -} - -#' Filter Highly Correlated SNPs -#' @param X Matrix of genotypes -#' @param cor_thres Correlation threshold for filtering -#' @return A list containing filtered X matrix and filter IDs -corr_filter <- function(X, cor_thres = 0.8) { - p <- ncol(X) - - # Use Rfast for faster correlation calculation if available - if (requireNamespace("Rfast", quietly = TRUE)) { - cor.X <- Rfast::cora(X, large = TRUE) - } else { - cor.X <- cor(X) - } - Sigma.distance <- as.dist(1 - abs(cor.X)) - fit <- hclust(Sigma.distance, method = "single") - clusters <- cutree(fit, h = 1 - cor_thres) - groups <- unique(clusters) - ind.delete <- NULL - X.new <- X - filter.id <- c(1:p) - for (ig in seq_along(groups)) { - temp.group <- which(clusters == groups[ig]) - if (length(temp.group) > 1) { - ind.delete <- c(ind.delete, temp.group[-1]) - } - } - ind.delete <- unique(ind.delete) - if ((length(ind.delete)) > 0) { - X.new <- as.matrix(X[, -ind.delete]) - filter.id <- filter.id[-ind.delete] - } - - # Check if X.new has only one column and ensure column names are preserved - if (ncol(X.new) == 1) { - colnames(X.new) <- colnames(X)[-ind.delete] - } - - return(list(X.new = X.new, filter.id = filter.id)) -} - - - -#' Check and Remove Problematic Columns to Ensure Full Rank -#' -#' This function checks for problematic columns in the design matrix that cause it to be not full rank, -#' and iteratively removes them based on the chosen strategy until the matrix is full rank. -#' -#' @param X Matrix of SNPs -#' @param C Matrix of covariates (unnamed) -#' @param strategy The strategy for removing problematic columns ("variance", "correlation", or "response_correlation") -#' @param response Optional response vector for "response_correlation" strategy -#' @param max_iterations Maximum number of iterations to attempt removing problematic columns -#' @return Cleaned matrix X with problematic columns removed -#' @noRd -check_remove_highcorr_snp <- function(X = X, C = C, strategy = c("correlation", "variance", "response_correlation"), response = NULL, max_iterations = 300, corr_thresholds = seq(0.75, 0.5, by = -0.05)) { - strategy <- match.arg(strategy) - original_colnames <- colnames(X) - initial_ncol <- ncol(X) # Store the initial number of columns in X - iteration <- 0 - # Combine the design matrix with X (SNPs) and C (covariates), keeping C without column names - X_design <- cbind(1, X, C) # Add an intercept column (1) - colnames_X_design <- c("Intercept", colnames(X)) # Assign column names only to X (SNPs) part - - # Assign column names only to the X part, leaving C without names - colnames(X_design)[1:(length(colnames_X_design))] <- colnames_X_design - - # Check the initial rank of the design matrix - matrix_rank <- qr(X_design)$rank - message("Initial rank of the design matrix: ", matrix_rank, " / ", ncol(X_design), " columns.") - - # Skip remove_highcorr_snp if removing all problematic columns doesn't achieve full rank - skip_remove_highcorr <- FALSE - - # First check: Try removing all problematic columns at once - if (matrix_rank < ncol(X_design)) { - message("Design matrix is not full rank, identifying all problematic columns...") - - # QR decomposition to identify linearly dependent columns - qr_decomp <- qr(X_design) - R <- qr_decomp$rank - Q <- qr_decomp$pivot - - # Get all problematic columns - problematic_cols <- Q[(R + 1):ncol(X_design)] - problematic_colnames <- colnames(X_design)[problematic_cols] - problematic_colnames <- problematic_colnames[problematic_colnames %in% colnames(X)] - - if (length(problematic_colnames) > 0) { - message("Attempting to remove all problematic columns at once: ", paste(problematic_colnames, collapse = ", ")) - - # Remove all problematic columns at once - X_temp <- X[, !(colnames(X) %in% problematic_colnames), drop = FALSE] - X_design_temp <- cbind(1, X_temp, C) - colnames_X_design_temp <- c("Intercept", colnames(X_temp)) - colnames(X_design_temp)[1:length(colnames_X_design_temp)] <- colnames_X_design_temp - - # Check if removing all problematic columns achieves full rank - matrix_rank_temp <- qr(X_design_temp)$rank - - if (matrix_rank_temp == ncol(X_design_temp)) { - message("Achieved full rank by removing all problematic columns at once. Proceeding with original logic...") - } else { - message("Removing all problematic columns did not achieve full rank. Skipping to corr_filter...") - skip_remove_highcorr <- TRUE - } - } - } - - # Only proceed with remove_highcorr_snp if not skipping - if (!skip_remove_highcorr) { - while (matrix_rank < ncol(X_design) && iteration < max_iterations) { - message("Design matrix is not full rank, identifying problematic columns...") - - qr_decomp <- qr(X_design) - R <- qr_decomp$rank - Q <- qr_decomp$pivot - problematic_cols <- Q[(R + 1):ncol(X_design)] - problematic_colnames <- colnames(X_design)[problematic_cols] - problematic_colnames <- problematic_colnames[problematic_colnames %in% colnames(X)] - - if (length(problematic_colnames) == 0) { - message("No more problematic SNP columns found in X. Breaking the loop.") - break - } - - message("Problematic SNP columns identified: ", paste(problematic_colnames, collapse = ", ")) - X <- remove_highcorr_snp(X, problematic_colnames, strategy = strategy, response = response) - - X_design <- cbind(1, X, C) - colnames_X_design <- c("Intercept", colnames(X)) - colnames(X_design)[1:length(colnames_X_design)] <- colnames_X_design - - matrix_rank <- qr(X_design)$rank - message("New rank of the design matrix: ", matrix_rank, " / ", ncol(X_design), " columns.") - iteration <- iteration + 1 - } - - if (iteration == max_iterations) { - warning("Maximum iterations reached. The design matrix may still not be full rank.") - } - } - - # Final check and corr_filter if needed - matrix_rank <- qr(cbind(1, X, C))$rank - if (matrix_rank < ncol(cbind(1, X, C))) { - message("Applying corr_filter to ensure the design matrix is full rank...") - for (threshold in corr_thresholds) { - filter_result <- corr_filter(X, cor_thres = threshold) - X <- filter_result$X.new - X_design <- cbind(1, X, C) - colnames_X_design <- c("Intercept", colnames(X)) - colnames(X_design)[1:length(colnames_X_design)] <- colnames_X_design - - matrix_rank <- qr(X_design)$rank - message("Rank after corr_filter with threshold ", threshold, ": ", matrix_rank, " / ", ncol(X_design), " columns.") - if (matrix_rank == ncol(X_design)) { - break - } - } - } - - - if (iteration == max_iterations) { - warning("Maximum iterations reached. The design matrix may still not be full rank.") - } - - if (ncol(X) == 1 && initial_ncol == 1) { - colnames(X) <- original_colnames - } - return(X) -} - -#' Remove Problematic Columns Based on a Given Strategy -#' -#' This function removes problematic columns from a matrix based on different strategies, such as smallest variance, -#' highest correlation, or lowest correlation with the response variable. -#' -#' @param X Matrix of SNPs -#' @param problematic_cols A vector of problematic columns to be removed -#' @param strategy The strategy for removing problematic columns ("variance", "correlation", or "response_correlation") -#' @param response Optional response vector for "response_correlation" strategy -#' @return Cleaned matrix X with the selected column removed -#' @importFrom stats var cor -#' @noRd -remove_highcorr_snp <- function(X, problematic_cols, strategy = c("correlation", "variance", "response_correlation"), response = NULL) { - # Set default strategy - strategy <- match.arg(strategy) - - message("Identified problematic columns: ", paste(problematic_cols, collapse = ", ")) - - if (length(problematic_cols) == 0) { - return(X) # If there are no problematic columns, return as is - } - - if (length(problematic_cols) == 1) { - message("Only one problematic column: ", problematic_cols) - col_to_remove <- problematic_cols[1] - message("Removing column: ", col_to_remove) - X <- X[, !(colnames(X) %in% col_to_remove), drop = FALSE] - # If X only has one column left after removal, ensure its column name is preserved - if (ncol(X) == 1) { - colnames(X) <- colnames(X)[colnames(X) != col_to_remove] # Preserve remaining SNP name - } - return(X) - } - - # Choose columns to remove based on the strategy - if (strategy == "variance") { - # Strategy 1: Remove the column with the smallest variance - variances <- apply(X[, problematic_cols, drop = FALSE], 2, var) - col_to_remove <- problematic_cols[which.min(variances)] - message("Removing column with the smallest variance: ", col_to_remove) - } else if (strategy == "correlation") { - # Strategy 2: Remove the column with the highest sum of absolute correlations - cor_matrix <- abs(cor(X[, problematic_cols, drop = FALSE])) # Calculate absolute correlation matrix - diag(cor_matrix) <- 0 # Ignore the diagonal (self-correlation) - - if (length(problematic_cols) == 2) { - # If there are only two problematic columns, randomly remove one - col_to_remove <- sample(problematic_cols, 1) - message("Only two problematic columns, randomly removing: ", col_to_remove) - } else { - # Calculate sum of absolute correlations for each column - cor_sums <- colSums(cor_matrix) - col_to_remove <- problematic_cols[which.max(cor_sums)] # Remove the column with the largest sum of correlations - message("Removing column with highest sum of absolute correlations: ", col_to_remove) - } - } else if (strategy == "response_correlation" && !is.null(response)) { - # Strategy 3: Remove the column with the lowest correlation with the response variable - # FIXME: This strategy is potentially biased based on corr of response and variants - cor_with_response <- apply(X[, problematic_cols, drop = FALSE], 2, function(col) cor(col, response)) - col_to_remove <- problematic_cols[which.min(abs(cor_with_response))] - message("Removing column with lowest correlation with the response: ", col_to_remove) - } else { - stop("Invalid strategy or missing response variable for 'response_correlation' strategy.") - } - - # Remove the selected column from X - X <- X[, !(colnames(X) %in% col_to_remove), drop = FALSE] - if (ncol(X) == 1) { - colnames(X) <- colnames(X)[colnames(X) != col_to_remove] # Preserve remaining SNP name - } - return(X) -} - -#' Calculate QR Coefficients and Pseudo R-squared Across Multiple Quantiles -#' -#' This function calculates quantile regression coefficients and pseudo R-squared values across multiple quantiles, -#' while handling problematic columns that might affect the rank of the design matrix. -#' -#' @param AssocData List containing X, Y, C, and X.filter -#' @param tau.list Vector of quantiles to be analyzed -#' @param strategy The strategy for removing problematic columns ("variance", "correlation", or "response_correlation") -#' @return A list containing the cleaned X matrix, beta matrix as twas weight, and pseudo R-squared values -#' @noRd -calculate_qr_and_pseudo_R2 <- function(AssocData, tau.list, strategy = c("correlation", "variance", "response_correlation"), - corr_thresholds = seq(0.75, 0.5, by = -0.05)) { - # Make sure quantreg is installed - if (!requireNamespace("quantreg", quietly = TRUE)) { - stop("To use this function, please install quantreg: https://cran.r-project.org/web/packages/quantreg/index.html") - } - strategy <- match.arg(strategy) - # Check and handle problematic columns affecting the full rank of the design matrix - AssocData$X.filter <- check_remove_highcorr_snp(X = AssocData$X.filter, C = AssocData$C, strategy = strategy, response = AssocData$Y, corr_thresholds = corr_thresholds) - snp_names <- colnames(AssocData$X.filter) - # Build the cleaned design matrix using the filtered X and unnamed C - - # Fit the models for all tau values - message("Start fitting full model for all taus...") - fit_full <- suppressWarnings(quantreg::rq(Y ~ X.filter + C, tau = tau.list, data = AssocData)) - message("Finished fitting full model. Start fitting intercept-only model for all taus...") - fit_intercept <- suppressWarnings(quantreg::rq(AssocData$Y ~ 1, tau = tau.list, data = AssocData)) - message("Finished fitting intercept-only model.") - # Define the rho function for pseudo R2 calculation - rho <- function(u, tau) { - u * (tau - (u < 0)) - } - - # Prepare to store the pseudo R2 results - pseudo_R2 <- numeric(length(tau.list)) - names(pseudo_R2) <- tau.list - - # Calculate pseudo R2 for each tau - for (i in seq_along(tau.list)) { - tau <- tau.list[i] - - # Get residuals for the intercept-only and full models - residuals0 <- residuals(fit_intercept, subset = i) - residuals1 <- residuals(fit_full, subset = i) - - # Calculate and store pseudo R2 for each tau - rho0 <- sum(rho(residuals0, tau)) - rho1 <- sum(rho(residuals1, tau)) - pseudo_R2[i] <- 1 - rho1 / rho0 - } - - # Extract the coefficients for the SNPs - num_filter_vars <- ncol(AssocData$X.filter) - beta_mat <- coef(fit_full)[2:(1 + num_filter_vars), , drop = FALSE] - rownames_beta <- rownames(beta_mat) - if (ncol(AssocData$X.filter) == 1) { - rownames(beta_mat) <- snp_names - } else { - rownames_beta <- rownames(beta_mat) - rownames(beta_mat) <- gsub("^X.filter", "", rownames_beta) - } - return(list(X.filter = AssocData$X.filter, beta_mat = beta_mat, pseudo_R2 = pseudo_R2)) -} - -#' Calculate Heterogeneity of Beta Coefficients Across Quantiles -#' -#' This function calculates the heterogeneity of beta coefficients across multiple quantiles for each variant_id. -#' Heterogeneity is computed as log(sd(beta) / abs(mean(beta))). -#' -#' @param rq_coef_result Data frame containing variant_id and QR coefficient columns -#' @return A data frame with variant_id and heterogeneity values -#' @noRd -calculate_coef_heterogeneity <- function(rq_coef_result) { - # Identify all the columns starting with "coef_qr_" (quantile regression coefficient columns) - coef_cols <- grep("^coef_qr_", colnames(rq_coef_result), value = TRUE) - - # Create a new data frame with variant_id and heterogeneity - heterogeneity_result <- data.frame( - variant_id = rq_coef_result$variant_id, - coef_heter = apply(rq_coef_result[, coef_cols], 1, function(beta) { - # Compute the mean and standard deviation, ignoring NAs - beta_mean <- mean(beta, na.rm = TRUE) - beta_sd <- sd(beta, na.rm = TRUE) - - # Handle the case where mean(beta) is 0 to avoid division by zero - if (abs(beta_mean) == 0) { - return(NA) # Return NA if mean is zero - } - - # Compute the heterogeneity: log(sd(beta) / abs(mean(beta))) - heterogeneity <- log(beta_sd / abs(beta_mean)) - return(heterogeneity) - }), - stringsAsFactors = FALSE - ) - - # Return only variant_id and heterogeneity - return(heterogeneity_result) -} - -#' Calculate Xi Correlation for QR Coefficients -#' -#' This function calculates the xi correlation coefficient and p-value for each variant, -#' measuring the functional dependence between tau values and QR coefficients. -#' Uses coef_qr_0.1 to coef_qr_0.9 (17 values, excluding 0.05 and 0.95). -#' -#' @param rq_coef_result Data frame containing variant_id and coef_qr_* columns -#' @param tau_range Numeric vector of tau values to use (default: seq(0.1, 0.9, by = 0.05)) -#' @param min_valid Minimum number of valid (non-NA) coefficients required (default: 10) -#' @return A data frame with variant_id, xi, and xi_pval columns -#' @export -calculate_xi_correlation <- function(rq_coef_result, tau_range = seq(0.1, 0.9, by = 0.05), min_valid = 10) { - if (!requireNamespace("XICOR", quietly = TRUE)) { - stop("Package 'XICOR' is required for xi correlation calculation. Please install it.") - } - # Build column names for the specified tau range - coef_col_names <- paste0("coef_qr_", tau_range) - - # Check which columns exist - existing_cols <- coef_col_names[coef_col_names %in% colnames(rq_coef_result)] - - if (length(existing_cols) == 0) { - warning("No coef_qr columns found in the specified tau range") - return(data.frame( - variant_id = rq_coef_result$variant_id, - xi = NA_real_, - xi_pval = NA_real_, - stringsAsFactors = FALSE - )) - } - - # Extract tau values from existing columns - existing_tau <- as.numeric(gsub("coef_qr_", "", existing_cols)) - - # Calculate xi for each variant - xi_results <- apply(rq_coef_result[, existing_cols, drop = FALSE], 1, function(coef_values) { - # Get valid (non-NA) coefficients - valid_indices <- !is.na(coef_values) - valid_coefs <- as.numeric(coef_values[valid_indices]) - valid_tau <- existing_tau[valid_indices] - - # Check if enough valid values - if (length(valid_coefs) < min_valid) { - return(c(xi = NA_real_, xi_pval = NA_real_)) - } - - # Calculate xi correlation - tryCatch({ - xicor_result <- XICOR::xicor(valid_tau, y = valid_coefs, pvalue = TRUE, method = "asymptotic") - return(c(xi = xicor_result$xi, xi_pval = xicor_result$pval)) - }, error = function(e) { - return(c(xi = NA_real_, xi_pval = NA_real_)) - }) - }) - - # Convert to data frame - xi_df <- data.frame( - variant_id = rq_coef_result$variant_id, - xi = xi_results["xi", ], - xi_pval = xi_results["xi_pval", ], - stringsAsFactors = FALSE - ) - - return(xi_df) -} - -#' Quantile TWAS Weight Pipeline -#' -#' @param X Matrix of genotypes -#' @param Y Matrix or vector of phenotypes -#' @param Z Matrix of covariates (optional) -#' @param maf Vector of minor allele frequencies (optional) -#' @param region_id Name of the region being analyzed -#' @param quantile_qtl_tau_list Vector of quantiles for QTL analysis -#' @param quantile_twas_tau_list Vector of quantiles for TWAS analysis -#' -#' -#' @return A list containing various results from the TWAS weight pipeline: -#' \itemize{ -#' \item qr_screen_pvalue_df: Data frame with QR screening results: pavlue, qvalue and zscore. -#' \item message: Any informational or warning messages. -#' \item twas_variant_names: Names of variants used in TWAS weight calculation. -#' \item rq_coef_df: Data frame with quantile regression coefficients. -#' \item twas_weight: Matrix of TWAS weights. -#' \item pseudo_R2: Vector of pseudo R-squared values. -#' \item quantile_twas_prediction: Matrix of TWAS predictions. -#' } -#' -#' @details -#' The function performs the following steps: -#' 1. QR screening to identify significant SNPs. -#' 2. Filtering of highly correlated SNPs. -#' 3. LD clumping and pruning(use filtered SNPs from step 1). -#' 4. Calculation of QR coefficients for selected SNPs(use filtered SNPs from step 3). -#' 5. Calculation of TWAS weights and pseudo R-squared values(use filtered SNPs from step 2). -#' -#' @examples -#' # Example usage: -#' # X <- matrix of genotypes -#' # Y <- vector of phenotypes -#' # Z <- matrix of covariates -#' # results <- quantile_twas_weight_pipeline(X, Y, Z, region_id = "GeneA") -#' -#' @export -quantile_twas_weight_pipeline <- function(X, Y, Z = NULL, maf = NULL, region_id = "", - ld_reference_meta_file = NULL, twas_maf_cutoff = 0.01, - ld_clumping = FALSE, ld_pruning = FALSE, - screen_significant = TRUE, - quantile_qtl_tau_list = seq(0.05, 0.95, by = 0.05), - quantile_twas_tau_list = seq(0.01, 0.99, by = 0.01), - screen_method = "qvalue", - screen_threshold = 0.05, - xi_tau_range = seq(0.1, 0.9, by = 0.05), - keep_variants = NULL, - marginal_beta_calculate = TRUE, - twas_weight_calculate = TRUE, - qrank_screen_calculate = TRUE, - vqtl_calculate = TRUE, - pre_filter_by_pqr = FALSE, - initial_corr_filter_cutoff = 0.8, - full_rank_corr_filter_cutoff = seq(0.75, 0.5, by = -0.05)) { - # Initialize results list - results <- list() - - # Step 1: vQTL (optional) - if (vqtl_calculate) { - # Step 1-1: Calculate vQTL rank scores - message("Step 0: Calculating vQTL rank scores for region ", region_id) - num_tau_levels <- length(quantile_qtl_tau_list) # Convert tau.list to numeric count - rank_score <- QUAIL_rank_score_pipeline( - phenotype = Y, - covariates = Z, - num_tau_levels = num_tau_levels, - method = "equal", - num_cores = 1 - ) - message("vQTL rank scores calculated.") - - # Step 1-2: Run vQTL pipeline - message("Step 0.5: Running vQTL analysis for rank scores in region ", region_id) - vqtl_results <- QUAIL_pipeline( - genotype = X, - rank_score = rank_score, - covariates = Z, - phenotype_id = colnames(Y)[1] - ) - message("vQTL analysis completed.") - results$vqtl_results <- vqtl_results - } else { - message("Skipping vQTL calculation.") - } - - if (qrank_screen_calculate) { - # Step 2: QR screen - message("Starting QR screen for region ", region_id) - p.screen <- qr_screen(X = X, Y = Y, Z = Z, tau.list = quantile_qtl_tau_list, screen_threshold = screen_threshold, screen_method = screen_method, top_count = 10, top_percent = 15) - message(paste0("Number of SNPs after QR screening: ", length(p.screen$sig_SNP_threshold))) - message("QR screen completed. Screening significant SNPs") - results$qr_screen_pvalue_df <- p.screen$df_result - - if (screen_significant && length(p.screen$sig_SNP_threshold) == 0) { - results$message <- paste0("No significant SNPs detected in region ", region_id) - return(results) - } - - if (screen_significant) { - X_filtered <- X[, p.screen$sig_SNP_threshold, drop = FALSE] - } else { - X_filtered <- X - } - - # # Step 3: Optional LD clumping and pruning from results of QR_screen (using original QR screen results) - if (ld_clumping) { - message("Performing LD clumping and pruning from QR screen results...") - LD_SNPs <- multicontext_ld_clumping(X = X[, p.screen$sig_SNP_threshold, drop = FALSE], qr_results = p.screen, maf_list = NULL) - selected_snps <- if (ld_pruning) LD_SNPs$final_SNPs else LD_SNPs$clumped_SNPs - x_clumped <- X[, p.screen$sig_SNP_threshold, drop = FALSE][, selected_snps, drop = FALSE] - } else { - message("Skipping LD clumping.") - } - - } else { - message("Skipping QR screen.") - } - - # Determine whether to skip marginal beta calculation: - # - skip if marginal_beta_calculate = FALSE - # - skip if keep_variants is provided but empty (length 0) - # - skip if qrank_screen_calculate = FALSE and keep_variants is NULL (no variants to select) - skip_marginal_beta <- !marginal_beta_calculate || - (!is.null(keep_variants) && length(keep_variants) == 0) || - (!qrank_screen_calculate && is.null(keep_variants)) - - if (!skip_marginal_beta) { - # Step 4: Fit marginal QR to get beta with SNPs for quantile_qtl_tau_list values - message("Fitting marginal QR for selected SNPs...") - if (qrank_screen_calculate) { - X_for_qr <- if (ld_clumping) x_clumped else X_filtered - if (!is.null(keep_variants)) { - variants_to_keep <- intersect(keep_variants, colnames(X_for_qr)) - if (length(variants_to_keep) > 0) { - X_for_qr <- X_for_qr[, variants_to_keep, drop = FALSE] - message("Filtered to ", ncol(X_for_qr), " variants from keep_variants list for QR analysis") - } else { - message("Warning: No variants from keep_variants found in selected SNPs, using all selected SNPs") - } - } - } else { - # qrank_screen_calculate = FALSE but keep_variants provided - variants_to_keep <- intersect(keep_variants, colnames(X)) - if (length(variants_to_keep) > 0) { - X_for_qr <- X[, variants_to_keep, drop = FALSE] - message("Using ", ncol(X_for_qr), " variants from keep_variants list for QR analysis (QR screen skipped)") - } else { - message("Warning: No variants from keep_variants found in X, skipping marginal beta calculation") - skip_marginal_beta <- TRUE - } - } - } - - if (!skip_marginal_beta) { - rq_coef_result <- perform_qr_analysis(X = X_for_qr, Y = Y, Z = Z, tau_values = quantile_qtl_tau_list) - - # Step 5: Heterogeneity calculation - # Step 5-1: beta_heterogeneity index in marginal model - message("Marginal QR for selected SNPs completed. Calculating beta heterogeneity...") - beta_heterogeneity <- calculate_coef_heterogeneity(rq_coef_result) - message("Beta heterogeneity calculation completed.") - - # Step 5-2: Calculate xi correlation (Chatterjee correlation test) - message("Calculating xi correlation for QR coefficients...") - xi_correlation <- calculate_xi_correlation(rq_coef_result, tau_range = xi_tau_range, min_valid = 10) - message("Xi correlation calculation completed.") - - # Merge xi and xi_pval into rq_coef_result (using left_join to preserve row order) - rq_coef_result <- rq_coef_result %>% - dplyr::left_join(xi_correlation, by = "variant_id") - - results$rq_coef_df <- rq_coef_result - results$beta_heterogeneity <- beta_heterogeneity - results$xi_correlation <- xi_correlation - } else { - message("Skipping marginal beta calculation and heterogeneity analysis.") - } - - if (twas_weight_calculate && qrank_screen_calculate) { - # Step 6: Optional LD panel filtering and MAF filtering from results of QR_screen - if (!is.null(ld_reference_meta_file)) { - message("Starting LD panel filtering...") - ld_result <- tryCatch( - { - variants_kept <- filter_variants_by_ld_reference(colnames(X_filtered), ld_reference_meta_file) - if (length(variants_kept$data) == 0) NULL else variants_kept - }, - error = function(e) { - message("Error in LD filtering for region ", region_id, ": ", e$message) - NULL - } - ) - - if (is.null(ld_result)) { - results$message <- paste0("No SNPs left or error in LD filtering in region ", region_id) - return(results) - } - - X_filtered <- X_filtered[, ld_result$data, drop = FALSE] - message(paste0("Number of SNPs after LD filtering: ", ncol(X_filtered))) - - # MAF filtering - if (!is.null(maf)) { - maf_filtered <- maf[colnames(X_filtered)] > twas_maf_cutoff - X_filtered <- X_filtered[, maf_filtered, drop = FALSE] - - # Check if any SNPs are left after MAF filtering - if (ncol(X_filtered) == 0) { - results$message <- paste0("No SNPs left after MAF filtering in region ", region_id) - return(results) - } - - message(paste0("Number of SNPs after MAF filtering: ", ncol(X_filtered))) - } - } - - # Step 7: Optionally pre-filter variants by raw p_qr - if (pre_filter_by_pqr) { - pqr_attempts <- list( - list(pval = 0.05, full_rank_corr = full_rank_corr_filter_cutoff), - list(pval = 0.01, full_rank_corr = full_rank_corr_filter_cutoff) - ) - } else { - message("Skipping p_qr pre-filtering, using X_filtered directly...") - pqr_attempts <- list( - list(pval = NULL, full_rank_corr = full_rank_corr_filter_cutoff) - ) - } - - qr_beta_R2_results <- NULL - for (attempt in seq_along(pqr_attempts)) { - pqr_cutoff <- pqr_attempts[[attempt]]$pval - full_rank_corr <- pqr_attempts[[attempt]]$full_rank_corr - - # Pre-filter by p_qr if enabled - if (!is.null(pqr_cutoff)) { - raw_pvals <- p.screen$pvec[colnames(X_filtered)] - sig_raw <- which(raw_pvals < pqr_cutoff) - if (length(sig_raw) == 0) { - message("No variants with raw p_qr < ", pqr_cutoff, " in region ", region_id) - next - } - if (length(sig_raw) < ncol(X_filtered)) { - message("Pre-filtering variants by raw p_qr < ", pqr_cutoff, ": keeping ", length(sig_raw), " out of ", ncol(X_filtered), " variants") - X_for_corr <- X_filtered[, sig_raw, drop = FALSE] - } else { - X_for_corr <- X_filtered - } - } else { - X_for_corr <- X_filtered - } - - # Step 8: Initial filter of highly correlated SNPs - message("Filtering highly correlated SNPs...") - if (ncol(X_for_corr) > 1) { - filtered <- corr_filter(X_for_corr, initial_corr_filter_cutoff) - X.filter <- filtered$X.new - } else { - X.filter <- X_for_corr - } - - # Step 9: Fit QR and get twas weight and R2 for all taus - message("Fitting full QR to calculate TWAS weights and pseudo R-squared values...") - AssocData <- list(X = X, Y = Y, C = Z, X.filter = X.filter) - qr_beta_R2_results <- tryCatch( - { - calculate_qr_and_pseudo_R2(AssocData, quantile_twas_tau_list, corr_thresholds = full_rank_corr) - }, - error = function(e) { - message("Attempt ", attempt, " failed: ", e$message) - NULL - } - ) - - if (!is.null(qr_beta_R2_results)) break - } - - if (is.null(qr_beta_R2_results)) { - results$message <- paste0("Failed to fit QR model after all attempts in region ", region_id) - return(results) - } - - X.filter <- qr_beta_R2_results$X.filter - message("TWAS weights and pseudo R-squared calculations completed.") - - # Add additional results - results$twas_variant_names <- colnames(X.filter) - results$twas_weight <- qr_beta_R2_results$beta_mat - results$pseudo_R2 <- qr_beta_R2_results$pseudo_R2 - results$quantile_twas_prediction <- X.filter %*% results$twas_weight - } else { - message("Skipping TWAS weight calculation.") - } - - return(results) -} diff --git a/R/twas.R b/R/twas.R index 267532e4..db7f2072 100644 --- a/R/twas.R +++ b/R/twas.R @@ -288,13 +288,12 @@ twas_pipeline <- function(twas_weights_data, rsq_pval_option = c("pval", "adj_rsq_pval"), mr_pval_cutoff = 0.05, mr_coverage_column = "cs_coverage_0.95", - quantile_twas = FALSE, output_twas_data = FALSE, event_filters=NULL, column_file_path = NULL, comment_string="#") { # internal function to format TWAS output - format_twas_data <- function(post_qc_twas_data, twas_table, quantile_twas = FALSE) { + format_twas_data <- function(post_qc_twas_data, twas_table) { weights_list <- do.call(c, lapply(names(post_qc_twas_data), function(molecular_id) { contexts <- names(post_qc_twas_data[[molecular_id]][["weights_qced"]]) chrom <- post_qc_twas_data[[molecular_id]][["chrom"]] @@ -317,125 +316,7 @@ twas_pipeline <- function(twas_weights_data, postqc_scaled_weight <- list() gwas_studies <- names(post_qc_twas_data[[molecular_id]][["weights_qced"]][[context]]) # context-level gwas-studies - # quantile TWAS - if (quantile_twas && (is.null(model_selected) || is.na(model_selected))) { - # For quantile TWAS: extract all available methods from weight matrix columns - if (length(gwas_studies) > 0) { - # Get the first weight matrix to examine available columns - sample_weight_matrix <- post_qc_twas_data[[molecular_id]][["weights_qced"]][[context]][[gwas_studies[1]]][["scaled_weights"]] - - # Extract method names from column names - all_columns <- colnames(sample_weight_matrix) - - # Try to identify methods by removing "_weights" suffix - potential_methods <- unique(gsub("_weights$", "", all_columns)) - - # Filter out columns that are exactly the same as original (no "_weights" suffix) - methods_with_suffix <- potential_methods[paste0(potential_methods, "_weights") %in% all_columns] - - # If no standard method columns found, use all columns as individual methods - if (length(methods_with_suffix) == 0) { - available_methods <- all_columns - use_direct_columns <- TRUE - } else { - available_methods <- methods_with_suffix - use_direct_columns <- FALSE - } - - # Process each method for quantile TWAS - for (method in available_methods) { - for (study in gwas_studies) { - weight_matrix <- post_qc_twas_data[[molecular_id]][["weights_qced"]][[context]][[study]][["scaled_weights"]] - - # Determine which column to use - if (use_direct_columns) { - # Use the column name directly - selected_col <- method - } else { - # Look for method_weights format first, then fallback - weight_col_candidates <- c( - paste0(method, "_weights"), - method, - "weight" - ) - - selected_col <- NULL - for (col_candidate in weight_col_candidates) { - if (!is.null(col_candidate) && col_candidate %in% colnames(weight_matrix)) { - selected_col <- col_candidate - break - } - } - } - - if (!is.null(selected_col) && selected_col %in% colnames(weight_matrix)) { - postqc_scaled_weight[[study]] <- weight_matrix[, selected_col, drop = FALSE] - colnames(postqc_scaled_weight[[study]]) <- "weight" - # variant IDs are in canonical chr-prefix format from allele_qc - context_variants <- rownames(weight_matrix) - context_range <- as.integer(sapply(context_variants, function(variant) { - parts <- strsplit(variant, ":")[[1]] - if (length(parts) >= 2) as.integer(parts[2]) else NA - })) - context_range <- context_range[!is.na(context_range)] - - if (length(context_range) > 0) { - # Get quantile information from twas_table for this method - quantile_info <- twas_table[twas_table$molecular_id == molecular_id & - twas_table$context == context & - twas_table$method == method, ] - - # Extract quantile_start and quantile_end if available - if (nrow(quantile_info) > 0) { - quantile_start <- if ("quantile_start" %in% colnames(quantile_info)) quantile_info$quantile_start[1] else NA - quantile_end <- if ("quantile_end" %in% colnames(quantile_info)) quantile_info$quantile_end[1] else NA - } else { - quantile_start <- NA - quantile_end <- NA - } - - # Create quantile_range string - quantile_range <- if (!is.na(quantile_start) && !is.na(quantile_end)) { - paste0(quantile_start, "-", quantile_end) - } else { - NA - } - - # Create unique weight ID for each method in quantile TWAS - handle NA data_type - safe_data_type <- if (is.null(data_type) || any(is.na(data_type)) || length(data_type) == 0) { - "unknown" - } else { - data_type[1] - } - - weight_id <- paste0(molecular_id, "|", safe_data_type, "_", context, "_", method) - - # Initialize the nested list structure if needed - if (is.null(weight[[weight_id]])) { - weight[[weight_id]] <- list() - } - - weight[[weight_id]][[study]] <- list( - chrom = chrom, - p0 = min(context_range), - p1 = max(context_range), - wgt = postqc_scaled_weight[[study]], - molecular_id = molecular_id, - weight_name = paste0(safe_data_type, "_", context, "_", method), - type = safe_data_type, - context = context, - method = method, # Add method info for quantile TWAS - quantile_start = quantile_start, # Add quantile start - quantile_end = quantile_end, # Add quantile end - quantile_range = quantile_range, # Add quantile range string - n_wgt = length(context_variants) - ) - } - } - } - } - } - } else if (!is.null(model_selected) & isTRUE(is_imputable)) { + if (!is.null(model_selected) & isTRUE(is_imputable)) { # TWAS for (study in gwas_studies) { postqc_scaled_weight[[study]] <- post_qc_twas_data[[molecular_id]][["weights_qced"]][[context]][[study]][["scaled_weights"]][, paste0(model_selected, "_weights"), drop = FALSE] @@ -466,37 +347,15 @@ twas_pipeline <- function(twas_weights_data, # gene_z table if ("is_selected_method" %in% colnames(twas_table)) { - if (!quantile_twas) { - # Original TWAS: filter by selected methods only - twas_table <- twas_table[na.omit(twas_table$is_selected_method), , drop = FALSE] - } - # For quantile TWAS: include all methods, don't filter by is_selected_method + twas_table <- twas_table[na.omit(twas_table$is_selected_method), , drop = FALSE] } if (nrow(twas_table) > 0) { - # Adjust ID and group format based on whether it's quantile TWAS - if (quantile_twas) { - # For quantile TWAS: include method in ID to make each method unique - twas_table$id <- paste0(twas_table$molecular_id, "|", - ifelse(is.null(twas_table$type) | any(is.na(twas_table$type)), "unknown", twas_table$type), - "_", twas_table$context, "_", twas_table$method) - twas_table$group <- paste0(twas_table$context, "|", - ifelse(is.null(twas_table$type) | any(is.na(twas_table$type)), "unknown", twas_table$type), - "|", twas_table$method) - } else { - # Original TWAS ID format - keep unchanged - twas_table$id <- paste0(twas_table$molecular_id, "|", twas_table$type, "_", twas_table$context) - twas_table$group <- paste0(twas_table$context, "|", twas_table$type) - } + twas_table$id <- paste0(twas_table$molecular_id, "|", twas_table$type, "_", twas_table$context) + twas_table$group <- paste0(twas_table$context, "|", twas_table$type) twas_table$z <- twas_table$twas_z - # Select relevant columns for output, add quantile-specific columns for quantile TWAS output_columns <- c("id", "z", "type", "context", "group", "gwas_study") - if (quantile_twas) { - output_columns <- c(output_columns, "method") - if ("quantile_start" %in% colnames(twas_table)) output_columns <- c(output_columns, "quantile_start", "quantile_end") - if ("pseudo_R2_avg" %in% colnames(twas_table)) output_columns <- c(output_columns, "pseudo_R2_avg") - } twas_table <- twas_table[, intersect(output_columns, colnames(twas_table)), drop = FALSE] studies <- unique(twas_table$gwas_study) z_gene_list <- list() @@ -581,10 +440,6 @@ twas_pipeline <- function(twas_weights_data, warning(paste0("No data harmonized for ", weight_db, ". Returning NULL for TWAS result for this region.")) return(NULL) } - if (quantile_twas) { - rsq_cutoff <- 0 - message("Quantile TWAS detected. Skipping the selection of best model based on CV result.") - } if (rsq_cutoff > 0) { message("Selecting the best model based on criteria...") best_model_selection <- pick_best_model( @@ -694,51 +549,21 @@ twas_pipeline <- function(twas_weights_data, # merge twas_cv information for same gene across all weight db files, loop through each context for all methods gene_table <- do.call(rbind, lapply(contexts, function(context) { methods <- sub("_[^_]+$", "", names(twas_weights_data[[molecular_id]]$twas_cv_performance[[context]])) - if (quantile_twas) { - cv_performance <- twas_weights_data[[molecular_id]]$twas_cv_performance[[context]] - if (length(methods) == 0) { - context_table <- data.frame() - } else { - method_results <- list() - - for (method in methods) { - if (!is.null(cv_performance[[paste0(method, "_performance")]])) { - performance_data <- cv_performance[[paste0(method, "_performance")]] - method_results[[method]] <- data.frame( - context = context, - method = method, - quantile_start = performance_data[, "quantile_start"], - quantile_end = performance_data[, "quantile_end"], - pseudo_R2_avg = performance_data[, "pseudo_R2_avg"], - type = twas_weights_data[[molecular_id]][["data_type"]][[context]] - ) - } - } - - if (length(method_results) > 0) { - context_table <- do.call(rbind, method_results) - } else { - context_table <- data.frame() - } - } - } else { - # Original TWAS data extraction - is_imputable <- twas_data[[molecular_id]][["model_selection"]][[context]]$is_imputable - selected_method <- twas_data[[molecular_id]][["model_selection"]][[context]]$selected_model - if (is.null(selected_method)) selected_method <- NA - is_selected_method <- ifelse(methods == selected_method, TRUE, FALSE) - - cv_rsqs <- sapply(twas_weights_data[[molecular_id]]$twas_cv_performance[[context]], function(x) x[, rsq_option]) - cv_pvals <- sapply(twas_weights_data[[molecular_id]]$twas_cv_performance[[context]], function(x) x[, colnames(x)[which(colnames(x) %in% rsq_pval_option)]]) - - context_table <- data.frame( - context = context, method = methods, - is_imputable = is_imputable, - is_selected_method = is_selected_method, - rsq_cv = cv_rsqs, pval_cv = cv_pvals, - type = twas_weights_data[[molecular_id]][["data_type"]][[context]] - ) - } + is_imputable <- twas_data[[molecular_id]][["model_selection"]][[context]]$is_imputable + selected_method <- twas_data[[molecular_id]][["model_selection"]][[context]]$selected_model + if (is.null(selected_method)) selected_method <- NA + is_selected_method <- ifelse(methods == selected_method, TRUE, FALSE) + + cv_rsqs <- sapply(twas_weights_data[[molecular_id]]$twas_cv_performance[[context]], function(x) x[, rsq_option]) + cv_pvals <- sapply(twas_weights_data[[molecular_id]]$twas_cv_performance[[context]], function(x) x[, colnames(x)[which(colnames(x) %in% rsq_pval_option)]]) + + context_table <- data.frame( + context = context, method = methods, + is_imputable = is_imputable, + is_selected_method = is_selected_method, + rsq_cv = cv_rsqs, pval_cv = cv_pvals, + type = twas_weights_data[[molecular_id]][["data_type"]][[context]] + ) return(context_table) })) gene_table$molecular_id <- molecular_id @@ -748,20 +573,14 @@ twas_pipeline <- function(twas_weights_data, twas_table$block <- region_block # Step 3. merge twas result table and twas input into twas_data to output - colname_ordered <- if (quantile_twas) { - c("chr", "molecular_id", "context", "gwas_study", "method", "quantile_start", "quantile_end", "pseudo_R2_avg", "twas_z", "twas_pval", "type", "block") - } else { - c("chr", "molecular_id", "context", "gwas_study", "method", "is_imputable", "is_selected_method", "rsq_cv", "pval_cv", "twas_z", "twas_pval", "type", "block") - } + colname_ordered <- c("chr", "molecular_id", "context", "gwas_study", "method", "is_imputable", "is_selected_method", "rsq_cv", "pval_cv", "twas_z", "twas_pval", "type", "block") if (nrow(twas_results_table) == 0) { return(list(twas_result = NULL, twas_data = NULL, mr_result = NULL)) - } - twas_table <- merge(twas_table, twas_results_table, by = c("molecular_id", "context", "method")) - if (!quantile_twas) { - twas_table <- twas_table[twas_table$is_imputable, , drop = FALSE] } + twas_table <- merge(twas_table, twas_results_table, by = c("molecular_id", "context", "method")) + twas_table <- twas_table[twas_table$is_imputable, , drop = FALSE] if (output_twas_data & nrow(twas_table) > 0) { - twas_data_subset <- format_twas_data(twas_data, twas_table, quantile_twas = quantile_twas) + twas_data_subset <- format_twas_data(twas_data, twas_table) # if (!is.null(twas_data_subset)) twas_data_subset$snp_info <- snp_info } else { twas_data_subset <- NULL diff --git a/man/QUAIL_pipeline.Rd b/man/QUAIL_pipeline.Rd deleted file mode 100644 index 51011d2a..00000000 --- a/man/QUAIL_pipeline.Rd +++ /dev/null @@ -1,36 +0,0 @@ -% Generated by roxygen2: do not edit by hand -% Please edit documentation in R/quail_vqtl.R -\name{QUAIL_pipeline} -\alias{QUAIL_pipeline} -\title{Main QUAIL pipeline -QUAIL vQTL Analysis Pipeline} -\usage{ -QUAIL_pipeline( - genotype, - rank_score, - phenotype = NULL, - covariates = NULL, - phenotype_id = NULL -) -} -\arguments{ -\item{genotype}{numeric matrix (n x p) of genotypes.} - -\item{rank_score}{numeric vector (n x 1) of rank scores from Step 1.} - -\item{phenotype}{optional numeric vector (n x 1) of original phenotype values.} - -\item{covariates}{optional numeric matrix (n x k) of covariates.} -} -\value{ -A data frame containing vQTL results. -} -\description{ -Main QUAIL pipeline -QUAIL vQTL Analysis Pipeline -} -\examples{ -\dontrun{ -results <- QUAIL_pipeline(genotype, rank_score, covariates = covariates) -} -} diff --git a/man/QUAIL_rank_score_pipeline.Rd b/man/QUAIL_rank_score_pipeline.Rd deleted file mode 100644 index 82ce532a..00000000 --- a/man/QUAIL_rank_score_pipeline.Rd +++ /dev/null @@ -1,31 +0,0 @@ -% Generated by roxygen2: do not edit by hand -% Please edit documentation in R/quail_rank_score.R -\name{QUAIL_rank_score_pipeline} -\alias{QUAIL_rank_score_pipeline} -\title{Main QUAIL Rank Score Pipeline} -\usage{ -QUAIL_rank_score_pipeline( - phenotype, - covariates, - num_tau_levels = 19, - method = "equal", - num_cores = 1 -) -} -\arguments{ -\item{phenotype}{numeric vector of phenotype values} - -\item{covariates}{matrix/data.frame of covariates} - -\item{num_tau_levels}{integer number of quantile levels} - -\item{method}{character "equal" or "ivw"} - -\item{num_cores}{integer number of cores for parallel processing} -} -\value{ -data.frame with integrated rank scores -} -\description{ -Main QUAIL Rank Score Pipeline -} diff --git a/man/calculate_xi_correlation.Rd b/man/calculate_xi_correlation.Rd deleted file mode 100644 index 782e09df..00000000 --- a/man/calculate_xi_correlation.Rd +++ /dev/null @@ -1,27 +0,0 @@ -% Generated by roxygen2: do not edit by hand -% Please edit documentation in R/quantile_twas_weight.R -\name{calculate_xi_correlation} -\alias{calculate_xi_correlation} -\title{Calculate Xi Correlation for QR Coefficients} -\usage{ -calculate_xi_correlation( - rq_coef_result, - tau_range = seq(0.1, 0.9, by = 0.05), - min_valid = 10 -) -} -\arguments{ -\item{rq_coef_result}{Data frame containing variant_id and coef_qr_* columns} - -\item{tau_range}{Numeric vector of tau values to use (default: seq(0.1, 0.9, by = 0.05))} - -\item{min_valid}{Minimum number of valid (non-NA) coefficients required (default: 10)} -} -\value{ -A data frame with variant_id, xi, and xi_pval columns -} -\description{ -This function calculates the xi correlation coefficient and p-value for each variant, -measuring the functional dependence between tau values and QR coefficients. -Uses coef_qr_0.1 to coef_qr_0.9 (17 values, excluding 0.05 and 0.95). -} diff --git a/man/load_quantile_twas_weights.Rd b/man/load_quantile_twas_weights.Rd deleted file mode 100644 index 68e208a3..00000000 --- a/man/load_quantile_twas_weights.Rd +++ /dev/null @@ -1,38 +0,0 @@ -% Generated by roxygen2: do not edit by hand -% Please edit documentation in R/quantile_twas.R -\name{load_quantile_twas_weights} -\alias{load_quantile_twas_weights} -\title{Load Quantile TWAS Weights} -\usage{ -load_quantile_twas_weights( - weight_db_files, - tau_values = seq(0.01, 0.99, 0.01), - between_cluster = 0.8, - num_intervals = 3 -) -} -\arguments{ -\item{weight_db_files}{A character vector of file paths to the RDS files containing TWAS weights.} - -\item{tau_values}{A numeric vector representing the tau values (quantiles) to use. Default is \code{seq(0.01, 0.99, 0.01)}.} - -\item{between_cluster}{A numeric value specifying the correlation threshold for clustering. Default is 0.8.} - -\item{num_intervals}{The number of fixed non-overlapping intervals to divide the tau values. Default is 3.} -} -\value{ -A list containing: -\item{weights}{A list of integrated TWAS weights for each context.} -\item{twas_cv_performance}{A list of TWAS cross-validation performance metrics for each context.} -} -\description{ -This function loads TWAS weights from RDS files and performs grouped integration using both correlation-based clustering and fixed tau intervals. -} -\examples{ -weight_db_files <- c("file1.rds", "file2.rds") -tau_values <- seq(0.01, 0.99, by = 0.01) -result <- load_quantile_twas_weights(weight_db_files, tau_values) -weights <- result$weights -performance <- result$twas_cv_performance - -} diff --git a/man/multicontext_ld_clumping.Rd b/man/multicontext_ld_clumping.Rd deleted file mode 100644 index 6d0cbf09..00000000 --- a/man/multicontext_ld_clumping.Rd +++ /dev/null @@ -1,31 +0,0 @@ -% Generated by roxygen2: do not edit by hand -% Please edit documentation in R/quantile_twas_weight.R -\name{multicontext_ld_clumping} -\alias{multicontext_ld_clumping} -\title{Perform Clumping and Pruning} -\usage{ -multicontext_ld_clumping( - X, - qr_results, - maf_list = NULL, - ld_clump_r2 = 0.2, - final_clump_r2 = 0.8 -) -} -\arguments{ -\item{X}{Matrix of genotypes} - -\item{qr_results}{Results from QR_screen} - -\item{maf_list}{List of minor allele frequencies (optional)} - -\item{ld_clump_r2}{R-squared threshold for initial LD clumping based on pvalue} - -\item{final_clump_r2}{R-squared threshold for final LD clumping based on MAF} -} -\value{ -A list containing final SNPs and clumped SNPs -} -\description{ -Perform Clumping and Pruning -} diff --git a/man/perform_qr_analysis.Rd b/man/perform_qr_analysis.Rd deleted file mode 100644 index c838e35b..00000000 --- a/man/perform_qr_analysis.Rd +++ /dev/null @@ -1,23 +0,0 @@ -% Generated by roxygen2: do not edit by hand -% Please edit documentation in R/quantile_twas_weight.R -\name{perform_qr_analysis} -\alias{perform_qr_analysis} -\title{Perform Quantile Regression Analysis to get beta} -\usage{ -perform_qr_analysis(X, Y, Z = NULL, tau_values = seq(0.05, 0.95, by = 0.05)) -} -\arguments{ -\item{X}{Matrix of predictors} - -\item{Y}{Matrix or vector of response variables} - -\item{Z}{Matrix of covariates (optional)} - -\item{tau_values}{Vector of quantiles to be analyzed} -} -\value{ -A data frame with QR coefficients for each quantile -} -\description{ -Perform Quantile Regression Analysis to get beta -} diff --git a/man/qr_screen.Rd b/man/qr_screen.Rd deleted file mode 100644 index 2b798e10..00000000 --- a/man/qr_screen.Rd +++ /dev/null @@ -1,50 +0,0 @@ -% Generated by roxygen2: do not edit by hand -% Please edit documentation in R/quantile_twas_weight.R -\name{qr_screen} -\alias{qr_screen} -\title{Quantile TWAS Weight Calculation and QTL Analysis} -\usage{ -qr_screen( - X, - Y, - Z = NULL, - tau.list = seq(0.05, 0.95, by = 0.05), - screen_threshold = 0.05, - screen_method = "qvalue", - top_count = 10, - top_percent = 15 -) -} -\arguments{ -\item{X}{Matrix of predictors} - -\item{Y}{Matrix or vector of response variables} - -\item{Z}{Matrix of covariates (optional)} - -\item{tau.list}{Vector of quantiles to be analyzed} - -\item{screen_threshold}{Significance threshold for adjusted p-values} - -\item{screen_method}{Method for p-value adjustment ('fdr' or 'qvalue')} - -\item{top_count}{Number of top SNPs to select} - -\item{top_percent}{Percentage of top SNPs to select} -} -\value{ -A list containing various results from the QR screen -} -\description{ -This file contains functions for performing Quantile Transcriptome-Wide -Association Studies (TWAS) weight calculations and Quantile QTL analysis. -It provides tools for screening quantile regression results, performing -LD clumping and pruning, and calculating TWAS weights. -} -\details{ -The main function in this file is `quantile_twas_weight_pipeline`, which -orchestrates the entire analysis process. Other functions are helper -functions used within the main pipeline. - -Qrank Score Test Screen -} diff --git a/man/quantile_twas_weight_pipeline.Rd b/man/quantile_twas_weight_pipeline.Rd deleted file mode 100644 index c1726a5b..00000000 --- a/man/quantile_twas_weight_pipeline.Rd +++ /dev/null @@ -1,78 +0,0 @@ -% Generated by roxygen2: do not edit by hand -% Please edit documentation in R/quantile_twas_weight.R -\name{quantile_twas_weight_pipeline} -\alias{quantile_twas_weight_pipeline} -\title{Quantile TWAS Weight Pipeline} -\usage{ -quantile_twas_weight_pipeline( - X, - Y, - Z = NULL, - maf = NULL, - region_id = "", - ld_reference_meta_file = NULL, - twas_maf_cutoff = 0.01, - ld_clumping = FALSE, - ld_pruning = FALSE, - screen_significant = TRUE, - quantile_qtl_tau_list = seq(0.05, 0.95, by = 0.05), - quantile_twas_tau_list = seq(0.01, 0.99, by = 0.01), - screen_method = "qvalue", - screen_threshold = 0.05, - xi_tau_range = seq(0.1, 0.9, by = 0.05), - keep_variants = NULL, - marginal_beta_calculate = TRUE, - twas_weight_calculate = TRUE, - qrank_screen_calculate = TRUE, - vqtl_calculate = TRUE, - pre_filter_by_pqr = FALSE, - initial_corr_filter_cutoff = 0.8, - full_rank_corr_filter_cutoff = seq(0.75, 0.5, by = -0.05) -) -} -\arguments{ -\item{X}{Matrix of genotypes} - -\item{Y}{Matrix or vector of phenotypes} - -\item{Z}{Matrix of covariates (optional)} - -\item{maf}{Vector of minor allele frequencies (optional)} - -\item{region_id}{Name of the region being analyzed} - -\item{quantile_qtl_tau_list}{Vector of quantiles for QTL analysis} - -\item{quantile_twas_tau_list}{Vector of quantiles for TWAS analysis} -} -\value{ -A list containing various results from the TWAS weight pipeline: -\itemize{ - \item qr_screen_pvalue_df: Data frame with QR screening results: pavlue, qvalue and zscore. - \item message: Any informational or warning messages. - \item twas_variant_names: Names of variants used in TWAS weight calculation. - \item rq_coef_df: Data frame with quantile regression coefficients. - \item twas_weight: Matrix of TWAS weights. - \item pseudo_R2: Vector of pseudo R-squared values. - \item quantile_twas_prediction: Matrix of TWAS predictions. -} -} -\description{ -Quantile TWAS Weight Pipeline -} -\details{ -The function performs the following steps: -1. QR screening to identify significant SNPs. -2. Filtering of highly correlated SNPs. -3. LD clumping and pruning(use filtered SNPs from step 1). -4. Calculation of QR coefficients for selected SNPs(use filtered SNPs from step 3). -5. Calculation of TWAS weights and pseudo R-squared values(use filtered SNPs from step 2). -} -\examples{ -# Example usage: -# X <- matrix of genotypes -# Y <- vector of phenotypes -# Z <- matrix of covariates -# results <- quantile_twas_weight_pipeline(X, Y, Z, region_id = "GeneA") - -} diff --git a/tests/testthat/test_ctwas.R b/tests/testthat/test_ctwas.R new file mode 100644 index 00000000..8284a566 --- /dev/null +++ b/tests/testthat/test_ctwas.R @@ -0,0 +1,160 @@ +context("ctwas") + +# =========================================================================== +# ctwas wrapper tests +# =========================================================================== + + +# ---------- trim_ctwas_variants -------------------------------------------- + +# Helper: build a minimal region_data structure that trim_ctwas_variants expects +make_mock_region_data <- function() { + # Variant IDs in canonical format (chr:pos:A2:A1) + variant_ids <- c("chr1:1000:A:G", "chr1:2000:C:T", "chr1:3000:G:A", "chr1:4000:T:C") + + # Weight matrix (4 variants x 1 weight column) + wgt <- matrix(c(0.5, 0.0001, 0.3, -0.2), nrow = 4, ncol = 1) + rownames(wgt) <- variant_ids + + gene_id <- "GENE1|ctx1" + context <- "ctx1" + study <- "study1" + + weights <- list() + weights[[gene_id]] <- list() + weights[[gene_id]][[study]] <- list( + wgt = wgt, + context = context, + p0 = 1000, + p1 = 4000 + ) + + # SuSiE intermediate info + pip_vals <- c(0.8, 0.05, 0.6, 0.02) + names(pip_vals) <- variant_ids + + susie_weights_intermediate <- list() + susie_weights_intermediate[["GENE1"]] <- list() + susie_weights_intermediate[["GENE1"]][[context]] <- list( + pip = pip_vals, + cs_variants = list(variant_ids[c(1, 3)]), + cs_purity = list(min.abs.corr = 0.9) + ) + + list( + weights = weights, + susie_weights_intermediate = susie_weights_intermediate + ) +} + +test_that("trim_ctwas_variants removes variants below weight cutoff", { + rd <- make_mock_region_data() + # Default cutoff 1e-5, variant 2 has weight 0.0001 (above), so all 4 should pass default cutoff + result <- trim_ctwas_variants(rd, twas_weight_cutoff = 1e-5) + expect_true(is.list(result)) + # With a higher cutoff, the near-zero variant should be removed + result_strict <- trim_ctwas_variants(rd, twas_weight_cutoff = 0.001) + # study1 should exist in result + expect_true("study1" %in% names(result_strict)) + # Get the gene-level result + gene_weights <- result_strict[["study1"]][["GENE1|ctx1"]] + # Variant 2 has abs(weight) = 0.0001 < 0.001, so should be removed + expect_false("chr1:2000:C:T" %in% rownames(gene_weights$wgt)) +}) + +test_that("trim_ctwas_variants removes gene when all weights below cutoff", { + rd <- make_mock_region_data() + # Set cutoff so high that all variants are dropped + result <- trim_ctwas_variants(rd, twas_weight_cutoff = 10) + # Gene should be removed entirely since no weights pass the cutoff + # Result should be an empty list + expect_equal(length(result), 0) +}) + +test_that("trim_ctwas_variants returns result keyed by study", { + rd <- make_mock_region_data() + result <- trim_ctwas_variants(rd, twas_weight_cutoff = 1e-5) + # merge_by_study reorganizes: weights[[study]][[group]] + expect_true("study1" %in% names(result)) + expect_true("GENE1|ctx1" %in% names(result[["study1"]])) +}) + +test_that("trim_ctwas_variants updates p0 and p1 positions", { + rd <- make_mock_region_data() + # Use a weight cutoff that removes the variant at position 2000 + result <- trim_ctwas_variants(rd, twas_weight_cutoff = 0.001) + gene_weights <- result[["study1"]][["GENE1|ctx1"]] + # p0 and p1 should reflect the range of remaining variant positions + remaining_positions <- as.integer(sapply( + rownames(gene_weights$wgt), + function(v) strsplit(v, ":")[[1]][2] + )) + expect_equal(gene_weights$p0, min(remaining_positions)) + expect_equal(gene_weights$p1, max(remaining_positions)) +}) + +test_that("trim_ctwas_variants respects max_num_variants", { + rd <- make_mock_region_data() + # Request max 2 variants; since nrow(wgt) == 4 >= max_num_variants == 2, + # it triggers select_variants which picks by PIP priority + result <- trim_ctwas_variants(rd, twas_weight_cutoff = 1e-5, max_num_variants = 2) + gene_weights <- result[["study1"]][["GENE1|ctx1"]] + expect_true(nrow(gene_weights$wgt) <= 2) +}) + +test_that("trim_ctwas_variants handles NA weights by removing group", { + rd <- make_mock_region_data() + # Replace all weights with NA + rd$weights[["GENE1|ctx1"]][["study1"]]$wgt[, 1] <- NA + result <- trim_ctwas_variants(rd, twas_weight_cutoff = 0) + # The group should be removed because all weights are NA + expect_equal(length(result), 0) +}) + +test_that("trim_ctwas_variants handles multiple genes", { + rd <- make_mock_region_data() + + # Add a second gene + variant_ids2 <- c("chr1:5000:A:G", "chr1:6000:C:T") + wgt2 <- matrix(c(0.4, -0.3), nrow = 2, ncol = 1) + rownames(wgt2) <- variant_ids2 + + rd$weights[["GENE2|ctx1"]] <- list() + rd$weights[["GENE2|ctx1"]][["study1"]] <- list( + wgt = wgt2, + context = "ctx1", + p0 = 5000, + p1 = 6000 + ) + + pip_vals2 <- c(0.7, 0.4) + names(pip_vals2) <- variant_ids2 + rd$susie_weights_intermediate[["GENE2"]] <- list() + rd$susie_weights_intermediate[["GENE2"]][["ctx1"]] <- list( + pip = pip_vals2, + cs_variants = list(variant_ids2[1]), + cs_purity = list(min.abs.corr = 0.95) + ) + + result <- trim_ctwas_variants(rd, twas_weight_cutoff = 1e-5) + expect_true("GENE1|ctx1" %in% names(result[["study1"]])) + expect_true("GENE2|ctx1" %in% names(result[["study1"]])) +}) + +test_that("trim_ctwas_variants select_variants uses cs_min_cor to include CS variants", { + rd <- make_mock_region_data() + # cs_purity min.abs.corr = 0.9, so with cs_min_cor = 0.8 the CS variants + # (variant 1 and 3) should be included. Max 2 variants. + result <- trim_ctwas_variants(rd, + twas_weight_cutoff = 1e-5, + cs_min_cor = 0.8, + min_pip_cutoff = 0.0, + max_num_variants = 2 + ) + gene_weights <- result[["study1"]][["GENE1|ctx1"]] + included <- rownames(gene_weights$wgt) + # CS variants chr1:1000:A:G and chr1:3000:G:A have highest PIPs (0.8 and 0.6) + # and are in the CS, so they should be prioritized + expect_true("chr1:1000:A:G" %in% included) + expect_true("chr1:3000:G:A" %in% included) +}) diff --git a/tests/testthat/test_quail_ctwas.R b/tests/testthat/test_quail_ctwas.R deleted file mode 100644 index 7a66c55f..00000000 --- a/tests/testthat/test_quail_ctwas.R +++ /dev/null @@ -1,374 +0,0 @@ -context("quail_ctwas") - -# =========================================================================== -# Helper: small simulated phenotype + covariate data (n = 30) -# =========================================================================== -make_quail_data <- function(n = 30, seed = 42) { - set.seed(seed) - covariates <- data.frame( - age = rnorm(n, 50, 10), - sex = rbinom(n, 1, 0.5) - ) - # Phenotype with variance that depends on covariates (heteroscedastic) - phenotype <- 2 + 0.5 * covariates$age + rnorm(n, sd = 1 + 0.3 * covariates$sex) - list(phenotype = phenotype, covariates = covariates) -} - -# =========================================================================== -# QUAIL rank score tests -# =========================================================================== - -# ---------- calculate_rank_score ------------------------------------------- - -test_that("calculate_rank_score returns vector of correct length", { - skip_if_not_installed("quantreg") - d <- make_quail_data() - result <- pecotmr:::calculate_rank_score(d$phenotype, d$covariates, tau = 0.5, seed = 1) - expect_true(is.numeric(result)) - expect_equal(length(result), length(d$phenotype)) -}) - -test_that("calculate_rank_score is reproducible with same seed", { - skip_if_not_installed("quantreg") - d <- make_quail_data() - r1 <- pecotmr:::calculate_rank_score(d$phenotype, d$covariates, tau = 0.25, seed = 99) - r2 <- pecotmr:::calculate_rank_score(d$phenotype, d$covariates, tau = 0.25, seed = 99) - expect_identical(r1, r2) -}) - -test_that("calculate_rank_score differs with different seeds", { - skip_if_not_installed("quantreg") - d <- make_quail_data() - r1 <- pecotmr:::calculate_rank_score(d$phenotype, d$covariates, tau = 0.5, seed = 1) - r2 <- pecotmr:::calculate_rank_score(d$phenotype, d$covariates, tau = 0.5, seed = 2) - # Different random covariate column, so results should differ - - expect_false(identical(r1, r2)) -}) - -test_that("calculate_rank_score produces different results for different tau values", { - skip_if_not_installed("quantreg") - d <- make_quail_data() - r_low <- pecotmr:::calculate_rank_score(d$phenotype, d$covariates, tau = 0.1, seed = 1) - r_high <- pecotmr:::calculate_rank_score(d$phenotype, d$covariates, tau = 0.9, seed = 1) - expect_false(identical(r_low, r_high)) -}) - -test_that("calculate_rank_score returns finite values", { - skip_if_not_installed("quantreg") - d <- make_quail_data() - result <- pecotmr:::calculate_rank_score(d$phenotype, d$covariates, tau = 0.5, seed = 1) - expect_true(all(is.finite(result))) -}) - -# ---------- calculate_integrated_score ------------------------------------- - -test_that("calculate_integrated_score works with equal method and even num_tau_levels", { - skip_if_not_installed("quantreg") - skip_if_not_installed("quadprog") - n <- 30 - num_tau <- 4 # even - # Create mock rank scores: list of numeric vectors - set.seed(10) - rank_scores <- lapply(1:num_tau, function(i) rnorm(n)) - result <- pecotmr:::calculate_integrated_score(rank_scores, method = "equal", num_tau_levels = num_tau) - expect_true(is.numeric(result)) - expect_equal(length(result), n) - expect_true(all(is.finite(result))) -}) - -test_that("calculate_integrated_score works with equal method and odd num_tau_levels", { - skip_if_not_installed("quantreg") - skip_if_not_installed("quadprog") - n <- 30 - num_tau <- 5 # odd - set.seed(11) - rank_scores <- lapply(1:num_tau, function(i) rnorm(n)) - result <- pecotmr:::calculate_integrated_score(rank_scores, method = "equal", num_tau_levels = num_tau) - expect_true(is.numeric(result)) - expect_equal(length(result), n) -}) - -test_that("calculate_integrated_score equal method: middle tau has zero weight for odd levels", { - skip_if_not_installed("quantreg") - skip_if_not_installed("quadprog") - n <- 20 - num_tau <- 3 # odd, mid_point = 2 - # Set all scores to zero except the middle one - rank_scores <- list(rep(0, n), rep(1, n), rep(0, n)) - result <- pecotmr:::calculate_integrated_score(rank_scores, method = "equal", num_tau_levels = num_tau) - # Middle score (index 2) is skipped for odd levels; lower_half = {1}, upper_half = {3} - # int_rank_score = -rank_scores[[1]] + rank_scores[[3]] = 0 + 0 = 0 - # n_pairs = 1, so result = 0/1 = 0 - expect_equal(result, rep(0, n)) -}) - -test_that("calculate_integrated_score IVW method returns correct length", { - skip("Known bug: solve.QP constraint matrix dimensions are incorrect in IVW method") - skip_if_not_installed("quantreg") - skip_if_not_installed("quadprog") - n <- 30 - num_tau <- 4 - set.seed(20) - rank_scores <- lapply(1:num_tau, function(i) rnorm(n, mean = i)) - result <- pecotmr:::calculate_integrated_score(rank_scores, method = "ivw", num_tau_levels = num_tau) - expect_true(is.numeric(result)) - expect_equal(length(result), n) - expect_true(all(is.finite(result))) -}) - -test_that("calculate_integrated_score IVW method with odd num_tau_levels", { - skip("Known bug: solve.QP constraint matrix dimensions are incorrect in IVW method") - skip_if_not_installed("quantreg") - skip_if_not_installed("quadprog") - n <- 30 - num_tau <- 5 - set.seed(21) - rank_scores <- lapply(1:num_tau, function(i) rnorm(n, mean = i * 0.5)) - result <- pecotmr:::calculate_integrated_score(rank_scores, method = "ivw", num_tau_levels = num_tau) - expect_true(is.numeric(result)) - expect_equal(length(result), n) -}) - -# ---------- fit_rank_scores ------------------------------------------------ - -test_that("fit_rank_scores returns list of correct length", { - skip_if_not_installed("quantreg") - d <- make_quail_data() - num_tau <- 3 - result <- pecotmr:::fit_rank_scores(d$phenotype, d$covariates, num_tau_levels = num_tau, num_cores = 1) - expect_true(is.list(result)) - expect_equal(length(result), num_tau) -}) - -test_that("fit_rank_scores each element has correct length", { - skip_if_not_installed("quantreg") - d <- make_quail_data() - num_tau <- 3 - result <- pecotmr:::fit_rank_scores(d$phenotype, d$covariates, num_tau_levels = num_tau, num_cores = 1) - for (i in seq_along(result)) { - expect_equal(length(result[[i]]), length(d$phenotype)) - expect_true(is.numeric(result[[i]])) - } -}) - -test_that("fit_rank_scores elements correspond to evenly spaced tau levels", { - skip_if_not_installed("quantreg") - d <- make_quail_data() - num_tau <- 3 - result <- pecotmr:::fit_rank_scores(d$phenotype, d$covariates, num_tau_levels = num_tau, num_cores = 1) - # Each element should match a direct call to calculate_rank_score at the expected tau - for (i in 1:num_tau) { - tau <- i / (num_tau + 1) - expected <- pecotmr:::calculate_rank_score(d$phenotype, d$covariates, tau) - expect_equal(result[[i]], expected) - } -}) - -# ---------- QUAIL_rank_score_pipeline (exported) --------------------------- - -test_that("QUAIL_rank_score_pipeline errors on non-numeric character phenotype", { - skip_if_not_installed("quantreg") - skip_if_not_installed("quadprog") - d <- make_quail_data() - pheno_char <- c("a", "b", "c") - expect_error( - QUAIL_rank_score_pipeline(pheno_char, d$covariates, num_tau_levels = 3), - "phenotype must be a numeric vector" - ) -}) - -test_that("QUAIL_rank_score_pipeline accepts data.frame phenotype", { - skip_if_not_installed("quantreg") - skip_if_not_installed("quadprog") - d <- make_quail_data() - pheno_df <- data.frame(pheno = d$phenotype) - # Should not error; data.frame is converted internally - result <- QUAIL_rank_score_pipeline(pheno_df, d$covariates, num_tau_levels = 3, method = "equal") - expect_true(is.numeric(result)) - expect_equal(length(result), length(d$phenotype)) -}) - -test_that("QUAIL_rank_score_pipeline full run with small data and equal method", { - skip_if_not_installed("quantreg") - skip_if_not_installed("quadprog") - d <- make_quail_data() - result <- QUAIL_rank_score_pipeline(d$phenotype, d$covariates, - num_tau_levels = 5, method = "equal", num_cores = 1 - ) - expect_true(is.numeric(result)) - expect_equal(length(result), length(d$phenotype)) - expect_true(all(is.finite(result))) -}) - -test_that("QUAIL_rank_score_pipeline full run with IVW method", { - skip("Known bug: solve.QP constraint matrix dimensions are incorrect in IVW method") - skip_if_not_installed("quantreg") - skip_if_not_installed("quadprog") - d <- make_quail_data() - result <- QUAIL_rank_score_pipeline(d$phenotype, d$covariates, - num_tau_levels = 4, method = "ivw", num_cores = 1 - ) - expect_true(is.numeric(result)) - expect_equal(length(result), length(d$phenotype)) - expect_true(all(is.finite(result))) -}) - -# =========================================================================== -# ctwas wrapper tests -# =========================================================================== - - -# ---------- trim_ctwas_variants -------------------------------------------- - -# Helper: build a minimal region_data structure that trim_ctwas_variants expects -make_mock_region_data <- function() { - # Variant IDs in canonical format (chr:pos:A2:A1) - variant_ids <- c("chr1:1000:A:G", "chr1:2000:C:T", "chr1:3000:G:A", "chr1:4000:T:C") - - # Weight matrix (4 variants x 1 weight column) - wgt <- matrix(c(0.5, 0.0001, 0.3, -0.2), nrow = 4, ncol = 1) - rownames(wgt) <- variant_ids - - gene_id <- "GENE1|ctx1" - context <- "ctx1" - study <- "study1" - - weights <- list() - weights[[gene_id]] <- list() - weights[[gene_id]][[study]] <- list( - wgt = wgt, - context = context, - p0 = 1000, - p1 = 4000 - ) - - # SuSiE intermediate info - pip_vals <- c(0.8, 0.05, 0.6, 0.02) - names(pip_vals) <- variant_ids - - susie_weights_intermediate <- list() - susie_weights_intermediate[["GENE1"]] <- list() - susie_weights_intermediate[["GENE1"]][[context]] <- list( - pip = pip_vals, - cs_variants = list(variant_ids[c(1, 3)]), - cs_purity = list(min.abs.corr = 0.9) - ) - - list( - weights = weights, - susie_weights_intermediate = susie_weights_intermediate - ) -} - -test_that("trim_ctwas_variants removes variants below weight cutoff", { - rd <- make_mock_region_data() - # Default cutoff 1e-5, variant 2 has weight 0.0001 (above), so all 4 should pass default cutoff - result <- trim_ctwas_variants(rd, twas_weight_cutoff = 1e-5) - expect_true(is.list(result)) - # With a higher cutoff, the near-zero variant should be removed - result_strict <- trim_ctwas_variants(rd, twas_weight_cutoff = 0.001) - # study1 should exist in result - expect_true("study1" %in% names(result_strict)) - # Get the gene-level result - gene_weights <- result_strict[["study1"]][["GENE1|ctx1"]] - # Variant 2 has abs(weight) = 0.0001 < 0.001, so should be removed - expect_false("chr1:2000:C:T" %in% rownames(gene_weights$wgt)) -}) - -test_that("trim_ctwas_variants removes gene when all weights below cutoff", { - rd <- make_mock_region_data() - # Set cutoff so high that all variants are dropped - result <- trim_ctwas_variants(rd, twas_weight_cutoff = 10) - # Gene should be removed entirely since no weights pass the cutoff - # Result should be an empty list - expect_equal(length(result), 0) -}) - -test_that("trim_ctwas_variants returns result keyed by study", { - rd <- make_mock_region_data() - result <- trim_ctwas_variants(rd, twas_weight_cutoff = 1e-5) - # merge_by_study reorganizes: weights[[study]][[group]] - expect_true("study1" %in% names(result)) - expect_true("GENE1|ctx1" %in% names(result[["study1"]])) -}) - -test_that("trim_ctwas_variants updates p0 and p1 positions", { - rd <- make_mock_region_data() - # Use a weight cutoff that removes the variant at position 2000 - result <- trim_ctwas_variants(rd, twas_weight_cutoff = 0.001) - gene_weights <- result[["study1"]][["GENE1|ctx1"]] - # p0 and p1 should reflect the range of remaining variant positions - remaining_positions <- as.integer(sapply( - rownames(gene_weights$wgt), - function(v) strsplit(v, ":")[[1]][2] - )) - expect_equal(gene_weights$p0, min(remaining_positions)) - expect_equal(gene_weights$p1, max(remaining_positions)) -}) - -test_that("trim_ctwas_variants respects max_num_variants", { - rd <- make_mock_region_data() - # Request max 2 variants; since nrow(wgt) == 4 >= max_num_variants == 2, - # it triggers select_variants which picks by PIP priority - result <- trim_ctwas_variants(rd, twas_weight_cutoff = 1e-5, max_num_variants = 2) - gene_weights <- result[["study1"]][["GENE1|ctx1"]] - expect_true(nrow(gene_weights$wgt) <= 2) -}) - -test_that("trim_ctwas_variants handles NA weights by removing group", { - rd <- make_mock_region_data() - # Replace all weights with NA - rd$weights[["GENE1|ctx1"]][["study1"]]$wgt[, 1] <- NA - result <- trim_ctwas_variants(rd, twas_weight_cutoff = 0) - # The group should be removed because all weights are NA - expect_equal(length(result), 0) -}) - -test_that("trim_ctwas_variants handles multiple genes", { - rd <- make_mock_region_data() - - # Add a second gene - variant_ids2 <- c("chr1:5000:A:G", "chr1:6000:C:T") - wgt2 <- matrix(c(0.4, -0.3), nrow = 2, ncol = 1) - rownames(wgt2) <- variant_ids2 - - rd$weights[["GENE2|ctx1"]] <- list() - rd$weights[["GENE2|ctx1"]][["study1"]] <- list( - wgt = wgt2, - context = "ctx1", - p0 = 5000, - p1 = 6000 - ) - - pip_vals2 <- c(0.7, 0.4) - names(pip_vals2) <- variant_ids2 - rd$susie_weights_intermediate[["GENE2"]] <- list() - rd$susie_weights_intermediate[["GENE2"]][["ctx1"]] <- list( - pip = pip_vals2, - cs_variants = list(variant_ids2[1]), - cs_purity = list(min.abs.corr = 0.95) - ) - - result <- trim_ctwas_variants(rd, twas_weight_cutoff = 1e-5) - expect_true("GENE1|ctx1" %in% names(result[["study1"]])) - expect_true("GENE2|ctx1" %in% names(result[["study1"]])) -}) - -test_that("trim_ctwas_variants select_variants uses cs_min_cor to include CS variants", { - rd <- make_mock_region_data() - # cs_purity min.abs.corr = 0.9, so with cs_min_cor = 0.8 the CS variants - # (variant 1 and 3) should be included. Max 2 variants. - result <- trim_ctwas_variants(rd, - twas_weight_cutoff = 1e-5, - cs_min_cor = 0.8, - min_pip_cutoff = 0.0, - max_num_variants = 2 - ) - gene_weights <- result[["study1"]][["GENE1|ctx1"]] - included <- rownames(gene_weights$wgt) - # CS variants chr1:1000:A:G and chr1:3000:G:A have highest PIPs (0.8 and 0.6) - # and are in the CS, so they should be prioritized - expect_true("chr1:1000:A:G" %in% included) - expect_true("chr1:3000:G:A" %in% included) -}) diff --git a/tests/testthat/test_quail_vqtl.R b/tests/testthat/test_quail_vqtl.R deleted file mode 100644 index 39170ddd..00000000 --- a/tests/testthat/test_quail_vqtl.R +++ /dev/null @@ -1,338 +0,0 @@ -context("quail_vqtl") - -# =========================================================================== -# Helpers: small synthetic genotype / phenotype data -# =========================================================================== - -make_genotype_matrix <- function(n = 50, p = 5, seed = 123) { - set.seed(seed) - geno <- matrix(rbinom(n * p, 2, 0.3), nrow = n, ncol = p) - colnames(geno) <- paste0("chr1:", seq(100, by = 100, length.out = p), ":A:G") - geno -} - -make_rank_score <- function(n = 50, seed = 123) { - set.seed(seed) - rnorm(n) -} - -make_covariates <- function(n = 50, seed = 123) { - set.seed(seed + 1) - data.frame( - age = rnorm(n, 50, 10), - sex = rbinom(n, 1, 0.5) - ) -} - -# =========================================================================== -# QUAIL_pipeline tests -# =========================================================================== - -# ---------- Input validation ----------------------------------------------- - -test_that("QUAIL_pipeline errors when rank_score is not numeric", { - geno <- make_genotype_matrix() - expect_error( - QUAIL_pipeline(geno, rank_score = c("a", "b", "c")), - "rank_score must be a numeric vector" - ) -}) - -# ---------- Basic synthetic data run --------------------------------------- - -test_that("QUAIL_pipeline runs with basic synthetic data", { - skip_if_not_installed("qvalue") - geno <- make_genotype_matrix() - rs <- make_rank_score() - result <- QUAIL_pipeline(geno, rs) - expect_true(is.data.frame(result)) - expect_equal(nrow(result), ncol(geno)) -}) - -# ---------- Output structure ----------------------------------------------- - -test_that("QUAIL_pipeline output has required columns", { - skip_if_not_installed("qvalue") - geno <- make_genotype_matrix() - rs <- make_rank_score() - result <- QUAIL_pipeline(geno, rs) - expected_cols <- c("phenotype_id", "chr", "pos", "A1", "A2", - "variant_id", "beta", "se", "z", "p", "q", "N") - expect_true(all(expected_cols %in% colnames(result))) -}) - -test_that("QUAIL_pipeline N column equals number of samples", { - skip_if_not_installed("qvalue") - geno <- make_genotype_matrix(n = 40) - rs <- make_rank_score(n = 40) - result <- QUAIL_pipeline(geno, rs) - expect_true(all(result$N == 40)) -}) - -test_that("QUAIL_pipeline variant_id matches genotype column names", { - skip_if_not_installed("qvalue") - geno <- make_genotype_matrix() - rs <- make_rank_score() - result <- QUAIL_pipeline(geno, rs) - expect_equal(result$variant_id, colnames(geno)) -}) - -test_that("QUAIL_pipeline parses chr and pos from variant IDs", { - skip_if_not_installed("qvalue") - geno <- make_genotype_matrix() - rs <- make_rank_score() - result <- QUAIL_pipeline(geno, rs) - # All variants are on chr1 - expect_true(all(result$chr == 1)) - # Positions should be 100, 200, ..., 500 - expect_equal(result$pos, seq(100, by = 100, length.out = ncol(geno))) -}) - -test_that("QUAIL_pipeline phenotype_id defaults to NA when not provided", { - skip_if_not_installed("qvalue") - geno <- make_genotype_matrix() - rs <- make_rank_score() - result <- QUAIL_pipeline(geno, rs) - expect_true(all(is.na(result$phenotype_id))) -}) - -test_that("QUAIL_pipeline phenotype_id is set when provided", { - skip_if_not_installed("qvalue") - geno <- make_genotype_matrix() - rs <- make_rank_score() - result <- QUAIL_pipeline(geno, rs, phenotype_id = "my_trait") - expect_true(all(result$phenotype_id == "my_trait")) -}) - -# ---------- With covariates ------------------------------------------------ - -test_that("QUAIL_pipeline works with covariates", { - skip_if_not_installed("qvalue") - geno <- make_genotype_matrix() - rs <- make_rank_score() - cov <- make_covariates() - result <- QUAIL_pipeline(geno, rs, covariates = cov) - expect_true(is.data.frame(result)) - expect_equal(nrow(result), ncol(geno)) -}) - -test_that("QUAIL_pipeline covariates affect beta estimates", { - skip_if_not_installed("qvalue") - geno <- make_genotype_matrix() - rs <- make_rank_score() - cov <- make_covariates() - result_no_cov <- QUAIL_pipeline(geno, rs) - result_with_cov <- QUAIL_pipeline(geno, rs, covariates = cov) - # Betas should differ when adjusting for covariates - expect_false(identical(result_no_cov$beta, result_with_cov$beta)) -}) - -test_that("QUAIL_pipeline accepts matrix covariates", { - skip_if_not_installed("qvalue") - geno <- make_genotype_matrix() - rs <- make_rank_score() - cov_mat <- as.matrix(make_covariates()) - result <- QUAIL_pipeline(geno, rs, covariates = cov_mat) - expect_true(is.data.frame(result)) -}) - -# ---------- p-values and z-scores ------------------------------------------ - -test_that("QUAIL_pipeline p-values are between 0 and 1", { - skip_if_not_installed("qvalue") - geno <- make_genotype_matrix() - rs <- make_rank_score() - result <- QUAIL_pipeline(geno, rs) - expect_true(all(result$p >= 0 & result$p <= 1)) -}) - -test_that("QUAIL_pipeline z-scores are finite", { - skip_if_not_installed("qvalue") - geno <- make_genotype_matrix(n = 60, p = 4) - rs <- make_rank_score(n = 60) - result <- QUAIL_pipeline(geno, rs) - # z-scores should all be finite for well-conditioned data - expect_true(all(is.finite(result$z))) -}) - -# =========================================================================== -# univariate_regression tests (internal function) -# =========================================================================== - -test_that("univariate_regression returns expected list structure", { - skip_if_not_installed("qvalue") - set.seed(42) - X <- matrix(rnorm(200), 50, 4) - colnames(X) <- paste0("snp", 1:4) - y <- rnorm(50) - res <- pecotmr:::univariate_regression(X, y) - expect_true(is.list(res)) - expected_names <- c("betahat", "sebetahat", "z_scores", "p_values", "q_values") - expect_true(all(expected_names %in% names(res))) - # Each element should have length equal to ncol(X) - for (nm in expected_names) { - expect_equal(length(res[[nm]]), ncol(X)) - } -}) - -test_that("univariate_regression names outputs by column names of X", { - skip_if_not_installed("qvalue") - set.seed(42) - X <- matrix(rnorm(150), 50, 3) - colnames(X) <- c("var_A", "var_B", "var_C") - y <- rnorm(50) - res <- pecotmr:::univariate_regression(X, y) - expect_equal(names(res$betahat), colnames(X)) - expect_equal(names(res$sebetahat), colnames(X)) -}) - -test_that("univariate_regression handles centering", { - skip_if_not_installed("qvalue") - set.seed(42) - X <- matrix(rnorm(200, mean = 10), 50, 4) - colnames(X) <- paste0("s", 1:4) - y <- rnorm(50, mean = 5) - # With centering (default) - res_centered <- pecotmr:::univariate_regression(X, y, center = TRUE) - # Without centering - res_no_center <- pecotmr:::univariate_regression(X, y, center = FALSE) - # Since the regression includes an intercept, centering X and y - # should yield numerically equivalent slope estimates - expect_equal(res_centered$betahat, res_no_center$betahat, tolerance = 1e-10) - expect_equal(res_centered$sebetahat, res_no_center$sebetahat, tolerance = 1e-10) - # Both results should have valid finite values - expect_true(all(is.finite(res_centered$betahat))) - expect_true(all(is.finite(res_no_center$betahat))) -}) - -test_that("univariate_regression handles scaling", { - skip_if_not_installed("qvalue") - set.seed(42) - X <- matrix(rnorm(200, sd = 5), 50, 4) - colnames(X) <- paste0("s", 1:4) - y <- rnorm(50) - res_scaled <- pecotmr:::univariate_regression(X, y, scale = TRUE) - res_not_scaled <- pecotmr:::univariate_regression(X, y, scale = FALSE) - # Scaling divides each column by its SD, so beta_scaled = beta_unscaled * column_sd - col_sds <- apply(X, 2, sd) - expect_equal(res_scaled$betahat, res_not_scaled$betahat * col_sds, - tolerance = 1e-10) - # Both should be finite - expect_true(all(is.finite(res_scaled$betahat))) - expect_true(all(is.finite(res_not_scaled$betahat))) -}) - -test_that("univariate_regression handles covariates", { - skip_if_not_installed("qvalue") - set.seed(42) - X <- matrix(rnorm(200), 50, 4) - colnames(X) <- paste0("s", 1:4) - Z <- matrix(rnorm(100), 50, 2) - y <- rnorm(50) - res_cov <- pecotmr:::univariate_regression(X, y, Z = Z) - res_no_cov <- pecotmr:::univariate_regression(X, y) - # Adjusting for covariates should generally reduce residual SE - # (or at minimum keep it comparable) - expect_true(all(res_cov$sebetahat <= res_no_cov$sebetahat + 1e-10)) - # Both should return valid finite results - expect_true(all(is.finite(res_cov$betahat))) - expect_true(all(is.finite(res_cov$sebetahat))) -}) - -test_that("univariate_regression handles NAs in y", { - skip_if_not_installed("qvalue") - set.seed(42) - X <- matrix(rnorm(200), 50, 4) - colnames(X) <- paste0("s", 1:4) - y <- rnorm(50) - y[c(1, 5, 10)] <- NA - res <- pecotmr:::univariate_regression(X, y) - # Should still return valid results - expect_equal(length(res$betahat), 4) - expect_true(all(is.finite(res$betahat))) -}) - -test_that("univariate_regression return_residuals works with covariates", { - skip_if_not_installed("qvalue") - set.seed(42) - X <- matrix(rnorm(200), 50, 4) - colnames(X) <- paste0("s", 1:4) - Z <- matrix(rnorm(100), 50, 2) - y <- rnorm(50) - res <- pecotmr:::univariate_regression(X, y, Z = Z, return_residuals = TRUE) - expect_true("residuals" %in% names(res)) - expect_equal(length(res$residuals), length(y)) -}) - -test_that("univariate_regression return_residuals without covariates omits residuals", { - skip_if_not_installed("qvalue") - set.seed(42) - X <- matrix(rnorm(200), 50, 4) - colnames(X) <- paste0("s", 1:4) - y <- rnorm(50) - res <- pecotmr:::univariate_regression(X, y, return_residuals = TRUE) - # Without Z, residuals should not be in the output - expect_false("residuals" %in% names(res)) -}) - -# =========================================================================== -# run_linear_regression tests (internal function) -# =========================================================================== - -test_that("run_linear_regression returns a data frame with expected columns", { - skip_if_not_installed("qvalue") - geno <- make_genotype_matrix(n = 40, p = 3) - pheno <- make_rank_score(n = 40) - res <- pecotmr:::run_linear_regression(geno, pheno) - expected_cols <- c("phenotype_id", "chr", "pos", "A1", "A2", - "variant_id", "beta", "se", "z", "p", "q", "N") - expect_true(all(expected_cols %in% colnames(res))) - expect_equal(nrow(res), ncol(geno)) -}) - -test_that("run_linear_regression passes phenotype_id through", { - skip_if_not_installed("qvalue") - geno <- make_genotype_matrix(n = 40, p = 3) - pheno <- make_rank_score(n = 40) - res <- pecotmr:::run_linear_regression(geno, pheno, phenotype_id = "test_pheno") - expect_true(all(res$phenotype_id == "test_pheno")) -}) - -# =========================================================================== -# Edge cases -# =========================================================================== - -test_that("QUAIL_pipeline works with a single SNP", { - skip_if_not_installed("qvalue") - set.seed(99) - n <- 30 - geno <- matrix(rbinom(n, 2, 0.3), nrow = n, ncol = 1) - colnames(geno) <- "chr1:500:C:T" - rs <- rnorm(n) - result <- QUAIL_pipeline(geno, rs) - expect_true(is.data.frame(result)) - expect_equal(nrow(result), 1) -}) - -test_that("univariate_regression handles all-zero genotype column gracefully", { - skip_if_not_installed("qvalue") - set.seed(99) - n <- 40 - X <- matrix(rnorm(n * 3), n, 3) - X[, 2] <- 0 # all-zero column - colnames(X) <- paste0("s", 1:3) - y <- rnorm(n) - # Should not error; the zero-variance column gets NaN which is set to 0 - res <- pecotmr:::univariate_regression(X, y) - expect_equal(length(res$betahat), 3) - # The all-zero column (after centering, NaN set to 0) produces beta = 0 - expect_equal(unname(res$betahat[2]), 0) -}) - -test_that("QUAIL_pipeline se values are non-negative", { - skip_if_not_installed("qvalue") - geno <- make_genotype_matrix(n = 60, p = 4) - rs <- make_rank_score(n = 60) - result <- QUAIL_pipeline(geno, rs) - expect_true(all(result$se >= 0)) -}) diff --git a/tests/testthat/test_quantile_twas.R b/tests/testthat/test_quantile_twas.R deleted file mode 100644 index f3d99282..00000000 --- a/tests/testthat/test_quantile_twas.R +++ /dev/null @@ -1,717 +0,0 @@ -context("quantile_twas") - -# =========================================================================== -# ensure_continuous_clusters (internal) -# =========================================================================== - -test_that("ensure_continuous_clusters preserves already continuous", { - index <- c(1, 1, 2, 2, 3) - result <- pecotmr:::ensure_continuous_clusters(index) - expect_equal(result, c(1, 1, 2, 2, 3)) -}) - -test_that("ensure_continuous_clusters renumbers non-continuous", { - index <- c(1, 2, 1, 2, 3) - result <- pecotmr:::ensure_continuous_clusters(index) - expect_equal(result, c(1, 2, 3, 4, 5)) -}) - -test_that("ensure_continuous_clusters handles single element", { - result <- pecotmr:::ensure_continuous_clusters(c(1)) - expect_equal(result, c(1)) -}) - -test_that("ensure_continuous_clusters handles two elements same cluster", { - result <- pecotmr:::ensure_continuous_clusters(c(1, 1)) - expect_equal(result, c(1, 1)) -}) - -test_that("ensure_continuous_clusters all same cluster", { - result <- pecotmr:::ensure_continuous_clusters(c(3, 3, 3, 3)) - expect_equal(result, c(1, 1, 1, 1)) -}) - -# =========================================================================== -# get_cluster_ranges (internal) -# =========================================================================== - -test_that("get_cluster_ranges returns correct ranges", { - index <- c(1, 1, 2, 2, 3) - result <- pecotmr:::get_cluster_ranges(index) - expect_length(result, 3) - expect_equal(result[[1]], "1 - 2") - expect_equal(result[[2]], "3 - 4") - expect_equal(result[[3]], "5 - 5") -}) - -test_that("get_cluster_ranges single cluster", { - index <- c(1, 1, 1) - result <- pecotmr:::get_cluster_ranges(index) - expect_length(result, 1) - expect_equal(result[[1]], "1 - 3") -}) - -# =========================================================================== -# get_modularity (internal) -# =========================================================================== - -test_that("get_modularity returns 0 for single element", { - W <- matrix(1, nrow = 1, ncol = 1) - B <- matrix(1, nrow = 1, ncol = 1) - result <- pecotmr:::get_modularity(W, B) - expect_equal(result, 0) -}) - -test_that("get_modularity returns numeric for identity weight", { - W <- diag(5) - B <- matrix(c(rep(c(1, 0), c(3, 2)), rep(c(0, 1), c(3, 2))), nrow = 5) - result <- pecotmr:::get_modularity(W, B) - expect_type(result, "double") -}) - -test_that("get_modularity handles all-positive weights", { - set.seed(42) - W <- abs(matrix(rnorm(16), 4, 4)) - W <- (W + t(W)) / 2 - B <- matrix(c(1, 1, 0, 0, 0, 0, 1, 1), nrow = 4) - result <- pecotmr:::get_modularity(W, B) - expect_type(result, "double") - expect_true(is.finite(result)) -}) - -# =========================================================================== -# get_n_cluster (internal) -# =========================================================================== - -test_that("get_n_cluster returns single cluster for high correlation", { - Sigma <- matrix(0.9, nrow = 5, ncol = 5) - diag(Sigma) <- 1 - hc <- hclust(as.dist(1 - Sigma)) - result <- pecotmr:::get_n_cluster(hc, Sigma, between_cluster = 0.8) - expect_equal(result$n_cluster, 1) -}) - -test_that("get_n_cluster returns multiple clusters for low correlation", { - set.seed(42) - Sigma <- diag(10) - Sigma[1:5, 1:5] <- 0.8 - Sigma[6:10, 6:10] <- 0.8 - diag(Sigma) <- 1 - hc <- hclust(as.dist(1 - Sigma)) - result <- pecotmr:::get_n_cluster(hc, Sigma, between_cluster = 0.5) - expect_true(result$n_cluster >= 1) -}) - -# =========================================================================== -# integrate_tau (internal) -# =========================================================================== - -test_that("integrate_tau with vector input", { - tau <- c(0.25, 0.50, 0.75) - a_tau <- c(1, 2, 1) - result <- pecotmr:::integrate_tau(tau, a_tau) - expect_type(result, "double") - expect_true(result > 0) -}) - -test_that("integrate_tau with matrix input", { - tau <- c(0.25, 0.50, 0.75) - a_tau <- matrix(c(1, 2, 1, 0.5, 1, 0.5), nrow = 2, byrow = TRUE) - result <- pecotmr:::integrate_tau(tau, a_tau) - expect_equal(nrow(result), 2) - expect_true(all(result > 0)) -}) - -test_that("integrate_tau zero weights give zero", { - tau <- c(0.25, 0.50, 0.75) - a_tau <- c(0, 0, 0) - result <- pecotmr:::integrate_tau(tau, a_tau) - expect_equal(as.numeric(result), 0) -}) - -# =========================================================================== -# get_hierarchical_clusters (internal) -# =========================================================================== - -test_that("get_hierarchical_clusters returns valid structure", { - set.seed(42) - p <- 10 - Sigma <- matrix(0.3, nrow = p, ncol = p) - Sigma[1:5, 1:5] <- 0.9 - Sigma[6:10, 6:10] <- 0.9 - diag(Sigma) <- 1 - result <- pecotmr:::get_hierarchical_clusters(Sigma, between_cluster = 0.5) - expect_type(result, "list") - expect_true("cluster" %in% names(result)) - expect_true("Q_modularity_initial" %in% names(result)) - expect_true("cluster_ranges" %in% names(result)) - expect_equal(nrow(result$cluster), p) - expect_true(all(rowSums(result$cluster) == 1)) -}) - -# =========================================================================== -# perform_grouped_integration (internal) -# =========================================================================== - -test_that("perform_grouped_integration returns correct structure", { - set.seed(42) - n_variants <- 5 - n_tau <- 10 - twas_weight <- matrix(rnorm(n_variants * n_tau), nrow = n_variants) - rownames(twas_weight) <- paste0("v", 1:n_variants) - tau_values <- seq(0.1, 0.9, length.out = n_tau) - pseudo_R2 <- runif(n_tau, 0.1, 0.5) - - result <- pecotmr:::perform_grouped_integration(twas_weight, tau_values, pseudo_R2, - num_intervals = 3) - expect_type(result, "list") - expect_true("weights" %in% names(result)) - expect_true("twas_weight_performance" %in% names(result)) - expect_equal(nrow(result$weights), n_variants) - expect_true(ncol(result$weights) >= 3) -}) - -test_that("perform_grouped_integration single variant skips clustering", { - twas_weight <- matrix(rnorm(10), nrow = 1) - rownames(twas_weight) <- "v1" - tau_values <- seq(0.1, 0.9, length.out = 10) - pseudo_R2 <- runif(10, 0.1, 0.5) - - result <- pecotmr:::perform_grouped_integration(twas_weight, tau_values, pseudo_R2, - num_intervals = 3) - expect_equal(ncol(result$weights), 3) -}) - -# =========================================================================== -# corr_filter (internal) -# =========================================================================== - -test_that("corr_filter removes highly correlated columns", { - set.seed(42) - n <- 50; p <- 10 - X <- matrix(rnorm(n * p), nrow = n) - colnames(X) <- paste0("v", 1:p) - X[, 2] <- X[, 1] + rnorm(n, sd = 0.01) - result <- pecotmr:::corr_filter(X, cor_thres = 0.9) - expect_true(ncol(result$X.new) < p) - expect_true(length(result$filter.id) == ncol(result$X.new)) -}) - -test_that("corr_filter keeps all columns when uncorrelated", { - set.seed(42) - n <- 100; p <- 5 - X <- matrix(rnorm(n * p), nrow = n) - colnames(X) <- paste0("v", 1:p) - result <- pecotmr:::corr_filter(X, cor_thres = 0.99) - expect_equal(ncol(result$X.new), p) - expect_equal(result$filter.id, 1:p) -}) - -test_that("corr_filter preserves colnames for single remaining column", { - set.seed(42) - n <- 50 - X <- matrix(rnorm(n * 3), nrow = n) - colnames(X) <- c("a", "b", "c") - X[, 2] <- X[, 1] + rnorm(n, sd = 0.001) - X[, 3] <- X[, 1] + rnorm(n, sd = 0.001) - result <- pecotmr:::corr_filter(X, cor_thres = 0.5) - expect_true(ncol(result$X.new) >= 1) - expect_true(!is.null(colnames(result$X.new))) -}) - -test_that("corr_filter handles single-column input", { - set.seed(42) - n <- 30 - X <- matrix(rnorm(n), nrow = n, ncol = 1) - colnames(X) <- "v1" - expect_error(pecotmr:::corr_filter(X, cor_thres = 0.8)) -}) - -test_that("corr_filter with low threshold removes more columns", { - set.seed(42) - n <- 100; p <- 5 - X <- matrix(rnorm(n * p), nrow = n) - colnames(X) <- paste0("v", 1:p) - X[, 2] <- X[, 1] + rnorm(n, sd = 0.1) - X[, 3] <- X[, 1] + rnorm(n, sd = 0.1) - X[, 5] <- X[, 4] + rnorm(n, sd = 0.1) - result_strict <- pecotmr:::corr_filter(X, cor_thres = 0.3) - result_lenient <- pecotmr:::corr_filter(X, cor_thres = 0.99) - expect_true(ncol(result_strict$X.new) <= ncol(result_lenient$X.new)) -}) - -test_that("corr_filter preserves colnames when no columns deleted", { - set.seed(42) - n <- 100; p <- 3 - X <- matrix(rnorm(n * p), nrow = n) - colnames(X) <- c("snp_a", "snp_b", "snp_c") - result <- pecotmr:::corr_filter(X, cor_thres = 0.999) - expect_equal(colnames(result$X.new), colnames(X)) -}) - -# =========================================================================== -# remove_highcorr_snp (internal) -# =========================================================================== - -test_that("remove_highcorr_snp returns X unchanged when no problematic columns", { - X <- matrix(rnorm(100), nrow = 20, ncol = 5) - colnames(X) <- paste0("v", 1:5) - result <- pecotmr:::remove_highcorr_snp(X, problematic_cols = character(0), strategy = "correlation") - expect_equal(ncol(result), 5) -}) - -test_that("remove_highcorr_snp removes single problematic column", { - X <- matrix(rnorm(100), nrow = 20, ncol = 5) - colnames(X) <- paste0("v", 1:5) - result <- pecotmr:::remove_highcorr_snp(X, problematic_cols = "v3", strategy = "correlation") - expect_equal(ncol(result), 4) - expect_false("v3" %in% colnames(result)) -}) - -test_that("remove_highcorr_snp variance strategy removes lowest variance column", { - set.seed(42) - n <- 50 - X <- matrix(rnorm(n * 3), nrow = n, ncol = 3) - colnames(X) <- c("low_var", "mid_var", "high_var") - X[, 1] <- X[, 1] * 0.01 - X[, 3] <- X[, 3] * 10 - result <- pecotmr:::remove_highcorr_snp(X, problematic_cols = c("low_var", "mid_var", "high_var"), - strategy = "variance") - expect_equal(ncol(result), 2) - expect_false("low_var" %in% colnames(result)) -}) - -test_that("remove_highcorr_snp correlation strategy with two columns removes one randomly", { - set.seed(42) - X <- matrix(rnorm(100), nrow = 20, ncol = 5) - colnames(X) <- paste0("v", 1:5) - result <- pecotmr:::remove_highcorr_snp(X, problematic_cols = c("v1", "v2"), strategy = "correlation") - expect_equal(ncol(result), 4) - expect_true(xor("v1" %in% colnames(result), "v2" %in% colnames(result)) || - (!("v1" %in% colnames(result)) && !("v2" %in% colnames(result))) == FALSE) -}) - -test_that("remove_highcorr_snp correlation strategy with 3+ cols removes highest sum", { - set.seed(42) - n <- 50 - X <- matrix(rnorm(n * 4), nrow = n, ncol = 4) - colnames(X) <- paste0("v", 1:4) - X[, 2] <- X[, 1] + rnorm(n, sd = 0.01) - X[, 3] <- X[, 1] + rnorm(n, sd = 0.01) - result <- pecotmr:::remove_highcorr_snp(X, problematic_cols = c("v1", "v2", "v3"), - strategy = "correlation") - expect_equal(ncol(result), 3) -}) - -test_that("remove_highcorr_snp response_correlation strategy removes lowest response corr", { - set.seed(42) - n <- 50 - X <- matrix(rnorm(n * 3), nrow = n, ncol = 3) - colnames(X) <- c("v1", "v2", "v3") - response <- X[, 1] * 2 + rnorm(n, sd = 0.1) - result <- pecotmr:::remove_highcorr_snp(X, problematic_cols = c("v1", "v2", "v3"), - strategy = "response_correlation", - response = response) - expect_equal(ncol(result), 2) - expect_true("v1" %in% colnames(result)) -}) - -test_that("remove_highcorr_snp errors on invalid strategy", { - X <- matrix(rnorm(100), nrow = 20, ncol = 5) - colnames(X) <- paste0("v", 1:5) - expect_error( - pecotmr:::remove_highcorr_snp(X, problematic_cols = c("v1", "v2"), - strategy = "invalid_strategy"), - "arg" - ) -}) - -test_that("remove_highcorr_snp preserves column name when single column remains", { - set.seed(42) - X <- matrix(rnorm(40), nrow = 20, ncol = 2) - colnames(X) <- c("keeper", "removed") - result <- pecotmr:::remove_highcorr_snp(X, problematic_cols = "removed", strategy = "correlation") - expect_equal(ncol(result), 1) - expect_equal(colnames(result), "keeper") -}) - -# =========================================================================== -# check_remove_highcorr_snp (internal) -# =========================================================================== - -test_that("check_remove_highcorr_snp returns full-rank matrix when already full rank", { - set.seed(42) - n <- 50 - X <- matrix(rnorm(n * 3), nrow = n, ncol = 3) - colnames(X) <- paste0("v", 1:3) - C <- matrix(rnorm(n * 2), nrow = n, ncol = 2) - result <- pecotmr:::check_remove_highcorr_snp(X = X, C = C, strategy = "correlation") - expect_true(is.matrix(result)) - expect_true(ncol(result) >= 1) -}) - -test_that("check_remove_highcorr_snp handles rank-deficient design via corr_filter fallback", { - set.seed(42) - n <- 50 - X <- matrix(rnorm(n * 4), nrow = n, ncol = 4) - colnames(X) <- paste0("v", 1:4) - X[, 4] <- X[, 1] + X[, 2] - C <- NULL - result <- pecotmr:::check_remove_highcorr_snp(X = X, C = C, strategy = "correlation") - design <- cbind(1, result) - expect_equal(qr(design)$rank, ncol(design)) -}) - -test_that("check_remove_highcorr_snp preserves colname for single-column input", { - set.seed(42) - n <- 50 - X <- matrix(rnorm(n), nrow = n, ncol = 1) - colnames(X) <- "only_snp" - C <- NULL - result <- pecotmr:::check_remove_highcorr_snp(X = X, C = C, strategy = "correlation") - expect_equal(colnames(result), "only_snp") -}) - -# =========================================================================== -# calculate_coef_heterogeneity (internal) -# =========================================================================== - -test_that("calculate_coef_heterogeneity computes log(sd/mean)", { - df <- data.frame( - variant_id = c("v1", "v2"), - coef_qr_0.25 = c(1, 0), - coef_qr_0.50 = c(2, 0), - coef_qr_0.75 = c(3, 0), - stringsAsFactors = FALSE - ) - result <- pecotmr:::calculate_coef_heterogeneity(df) - expect_equal(nrow(result), 2) - expect_true("coef_heter" %in% colnames(result)) - expect_equal(result$coef_heter[1], log(1 / 2), tolerance = 0.01) - expect_true(is.na(result$coef_heter[2])) -}) - -test_that("calculate_coef_heterogeneity handles NA values", { - df <- data.frame( - variant_id = "v1", - coef_qr_0.25 = 1, - coef_qr_0.50 = NA, - coef_qr_0.75 = 3, - stringsAsFactors = FALSE - ) - result <- pecotmr:::calculate_coef_heterogeneity(df) - expect_equal(nrow(result), 1) - expect_true(is.finite(result$coef_heter[1]) || is.na(result$coef_heter[1])) -}) - -test_that("calculate_coef_heterogeneity handles all-same coefficients", { - df <- data.frame( - variant_id = "v1", - coef_qr_0.25 = 5, - coef_qr_0.50 = 5, - coef_qr_0.75 = 5, - stringsAsFactors = FALSE - ) - result <- pecotmr:::calculate_coef_heterogeneity(df) - expect_true(is.infinite(result$coef_heter[1]) || is.na(result$coef_heter[1])) -}) - -test_that("calculate_coef_heterogeneity handles negative coefficients", { - df <- data.frame( - variant_id = "v1", - coef_qr_0.25 = -1, - coef_qr_0.50 = -2, - coef_qr_0.75 = -3, - stringsAsFactors = FALSE - ) - result <- pecotmr:::calculate_coef_heterogeneity(df) - expect_equal(result$coef_heter[1], log(1 / 2), tolerance = 0.01) -}) - -# =========================================================================== -# calculate_xi_correlation -# =========================================================================== - -test_that("calculate_xi_correlation computes xi for valid monotonic data", { - skip_if_not_installed("XICOR") - df <- data.frame(variant_id = "v1") - tau_range <- seq(0.05, 0.95, by = 0.05) - for (tau in tau_range) { - df[[paste0("coef_qr_", tau)]] <- tau * 2 - } - result <- calculate_xi_correlation(df) - expect_true("xi" %in% colnames(result)) - expect_true(is.numeric(result$xi)) -}) - -test_that("calculate_xi_correlation warns on missing columns", { - skip_if_not_installed("XICOR") - df <- data.frame(variant_id = "v1", some_col = 1) - expect_warning(result <- calculate_xi_correlation(df)) - expect_true(is.na(result$xi)) -}) - -test_that("calculate_xi_correlation returns NA for too few valid values", { - skip_if_not_installed("XICOR") - df <- data.frame(variant_id = "v1") - tau_range <- seq(0.1, 0.9, by = 0.05) - for (tau in tau_range) { - df[[paste0("coef_qr_", tau)]] <- NA - } - df$coef_qr_0.1 <- 1 - result <- calculate_xi_correlation(df, min_valid = 10) - expect_true(is.na(result$xi)) -}) - -test_that("calculate_xi_correlation handles multiple variants", { - skip_if_not_installed("XICOR") - df <- data.frame(variant_id = c("v1", "v2")) - tau_range <- seq(0.1, 0.9, by = 0.05) - for (tau in tau_range) { - df[[paste0("coef_qr_", tau)]] <- c(tau * 2, tau * -1) - } - result <- calculate_xi_correlation(df, tau_range = tau_range) - expect_equal(nrow(result), 2) - expect_true("xi" %in% colnames(result)) - expect_true("xi_pval" %in% colnames(result)) -}) - -test_that("calculate_xi_correlation handles error in xicor gracefully", { - skip_if_not_installed("XICOR") - df <- data.frame(variant_id = "v1") - tau_range <- seq(0.1, 0.9, by = 0.05) - for (tau in tau_range) { - df[[paste0("coef_qr_", tau)]] <- 1 - } - result <- calculate_xi_correlation(df, tau_range = tau_range, min_valid = 5) - expect_equal(nrow(result), 1) - expect_true(is.numeric(result$xi) || is.na(result$xi)) -}) - -# =========================================================================== -# qr_screen -# =========================================================================== - -test_that("qr_screen runs and returns results with quantreg", { - skip_if_not_installed("quantreg") - set.seed(42) - n <- 50 - X <- matrix(rnorm(n * 5), nrow = n, ncol = 5) - colnames(X) <- paste0("chr1:", seq(100, 500, by = 100), ":A:G") - Y <- matrix(rnorm(n), ncol = 1) - colnames(Y) <- "pheno1" - result <- qr_screen(X, Y, tau.list = c(0.25, 0.5, 0.75)) - expect_type(result, "list") -}) - -test_that("qr_screen with fdr screen_method works", { - skip_if_not_installed("quantreg") - set.seed(42) - n <- 50; p <- 5 - X <- matrix(rnorm(n * p), nrow = n, ncol = p) - colnames(X) <- paste0("chr1:", seq(100, 500, by = 100), ":A:G") - Y <- matrix(rnorm(n), ncol = 1) - colnames(Y) <- "pheno1" - result <- qr_screen(X, Y, tau.list = c(0.25, 0.5, 0.75), screen_method = "fdr") - expect_type(result, "list") - expect_true("df_result" %in% names(result)) - expect_true("fdr_p_qr" %in% colnames(result$df_result)) -}) - -test_that("qr_screen errors with invalid screen_method", { - skip_if_not_installed("quantreg") - set.seed(42) - n <- 30; p <- 3 - X <- matrix(rnorm(n * p), nrow = n, ncol = p) - colnames(X) <- paste0("chr1:", seq(100, 300, by = 100), ":A:G") - Y <- matrix(rnorm(n), ncol = 1) - colnames(Y) <- "pheno1" - expect_error(qr_screen(X, Y, tau.list = c(0.5), screen_method = "invalid_method"), - "Invalid screen_method") -}) - -test_that("qr_screen with covariates Z", { - skip_if_not_installed("quantreg") - set.seed(42) - n <- 50; p <- 3 - X <- matrix(rnorm(n * p), nrow = n, ncol = p) - colnames(X) <- paste0("chr1:", seq(100, 300, by = 100), ":A:G") - Y <- matrix(rnorm(n), ncol = 1) - colnames(Y) <- "pheno1" - Z <- matrix(rnorm(n * 2), nrow = n, ncol = 2) - result <- qr_screen(X, Y, Z = Z, tau.list = c(0.25, 0.5, 0.75)) - expect_type(result, "list") - expect_equal(nrow(result$df_result), p) -}) - -test_that("qr_screen with single variant", { - skip_if_not_installed("quantreg") - set.seed(42) - n <- 50 - X <- matrix(rnorm(n), nrow = n, ncol = 1) - colnames(X) <- "chr1:100:A:G" - Y <- matrix(rnorm(n), ncol = 1) - colnames(Y) <- "pheno1" - result <- qr_screen(X, Y, tau.list = c(0.25, 0.5, 0.75)) - expect_type(result, "list") - expect_equal(nrow(result$df_result), 1) -}) - -# =========================================================================== -# perform_qr_analysis -# =========================================================================== - -test_that("perform_qr_analysis returns wide-format results", { - skip_if_not_installed("quantreg") - set.seed(42) - n <- 50; p <- 3 - X <- matrix(rnorm(n * p), nrow = n, ncol = p) - colnames(X) <- paste0("chr1:", seq(100, 300, by = 100), ":A:G") - Y <- matrix(rnorm(n), ncol = 1) - colnames(Y) <- "pheno1" - result <- perform_qr_analysis(X, Y, tau_values = c(0.25, 0.5, 0.75)) - expect_true(is.data.frame(result)) - expect_equal(nrow(result), p) - expect_true("coef_qr_0.25" %in% colnames(result)) - expect_true("coef_qr_0.5" %in% colnames(result)) - expect_true("coef_qr_0.75" %in% colnames(result)) - expect_true("chr" %in% colnames(result)) - expect_true("pos" %in% colnames(result)) -}) - -test_that("perform_qr_analysis with covariates Z", { - skip_if_not_installed("quantreg") - set.seed(42) - n <- 50; p <- 2 - X <- matrix(rnorm(n * p), nrow = n, ncol = p) - colnames(X) <- paste0("chr1:", seq(100, 200, by = 100), ":A:G") - Y <- matrix(rnorm(n), ncol = 1) - colnames(Y) <- "pheno1" - Z <- matrix(rnorm(n * 2), nrow = n, ncol = 2) - result <- perform_qr_analysis(X, Y, Z = Z, tau_values = c(0.5)) - expect_equal(nrow(result), p) - expect_true("coef_qr_0.5" %in% colnames(result)) -}) - -test_that("perform_qr_analysis with single variant", { - skip_if_not_installed("quantreg") - set.seed(42) - n <- 50 - X <- matrix(rnorm(n), nrow = n, ncol = 1) - colnames(X) <- "chr1:100:A:G" - Y <- matrix(rnorm(n), ncol = 1) - colnames(Y) <- "pheno1" - result <- perform_qr_analysis(X, Y, tau_values = c(0.25, 0.5)) - expect_equal(nrow(result), 1) -}) - -# =========================================================================== -# multicontext_ld_clumping -# =========================================================================== - -test_that("multicontext_ld_clumping returns empty result when no significant SNPs", { - qr_results <- list(sig.SNPs_names = character(0)) - X <- matrix(rnorm(100), nrow = 20, ncol = 5) - result <- multicontext_ld_clumping(X, qr_results) - expect_null(result$final_SNPs) - expect_null(result$clumped_SNPs) - expect_true(grepl("No significant", result$message)) -}) - -test_that("multicontext_ld_clumping returns early when X has single column", { - X <- matrix(rnorm(20), nrow = 20, ncol = 1) - colnames(X) <- "chr1:100:A:G" - qr_results <- list(sig.SNPs_names = "chr1:100:A:G") - result <- multicontext_ld_clumping(X, qr_results) - expect_equal(result$final_SNPs, "chr1:100:A:G") - expect_true(grepl("Only one SNP", result$message)) -}) - -# =========================================================================== -# quantile_twas_weight_pipeline (mocked) -# =========================================================================== - -test_that("quantile_twas_weight_pipeline returns early when no significant SNPs and screen_significant=TRUE", { - skip_if_not_installed("quantreg") - set.seed(42) - n <- 50; p <- 3 - X <- matrix(rnorm(n * p), nrow = n, ncol = p) - colnames(X) <- paste0("chr1:", seq(100, 300, by = 100), ":A:G") - Y <- matrix(rnorm(n), ncol = 1) - colnames(Y) <- "pheno1" - - local_mocked_bindings( - QUAIL_rank_score_pipeline = function(...) list(rank_score = matrix(rnorm(n), ncol = 1)), - QUAIL_pipeline = function(...) list(vqtl = "mocked"), - qr_screen = function(...) { - list( - df_result = data.frame(variant_id = colnames(X)), - sig_SNP_threshold = integer(0), - sig.SNPs_names = character(0), - pvec = rep(1, p), - quantile_pvalue = matrix(1, nrow = p, ncol = 1), - quantile_zscore = matrix(0, nrow = p, ncol = 1), - tau_list = 0.5 - ) - } - ) - result <- quantile_twas_weight_pipeline( - X, Y, screen_significant = TRUE, - quantile_qtl_tau_list = c(0.5), - quantile_twas_tau_list = c(0.5) - ) - expect_true(grepl("No significant", result$message)) - expect_true("qr_screen_pvalue_df" %in% names(result)) -}) - -test_that("quantile_twas_weight_pipeline returns early when no raw p_qr < 0.05", { - skip_if_not_installed("quantreg") - set.seed(42) - n <- 50; p <- 3 - X <- matrix(rnorm(n * p), nrow = n, ncol = p) - colnames(X) <- paste0("chr1:", seq(100, 300, by = 100), ":A:G") - Y <- matrix(rnorm(n), ncol = 1) - colnames(Y) <- "pheno1" - - local_mocked_bindings( - QUAIL_rank_score_pipeline = function(...) list(rank_score = matrix(rnorm(n), ncol = 1)), - QUAIL_pipeline = function(...) list(vqtl = "mocked"), - qr_screen = function(...) { - pvec <- rep(0.5, p) - names(pvec) <- colnames(X) - list( - df_result = data.frame(variant_id = colnames(X)), - sig_SNP_threshold = 1:p, - sig.SNPs_names = colnames(X), - pvec = pvec, - quantile_pvalue = matrix(0.5, nrow = p, ncol = 1), - quantile_zscore = matrix(0, nrow = p, ncol = 1), - tau_list = 0.5 - ) - }, - perform_qr_analysis = function(...) { - data.frame( - variant_id = colnames(X), - coef_qr_0.5 = rnorm(p), - chr = rep("chr1", p), - pos = seq(100, 300, by = 100), - ref = rep("G", p), - alt = rep("A", p), - phenotype_id = rep("pheno1", p), - stringsAsFactors = FALSE - ) - }, - calculate_coef_heterogeneity = function(...) { - data.frame(variant_id = colnames(X), coef_heter = rnorm(p)) - }, - calculate_xi_correlation = function(...) { - data.frame(variant_id = colnames(X), xi = rnorm(p), xi_pval = runif(p)) - } - ) - result <- quantile_twas_weight_pipeline( - X, Y, screen_significant = FALSE, - quantile_qtl_tau_list = c(0.5), - quantile_twas_tau_list = c(0.5) - ) - # expect_true(grepl("No variants with raw", result$message)) -}) diff --git a/tests/testthat/test_twas.R b/tests/testthat/test_twas.R index 9b6e2fdf..7dff17ef 100644 --- a/tests/testthat/test_twas.R +++ b/tests/testthat/test_twas.R @@ -1651,7 +1651,7 @@ test_that("twas_pipeline: pick_best_model skips context when no model passes thr }) # NOTE: Removed test "twas_pipeline: rsq_cutoff=0 skips best model selection entirely" -# When rsq_cutoff=0 in non-quantile mode, the source code assigns NA model_selection +# When rsq_cutoff=0, the source code assigns NA model_selection # instead of a list structure, causing the is_imputable column to be missing in the # downstream data frame construction. This is a known source code issue. @@ -1904,7 +1904,7 @@ test_that("twas_pipeline: empty twas_variants intersection returns empty data fr # =========================================================================== # NOTE: Removed test "twas_pipeline: missing data_type in twas_weights_data gets assigned NA" -# When rsq_cutoff=0 in non-quantile mode, the source code assigns NA model_selection +# When rsq_cutoff=0, the source code assigns NA model_selection # instead of a list structure, causing the is_imputable column to be missing in the # downstream data frame construction. This is a known source code issue. @@ -2108,73 +2108,6 @@ test_that("twas_pipeline: mr_result is returned in final result", { expect_true("mr_result" %in% names(result)) }) -# =========================================================================== -# twas_pipeline: quantile_twas mode -# =========================================================================== - -test_that("twas_pipeline: quantile_twas=TRUE sets rsq_cutoff to 0 and skips model selection", { - set.seed(501) - variant_ids <- make_variant_ids() - p <- length(variant_ids) - weights_mat <- make_weights_matrix(variant_ids) - - twas_weights_data <- make_twas_weights_data( - molecular_id = "gene1", - contexts = "ctx1" - ) - # Add quantile-specific performance data - twas_weights_data$gene1$twas_cv_performance$ctx1$susie_performance <- data.frame( - quantile_start = 0, quantile_end = 0.25, pseudo_R2_avg = 0.15, - rsq = 0.5, adj_rsq = 0.45, pval = 0.01, adj_rsq_pval = 0.02 - ) - twas_weights_data$gene1$twas_cv_performance$ctx1$lasso_performance <- data.frame( - quantile_start = 0, quantile_end = 0.25, pseudo_R2_avg = 0.12, - rsq = 0.3, adj_rsq = 0.25, pval = 0.05, adj_rsq_pval = 0.06 - ) - - LD_matrix <- diag(p) - rownames(LD_matrix) <- colnames(LD_matrix) <- variant_ids - - mock_twas_data_qced <- list( - gene1 = list( - chrom = 1, - data_type = list(ctx1 = "expression"), - variant_names = list(ctx1 = list(study1 = variant_ids)), - weights_qced = list(ctx1 = list(study1 = list( - scaled_weights = weights_mat, - weights = weights_mat - ))), - gwas_qced = list(study1 = data.frame( - variant_id = variant_ids, - z = rnorm(p), - stringsAsFactors = FALSE - )), - LD = LD_matrix - ) - ) - - local_mocked_bindings( - harmonize_twas = function(...) { - list(twas_data_qced = mock_twas_data_qced, ref_panel = data.frame()) - } - ) - - result <- suppressMessages(twas_pipeline( - twas_weights_data = twas_weights_data, - ld_meta_file_path = "fake_ld.tsv", - gwas_meta_file = "fake_gwas.tsv", - region_block = "chr1_100_500", - ld_reference_sample_size = 17000, - quantile_twas = TRUE - )) - - expect_true(is.list(result)) - # In quantile mode the output should have twas_result - if (!is.null(result$twas_result)) { - expect_true("method" %in% colnames(result$twas_result)) - } -}) - # =========================================================================== # twas_pipeline: nrow(twas_results_table) == 0 returns NULL results # =========================================================================== @@ -2614,72 +2547,6 @@ test_that("twas_pipeline: rsq_pval_option='adj_rsq_pval' uses the correct p-valu # No model should pass -> no imputable genes }) -# =========================================================================== -# twas_pipeline: format_twas_data internal function -- quantile_twas path -# =========================================================================== - -test_that("twas_pipeline: quantile_twas with output_twas_data exercises quantile format_twas_data", { - set.seed(1001) - variant_ids <- make_variant_ids() - p <- length(variant_ids) - weights_mat <- make_weights_matrix(variant_ids) - - twas_weights_data <- make_twas_weights_data( - molecular_id = "gene1", - contexts = "ctx1" - ) - twas_weights_data$gene1$twas_cv_performance$ctx1$susie_performance <- data.frame( - quantile_start = 0, quantile_end = 0.25, pseudo_R2_avg = 0.15, - rsq = 0.5, adj_rsq = 0.45, pval = 0.01, adj_rsq_pval = 0.02 - ) - twas_weights_data$gene1$twas_cv_performance$ctx1$lasso_performance <- data.frame( - quantile_start = 0, quantile_end = 0.25, pseudo_R2_avg = 0.12, - rsq = 0.3, adj_rsq = 0.25, pval = 0.05, adj_rsq_pval = 0.06 - ) - - LD_matrix <- diag(p) - rownames(LD_matrix) <- colnames(LD_matrix) <- variant_ids - - mock_twas_data_qced <- list( - gene1 = list( - chrom = 1, - data_type = list(ctx1 = "expression"), - variant_names = list(ctx1 = list(study1 = variant_ids)), - weights_qced = list(ctx1 = list(study1 = list( - scaled_weights = weights_mat, - weights = weights_mat - ))), - gwas_qced = list(study1 = data.frame( - variant_id = variant_ids, - z = rnorm(p), - stringsAsFactors = FALSE - )), - LD = LD_matrix - ) - ) - - local_mocked_bindings( - harmonize_twas = function(...) { - list(twas_data_qced = mock_twas_data_qced, ref_panel = data.frame()) - } - ) - - result <- suppressMessages(twas_pipeline( - twas_weights_data = twas_weights_data, - ld_meta_file_path = "fake_ld.tsv", - gwas_meta_file = "fake_gwas.tsv", - region_block = "chr1_100_500", - ld_reference_sample_size = 17000, - quantile_twas = TRUE, - output_twas_data = TRUE - )) - - expect_true(is.list(result)) - if (!is.null(result$twas_result)) { - expect_true("method" %in% colnames(result$twas_result)) - } -}) - # =========================================================================== # Tests from test_twas_predict.R # =========================================================================== @@ -3112,75 +2979,6 @@ test_that("twas_pipeline: adj_rsq_pval option exercised in pick_best_model", { } }) -# =========================================================================== -# SECTION Q: twas_pipeline - quantile_twas code paths in Step 2 merge -# =========================================================================== - -test_that("twas_pipeline: quantile_twas=TRUE with proper cv data triggers quantile merge path", { - set.seed(902) - variant_ids <- make_variant_ids() - p <- length(variant_ids) - weights_mat <- make_weights_matrix(variant_ids) - - twas_weights_data <- make_twas_weights_data( - molecular_id = "gene1", - contexts = "ctx1", - methods = c("susie_weights", "lasso_weights") - ) - # Add quantile-specific performance columns - twas_weights_data$gene1$twas_cv_performance$ctx1$susie_performance <- data.frame( - quantile_start = 0, quantile_end = 0.25, pseudo_R2_avg = 0.15, - rsq = 0.5, adj_rsq = 0.45, pval = 0.01, adj_rsq_pval = 0.02 - ) - twas_weights_data$gene1$twas_cv_performance$ctx1$lasso_performance <- data.frame( - quantile_start = 0, quantile_end = 0.25, pseudo_R2_avg = 0.12, - rsq = 0.3, adj_rsq = 0.25, pval = 0.05, adj_rsq_pval = 0.06 - ) - - LD_matrix <- diag(p) - rownames(LD_matrix) <- colnames(LD_matrix) <- variant_ids - - mock_twas_data_qced <- list( - gene1 = list( - chrom = 1, - data_type = list(ctx1 = "expression"), - variant_names = list(ctx1 = list(study1 = variant_ids)), - weights_qced = list(ctx1 = list(study1 = list( - scaled_weights = weights_mat, - weights = weights_mat - ))), - gwas_qced = list(study1 = data.frame( - variant_id = variant_ids, - z = rnorm(p), - stringsAsFactors = FALSE - )), - LD = LD_matrix - ) - ) - - local_mocked_bindings( - harmonize_twas = function(...) { - list(twas_data_qced = mock_twas_data_qced, ref_panel = data.frame()) - } - ) - - result <- suppressMessages(twas_pipeline( - twas_weights_data = twas_weights_data, - ld_meta_file_path = "fake_ld.tsv", - gwas_meta_file = "fake_gwas.tsv", - region_block = "chr1_100_500", - ld_reference_sample_size = 17000, - quantile_twas = TRUE - )) - - expect_true(is.list(result)) - if (!is.null(result$twas_result)) { - # Quantile mode should have quantile_start/quantile_end columns - expect_true("quantile_start" %in% colnames(result$twas_result) || - "method" %in% colnames(result$twas_result)) - } -}) - # =========================================================================== # SECTION R: twas_pipeline - data_type missing from twas_weights_data (line 614) # =========================================================================== From 689add0a050e3110305c1a3b44a115539d1af404 Mon Sep 17 00:00:00 2001 From: Daniel Nachun Date: Fri, 17 Apr 2026 13:51:29 -0700 Subject: [PATCH 2/5] clean up mash_wrapper, TWAS, finemapping reg. regression pipelines --- .Rbuildignore | 1 + .github/recipe/recipe.yaml | 5 +- .gitignore | 1 + DESCRIPTION | 3 +- NAMESPACE | 12 - R/mash_wrapper.R | 439 +++++++++++------------------ R/multigene_udr.R | 88 ------ R/plot.R | 129 --------- R/regularized_regression.R | 95 +++---- R/susie_wrapper.R | 83 +++--- R/twas.R | 153 +++++----- R/twas_weights.R | 115 ++++---- R/univariate_pipeline.R | 89 +++--- man/corr_filter.Rd | 19 -- man/manhattan_plot.Rd | 20 -- man/multigene_udr.Rd | 37 --- man/twas_pipeline.Rd | 1 - man/venn.Rd | 17 -- pixi.toml | 4 +- tests/testthat/test_LD.R | 105 +++++++ tests/testthat/test_file_utils.R | 60 ++++ tests/testthat/test_ld_loader.R | 122 ++++++++ tests/testthat/test_mash_wrapper.R | 34 +-- tests/testthat/test_plot.R | 26 -- 24 files changed, 706 insertions(+), 952 deletions(-) delete mode 100644 R/multigene_udr.R delete mode 100644 R/plot.R delete mode 100644 man/corr_filter.Rd delete mode 100644 man/manhattan_plot.Rd delete mode 100644 man/multigene_udr.Rd delete mode 100644 man/venn.Rd create mode 100644 tests/testthat/test_ld_loader.R delete mode 100644 tests/testthat/test_plot.R diff --git a/.Rbuildignore b/.Rbuildignore index a5c06f4a..b317f075 100644 --- a/.Rbuildignore +++ b/.Rbuildignore @@ -7,3 +7,4 @@ ^pixi.lock ^\.pixi ^.covrignore +^.claude diff --git a/.github/recipe/recipe.yaml b/.github/recipe/recipe.yaml index e279aac0..d93c8751 100644 --- a/.github/recipe/recipe.yaml +++ b/.github/recipe/recipe.yaml @@ -44,6 +44,7 @@ requirements: - r-gbj - r-glmnet - r-harmonicmeanp + - r-igraph - r-l0learn - r-magrittr - r-mashr @@ -54,8 +55,6 @@ requirements: - r-pgenlibr - r-purrr - r-qgg - - r-quadprog - - r-quantreg - r-rcpp - r-rcpparmadillo - r-rcppdpr @@ -68,7 +67,6 @@ requirements: - r-tidyr - r-vctrs - r-vroom - - r-xicor run: - bioconductor-biostrings - bioconductor-iranges @@ -91,6 +89,7 @@ requirements: - r-gbj - r-glmnet - r-harmonicmeanp + - r-igraph - r-l0learn - r-magrittr - r-mashr diff --git a/.gitignore b/.gitignore index 8ec0282b..8f6dda7e 100644 --- a/.gitignore +++ b/.gitignore @@ -10,3 +10,4 @@ pecotmr.Rproj /pixi.lock /.pixi docs +.claude diff --git a/DESCRIPTION b/DESCRIPTION index 5b28239a..b81719cf 100644 --- a/DESCRIPTION +++ b/DESCRIPTION @@ -25,6 +25,7 @@ Imports: dplyr, furrr, future, + igraph, magrittr, matrixStats, purrr, @@ -55,7 +56,6 @@ Suggests: ncvreg, pgenlibr, qgg, - udr, qvalue, rmarkdown, snpStats, @@ -64,7 +64,6 @@ Remotes: stephenslab/fsusieR, stephenslab/mvsusieR, stephenslab/susieR, - stephenslab/udr LinkingTo: Rcpp, RcppArmadillo diff --git a/NAMESPACE b/NAMESPACE index 1dc4b07f..faf1e899 100644 --- a/NAMESPACE +++ b/NAMESPACE @@ -75,7 +75,6 @@ export(mr_format) export(mrash_weights) export(mrmash_weights) export(mrmash_wrapper) -export(multigene_udr) export(multivariate_analysis_pipeline) export(mvsusie_weights) export(normalize_variant_id) @@ -127,11 +126,8 @@ importFrom(IRanges,reduce) importFrom(IRanges,start) importFrom(S4Vectors,queryHits) importFrom(S4Vectors,subjectHits) -importFrom(bigsnpr,snp_clumping) -importFrom(bigstatsr,FBM.code256) importFrom(coloc,coloc.bf_bf) importFrom(doFuture,registerDoFuture) -importFrom(dplyr,"%>%") importFrom(dplyr,across) importFrom(dplyr,all_of) importFrom(dplyr,any_of) @@ -162,7 +158,6 @@ importFrom(future,multicore) importFrom(future,plan) importFrom(magrittr,"%>%") importFrom(matrixStats,colVars) -importFrom(parallel,mclapply) importFrom(purrr,compact) importFrom(purrr,exec) importFrom(purrr,map) @@ -177,20 +172,14 @@ importFrom(readr,cols) importFrom(readr,parse_number) importFrom(readr,read_delim) importFrom(rlang,"!!!") -importFrom(stats,as.dist) importFrom(stats,coef) importFrom(stats,cor) -importFrom(stats,cov) -importFrom(stats,cutree) -importFrom(stats,hclust) importFrom(stats,lm.fit) importFrom(stats,pchisq) importFrom(stats,pnorm) importFrom(stats,predict) -importFrom(stats,rnorm) importFrom(stats,sd) importFrom(stats,setNames) -importFrom(stats,var) importFrom(stringr,str_detect) importFrom(stringr,str_remove) importFrom(stringr,str_replace) @@ -205,7 +194,6 @@ importFrom(susieR,susie_rss) importFrom(susieR,univariate_regression) importFrom(tibble,as_tibble) importFrom(tibble,tibble) -importFrom(tidyr,pivot_wider) importFrom(tidyr,replace_na) importFrom(tidyr,separate) importFrom(tools,file_path_sans_ext) diff --git a/R/mash_wrapper.R b/R/mash_wrapper.R index f31baa1c..55c5c5d2 100644 --- a/R/mash_wrapper.R +++ b/R/mash_wrapper.R @@ -1,3 +1,11 @@ +# Filter rows of a z-score matrix by significance p-value cutoff. +# Returns integer indices of rows where any |z| exceeds the threshold. +# @noRd +filter_by_significance <- function(z_matrix, sig_p_cutoff) { + z_threshold <- sqrt(qchisq(sig_p_cutoff, df = 1, lower.tail = FALSE)) + which(apply(z_matrix, 1, function(row) any(abs(row) >= z_threshold))) +} + #' @importFrom vroom vroom #' @export filter_invalid_summary_stat <- function(dat_list, bhat = NULL, sbhat = NULL, z = NULL, btoz = FALSE, sig_p_cutoff = 1E-6, filter_by_missing_rate = 0.2) { @@ -41,9 +49,7 @@ filter_invalid_summary_stat <- function(dat_list, bhat = NULL, sbhat = NULL, z = } if ("strong.z" %in% names(dat_list)) { if (!is.null(sig_p_cutoff)) { - chi_square_stat <- qchisq(sig_p_cutoff, df = 1, lower.tail = FALSE) - z_score <- sqrt(chi_square_stat) - keep_index <- which(apply(dat_list$strong.z, 1, function(row) any(abs(row) >= z_score))) + keep_index <- filter_by_significance(dat_list$strong.z, sig_p_cutoff) dat_list[["strong.z"]] <- dat_list$strong.z[keep_index, ] dat_list[["strong.b"]] <- dat_list$strong.b[keep_index, ] dat_list[["strong.s"]] <- dat_list$strong.s[keep_index, ] @@ -64,23 +70,15 @@ filter_invalid_summary_stat <- function(dat_list, bhat = NULL, sbhat = NULL, z = } # Process each component if it exists - if (!is.null(dat_list$strong) && !is.null(dat_list$strong$z)) { - dat_list$strong$z <- process_z(dat_list$strong$z) - } - - if (!is.null(dat_list$random) && !is.null(dat_list$random$z)) { - dat_list$random$z <- process_z(dat_list$random$z) - } - - if (!is.null(dat_list$null) && !is.null(dat_list$null$z)) { - dat_list$null$z <- process_z(dat_list$null$z) + for (comp in c("strong", "random", "null")) { + if (!is.null(dat_list[[comp]]) && !is.null(dat_list[[comp]]$z)) { + dat_list[[comp]]$z <- process_z(dat_list[[comp]]$z) + } } # Apply significance cutoff to strong signals if applicable if (!is.null(dat_list$strong) && !is.null(dat_list$strong$z) && !is.null(sig_p_cutoff)) { - chi_square_stat <- qchisq(sig_p_cutoff, df = 1, lower.tail = FALSE) - z_score <- sqrt(chi_square_stat) - keep_index <- which(apply(dat_list$strong$z, 1, function(row) any(abs(row) >= z_score))) + keep_index <- filter_by_significance(dat_list$strong$z, sig_p_cutoff) dat_list$strong$z <- dat_list$strong$z[keep_index, , drop = FALSE] } } @@ -106,20 +104,13 @@ filter_mixture_components <- function(conditions_to_keep, U, w = NULL, w_cutoff }, conditions_to_keep) # Remove matrices where all values are zero or weight is below cutoff - for (mat_name in names(U)) { - if (all(U[[mat_name]] == 0)) { - U[[mat_name]] <- NULL - if (!is.null(w)) { - w <- w[!names(w) %in% mat_name] - } - next - } - if (!is.null(w)) { - if (w[mat_name] < w_cutoff) { - w <- w[!names(w) %in% mat_name] - U[[mat_name]] <- NULL - } - } + keep_names <- names(purrr::keep(U, function(mat) !all(mat == 0))) + if (!is.null(w)) { + keep_names <- intersect(keep_names, names(w[w >= w_cutoff])) + } + U <- U[keep_names] + if (!is.null(w)) { + w <- w[keep_names] } # Note: Matrices in U may contain very small values on the diagonal @@ -329,169 +320,103 @@ merge_susie_cs <- function(susie_fit, coverage = "cs_coverage_0.95", complementa } return(overlap_sets) } - # Merge overlapping credible sets and update the credible sets in - # variants_list + # Merge overlapping credible sets using graph connected components merge_and_update_overlap_sets <- function(variants_sets_and_pips_list, overlap_sets) { - # Combine and identify unique combined sets - combined_sets <- unique(unlist(lapply(overlap_sets, function(x) { - paste(sort(x), - collapse = "," - ) - }))) - unique_combined_sets <- unique(combined_sets) - - # Split each combined set into individual sets - split_sets <- lapply(unique_combined_sets, function(x) strsplit(x, ",")[[1]]) - - # Identify and merge overlapping credible sets - if (length(split_sets) != 1) { - for (i in 1:(length(split_sets) - 1)) { - for (j in (i + 1):length(split_sets)) { - if (!is.null(split_sets[[i]]) && !is.null(split_sets[[j]])) { - # Check both sets exist - if (length(intersect(split_sets[[i]], split_sets[[j]])) > 0) { - # Merge overlapping sets Update both i-th and j-th elements with - # the merged set - split_sets[[i]] <- unique(c(split_sets[[i]], split_sets[[j]])) - split_sets[[j]] <- unique(c(split_sets[[i]], split_sets[[j]])) - } - } - } - } - } - # Eliminate duplicates from the list of sets Convert each set into a string - # to facilitate comparison - set_strings <- sapply(split_sets, function(set) paste(sort(set), collapse = ",")) - # Identify unique sets based on their string representation - unique_set_strings <- unique(set_strings) - # Retain only the largest combined sets - final_combined_sets <- character() - for (set in unique_set_strings) { - if (length(unique_set_strings) == 1) { - final_combined_sets <- unique_set_strings[[1]] - } else { - included_in_other_set <- FALSE - set_elements <- unlist(strsplit(set, ",")) - for (other_set in unique_set_strings) { - if (set != other_set && all(set_elements %in% unlist(strsplit( - other_set, - "," - )))) { - included_in_other_set <- TRUE - break - } - } - if (!included_in_other_set) { - final_combined_sets <- c(final_combined_sets, set) - } - } - } + # Collect all unique set names from overlap_sets + all_sets <- unique(unlist(overlap_sets)) + if (length(all_sets) == 0) return(list()) + + # Build edges: each overlap_sets entry links its sets together + edges <- unlist(lapply(overlap_sets, function(sets) { + if (length(sets) < 2) return(character(0)) + pairs <- combn(sets, 2) + as.character(pairs) + })) - # Create a mapping from original set names to combined set names + g <- igraph::make_graph(edges, directed = FALSE) + comp <- igraph::components(g) + + # Build mapping: each set name -> comma-separated merged component label set_name_map <- list() - for (combined_set in final_combined_sets) { - original_sets <- unlist(strsplit(combined_set, ",")) - for (set in original_sets) { - set_name_map[[set]] <- combined_set + for (i in seq_len(comp$no)) { + members <- names(comp$membership[comp$membership == i]) + label <- paste(sort(members), collapse = ",") + for (s in members) { + set_name_map[[s]] <- label } } - # Update the credible_set_names in variants_sets_and_pips_list for each - # variant_id - updated_credible_sets <- list() - for (variant_id in names(variants_sets_and_pips_list)) { - current_sets <- variants_sets_and_pips_list[[variant_id]][["sets"]] # All credible sets for the current variant_id - combined_set_found <- FALSE - # Check if any of the current variant_id's credible sets exist in the - # set_name_map - for (set_name in current_sets) { - if (set_name %in% names(set_name_map)) { - # If at least one credible set corresponds to a combined credible - # set, update accordingly - updated_credible_sets[[variant_id]] <- set_name_map[[set_name]] - combined_set_found <- TRUE - break # Exit the loop once the combined credible set is found + # Update each variant's credible set names + updated_credible_sets <- lapply( + stats::setNames(names(variants_sets_and_pips_list), names(variants_sets_and_pips_list)), + function(variant_id) { + current_sets <- variants_sets_and_pips_list[[variant_id]][["sets"]] + mapped <- intersect(current_sets, names(set_name_map)) + if (length(mapped) > 0) { + set_name_map[[mapped[1]]] + } else { + paste(sort(unique(current_sets)), collapse = ",") } } - # If no combined credible set is found, keep the original credible sets - # unchanged - if (!combined_set_found) { - updated_credible_sets[[variant_id]] <- current_sets - } - } + ) return(updated_credible_sets) } # Loop through each condition and their credible sets extract_top_loci <- function(susie_fit, complementary, coverage) { - results <- list() - for (i in seq_along(names(susie_fit[[1]]))) { - if (!is.null(susie_fit[[1]][[i]][["top_loci"]]) && nrow(susie_fit[[1]][[i]][["top_loci"]]) != - 0) { - if (!complementary) { - set_num <- unique(get_nested_element(susie_fit[[1]][[i]], c( - "top_loci", - coverage - ))) - set_num <- set_num[set_num != 0] - } else { - set_num <- 0 - } - num_cs <- length(set_num) - if (num_cs > 0) { - for (j in 1:num_cs) { - variants_df <- get_nested_element(susie_fit[[1]][[i]], c("top_loci")) %>% - filter(!!sym(coverage) == set_num[j]) %>% - select(variant_id, pip) - # Iterate through the rows of the variants_df - if (dim(variants_df)[1] != 0) { - for (row in 1:nrow(variants_df)) { - variant_id <- variants_df$variant_id[row] - variant_pip <- variants_df$pip[row] - - # Prepare the set name - set_name <- paste0("cs_", i, "_", set_num[j]) - - # If the variant_id is not in the results list, add it with the - # current set_name and pip - if (!variant_id %in% names(results)) { - results[[variant_id]] <- list(sets = set_name, pips = variant_pip) - } else { - # If the variant_id is already in the results, append the current - # set_name and pip - results[[variant_id]]$sets <- c( - results[[variant_id]]$sets, - set_name - ) - results[[variant_id]]$pips <- c( - results[[variant_id]]$pips, - variant_pip - ) - } - } - } - } - } + # Build a flat data frame of (variant_id, pip, set_name) across all conditions + cond_names <- names(susie_fit[[1]]) + rows <- purrr::map_dfr(seq_along(cond_names), function(i) { + cond_data <- susie_fit[[1]][[i]] + top_loci <- cond_data[["top_loci"]] + if (is.null(top_loci) || nrow(top_loci) == 0) return(NULL) + + if (!complementary) { + set_num <- unique(get_nested_element(cond_data, c("top_loci", coverage))) + set_num <- set_num[set_num != 0] + } else { + set_num <- 0 } - } - return(results) + if (length(set_num) == 0) return(NULL) + + purrr::map_dfr(set_num, function(sn) { + top_loci %>% + filter(!!sym(coverage) == sn) %>% + select(variant_id, pip) %>% + mutate(set_name = paste0("cs_", i, "_", sn)) + }) + }) + + if (is.null(rows) || nrow(rows) == 0) return(list()) + + # Aggregate by variant_id preserving first-seen order + seen_order <- unique(rows$variant_id) + split_rows <- split(rows, factor(rows$variant_id, levels = seen_order)) + lapply(split_rows, function(df) { + list(sets = df$set_name, pips = df$pip) + }) } combine_top_loci <- function(extracted_result) { + # Compute overlap sets once, outside the per-variant loop + overlap_sets <- identify_overlap_sets(extracted_result) + has_overlaps <- length(overlap_sets) != 0 + merged_sets <- if (has_overlaps) { + merge_and_update_overlap_sets(extracted_result, overlap_sets = overlap_sets) + } else { + NULL + } + top_loci_df <- do.call(rbind, lapply(names(extracted_result), function(variant_id) { max_pip <- max(unlist(extracted_result[[variant_id]]$pips)) median_pip <- median(unlist(extracted_result[[variant_id]]$pips)) - if (length(identify_overlap_sets(extracted_result)) != 0) { - credible_set_names <- merge_and_update_overlap_sets(extracted_result, - overlap_sets = identify_overlap_sets(extracted_result) - )[[variant_id]] + credible_set_names <- if (has_overlaps) { + merged_sets[[variant_id]] } else { - credible_set_names <- paste(sort(unique(unlist(extracted_result[[variant_id]]$sets))), - collapse = "," - ) + paste(sort(unique(unlist(extracted_result[[variant_id]]$sets))), collapse = ",") } data.frame( variant_id = variant_id, credible_set_names = credible_set_names, - max_pip = max_pip, median_pip = median_pip, stringsAsFactors = FALSE # Avoid factors for strings + max_pip = max_pip, median_pip = median_pip, stringsAsFactors = FALSE ) })) return(top_loci_df) @@ -696,19 +621,12 @@ mash_rand_null_sample <- function(dat, n_random, n_null, exclude_condition, seed } if (length(exclude_condition) > 0) { - if ("z" %in% names(dat)) { - if (all(exclude_condition %in% colnames(dat$z))) { - dat$z <- dat$z[, -exclude_condition, drop = FALSE] - } else { - stop(paste("Error: exclude_condition are not present in", dat$region)) - } - } else { - if (all(exclude_condition %in% colnames(dat$bhat))) { - dat$bhat <- dat$bhat[, -exclude_condition, drop = FALSE] - dat$sbhat <- dat$sbhat[, -exclude_condition, drop = FALSE] - } else { - stop(paste("Error: exclude_condition are not present in", dat$region)) - } + cols_to_check <- if ("z" %in% names(dat)) "z" else "bhat" + if (!all(exclude_condition %in% colnames(dat[[cols_to_check]]))) { + stop(paste("Error: exclude_condition are not present in", dat$region)) + } + for (key in intersect(names(dat), c("z", "bhat", "sbhat"))) { + dat[[key]] <- dat[[key]][, -exclude_condition, drop = FALSE] } } @@ -718,59 +636,30 @@ mash_rand_null_sample <- function(dat, n_random, n_null, exclude_condition, seed #' @export merge_mash_data <- function(res_data, one_data) { - combined_data <- list() - if (length(res_data) == 0 | is.null(res_data)) { - return(one_data) - } else if (length(one_data) == 0 | is.null(one_data)) { - return(res_data) - } else { - for (d in names(one_data)) { - if (length(one_data[[d]]) == 0 | is.null(one_data[[d]])) { - combined_data[[d]] <- res_data[[d]] # Keep res_data[[d]] when one_data[[d]] is NULL or empty - next - } else { - # Check if the res_data is NULL - if (!is.null(res_data[[d]]) | length(res_data[[d]]) != 0) { - # Check if the number of columns matches - if (!identical(colnames(res_data[[d]]), colnames(one_data[[d]]))) { - # Get all column names from both data frames - all_cols <- union(colnames(res_data[[d]]), colnames(one_data[[d]])) - - # Align res[[d]] - res_aligned <- setNames(as.data.frame(matrix(NaN, - nrow = nrow(res_data[[d]]), - ncol = length(all_cols) - )), all_cols) - rownames(res_aligned) <- rownames(res_data[[d]]) - common_cols_res <- intersect(colnames(res_data[[d]]), all_cols) - res_aligned[common_cols_res] <- res_data[[d]][common_cols_res] - - # Align one_data[[d]] - one_data_aligned <- setNames(as.data.frame(matrix(NaN, - nrow = nrow(one_data[[d]]), - ncol = length(all_cols) - )), all_cols) - rownames(one_data_aligned) <- rownames(one_data[[d]]) - common_cols_one_data <- intersect(colnames(one_data[[d]]), all_cols) - one_data_aligned[common_cols_one_data] <- one_data[[d]][common_cols_one_data] - - # Now both have the same columns, we can rbind them - combined_data[[d]] <- rbind(res_aligned, one_data_aligned) - } else { - # If they already have the same number of columns, just rbind - combined_data[[d]] <- rbind(as.data.frame(res_data[[d]]), as.data.frame(one_data[[d]])) - } - } else { - combined_data[[d]] <- one_data[[d]] - } - } - } - return(combined_data) - } + if (length(res_data) == 0 || is.null(res_data)) return(one_data) + if (length(one_data) == 0 || is.null(one_data)) return(res_data) + + combined_data <- lapply(names(one_data), function(d) { + od <- one_data[[d]] + rd <- res_data[[d]] + if (length(od) == 0 || is.null(od)) return(rd) + if (is.null(rd) || length(rd) == 0) return(od) + + # bind_rows auto-aligns columns, filling missing with NA; replace with NaN + rn_res <- rownames(as.data.frame(rd)) + rn_one <- rownames(as.data.frame(od)) + combined <- dplyr::bind_rows(as.data.frame(rd), as.data.frame(od)) + combined[is.na(combined)] <- NaN + rn_all <- make.names(c(rn_res, rn_one), unique = TRUE) + rownames(combined) <- rn_all + combined + }) + names(combined_data) <- names(one_data) + return(combined_data) } #' @export -mash_pipeline <- function(mash_input, alpha, residual_correlation = NULL, unconstrained.update = "ted", set_seed = 999) { +mash_pipeline <- function(mash_input, alpha, residual_correlation = NULL, n_pcs = NULL, set_seed = 999) { if (!requireNamespace("mashr", quietly = TRUE)) { stop("To use this function, please install mashr: https://cran.r-project.org/web/packages/mashr/index.html") } @@ -792,35 +681,30 @@ mash_pipeline <- function(mash_input, alpha, residual_correlation = NULL, uncons )) } - # mash data Fit mixture model using udr package mash_data <- mashr::mash_set_data(mash_input$strong.b, Shat = mash_input$strong.s, V = vhat, alpha, zero_Bhat_Shat_reset = 1000 ) - # Canonical matrices - U.can <- mashr::cov_canonical(mash_data) - # Penalty strength - lambda <- ncol(mash_input$strong.z) - # FIXME: Please change this to use flashier + ED instead of UDR - if (!requireNamespace("udr", quietly = TRUE)) { - stop("Package 'udr' is required. Install with: devtools::install_github('stephenslab/udr')") + # Canonical covariance matrices + U.can <- mashr::cov_canonical(mash_data) + # PCA-based covariance matrices + if (is.null(n_pcs)) { + n_pcs <- ncol(mash_data$Bhat) - 1 } - # Initialize udr - fit0 <- udr::ud_init(mash_data, n_unconstrained = 50, U_scaled = U.can) - # Fit udr and use penalty as default as suggested by Yunqi penalty is - # necessary in small sample size case, and there won't be a difference in - # large sample size - fit2 <- udr::ud_fit(fit0, control = list(unconstrained.update, - scaled.update = "fa", - resid.update = "none", lambda = lambda, penalty.type = "iw", maxiter = 1000, - tol = 0.01, tol.lik = 0.01 - )) - - # extract data-driven covariance from udr model. (A list of covariance - # matrices) - U.ud <- lapply(fit2$U, function(e) e[["mat"]]) - return(mixture_prior = list(U = U.ud, w = fit2$w, loglik = fit2$loglik)) + U.pca <- mashr::cov_pca(mash_data, npc = n_pcs) + # Flash-based covariance matrices (factor analysis) + U.flash <- mashr::cov_flash(mash_data) + # ED-based covariance matrices (initialized from all others) + U.ed <- mashr::cov_ed(mash_data, Ulist_init = c(U.can, U.pca, U.flash)) + # Combine all covariance matrices + U.all <- c(U.can, U.pca, U.flash, U.ed) + + # Fit mash to estimate mixture weights + m <- mashr::mash(mash_data, Ulist = U.all, outputlevel = 1) + w <- mashr::get_estimated_pi(m) + + return(list(U = U.all, w = w)) } #' Merge a List of Matrices or Data Frames with Optional Allele Flipping @@ -1003,19 +887,17 @@ load_multicontext_sumstats <- function(dat_list, signal_df, cond, region, extrac # Match context to tag df <- df %>% - rowwise() %>% - mutate(context_classify = { - if (is.null(tag_patterns) || length(tag_patterns) == 0) { - context - } else { - matched <- names(tag_patterns)[str_detect(context, tag_patterns)] - if (length(matched) == 0) NA_character_ else matched[1] - } - }) %>% - ungroup() + mutate(context_classify = if (is.null(tag_patterns) || length(tag_patterns) == 0) { + context + } else { + purrr::map_chr(context, function(ctx) { + matched <- names(tag_patterns)[str_detect(ctx, tag_patterns)] + if (length(matched) == 0) NA_character_ else matched[1] + }) + }) numeric_col <- colnames(df)[2] - + if (extract_inf == "z"){ # Make a copy to store added rows added_df <- data.frame() @@ -1059,7 +941,7 @@ load_multicontext_sumstats <- function(dat_list, signal_df, cond, region, extrac rename(!!numeric_col := !!sym(numeric_col)) event_ID_extracted <- sumstats_df[[extract_inf]]%>%pull(context) } else if (is.null(event_ID_extracted)){ - print("Please provide 'z-score'") + warning("Please provide 'z-score'") } else { sumstats_df[[extract_inf]] <- df %>% filter(context%in%event_ID_extracted)%>% rename(!!numeric_col := !!sym(numeric_col)) @@ -1073,7 +955,6 @@ load_multicontext_sumstats <- function(dat_list, signal_df, cond, region, extrac gene_ID = region ) %>% select(variant_ID, gene_ID, everything()) - print(result_df) result_list_format[[cond]][[extract_inf]] <- result_list_format[[cond]][[extract_inf]]%>% rows_update(result_df, by = c("variant_ID", "gene_ID")) } } @@ -1109,16 +990,14 @@ load_multicontext_sumstats <- function(dat_list, signal_df, cond, region, extrac # Match context to tag df <- df %>% - rowwise() %>% - mutate(context_classify = { - if (is.null(tag_patterns) || length(tag_patterns) == 0) { - context - } else { - matched <- names(tag_patterns)[str_detect(context, tag_patterns)] - if (length(matched) == 0) NA_character_ else matched[1] - } - }) %>% - ungroup() + mutate(context_classify = if (is.null(tag_patterns) || length(tag_patterns) == 0) { + context + } else { + purrr::map_chr(context, function(ctx) { + matched <- names(tag_patterns)[str_detect(ctx, tag_patterns)] + if (length(matched) == 0) NA_character_ else matched[1] + }) + }) numeric_col <- colnames(df)[2] if (extract_inf == "z"){ @@ -1155,7 +1034,7 @@ load_multicontext_sumstats <- function(dat_list, signal_df, cond, region, extrac } event_ID_extracted[[k]] <- sumstats_df[[extract_inf]]%>%pull(context) } else if (is.null(event_ID_extracted)){ - print("Please provide 'z-score'") + warning("Please provide 'z-score'") } else { sumstats_df[[extract_inf]] <- df %>% filter(context%in%event_ID_extracted[[k]])%>% rename(!!numeric_col := !!sym(numeric_col)) diff --git a/R/multigene_udr.R b/R/multigene_udr.R deleted file mode 100644 index 197fe8be..00000000 --- a/R/multigene_udr.R +++ /dev/null @@ -1,88 +0,0 @@ -#' Perform udr Analysis on Multigene Data -#' -#' @param combined_susie_list A list containing the combined SuSiE and summary statistics results. -#' @param coverage A numeric vector representing the coverage values. -#' @param independent_variant_list A list of independent variants to be used as a filter. -#' @param n_random An integer specifying the number of random samples to generate. -#' @param n_null An integer specifying the number of null samples to generate. -#' @param seed An integer specifying the seed for random number generation. -#' @param exclude_condition A character vector specifying conditions to be excluded from the analysis. Defaults to NULL. -#' -#' @return A list containing the results of the prior, or NULL if conditions are not met. -#' @importFrom dplyr filter mutate group_by -#' @export -multigene_udr <- function(combined_susie_list, coverage, independent_variant_list, n_random, n_null, seed, exclude_condition = NULL) { - # Default to an empty vector if exclude_condition is NULL - if (is.null(exclude_condition)) { - exclude_condition <- c() - } - reformat_data <- function(dat) { - res <- list( - strong.b = dat$strong$bhat, - random.b = dat$random$bhat, - null.b = dat$null$bhat, - strong.s = dat$strong$sbhat, - null.s = dat$null$sbhat, - random.s = dat$random$sbhat - ) - return(res) - } - # Load strong and random null summary statistics - strong_file <- load_multitrait_R_sumstat( - combined_susie_list$extracted_regional_window_combined_susie_result, - combined_susie_list$extracted_regional_window_combined_sumstats_result, - coverage, - top_loci = TRUE, - exclude_condition = exclude_condition - ) - - ran_null_file <- load_multitrait_R_sumstat( - combined_susie_list$extracted_regional_window_combined_susie_result, - combined_susie_list$extracted_regional_window_combined_sumstats_result, - filter_file = independent_variant_list, - exclude_condition = exclude_condition - ) - - # Generate random null samples - ran_null <- mash_rand_null_sample( - ran_null_file, - n_random = n_random, - n_null = n_null, - exclude_condition = exclude_condition, - seed = seed - ) - - - # Prepare the strong summary statistics - strong <- list(strong = list(bhat = strong_file$bhat, sbhat = strong_file$sbhat)) - - # Combine strong and random null samples - res <- c(strong, ran_null) - - # Reformat data for MASH analysis - mash_input <- reformat_data(res) - - # Filter invalid summary statistics for each condition - conditions <- c("strong", "random", "null") - for (cond in conditions) { - mash_input <- filter_invalid_summary_stat( - mash_input, - bhat = paste0(cond, ".b"), - sbhat = paste0(cond, ".s"), - btoz = TRUE, - filter_by_missing_rate = NULL, - sig_p_cutoff = NULL - ) - } - - # Calculate ZtZ matrix - mash_input$ZtZ <- t(as.matrix(mash_input$strong.z)) %*% as.matrix(mash_input$strong.z) / nrow(mash_input$strong.z) - - # Perform MASH analysis if conditions are met - dd_prior <- if (nrow(mash_input$strong.b) < 2 || ncol(mash_input$strong.b) < 2) { - NULL - } else { - mash_pipeline(mash_input, alpha = 1) - } - return(dd_prior) -} diff --git a/R/plot.R b/R/plot.R deleted file mode 100644 index 54dd3465..00000000 --- a/R/plot.R +++ /dev/null @@ -1,129 +0,0 @@ -#' Venn Diagram -#' @param data a list with the twas siginificant gene_id results of four method "SuSiE","Lasso","Enet" and "MR.ASH" -#' @return plot object -# @importFrom ggvenn ggvenn -# @export -venn <- function(data) { - venn_plot <- ggvenn(data, c("SuSiE", "Lasso", "Enet", "MR.ASH"), show_percentage = TRUE, fill_color = c("red", "orange", "blue", "green")) - return(venn_plot) -} - - -#' Manhattan plot -#' @param twas_results a data frame of twas results with columns "gene_name", "gene_id","chr","susie_pval","lasso_pval","enet_pval" and "mrash_pval", where twas results are the output of the twas_scan function. "gene_name" is the ensemble ID and "gene_id" is the corresponding gene name, -#' "susie_pval", "lasso_pval","enet_pval" and "mrash_pval" are the pvalues of susie and other three competing twas method. -#' @param gene_data a data frame with columns "chr", "start", "end", and "ID", "chr" is the chromosome of gene, "start" and "end" are the position, "ID" is the gene name. -#' @return plot object -# @import ggplot2 -# @importFrom ggrepel geom_label_repel -# @importFrom stringr str_sub -# @importFrom ggnewscale new_scale_color -# @export -manhattan_plot <- function(twas_results, gene_data) { - min_pval <- apply(twas_results[, c("susie_pval", "lasso_pval", "enet_pval", "mrash_pval")], 1, function(x) min(x, na.rm = TRUE)) - data_all_gene <- twas_results %>% - select(gene_name, chr, gene_id, susie_pval, lasso_pval, enet_pval, mrash_pval) %>% - mutate(min_pval = min_pval) %>% - mutate(chr = as.numeric(chr)) - gene_pos <- gene_data %>% - mutate(chr = as.numeric(str_sub(`#chr`, 4))) %>% - select(-`#chr`) %>% - setNames(c("start_bp", "end_bp", "gene_name", "chr")) - gene_pos_pval <- merge(data_all_gene, gene_pos, by = c("gene_name", "chr")) - - susie_select <- Mic_genes$gene_pq_adj %>% - filter(susie_pval < (2.5 * 10^(-6) / 4)) %>% - select(gene_id) %>% - t() %>% - as.vector() - lasso_select <- Mic_genes$gene_pq_adj %>% - filter(lasso_pval < (2.5 * 10^(-6) / 4)) %>% - select(gene_id) %>% - t() %>% - as.vector() - enet_select <- Mic_genes$gene_pq_adj %>% - filter(enet_pval < (2.5 * 10^(-6) / 4)) %>% - select(gene_id) %>% - t() %>% - as.vector() - ash_select <- Mic_genes$gene_pq_adj %>% - filter(mrash_pval < (2.5 * 10^(-6) / 4)) %>% - select(gene_id) %>% - t() %>% - as.vector() - gene_annotate <- unique(c(susie_select, lasso_select, enet_select, ash_select)) - - results <- NULL - # Define the methods to compare - methods <- list( - susie_select, lasso_select, enet_select, ash_select - ) - - # Generate all possible combinations of two methods - combinations <- combn(methods, 2, simplify = FALSE) - - # Loop through combinations and store the intersections - for (i in seq_along(combinations)) { - results[[i]] <- intersect(combinations[[i]][[1]], combinations[[i]][[2]]) - } - - # Combine all unique intersections - two_more <- unique(unlist(results)) - - don <- gene_pos_pval %>% - # Compute chromosome size - group_by(chr) %>% - summarise(chr_len = max((start_bp + end_bp) / 2)) %>% - # Calculate cumulative position of each chromosome - mutate(tot = cumsum(chr_len) - chr_len) %>% - select(-chr_len) %>% - # Add this info to the initial dataset - left_join(gene_pos_pval, ., by = c("chr" = "chr")) %>% - # Add a cumulative position of each SNP - arrange(chr, (start_bp + end_bp) / 2) %>% - mutate(BPcum = (start_bp + end_bp) / 2 + tot) %>% - mutate(susie_highlight = ifelse(gene_id %in% susie_select, "yes", "no")) %>% - mutate(lasso_highlight = ifelse(gene_id %in% lasso_select, "yes", "no")) %>% - mutate(enet_highlight = ifelse(gene_id %in% enet_select, "yes", "no")) %>% - mutate(ash_highlight = ifelse(gene_id %in% ash_select, "yes", "no")) %>% - mutate(two_highlight = ifelse(gene_id %in% two_more, "yes", "no")) %>% - mutate(gene_annotate = ifelse(gene_id %in% gene_annotate, "yes", "no")) - - axisdf <- don %>% - group_by(chr) %>% - summarize(center = (max(BPcum) + min(BPcum)) / 2) - - manhattan <- ggplot(don, aes(x = BPcum, y = -log10(min_pval))) + - - # Show all points - geom_point(aes(color = as.factor(chr), show.legend = FALSE), alpha = 0.8, size = 5) + - scale_color_manual(values = rep(c("#b6b6b6", "#9f9f9f"), 22), guide = FALSE) + - # custom X axis: - scale_x_continuous(label = axisdf$chr, breaks = axisdf$center) + - scale_y_continuous(expand = c(0, 0)) + # remove space between plot area and x axis - ylim(0, 90) + - new_scale_color() + - geom_point(data = subset(don, susie_highlight == "yes"), aes(color = "red"), size = 5) + - geom_point(data = subset(don, lasso_highlight == "yes"), aes(color = "orange"), size = 5) + - geom_point(data = subset(don, enet_highlight == "yes"), aes(color = "green"), size = 5) + - geom_point(data = subset(don, ash_highlight == "yes"), aes(color = "blue"), size = 5) + - geom_point(data = subset(don, two_highlight == "yes"), aes(color = "black"), size = 5) + - geom_label_repel(data = subset(don, gene_annotate == "yes"), aes(label = gene_id), size = 5, max.overlaps = Inf, color = "black", alpha = 0.75) + - scale_color_manual(values = c("red", "orange", "green", "blue", "black"), breaks = c("red", "orange", "green", "blue", "black"), name = "Methods", labels = c("SuSiE", "Lasso", "Enet", "MR.ASH", ">=2 methods")) + - xlab("Chromosome") + - ylab("-log10(pval)") + - geom_hline(yintercept = -log10(2.5 * 10^(-6) / 4), linetype = "dashed", color = "red") + - theme_bw() + - theme( - plot.title = element_text(size = 45), - legend.title = element_text(face = "bold", size = 45), - legend.text = element_text(size = 30), - axis.text = element_text(size = 20), - axis.title = element_text(size = 45), - legend.position = "top", - panel.border = element_blank(), - panel.grid.major.x = element_blank(), - panel.grid.minor.x = element_blank() - ) - return(manhattan) -} diff --git a/R/regularized_regression.R b/R/regularized_regression.R index 83a4983a..2eda33a9 100644 --- a/R/regularized_regression.R +++ b/R/regularized_regression.R @@ -247,79 +247,51 @@ sdpr_weights <- function(stat, LD, ...) { return(model$beta_est) } +# Shared helper for susie/susie_ash/susie_inf weight extraction. +# @param fit A susie fit object (or NULL to fit from X, y). +# @param X Genotype matrix (optional). +# @param y Phenotype vector (optional). +# @param required_fields Fields that must be present in the fit to extract weights. +# @param fit_args Extra arguments passed to susie_wrapper when fit is NULL. +# @param ... Additional arguments forwarded to susie_wrapper. #' @importFrom susieR coef.susie -#' @export -susie_weights <- function(X = NULL, y = NULL, susie_fit = NULL, ...) { - if (is.null(susie_fit)) { - # get susie_fit object - susie_fit <- susie_wrapper(X, y, ...) +#' @noRd +.susie_extract_weights <- function(fit, X, y, required_fields, fit_args = list(), ...) { + if (is.null(fit)) { + fit <- do.call(susie_wrapper, c(list(X = X, y = y), fit_args, list(...))) } - if (!is.null(X)) { - if (length(susie_fit$pip) != ncol(X)) { - stop(paste0( - "Dimension mismatch on number of variant in susie_fit ", length(susie_fit$pip), - " and TWAS weights ", ncol(X), ". " - )) - } + if (!is.null(X) && length(fit$pip) != ncol(X)) { + stop(paste0( + "Dimension mismatch on number of variant in susie fit ", length(fit$pip), + " and TWAS weights ", ncol(X), ". " + )) } - if ("alpha" %in% names(susie_fit) && "mu" %in% names(susie_fit) && "X_column_scale_factors" %in% names(susie_fit)) { - # This is designed to cope with output from pecotmr::susie_post_processor() - # We set intercept to 0 and later trim it off anyways - susie_fit$intercept <- 0 - return(coef.susie(susie_fit)[-1]) + if (all(required_fields %in% names(fit))) { + fit$intercept <- 0 + return(coef.susie(fit)[-1]) } else { - return(rep(0, length(susie_fit$pip))) + return(rep(0, length(fit$pip))) } } -#' @importFrom susieR coef.susie +#' @export +susie_weights <- function(X = NULL, y = NULL, susie_fit = NULL, ...) { + .susie_extract_weights(susie_fit, X, y, + required_fields = c("alpha", "mu", "X_column_scale_factors"), ...) +} + #' @export susie_ash_weights <- function(X = NULL, y = NULL, susie_ash_fit = NULL, ...) { - if (is.null(susie_ash_fit)) { - # get susie_ash_fit object - susie_ash_fit <- susie_wrapper(X, y, unmappable_effects = "ash", convergence_method = "pip", ...) - } - if (!is.null(X)) { - if (length(susie_ash_fit$pip) != ncol(X)) { - stop(paste0( - "Dimension mismatch on number of variant in susie_ash_fit ", length(susie_ash_fit$pip), - " and TWAS weights ", ncol(X), ". " - )) - } - } - if ("alpha" %in% names(susie_ash_fit) && "mu" %in% names(susie_ash_fit) && "theta" %in% names(susie_ash_fit) && "X_column_scale_factors" %in% names(susie_ash_fit)) { - # This is designed to cope with output from pecotmr::susie_post_processor() - # We set intercept to 0 and later trim it off anyways - susie_ash_fit$intercept <- 0 - return(coef.susie(susie_ash_fit)[-1]) - } else { - return(rep(0, length(susie_ash_fit$pip))) - } + .susie_extract_weights(susie_ash_fit, X, y, + required_fields = c("alpha", "mu", "theta", "X_column_scale_factors"), + fit_args = list(unmappable_effects = "ash", convergence_method = "pip"), ...) } -#' @importFrom susieR coef.susie #' @export susie_inf_weights <- function(X = NULL, y = NULL, susie_inf_fit = NULL, ...) { - if (is.null(susie_inf_fit)) { - # get susie_inf_fit object - susie_inf_fit <- susie_wrapper(X, y, unmappable_effects = "inf", convergence_method = "pip", ...) - } - if (!is.null(X)) { - if (length(susie_inf_fit$pip) != ncol(X)) { - stop(paste0( - "Dimension mismatch on number of variant in susie_inf_fit ", length(susie_inf_fit$pip), - " and TWAS weights ", ncol(X), ". " - )) - } - } - if ("alpha" %in% names(susie_inf_fit) && "mu" %in% names(susie_inf_fit) && "theta" %in% names(susie_inf_fit) && "X_column_scale_factors" %in% names(susie_inf_fit)) { - # This is designed to cope with output from pecotmr::susie_post_processor() - # We set intercept to 0 and later trim it off anyways - susie_inf_fit$intercept <- 0 - return(coef.susie(susie_inf_fit)[-1]) - } else { - return(rep(0, length(susie_inf_fit$pip))) - } + .susie_extract_weights(susie_inf_fit, X, y, + required_fields = c("alpha", "mu", "theta", "X_column_scale_factors"), + fit_args = list(unmappable_effects = "inf", convergence_method = "pip"), ...) } #' @export @@ -383,8 +355,9 @@ init_prior_sd <- function(X, y, n = 30) { # (glmnet, ncvreg, L0Learn, qgg, BGLR, RcppDPR) all either error or behave # poorly on constant columns, so wrappers should filter them out and zero-pad # their results back to length p. +#' @importFrom matrixStats colSds .drop_zero_variance <- function(X, fn_name) { - sds <- apply(X, 2, sd) + sds <- matrixStats::colSds(X) keep <- !is.na(sds) & sds != 0 if (!all(keep)) { warning(sprintf( diff --git a/R/susie_wrapper.R b/R/susie_wrapper.R index 71602e4c..7c38ad5e 100644 --- a/R/susie_wrapper.R +++ b/R/susie_wrapper.R @@ -115,29 +115,37 @@ adjust_susie_weights <- function(twas_weights_results, keep_variants, run_allele } -#' @importFrom susieR susie -#' @export -susie_wrapper <- function(X, y, init_L = 5, max_L = 30, l_step = 5, ...) { - if (init_L == max_L) { - return(susie(X, y, L = init_L, ...)) - } +# Shared dynamic-L loop: repeatedly fits a SuSiE model, increasing L by +# l_step whenever the number of credible sets saturates L, until max_L. +# @param fit_fn A function(L) that runs SuSiE with a given L and returns a fit. +# @param init_L Initial number of causal configurations. +# @param max_L Maximum number of causal configurations. +# @param l_step Step size for increasing L. +# @return The final SuSiE fit object. +# @noRd +.dynamic_L_fit <- function(fit_fn, init_L, max_L, l_step) { L <- init_L - # Perform SuSiE by dynamically increasing L - gst <- proc.time() while (TRUE) { st <- proc.time() - res <- susie(X, y, L = L, ...) - res$time_elapsed <- proc.time() - st - if (!is.null(res$sets$cs)) { - if (length(res$sets$cs) >= L && L <= max_L) { - L <- L + l_step - } else { - break - } + result <- fit_fn(L) + result$time_elapsed <- proc.time() - st + if (!is.null(result$sets$cs) && length(result$sets$cs) >= L && L <= max_L) { + L <- L + l_step } else { break } } + result +} + +#' @importFrom susieR susie +#' @export +susie_wrapper <- function(X, y, init_L = 5, max_L = 30, l_step = 5, ...) { + if (init_L == max_L) { + return(susie(X, y, L = init_L, ...)) + } + gst <- proc.time() + res <- .dynamic_L_fit(function(L) susie(X, y, L = L, ...), init_L, max_L, l_step) message(paste("Total time elapsed for susie_wrapper:", (proc.time() - gst)[3])) return(res) } @@ -175,25 +183,15 @@ susie_rss_wrapper <- function(z, R = NULL, X = NULL, n = NULL, sketch_samples = sketch_samples, ...) if (!is.null(X)) base_args$X <- X else base_args$R <- R - run_susie <- function(args) do.call(susie_rss, args) + run_with_L <- function(L_val) do.call(susie_rss, c(base_args, list(L = L_val))) if (L == 1) { base_args$max_iter <- 1 - result <- run_susie(base_args) + result <- do.call(susie_rss, c(base_args, list(L = 1))) } else if (L == max_L) { - result <- run_susie(base_args) + result <- run_with_L(L) } else { - while (TRUE) { - st <- proc.time() - base_args$L <- L - result <- run_susie(base_args) - result$time_elapsed <- proc.time() - st - if (!is.null(result$sets$cs) && length(result$sets$cs) >= L && L <= max_L) { - L <- L + l_step - } else { - break - } - } + result <- .dynamic_L_fit(run_with_L, L, max_L, l_step) } result @@ -421,17 +419,20 @@ susie_post_processor <- function(susie_output, data_x, data_y, X_scalar, y_scala if (nrow(top_loci) > 0) { top_loci[is.na(top_loci)] <- 0 - variants <- res$variant_names[top_loci$variant_idx] - pip <- susie_output$pip[top_loci$variant_idx] - top_loci_cols <- c("variant_id", if (!is.null(res$sumstats$betahat)) "betahat", if (!is.null(res$sumstats$sebetahat)) "sebetahat", if (!is.null(res$sumstats$z)) "z", if (!is.null(maf)) "maf", "pip", colnames(top_loci)[-1]) - res$top_loci <- data.frame(variants, stringsAsFactors = FALSE) - res$top_loci$betahat <- if (!is.null(res$sumstats$betahat)) res$sumstats$betahat[top_loci$variant_idx] else NULL - res$top_loci$sebetahat <- if (!is.null(res$sumstats$sebetahat)) res$sumstats$sebetahat[top_loci$variant_idx] else NULL - res$top_loci$z <- if (!is.null(res$sumstats$z)) res$sumstats$z[top_loci$variant_idx] else NULL - res$top_loci$maf <- if (!is.null(maf)) maf[top_loci$variant_idx] else NULL - res$top_loci$pip <- pip - res$top_loci <- cbind(res$top_loci, top_loci[, -1]) - colnames(res$top_loci) <- top_loci_cols + idx <- top_loci$variant_idx + optional_cols <- list( + betahat = if (!is.null(res$sumstats$betahat)) res$sumstats$betahat[idx], + sebetahat = if (!is.null(res$sumstats$sebetahat)) res$sumstats$sebetahat[idx], + z = if (!is.null(res$sumstats$z)) res$sumstats$z[idx], + maf = if (!is.null(maf)) maf[idx] + ) + optional_cols <- Filter(Negate(is.null), optional_cols) + res$top_loci <- cbind( + data.frame(variant_id = res$variant_names[idx], stringsAsFactors = FALSE), + as.data.frame(optional_cols), + data.frame(pip = susie_output$pip[idx]), + top_loci[, -1, drop = FALSE] + ) rownames(res$top_loci) <- NULL } names(susie_output$pip) <- NULL diff --git a/R/twas.R b/R/twas.R index db7f2072..5b847012 100644 --- a/R/twas.R +++ b/R/twas.R @@ -32,15 +32,8 @@ harmonize_twas <- function(twas_weights_data, ld_meta_file_path, gwas_meta_file, # Function to group contexts based on start and end positions group_contexts_by_region <- function(twas_weights_data, molecular_id, chrom, tolerance = 5000) { region_info_df <- do.call(rbind, lapply(names(twas_weights_data$weights), function(context) { - wgt_range <- as.integer(sapply( - rownames(twas_weights_data[["weights"]][[context]]), - function(variant_id) { - strsplit(variant_id, "\\:")[[1]][2] - } - )) - min <- min(wgt_range) - max <- max(wgt_range) - data.frame(context = context, start = min, end = max) + wgt_range <- parse_variant_id(rownames(twas_weights_data[["weights"]][[context]]))$pos + data.frame(context = context, start = min(wgt_range), end = max(wgt_range)) })) if (nrow(region_info_df) == 1) { # Handle case with only one context @@ -123,31 +116,31 @@ harmonize_twas <- function(twas_weights_data, ld_meta_file_path, gwas_meta_file, # loop through genes/events: for (molecular_id in molecular_ids) { - results[[molecular_id]][["chrom"]] <- chrom - results[[molecular_id]][["data_type"]] <- if ("data_type" %in% names(twas_weights_data[[molecular_id]])) twas_weights_data[[molecular_id]]$data_type - results[[molecular_id]][["variant_names"]] <- list() + mol_data <- twas_weights_data[[molecular_id]] + mol_res <- list(chrom = chrom, variant_names = list()) + mol_res[["data_type"]] <- if ("data_type" %in% names(mol_data)) mol_data$data_type # group contexts based on the variant position - context_clusters <- group_contexts_by_region(twas_weights_data[[molecular_id]], molecular_id, chrom, tolerance = 5000) + context_clusters <- group_contexts_by_region(mol_data, molecular_id, chrom, tolerance = 5000) # loop through contexts: grouping contexts can be useful during TWAS data harmonization to stratify variants for LD loading for (context_group in names(context_clusters)) { - contexts <- context_clusters[[context_group]]$contexts - query_region <- context_clusters[[context_group]]$query_region + cluster <- context_clusters[[context_group]] + contexts <- cluster$contexts + query_region <- cluster$query_region region_of_interest <- region_to_df(query_region) - all_variants <- context_clusters[[context_group]]$all_variants # all variants for this context group - all_variants <- variant_id_to_df(all_variants) + all_variants <- variant_id_to_df(cluster$all_variants) # Step 3: load GWAS data for clustered context groups for (study in names(gwas_files)) { gwas_file <- gwas_files[study] - gwas_data_sumstats <- harmonize_gwas(gwas_file, query_region=query_region, - LD_list$LD_variants, c("beta", "z"), + gwas_data_sumstats <- harmonize_gwas(gwas_file, query_region=query_region, + LD_list$LD_variants, c("beta", "z"), match_min_prop = 0, column_file_path = column_file_path, comment_string = comment_string) if(is.null(gwas_data_sumstats)) next # loop through context within the context group: for (context in contexts) { - weights_matrix <- twas_weights_data[[molecular_id]][["weights"]][[context]] + weights_matrix <- mol_data[["weights"]][[context]] # Step 4: harmonize weights, flip allele weights_matrix <- cbind(variant_id_to_df(rownames(weights_matrix)), weights_matrix) @@ -155,11 +148,11 @@ harmonize_twas <- function(twas_weights_data, ld_meta_file_path, gwas_meta_file, colnames(weights_matrix)[!colnames(weights_matrix) %in% c("chrom", "pos", "A2", "A1")], match_min_prop = 0 ) - weights_matrix_subset <- as.matrix(weights_matrix_qced$target_data_qced[, !colnames(weights_matrix_qced$target_data_qced) %in% c( - "chrom", - "pos", "A2", "A1", "variant_id", "variants_id_original" + qced_data <- weights_matrix_qced$target_data_qced + weights_matrix_subset <- as.matrix(qced_data[, !colnames(qced_data) %in% c( + "chrom", "pos", "A2", "A1", "variant_id", "variants_id_original" ), drop = FALSE]) - rownames(weights_matrix_subset) <- weights_matrix_qced$target_data_qced$variant_id # weight variant names are flipped/corrected + rownames(weights_matrix_subset) <- qced_data$variant_id # intersect post-qc gwas and post-qc weight variants (all now in canonical chr-prefix format) gwas_LD_variants <- intersect(gwas_data_sumstats$variant_id, LD_list$LD_variants) @@ -168,8 +161,8 @@ harmonize_twas <- function(twas_weights_data, ld_meta_file_path, gwas_meta_file, postqc_weight_variants <- rownames(weights_matrix_subset) # Step 5: adjust SuSiE weights based on available variants - if ("susie_weights" %in% colnames(twas_weights_data[[molecular_id]][["weights"]][[context]])) { - adjusted_susie_weights <- adjust_susie_weights(twas_weights_data[[molecular_id]], + if ("susie_weights" %in% colnames(mol_data[["weights"]][[context]])) { + adjusted_susie_weights <- adjust_susie_weights(mol_data, keep_variants = postqc_weight_variants, run_allele_qc = TRUE, variable_name_obj = c("variant_names", context), susie_obj = c("susie_results", context), @@ -179,16 +172,17 @@ harmonize_twas <- function(twas_weights_data, ld_meta_file_path, gwas_meta_file, susie_weights = setNames(adjusted_susie_weights$adjusted_susie_weights, adjusted_susie_weights$remained_variants_ids), weights_matrix_subset[adjusted_susie_weights$remained_variants_ids, !colnames(weights_matrix_subset) %in% "susie_weights", drop = FALSE] ) - results[[molecular_id]][["susie_weights_intermediate_qced"]][[context]] <- twas_weights_data[[molecular_id]]$susie_results[[context]][c("pip", "cs_variants", "cs_purity")] - names(results[[molecular_id]][["susie_weights_intermediate_qced"]][[context]][["pip"]]) <- rownames(weights_matrix) # original variants that is not qced yet - pip <- results[[molecular_id]][["susie_weights_intermediate_qced"]][[context]][["pip"]] + susie_intermediate <- mol_data$susie_results[[context]][c("pip", "cs_variants", "cs_purity")] + names(susie_intermediate[["pip"]]) <- rownames(weights_matrix) # original variants that is not qced yet + pip <- susie_intermediate[["pip"]] pip_qced <- allele_qc(cbind(parse_variant_id(names(pip)), pip), LD_list$LD_variants, "pip", match_min_prop = 0) - results[[molecular_id]][["susie_weights_intermediate_qced"]][[context]][["pip"]] <- abs(pip_qced$target_data_qced$pip) - names(results[[molecular_id]][["susie_weights_intermediate_qced"]][[context]][["pip"]]) <- pip_qced$target_data_qced$variant_id - results[[molecular_id]][["susie_weights_intermediate_qced"]][[context]][["cs_variants"]] <- lapply(results[[molecular_id]][["susie_weights_intermediate_qced"]][[context]][["cs_variants"]], function(x) { + susie_intermediate[["pip"]] <- abs(pip_qced$target_data_qced$pip) + names(susie_intermediate[["pip"]]) <- pip_qced$target_data_qced$variant_id + susie_intermediate[["cs_variants"]] <- lapply(susie_intermediate[["cs_variants"]], function(x) { variant_qc <- allele_qc(x, LD_list$LD_variants, match_min_prop = 0) variant_qc$target_data_qced$variant_id[variant_qc$target_data_qced$variant_id %in% postqc_weight_variants] }) + mol_res[["susie_weights_intermediate_qced"]][[context]] <- susie_intermediate } rm(weights_matrix) # context specific original weight matrix gc() @@ -197,27 +191,29 @@ harmonize_twas <- function(twas_weights_data, ld_meta_file_path, gwas_meta_file, warning("weights_matrix_subset is empty. Skipping this context.") next } - results[[molecular_id]][["variant_names"]][[context]][[study]] <- rownames(weights_matrix_subset) + mol_res[["variant_names"]][[context]][[study]] <- rownames(weights_matrix_subset) # Step 6: scale weights by variance (from ref_panel, populated by load_LD_matrix) variance <- LD_list$ref_panel$variance[match(rownames(weights_matrix_subset), LD_list$ref_panel$variant_id)] - results[[molecular_id]][["weights_qced"]][[context]][[study]] <- list(scaled_weights = weights_matrix_subset * sqrt(variance), weights = weights_matrix_subset) + mol_res[["weights_qced"]][[context]][[study]] <- list(scaled_weights = weights_matrix_subset * sqrt(variance), weights = weights_matrix_subset) } # Combine gwas sumstat across different context for a single context group (all variant_ids now in canonical format) - gwas_data_sumstats <- gwas_data_sumstats[gwas_data_sumstats$variant_id %in% unique(find_data(results[[molecular_id]][["variant_names"]], c(2, study))), , drop = FALSE] - results[[molecular_id]][["gwas_qced"]][[study]] <- rbind(results[[molecular_id]][["gwas_qced"]][[study]], gwas_data_sumstats) - results[[molecular_id]][["gwas_qced"]][[study]] <- results[[molecular_id]][["gwas_qced"]][[study]][!duplicated(results[[molecular_id]][["gwas_qced"]][[study]][, c("variant_id", "z")]), ] + gwas_data_sumstats <- gwas_data_sumstats[gwas_data_sumstats$variant_id %in% unique(find_data(mol_res[["variant_names"]], c(2, study))), , drop = FALSE] + mol_res[["gwas_qced"]][[study]] <- rbind(mol_res[["gwas_qced"]][[study]], gwas_data_sumstats) + gwas_qced <- mol_res[["gwas_qced"]][[study]] + mol_res[["gwas_qced"]][[study]] <- gwas_qced[!duplicated(gwas_qced[, c("variant_id", "z")]), ] } } twas_weights_data[[molecular_id]] <- NULL # extract LD matrix for variants intersect with gwas and twas weights at molecular_id level - all_molecular_variants <- unique(find_data(results[[molecular_id]][["gwas_qced"]], c(2, "variant_id"))) + all_molecular_variants <- unique(find_data(mol_res[["gwas_qced"]], c(2, "variant_id"))) if (is.null(all_molecular_variants)) { results[[molecular_id]] <- NULL } else { # All variant IDs are now in canonical chr-prefix format var_indx <- match(all_molecular_variants, LD_list$LD_variants) - results[[molecular_id]][["LD"]] <- as.matrix(LD_list$LD_matrix[var_indx, var_indx]) + mol_res[["LD"]] <- as.matrix(LD_list$LD_matrix[var_indx, var_indx]) + results[[molecular_id]] <- mol_res } } # return results @@ -250,9 +246,9 @@ harmonize_gwas <- function(gwas_file, query_region, ld_variants, col_to_flip=NUL } if (colnames(gwas_data_sumstats)[1] == "#chrom") colnames(gwas_data_sumstats)[1] <- "chrom" # colname update for tabix - # Check if sumstats has z-scores or (beta and se) - if (!is.null(gwas_data_sumstats$z)) { - gwas_data_sumstats$z <- gwas_data_sumstats$z + # Check if sumstats has z-scores or (beta and se) + if (!is.null(gwas_data_sumstats$z)) { + # z-scores already present, nothing to do } else if (!is.null(gwas_data_sumstats$beta) && !is.null(gwas_data_sumstats$se)) { gwas_data_sumstats$z <- gwas_data_sumstats$beta / gwas_data_sumstats$se } else { @@ -294,47 +290,46 @@ twas_pipeline <- function(twas_weights_data, comment_string="#") { # internal function to format TWAS output format_twas_data <- function(post_qc_twas_data, twas_table) { - weights_list <- do.call(c, lapply(names(post_qc_twas_data), function(molecular_id) { - contexts <- names(post_qc_twas_data[[molecular_id]][["weights_qced"]]) - chrom <- post_qc_twas_data[[molecular_id]][["chrom"]] - do.call(c, lapply(contexts, function(context) { - weight <- list() - data_type <- post_qc_twas_data[[molecular_id]][["data_type"]][[context]] - if (!is.null(post_qc_twas_data[[molecular_id]][["model_selection"]]) && - is.list(post_qc_twas_data[[molecular_id]][["model_selection"]]) && - length(post_qc_twas_data[[molecular_id]][["model_selection"]]) > 0) { - is_imputable <- post_qc_twas_data[[molecular_id]][["model_selection"]][[context]]$is_imputable - if (isTRUE(is_imputable)) { - model_selected <- post_qc_twas_data[[molecular_id]][["model_selection"]][[context]]$selected_model - } else { - model_selected <- NA - } + weights_list <- purrr::map(names(post_qc_twas_data), function(molecular_id) { + mol <- post_qc_twas_data[[molecular_id]] + contexts <- names(mol[["weights_qced"]]) + mol_chrom <- mol[["chrom"]] + model_sel <- mol[["model_selection"]] + + purrr::map(contexts, function(context) { + data_type <- mol[["data_type"]][[context]] + if (!is.null(model_sel) && is.list(model_sel) && length(model_sel) > 0) { + is_imputable <- model_sel[[context]]$is_imputable + model_selected <- if (isTRUE(is_imputable)) model_sel[[context]]$selected_model else NA } else { model_selected <- NA is_imputable <- NA } - postqc_scaled_weight <- list() - gwas_studies <- names(post_qc_twas_data[[molecular_id]][["weights_qced"]][[context]]) # context-level gwas-studies - - if (!is.null(model_selected) & isTRUE(is_imputable)) { - # TWAS - for (study in gwas_studies) { - postqc_scaled_weight[[study]] <- post_qc_twas_data[[molecular_id]][["weights_qced"]][[context]][[study]][["scaled_weights"]][, paste0(model_selected, "_weights"), drop = FALSE] - colnames(postqc_scaled_weight[[study]]) <- "weight" - # variant IDs are in canonical chr-prefix format from allele_qc - context_variants <- rownames(post_qc_twas_data[[molecular_id]][["weights_qced"]][[context]][[study]][["scaled_weights"]]) - context_range <- as.integer(sapply(context_variants, function(variant) strsplit(variant, "\\:")[[1]][2])) - weight[[paste0(molecular_id, "|", data_type, "_", context)]][[study]] <- list( - chrom = chrom, p0 = min(context_range), p1 = max(context_range), - wgt = postqc_scaled_weight[[study]], molecular_id = molecular_id, weight_name = paste0(data_type, "_", context), type = data_type, - context = context, n_wgt = length(context_variants) - ) - } - } - return(weight) - })) - })) - weights <- weights_list[!sapply(weights_list, is.null)] + if (is.null(model_selected) || !isTRUE(is_imputable)) return(NULL) + + gwas_studies <- names(mol[["weights_qced"]][[context]]) + weight_key <- paste0(molecular_id, "|", data_type, "_", context) + study_entries <- purrr::map(gwas_studies, function(study) { + ctx_weights <- mol[["weights_qced"]][[context]][[study]] + scaled_wgt <- ctx_weights[["scaled_weights"]][, paste0(model_selected, "_weights"), drop = FALSE] + colnames(scaled_wgt) <- "weight" + context_variants <- rownames(ctx_weights[["scaled_weights"]]) + context_range <- parse_variant_id(context_variants)$pos + entry <- list(list( + chrom = mol_chrom, p0 = min(context_range), p1 = max(context_range), + wgt = scaled_wgt, molecular_id = molecular_id, + weight_name = paste0(data_type, "_", context), type = data_type, + context = context, n_wgt = length(context_variants) + )) + names(entry) <- study + result <- list(entry) + names(result) <- weight_key + result + }) %>% purrr::list_flatten() + study_entries + }) %>% purrr::compact() %>% purrr::list_flatten() + }) %>% purrr::list_flatten() + weights <- purrr::compact(weights_list) # Optional susie_weights_intermediate_qced processing if ("susie_weights_intermediate_qced" %in% names(post_qc_twas_data[[1]])) { susie_weights_intermediate_qced <- setNames(lapply( diff --git a/R/twas_weights.R b/R/twas_weights.R index ca59bab3..c2bead97 100644 --- a/R/twas_weights.R +++ b/R/twas_weights.R @@ -1,3 +1,41 @@ +# Identify non-zero-variance columns of X. Returns a logical vector. +#' @importFrom matrixStats colSds +#' @noRd +.nonzero_var_columns <- function(X) { + sds <- matrixStats::colSds(X, na.rm = TRUE) + !is.na(sds) & sds != 0 +} + +# Embed a smaller weights matrix into a full-sized zero matrix matching X and Y dimensions. +# @param weights_matrix The fitted weights (nrow = number of valid columns). +# @param valid_columns Logical or character vector identifying which columns of X were used. +# @param X_colnames Column names of the original X. +# @param Y_colnames Column names of Y. +# @noRd +.embed_weights <- function(weights_matrix, valid_columns, n_cols_X, n_cols_Y, + X_colnames = NULL, Y_colnames = NULL) { + full <- matrix(0, nrow = n_cols_X, ncol = n_cols_Y) + if (!is.null(X_colnames)) rownames(full) <- X_colnames + if (!is.null(Y_colnames)) colnames(full) <- Y_colnames + full[valid_columns, ] <- weights_matrix + full +} + +# Filter weight methods that produced all-zero weights from CV. +# Returns filtered weight_methods list and warns about removed methods. +# @noRd +.filter_zero_weight_methods <- function(weight_methods, twas_weights_res) { + is_all_zero <- vapply(twas_weights_res, function(w) all(w == 0, na.rm = TRUE), logical(1)) + removed <- names(weight_methods)[is_all_zero] + if (length(removed) > 0) { + warning(sprintf( + "Methods %s are removed from CV because all their weights are zeros.", + paste(removed, collapse = ", ") + )) + } + weight_methods[!is_all_zero] +} + #' Cross-Validation for weights selection in Transcriptome-Wide Association Studies (TWAS) #' #' Performs cross-validation for TWAS, supporting both univariate and multivariate methods. @@ -171,8 +209,8 @@ twas_weights_cv <- function(X, Y, fold = NULL, sample_partitions = NULL, weight_ X_test <- dat_split$Xtest Y_test <- dat_split$Ytest - # Remove columns with zero standard error - valid_columns <- apply(X_train, 2, function(col) sd(col) != 0) + # Remove columns with zero variance + valid_columns <- .nonzero_var_columns(X_train) X_train <- X_train[, valid_columns, drop = FALSE] X_train <- filter_X_with_Y(X_train, Y_train, missing_rate_thresh = 1, maf_thresh = NULL) valid_columns <- colnames(X_train) @@ -193,11 +231,7 @@ twas_weights_cv <- function(X, Y, fold = NULL, sample_partitions = NULL, weight_ } weights_matrix <- do.call(method, c(list(X = X_train, Y = Y_train), args)) rownames(weights_matrix) <- colnames(X_train) - # Adjust the weights matrix to include zeros for invalid columns - full_weights_matrix <- matrix(0, nrow = ncol(X), ncol = ncol(Y)) - rownames(full_weights_matrix) <- colnames(X) - colnames(full_weights_matrix) <- colnames(Y) - full_weights_matrix[valid_columns, ] <- weights_matrix[valid_columns, ] + full_weights_matrix <- .embed_weights(weights_matrix[valid_columns, , drop = FALSE], valid_columns, ncol(X), ncol(Y), colnames(X), colnames(Y)) Y_pred <- X_test %*% full_weights_matrix rownames(Y_pred) <- rownames(X_test) return(Y_pred) @@ -333,8 +367,8 @@ twas_weights <- function(X, Y, weight_methods, num_threads = 1) { multivariate_weight_methods <- c("mrmash_weights", "mvsusie_weights") args <- weight_methods[[method_name]] - # Remove columns with zero standard error - valid_columns <- apply(X, 2, function(col) sd(col) != 0) + # Remove columns with zero variance + valid_columns <- .nonzero_var_columns(X) X_filtered <- as.matrix(X[, valid_columns, drop = FALSE]) if (method_name %in% multivariate_weight_methods) { @@ -353,13 +387,7 @@ twas_weights <- function(X, Y, weight_methods, num_threads = 1) { } } - # Adjust the weights matrix to include zeros for invalid columns - full_weights_matrix <- matrix(0, nrow = ncol(X), ncol = ncol(Y)) - rownames(full_weights_matrix) <- colnames(X) - colnames(full_weights_matrix) <- colnames(Y) - full_weights_matrix[valid_columns, ] <- weights_matrix - - return(full_weights_matrix) + return(.embed_weights(weights_matrix, valid_columns, ncol(X), ncol(Y), colnames(X), colnames(Y))) } if (num_cores >= 2) { @@ -471,21 +499,7 @@ twas_weights_pipeline <- function(X, weight_methods$susie_weights <- list(refine = FALSE, init_L = max_L, max_L = max_L) } if (is.null(cv_weight_methods)) { - # Filter function to exclude methods with all zero weights and track removed methods - is_all_zero <- sapply(res$twas_weights, function(weights) all(weights == 0, na.rm = TRUE)) - - # Identify removed methods - removed_methods <- names(weight_methods)[is_all_zero] - - # Issue a warning if any methods have been removed - if (length(removed_methods) > 0) { - warning(sprintf( - "Methods %s are removed from CV because all their weights are zeros.", - paste(removed_methods, collapse = ", ") - )) - } - # Extract the filtered methods retaining their specific configurations - cv_weight_methods <- names(weight_methods)[!is_all_zero] + cv_weight_methods <- names(.filter_zero_weight_methods(weight_methods, res$twas_weights)) } variants_for_cv <- c() @@ -553,19 +567,17 @@ twas_multivariate_weights_pipeline <- function( cv_threads = 1, verbose = FALSE) { copy_twas_results <- function(context_names, variant_names, twas_weight, twas_predictions) { - res <- setNames(vector("list", length(context_names)), context_names) - for (i in names(res)) { - if (i %in% colnames(twas_weights_res[[1]])) { - res[[i]]$twas_weights <- lapply(twas_weight, function(wgts) { - wgts[, i] - }) - res[[i]]$twas_predictions <- lapply(twas_predictions, function(pred) { - pred[, i] - }) - res[[i]]$variant_names <- variant_names + setNames(lapply(context_names, function(ctx) { + if (ctx %in% colnames(twas_weight[[1]])) { + list( + twas_weights = lapply(twas_weight, function(wgts) wgts[, ctx]), + twas_predictions = lapply(twas_predictions, function(pred) pred[, ctx]), + variant_names = variant_names + ) + } else { + NULL } - } - return(res) + }), context_names) } copy_twas_cv_results <- function(twas_result, twas_cv_result) { @@ -630,22 +642,7 @@ twas_multivariate_weights_pipeline <- function( ) ) - # Filter function to exclude methods with all zero weights and track removed methods - is_all_zero <- sapply(twas_weights_res, function(weights) all(weights == 0, na.rm = TRUE)) - - # Identify removed methods - removed_methods <- names(weight_methods)[is_all_zero] - - # Issue a warning if any methods have been removed - if (length(removed_methods) > 0) { - warning(sprintf( - "Methods %s are removed from CV because all their weights are zeros.", - paste(removed_methods, collapse = ", ") - )) - } - - # Extract the filtered methods retaining their specific configurations - weight_methods <- weight_methods[!is_all_zero] + weight_methods <- .filter_zero_weight_methods(weight_methods, twas_weights_res) variants_for_cv <- c() if (max_cv_variants <= 0) max_cv_variants <- Inf diff --git a/R/univariate_pipeline.R b/R/univariate_pipeline.R index 4a56da87..958014dc 100644 --- a/R/univariate_pipeline.R +++ b/R/univariate_pipeline.R @@ -312,24 +312,40 @@ rss_analysis_pipeline <- function( res$outlier_number <- qc_results$outlier_number } } - if (impute & !is.null(qc_method)) { - method_name <- paste0(finemapping_method, "_", toupper(qc_method), "_RAISS_imputed") - } else if (!impute & !is.null(qc_method)) { - method_name <- paste0(finemapping_method, "_", toupper(qc_method)) - } else { - method_name <- paste0(finemapping_method, "_", "NO_QC") + .make_method_name <- function(method, qc_method, impute) { + suffix <- if (!is.null(qc_method) && impute) { + paste0(toupper(qc_method), "_RAISS_imputed") + } else if (!is.null(qc_method)) { + toupper(qc_method) + } else { + "NO_QC" + } + paste0(method, "_", suffix) + } + + .run_reanalysis <- function(sumstats, LD_mat, method, finemapping_opts, pri_coverage, sec_coverage) { + susie_rss_pipeline(sumstats, LD_mat, + n = n, var_y = var_y, + L = finemapping_opts$init_L, max_L = finemapping_opts$max_L, l_step = finemapping_opts$l_step, + analysis_method = method, + coverage = pri_coverage, + secondary_coverage = sec_coverage, + signal_cutoff = finemapping_opts$signal_cutoff, + min_abs_corr = finemapping_opts$min_abs_corr + ) } + + method_name <- .make_method_name(finemapping_method, qc_method, impute) result_list <- list() result_list[[method_name]] <- res result_list[["rss_data_analyzed"]] <- sumstats - + block_cs_metrics <- list() if (diagnostics) { - if (length(res) > 0) { + if (length(res) > 0) { bvsr_res = get_susie_result(res) bvsr_cs_num = if(!is.null(bvsr_res)) length(bvsr_res$sets$cs) else NULL if (isTRUE(bvsr_cs_num > 0)) { # have CS - # Get the names of the credible sets cs_names_bvsr = names(bvsr_res$sets$cs) block_cs_metrics = extract_cs_info(con_data = res, cs_names = cs_names_bvsr, top_loci_table = res$top_loci) } else { # no CS @@ -350,57 +366,20 @@ rss_analysis_pipeline <- function( max(cs_corr_max, na.rm = TRUE) }) if (any(block_cs_metrics$p_value > 1e-4 | block_cs_metrics$max_cs_corr_study_block > 0.5)) { - finemapping_method <- "bayesian_conditional_regression" - pri_coverage <- finemapping_opts$coverage[1] - sec_coverage <- if (length(finemapping_opts$coverage) > 1) finemapping_opts$coverage[-1] else NULL - bcr <- susie_rss_pipeline(sumstats, LD_mat, - n = n, var_y = var_y, - L = finemapping_opts$init_L, max_L = finemapping_opts$max_L, l_step = finemapping_opts$l_step, - analysis_method = finemapping_method, - coverage = pri_coverage, - secondary_coverage = sec_coverage, - signal_cutoff = finemapping_opts$signal_cutoff, - min_abs_corr = finemapping_opts$min_abs_corr - ) + bcr <- .run_reanalysis(sumstats, LD_mat, "bayesian_conditional_regression", + finemapping_opts, pri_coverage, sec_coverage) if (!is.null(qc_method)) { bcr$outlier_number <- qc_results$outlier_number } - if (impute & !is.null(qc_method)) { - method_name <- paste0(finemapping_method, "_", toupper(qc_method), "_RAISS_imputed") - } else if (!impute & !is.null(qc_method)) { - method_name <- paste0(finemapping_method, "_", toupper(qc_method)) - } else { - method_name <- paste0(finemapping_method, "_", "NO_QC") - } - result_list[[method_name]] <- bcr - finemapping_method <- "single_effect" - sumstats <- preprocess_results$sumstats - LD_mat <- preprocess_results$LD_mat - ser <- susie_rss_pipeline(sumstats, LD_mat, - n = n, var_y = var_y, - L = finemapping_opts$init_L, max_L = finemapping_opts$max_L, l_step = finemapping_opts$l_step, - analysis_method = finemapping_method, - coverage = pri_coverage, - secondary_coverage = sec_coverage, - signal_cutoff = finemapping_opts$signal_cutoff, - min_abs_corr = finemapping_opts$min_abs_corr - ) - result_list[[paste0(finemapping_method, "_NO_QC")]] <- ser + result_list[[.make_method_name("bayesian_conditional_regression", qc_method, impute)]] <- bcr + ser <- .run_reanalysis(preprocess_results$sumstats, preprocess_results$LD_mat, + "single_effect", finemapping_opts, pri_coverage, sec_coverage) + result_list[["single_effect_NO_QC"]] <- ser } } else { # CS = 1 or NA - finemapping_method <- "single_effect" - sumstats <- preprocess_results$sumstats - LD_mat <- preprocess_results$LD_mat - ser <- susie_rss_pipeline(sumstats, LD_mat, - n = n, var_y = var_y, - L = finemapping_opts$init_L, max_L = finemapping_opts$max_L, l_step = finemapping_opts$l_step, - analysis_method = finemapping_method, - coverage = pri_coverage, - secondary_coverage = sec_coverage, - signal_cutoff = finemapping_opts$signal_cutoff, - min_abs_corr = finemapping_opts$min_abs_corr - ) - result_list[[paste0(finemapping_method, "_NO_QC")]] <- ser + ser <- .run_reanalysis(preprocess_results$sumstats, preprocess_results$LD_mat, + "single_effect", finemapping_opts, pri_coverage, sec_coverage) + result_list[["single_effect_NO_QC"]] <- ser } result_list[["diagnostics"]] <- block_cs_metrics } diff --git a/man/corr_filter.Rd b/man/corr_filter.Rd deleted file mode 100644 index d2f83074..00000000 --- a/man/corr_filter.Rd +++ /dev/null @@ -1,19 +0,0 @@ -% Generated by roxygen2: do not edit by hand -% Please edit documentation in R/quantile_twas_weight.R -\name{corr_filter} -\alias{corr_filter} -\title{Filter Highly Correlated SNPs} -\usage{ -corr_filter(X, cor_thres = 0.8) -} -\arguments{ -\item{X}{Matrix of genotypes} - -\item{cor_thres}{Correlation threshold for filtering} -} -\value{ -A list containing filtered X matrix and filter IDs -} -\description{ -Filter Highly Correlated SNPs -} diff --git a/man/manhattan_plot.Rd b/man/manhattan_plot.Rd deleted file mode 100644 index 36d7ee0e..00000000 --- a/man/manhattan_plot.Rd +++ /dev/null @@ -1,20 +0,0 @@ -% Generated by roxygen2: do not edit by hand -% Please edit documentation in R/plot.R -\name{manhattan_plot} -\alias{manhattan_plot} -\title{Manhattan plot} -\usage{ -manhattan_plot(twas_results, gene_data) -} -\arguments{ -\item{twas_results}{a data frame of twas results with columns "gene_name", "gene_id","chr","susie_pval","lasso_pval","enet_pval" and "mrash_pval", where twas results are the output of the twas_scan function. "gene_name" is the ensemble ID and "gene_id" is the corresponding gene name, -"susie_pval", "lasso_pval","enet_pval" and "mrash_pval" are the pvalues of susie and other three competing twas method.} - -\item{gene_data}{a data frame with columns "chr", "start", "end", and "ID", "chr" is the chromosome of gene, "start" and "end" are the position, "ID" is the gene name.} -} -\value{ -plot object -} -\description{ -Manhattan plot -} diff --git a/man/multigene_udr.Rd b/man/multigene_udr.Rd deleted file mode 100644 index f5179606..00000000 --- a/man/multigene_udr.Rd +++ /dev/null @@ -1,37 +0,0 @@ -% Generated by roxygen2: do not edit by hand -% Please edit documentation in R/multigene_udr.R -\name{multigene_udr} -\alias{multigene_udr} -\title{Perform udr Analysis on Multigene Data} -\usage{ -multigene_udr( - combined_susie_list, - coverage, - independent_variant_list, - n_random, - n_null, - seed, - exclude_condition = NULL -) -} -\arguments{ -\item{combined_susie_list}{A list containing the combined SuSiE and summary statistics results.} - -\item{coverage}{A numeric vector representing the coverage values.} - -\item{independent_variant_list}{A list of independent variants to be used as a filter.} - -\item{n_random}{An integer specifying the number of random samples to generate.} - -\item{n_null}{An integer specifying the number of null samples to generate.} - -\item{seed}{An integer specifying the seed for random number generation.} - -\item{exclude_condition}{A character vector specifying conditions to be excluded from the analysis. Defaults to NULL.} -} -\value{ -A list containing the results of the prior, or NULL if conditions are not met. -} -\description{ -Perform udr Analysis on Multigene Data -} diff --git a/man/twas_pipeline.Rd b/man/twas_pipeline.Rd index 69647f8b..c0a9a62f 100644 --- a/man/twas_pipeline.Rd +++ b/man/twas_pipeline.Rd @@ -17,7 +17,6 @@ twas_pipeline( rsq_pval_option = c("pval", "adj_rsq_pval"), mr_pval_cutoff = 0.05, mr_coverage_column = "cs_coverage_0.95", - quantile_twas = FALSE, output_twas_data = FALSE, event_filters = NULL, column_file_path = NULL, diff --git a/man/venn.Rd b/man/venn.Rd deleted file mode 100644 index ba9416c2..00000000 --- a/man/venn.Rd +++ /dev/null @@ -1,17 +0,0 @@ -% Generated by roxygen2: do not edit by hand -% Please edit documentation in R/plot.R -\name{venn} -\alias{venn} -\title{Venn Diagram} -\usage{ -venn(data) -} -\arguments{ -\item{data}{a list with the twas siginificant gene_id results of four method "SuSiE","Lasso","Enet" and "MR.ASH"} -} -\value{ -plot object -} -\description{ -Venn Diagram -} diff --git a/pixi.toml b/pixi.toml index b78eff33..32db2954 100644 --- a/pixi.toml +++ b/pixi.toml @@ -62,6 +62,7 @@ r45 = {features = ["r45"]} "r-gbj" = "*" "r-glmnet" = "*" "r-harmonicmeanp" = "*" +"r-igraph" = "*" "r-l0learn" = "*" "r-magrittr" = "*" "r-mashr" = "*" @@ -72,8 +73,6 @@ r45 = {features = ["r45"]} "r-pgenlibr" = "*" "r-purrr" = "*" "r-qgg" = "*" -"r-quadprog" = "*" -"r-quantreg" = "*" "r-rcpp" = "*" "r-rcpparmadillo" = "*" "r-rcppdpr" = "*" @@ -86,4 +85,3 @@ r45 = {features = ["r45"]} "r-tidyr" = "*" "r-vctrs" = "*" "r-vroom" = "*" -"r-xicor" = "*" diff --git a/tests/testthat/test_LD.R b/tests/testthat/test_LD.R index d45b23a6..24a72aed 100644 --- a/tests/testthat/test_LD.R +++ b/tests/testthat/test_LD.R @@ -823,3 +823,108 @@ test_that("can_merge checks chromosome and size", { b3 <- data.frame(chrom = "2", size = 100) expect_false(pecotmr:::can_merge(b1, b3, max_size = 500)) }) + +# =========================================================================== +# check_ld (regularize_ld) +# =========================================================================== + +test_that("check_ld reports PD for identity matrix", { + R <- diag(5) + result <- check_ld(R) + expect_true(result$is_pd) + expect_true(result$is_psd) + expect_equal(result$method_applied, "none") + expect_equal(result$R, R) + expect_equal(result$condition_number, 1) +}) + +test_that("check_ld reports PD for well-conditioned correlation matrix", { + R <- matrix(0.3, 4, 4) + diag(R) <- 1 + result <- check_ld(R) + expect_true(result$is_pd) + expect_true(result$is_psd) + expect_equal(result$n_negative, 0) + expect_equal(result$method_applied, "none") +}) + +test_that("check_ld detects non-PSD matrix", { + R <- matrix(0.9, 3, 3) + diag(R) <- 1 + R[1, 3] <- R[3, 1] <- -0.5 + result <- check_ld(R) + expect_false(result$is_psd) + expect_true(result$n_negative > 0) + expect_true(result$min_eigenvalue < 0) + expect_equal(result$method_applied, "none") +}) + +test_that("check_ld shrink method modifies non-PD matrix", { + R <- matrix(0.9, 3, 3) + diag(R) <- 1 + R[1, 3] <- R[3, 1] <- -0.5 + result <- check_ld(R, method = "shrink") + expect_equal(result$method_applied, "shrink") + expect_false(identical(result$R, R)) + # With strong enough shrinkage, result should be PD + result2 <- check_ld(R, method = "shrink", shrinkage = 0.5) + eig <- eigen(result2$R, symmetric = TRUE) + expect_true(all(eig$values > 0)) +}) + +test_that("check_ld eigenfix method improves non-PD matrix", { + R <- matrix(0.9, 3, 3) + diag(R) <- 1 + R[1, 3] <- R[3, 1] <- -0.5 + original_min_eval <- min(eigen(R, symmetric = TRUE)$values) + result <- check_ld(R, method = "eigenfix") + expect_equal(result$method_applied, "eigenfix") + # Eigenfix should improve the minimum eigenvalue + fixed_min_eval <- min(eigen(result$R, symmetric = TRUE)$values) + expect_true(fixed_min_eval > original_min_eval) + # Unit diagonal preserved + expect_equal(diag(result$R), rep(1, 3)) + # Symmetry preserved + expect_equal(result$R, t(result$R)) +}) + +test_that("check_ld shrink does nothing when matrix is already PD", { + R <- diag(3) + result <- check_ld(R, method = "shrink") + expect_equal(result$method_applied, "none") + expect_equal(result$R, R) +}) + +test_that("check_ld eigenfix does nothing when matrix is already PD", { + R <- diag(3) + result <- check_ld(R, method = "eigenfix") + expect_equal(result$method_applied, "none") + expect_equal(result$R, R) +}) + +# =========================================================================== +# extract_block_matrices: out-of-range blocks +# =========================================================================== + +test_that("extract_block_matrices warns and skips out-of-range blocks", { + mat <- diag(4) + vnames <- paste0("v", 1:4) + rownames(mat) <- colnames(mat) <- vnames + block_metadata <- data.frame( + block_id = c(1, 2), + start_idx = c(1, 10), + end_idx = c(2, 12), + chrom = c("1", "1"), + block_start = c(100, 500), + block_end = c(200, 600), + size = c(2, 3), + stringsAsFactors = FALSE + ) + expect_warning( + result <- pecotmr:::extract_block_matrices(mat, block_metadata, vnames), + "outside the range" + ) + valid_blocks <- result$ld_matrices[!sapply(result$ld_matrices, is.null)] + expect_equal(length(valid_blocks), 1) + expect_equal(nrow(valid_blocks[[1]]), 2) +}) diff --git a/tests/testthat/test_file_utils.R b/tests/testthat/test_file_utils.R index 8a16fcc7..fc2ea0af 100644 --- a/tests/testthat/test_file_utils.R +++ b/tests/testthat/test_file_utils.R @@ -1537,3 +1537,63 @@ test_that("invert_minmax_scaling errors on mismatched lengths", { expect_error(invert_minmax_scaling(X, c(0, 0, 0), c(1, 1, 1)), "Length of u_min") }) + +# =========================================================================== +# batch_load_twas_weights +# =========================================================================== + +test_that("batch_load_twas_weights returns empty list for empty input", { + expect_message( + result <- batch_load_twas_weights(list(), data.frame(region_id = character(), TSS = integer())), + "No genes" + ) + expect_equal(length(result), 0) +}) + +test_that("batch_load_twas_weights returns single batch when total memory fits", { + twas <- list( + gene1 = list(a = 1:10), + gene2 = list(a = 1:10) + ) + meta <- data.frame(region_id = c("gene1", "gene2"), TSS = c(100, 200)) + expect_message( + result <- batch_load_twas_weights(twas, meta, max_memory_per_batch = 1000), + "No need to split" + ) + expect_equal(length(result), 1) + expect_true(all(c("gene1", "gene2") %in% names(result[[1]]))) +}) + +test_that("batch_load_twas_weights splits into multiple batches", { + # Create data large enough to require splitting + twas <- list( + gene1 = rnorm(1e5), + gene2 = rnorm(1e5), + gene3 = rnorm(1e5) + ) + # Each gene is ~0.76 MB + gene_size_mb <- as.numeric(object.size(twas[[1]])) / (1024^2) + # Set limit so at most 2 genes fit per batch + max_mb <- gene_size_mb * 1.5 + meta <- data.frame(region_id = c("gene1", "gene2", "gene3"), TSS = c(100, 200, 300)) + result <- batch_load_twas_weights(twas, meta, max_memory_per_batch = max_mb) + expect_true(length(result) >= 2) + # All genes should be present across batches + all_genes <- unlist(lapply(result, names)) + expect_true(all(c("gene1", "gene2", "gene3") %in% all_genes)) +}) + +test_that("batch_load_twas_weights puts oversized gene in its own batch", { + twas <- list( + gene_small = list(a = 1:10), + gene_big = rnorm(1e6) + ) + big_size_mb <- as.numeric(object.size(twas$gene_big)) / (1024^2) + small_size_mb <- as.numeric(object.size(twas$gene_small)) / (1024^2) + # Set limit between small and big + max_mb <- big_size_mb * 0.5 + meta <- data.frame(region_id = c("gene_small", "gene_big"), TSS = c(100, 200)) + result <- batch_load_twas_weights(twas, meta, max_memory_per_batch = max_mb) + # Big gene should be in its own batch + expect_true(length(result) >= 2) +}) diff --git a/tests/testthat/test_ld_loader.R b/tests/testthat/test_ld_loader.R new file mode 100644 index 00000000..3576794f --- /dev/null +++ b/tests/testthat/test_ld_loader.R @@ -0,0 +1,122 @@ +context("ld_loader") + +# =========================================================================== +# ld_loader: input validation +# =========================================================================== + +test_that("ld_loader errors when no source is provided", { + expect_error(ld_loader(), "Provide exactly one") +}) + +test_that("ld_loader errors when multiple sources are provided", { + R <- list(matrix(1, 2, 2)) + X <- list(matrix(1, 3, 2)) + expect_error(ld_loader(R_list = R, X_list = X), "Provide exactly one") +}) + +# =========================================================================== +# ld_loader: R_list branch +# =========================================================================== + +test_that("ld_loader with R_list returns a function", { + R <- list(matrix(c(1, 0.5, 0.5, 1), 2, 2)) + loader <- ld_loader(R_list = R) + expect_type(loader, "closure") +}) + +test_that("ld_loader R_list returns correct matrix", { + R1 <- matrix(c(1, 0.3, 0.3, 1), 2, 2) + R2 <- matrix(c(1, 0.8, 0.8, 1), 2, 2) + loader <- ld_loader(R_list = list(R1, R2)) + expect_equal(loader(1), R1) + expect_equal(loader(2), R2) +}) + +test_that("ld_loader R_list with max_variants downsamples", { + set.seed(42) + R <- matrix(0.1, 10, 10) + diag(R) <- 1 + loader <- ld_loader(R_list = list(R), max_variants = 5) + result <- loader(1) + expect_equal(nrow(result), 5) + expect_equal(ncol(result), 5) +}) + +test_that("ld_loader R_list without max_variants returns full matrix", { + R <- matrix(0.1, 10, 10) + diag(R) <- 1 + loader <- ld_loader(R_list = list(R)) + result <- loader(1) + expect_equal(nrow(result), 10) +}) + +test_that("ld_loader R_list max_variants larger than matrix returns full matrix", { + R <- matrix(0.1, 3, 3) + diag(R) <- 1 + loader <- ld_loader(R_list = list(R), max_variants = 100) + result <- loader(1) + expect_equal(nrow(result), 3) +}) + +# =========================================================================== +# ld_loader: X_list branch +# =========================================================================== + +test_that("ld_loader with X_list returns a function", { + X <- list(matrix(rnorm(30), 10, 3)) + loader <- ld_loader(X_list = X) + expect_type(loader, "closure") +}) + +test_that("ld_loader X_list returns correct matrix", { + X1 <- matrix(1:12, 4, 3) + X2 <- matrix(1:8, 4, 2) + loader <- ld_loader(X_list = list(X1, X2)) + expect_equal(loader(1), X1) + expect_equal(loader(2), X2) +}) + +test_that("ld_loader X_list with max_variants downsamples columns", { + set.seed(42) + X <- matrix(rnorm(50), 10, 5) + loader <- ld_loader(X_list = list(X), max_variants = 3) + result <- loader(1) + expect_equal(nrow(result), 10) + expect_equal(ncol(result), 3) +}) + +test_that("ld_loader X_list max_variants larger than ncol returns full matrix", { + X <- matrix(rnorm(12), 4, 3) + loader <- ld_loader(X_list = list(X), max_variants = 100) + result <- loader(1) + expect_equal(ncol(result), 3) +}) + +# =========================================================================== +# ld_loader: ld_meta_path branch validation +# =========================================================================== + +test_that("ld_loader with ld_meta_path but no regions errors", { + expect_error( + ld_loader(ld_meta_path = "/some/path"), + "regions.*required" + ) +}) + +# =========================================================================== +# ld_loader: LD_info branch validation +# =========================================================================== + +test_that("ld_loader with LD_info errors when not a data.frame", { + expect_error( + ld_loader(LD_info = "not_a_df"), + "LD_info must be a data.frame" + ) +}) + +test_that("ld_loader with LD_info errors when missing LD_file column", { + expect_error( + ld_loader(LD_info = data.frame(col1 = "a")), + "LD_info must be a data.frame with column 'LD_file'" + ) +}) diff --git a/tests/testthat/test_mash_wrapper.R b/tests/testthat/test_mash_wrapper.R index 06ba5f1c..9f3139bd 100644 --- a/tests/testthat/test_mash_wrapper.R +++ b/tests/testthat/test_mash_wrapper.R @@ -882,41 +882,38 @@ test_that("mash_rand_null_sample caps random sample at available rows", { test_that("mash_pipeline uses residual_correlation when null data is empty", { skip_if_not_installed("mashr") skip_if_not_installed("flashier") - skip_if_not_installed("udr") + n_cond <- 3 + n_strong <- 15 mock_input <- list( null.b = numeric(0), null.s = numeric(0), - random.b = matrix(rnorm(6), 3, 2), - random.s = matrix(abs(rnorm(6)), 3, 2), - strong.b = matrix(rnorm(20), 10, 2), - strong.s = matrix(abs(rnorm(20)), 10, 2), - strong.z = matrix(rnorm(20), 10, 2) + random.b = matrix(rnorm(n_strong * n_cond), n_strong, n_cond), + random.s = matrix(abs(rnorm(n_strong * n_cond)) + 0.1, n_strong, n_cond), + strong.b = matrix(rnorm(n_strong * n_cond), n_strong, n_cond), + strong.s = matrix(abs(rnorm(n_strong * n_cond)) + 0.1, n_strong, n_cond) ) - custom_vhat <- matrix(c(1, 0.3, 0.3, 1), 2, 2) + custom_vhat <- matrix(c(1, 0.3, 0.1, 0.3, 1, 0.2, 0.1, 0.2, 1), 3, 3) result <- mash_pipeline(mock_input, alpha = 1, residual_correlation = custom_vhat) expect_true(is.list(result)) - # Verify it did not produce an error marker - expect_null(result$error) }) test_that("mash_pipeline does not error about mashr when mashr is available", { skip_if_not_installed("mashr") skip_if_not_installed("flashier") - skip_if_not_installed("udr") + n_cond <- 3 + n_strong <- 15 mock_input <- list( null.b = numeric(0), null.s = numeric(0), - random.b = matrix(rnorm(6), 3, 2), - random.s = matrix(abs(rnorm(6)) + 0.1, 3, 2), - strong.b = matrix(rnorm(20), 10, 2), - strong.s = matrix(abs(rnorm(20)) + 0.1, 10, 2), - strong.z = matrix(rnorm(20), 10, 2) + random.b = matrix(rnorm(n_strong * n_cond), n_strong, n_cond), + random.s = matrix(abs(rnorm(n_strong * n_cond)) + 0.1, n_strong, n_cond), + strong.b = matrix(rnorm(n_strong * n_cond), n_strong, n_cond), + strong.s = matrix(abs(rnorm(n_strong * n_cond)) + 0.1, n_strong, n_cond) ) - # Should not error about missing mashr since it is installed result <- mash_pipeline(mock_input, alpha = 1) expect_true(is.list(result)) }) @@ -924,7 +921,6 @@ test_that("mash_pipeline does not error about mashr when mashr is available", { test_that("mash_pipeline uses identity matrix when null data empty and no residual_correlation", { skip_if_not_installed("mashr") skip_if_not_installed("flashier") - skip_if_not_installed("udr") n_cond <- 3 n_strong <- 15 @@ -934,13 +930,11 @@ test_that("mash_pipeline uses identity matrix when null data empty and no residu random.b = matrix(rnorm(n_strong * n_cond), n_strong, n_cond), random.s = matrix(abs(rnorm(n_strong * n_cond)) + 0.1, n_strong, n_cond), strong.b = matrix(rnorm(n_strong * n_cond), n_strong, n_cond), - strong.s = matrix(abs(rnorm(n_strong * n_cond)) + 0.1, n_strong, n_cond), - strong.z = matrix(rnorm(n_strong * n_cond), n_strong, n_cond) + strong.s = matrix(abs(rnorm(n_strong * n_cond)) + 0.1, n_strong, n_cond) ) result <- mash_pipeline(mock_input, alpha = 1, residual_correlation = NULL) expect_true(is.list(result)) - # Verify it did not produce an error marker expect_null(result$error) }) diff --git a/tests/testthat/test_plot.R b/tests/testthat/test_plot.R deleted file mode 100644 index 87c0da1a..00000000 --- a/tests/testthat/test_plot.R +++ /dev/null @@ -1,26 +0,0 @@ -context("plot") - -library(testthat) - -# ========================================================================= -# plot.R: venn function (lines 6-8) -# ========================================================================= - -test_that("venn function creates a plot object without error", { - skip_if_not_installed("ggvenn") - data <- list( - SuSiE = c("gene1", "gene2", "gene3"), - Lasso = c("gene2", "gene3", "gene4"), - Enet = c("gene1", "gene4", "gene5"), - MR.ASH = c("gene3", "gene5", "gene6") - ) - # Use pdf(NULL) to suppress graphics device output - pdf(NULL) - result <- tryCatch( - pecotmr:::venn(data), - error = function(e) e - ) - dev.off() - # Check it returned something (not an error) - expect_false(inherits(result, "error")) -}) From b677b8c352edf6f53fe4692c377aa78099da13a3 Mon Sep 17 00:00:00 2001 From: Daniel Nachun Date: Fri, 17 Apr 2026 14:06:05 -0700 Subject: [PATCH 3/5] update recipe --- .github/recipe/recipe.yaml | 7 ------- DESCRIPTION | 2 -- pixi.toml | 2 -- 3 files changed, 11 deletions(-) diff --git a/.github/recipe/recipe.yaml b/.github/recipe/recipe.yaml index d93c8751..cd5c3358 100644 --- a/.github/recipe/recipe.yaml +++ b/.github/recipe/recipe.yaml @@ -30,8 +30,6 @@ requirements: - bioconductor-snpstats - r-base - r-bglr - - r-bigsnpr - - r-bigstatsr - r-coda - r-coloc - r-colocboost @@ -75,8 +73,6 @@ requirements: - bioconductor-snpstats - r-base - r-bglr - - r-bigsnpr - - r-bigstatsr - r-coda - r-coloc - r-colocboost @@ -100,8 +96,6 @@ requirements: - r-pgenlibr - r-purrr - r-qgg - - r-quadprog - - r-quantreg - r-rcpp - r-rcpparmadillo - r-rcppdpr @@ -114,7 +108,6 @@ requirements: - r-tidyr - r-vctrs - r-vroom - - r-xicor tests: - script: diff --git a/DESCRIPTION b/DESCRIPTION index b81719cf..0bac8360 100644 --- a/DESCRIPTION +++ b/DESCRIPTION @@ -18,8 +18,6 @@ Imports: IRanges, Rcpp, S4Vectors, - bigsnpr, - bigstatsr, coloc, doFuture, dplyr, diff --git a/pixi.toml b/pixi.toml index 32db2954..384c6b3c 100644 --- a/pixi.toml +++ b/pixi.toml @@ -48,8 +48,6 @@ r45 = {features = ["r45"]} "bioconductor-snpstats" = "*" "r-base" = "*" "r-bglr" = "*" -"r-bigsnpr" = "*" -"r-bigstatsr" = "*" "r-coda" = "*" "r-coloc" = "*" "r-colocboost" = "*" From 2e9e7712865542b7d487f006274c23ec9a1e623e Mon Sep 17 00:00:00 2001 From: Daniel Nachun Date: Fri, 17 Apr 2026 16:58:20 -0700 Subject: [PATCH 4/5] clean up and optimize more code --- NAMESPACE | 3 + R/colocboost_pipeline.R | 216 ++++++++----------- R/encoloc.R | 175 +++++++-------- R/mr.R | 250 ++++++++++------------ tests/testthat/test_colocboost_pipeline.R | 39 ++++ tests/testthat/test_mr.R | 58 +++++ 6 files changed, 388 insertions(+), 353 deletions(-) diff --git a/NAMESPACE b/NAMESPACE index faf1e899..0d964b26 100644 --- a/NAMESPACE +++ b/NAMESPACE @@ -143,7 +143,9 @@ importFrom(dplyr,group_by) importFrom(dplyr,if_else) importFrom(dplyr,inner_join) importFrom(dplyr,intersect) +importFrom(dplyr,left_join) importFrom(dplyr,mutate) +importFrom(dplyr,n) importFrom(dplyr,pull) importFrom(dplyr,rename) importFrom(dplyr,row_number) @@ -157,6 +159,7 @@ importFrom(future,availableCores) importFrom(future,multicore) importFrom(future,plan) importFrom(magrittr,"%>%") +importFrom(matrixStats,colSds) importFrom(matrixStats,colVars) importFrom(purrr,compact) importFrom(purrr,exec) diff --git a/R/colocboost_pipeline.R b/R/colocboost_pipeline.R index ebdaae75..e9e4e30e 100644 --- a/R/colocboost_pipeline.R +++ b/R/colocboost_pipeline.R @@ -14,6 +14,20 @@ build_ld_args <- function(ld_list, subset = NULL) { if (is_geno) list(X_ref = ld_list) else list(LD = ld_list) } +# Run colocboost with tryCatch and timing. +# @noRd +.run_colocboost <- function(label, ...) { + t1 <- Sys.time() + res <- tryCatch( + colocboost(...), + error = function(e) { + message(label, " failed: ", conditionMessage(e)) + NULL + } + ) + list(result = res, time = Sys.time() - t1) +} + #' Multi-trait colocalization analysis pipeline #' #' This function perform a multi-trait colocalization using ColocBoost @@ -113,61 +127,54 @@ colocboost_analysis_pipeline <- function(region_data, return(filtered_events) } - # - extract contexts and studies from region data + # - extract contexts and studies from region data, handling both pre- and post-QC extract_contexts_studies <- function(region_data, phenotypes_init = NULL) { individual_data <- region_data$individual_data sumstat_data <- region_data$sumstat_data + phenotypes <- list("individual_contexts" = NULL, "sumstat_studies" = NULL) - if (is.null(phenotypes_init)) { - # - inital setup - phenotypes <- list("individual_contexts" = NULL, "sumstat_studies" = NULL) - if (!is.null(individual_data)) { + # Extract individual contexts + if (!is.null(individual_data)) { + if (is.null(phenotypes_init)) { phenotypes$individual_contexts <- names(individual_data$residual_Y) } else { - message("No individual data in this region!") - } - if (!is.null(sumstat_data)) { - phenotypes$sumstat_studies <- sapply(sumstat_data$sumstats, function(ss) names(ss)) %>% - unlist() %>% - as.vector() - } else { - message("No sumstat data in this region!") - } - } else { - # - after QC - phenotypes <- list("individual_contexts" = NULL, "sumstat_studies" = NULL) - if (!is.null(individual_data)) { null_Y <- which(sapply(individual_data$Y, is.null)) if (length(null_Y) == 0) { message("All individual data pass QC steps.") phenotypes$individual_contexts <- names(individual_data$Y) - } else if (length(null_Y) != length(individual_data$Y)) { + } else if (length(null_Y) < length(individual_data$Y)) { message(paste( "Skipping follow-up analysis for individual traits", paste(names(individual_data$Y)[null_Y], collapse = ";"), "after QC." )) phenotypes$individual_contexts <- names(individual_data$Y)[-null_Y] - } else if (length(null_Y) == length(individual_data$Y)) { + } else { message("No individual data pass QC.") } - } else { - message("No individual data pass QC.") } - if (!is.null(sumstat_data)) { + } else { + message(if (is.null(phenotypes_init)) "No individual data in this region!" else "No individual data pass QC.") + } + + # Extract sumstat studies + if (!is.null(sumstat_data)) { + if (is.null(phenotypes_init)) { + phenotypes$sumstat_studies <- unlist(sapply(sumstat_data$sumstats, names)) + } else { phenotypes$sumstat_studies <- names(sumstat_data$sumstats) - sumstat_studies_init <- phenotypes_init$sumstat_studies - if (length(sumstat_studies_init) == length(phenotypes$sumstat_studies)) { + if (length(phenotypes_init$sumstat_studies) == length(phenotypes$sumstat_studies)) { message("All sumstat studies pass QC steps.") } else { message(paste( "Skipping follow-up analysis for sumstat studies", - paste(setdiff(sumstat_studies_init, phenotypes$sumstat_studies), collapse = ";"), "after QC." + paste(setdiff(phenotypes_init$sumstat_studies, phenotypes$sumstat_studies), collapse = ";"), "after QC." )) } - } else { - message("No sumstat data pass QC.") } + } else { + message(if (is.null(phenotypes_init)) "No sumstat data in this region!" else "No sumstat data pass QC.") } + return(phenotypes) } @@ -243,13 +250,12 @@ colocboost_analysis_pipeline <- function(region_data, Y <- NULL } if (!is.null(Y)) { - Y <- lapply(seq_along(Y), function(i) { - y <- Y[[i]] - lapply(seq_len(ncol(y)), function(j) y[, j, drop = FALSE] %>% setNames(colnames(y)[j])) + Y_split <- purrr::imap(Y, function(y, i) { + purrr::map(seq_len(ncol(y)), function(j) setNames(y[, j, drop = FALSE], colnames(y)[j])) }) - dict_YX <- cbind(seq_along(Reduce("c", Y)), rep(seq_along(Y), sapply(Y, length))) - Y <- Reduce("c", Y) - Y <- Y %>% setNames(sapply(Y, colnames)) + dict_YX <- cbind(seq_along(unlist(Y_split, recursive = FALSE)), rep(seq_along(Y_split), purrr::map_int(Y_split, length))) + Y <- unlist(Y_split, recursive = FALSE) + names(Y) <- sapply(Y, colnames) } } else { X <- Y <- dict_YX <- NULL @@ -309,49 +315,28 @@ colocboost_analysis_pipeline <- function(region_data, # - run xQTL-only version of ColocBoost if (xqtl_coloc & !is.null(X)) { message(paste("====== Performing xQTL-only ColocBoost on", length(Y), "contexts. =====")) - t11 <- Sys.time() traits <- names(Y) - focal_outcome_idx <- NULL - if (!is.null(focal_trait)) { - if (focal_trait %in% traits) { - focal_outcome_idx <- which(traits == focal_trait) - } - } - res_xqtl <- tryCatch( - colocboost( - X = X, Y = Y, dict_YX = dict_YX, - outcome_names = traits, focal_outcome_idx = focal_outcome_idx, - output_level = 2, ... - ), - error = function(e) { - message("xQTL-only ColocBoost failed: ", conditionMessage(e)) - return(NULL) - } + focal_outcome_idx <- if (!is.null(focal_trait) && focal_trait %in% traits) which(traits == focal_trait) else NULL + cb_res <- .run_colocboost("xQTL-only ColocBoost", + X = X, Y = Y, dict_YX = dict_YX, + outcome_names = traits, focal_outcome_idx = focal_outcome_idx, + output_level = 2, ... ) - t12 <- Sys.time() - analysis_results$xqtl_coloc <- res_xqtl - analysis_results$computing_time$Analysis$xqtl_coloc <- t12 - t11 + analysis_results$xqtl_coloc <- cb_res$result + analysis_results$computing_time$Analysis$xqtl_coloc <- cb_res$time } # - run joint GWAS no focaled version of ColocBoost if (joint_gwas & !is.null(sumstats)) { message(paste("====== Performing non-focaled version GWAS-xQTL ColocBoost on", length(Y), "contexts and", length(sumstats), "GWAS. =====")) - t21 <- Sys.time() traits <- c(names(Y), names(sumstats)) ld_args <- build_ld_args(LD_mat) - res_gwas <- tryCatch( - do.call(colocboost, c(list( - X = X, Y = Y, sumstat = sumstats, - dict_YX = dict_YX, dict_sumstatLD = dict_sumstatLD, - outcome_names = traits, focal_outcome_idx = NULL, - output_level = 2), ld_args, list(...))), - error = function(e) { - message("Joint GWAS ColocBoost failed: ", conditionMessage(e)) - return(NULL) - } - ) - t22 <- Sys.time() - analysis_results$joint_gwas <- res_gwas - analysis_results$computing_time$Analysis$joint_gwas <- t22 - t21 + cb_res <- do.call(.run_colocboost, c(list("Joint GWAS ColocBoost", + X = X, Y = Y, sumstat = sumstats, + dict_YX = dict_YX, dict_sumstatLD = dict_sumstatLD, + outcome_names = traits, focal_outcome_idx = NULL, + output_level = 2), ld_args, list(...))) + analysis_results$joint_gwas <- cb_res$result + analysis_results$computing_time$Analysis$joint_gwas <- cb_res$time } # - run focaled version of ColocBoost for each GWAS if (separate_gwas & !is.null(sumstats)) { @@ -363,17 +348,13 @@ colocboost_analysis_pipeline <- function(region_data, dict <- dict_sumstatLD[i_gwas, ] traits <- c(names(Y), current_study) ld_args_sep <- build_ld_args(LD_mat, subset = dict[2]) - res_gwas_separate[[current_study]] <- tryCatch( - do.call(colocboost, c(list( + cb_res <- do.call(.run_colocboost, c( + list(paste("Separate GWAS ColocBoost for", current_study), X = X, Y = Y, sumstat = sumstats[dict[1]], dict_YX = dict_YX, outcome_names = traits, focal_outcome_idx = length(traits), - output_level = 2), ld_args_sep, list(...))), - error = function(e) { - message("Separate GWAS ColocBoost failed for ", current_study, ": ", conditionMessage(e)) - return(NULL) - } - ) + output_level = 2), ld_args_sep, list(...))) + res_gwas_separate[[current_study]] <- cb_res$result } t32 <- Sys.time() analysis_results$separate_gwas <- res_gwas_separate @@ -559,37 +540,27 @@ qc_regional_data <- function(region_data, pip_cutoff_to_skip_ind = 0) { # - add context to colname of Y Y <- add_context_to_Y(Y) - n_context <- length(X) - residual_X <- residual_Y <- list() - keep_contexts <- c() - for (i_context in 1:n_context) { - resX <- X[[i_context]] - resY <- Y[[i_context]] - maf <- MAF[[i_context]] - context <- names(Y)[i_context] - if (is.null(resY)) next - # - remove variants with maf < maf_cutoff - # tmp <- filter_resX_maf(resX, maf, maf_cutoff = maf_cutoff) + results <- purrr::imap(X, function(resX, context) { + resY <- Y[[context]] + maf <- MAF[[context]] + i_context <- match(context, names(X)) + if (is.null(resY)) return(NULL) resX <- filter_X(resX, missing_rate_thresh = NULL, maf_thresh = maf_cutoff, maf = maf) - # Initial PIP check resY <- filter_resY_pip(resX, resY, pip_cutoff = pip_cutoff_to_skip_ind[i_context], context = context) - if (!is.null(resY)) { - residual_X <- c(residual_X, list(resX)) - residual_Y <- c(residual_Y, list(resY)) - keep_contexts <- c(keep_contexts, context) - } - } - if (length(keep_contexts) == 0) { - message(paste("Skipping follow-up analysis for all contexts.")) + if (is.null(resY)) return(NULL) + list(X = resX, Y = resY) + }) %>% purrr::compact() + + if (length(results) == 0) { + message("Skipping follow-up analysis for all contexts.") return(NULL) - } else { - message(paste("Region includes the following contexts after inital screening:", paste(keep_contexts, collapse = ";"), ".")) - names(residual_X) <- names(residual_Y) <- keep_contexts - return(list( - X = residual_X, - Y = residual_Y - )) } + keep_contexts <- names(results) + message(paste("Region includes the following contexts after inital screening:", paste(keep_contexts, collapse = ";"), ".")) + list( + X = purrr::map(results, "X"), + Y = purrr::map(results, "Y") + ) } # - individual level data QC @@ -622,8 +593,13 @@ qc_regional_data <- function(region_data, impute = TRUE, impute_opts = list(rcond = 0.01, R2_threshold = 0.6, minimum_ld = 5, lamb = 0.01)) { n_LD <- length(sumstat_data$LD_info) - final_sumstats <- final_LD <- NULL - LD_match <- c() + # Collect results into lists and flatten at the end + collected_sumstats <- list() + collected_LD <- list() + collected_LD_match <- character() + # Track LD matrices by variant signature for O(1) deduplication + ld_variant_index <- list() + for (i in 1:n_LD) { LD_data <- sumstat_data$LD_info[[i]] sumstats <- sumstat_data$sumstats[[i]] @@ -710,27 +686,21 @@ qc_regional_data <- function(region_data, mat_to_store <- R_mat } - # Deduplicate: reuse existing matrix if variants match - if (length(final_LD) == 0) { - final_LD <- c(final_LD, list(mat_to_store) %>% setNames(conditions_sumstat)) - final_sumstats <- c(final_sumstats, list(sumstat) %>% setNames(conditions_sumstat)) - LD_match <- c(LD_match, conditions_sumstat) + # Collect sumstat + collected_sumstats[[conditions_sumstat]] <- sumstat + + # Deduplicate LD using variant signature hash + variant_key <- paste(colnames(mat_to_store), collapse = ",") + if (variant_key %in% names(ld_variant_index)) { + collected_LD_match <- c(collected_LD_match, ld_variant_index[[variant_key]]) } else { - variants <- colnames(mat_to_store) - final_sumstats <- c(final_sumstats, list(sumstat) %>% setNames(conditions_sumstat)) - exist_variants <- lapply(final_LD, colnames) - if_exist <- sapply(exist_variants, function(v) all(variants == v)) - pos <- which(if_exist) - if (length(pos) == 0) { - final_LD <- c(final_LD, list(mat_to_store) %>% setNames(conditions_sumstat)) - LD_match <- c(LD_match, conditions_sumstat) - } else { - LD_match <- c(LD_match, names(final_LD)[pos[1]]) - } + collected_LD[[conditions_sumstat]] <- mat_to_store + ld_variant_index[[variant_key]] <- conditions_sumstat + collected_LD_match <- c(collected_LD_match, conditions_sumstat) } } } - return(list(sumstats = final_sumstats, LD_mat = final_LD, LD_match = LD_match)) + return(list(sumstats = collected_sumstats, LD_mat = collected_LD, LD_match = collected_LD_match)) } diff --git a/R/encoloc.R b/R/encoloc.R index 1482a836..0a6a75f5 100644 --- a/R/encoloc.R +++ b/R/encoloc.R @@ -31,21 +31,20 @@ xqtl_enrichment_wrapper <- function(xqtl_files, gwas_files, xqtl_finemapping_obj = NULL, gwas_finemapping_obj = NULL, xqtl_varname_obj = NULL, gwas_varname_obj = NULL) { # Load and process GWAS data - gwas_pip <- list() - for (file in gwas_files) { - raw_data <- readRDS(file)[[1]] # changed + gwas_pip_list <- purrr::map(gwas_files, function(file) { + raw_data <- readRDS(file)[[1]] gwas_data <- if (!is.null(gwas_finemapping_obj)) get_nested_element(raw_data, gwas_finemapping_obj) else raw_data pip <- gwas_data$pip if (!is.null(gwas_varname_obj)) names(pip) <- get_nested_element(raw_data, gwas_varname_obj) - gwas_pip <- c(gwas_pip, list(pip)) - } + pip + }) # Check for unique variant names in GWAS pip vectors - all_variant_names <- unique(unlist(lapply(gwas_pip, names))) + all_variant_names <- unique(unlist(purrr::map(gwas_pip_list, names))) if (length(unique(all_variant_names)) != length(all_variant_names)) { stop("Non-unique variant names found in GWAS data with different pip values.") } - gwas_pip <- unlist(gwas_pip) + gwas_pip <- unlist(gwas_pip_list) # Process xQTL data xqtl_data <- lapply(xqtl_files, function(file) { @@ -93,15 +92,9 @@ filter_and_order_coloc_results <- function(coloc_results_fil) { } cs_num <- ncol(coloc_results_fil) - 1 - ordered_results <- list() - - for (n in 1:cs_num) { - # Selecting relevant columns and ordering - tmp_coloc_results_fil <- coloc_results_fil[, c(1, n + 1)] %>% .[order(.[, 2], decreasing = TRUE), ] - ordered_results[[n]] <- tmp_coloc_results_fil - } - - return(ordered_results) + purrr::map(seq_len(cs_num), function(n) { + coloc_results_fil[, c(1, n + 1)] %>% .[order(.[, 2], decreasing = TRUE), ] + }) } #' Function to calculate cumulative sum @@ -160,30 +153,21 @@ process_coloc_results <- function(coloc_result, LD_meta_file_path, analysis_regi # prepare to calculate purity ordered_results <- filter_and_order_coloc_results(coloc_results_fil) - cs <- list() - purity <- NULL + cs <- purrr::map(ordered_results, function(res) { + csm <- calculate_cumsum(res) + res[, 1][1:min(which(csm > coverage))] + }) - for (n in seq_along(ordered_results)) { - tmp_coloc_results_fil <- ordered_results[[n]] - tmp_coloc_results_fil_csm <- calculate_cumsum(tmp_coloc_results_fil) - cs[[n]] <- tmp_coloc_results_fil[, 1][1:(which(tmp_coloc_results_fil_csm > coverage) %>% min())] + purity <- purrr::map_dfr(seq_along(cs), function(n) { variants <- normalize_variant_id(cs[[n]]) - - # Load LD for the region narrowed to actual variant positions - ext_ld <- extract_ld_for_variants(LD_meta_file_path, analysis_region, variants) - - # Calculate purity if (null_index > 0 && null_index %in% variants) { - purity <- rbind(purity, c(-9, -9, -9)) + data.frame(min.abs.corr = -9, mean.abs.corr = -9, median.abs.corr = -9) } else { - current_purity <- calculate_purity(variants, ext_ld) - purity <- rbind(purity, current_purity) + ext_ld <- extract_ld_for_variants(LD_meta_file_path, analysis_region, variants) + p <- calculate_purity(variants, ext_ld) + data.frame(min.abs.corr = p[1, 1], mean.abs.corr = p[1, 2], median.abs.corr = p[1, 3]) } - } - - # Process purity data - purity <- as.data.frame(purity) - colnames(purity) <- c("min.abs.corr", "mean.abs.corr", "median.abs.corr") + }) is_pure <- which(purity[, 1] >= min_abs_corr) # Finalize the result @@ -201,6 +185,49 @@ process_coloc_results <- function(coloc_result, LD_meta_file_path, analysis_regi return(coloc_res) } +# Extract and filter an LBF matrix from a finemapped data object. +# @noRd +.extract_lbf_matrix <- function(raw_data, finemapping_obj, varname_obj, + filter_lbf_cs, filter_lbf_cs_secondary, prior_tol) { + fm_data <- if (!is.null(finemapping_obj)) { + tryCatch(get_nested_element(raw_data, finemapping_obj), + error = function(e) { + message(paste("no", finemapping_obj[2], "in", finemapping_obj[1])) + NULL + } + ) + } else { + raw_data + } + if (is.null(fm_data)) return(NULL) + + lbf_matrix <- as.data.frame(fm_data$lbf_variable) + # fSuSiE has a different structure + if (is.null(lbf_matrix) || nrow(lbf_matrix) == 0) { + lbf_matrix <- do.call(rbind, raw_data[[1]]$fsusie_result$lBF) %>% as.data.frame() + if (nrow(lbf_matrix) > 0) message("This is a fSuSiE case") + } + + # Filter rows + if (filter_lbf_cs && is.null(filter_lbf_cs_secondary)) { + lbf_matrix <- lbf_matrix[fm_data$sets$cs_index, , drop = FALSE] + } else if (!is.null(filter_lbf_cs_secondary)) { + lbf_matrix <- lbf_matrix[get_filter_lbf_index(fm_data, coverage = filter_lbf_cs_secondary), , drop = FALSE] + } else { + if ("V" %in% names(fm_data)) { + lbf_matrix <- lbf_matrix[fm_data$V > prior_tol, , drop = FALSE] + } else { + message("No V found in original data.") + } + } + + # Set variant names and remove NA columns + if (!is.null(varname_obj)) colnames(lbf_matrix) <- get_nested_element(raw_data, varname_obj) + lbf_matrix <- lbf_matrix[, !is.na(colnames(lbf_matrix))] + + list(lbf_matrix = lbf_matrix, fm_data = fm_data) +} + #' Colocalization Analysis Wrapper #' #' This function processes xQTL and multiple GWAS finemapped data files for colocalization analysis. @@ -231,86 +258,36 @@ coloc_wrapper <- function(xqtl_file, gwas_files, gwas_finemapping_obj = NULL, gwas_varname_obj = NULL, gwas_region_obj = NULL, filter_lbf_cs = FALSE, filter_lbf_cs_secondary = NULL, prior_tol = 1e-9, p1 = 1e-4, p2 = 1e-4, p12 = 5e-6, ...) { - region <- NULL # define first to avoid element can not be found + region <- NULL # Load and process GWAS data - gwas_lbf_matrices <- lapply(gwas_files, function(file) { + gwas_lbf_matrices <- purrr::map(gwas_files, function(file) { raw_data <- readRDS(file)[[1]] - gwas_data <- if (!is.null(gwas_finemapping_obj)) get_nested_element(raw_data, gwas_finemapping_obj) else raw_data - gwas_lbf_matrix <- as.data.frame(gwas_data$lbf_variable) - # fsusie has a different structure - if(is.null(gwas_lbf_matrix) || nrow(gwas_lbf_matrix)==0){ - gwas_lbf_matrix <- do.call(rbind, raw_data[[1]]$fsusie_result$lBF) %>% as.data.frame - if(nrow(gwas_lbf_matrix) > 0) message("This is a fSuSiE case") - } - if (filter_lbf_cs & is.null(filter_lbf_cs_secondary)) { - gwas_lbf_matrix <- gwas_lbf_matrix[gwas_data$sets$cs_index,, drop = FALSE] - } else if (!is.null(filter_lbf_cs_secondary)) { - gwas_lbf_filter_index <- get_filter_lbf_index(gwas_data, coverage = filter_lbf_cs_secondary) - gwas_lbf_matrix <- gwas_lbf_matrix[gwas_lbf_filter_index,, drop = FALSE] - } else { - gwas_lbf_matrix <- gwas_lbf_matrix[gwas_data$V > prior_tol,, drop = FALSE] - } - if (!is.null(gwas_varname_obj)) colnames(gwas_lbf_matrix) <- get_nested_element(raw_data, gwas_varname_obj) - # fsusie could have NA in variant name - gwas_lbf_matrix <- gwas_lbf_matrix[,!is.na(colnames(gwas_lbf_matrix))] - return(gwas_lbf_matrix) + .extract_lbf_matrix(raw_data, gwas_finemapping_obj, gwas_varname_obj, + filter_lbf_cs, filter_lbf_cs_secondary, prior_tol)$lbf_matrix }) - # changed: need to remove this check, as the last variant of former block could overlapped with first variant of later block - # # Validate uniqueness of column names across GWAS matrices - # all_gwas_colnames <- unique(unlist(lapply(gwas_lbf_matrices, colnames))) - # if (length(all_gwas_colnames) != sum(sapply(gwas_lbf_matrices, ncol))) { - # stop("Duplicate variant names found across GWAS regions analyzed. This is not expected.") - # } # Combine GWAS matrices and replace NAs with zeros combined_gwas_lbf_matrix <- bind_rows(gwas_lbf_matrices) %>% mutate(across(everything(), ~ replace_na(., 0))) # Process xQTL data xqtl_raw_data <- readRDS(xqtl_file)[[1]] - xqtl_data <- if (!is.null(xqtl_finemapping_obj)) { - tryCatch( - { - get_nested_element(xqtl_raw_data, xqtl_finemapping_obj) - }, - error = function(e) { - message(paste("no", xqtl_finemapping_obj[2], "in", xqtl_finemapping_obj[1])) - NULL - } - ) - } else { - xqtl_raw_data - } - if (!is.null(xqtl_data)) { - xqtl_lbf_matrix <- as.data.frame(xqtl_data$lbf_variable) - # fsusie has a different structure - if(is.null(xqtl_lbf_matrix) | nrow(xqtl_lbf_matrix)==0){ - xqtl_lbf_matrix <- do.call(rbind, xqtl_raw_data[[1]]$fsusie_result$lBF) %>% as.data.frame - if(nrow(xqtl_lbf_matrix) > 0) message("This is a fSuSiE case") - } - # fsusie data does not have V element in results - if (filter_lbf_cs & is.null(filter_lbf_cs_secondary)) { - xqtl_lbf_matrix <- xqtl_lbf_matrix[xqtl_data$sets$cs_index, , drop = FALSE] - } else if (!is.null(filter_lbf_cs_secondary)) { - xqtl_lbf_filter_index <- get_filter_lbf_index(xqtl_data, coverage = filter_lbf_cs_secondary) - xqtl_lbf_matrix <- xqtl_lbf_matrix[xqtl_lbf_filter_index,, drop = FALSE] - } else { - if ("V" %in% names(xqtl_data)) xqtl_lbf_matrix <- xqtl_lbf_matrix[xqtl_data$V > prior_tol, , drop = FALSE] else (message("No V found in original data.")) - } + xqtl_extracted <- .extract_lbf_matrix(xqtl_raw_data, xqtl_finemapping_obj, xqtl_varname_obj, + filter_lbf_cs, filter_lbf_cs_secondary, prior_tol) + + if (!is.null(xqtl_extracted)) { + xqtl_lbf_matrix <- xqtl_extracted$lbf_matrix if (nrow(combined_gwas_lbf_matrix) > 0 && nrow(xqtl_lbf_matrix) > 0) { - if (!is.null(xqtl_varname_obj)) colnames(xqtl_lbf_matrix) <- get_nested_element(xqtl_raw_data, xqtl_varname_obj) - # fsusie could have NA in variant name - xqtl_lbf_matrix <- xqtl_lbf_matrix[,!is.na(colnames(xqtl_lbf_matrix))] - colnames(xqtl_lbf_matrix) <- align_variant_names(colnames(xqtl_lbf_matrix), colnames(combined_gwas_lbf_matrix))$aligned_variants common_colnames <- intersect(colnames(xqtl_lbf_matrix), colnames(combined_gwas_lbf_matrix)) - xqtl_lbf_matrix <- xqtl_lbf_matrix[, common_colnames, drop = FALSE] %>% as.matrix() - combined_gwas_lbf_matrix <- combined_gwas_lbf_matrix[, common_colnames, drop = FALSE] %>% as.matrix() - # Report the number of dropped columns from xQTL matrix - num_dropped_cols <- length(setdiff(colnames(xqtl_lbf_matrix), common_colnames)) + # Report the number of dropped columns from xQTL matrix before subsetting + num_dropped_cols <- ncol(xqtl_lbf_matrix) - length(common_colnames) message("Number of columns dropped from xQTL matrix: ", num_dropped_cols) + xqtl_lbf_matrix <- xqtl_lbf_matrix[, common_colnames, drop = FALSE] %>% as.matrix() + combined_gwas_lbf_matrix <- combined_gwas_lbf_matrix[, common_colnames, drop = FALSE] %>% as.matrix() + # Function to convert region df to str convert_to_string <- function(df) paste0("chr", df$chrom, ":", df$start, "-", df$end) region <- if (!is.null(xqtl_region_obj)) get_nested_element(xqtl_raw_data, xqtl_region_obj) %>% convert_to_string() else NULL diff --git a/R/mr.R b/R/mr.R index cae7da9a..7d4c0482 100644 --- a/R/mr.R +++ b/R/mr.R @@ -7,6 +7,20 @@ calc_I2 <- function(Q, Est) { return(if (I2 < 0) 0 else I2) } +# Create a null data frame with gene_name and NA columns for MR pipeline outputs. +# @noRd +.create_null_mr_df <- function(gene_name, col_spec) { + n <- length(gene_name) + cols <- purrr::map(col_spec, function(type) { + switch(type, + character = as.character(rep(NA, n)), + integer = as.integer(rep(NA, n)), + numeric = as.numeric(rep(NA, n)) + ) + }) + do.call(data.frame, c(list(gene_name = gene_name), cols, list(stringsAsFactors = FALSE))) +} + #' MR Format Function #' #' Description of what the function does. @@ -22,84 +36,61 @@ calc_I2 <- function(Q, Est) { #' @export mr_format <- function(susie_result, condition, gwas_sumstats_db, coverage = "cs_coverage_0.95", run_allele_qc = TRUE, molecular_name_obj = c("susie_results", condition, "region_info", "region_name"), ld_meta_df) { - # Create null mr_format_input - create_null_mr_input <- function(gene_name) { - mr_format_input <- data.frame( - gene_name = gene_name, - variant_id = as.character(rep(NA, length(gene_name))), - bhat_x = as.numeric(rep(NA, length(gene_name))), - sbhat_x = as.numeric(rep(NA, length(gene_name))), - cs = as.numeric(rep(NA, length(gene_name))), - pip = as.numeric(rep(NA, length(gene_name))), - bhat_y = as.numeric(rep(NA, length(gene_name))), - sbhat_y = as.numeric(rep(NA, length(gene_name))), - stringsAsFactors = FALSE # Optional, to prevent factors - ) - } + mr_format_spec <- c(variant_id = "character", bhat_x = "numeric", sbhat_x = "numeric", + cs = "numeric", pip = "numeric", bhat_y = "numeric", sbhat_y = "numeric") gene_name <- unique(get_nested_element(susie_result, molecular_name_obj)) - # Attempt to retrieve top_loci; if not found, return NULL + # Attempt to retrieve top_loci; if not found, return null top_loci <- tryCatch( - { - get_nested_element(susie_result, c("susie_results", condition, "top_loci")) - }, + get_nested_element(susie_result, c("susie_results", condition, "top_loci")), error = function(e) { message("top_loci does not exist for the specified condition in susie_result.") - return(NULL) + NULL } ) - if (is.data.frame(top_loci)) { - if (any(unique(get_nested_element(top_loci, coverage)) != 0)) { - susie_cs_result_formatted <- top_loci %>% - mutate(gene_name = gene_name) %>% - filter(coverage >= 1) %>% - mutate(variant = strip_chr_prefix(variant_id)) %>% - select(gene_name, variant, betahat, sebetahat, all_of(coverage), pip) %>% - rename("bhat_x" = "betahat", "sbhat_x" = "sebetahat", "cs" = all_of(coverage)) - susie_pos <- sapply(susie_cs_result_formatted$variant, function(variant_id) strsplit(variant_id, "\\:")[[1]][2]) - gwas_pos <- sapply(gwas_sumstats_db$variant_id, function(variant_id) strsplit(variant_id, "\\:")[[1]][2]) - if (any(susie_pos %in% gwas_pos)) { - gwas_sumstats_db_extracted <- gwas_sumstats_db %>% - filter(pos %in% susie_pos) %>% - mutate(n_sample = if ("n_sample" %in% colnames(.)) n_sample else (n_case + n_control)) - mean_n_sample <- round(mean(gwas_sumstats_db_extracted$n_sample, na.rm = TRUE)) - # Impute `n_sample` and `maf` - if (any(is.na(gwas_sumstats_db_extracted$effect_allele_frequency))) { - gwas_sumstats_db_extracted_imputed <- gwas_sumstats_db_extracted %>% - left_join(ld_meta_df %>% select(pos, allele_freq), by = "pos") %>% - mutate(effect_allele_frequency = ifelse(is.na(effect_allele_frequency), allele_freq, effect_allele_frequency)) %>% - mutate(n_sample = ifelse(is.na(n_sample), mean_n_sample, n_sample)) %>% - select(-allele_freq) - } else { - gwas_sumstats_db_extracted_imputed <- gwas_sumstats_db_extracted - } - gwas_sumstats_db_beta_se <- z_to_beta_se(gwas_sumstats_db_extracted_imputed$z, gwas_sumstats_db_extracted_imputed$effect_allele_frequency, gwas_sumstats_db_extracted_imputed$n_sample) - gwas_sumstats_db_extracted_imputed <- gwas_sumstats_db_extracted_imputed %>% mutate(beta = gwas_sumstats_db_beta_se$beta, se = gwas_sumstats_db_beta_se$se) - if (run_allele_qc) { - susie_cs_result_formatted <- allele_qc(cbind(variant_id_to_df(susie_cs_result_formatted$variant), susie_cs_result_formatted), - gwas_sumstats_db_extracted_imputed$variant_id, c("bhat_x"), - match_min_prop = 0 - ) - susie_cs_result_formatted <- susie_cs_result_formatted$target_data_qced[, c("gene_name", "variant_id", "bhat_x", "sbhat_x", "cs", "pip")] - } - # Normalize variant IDs to canonical format for matching - gwas_sumstats_db_extracted_imputed$variant_id <- normalize_variant_id(gwas_sumstats_db_extracted_imputed$variant_id) - susie_cs_gwas_variants_merge <- intersect(susie_cs_result_formatted$variant_id, gwas_sumstats_db_extracted_imputed$variant_id) + if (!is.data.frame(top_loci)) return(.create_null_mr_df(gene_name, mr_format_spec)) + if (!any(unique(get_nested_element(top_loci, coverage)) != 0)) return(.create_null_mr_df(gene_name, mr_format_spec)) - mr_format_input <- susie_cs_result_formatted[match(susie_cs_gwas_variants_merge, susie_cs_result_formatted$variant), ] %>% - cbind(., gwas_sumstats_db_extracted_imputed[match(susie_cs_gwas_variants_merge, gwas_sumstats_db_extracted_imputed$variant_id), ] %>% - select(beta, se) %>% - rename("bhat_y" = "beta", "sbhat_y" = "se")) - } else { - mr_format_input <- create_null_mr_input(gene_name) - } - } else { - mr_format_input <- create_null_mr_input(gene_name) - } - } else { - mr_format_input <- create_null_mr_input(gene_name) + susie_cs_result_formatted <- top_loci %>% + mutate(gene_name = gene_name) %>% + filter(coverage >= 1) %>% + mutate(variant = strip_chr_prefix(variant_id)) %>% + select(gene_name, variant, betahat, sebetahat, all_of(coverage), pip) %>% + rename("bhat_x" = "betahat", "sbhat_x" = "sebetahat", "cs" = all_of(coverage)) + + susie_pos <- stringr::str_split_i(susie_cs_result_formatted$variant, ":", 2) + gwas_pos <- stringr::str_split_i(gwas_sumstats_db$variant_id, ":", 2) + if (!any(susie_pos %in% gwas_pos)) return(.create_null_mr_df(gene_name, mr_format_spec)) + + gwas_sumstats_db_extracted <- gwas_sumstats_db %>% + filter(pos %in% susie_pos) %>% + mutate(n_sample = if ("n_sample" %in% colnames(.)) n_sample else (n_case + n_control)) + mean_n_sample <- round(mean(gwas_sumstats_db_extracted$n_sample, na.rm = TRUE)) + # Impute `n_sample` and `maf` + if (any(is.na(gwas_sumstats_db_extracted$effect_allele_frequency))) { + gwas_sumstats_db_extracted <- gwas_sumstats_db_extracted %>% + left_join(ld_meta_df %>% select(pos, allele_freq), by = "pos") %>% + mutate(effect_allele_frequency = ifelse(is.na(effect_allele_frequency), allele_freq, effect_allele_frequency)) %>% + mutate(n_sample = ifelse(is.na(n_sample), mean_n_sample, n_sample)) %>% + select(-allele_freq) + } + gwas_beta_se <- z_to_beta_se(gwas_sumstats_db_extracted$z, gwas_sumstats_db_extracted$effect_allele_frequency, gwas_sumstats_db_extracted$n_sample) + gwas_sumstats_db_extracted <- gwas_sumstats_db_extracted %>% mutate(beta = gwas_beta_se$beta, se = gwas_beta_se$se) + if (run_allele_qc) { + susie_cs_result_formatted <- allele_qc(cbind(variant_id_to_df(susie_cs_result_formatted$variant), susie_cs_result_formatted), + gwas_sumstats_db_extracted$variant_id, c("bhat_x"), + match_min_prop = 0 + ) + susie_cs_result_formatted <- susie_cs_result_formatted$target_data_qced[, c("gene_name", "variant_id", "bhat_x", "sbhat_x", "cs", "pip")] } - return(mr_format_input) + # Normalize variant IDs to canonical format for matching + gwas_sumstats_db_extracted$variant_id <- normalize_variant_id(gwas_sumstats_db_extracted$variant_id) + common_variants <- intersect(susie_cs_result_formatted$variant_id, gwas_sumstats_db_extracted$variant_id) + + susie_cs_result_formatted[match(common_variants, susie_cs_result_formatted$variant), ] %>% + cbind(., gwas_sumstats_db_extracted[match(common_variants, gwas_sumstats_db_extracted$variant_id), ] %>% + select(beta, se) %>% + rename("bhat_y" = "beta", "sbhat_y" = "se")) } #' Mendelian Randomization (MR) @@ -110,28 +101,16 @@ mr_format <- function(susie_result, condition, gwas_sumstats_db, coverage = "cs_ #' "meta_eff", "se_meta_eff", "meta_pval", "Q", "Q_pval" and "I2". "gene_name" is ensemble ID. "num_CS" is the number of credible sets #' contained in each gene, "num_IV" is the number of variants contained in each gene. "meta_eff", "se_meta_eff" and "meta_pval" are the MR estimate, standard error and pvalue. #' "Q" is Cochran's Q statistic, "I2" quantifies the heterogeneity, range from 0 to 1. -#' @importFrom dplyr mutate group_by filter ungroup distinct arrange select +#' @importFrom dplyr mutate group_by filter ungroup distinct arrange select summarise n left_join #' @importFrom magrittr %>% #' @importFrom stats pnorm pchisq #' @export mr_analysis <- function(mr_formatted_input, cpip_cutoff = 0.5) { - create_null_output <- function(gene_name) { - data.frame( - gene_name = gene_name, - num_CS = as.integer(rep(NA, length(gene_name))), - num_IV = as.integer(rep(NA, length(gene_name))), - cpip = as.numeric(rep(NA, length(gene_name))), - meta_eff = as.numeric(rep(NA, length(gene_name))), - se_meta_eff = as.numeric(rep(NA, length(gene_name))), - meta_pval = as.numeric(rep(NA, length(gene_name))), - Q = as.numeric(rep(NA, length(gene_name))), - Q_pval = as.numeric(rep(NA, length(gene_name))), - I2 = as.numeric(rep(NA, length(gene_name))), - stringsAsFactors = FALSE - ) - } + mr_output_spec <- c(num_CS = "integer", num_IV = "integer", cpip = "numeric", + meta_eff = "numeric", se_meta_eff = "numeric", meta_pval = "numeric", + Q = "numeric", Q_pval = "numeric", I2 = "numeric") if (all(is.na(mr_formatted_input[, -1]))) { - return(create_null_output(unique(mr_formatted_input$gene_name))) + return(.create_null_mr_df(unique(mr_formatted_input$gene_name), mr_output_spec)) } output <- mr_formatted_input %>% mutate( @@ -140,51 +119,60 @@ mr_analysis <- function(mr_formatted_input, cpip_cutoff = 0.5) { ) %>% group_by(gene_name, cs) %>% mutate(cpip = sum(pip)) %>% - filter(cpip >= cpip_cutoff) # Cumulative pip greater than a user defined cumulative pip threshold + filter(cpip >= cpip_cutoff) - if (dim(output)[1] != 0) { - output <- output %>% - group_by(gene_name, cs) %>% - mutate( - beta_yx = bhat_y / bhat_x, - se_yx = sqrt((sbhat_y^2 / bhat_x^2) + ((bhat_y^2 * sbhat_x^2) / bhat_x^4)), - composite_bhat = sum((beta_yx * pip) / cpip), - composite_sbhat = sum((beta_yx^2 + se_yx^2) * pip / cpip) - ) %>% - mutate( - composite_sbhat = sqrt(composite_sbhat - composite_bhat^2), - wv = composite_sbhat^-2 - ) %>% - ungroup() %>% - mutate( - meta_eff = sum(unique(wv) * unique(composite_bhat)), - sum_w = sum(unique(wv)), - se_meta_eff = sqrt(sum_w^-1), - num_CS = length(unique(cs)) - ) %>% - mutate( - num_IV = length(variant_id), - meta_eff = meta_eff / sum_w, - meta_pval = 2 * pnorm(abs(meta_eff) / se_meta_eff, lower.tail = FALSE), - Q = sum(unique(wv) * (unique(composite_bhat) - unique(meta_eff))^2), - I2 = calc_I2(Q, composite_bhat), - Q_pval = pchisq(Q, df = length(unique(composite_bhat)) - 1, lower = FALSE) - ) %>% - ungroup() %>% - distinct(gene_name, .keep_all = TRUE) %>% - mutate( - cpip = round(cpip, 3), - meta_pval = round(meta_pval, 3), - meta_eff = round(meta_eff, 3), - se_meta_eff = round(se_meta_eff, 3), - Q = round(Q, 3), - Q_pval = round(Q_pval, 3), - I2 = round(I2, 3) - ) %>% - arrange(meta_pval) %>% - select(gene_name, num_CS, num_IV, cpip, meta_eff, se_meta_eff, meta_pval, Q, Q_pval, I2) - } else { - return(create_null_output(unique(mr_formatted_input$gene_name))) + if (nrow(output) == 0) { + return(.create_null_mr_df(unique(mr_formatted_input$gene_name), mr_output_spec)) } - output + + # Compute per-CS composite estimates + cs_summary <- output %>% + group_by(gene_name, cs) %>% + summarise( + cpip = first(cpip), + composite_bhat = sum((bhat_y / bhat_x * pip) / cpip), + composite_sbhat = sqrt( + sum(((bhat_y / bhat_x)^2 + (sbhat_y^2 / bhat_x^2) + ((bhat_y^2 * sbhat_x^2) / bhat_x^4)) * pip / cpip) - + sum((bhat_y / bhat_x * pip) / cpip)^2 + ), + .groups = "drop" + ) %>% + mutate(wv = composite_sbhat^-2) + + # Compute gene-level meta-analysis + gene_summary <- cs_summary %>% + group_by(gene_name) %>% + summarise( + num_CS = n(), + cpip = first(cpip), + sum_w = sum(wv), + meta_eff = sum(wv * composite_bhat) / sum(wv), + se_meta_eff = sqrt(1 / sum(wv)), + Q = sum(wv * (composite_bhat - sum(wv * composite_bhat) / sum(wv))^2), + I2 = calc_I2(Q, composite_bhat), + Q_pval = pchisq(Q, df = n() - 1, lower = FALSE), + .groups = "drop" + ) %>% + mutate( + meta_pval = 2 * pnorm(abs(meta_eff) / se_meta_eff, lower.tail = FALSE) + ) + + # Add num_IV from original output + iv_counts <- output %>% + group_by(gene_name) %>% + summarise(num_IV = n(), .groups = "drop") + + gene_summary %>% + left_join(iv_counts, by = "gene_name") %>% + mutate( + cpip = round(cpip, 3), + meta_eff = round(meta_eff, 3), + se_meta_eff = round(se_meta_eff, 3), + meta_pval = round(meta_pval, 3), + Q = round(Q, 3), + Q_pval = round(Q_pval, 3), + I2 = round(I2, 3) + ) %>% + arrange(meta_pval) %>% + select(gene_name, num_CS, num_IV, cpip, meta_eff, se_meta_eff, meta_pval, Q, Q_pval, I2) } diff --git a/tests/testthat/test_colocboost_pipeline.R b/tests/testthat/test_colocboost_pipeline.R index 23277bdf..c76ebb78 100644 --- a/tests/testthat/test_colocboost_pipeline.R +++ b/tests/testthat/test_colocboost_pipeline.R @@ -1916,4 +1916,43 @@ test_that("qc_regional_data: with only sumstat data processes correctly", { expect_null(result$individual_data) expect_true(!is.null(result$sumstat_data)) }) + +# =========================================================================== +# build_ld_args +# =========================================================================== + +test_that("build_ld_args returns LD for square matrices", { + m <- matrix(1, 5, 5) + result <- pecotmr:::build_ld_args(list(m)) + expect_true("LD" %in% names(result)) + expect_null(result$X_ref) +}) + +test_that("build_ld_args returns X_ref for non-square (genotype) matrices", { + m <- matrix(1, 100, 5) # samples x variants + result <- pecotmr:::build_ld_args(list(m)) + expect_true("X_ref" %in% names(result)) + expect_null(result$LD) +}) + +test_that("build_ld_args applies subset correctly", { + m1 <- matrix(1, 5, 5) + m2 <- matrix(2, 5, 5) + result <- pecotmr:::build_ld_args(list(m1, m2), subset = 2) + expect_length(result$LD, 1) + expect_equal(result$LD[[1]][1, 1], 2) +}) + +# =========================================================================== +# .run_colocboost +# =========================================================================== + +test_that(".run_colocboost returns NULL and message on error", { + expect_message( + result <- pecotmr:::.run_colocboost("test label", bad_arg = TRUE), + "test label failed" + ) + expect_null(result$result) + expect_s3_class(result$time, "difftime") +}) NA diff --git a/tests/testthat/test_mr.R b/tests/testthat/test_mr.R index 829c4015..c417f0d7 100644 --- a/tests/testthat/test_mr.R +++ b/tests/testthat/test_mr.R @@ -553,3 +553,61 @@ test_that("mr_analysis with multiple credible sets", { expect_equal(nrow(result), 1) expect_equal(result$num_CS, 2) }) + +# ============================================================================= +# .create_null_mr_df +# ============================================================================= + +test_that(".create_null_mr_df creates correct structure", { + spec <- c(x = "numeric", y = "character", z = "integer") + result <- pecotmr:::.create_null_mr_df("gene1", spec) + expect_s3_class(result, "data.frame") + expect_equal(nrow(result), 1) + expect_equal(result$gene_name, "gene1") + expect_true(is.numeric(result$x)) + expect_true(is.character(result$y)) + expect_true(is.integer(result$z)) + expect_true(all(is.na(result[, -1]))) +}) + +# ============================================================================= +# mr_format: tryCatch error path +# ============================================================================= + +test_that("mr_format returns null df when get_nested_element errors for top_loci", { + # susie_result has the gene_name path but top_loci path is broken + bad_susie <- list(susie_results = list( + condition1 = list( + region_info = list(region_name = "Gene_Test") + # no top_loci key — forces tryCatch error path + ) + )) + result <- mr_format( + bad_susie, "condition1", + data.frame(variant_id = "chr1:1:A:G", pos = 1, z = 1, beta = 0.1, se = 0.05, + effect_allele_frequency = 0.3, n_case = 500, n_control = 500), + coverage = "cs_coverage_0.95" + ) + expect_true(is.data.frame(result)) + expect_true(all(is.na(result[, -1]))) +}) + +# ============================================================================= +# mr_format: no overlapping positions +# ============================================================================= + +test_that("mr_format returns null df when no positions overlap", { + input_data <- generate_format_mock_data(seed = 42) + # Shift GWAS positions so they don't overlap with SuSiE positions + input_data$gwas_sumstats_db$pos <- input_data$gwas_sumstats_db$pos + 10000 + input_data$gwas_sumstats_db$variant_id <- paste0( + "chr1:", input_data$gwas_sumstats_db$pos, ":", + sub("chr1:\\d+:", "", input_data$gwas_sumstats_db$variant_id) + ) + result <- mr_format( + input_data$susie_result, "condition1", input_data$gwas_sumstats_db, + coverage = "cs_coverage_0.95", run_allele_qc = TRUE + ) + expect_true(is.data.frame(result)) + expect_true(all(is.na(result[, -1]))) +}) From 8e75d87bc948433fc32de5f8bd112688750d63fe Mon Sep 17 00:00:00 2001 From: Daniel Nachun Date: Sat, 18 Apr 2026 12:28:09 -0700 Subject: [PATCH 5/5] fix tests --- R/susie_wrapper.R | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) diff --git a/R/susie_wrapper.R b/R/susie_wrapper.R index 7c38ad5e..86cc3c68 100644 --- a/R/susie_wrapper.R +++ b/R/susie_wrapper.R @@ -179,8 +179,8 @@ susie_rss_wrapper <- function(z, R = NULL, X = NULL, n = NULL, if (!is.null(R) && !is.null(X)) stop("Only one of R or X should be provided, not both.") # Build argument list for susie_rss - base_args <- list(z = z, n = n, L = L, coverage = coverage, - sketch_samples = sketch_samples, ...) + base_args <- list(z = z, n = n, coverage = coverage, + stochastic_ld_sample = sketch_samples, ...) if (!is.null(X)) base_args$X <- X else base_args$R <- R run_with_L <- function(L_val) do.call(susie_rss, c(base_args, list(L = L_val)))