From 748d432c144777783bdd0bd5c0dd2a913b00bd91 Mon Sep 17 00:00:00 2001 From: alexmccreight <57416850+alexmccreight@users.noreply.github.com> Date: Tue, 25 Mar 2025 17:24:20 -0400 Subject: [PATCH 1/2] added susie.ash and susie-inf weight functions --- R/regularized_regression.R | 40 ++++++++++++++++++++++++++++++++++++++ 1 file changed, 40 insertions(+) diff --git a/R/regularized_regression.R b/R/regularized_regression.R index d076231d..b8a84e11 100644 --- a/R/regularized_regression.R +++ b/R/regularized_regression.R @@ -823,3 +823,43 @@ bayes_c_rss_weights <- function(sumstats, LD, ...) { bayes_r_rss_weights <- function(sumstats, LD, ...) { return(bayes_alphabet_rss_weights(sumstats, LD, method = "bayesR", ...)) } + +#' @export +susie_ash_weights <- function(susie_ash_fit, X = NULL, y = NULL, ...) { + # If the fit object is missing or NULL, try to recover it from the parent frame. + if (missing(susie_ash_fit) || is.null(susie_ash_fit)) { + susie_ash_fit <- get0("susie_ash_fit", envir = parent.frame()) + if (is.null(susie_ash_fit)) { + stop("A susie_ash_fit object is required.") + } + } + if (!is.null(X)) { + if (length(susie_ash_fit$marginal_PIP) != ncol(X)) { + stop(paste0("Dimension mismatch on number of variants in susie_ash_fit ", + length(susie_ash_fit$marginal_PIP), " and TWAS weights ", ncol(X), ". ")) + } + } + # Calculate coefficients as per the provided formula. + weights <- rowSums(susie_ash_fit$mu * susie_ash_fit$PIP) + susie_ash_fit$theta + return(weights) +} + +#' @export +susie_inf_weights <- function(susie_inf_fit, X = NULL, y = NULL, ...) { + # If the fit object is missing or NULL, try to recover it from the parent frame. + if (missing(susie_inf_fit) || is.null(susie_inf_fit)) { + susie_inf_fit <- get0("susie_inf_fit", envir = parent.frame()) + if (is.null(susie_inf_fit)) { + stop("A susie_inf_fit object is required.") + } + } + if (!is.null(X)) { + if (length(susie_inf_fit$marginal_PIP) != ncol(X)) { + stop(paste0("Dimension mismatch on number of variants in susie_inf_fit ", + length(susie_inf_fit$marginal_PIP), " and TWAS weights ", ncol(X), ". ")) + } + } + # Calculate coefficients as per the provided formula. + weights <- rowSums(susie_inf_fit$mu * susie_inf_fit$PIP) + susie_inf_fit$alpha + return(weights) +} From 3fab18338241081075d71b3db71a8d62f409565d Mon Sep 17 00:00:00 2001 From: alexmccreight Date: Tue, 25 Mar 2025 21:27:22 +0000 Subject: [PATCH 2/2] Update documentation --- NAMESPACE | 2 ++ R/RcppExports.R | 11 ++++++----- man/allele_qc.Rd | 15 ++++++--------- man/check_consecutive_regions.Rd | 11 ----------- man/load_multitask_regional_data.Rd | 19 +++++++++---------- man/load_rss_data.Rd | 15 +++++++-------- man/load_tsv_region.Rd | 26 +++++++++++++------------- man/rss_analysis_pipeline.Rd | 12 +++++++++--- 8 files changed, 52 insertions(+), 59 deletions(-) delete mode 100644 man/check_consecutive_regions.Rd diff --git a/NAMESPACE b/NAMESPACE index 8e0a54d5..9f3d3773 100644 --- a/NAMESPACE +++ b/NAMESPACE @@ -86,6 +86,8 @@ export(sdpr) export(sdpr_weights) export(slalom) export(summary_stats_qc) +export(susie_ash_weights) +export(susie_inf_weights) export(susie_post_processor) export(susie_rss_pipeline) export(susie_rss_qc) diff --git a/R/RcppExports.R b/R/RcppExports.R index 814b38fd..efa6874d 100644 --- a/R/RcppExports.R +++ b/R/RcppExports.R @@ -2,21 +2,22 @@ # Generator token: 10BE3573-1514-4C36-9D1C-5A225CD40393 dentist_iterative_impute <- function(LD_mat, nSample, zScore, pValueThreshold, propSVD, gcControl, nIter, gPvalueThreshold, ncpus, seed, correct_chen_et_al_bug, verbose = FALSE) { - .Call("_pecotmr_dentist_iterative_impute", PACKAGE = "pecotmr", LD_mat, nSample, zScore, pValueThreshold, propSVD, gcControl, nIter, gPvalueThreshold, ncpus, seed, correct_chen_et_al_bug, verbose) + .Call('_pecotmr_dentist_iterative_impute', PACKAGE = 'pecotmr', LD_mat, nSample, zScore, pValueThreshold, propSVD, gcControl, nIter, gPvalueThreshold, ncpus, seed, correct_chen_et_al_bug, verbose) } rcpp_mr_ash_rss <- function(bhat, shat, z, R, var_y, n, sigma2_e, s0, w0, mu1_init, tol = 1e-8, max_iter = 1e5L, update_w0 = TRUE, update_sigma = TRUE, compute_ELBO = TRUE, standardize = FALSE, ncpus = 1L) { - .Call("_pecotmr_rcpp_mr_ash_rss", PACKAGE = "pecotmr", bhat, shat, z, R, var_y, n, sigma2_e, s0, w0, mu1_init, tol, max_iter, update_w0, update_sigma, compute_ELBO, standardize, ncpus) + .Call('_pecotmr_rcpp_mr_ash_rss', PACKAGE = 'pecotmr', bhat, shat, z, R, var_y, n, sigma2_e, s0, w0, mu1_init, tol, max_iter, update_w0, update_sigma, compute_ELBO, standardize, ncpus) } prs_cs_rcpp <- function(a, b, phi, bhat, maf, n, ld_blk, n_iter, n_burnin, thin, verbose, seed) { - .Call("_pecotmr_prs_cs_rcpp", PACKAGE = "pecotmr", a, b, phi, bhat, maf, n, ld_blk, n_iter, n_burnin, thin, verbose, seed) + .Call('_pecotmr_prs_cs_rcpp', PACKAGE = 'pecotmr', a, b, phi, bhat, maf, n, ld_blk, n_iter, n_burnin, thin, verbose, seed) } qtl_enrichment_rcpp <- function(r_gwas_pip, r_qtl_susie_fit, pi_gwas = 0, pi_qtl = 0, ImpN = 25L, shrinkage_lambda = 1.0, num_threads = 1L) { - .Call("_pecotmr_qtl_enrichment_rcpp", PACKAGE = "pecotmr", r_gwas_pip, r_qtl_susie_fit, pi_gwas, pi_qtl, ImpN, shrinkage_lambda, num_threads) + .Call('_pecotmr_qtl_enrichment_rcpp', PACKAGE = 'pecotmr', r_gwas_pip, r_qtl_susie_fit, pi_gwas, pi_qtl, ImpN, shrinkage_lambda, num_threads) } sdpr_rcpp <- function(bhat, LD, n, per_variant_sample_size = NULL, array = NULL, a = 0.1, c = 1.0, M = 1000L, a0k = 0.5, b0k = 0.5, iter = 1000L, burn = 200L, thin = 5L, n_threads = 1L, opt_llk = 1L, verbose = TRUE) { - .Call("_pecotmr_sdpr_rcpp", PACKAGE = "pecotmr", bhat, LD, n, per_variant_sample_size, array, a, c, M, a0k, b0k, iter, burn, thin, n_threads, opt_llk, verbose) + .Call('_pecotmr_sdpr_rcpp', PACKAGE = 'pecotmr', bhat, LD, n, per_variant_sample_size, array, a, c, M, a0k, b0k, iter, burn, thin, n_threads, opt_llk, verbose) } + diff --git a/man/allele_qc.Rd b/man/allele_qc.Rd index 28c8f73e..6993271a 100644 --- a/man/allele_qc.Rd +++ b/man/allele_qc.Rd @@ -2,12 +2,11 @@ % Please edit documentation in R/allele_qc.R \name{allele_qc} \alias{allele_qc} -\title{Match alleles between target_variants and ref_variants} +\title{Match alleles between target data and reference variants} \usage{ allele_qc( - target_variants, - ref_variants, target_data, + ref_variants, col_to_flip = NULL, match_min_prop = 0.2, remove_dups = TRUE, @@ -15,16 +14,14 @@ allele_qc( remove_strand_ambiguous = TRUE, flip_strand = FALSE, remove_unmatched = TRUE, - remove_same_vars = FALSE, - target_gwas = TRUE + remove_same_vars = FALSE ) } \arguments{ -\item{target_variants}{A data frame with columns "chrom", "pos", "A1", "A2" or strings in the format of "chr:pos:A2:A1"/"chr:pos_A2_A1".} - -\item{ref_variants}{A data frame with columns "chrom", "pos", "A1", "A2" or strings in the format of "chr:pos:A2:A1"/"chr:pos_A2_A1".} +\item{target_data}{A data frame with columns "chrom", "pos", "A2", "A1" (and optionally other columns like "beta" or "z"), +or a vector of strings in the format of "chr:pos:A2:A1"/"chr:pos_A2_A1". Can be automatically converted to a data frame if a vector.} -\item{target_data}{A data frame on which QC procedures will be applied.} +\item{ref_variants}{A data frame with columns "chrom", "pos", "A2", "A1" or strings in the format of "chr:pos:A2:A1"/"chr:pos_A2_A1".} \item{col_to_flip}{The name of the column in target_data where flips are to be applied.} diff --git a/man/check_consecutive_regions.Rd b/man/check_consecutive_regions.Rd deleted file mode 100644 index b4566ec1..00000000 --- a/man/check_consecutive_regions.Rd +++ /dev/null @@ -1,11 +0,0 @@ -% Generated by roxygen2: do not edit by hand -% Please edit documentation in R/LD.R -\name{check_consecutive_regions} -\alias{check_consecutive_regions} -\title{Function to Check if Regions are in increasing order and remove duplicated rows} -\usage{ -check_consecutive_regions(df) -} -\description{ -Function to Check if Regions are in increasing order and remove duplicated rows -} diff --git a/man/load_multitask_regional_data.Rd b/man/load_multitask_regional_data.Rd index 3593c9cf..1e3d1457 100644 --- a/man/load_multitask_regional_data.Rd +++ b/man/load_multitask_regional_data.Rd @@ -30,12 +30,11 @@ load_multitask_regional_data( LD_meta_file_path_list = NULL, match_LD_sumstat = NULL, conditions_list_sumstat = NULL, - subset = TRUE, n_samples = 0, n_cases = 0, n_controls = 0, - target = "", - target_column_index = "", + extract_sumstats_region_name = NULL, + sumstats_region_name_col = NULL, comment_string = "#", extract_coordinates = NULL ) @@ -87,19 +86,19 @@ load_multitask_regional_data( \item{conditions_list_sumstat}{A vector of strings representing different sumstats.} -\item{target}{User-specified gene/phenotype name used to further subset the phenotype data.} +\item{n_samples}{User-specified sample size. If unknown, set as 0 to retrieve from the sumstat file.} -\item{target_column_index}{Filter this specific column for the target.} +\item{n_cases}{User-specified number of cases.} -\item{comment_string}{comment sign in the column_mapping file, default is #} +\item{n_controls}{User-specified number of controls.} -\item{extract_coordinates}{Optional data frame with columns "chrom" and "pos" for specific coordinates extraction.} +\item{extract_sumstats_region_name}{User-specified gene/phenotype name used to further subset the phenotype data.} -\item{n_sample}{User-specified sample size. If unknown, set as 0 to retrieve from the sumstat file.} +\item{sumstats_region_name_col}{Filter this specific column for the extract_sumstats_region_name.} -\item{n_case}{User-specified number of cases.} +\item{comment_string}{comment sign in the column_mapping file, default is #} -\item{n_control}{User-specified number of controls.} +\item{extract_coordinates}{Optional data frame with columns "chrom" and "pos" for specific coordinates extraction.} } \value{ A list containing the individual_data and sumstat_data: diff --git a/man/load_rss_data.Rd b/man/load_rss_data.Rd index 1230d18f..2dd95a5f 100644 --- a/man/load_rss_data.Rd +++ b/man/load_rss_data.Rd @@ -7,13 +7,12 @@ load_rss_data( sumstat_path, column_file_path, - subset = TRUE, n_sample = 0, n_case = 0, n_control = 0, - target = "", - region = "", - target_column_index = "", + region = NULL, + extract_region_name = NULL, + region_name_col = NULL, comment_string = "#" ) } @@ -28,13 +27,13 @@ load_rss_data( \item{n_control}{User-specified number of controls.} -\item{target}{User-specified gene/phenotype name used to further subset the phenotype data.} - \item{region}{The region where tabix use to subset the input dataset.} -\item{target_column_index}{Filter this specific column for the target.} +\item{extract_region_name}{User-specified gene/phenotype name used to further subset the phenotype data.} + +\item{region_name_col}{Filter this specific column for the extract_region_name.} -\item{comment_string}{comment sign in the column_mapping file, default is #} +\item{comment_string}{Comment sign in the column_mapping file, default is #} } \value{ A list of rss_input, including the column-name-formatted summary statistics, diff --git a/man/load_tsv_region.Rd b/man/load_tsv_region.Rd index 0fabb7e1..88409c9e 100644 --- a/man/load_tsv_region.Rd +++ b/man/load_tsv_region.Rd @@ -2,29 +2,29 @@ % Please edit documentation in R/file_utils.R \name{load_tsv_region} \alias{load_tsv_region} -\title{Load customized tsv data} +\title{Load and filter tabular data with optional region subsetting} \usage{ load_tsv_region( - sumstat_path, - region = "", - target = "", - target_column_index = "" + file_path, + region = NULL, + extract_region_name = NULL, + region_name_col = NULL ) } \arguments{ -\item{sumstat_path}{File path to the summary statistics.} +\item{file_path}{Path to the summary statistics file.} -\item{region}{The region where tabix use to subset the input dataset. Format: chr:start-end (eg: 9:10000-50000)} +\item{region}{Genomic region for subsetting tabix-indexed files. Format: chr:start-end (e.g., "9:10000-50000").} -\item{target}{User-specified gene/phenotype name used to further subset the phenotype data.} +\item{extract_region_name}{Value to filter for in the specified filter column.} -\item{target_column_index}{Filter this specific column for the target.} +\item{region_name_col}{Index of the column to apply the extract_region_name against.} } \value{ -A dataframe of the subsetted summary statistics, +A dataframe containing the filtered summary statistics. } \description{ -This function load the input data. If the input sumstat data is .gz and tabixed, then can use the region parameter to subset the data -and filter by target column -Otherwise, it will only filter by target column since tabix command won't function (this apply to .tsv, .txt files) +This function loads summary statistics data from tabular files (TSV, TXT). +For compressed (.gz) and tabix-indexed files, it can subset data by genomic region. +Additionally, it can filter results by a specified target value in a designated column. } diff --git a/man/rss_analysis_pipeline.Rd b/man/rss_analysis_pipeline.Rd index a4aa0fb2..2f3b8c4d 100644 --- a/man/rss_analysis_pipeline.Rd +++ b/man/rss_analysis_pipeline.Rd @@ -11,10 +11,10 @@ rss_analysis_pipeline( n_sample = 0, n_case = 0, n_control = 0, - target = "", - region = "", - target_column_index = "", + region = NULL, skip_region = NULL, + extract_region_name = NULL, + region_name_col = NULL, qc_method = c("rss_qc", "dentist", "slalom"), finemapping_method = c("susie_rss", "single_effect", "bayesian_conditional_regression"), finemapping_opts = list(init_L = 5, max_L = 20, l_step = 5, coverage = c(0.95, 0.7, @@ -39,9 +39,15 @@ rss_analysis_pipeline( \item{n_control}{User-specified number of controls.} +\item{region}{The region where tabix use to subset the input dataset.} + \item{skip_region}{A character vector specifying regions to be skipped in the analysis (optional). Each region should be in the format "chrom:start-end" (e.g., "1:1000000-2000000").} +\item{extract_region_name}{User-specified gene/phenotype name used to further subset the phenotype data.} + +\item{region_name_col}{Filter this specific column for the extract_region_name.} + \item{qc_method}{Quality control method to use. Options are "rss_qc", "dentist", or "slalom" (default: "rss_qc").} \item{impute}{Logical; if TRUE, performs imputation for outliers identified in the analysis (default: TRUE).}