From 09738b10ab17aa906857a26f7436ef0656ce18b3 Mon Sep 17 00:00:00 2001 From: Chunmingl Date: Mon, 24 Mar 2025 20:48:24 -0400 Subject: [PATCH 1/2] map data type --- R/file_utils.R | 20 ++++++++++++++++++++ 1 file changed, 20 insertions(+) diff --git a/R/file_utils.R b/R/file_utils.R index 47b82ea5..73d0dab8 100644 --- a/R/file_utils.R +++ b/R/file_utils.R @@ -980,3 +980,23 @@ batch_load_twas_weights <- function(twas_weights_results, meta_data_df, max_memo names(batches) <- NULL return(batches) } + + +#' Map data type for twas weight contexts based on provided data type table. #' @param data_type_table A data frame contains three columns: type, context, subgroup #' @param context_names A vector of contexts #' @return A vector of mapped data type based on context names #' @export +map_data_type <- function(data_type_table, context_names){ + data_type <- lapply(context_names, function(context){ + data_type_table$type[apply(data_type_table, 1, function(x) { + if (! grepl("sQTL",context)) { + grepl(x[2], context) + } else { + grepl(x[2], context) & grepl(x[3], context) + } + })] + }) + names(data_type) <- context_names + return(data_type) +} From a9f8aa9c9a4a3c57a4d2575bbed2e606b7c261a2 Mon Sep 17 00:00:00 2001 From: Chunmingl Date: Tue, 25 Mar 2025 04:41:13 +0000 Subject: [PATCH 2/2] Update documentation --- NAMESPACE | 1 + R/RcppExports.R | 11 ++++++----- man/allele_qc.Rd | 15 ++++++--------- man/check_consecutive_regions.Rd | 11 ----------- man/load_multitask_regional_data.Rd | 19 +++++++++---------- man/load_rss_data.Rd | 15 +++++++-------- man/load_tsv_region.Rd | 26 +++++++++++++------------- man/map_data_type.Rd | 19 +++++++++++++++++++ man/rss_analysis_pipeline.Rd | 12 +++++++++--- 9 files changed, 70 insertions(+), 59 deletions(-) delete mode 100644 man/check_consecutive_regions.Rd create mode 100644 man/map_data_type.Rd diff --git a/NAMESPACE b/NAMESPACE index 8e0a54d5..49a710df 100644 --- a/NAMESPACE +++ b/NAMESPACE @@ -56,6 +56,7 @@ 
export(load_regional_univariate_data) export(load_rss_data) export(load_tsv_region) export(load_twas_weights) +export(map_data_type) export(mash_pipeline) export(mash_rand_null_sample) export(merge_mash_data) diff --git a/R/RcppExports.R b/R/RcppExports.R index 814b38fd..efa6874d 100644 --- a/R/RcppExports.R +++ b/R/RcppExports.R @@ -2,21 +2,22 @@ # Generator token: 10BE3573-1514-4C36-9D1C-5A225CD40393 dentist_iterative_impute <- function(LD_mat, nSample, zScore, pValueThreshold, propSVD, gcControl, nIter, gPvalueThreshold, ncpus, seed, correct_chen_et_al_bug, verbose = FALSE) { - .Call("_pecotmr_dentist_iterative_impute", PACKAGE = "pecotmr", LD_mat, nSample, zScore, pValueThreshold, propSVD, gcControl, nIter, gPvalueThreshold, ncpus, seed, correct_chen_et_al_bug, verbose) + .Call('_pecotmr_dentist_iterative_impute', PACKAGE = 'pecotmr', LD_mat, nSample, zScore, pValueThreshold, propSVD, gcControl, nIter, gPvalueThreshold, ncpus, seed, correct_chen_et_al_bug, verbose) } rcpp_mr_ash_rss <- function(bhat, shat, z, R, var_y, n, sigma2_e, s0, w0, mu1_init, tol = 1e-8, max_iter = 1e5L, update_w0 = TRUE, update_sigma = TRUE, compute_ELBO = TRUE, standardize = FALSE, ncpus = 1L) { - .Call("_pecotmr_rcpp_mr_ash_rss", PACKAGE = "pecotmr", bhat, shat, z, R, var_y, n, sigma2_e, s0, w0, mu1_init, tol, max_iter, update_w0, update_sigma, compute_ELBO, standardize, ncpus) + .Call('_pecotmr_rcpp_mr_ash_rss', PACKAGE = 'pecotmr', bhat, shat, z, R, var_y, n, sigma2_e, s0, w0, mu1_init, tol, max_iter, update_w0, update_sigma, compute_ELBO, standardize, ncpus) } prs_cs_rcpp <- function(a, b, phi, bhat, maf, n, ld_blk, n_iter, n_burnin, thin, verbose, seed) { - .Call("_pecotmr_prs_cs_rcpp", PACKAGE = "pecotmr", a, b, phi, bhat, maf, n, ld_blk, n_iter, n_burnin, thin, verbose, seed) + .Call('_pecotmr_prs_cs_rcpp', PACKAGE = 'pecotmr', a, b, phi, bhat, maf, n, ld_blk, n_iter, n_burnin, thin, verbose, seed) } qtl_enrichment_rcpp <- function(r_gwas_pip, r_qtl_susie_fit, pi_gwas = 0, 
pi_qtl = 0, ImpN = 25L, shrinkage_lambda = 1.0, num_threads = 1L) { - .Call("_pecotmr_qtl_enrichment_rcpp", PACKAGE = "pecotmr", r_gwas_pip, r_qtl_susie_fit, pi_gwas, pi_qtl, ImpN, shrinkage_lambda, num_threads) + .Call('_pecotmr_qtl_enrichment_rcpp', PACKAGE = 'pecotmr', r_gwas_pip, r_qtl_susie_fit, pi_gwas, pi_qtl, ImpN, shrinkage_lambda, num_threads) } sdpr_rcpp <- function(bhat, LD, n, per_variant_sample_size = NULL, array = NULL, a = 0.1, c = 1.0, M = 1000L, a0k = 0.5, b0k = 0.5, iter = 1000L, burn = 200L, thin = 5L, n_threads = 1L, opt_llk = 1L, verbose = TRUE) { - .Call("_pecotmr_sdpr_rcpp", PACKAGE = "pecotmr", bhat, LD, n, per_variant_sample_size, array, a, c, M, a0k, b0k, iter, burn, thin, n_threads, opt_llk, verbose) + .Call('_pecotmr_sdpr_rcpp', PACKAGE = 'pecotmr', bhat, LD, n, per_variant_sample_size, array, a, c, M, a0k, b0k, iter, burn, thin, n_threads, opt_llk, verbose) } + diff --git a/man/allele_qc.Rd b/man/allele_qc.Rd index 28c8f73e..6993271a 100644 --- a/man/allele_qc.Rd +++ b/man/allele_qc.Rd @@ -2,12 +2,11 @@ % Please edit documentation in R/allele_qc.R \name{allele_qc} \alias{allele_qc} -\title{Match alleles between target_variants and ref_variants} +\title{Match alleles between target data and reference variants} \usage{ allele_qc( - target_variants, - ref_variants, target_data, + ref_variants, col_to_flip = NULL, match_min_prop = 0.2, remove_dups = TRUE, @@ -15,16 +14,14 @@ allele_qc( remove_strand_ambiguous = TRUE, flip_strand = FALSE, remove_unmatched = TRUE, - remove_same_vars = FALSE, - target_gwas = TRUE + remove_same_vars = FALSE ) } \arguments{ -\item{target_variants}{A data frame with columns "chrom", "pos", "A1", "A2" or strings in the format of "chr:pos:A2:A1"/"chr:pos_A2_A1".} - -\item{ref_variants}{A data frame with columns "chrom", "pos", "A1", "A2" or strings in the format of "chr:pos:A2:A1"/"chr:pos_A2_A1".} +\item{target_data}{A data frame with columns "chrom", "pos", "A2", "A1" (and optionally other columns like "beta" or 
"z"), +or a vector of strings in the format of "chr:pos:A2:A1"/"chr:pos_A2_A1". Can be automatically converted to a data frame if a vector.} -\item{target_data}{A data frame on which QC procedures will be applied.} +\item{ref_variants}{A data frame with columns "chrom", "pos", "A2", "A1" or strings in the format of "chr:pos:A2:A1"/"chr:pos_A2_A1".} \item{col_to_flip}{The name of the column in target_data where flips are to be applied.} diff --git a/man/check_consecutive_regions.Rd b/man/check_consecutive_regions.Rd deleted file mode 100644 index b4566ec1..00000000 --- a/man/check_consecutive_regions.Rd +++ /dev/null @@ -1,11 +0,0 @@ -% Generated by roxygen2: do not edit by hand -% Please edit documentation in R/LD.R -\name{check_consecutive_regions} -\alias{check_consecutive_regions} -\title{Function to Check if Regions are in increasing order and remove duplicated rows} -\usage{ -check_consecutive_regions(df) -} -\description{ -Function to Check if Regions are in increasing order and remove duplicated rows -} diff --git a/man/load_multitask_regional_data.Rd b/man/load_multitask_regional_data.Rd index 3593c9cf..1e3d1457 100644 --- a/man/load_multitask_regional_data.Rd +++ b/man/load_multitask_regional_data.Rd @@ -30,12 +30,11 @@ load_multitask_regional_data( LD_meta_file_path_list = NULL, match_LD_sumstat = NULL, conditions_list_sumstat = NULL, - subset = TRUE, n_samples = 0, n_cases = 0, n_controls = 0, - target = "", - target_column_index = "", + extract_sumstats_region_name = NULL, + sumstats_region_name_col = NULL, comment_string = "#", extract_coordinates = NULL ) @@ -87,19 +86,19 @@ load_multitask_regional_data( \item{conditions_list_sumstat}{A vector of strings representing different sumstats.} -\item{target}{User-specified gene/phenotype name used to further subset the phenotype data.} +\item{n_samples}{User-specified sample size. 
If unknown, set as 0 to retrieve from the sumstat file.} -\item{target_column_index}{Filter this specific column for the target.} +\item{n_cases}{User-specified number of cases.} -\item{comment_string}{comment sign in the column_mapping file, default is #} +\item{n_controls}{User-specified number of controls.} -\item{extract_coordinates}{Optional data frame with columns "chrom" and "pos" for specific coordinates extraction.} +\item{extract_sumstats_region_name}{User-specified gene/phenotype name used to further subset the phenotype data.} -\item{n_sample}{User-specified sample size. If unknown, set as 0 to retrieve from the sumstat file.} +\item{sumstats_region_name_col}{Filter this specific column for the extract_sumstats_region_name.} -\item{n_case}{User-specified number of cases.} +\item{comment_string}{comment sign in the column_mapping file, default is #} -\item{n_control}{User-specified number of controls.} +\item{extract_coordinates}{Optional data frame with columns "chrom" and "pos" for specific coordinates extraction.} } \value{ A list containing the individual_data and sumstat_data: diff --git a/man/load_rss_data.Rd b/man/load_rss_data.Rd index 1230d18f..2dd95a5f 100644 --- a/man/load_rss_data.Rd +++ b/man/load_rss_data.Rd @@ -7,13 +7,12 @@ load_rss_data( sumstat_path, column_file_path, - subset = TRUE, n_sample = 0, n_case = 0, n_control = 0, - target = "", - region = "", - target_column_index = "", + region = NULL, + extract_region_name = NULL, + region_name_col = NULL, comment_string = "#" ) } @@ -28,13 +27,13 @@ load_rss_data( \item{n_control}{User-specified number of controls.} -\item{target}{User-specified gene/phenotype name used to further subset the phenotype data.} - \item{region}{The region where tabix use to subset the input dataset.} -\item{target_column_index}{Filter this specific column for the target.} +\item{extract_region_name}{User-specified gene/phenotype name used to further subset the phenotype data.} + +\item{region_name_col}{Filter 
this specific column for the extract_region_name.} -\item{comment_string}{comment sign in the column_mapping file, default is #} +\item{comment_string}{Comment sign in the column_mapping file, default is #} } \value{ A list of rss_input, including the column-name-formatted summary statistics, diff --git a/man/load_tsv_region.Rd b/man/load_tsv_region.Rd index 0fabb7e1..88409c9e 100644 --- a/man/load_tsv_region.Rd +++ b/man/load_tsv_region.Rd @@ -2,29 +2,29 @@ % Please edit documentation in R/file_utils.R \name{load_tsv_region} \alias{load_tsv_region} -\title{Load customized tsv data} +\title{Load and filter tabular data with optional region subsetting} \usage{ load_tsv_region( - sumstat_path, - region = "", - target = "", - target_column_index = "" + file_path, + region = NULL, + extract_region_name = NULL, + region_name_col = NULL ) } \arguments{ -\item{sumstat_path}{File path to the summary statistics.} +\item{file_path}{Path to the summary statistics file.} -\item{region}{The region where tabix use to subset the input dataset. Format: chr:start-end (eg: 9:10000-50000)} +\item{region}{Genomic region for subsetting tabix-indexed files. Format: chr:start-end (e.g., "9:10000-50000").} -\item{target}{User-specified gene/phenotype name used to further subset the phenotype data.} +\item{extract_region_name}{Value to filter for in the specified filter column.} -\item{target_column_index}{Filter this specific column for the target.} +\item{region_name_col}{Index of the column to apply the extract_region_name against.} } \value{ -A dataframe of the subsetted summary statistics, +A dataframe containing the filtered summary statistics. } \description{ -This function load the input data. 
If the input sumstat data is .gz and tabixed, then can use the region parameter to subset the data -and filter by target column -Otherwise, it will only filter by target column since tabix command won't function (this apply to .tsv, .txt files) +This function loads summary statistics data from tabular files (TSV, TXT). +For compressed (.gz) and tabix-indexed files, it can subset data by genomic region. +Additionally, it can filter results by a specified target value in a designated column. } diff --git a/man/map_data_type.Rd b/man/map_data_type.Rd new file mode 100644 index 00000000..de8ec440 --- /dev/null +++ b/man/map_data_type.Rd @@ -0,0 +1,19 @@ +% Generated by roxygen2: do not edit by hand +% Please edit documentation in R/file_utils.R +\name{map_data_type} +\alias{map_data_type} +\title{Map data type for twas weight contexts based on provided data type table.} +\usage{ +map_data_type(data_type_table, context_names) +} +\arguments{ +\item{data_type_table}{A data frame contains three columns: type, context, subgroup} + +\item{context_names}{A vector of contexts} +} +\value{ +A vector of mapped data type based on context names +} +\description{ +Map data type for twas weight contexts based on provided data type table. 
+} diff --git a/man/rss_analysis_pipeline.Rd b/man/rss_analysis_pipeline.Rd index a4aa0fb2..2f3b8c4d 100644 --- a/man/rss_analysis_pipeline.Rd +++ b/man/rss_analysis_pipeline.Rd @@ -11,10 +11,10 @@ rss_analysis_pipeline( n_sample = 0, n_case = 0, n_control = 0, - target = "", - region = "", - target_column_index = "", + region = NULL, skip_region = NULL, + extract_region_name = NULL, + region_name_col = NULL, qc_method = c("rss_qc", "dentist", "slalom"), finemapping_method = c("susie_rss", "single_effect", "bayesian_conditional_regression"), finemapping_opts = list(init_L = 5, max_L = 20, l_step = 5, coverage = c(0.95, 0.7, @@ -39,9 +39,15 @@ rss_analysis_pipeline( \item{n_control}{User-specified number of controls.} +\item{region}{The region where tabix use to subset the input dataset.} + \item{skip_region}{A character vector specifying regions to be skipped in the analysis (optional). Each region should be in the format "chrom:start-end" (e.g., "1:1000000-2000000").} +\item{extract_region_name}{User-specified gene/phenotype name used to further subset the phenotype data.} + +\item{region_name_col}{Filter this specific column for the extract_region_name.} + \item{qc_method}{Quality control method to use. Options are "rss_qc", "dentist", or "slalom" (default: "rss_qc").} \item{impute}{Logical; if TRUE, performs imputation for outliers identified in the analysis (default: TRUE).}