From 09738b10ab17aa906857a26f7436ef0656ce18b3 Mon Sep 17 00:00:00 2001 From: Chunmingl Date: Mon, 24 Mar 2025 20:48:24 -0400 Subject: [PATCH 1/2] map data type --- R/file_utils.R | 20 ++++++++++++++++++++ 1 file changed, 20 insertions(+) diff --git a/R/file_utils.R b/R/file_utils.R index 47b82ea5..73d0dab8 100644 --- a/R/file_utils.R +++ b/R/file_utils.R @@ -980,3 +980,23 @@ batch_load_twas_weights <- function(twas_weights_results, meta_data_df, max_memo names(batches) <- NULL return(batches) } + + +#' Map data type for twas weight contexts based on provided data type table. #' @param data_type_table A data frame contains three columns: type, context, subgroup #' @param context_names A vector of contexts #' @return A vector of mapped data type based on context names #' @export +map_data_type <- function(data_type_table, context_names){ + data_type <- lapply(context_names, function(context){ + data_type_table$type[apply(data_type_table, 1, function(x) { + if (! grepl("sQTL",context)) { + grepl(x[2], context) + } else { + grepl(x[2], context) & grepl(x[3], context) + } + })] + }) + names(data_type) <- context_names + return(data_type) +} From a9f8aa9c9a4a3c57a4d2575bbed2e606b7c261a2 Mon Sep 17 00:00:00 2001 From: Chunmingl Date: Tue, 25 Mar 2025 04:41:13 +0000 Subject: [PATCH 2/2] Update documentation --- NAMESPACE | 1 + R/RcppExports.R | 11 ++++++----- man/allele_qc.Rd | 15 ++++++--------- man/check_consecutive_regions.Rd | 11 ----------- man/load_multitask_regional_data.Rd | 19 +++++++++---------- man/load_rss_data.Rd | 15 +++++++-------- man/load_tsv_region.Rd | 26 +++++++++++++------------- man/map_data_type.Rd | 19 +++++++++++++++++++ man/rss_analysis_pipeline.Rd | 12 +++++++++--- 9 files changed, 70 insertions(+), 59 deletions(-) delete mode 100644 man/check_consecutive_regions.Rd create mode 100644 man/map_data_type.Rd diff --git a/NAMESPACE b/NAMESPACE index 8e0a54d5..49a710df 100644 --- a/NAMESPACE +++ b/NAMESPACE @@ -56,6 +56,7 @@ 
export(load_regional_univariate_data) export(load_rss_data) export(load_tsv_region) export(load_twas_weights) +export(map_data_type) export(mash_pipeline) export(mash_rand_null_sample) export(merge_mash_data) diff --git a/R/RcppExports.R b/R/RcppExports.R index 814b38fd..efa6874d 100644 --- a/R/RcppExports.R +++ b/R/RcppExports.R @@ -2,21 +2,22 @@ # Generator token: 10BE3573-1514-4C36-9D1C-5A225CD40393 dentist_iterative_impute <- function(LD_mat, nSample, zScore, pValueThreshold, propSVD, gcControl, nIter, gPvalueThreshold, ncpus, seed, correct_chen_et_al_bug, verbose = FALSE) { - .Call("_pecotmr_dentist_iterative_impute", PACKAGE = "pecotmr", LD_mat, nSample, zScore, pValueThreshold, propSVD, gcControl, nIter, gPvalueThreshold, ncpus, seed, correct_chen_et_al_bug, verbose) + .Call('_pecotmr_dentist_iterative_impute', PACKAGE = 'pecotmr', LD_mat, nSample, zScore, pValueThreshold, propSVD, gcControl, nIter, gPvalueThreshold, ncpus, seed, correct_chen_et_al_bug, verbose) } rcpp_mr_ash_rss <- function(bhat, shat, z, R, var_y, n, sigma2_e, s0, w0, mu1_init, tol = 1e-8, max_iter = 1e5L, update_w0 = TRUE, update_sigma = TRUE, compute_ELBO = TRUE, standardize = FALSE, ncpus = 1L) { - .Call("_pecotmr_rcpp_mr_ash_rss", PACKAGE = "pecotmr", bhat, shat, z, R, var_y, n, sigma2_e, s0, w0, mu1_init, tol, max_iter, update_w0, update_sigma, compute_ELBO, standardize, ncpus) + .Call('_pecotmr_rcpp_mr_ash_rss', PACKAGE = 'pecotmr', bhat, shat, z, R, var_y, n, sigma2_e, s0, w0, mu1_init, tol, max_iter, update_w0, update_sigma, compute_ELBO, standardize, ncpus) } prs_cs_rcpp <- function(a, b, phi, bhat, maf, n, ld_blk, n_iter, n_burnin, thin, verbose, seed) { - .Call("_pecotmr_prs_cs_rcpp", PACKAGE = "pecotmr", a, b, phi, bhat, maf, n, ld_blk, n_iter, n_burnin, thin, verbose, seed) + .Call('_pecotmr_prs_cs_rcpp', PACKAGE = 'pecotmr', a, b, phi, bhat, maf, n, ld_blk, n_iter, n_burnin, thin, verbose, seed) } qtl_enrichment_rcpp <- function(r_gwas_pip, r_qtl_susie_fit, pi_gwas = 0, 
pi_qtl = 0, ImpN = 25L, shrinkage_lambda = 1.0, num_threads = 1L) { - .Call("_pecotmr_qtl_enrichment_rcpp", PACKAGE = "pecotmr", r_gwas_pip, r_qtl_susie_fit, pi_gwas, pi_qtl, ImpN, shrinkage_lambda, num_threads) + .Call('_pecotmr_qtl_enrichment_rcpp', PACKAGE = 'pecotmr', r_gwas_pip, r_qtl_susie_fit, pi_gwas, pi_qtl, ImpN, shrinkage_lambda, num_threads) } sdpr_rcpp <- function(bhat, LD, n, per_variant_sample_size = NULL, array = NULL, a = 0.1, c = 1.0, M = 1000L, a0k = 0.5, b0k = 0.5, iter = 1000L, burn = 200L, thin = 5L, n_threads = 1L, opt_llk = 1L, verbose = TRUE) { - .Call("_pecotmr_sdpr_rcpp", PACKAGE = "pecotmr", bhat, LD, n, per_variant_sample_size, array, a, c, M, a0k, b0k, iter, burn, thin, n_threads, opt_llk, verbose) + .Call('_pecotmr_sdpr_rcpp', PACKAGE = 'pecotmr', bhat, LD, n, per_variant_sample_size, array, a, c, M, a0k, b0k, iter, burn, thin, n_threads, opt_llk, verbose) } + diff --git a/man/allele_qc.Rd b/man/allele_qc.Rd index 28c8f73e..6993271a 100644 --- a/man/allele_qc.Rd +++ b/man/allele_qc.Rd @@ -2,12 +2,11 @@ % Please edit documentation in R/allele_qc.R \name{allele_qc} \alias{allele_qc} -\title{Match alleles between target_variants and ref_variants} +\title{Match alleles between target data and reference variants} \usage{ allele_qc( - target_variants, - ref_variants, target_data, + ref_variants, col_to_flip = NULL, match_min_prop = 0.2, remove_dups = TRUE, @@ -15,16 +14,14 @@ allele_qc( remove_strand_ambiguous = TRUE, flip_strand = FALSE, remove_unmatched = TRUE, - remove_same_vars = FALSE, - target_gwas = TRUE + remove_same_vars = FALSE ) } \arguments{ -\item{target_variants}{A data frame with columns "chrom", "pos", "A1", "A2" or strings in the format of "chr:pos:A2:A1"/"chr:pos_A2_A1".} - -\item{ref_variants}{A data frame with columns "chrom", "pos", "A1", "A2" or strings in the format of "chr:pos:A2:A1"/"chr:pos_A2_A1".} +\item{target_data}{A data frame with columns "chrom", "pos", "A2", "A1" (and optionally other columns like "beta" or 
"z"), +or a vector of strings in the format of "chr:pos:A2:A1"/"chr:pos_A2_A1". Can be automatically converted to a data frame if a vector.} -\item{target_data}{A data frame on which QC procedures will be applied.} +\item{ref_variants}{A data frame with columns "chrom", "pos", "A2", "A1" or strings in the format of "chr:pos:A2:A1"/"chr:pos_A2_A1".} \item{col_to_flip}{The name of the column in target_data where flips are to be applied.} diff --git a/man/check_consecutive_regions.Rd b/man/check_consecutive_regions.Rd deleted file mode 100644 index b4566ec1..00000000 --- a/man/check_consecutive_regions.Rd +++ /dev/null @@ -1,11 +0,0 @@ -% Generated by roxygen2: do not edit by hand -% Please edit documentation in R/LD.R -\name{check_consecutive_regions} -\alias{check_consecutive_regions} -\title{Function to Check if Regions are in increasing order and remove duplicated rows} -\usage{ -check_consecutive_regions(df) -} -\description{ -Function to Check if Regions are in increasing order and remove duplicated rows -} diff --git a/man/load_multitask_regional_data.Rd b/man/load_multitask_regional_data.Rd index 3593c9cf..1e3d1457 100644 --- a/man/load_multitask_regional_data.Rd +++ b/man/load_multitask_regional_data.Rd @@ -30,12 +30,11 @@ load_multitask_regional_data( LD_meta_file_path_list = NULL, match_LD_sumstat = NULL, conditions_list_sumstat = NULL, - subset = TRUE, n_samples = 0, n_cases = 0, n_controls = 0, - target = "", - target_column_index = "", + extract_sumstats_region_name = NULL, + sumstats_region_name_col = NULL, comment_string = "#", extract_coordinates = NULL ) @@ -87,19 +86,19 @@ load_multitask_regional_data( \item{conditions_list_sumstat}{A vector of strings representing different sumstats.} -\item{target}{User-specified gene/phenotype name used to further subset the phenotype data.} +\item{n_samples}{User-specified sample size. 
If unknown, set as 0 to retrieve from the sumstat file.} -\item{target_column_index}{Filter this specific column for the target.} +\item{n_cases}{User-specified number of cases.} -\item{comment_string}{comment sign in the column_mapping file, default is #} +\item{n_controls}{User-specified number of controls.} -\item{extract_coordinates}{Optional data frame with columns "chrom" and "pos" for specific coordinates extraction.} +\item{extract_sumstats_region_name}{User-specified gene/phenotype name used to further subset the phenotype data.} -\item{n_sample}{User-specified sample size. If unknown, set as 0 to retrieve from the sumstat file.} +\item{sumstats_region_name_col}{Filter this specific column for the extract_sumstats_region_name.} -\item{n_case}{User-specified number of cases.} +\item{comment_string}{comment sign in the column_mapping file, default is #} -\item{n_control}{User-specified number of controls.} +\item{extract_coordinates}{Optional data frame with columns "chrom" and "pos" for specific coordinates extraction.} } \value{ A list containing the individual_data and sumstat_data: diff --git a/man/load_rss_data.Rd b/man/load_rss_data.Rd index 1230d18f..2dd95a5f 100644 --- a/man/load_rss_data.Rd +++ b/man/load_rss_data.Rd @@ -7,13 +7,12 @@ load_rss_data( sumstat_path, column_file_path, - subset = TRUE, n_sample = 0, n_case = 0, n_control = 0, - target = "", - region = "", - target_column_index = "", + region = NULL, + extract_region_name = NULL, + region_name_col = NULL, comment_string = "#" ) } @@ -28,13 +27,13 @@ load_rss_data( \item{n_control}{User-specified number of controls.} -\item{target}{User-specified gene/phenotype name used to further subset the phenotype data.} - \item{region}{The region where tabix use to subset the input dataset.} -\item{target_column_index}{Filter this specific column for the target.} +\item{extract_region_name}{User-specified gene/phenotype name used to further subset the phenotype data.} + +\item{region_name_col}{Filter 
this specific column for the extract_region_name.} -\item{comment_string}{comment sign in the column_mapping file, default is #} +\item{comment_string}{Comment sign in the column_mapping file, default is #} } \value{ A list of rss_input, including the column-name-formatted summary statistics, diff --git a/man/load_tsv_region.Rd b/man/load_tsv_region.Rd index 0fabb7e1..88409c9e 100644 --- a/man/load_tsv_region.Rd +++ b/man/load_tsv_region.Rd @@ -2,29 +2,29 @@ % Please edit documentation in R/file_utils.R \name{load_tsv_region} \alias{load_tsv_region} -\title{Load customized tsv data} +\title{Load and filter tabular data with optional region subsetting} \usage{ load_tsv_region( - sumstat_path, - region = "", - target = "", - target_column_index = "" + file_path, + region = NULL, + extract_region_name = NULL, + region_name_col = NULL ) } \arguments{ -\item{sumstat_path}{File path to the summary statistics.} +\item{file_path}{Path to the summary statistics file.} -\item{region}{The region where tabix use to subset the input dataset. Format: chr:start-end (eg: 9:10000-50000)} +\item{region}{Genomic region for subsetting tabix-indexed files. Format: chr:start-end (e.g., "9:10000-50000").} -\item{target}{User-specified gene/phenotype name used to further subset the phenotype data.} +\item{extract_region_name}{Value to filter for in the specified filter column.} -\item{target_column_index}{Filter this specific column for the target.} +\item{region_name_col}{Index of the column to apply the extract_region_name against.} } \value{ -A dataframe of the subsetted summary statistics, +A dataframe containing the filtered summary statistics. } \description{ -This function load the input data. 
If the input sumstat data is .gz and tabixed, then can use the region parameter to subset the data -and filter by target column -Otherwise, it will only filter by target column since tabix command won't function (this apply to .tsv, .txt files) +This function loads summary statistics data from tabular files (TSV, TXT). +For compressed (.gz) and tabix-indexed files, it can subset data by genomic region. +Additionally, it can filter results by a specified target value in a designated column. } diff --git a/man/map_data_type.Rd b/man/map_data_type.Rd new file mode 100644 index 00000000..de8ec440 --- /dev/null +++ b/man/map_data_type.Rd @@ -0,0 +1,19 @@ +% Generated by roxygen2: do not edit by hand +% Please edit documentation in R/file_utils.R +\name{map_data_type} +\alias{map_data_type} +\title{Map data type for twas weight contexts based on provided data type table.} +\usage{ +map_data_type(data_type_table, context_names) +} +\arguments{ +\item{data_type_table}{A data frame contains three columns: type, context, subgroup} + +\item{context_names}{A vector of contexts} +} +\value{ +A vector of mapped data type based on context names +} +\description{ +Map data type for twas weight contexts based on provided data type table. 
+} diff --git a/man/rss_analysis_pipeline.Rd b/man/rss_analysis_pipeline.Rd index a4aa0fb2..2f3b8c4d 100644 --- a/man/rss_analysis_pipeline.Rd +++ b/man/rss_analysis_pipeline.Rd @@ -11,10 +11,10 @@ rss_analysis_pipeline( n_sample = 0, n_case = 0, n_control = 0, - target = "", - region = "", - target_column_index = "", + region = NULL, skip_region = NULL, + extract_region_name = NULL, + region_name_col = NULL, qc_method = c("rss_qc", "dentist", "slalom"), finemapping_method = c("susie_rss", "single_effect", "bayesian_conditional_regression"), finemapping_opts = list(init_L = 5, max_L = 20, l_step = 5, coverage = c(0.95, 0.7, @@ -39,9 +39,15 @@ rss_analysis_pipeline( \item{n_control}{User-specified number of controls.} +\item{region}{The region where tabix use to subset the input dataset.} + \item{skip_region}{A character vector specifying regions to be skipped in the analysis (optional). Each region should be in the format "chrom:start-end" (e.g., "1:1000000-2000000").} +\item{extract_region_name}{User-specified gene/phenotype name used to further subset the phenotype data.} + +\item{region_name_col}{Filter this specific column for the extract_region_name.} + \item{qc_method}{Quality control method to use. Options are "rss_qc", "dentist", or "slalom" (default: "rss_qc").} \item{impute}{Logical; if TRUE, performs imputation for outliers identified in the analysis (default: TRUE).}