From e6a8b0314302ee6f61f074a3d9ae3c2e0323a0c1 Mon Sep 17 00:00:00 2001
From: rl3328 <rl3328@cumc.columbia.edu>
Date: Thu, 19 Jun 2025 18:43:22 -0400
Subject: [PATCH 1/3] apply column mapping

---
 R/allele_qc.R  |  4 ++++
 R/file_utils.R |  4 ++++
 R/twas.R       | 31 +++++++++++++++++++++++++------
 3 files changed, 33 insertions(+), 6 deletions(-)

diff --git a/R/allele_qc.R b/R/allele_qc.R
index 7b32d0a8..3f02cb68 100644
--- a/R/allele_qc.R
+++ b/R/allele_qc.R
@@ -70,7 +70,11 @@ allele_qc <- function(target_data, ref_variants, col_to_flip = NULL,
         variant_df <- target_data %>% select(all_of(variant_cols))
         other_cols <- target_data %>% select(-all_of(variant_cols))
         target_data <- cbind(variant_id_to_df(variant_df), other_cols)
+        print("head(variant_df)")
+        print(head(variant_df))
      } else {
+        print("head(variant_data)")
+        print(head(target_data))
         target_data <- variant_id_to_df(target_data)
      }
   } else {
diff --git a/R/file_utils.R b/R/file_utils.R
index ce3253ff..cec319ec 100644
--- a/R/file_utils.R
+++ b/R/file_utils.R
@@ -799,6 +799,8 @@ load_twas_weights <- function(weight_db_files, conditions = NULL,
 load_rss_data <- function(sumstat_path, column_file_path, n_sample = 0, n_case = 0, n_control = 0, region = NULL,
                           extract_region_name = NULL, region_name_col = NULL, comment_string = "#") {
   # Read and preprocess column mapping
+  print("comment_stringin load_rss")
+  print(comment_string)
   if (is.null(comment_string)) {
     column_data <- read.table(column_file_path,
       header = FALSE, sep = ":",
@@ -814,6 +816,8 @@ load_rss_data <- function(sumstat_path, column_file_path, n_sample = 0, n_case =
     ) %>%
       rename(standard = V1, original = V2)
   }
+  print("column_data")
+  print(head(column_data))
   # Initialize sumstats variable
   sumstats <- NULL
   var_y <- NULL
diff --git a/R/twas.R b/R/twas.R
index dcc3bd5a..363d0ab5 100644
--- a/R/twas.R
+++ b/R/twas.R
@@ -26,7 +26,7 @@
 #' @importFrom S4Vectors queryHits subjectHits
 #' @importFrom IRanges IRanges findOverlaps start end reduce
 #' @export
-harmonize_twas <- function(twas_weights_data, ld_meta_file_path, gwas_meta_file) {
+harmonize_twas <- function(twas_weights_data, ld_meta_file_path, gwas_meta_file, column_file_path = NULL, comment_string = "#") {
   # Function to group contexts based on start and end positions
   group_contexts_by_region <- function(twas_weights_data, molecular_id, chrom, tolerance = 5000) {
     region_info_df <- do.call(rbind, lapply(names(twas_weights_data$weights), function(context) {
@@ -150,8 +150,12 @@ harmonize_twas <- function(twas_weights_data, ld_meta_file_path, gwas_meta_file)
       # Step 3: load GWAS data for clustered context groups
       for (study in names(gwas_files)) {
         gwas_file <- gwas_files[study]
-        gwas_data_sumstats <- harmonize_gwas(gwas_file, query_region=query_region, LD_list$combined_LD_variants, c("beta", "z"), match_min_prop = 0)
+        gwas_data_sumstats <- harmonize_gwas(gwas_file, query_region=query_region,   
+                                              LD_list$combined_LD_variants, c("beta", "z"),   
+                                              match_min_prop = 0, column_file_path = column_file_path, comment_string = comment_string)
         if(is.null(gwas_data_sumstats)) next
+        print("gwas_data_sumstats")
+        print(head(gwas_data_sumstats))
         # loop through context within the context group:
         for (context in contexts) {
           weights_matrix <- twas_weights_data[[molecular_id]][["weights"]][[context]]
@@ -247,9 +251,22 @@ harmonize_twas <- function(twas_weights_data, ld_meta_file_path, gwas_meta_file)
 #' @param query_region A string for region of query for tabix-indexed gwas summary statistics file in the format of chr:start-end
 #' @noRd
 #' @export
-harmonize_gwas <- function(gwas_file, query_region, ld_variants, col_to_flip=NULL, match_min_prop=0){
+harmonize_gwas <- function(gwas_file, query_region, ld_variants, col_to_flip=NULL, match_min_prop=0, column_file_path=NULL, comment_string="#"){
     if(is.null(gwas_file)| is.na(gwas_file)) stop("No GWAS file path provided. ")
-    gwas_data_sumstats <- as.data.frame(tabix_region(gwas_file, query_region)) # extension for yml file for column name mapping
+    if (!is.null(column_file_path)) {  
+      print("use column_mapping_file")
+      rss_result <- load_rss_data(  
+        sumstat_path = gwas_file,  
+        column_file_path = column_file_path,  
+        region = query_region,
+        comment_string = comment_string
+      )  
+      gwas_data_sumstats <- rss_result$sumstats  
+    } else {  
+      gwas_data_sumstats <- as.data.frame(tabix_region(gwas_file, query_region))  
+    }
+    print("gwas_data_sumstats_in gwas")
+    print(head(gwas_data_sumstats))
     if (nrow(gwas_data_sumstats) == 0) {
         if (length(names(gwas_file))==0) names(gwas_file) <- gwas_file
         warning(paste0("No GWAS summary statistics found for the region of ", query_region, " in ", names(gwas_file), ". "))
@@ -287,7 +304,9 @@ twas_pipeline <- function(twas_weights_data,
                           mr_coverage_column = "cs_coverage_0.95",
                           quantile_twas = FALSE,
                           output_twas_data = FALSE,
-                          event_filters=NULL) {
+                          event_filters=NULL,
+                          column_file_path = NULL,
+                          comment_string="#") {
   # internal function to format TWAS output
   format_twas_data <- function(post_qc_twas_data, twas_table) {
     weights_list <- do.call(c, lapply(names(post_qc_twas_data), function(molecular_id) {
@@ -421,7 +440,7 @@ twas_pipeline <- function(twas_weights_data,
   }
 
   # harmonize twas weights and gwas sumstats against LD
-  twas_data_qced_result <- harmonize_twas(twas_weights_data, ld_meta_file_path, gwas_meta_file)
+  twas_data_qced_result <- harmonize_twas(twas_weights_data, ld_meta_file_path, gwas_meta_file, column_file_path = column_file_path, comment_string = comment_string)
   twas_results_db <- lapply(names(twas_weights_data), function(weight_db) {
     twas_weights_data[[weight_db]][["molecular_id"]] <- weight_db
     twas_data_qced <- twas_data_qced_result$twas_data_qced

From b4d3815983957865d672d2d582e960ad27c05481 Mon Sep 17 00:00:00 2001
From: rl3328 <rl3328@cumc.columbia.edu>
Date: Fri, 20 Jun 2025 01:29:02 -0400
Subject: [PATCH 2/3] clean up

---
 R/allele_qc.R  | 4 ----
 R/file_utils.R | 4 ----
 R/twas.R       | 5 -----
 3 files changed, 13 deletions(-)

diff --git a/R/allele_qc.R b/R/allele_qc.R
index 3f02cb68..7b32d0a8 100644
--- a/R/allele_qc.R
+++ b/R/allele_qc.R
@@ -70,11 +70,7 @@ allele_qc <- function(target_data, ref_variants, col_to_flip = NULL,
         variant_df <- target_data %>% select(all_of(variant_cols))
         other_cols <- target_data %>% select(-all_of(variant_cols))
         target_data <- cbind(variant_id_to_df(variant_df), other_cols)
-        print("head(variant_df)")
-        print(head(variant_df))
      } else {
-        print("head(variant_data)")
-        print(head(target_data))
         target_data <- variant_id_to_df(target_data)
      }
   } else {
diff --git a/R/file_utils.R b/R/file_utils.R
index cec319ec..ce3253ff 100644
--- a/R/file_utils.R
+++ b/R/file_utils.R
@@ -799,8 +799,6 @@ load_twas_weights <- function(weight_db_files, conditions = NULL,
 load_rss_data <- function(sumstat_path, column_file_path, n_sample = 0, n_case = 0, n_control = 0, region = NULL,
                           extract_region_name = NULL, region_name_col = NULL, comment_string = "#") {
   # Read and preprocess column mapping
-  print("comment_stringin load_rss")
-  print(comment_string)
   if (is.null(comment_string)) {
     column_data <- read.table(column_file_path,
       header = FALSE, sep = ":",
@@ -816,8 +814,6 @@ load_rss_data <- function(sumstat_path, column_file_path, n_sample = 0, n_case =
     ) %>%
       rename(standard = V1, original = V2)
   }
-  print("column_data")
-  print(head(column_data))
   # Initialize sumstats variable
   sumstats <- NULL
   var_y <- NULL
diff --git a/R/twas.R b/R/twas.R
index 363d0ab5..228edf75 100644
--- a/R/twas.R
+++ b/R/twas.R
@@ -154,8 +154,6 @@ harmonize_twas <- function(twas_weights_data, ld_meta_file_path, gwas_meta_file,
                                               LD_list$combined_LD_variants, c("beta", "z"),   
                                               match_min_prop = 0, column_file_path = column_file_path, comment_string = comment_string)
         if(is.null(gwas_data_sumstats)) next
-        print("gwas_data_sumstats")
-        print(head(gwas_data_sumstats))
         # loop through context within the context group:
         for (context in contexts) {
           weights_matrix <- twas_weights_data[[molecular_id]][["weights"]][[context]]
@@ -254,7 +252,6 @@ harmonize_twas <- function(twas_weights_data, ld_meta_file_path, gwas_meta_file,
 harmonize_gwas <- function(gwas_file, query_region, ld_variants, col_to_flip=NULL, match_min_prop=0, column_file_path=NULL, comment_string="#"){
     if(is.null(gwas_file)| is.na(gwas_file)) stop("No GWAS file path provided. ")
     if (!is.null(column_file_path)) {  
-      print("use column_mapping_file")
       rss_result <- load_rss_data(  
         sumstat_path = gwas_file,  
         column_file_path = column_file_path,  
@@ -265,8 +262,6 @@ harmonize_gwas <- function(gwas_file, query_region, ld_variants, col_to_flip=NUL
     } else {  
       gwas_data_sumstats <- as.data.frame(tabix_region(gwas_file, query_region))  
     }
-    print("gwas_data_sumstats_in gwas")
-    print(head(gwas_data_sumstats))
     if (nrow(gwas_data_sumstats) == 0) {
         if (length(names(gwas_file))==0) names(gwas_file) <- gwas_file
         warning(paste0("No GWAS summary statistics found for the region of ", query_region, " in ", names(gwas_file), ". "))

From 1358c05750b30c630dd9e2a43b0f93a178597abe Mon Sep 17 00:00:00 2001
From: rl3328 <rl3328@users.noreply.github.com>
Date: Fri, 20 Jun 2025 05:34:20 +0000
Subject: [PATCH 3/3] Update documentation

---
 man/harmonize_twas.Rd | 8 +++++++-
 man/twas_pipeline.Rd  | 4 +++-
 2 files changed, 10 insertions(+), 2 deletions(-)

diff --git a/man/harmonize_twas.Rd b/man/harmonize_twas.Rd
index 6966cbb9..4d3d1469 100644
--- a/man/harmonize_twas.Rd
+++ b/man/harmonize_twas.Rd
@@ -5,7 +5,13 @@
 \title{Function to perform allele flip QC and harmonization on the weights and GWAS against LD for a region.
 FIXME: GWAS loading function from Haochen for both tabix & column-mapping yml application}
 \usage{
-harmonize_twas(twas_weights_data, ld_meta_file_path, gwas_meta_file)
+harmonize_twas(
+  twas_weights_data,
+  ld_meta_file_path,
+  gwas_meta_file,
+  column_file_path = NULL,
+  comment_string = "#"
+)
 }
 \arguments{
 \item{twas_weights_data}{List of list of twas weights output from from generate_twas_db function.}
diff --git a/man/twas_pipeline.Rd b/man/twas_pipeline.Rd
index 986bdf24..83dbaaa5 100644
--- a/man/twas_pipeline.Rd
+++ b/man/twas_pipeline.Rd
@@ -18,7 +18,9 @@ twas_pipeline(
   mr_coverage_column = "cs_coverage_0.95",
   quantile_twas = FALSE,
   output_twas_data = FALSE,
-  event_filters = NULL
+  event_filters = NULL,
+  column_file_path = NULL,
+  comment_string = "#"
 )
 }
 \arguments{