From e6a8b0314302ee6f61f074a3d9ae3c2e0323a0c1 Mon Sep 17 00:00:00 2001 From: rl3328 Date: Thu, 19 Jun 2025 18:43:22 -0400 Subject: [PATCH 1/3] apply column mapping --- R/allele_qc.R | 4 ++++ R/file_utils.R | 4 ++++ R/twas.R | 31 +++++++++++++++++++++++++------ 3 files changed, 33 insertions(+), 6 deletions(-) diff --git a/R/allele_qc.R b/R/allele_qc.R index 7b32d0a8..3f02cb68 100644 --- a/R/allele_qc.R +++ b/R/allele_qc.R @@ -70,7 +70,11 @@ allele_qc <- function(target_data, ref_variants, col_to_flip = NULL, variant_df <- target_data %>% select(all_of(variant_cols)) other_cols <- target_data %>% select(-all_of(variant_cols)) target_data <- cbind(variant_id_to_df(variant_df), other_cols) + print("head(variant_df)") + print(head(variant_df)) } else { + print("head(variant_data)") + print(head(target_data)) target_data <- variant_id_to_df(target_data) } } else { diff --git a/R/file_utils.R b/R/file_utils.R index ce3253ff..cec319ec 100644 --- a/R/file_utils.R +++ b/R/file_utils.R @@ -799,6 +799,8 @@ load_twas_weights <- function(weight_db_files, conditions = NULL, load_rss_data <- function(sumstat_path, column_file_path, n_sample = 0, n_case = 0, n_control = 0, region = NULL, extract_region_name = NULL, region_name_col = NULL, comment_string = "#") { # Read and preprocess column mapping + print("comment_stringin load_rss") + print(comment_string) if (is.null(comment_string)) { column_data <- read.table(column_file_path, header = FALSE, sep = ":", @@ -814,6 +816,8 @@ load_rss_data <- function(sumstat_path, column_file_path, n_sample = 0, n_case = ) %>% rename(standard = V1, original = V2) } + print("column_data") + print(head(column_data)) # Initialize sumstats variable sumstats <- NULL var_y <- NULL diff --git a/R/twas.R b/R/twas.R index dcc3bd5a..363d0ab5 100644 --- a/R/twas.R +++ b/R/twas.R @@ -26,7 +26,7 @@ #' @importFrom S4Vectors queryHits subjectHits #' @importFrom IRanges IRanges findOverlaps start end reduce #' @export -harmonize_twas <- function(twas_weights_data, ld_meta_file_path, gwas_meta_file) { +harmonize_twas <- function(twas_weights_data, ld_meta_file_path, gwas_meta_file, column_file_path = NULL, comment_string = "#") { # Function to group contexts based on start and end positions group_contexts_by_region <- function(twas_weights_data, molecular_id, chrom, tolerance = 5000) { region_info_df <- do.call(rbind, lapply(names(twas_weights_data$weights), function(context) { @@ -150,8 +150,12 @@ harmonize_twas <- function(twas_weights_data, ld_meta_file_path, gwas_meta_file) # Step 3: load GWAS data for clustered context groups for (study in names(gwas_files)) { gwas_file <- gwas_files[study] - gwas_data_sumstats <- harmonize_gwas(gwas_file, query_region=query_region, LD_list$combined_LD_variants, c("beta", "z"), match_min_prop = 0) + gwas_data_sumstats <- harmonize_gwas(gwas_file, query_region=query_region, + LD_list$combined_LD_variants, c("beta", "z"), + match_min_prop = 0, column_file_path = column_file_path, comment_string = comment_string) if(is.null(gwas_data_sumstats)) next + print("gwas_data_sumstats") + print(head(gwas_data_sumstats)) # loop through context within the context group: for (context in contexts) { weights_matrix <- twas_weights_data[[molecular_id]][["weights"]][[context]] @@ -247,9 +251,22 @@ harmonize_twas <- function(twas_weights_data, ld_meta_file_path, gwas_meta_file) #' @param query_region A string for region of query for tabix-indexed gwas summary statistics file in the format of chr:start-end #' @noRd #' @export -harmonize_gwas <- function(gwas_file, query_region, ld_variants, col_to_flip=NULL, match_min_prop=0){ +harmonize_gwas <- function(gwas_file, query_region, ld_variants, col_to_flip=NULL, match_min_prop=0, column_file_path=NULL, comment_string="#"){ if(is.null(gwas_file)| is.na(gwas_file)) stop("No GWAS file path provided. ") - gwas_data_sumstats <- as.data.frame(tabix_region(gwas_file, query_region)) # extension for yml file for column name mapping + if (!is.null(column_file_path)) { + print("use column_mapping_file") + rss_result <- load_rss_data( + sumstat_path = gwas_file, + column_file_path = column_file_path, + region = query_region, + comment_string = comment_string + ) + gwas_data_sumstats <- rss_result$sumstats + } else { + gwas_data_sumstats <- as.data.frame(tabix_region(gwas_file, query_region)) + } + print("gwas_data_sumstats_in gwas") + print(head(gwas_data_sumstats)) if (nrow(gwas_data_sumstats) == 0) { if (length(names(gwas_file))==0) names(gwas_file) <- gwas_file warning(paste0("No GWAS summary statistics found for the region of ", query_region, " in ", names(gwas_file), ". ")) @@ -287,7 +304,9 @@ twas_pipeline <- function(twas_weights_data, mr_coverage_column = "cs_coverage_0.95", quantile_twas = FALSE, output_twas_data = FALSE, - event_filters=NULL) { + event_filters=NULL, + column_file_path = NULL, + comment_string="#") { # internal function to format TWAS output format_twas_data <- function(post_qc_twas_data, twas_table) { weights_list <- do.call(c, lapply(names(post_qc_twas_data), function(molecular_id) { @@ -421,7 +440,7 @@ twas_pipeline <- function(twas_weights_data, } # harmonize twas weights and gwas sumstats against LD - twas_data_qced_result <- harmonize_twas(twas_weights_data, ld_meta_file_path, gwas_meta_file) + twas_data_qced_result <- harmonize_twas(twas_weights_data, ld_meta_file_path, gwas_meta_file, column_file_path = column_file_path, comment_string = comment_string) twas_results_db <- lapply(names(twas_weights_data), function(weight_db) { twas_weights_data[[weight_db]][["molecular_id"]] <- weight_db twas_data_qced <- twas_data_qced_result$twas_data_qced From b4d3815983957865d672d2d582e960ad27c05481 Mon Sep 17 00:00:00 2001 From: rl3328 Date: Fri, 20 Jun 2025 01:29:02 -0400 Subject: [PATCH 2/3] clean up --- R/allele_qc.R | 4 ---- R/file_utils.R | 4 ---- R/twas.R | 5 ----- 3 files changed, 13 deletions(-) diff --git a/R/allele_qc.R b/R/allele_qc.R index 3f02cb68..7b32d0a8 100644 --- a/R/allele_qc.R +++ b/R/allele_qc.R @@ -70,11 +70,7 @@ allele_qc <- function(target_data, ref_variants, col_to_flip = NULL, variant_df <- target_data %>% select(all_of(variant_cols)) other_cols <- target_data %>% select(-all_of(variant_cols)) target_data <- cbind(variant_id_to_df(variant_df), other_cols) - print("head(variant_df)") - print(head(variant_df)) } else { - print("head(variant_data)") - print(head(target_data)) target_data <- variant_id_to_df(target_data) } } else { diff --git a/R/file_utils.R b/R/file_utils.R index cec319ec..ce3253ff 100644 --- a/R/file_utils.R +++ b/R/file_utils.R @@ -799,8 +799,6 @@ load_twas_weights <- function(weight_db_files, conditions = NULL, load_rss_data <- function(sumstat_path, column_file_path, n_sample = 0, n_case = 0, n_control = 0, region = NULL, extract_region_name = NULL, region_name_col = NULL, comment_string = "#") { # Read and preprocess column mapping - print("comment_stringin load_rss") - print(comment_string) if (is.null(comment_string)) { column_data <- read.table(column_file_path, header = FALSE, sep = ":", @@ -816,8 +814,6 @@ load_rss_data <- function(sumstat_path, column_file_path, n_sample = 0, n_case = ) %>% rename(standard = V1, original = V2) } - print("column_data") - print(head(column_data)) # Initialize sumstats variable sumstats <- NULL var_y <- NULL diff --git a/R/twas.R b/R/twas.R index 363d0ab5..228edf75 100644 --- a/R/twas.R +++ b/R/twas.R @@ -154,8 +154,6 @@ harmonize_twas <- function(twas_weights_data, ld_meta_file_path, gwas_meta_file, LD_list$combined_LD_variants, c("beta", "z"), match_min_prop = 0, column_file_path = column_file_path, comment_string = comment_string) if(is.null(gwas_data_sumstats)) next - print("gwas_data_sumstats") - print(head(gwas_data_sumstats)) # loop through context within the context group: for (context in contexts) { weights_matrix <- twas_weights_data[[molecular_id]][["weights"]][[context]] @@ -254,7 +252,6 @@ harmonize_twas <- function(twas_weights_data, ld_meta_file_path, gwas_meta_file, harmonize_gwas <- function(gwas_file, query_region, ld_variants, col_to_flip=NULL, match_min_prop=0, column_file_path=NULL, comment_string="#"){ if(is.null(gwas_file)| is.na(gwas_file)) stop("No GWAS file path provided. ") if (!is.null(column_file_path)) { - print("use column_mapping_file") rss_result <- load_rss_data( sumstat_path = gwas_file, column_file_path = column_file_path, @@ -265,8 +262,6 @@ harmonize_gwas <- function(gwas_file, query_region, ld_variants, col_to_flip=NUL } else { gwas_data_sumstats <- as.data.frame(tabix_region(gwas_file, query_region)) } - print("gwas_data_sumstats_in gwas") - print(head(gwas_data_sumstats)) if (nrow(gwas_data_sumstats) == 0) { if (length(names(gwas_file))==0) names(gwas_file) <- gwas_file warning(paste0("No GWAS summary statistics found for the region of ", query_region, " in ", names(gwas_file), ". ")) From 1358c05750b30c630dd9e2a43b0f93a178597abe Mon Sep 17 00:00:00 2001 From: rl3328 Date: Fri, 20 Jun 2025 05:34:20 +0000 Subject: [PATCH 3/3] Update documentation --- man/harmonize_twas.Rd | 8 +++++++- man/twas_pipeline.Rd | 4 +++- 2 files changed, 10 insertions(+), 2 deletions(-) diff --git a/man/harmonize_twas.Rd b/man/harmonize_twas.Rd index 6966cbb9..4d3d1469 100644 --- a/man/harmonize_twas.Rd +++ b/man/harmonize_twas.Rd @@ -5,7 +5,13 @@ \title{Function to perform allele flip QC and harmonization on the weights and GWAS against LD for a region. FIXME: GWAS loading function from Haochen for both tabix & column-mapping yml application} \usage{ -harmonize_twas(twas_weights_data, ld_meta_file_path, gwas_meta_file) +harmonize_twas( + twas_weights_data, + ld_meta_file_path, + gwas_meta_file, + column_file_path = NULL, + comment_string = "#" +) } \arguments{ \item{twas_weights_data}{List of list of twas weights output from from generate_twas_db function.} diff --git a/man/twas_pipeline.Rd b/man/twas_pipeline.Rd index 986bdf24..83dbaaa5 100644 --- a/man/twas_pipeline.Rd +++ b/man/twas_pipeline.Rd @@ -18,7 +18,9 @@ twas_pipeline( mr_coverage_column = "cs_coverage_0.95", quantile_twas = FALSE, output_twas_data = FALSE, - event_filters = NULL + event_filters = NULL, + column_file_path = NULL, + comment_string = "#" ) } \arguments{