diff --git a/NAMESPACE b/NAMESPACE index 9f3d3773..5ed429d4 100644 --- a/NAMESPACE +++ b/NAMESPACE @@ -39,11 +39,13 @@ export(gbayes_rss) export(get_ctwas_meta_data) export(get_nested_element) export(glmnet_weights) +export(harmonize_gwas) export(harmonize_twas) export(lasso_weights) export(lbf_to_alpha) export(load_LD_matrix) export(load_genotype_region) +export(load_ld_snp_info) export(load_multitask_regional_data) export(load_multitrait_R_sumstat) export(load_multitrait_tensorqtl_sumstat) diff --git a/R/file_utils.R b/R/file_utils.R index f0c8ce1e..11af676a 100644 --- a/R/file_utils.R +++ b/R/file_utils.R @@ -990,3 +990,27 @@ batch_load_twas_weights <- function(twas_weights_results, meta_data_df, max_memo names(batches) <- NULL return(batches) } + +#' Function to load LD reference data variants +#' @export +#' @noRd +load_ld_snp_info <- function(ld_meta_file_path, region_of_interest) { + bim_file_path <- get_regional_ld_meta(ld_meta_file_path, region_of_interest)$intersections$bim_file_paths + bim_data <- lapply(bim_file_path, function(bim_file) as.data.frame(vroom(bim_file, col_names = FALSE))) + snp_info <- setNames(lapply(bim_data, function(info_table) { + # for TWAS and MR, the variance and allele_freq are not necessary + if (ncol(info_table) >= 8) { + info_table <- info_table[, c(1, 2, 4:8)] + colnames(info_table) <- c("chrom", "id", "pos", "alt", "ref", "variance", "allele_freq") + } else if (ncol(info_table) == 6) { + info_table <- info_table[, c(1, 2, 4:6)] + colnames(info_table) <- c("chrom", "id", "pos", "alt", "ref") + } else { + warning("Unexpected number of columns; skipping this element.") + return(NULL) + } + info_table$id <- gsub("chr", "", gsub("_", ":", info_table$id)) + return(info_table) + }), sapply(bim_file_path, function(x) gsub("chr", "", paste(strsplit(basename(x), "[_:/.]")[[1]][1:3], collapse = "_")))) + return(snp_info) +} \ No newline at end of file diff --git a/R/twas.R b/R/twas.R index d7ad1134..e732423f 100644 --- a/R/twas.R +++ 
b/R/twas.R @@ -94,18 +94,13 @@ harmonize_twas <- function(twas_weights_data, ld_meta_file_path, gwas_meta_file) } return(merged_groups) } - # function to load bim file variants - load_bim_file_info <- function(ld_meta_file_path, region_of_interest) { - bim_file_path <- get_regional_ld_meta(ld_meta_file_path, region_of_interest)$intersections$bim_file_paths - bim_data <- lapply(bim_file_path, function(bim_file) as.data.frame(vroom(bim_file, col_names = FALSE))) - return(bim_data) - } + # function to extract LD variance for the query region - query_variance <- function(ld_variant_info_data, extract_coordinates) { - ld_info_data <- do.call(rbind, ld_variant_info_data) - ld_pos <- as.numeric(ld_info_data[, 4]) + query_variance <- function(ld_info_data, extract_coordinates) { + ld_info_data <- do.call(rbind, ld_info_data) + ld_pos <- as.numeric(ld_info_data[, 3]) ld_info_data_filtered <- ld_info_data[ld_pos %in% extract_coordinates$pos, , drop = FALSE] - variance_df <- ld_info_data_filtered[, c(1, 4, 5:7)] # Extract the variance column (7th column) + variance_df <- ld_info_data_filtered[, c(1, 3:6)] # Extract the variance column (7th column) colnames(variance_df) <- c("chrom", "pos", "A1", "A2", "variance") return(variance_df) } @@ -126,23 +121,7 @@ harmonize_twas <- function(twas_weights_data, ld_meta_file_path, gwas_meta_file) region_of_interest <- data.frame(chrom = chrom, start = min(region_variants$pos), end = max(region_variants$pos)) LD_list <- load_LD_matrix(ld_meta_file_path, region_of_interest, region_variants) # load snp info once - ld_variant_info <- load_bim_file_info(ld_meta_file_path, region_of_interest) - snp_info <- setNames(lapply(ld_variant_info, function(info_table) { - # for TWAS and MR, the variance and allele_freq are not necessary - if (ncol(info_table) >= 8) { - info_table <- info_table[, c(1, 2, 4:8)] - colnames(info_table) <- c("chrom", "id", "pos", "alt", "ref", "variance", "allele_freq") - } else if (ncol(info_table) == 6) { - info_table <- 
info_table[, c(1, 2, 4:6)] - colnames(info_table) <- c("chrom", "id", "pos", "alt", "ref") - } else { - warning("Unexpected number of columns; skipping this element.") - return(NULL) - } - info_table$id <- gsub("chr", "", gsub("_", ":", info_table$id)) - return(info_table) - }), sapply(names(ld_variant_info), function(x) gsub("chr", "", paste(strsplit(basename(x), "[_:/.]")[[1]][1:3], collapse = "_")))) - + snp_info <- load_ld_snp_info(ld_meta_file_path, region_of_interest) # remove duplicate variants dup_idx <- which(duplicated(LD_list$combined_LD_variants)) if (length(dup_idx) >= 1) { @@ -171,18 +150,8 @@ harmonize_twas <- function(twas_weights_data, ld_meta_file_path, gwas_meta_file) # Step 3: load GWAS data for clustered context groups for (study in names(gwas_files)) { gwas_file <- gwas_files[study] - gwas_sumstats <- as.data.frame(tabix_region(gwas_file, query_region)) # extension for yml file for column name mapping - if (nrow(gwas_sumstats) == 0) { - warning(paste0("No GWAS summary statistics found for the region of ", query_region, " in ", study, ". 
")) - next - } - if (colnames(gwas_sumstats)[1] == "#chrom") colnames(gwas_sumstats)[1] <- "chrom" # colname update for tabix - gwas_sumstats$chrom <- as.integer(gwas_sumstats$chrom) - # check for overlapping variants - if (!any(gwas_sumstats$pos %in% gsub("\\:.*$", "", sub("^.*?\\:", "", LD_list$combined_LD_variants)))) next - gwas_allele_flip <- allele_qc(gwas_sumstats, LD_list$combined_LD_variants, c("beta", "z"), match_min_prop = 0) - gwas_data_sumstats <- gwas_allele_flip$target_data_qced # post-qc gwas data that is flipped and corrected - gwas study level - + gwas_data_sumstats <- harmonize_gwas(gwas_file, query_region=query_region, LD_list$combined_LD_variants, c("beta", "z"), match_min_prop = 0) + if(is.null(gwas_data_sumstats)) next # loop through context within the context group: for (context in contexts) { weights_matrix <- twas_weights_data[[molecular_id]][["weights"]][[context]] @@ -229,7 +198,9 @@ harmonize_twas <- function(twas_weights_data, ld_meta_file_path, gwas_meta_file) paste0("chr", variant_qc$target_data_qced$variant_id[variant_qc$target_data_qced$variant_id %in% postqc_weight_variants]) }) } - #rownames(weights_matrix_subset) <- if (!grepl("^chr", rownames(weights_matrix_subset)[1])) paste0("chr", rownames(weights_matrix_subset)) else rownames(weights_matrix_subset) + rm(weights_matrix) # context specific original weight matrix + gc() + if (nrow(weights_matrix_subset) > 0) { rownames(weights_matrix_subset) <- if (!grepl("^chr", rownames(weights_matrix_subset)[1])) { paste0("chr", rownames(weights_matrix_subset)) @@ -243,7 +214,7 @@ harmonize_twas <- function(twas_weights_data, ld_meta_file_path, gwas_meta_file) results[[molecular_id]][["variant_names"]][[context]][[study]] <- rownames(weights_matrix_subset) # Step 6: scale weights by variance - variance_df <- query_variance(ld_variant_info, all_variants) %>% + variance_df <- query_variance(snp_info, all_variants) %>% mutate(variants = paste(chrom, pos, A2, A1, sep = ":")) variance <- 
variance_df[match(rownames(weights_matrix_subset), paste0("chr", variance_df$variants)), "variance"] results[[molecular_id]][["weights_qced"]][[context]][[study]] <- list(scaled_weights = weights_matrix_subset * sqrt(variance), weights = weights_matrix_subset) @@ -270,6 +241,31 @@ harmonize_twas <- function(twas_weights_data, ld_meta_file_path, gwas_meta_file) return(list(twas_data_qced = results, snp_info = snp_info)) } +#' Harmonize GWAS Summary Statistics +#' perform harmonization on gwas summary statistics for a chromosome data or specific queried region +#' @param gwas_file A string for the file path of gwas summary statistics file that is already tabix indexed +#' @param query_region A string for region of query for tabix-indexed gwas summary statistics file in the format of chr:start-end +#' @param ld_variants A character vector of LD reference variant identifiers (chrom:pos:A2:A1) used for overlap checking and allele QC +#' @param col_to_flip A character vector of summary statistic columns (e.g. c("beta", "z")) to sign-flip when alleles are swapped relative to the LD reference; passed to allele_qc +#' @param match_min_prop Minimum proportion of variants required to match the LD reference; passed to allele_qc +#' @noRd +#' @export +harmonize_gwas <- function(gwas_file, query_region, ld_variants, col_to_flip=NULL, match_min_prop=0){ + if (is.null(gwas_file) || is.na(gwas_file)) stop("No GWAS file path provided. ") + gwas_data_sumstats <- as.data.frame(tabix_region(gwas_file, query_region)) # extension for yml file for column name mapping + if (nrow(gwas_data_sumstats) == 0) { + warning(paste0("No GWAS summary statistics found for the region of ", query_region, " in ", gwas_file, ". ")) + return(NULL) + } + if (colnames(gwas_data_sumstats)[1] == "#chrom") colnames(gwas_data_sumstats)[1] <- "chrom" # colname update for tabix + gwas_data_sumstats$chrom <- as.integer(gwas_data_sumstats$chrom) # keep integer chrom as the pre-refactor inline code did + # check for overlapping variants + if (!any(gwas_data_sumstats$pos %in% gsub("\\:.*$", "", sub("^.*?\\:", "", ld_variants)))) return(NULL) + gwas_allele_flip <- allele_qc(gwas_data_sumstats, ld_variants, col_to_flip=col_to_flip, match_min_prop = match_min_prop) + gwas_data_sumstats <- gwas_allele_flip$target_data_qced # post-qc gwas data that is flipped and corrected - gwas study level + return(gwas_data_sumstats) +} + +#' Function to perform TWAS analysis for across multiple contexts. 
#' This function peforms TWAS analysis for multiple contexts for imputable genes within an LD region and summarize the twas results. #' @param twas_weights_data List of list of twas weights output from generate_twas_db function. @@ -359,17 +351,8 @@ twas_pipeline <- function(twas_weights_data, z_snp <- list() for (study in studies) { z_gene_list[[study]] <- twas_table[twas_table$gwas_study == study, , drop = FALSE] - z_snp[[study]] <- do.call(rbind, lapply(post_qc_twas_data, function(x) { - if (study %in% names(x$gwas_qced)) { - return(x$gwas_qced[[study]]) - } - })) - colnames(z_snp[[study]])[which(colnames(z_snp[[study]]) == "variant_id")] <- "id" - z_snp[[study]] <- z_snp[[study]][, c("id", "A1", "A2", "z")] - z_snp[[study]] <- z_snp[[study]][!duplicated(z_snp[[study]]$id), , drop = FALSE] - z_snp[[study]]$id <- gsub("chr", "", z_snp[[study]]$id) } - result <- list(weights = weights, z_gene = z_gene_list, z_snp = z_snp) + result <- list(weights = weights, z_gene = z_gene_list) if (!is.null(susie_weights_intermediate_qced)) { result$susie_weights_intermediate_qced <- susie_weights_intermediate_qced } @@ -532,12 +515,14 @@ twas_pipeline <- function(twas_weights_data, mr_context_table <- do.call(rbind, lapply(study_results, function(x) x$mr_rs_df)) return(list(twas_context_table = twas_context_table, mr_context_table = mr_context_table)) }) + twas_data_qced[[weight_db]][["LD"]] <- NULL + twas_weights_data[[weight_db]] <- NULL twas_gene_table <- do.call(rbind, lapply(twas_gene_results, function(x) x$twas_context_table)) mr_gene_table <- do.call(rbind, lapply(twas_gene_results, function(x) x$mr_context_table)) - twas_weights_data[[weight_db]] <- NULL - return(list(twas_table = twas_gene_table, twas_data_qced = twas_data_qced[weight_db], mr_result = mr_gene_table, snp_info = twas_data_qced_result$snp_info)) + return(list(twas_table = twas_gene_table, twas_data_qced = twas_data_qced[weight_db], mr_result = mr_gene_table)) }) rm(twas_data_qced_result) + gc() 
twas_results_db <- twas_results_db[!sapply(twas_results_db, function(x) is.null(x) || (is.list(x) && all(sapply(x, is.null))))] if (length(twas_results_db) == 0) { return(NULL) @@ -545,8 +530,9 @@ twas_pipeline <- function(twas_weights_data, twas_results_table <- do.call(rbind, lapply(twas_results_db, function(x) x$twas_table)) mr_results <- do.call(rbind, lapply(twas_results_db, function(x) x$mr_result)) twas_data <- do.call(c, lapply(twas_results_db, function(x) x$twas_data_qced)) - snp_info <- do.call(c, lapply(twas_results_db, function(x) x$snp_info)) + # snp_info <- do.call(c, lapply(twas_results_db, function(x) x$snp_info)) rm(twas_results_db) + gc() # Step 2: Summarize and merge twas cv results and region information for all methods for all contexts for imputable genes. twas_table <- do.call(rbind, lapply(names(twas_data), function(molecular_id) { @@ -622,7 +608,7 @@ twas_pipeline <- function(twas_weights_data, } if (output_twas_data & nrow(twas_table) > 0) { twas_data_subset <- format_twas_data(twas_data, twas_table) - if (!is.null(twas_data_subset)) twas_data_subset$snp_info <- snp_info + # if (!is.null(twas_data_subset)) twas_data_subset$snp_info <- snp_info } else { twas_data_subset <- NULL }