Skip to content
Merged
Show file tree
Hide file tree
Changes from all commits
Commits
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
2 changes: 2 additions & 0 deletions NAMESPACE
Original file line number Diff line number Diff line change
Expand Up @@ -39,11 +39,13 @@ export(gbayes_rss)
export(get_ctwas_meta_data)
export(get_nested_element)
export(glmnet_weights)
export(harmonize_gwas)
export(harmonize_twas)
export(lasso_weights)
export(lbf_to_alpha)
export(load_LD_matrix)
export(load_genotype_region)
export(load_ld_snp_info)
export(load_multitask_regional_data)
export(load_multitrait_R_sumstat)
export(load_multitrait_tensorqtl_sumstat)
Expand Down
24 changes: 24 additions & 0 deletions R/file_utils.R
Original file line number Diff line number Diff line change
Expand Up @@ -990,3 +990,27 @@ batch_load_twas_weights <- function(twas_weights_results, meta_data_df, max_memo
names(batches) <- NULL
return(batches)
}

#' Function to load LD reference data variants
#'
#' Reads the per-block PLINK .bim files that intersect `region_of_interest`
#' (as resolved by `get_regional_ld_meta()`) and returns a named list of
#' standardized variant-info tables, one per LD block.
#'
#' @param ld_meta_file_path Path to the LD metadata file understood by
#'   `get_regional_ld_meta()`.
#' @param region_of_interest A data.frame with `chrom`, `start`, `end`
#'   describing the query region.
#' @return A named list of data.frames with columns
#'   chrom, id, pos, alt, ref (+ variance, allele_freq when the .bim file has
#'   >= 8 columns). Elements with an unexpected column count are NULL.
#' @export
#' @noRd
load_ld_snp_info <- function(ld_meta_file_path, region_of_interest) {
  bim_file_path <- get_regional_ld_meta(ld_meta_file_path, region_of_interest)$intersections$bim_file_paths
  bim_data <- lapply(bim_file_path, function(bim_file) as.data.frame(vroom(bim_file, col_names = FALSE)))
  # Derive element names from the file paths directly: `lapply()` over an
  # unnamed character vector returns an UNNAMED list, so the previous
  # `names(bim_data)` lookup yielded NULL and `setNames()` failed with a
  # names-length mismatch. Name format: "chrom_start_end" without "chr".
  block_names <- vapply(bim_file_path, function(x) {
    gsub("chr", "", paste(strsplit(basename(x), "[_:/.]")[[1]][1:3], collapse = "_"))
  }, character(1), USE.NAMES = FALSE)
  snp_info <- setNames(lapply(bim_data, function(info_table) {
    # for TWAS and MR, the variance and allele_freq are not necessary
    if (ncol(info_table) >= 8) {
      info_table <- info_table[, c(1, 2, 4:8)]
      colnames(info_table) <- c("chrom", "id", "pos", "alt", "ref", "variance", "allele_freq")
    } else if (ncol(info_table) == 6) {
      info_table <- info_table[, c(1, 2, 4:6)]
      colnames(info_table) <- c("chrom", "id", "pos", "alt", "ref")
    } else {
      warning("Unexpected number of columns; skipping this element.")
      return(NULL)
    }
    # Normalize ids like "chr1_100_A_G" to "1:100:A:G" to match LD variant ids.
    info_table$id <- gsub("chr", "", gsub("_", ":", info_table$id))
    return(info_table)
  }), block_names)
  return(snp_info)
}
96 changes: 41 additions & 55 deletions R/twas.R
Original file line number Diff line number Diff line change
Expand Up @@ -94,18 +94,13 @@ harmonize_twas <- function(twas_weights_data, ld_meta_file_path, gwas_meta_file)
}
return(merged_groups)
}
# function to load bim file variants
load_bim_file_info <- function(ld_meta_file_path, region_of_interest) {
bim_file_path <- get_regional_ld_meta(ld_meta_file_path, region_of_interest)$intersections$bim_file_paths
bim_data <- lapply(bim_file_path, function(bim_file) as.data.frame(vroom(bim_file, col_names = FALSE)))
return(bim_data)
}

# function to extract LD variance for the query region
query_variance <- function(ld_variant_info_data, extract_coordinates) {
ld_info_data <- do.call(rbind, ld_variant_info_data)
ld_pos <- as.numeric(ld_info_data[, 4])
query_variance <- function(ld_info_data, extract_coordinates) {
ld_info_data <- do.call(rbind, ld_info_data)
ld_pos <- as.numeric(ld_info_data[, 3])
ld_info_data_filtered <- ld_info_data[ld_pos %in% extract_coordinates$pos, , drop = FALSE]
variance_df <- ld_info_data_filtered[, c(1, 4, 5:7)] # Extract the variance column (7th column)
variance_df <- ld_info_data_filtered[, c(1, 3:6)] # Extract the variance column (7th column)
colnames(variance_df) <- c("chrom", "pos", "A1", "A2", "variance")
return(variance_df)
}
Expand All @@ -126,23 +121,7 @@ harmonize_twas <- function(twas_weights_data, ld_meta_file_path, gwas_meta_file)
region_of_interest <- data.frame(chrom = chrom, start = min(region_variants$pos), end = max(region_variants$pos))
LD_list <- load_LD_matrix(ld_meta_file_path, region_of_interest, region_variants)
# load snp info once
ld_variant_info <- load_bim_file_info(ld_meta_file_path, region_of_interest)
snp_info <- setNames(lapply(ld_variant_info, function(info_table) {
# for TWAS and MR, the variance and allele_freq are not necessary
if (ncol(info_table) >= 8) {
info_table <- info_table[, c(1, 2, 4:8)]
colnames(info_table) <- c("chrom", "id", "pos", "alt", "ref", "variance", "allele_freq")
} else if (ncol(info_table) == 6) {
info_table <- info_table[, c(1, 2, 4:6)]
colnames(info_table) <- c("chrom", "id", "pos", "alt", "ref")
} else {
warning("Unexpected number of columns; skipping this element.")
return(NULL)
}
info_table$id <- gsub("chr", "", gsub("_", ":", info_table$id))
return(info_table)
}), sapply(names(ld_variant_info), function(x) gsub("chr", "", paste(strsplit(basename(x), "[_:/.]")[[1]][1:3], collapse = "_"))))

snp_info <- load_ld_snp_info(ld_meta_file_path, region_of_interest)
# remove duplicate variants
dup_idx <- which(duplicated(LD_list$combined_LD_variants))
if (length(dup_idx) >= 1) {
Expand Down Expand Up @@ -171,18 +150,8 @@ harmonize_twas <- function(twas_weights_data, ld_meta_file_path, gwas_meta_file)
# Step 3: load GWAS data for clustered context groups
for (study in names(gwas_files)) {
gwas_file <- gwas_files[study]
gwas_sumstats <- as.data.frame(tabix_region(gwas_file, query_region)) # extension for yml file for column name mapping
if (nrow(gwas_sumstats) == 0) {
warning(paste0("No GWAS summary statistics found for the region of ", query_region, " in ", study, ". "))
next
}
if (colnames(gwas_sumstats)[1] == "#chrom") colnames(gwas_sumstats)[1] <- "chrom" # colname update for tabix
gwas_sumstats$chrom <- as.integer(gwas_sumstats$chrom)
# check for overlapping variants
if (!any(gwas_sumstats$pos %in% gsub("\\:.*$", "", sub("^.*?\\:", "", LD_list$combined_LD_variants)))) next
gwas_allele_flip <- allele_qc(gwas_sumstats, LD_list$combined_LD_variants, c("beta", "z"), match_min_prop = 0)
gwas_data_sumstats <- gwas_allele_flip$target_data_qced # post-qc gwas data that is flipped and corrected - gwas study level

gwas_data_sumstats <- harmonize_gwas(gwas_file, query_region=query_region, LD_list$combined_LD_variants, c("beta", "z"), match_min_prop = 0)
if(is.null(gwas_data_sumstats)) next
# loop through context within the context group:
for (context in contexts) {
weights_matrix <- twas_weights_data[[molecular_id]][["weights"]][[context]]
Expand Down Expand Up @@ -229,7 +198,9 @@ harmonize_twas <- function(twas_weights_data, ld_meta_file_path, gwas_meta_file)
paste0("chr", variant_qc$target_data_qced$variant_id[variant_qc$target_data_qced$variant_id %in% postqc_weight_variants])
})
}
#rownames(weights_matrix_subset) <- if (!grepl("^chr", rownames(weights_matrix_subset)[1])) paste0("chr", rownames(weights_matrix_subset)) else rownames(weights_matrix_subset)
rm(weights_matrix) # context specific original weight matrix
gc()

if (nrow(weights_matrix_subset) > 0) {
rownames(weights_matrix_subset) <- if (!grepl("^chr", rownames(weights_matrix_subset)[1])) {
paste0("chr", rownames(weights_matrix_subset))
Expand All @@ -243,7 +214,7 @@ harmonize_twas <- function(twas_weights_data, ld_meta_file_path, gwas_meta_file)
results[[molecular_id]][["variant_names"]][[context]][[study]] <- rownames(weights_matrix_subset)

# Step 6: scale weights by variance
variance_df <- query_variance(ld_variant_info, all_variants) %>%
variance_df <- query_variance(snp_info, all_variants) %>%
mutate(variants = paste(chrom, pos, A2, A1, sep = ":"))
variance <- variance_df[match(rownames(weights_matrix_subset), paste0("chr", variance_df$variants)), "variance"]
results[[molecular_id]][["weights_qced"]][[context]][[study]] <- list(scaled_weights = weights_matrix_subset * sqrt(variance), weights = weights_matrix_subset)
Expand All @@ -270,6 +241,27 @@ harmonize_twas <- function(twas_weights_data, ld_meta_file_path, gwas_meta_file)
return(list(twas_data_qced = results, snp_info = snp_info))
}

#' Harmonize GWAS Summary Statistics
#'
#' Perform harmonization (allele QC / sign flipping against an LD reference)
#' on GWAS summary statistics for a whole chromosome or a specific queried
#' region of a tabix-indexed file.
#'
#' @param gwas_file A string for the file path of the GWAS summary statistics
#'   file that is already tabix indexed.
#' @param query_region A string for the region to query from the tabix-indexed
#'   GWAS summary statistics file, in the format chr:start-end.
#' @param ld_variants Character vector of LD reference variant ids in
#'   chr:pos:ref:alt form, used to match and QC the GWAS variants.
#' @param col_to_flip Optional character vector of effect columns (e.g.
#'   c("beta", "z")) whose signs are flipped on allele mismatch by `allele_qc()`.
#' @param match_min_prop Minimum proportion of matching variants required;
#'   passed through to `allele_qc()`.
#' @param study Optional study label, used only to make the no-data warning
#'   message more informative. Defaults to NULL for backward compatibility.
#' @return A data.frame of post-QC GWAS summary statistics, or NULL when the
#'   region has no GWAS rows or no positional overlap with `ld_variants`.
#' @noRd
#' @export
harmonize_gwas <- function(gwas_file, query_region, ld_variants, col_to_flip = NULL, match_min_prop = 0, study = NULL) {
  # `||` short-circuits so a NULL path is caught before is.na() runs; the
  # previous elementwise `|` produced a zero-length condition for NULL input
  # ("argument is of length zero") instead of the intended error message.
  if (is.null(gwas_file) || is.na(gwas_file)) stop("No GWAS file path provided. ")
  gwas_data_sumstats <- as.data.frame(tabix_region(gwas_file, query_region)) # extension for yml file for column name mapping
  if (nrow(gwas_data_sumstats) == 0) {
    # Previously this message referenced `study`, which was not defined in
    # this function's scope and errored whenever the branch was reached.
    warning(paste0(
      "No GWAS summary statistics found for the region of ", query_region,
      if (!is.null(study)) paste0(" in ", study) else "", ". "
    ))
    return(NULL)
  }
  if (colnames(gwas_data_sumstats)[1] == "#chrom") colnames(gwas_data_sumstats)[1] <- "chrom" # colname update for tabix
  # check for overlapping variants: positions are the second field of the
  # chr:pos:ref:alt LD variant ids.
  if (!any(gwas_data_sumstats$pos %in% gsub("\\:.*$", "", sub("^.*?\\:", "", ld_variants)))) return(NULL)
  # NOTE(review): the earlier inline version coerced chrom with
  # as.integer(gwas_sumstats$chrom) before allele_qc; that step is absent here
  # — confirm downstream consumers tolerate a character chrom column.
  gwas_allele_flip <- allele_qc(gwas_data_sumstats, ld_variants, col_to_flip = col_to_flip, match_min_prop = match_min_prop)
  gwas_data_sumstats <- gwas_allele_flip$target_data_qced # post-qc gwas data that is flipped and corrected - gwas study level
  return(gwas_data_sumstats)
}

#' Function to perform TWAS analysis for across multiple contexts.
#' This function peforms TWAS analysis for multiple contexts for imputable genes within an LD region and summarize the twas results.
#' @param twas_weights_data List of list of twas weights output from generate_twas_db function.
Expand Down Expand Up @@ -359,17 +351,8 @@ twas_pipeline <- function(twas_weights_data,
z_snp <- list()
for (study in studies) {
z_gene_list[[study]] <- twas_table[twas_table$gwas_study == study, , drop = FALSE]
z_snp[[study]] <- do.call(rbind, lapply(post_qc_twas_data, function(x) {
if (study %in% names(x$gwas_qced)) {
return(x$gwas_qced[[study]])
}
}))
colnames(z_snp[[study]])[which(colnames(z_snp[[study]]) == "variant_id")] <- "id"
z_snp[[study]] <- z_snp[[study]][, c("id", "A1", "A2", "z")]
z_snp[[study]] <- z_snp[[study]][!duplicated(z_snp[[study]]$id), , drop = FALSE]
z_snp[[study]]$id <- gsub("chr", "", z_snp[[study]]$id)
}
result <- list(weights = weights, z_gene = z_gene_list, z_snp = z_snp)
result <- list(weights = weights, z_gene = z_gene_list)
if (!is.null(susie_weights_intermediate_qced)) {
result$susie_weights_intermediate_qced <- susie_weights_intermediate_qced
}
Expand Down Expand Up @@ -532,21 +515,24 @@ twas_pipeline <- function(twas_weights_data,
mr_context_table <- do.call(rbind, lapply(study_results, function(x) x$mr_rs_df))
return(list(twas_context_table = twas_context_table, mr_context_table = mr_context_table))
})
twas_data_qced[[weight_db]][["LD"]] <- NULL
twas_weights_data[[weight_db]] <- NULL
twas_gene_table <- do.call(rbind, lapply(twas_gene_results, function(x) x$twas_context_table))
mr_gene_table <- do.call(rbind, lapply(twas_gene_results, function(x) x$mr_context_table))
twas_weights_data[[weight_db]] <- NULL
return(list(twas_table = twas_gene_table, twas_data_qced = twas_data_qced[weight_db], mr_result = mr_gene_table, snp_info = twas_data_qced_result$snp_info))
return(list(twas_table = twas_gene_table, twas_data_qced = twas_data_qced[weight_db], mr_result = mr_gene_table))
})
rm(twas_data_qced_result)
gc()
twas_results_db <- twas_results_db[!sapply(twas_results_db, function(x) is.null(x) || (is.list(x) && all(sapply(x, is.null))))]
if (length(twas_results_db) == 0) {
return(NULL)
}
twas_results_table <- do.call(rbind, lapply(twas_results_db, function(x) x$twas_table))
mr_results <- do.call(rbind, lapply(twas_results_db, function(x) x$mr_result))
twas_data <- do.call(c, lapply(twas_results_db, function(x) x$twas_data_qced))
snp_info <- do.call(c, lapply(twas_results_db, function(x) x$snp_info))
# snp_info <- do.call(c, lapply(twas_results_db, function(x) x$snp_info))
rm(twas_results_db)
gc()

# Step 2: Summarize and merge twas cv results and region information for all methods for all contexts for imputable genes.
twas_table <- do.call(rbind, lapply(names(twas_data), function(molecular_id) {
Expand Down Expand Up @@ -622,7 +608,7 @@ twas_pipeline <- function(twas_weights_data,
}
if (output_twas_data & nrow(twas_table) > 0) {
twas_data_subset <- format_twas_data(twas_data, twas_table)
if (!is.null(twas_data_subset)) twas_data_subset$snp_info <- snp_info
# if (!is.null(twas_data_subset)) twas_data_subset$snp_info <- snp_info
} else {
twas_data_subset <- NULL
}
Expand Down
Loading