diff --git a/NAMESPACE b/NAMESPACE index d8ad75f..c0a9e9a 100644 --- a/NAMESPACE +++ b/NAMESPACE @@ -13,6 +13,7 @@ export(nice_tSNE) export(power_analysis) export(save_results) export(split_cases) +export(plotheatmap_leadingedge_GSEA) export(tpm) import(ggplot2) importFrom(magrittr,"%>%") diff --git a/R/plotheatmap_leadingedge_GSEA.R b/R/plotheatmap_leadingedge_GSEA.R new file mode 100644 index 0000000..c770b3e --- /dev/null +++ b/R/plotheatmap_leadingedge_GSEA.R @@ -0,0 +1,185 @@ +#' Plot leading edge heatmaps from GSEA results +#' +#' Generates heatmaps for leading edge genes of each gene set from GSEA output. +#' +#' @param main_dir_2 Optional base directory to prepend to file paths. +#' @param expression_file Path to the expression data file (tab-delimited) or relative to main_dir_2. +#' @param metadata_file Path to the metadata file (Excel) or relative to main_dir_2. +#' @param gmt_file Path to the GMT file defining gene sets or relative to main_dir_2. +#' @param ranked_genes_file Path to the ranked genes list file or relative to main_dir_2. +#' @param gsea_file Path to the GSEA results file with leading edge genes or relative to main_dir_2. +#' @param output_dir Directory to save heatmaps and optional TSV; default "leading_edge_heatmaps". +#' @param sample_col Name of the sample ID column in metadata; default "Sample". +#' @param group_col Name of the group column in metadata; default "group". +#' @param save_dataframe Logical; if TRUE, saves the merged data frame as TSV before plotting. +#' @return Saves one PDF and one JPG heatmap per gene set under output_dir; optionally saves intermediate TSV. +#' @import dplyr readr tidyr readxl pheatmap +#' @export +plotheatmap_leadingedge_GSEA <- function( + main_dir_2 = NULL, + expression_file, + metadata_file, + gmt_file, + ranked_genes_file, + gsea_file, + output_dir = "leading_edge_heatmaps", + sample_col = "Sample", + group_col = "group", + save_dataframe = FALSE +) { + # Ensure required packages are installed + if (!requireNamespace("dplyr", quietly = TRUE)) stop("Package 'dplyr' is required.") + if (!requireNamespace("readr", quietly = TRUE)) stop("Package 'readr' is required.") + if (!requireNamespace("tidyr", quietly = TRUE)) stop("Package 'tidyr' is required.") + if (!requireNamespace("readxl", quietly = TRUE)) stop("Package 'readxl' is required.") + if (!requireNamespace("pheatmap", quietly = TRUE)) stop("Package 'pheatmap' is required.") + + # Prepend base directory if provided + if (!is.null(main_dir_2)) { + expression_file <- file.path(main_dir_2, expression_file) + metadata_file <- file.path(main_dir_2, metadata_file) + gmt_file <- file.path(main_dir_2, gmt_file) + ranked_genes_file <- file.path(main_dir_2, ranked_genes_file) + gsea_file <- file.path(main_dir_2, gsea_file) + output_dir <- file.path(main_dir_2, output_dir) + } + + # 1) Read and process GMT + gmt_data <- readLines(gmt_file) %>% + strsplit("\t") %>% + lapply(function(x) data.frame( + NAME = x[1], + DESCRIPTION = x[2], + GENES = paste(x[-c(1,2)], collapse = ","), + stringsAsFactors = FALSE + )) %>% + dplyr::bind_rows() + + # 2) Read GSEA results and join genes + gsea_df <- readr::read_tsv(gsea_file, show_col_types = FALSE) %>% + dplyr::left_join(gmt_data %>% dplyr::select(NAME, GENES), by = "NAME") + + # 3) Read ranked genes list + ranked_df <- readr::read_tsv(ranked_genes_file, show_col_types = FALSE) + ranked_vector <- ranked_df[[1]] + + # 4) Internal helper: extract top-n genes from leading edge + extract_top_n <- function(genes_str, n) { + if (is.na(genes_str) || n <= 0) return(NA_character_) + glist <- unlist(strsplit(genes_str, ",")) + glist <- glist[order(match(glist, ranked_vector), na.last = TRUE)] + paste(head(glist, n), collapse = ",") + } + + # 5) Compute leading edge size and genes + gsea_df <- gsea_df %>% + dplyr::mutate( + L.EDGE_size = ifelse( + is.na(SIZE * tags), NA, + ifelse((SIZE * tags) %% 1 <= 0.5, + floor(SIZE * tags), + ceiling(SIZE * tags)) + ) + ) %>% + dplyr::rowwise() %>% + dplyr::mutate( + LEADING_EDGE_GENES = extract_top_n(GENES, L.EDGE_size) + ) %>% + dplyr::ungroup() + + # Save intermediate dataframe if requested + if (save_dataframe) { + if (!dir.exists(output_dir)) dir.create(output_dir, recursive = TRUE) + intermediate_file <- file.path(output_dir, "leading_edge_genes_df.tsv") + readr::write_tsv(gsea_df, intermediate_file) + message("Saved data frame to: ", intermediate_file) + } + + # 6) Read metadata and prepare annotation + meta <- readxl::read_xlsx(metadata_file) %>% + dplyr::select(dplyr::all_of(c(sample_col, group_col))) %>% + dplyr::rename(Sample = dplyr::all_of(sample_col), + Group = dplyr::all_of(group_col)) %>% + as.data.frame() + rownames(meta) <- meta$Sample + + # 7) Read expression data + expr_raw <- read.table(expression_file, header = TRUE, sep = "\t", + stringsAsFactors = FALSE, check.names = FALSE) + # Determine gene-name column + if ("NAME" %in% colnames(expr_raw)) { + rownames(expr_raw) <- expr_raw$NAME + expr_mat <- expr_raw[, setdiff(colnames(expr_raw), "NAME"), drop = FALSE] + } else { + gene_col <- colnames(expr_raw)[1] + rownames(expr_raw) <- expr_raw[[gene_col]] + expr_mat <- expr_raw[, -1, drop = FALSE] + } + # Clean sample names + colnames(expr_mat) <- sub("^X", "", colnames(expr_mat)) + + # Ensure output directory exists + if (!dir.exists(output_dir)) dir.create(output_dir, recursive = TRUE) + + # 8) Loop through each gene set and plot heatmap + for (i in seq_len(nrow(gsea_df))) { + geneset_name <- gsea_df$NAME[i] + leading_genes <- unlist(strsplit(gsea_df$LEADING_EDGE_GENES[i], ",")) + genes_present <- leading_genes[leading_genes %in% rownames(expr_mat)] + if (length(genes_present) == 0) next + + heatmap_mat <- expr_mat[genes_present, , drop = FALSE] + common_samps <- intersect(colnames(heatmap_mat), rownames(meta)) + if (length(common_samps) == 0) next + + heatmap_mat <- heatmap_mat[, common_samps, drop = FALSE] + annot_col <- data.frame(Group = meta[common_samps, "Group"]) + rownames(annot_col) <- common_samps + + # Dynamic sizing + w <- 10 + h <- max(5, nrow(heatmap_mat) * 0.1 + 2) + + # PDF output + pdf(file.path(output_dir, paste0(geneset_name, "_heatmap.pdf")), + width = w, height = h) + pheatmap::pheatmap( + heatmap_mat, + main = geneset_name, + color = colorRampPalette(c("blue","white","red"))(30), + scale = "row", + clustering_distance_rows = "euclidean", + cluster_cols = FALSE, + clustering_method = "complete", + fontsize_row = 6, + fontsize_col = 7, + annotation_col = annot_col, + border_color = NA, + cellheight = 5, + cellwidth = 8 + ) + dev.off() + + # JPG output + jpeg(file.path(output_dir, paste0(geneset_name, "_heatmap.jpg")), + width = w * 100, height = h * 100, res = 150) + pheatmap::pheatmap( + heatmap_mat, + main = geneset_name, + color = colorRampPalette(c("blue","white","red"))(30), + scale = "row", + clustering_distance_rows = "euclidean", + cluster_cols = FALSE, + clustering_method = "complete", + fontsize_row = 6, + fontsize_col = 7, + annotation_col = annot_col, + border_color = NA, + cellheight = 5, + cellwidth = 8 + ) + dev.off() + } + + message("Heatmaps saved in: ", normalizePath(output_dir)) +} diff --git a/man/plotheatmap_leadingedge_GSEA.Rd b/man/plotheatmap_leadingedge_GSEA.Rd new file mode 100644 index 0000000..cf741da --- /dev/null +++ b/man/plotheatmap_leadingedge_GSEA.Rd @@ -0,0 +1,65 @@ +% Generated by roxygen2; do not edit by hand +\name{plotheatmap_leadingedge_GSEA} +\alias{plotheatmap_leadingedge_GSEA} +\title{Plot leading edge heatmaps from GSEA results} +\usage{ +plotheatmap_leadingedge_GSEA( + main_dir_2 = NULL, + expression_file, + metadata_file, + gmt_file, + ranked_genes_file, + gsea_file, + output_dir = "leading_edge_heatmaps", + sample_col = "Sample", + group_col = "group", + save_dataframe = FALSE +) +} +\arguments{ + \item{main_dir_2}{Optional base directory to prepend to all file paths.} + \item{expression_file}{Path to the expression data file (tab-delimited) or relative to \code{main_dir_2}.} + \item{metadata_file}{Path to the metadata file (Excel) or relative to \code{main_dir_2}.} + \item{gmt_file}{Path to the GMT file defining gene sets or relative to \code{main_dir_2}.} + \item{ranked_genes_file}{Path to the ranked genes list file or relative to \code{main_dir_2}.} + \item{gsea_file}{Path to the GSEA results file with leading edge genes or relative to \code{main_dir_2}.} + \item{output_dir}{Directory to save heatmap outputs and optional TSV; default \code{"leading_edge_heatmaps"}.} + \item{sample_col}{Name of the sample ID column in metadata; default \code{"Sample"}.} + \item{group_col}{Name of the group column in metadata; default \code{"group"}.} + \item{save_dataframe}{Logical; if \code{TRUE}, saves the merged data frame as TSV before plotting.} +} +\description{ +Generates and saves per-geneset heatmaps of leading edge genes from GSEA results. +Reads input files, extracts top-ranked leading edge genes, and plots expression heatmaps. +} +\details{ +This function performs the following steps: +\enumerate{ + \item Reads a GMT file to map gene set names to member genes. + \item Reads GSEA output and merges in the full gene lists for each gene set. + \item Reads a ranked gene list and determines the order for leading edge extraction. + \item Calculates the leading-edge size (\code{SIZE * tags}) and extracts the top genes accordingly. + \item Optionally saves the intermediate merged data frame to a TSV file. + \item Reads expression and metadata tables, aligns sample names, and prepares annotation. + \item Iterates over each gene set, selecting expression rows for the leading edge genes, and writes a heatmap (PDF and JPG) to \code{output_dir}. +} +} +\value{ +No return value; files are written to disk. A message indicates the final output directory. +} +\examples{ +\dontrun{ +plotheatmap_leadingedge_GSEA( + main_dir_2 = "C:/Projects/project1", + expression_file = "Results/expression_data.txt", + metadata_file = "sample_data.xlsx", + gmt_file = "GSEA/genesets.gmt", + ranked_genes_file = "Ranking/ranked_gene_list.tsv", + gsea_file = "GSEA/gsea_results.tsv", + output_dir = "LeadingEdgePlots", + sample_col = "id", + group_col = "group", + save_dataframe = TRUE +) +} +}