BigMindLab · danoguevara · Jul 26, 2025 · Jul 24, 2025 · Jul 24, 2025 · Jul 24, 2025
diff --git a/NAMESPACE b/NAMESPACE
@@ -13,6 +13,7 @@ export(nice_tSNE)
 export(power_analysis)
 export(save_results)
 export(split_cases)
+export(plotheatmap_leadingedge_GSEA)
 export(tpm)
 import(ggplot2)
 importFrom(magrittr,"%>%")

diff --git a/R/plotheatmap_leadingedge_GSEA.R b/R/plotheatmap_leadingedge_GSEA.R
@@ -0,0 +1,185 @@
+#' Plot leading edge heatmaps from GSEA results
+#'
+#' Generates heatmaps for leading edge genes of each gene set from GSEA output.
+#'
+#' @param main_dir_2 Optional base directory to prepend to file paths.
+#' @param expression_file Path to the expression data file (tab-delimited) or relative to main_dir_2.
+#' @param metadata_file Path to the metadata file (Excel) or relative to main_dir_2.
+#' @param gmt_file Path to the GMT file defining gene sets or relative to main_dir_2.
+#' @param ranked_genes_file Path to the ranked genes list file or relative to main_dir_2.
+#' @param gsea_file Path to the GSEA results file with leading edge genes or relative to main_dir_2.
+#' @param output_dir Directory to save heatmaps and optional TSV; default "leading_edge_heatmaps".
+#' @param sample_col Name of the sample ID column in metadata; default "Sample".
+#' @param group_col Name of the group column in metadata; default "group".
+#' @param save_dataframe Logical; if TRUE, saves the merged data frame as TSV before plotting.
+#' @return Saves one PDF and one JPG heatmap per gene set under output_dir; optionally saves intermediate TSV.
+#' @import dplyr readr tidyr readxl pheatmap
+#' @export
+plotheatmap_leadingedge_GSEA <- function(
+  main_dir_2         = NULL,
+  expression_file,
+  metadata_file,
+  gmt_file,
+  ranked_genes_file,
+  gsea_file,
+  output_dir        = "leading_edge_heatmaps",
+  sample_col        = "Sample",
+  group_col         = "group",
+  save_dataframe    = FALSE
+) {
+  # Ensure required packages are installed
+  if (!requireNamespace("dplyr", quietly = TRUE)) stop("Package 'dplyr' is required.")
+  if (!requireNamespace("readr", quietly = TRUE)) stop("Package 'readr' is required.")
+  if (!requireNamespace("tidyr", quietly = TRUE)) stop("Package 'tidyr' is required.")
+  if (!requireNamespace("readxl", quietly = TRUE)) stop("Package 'readxl' is required.")
+  if (!requireNamespace("pheatmap", quietly = TRUE)) stop("Package 'pheatmap' is required.")
+
+  # Prepend base directory if provided
+  if (!is.null(main_dir_2)) {
+    expression_file   <- file.path(main_dir_2, expression_file)
+    metadata_file     <- file.path(main_dir_2, metadata_file)
+    gmt_file          <- file.path(main_dir_2, gmt_file)
+    ranked_genes_file <- file.path(main_dir_2, ranked_genes_file)
+    gsea_file         <- file.path(main_dir_2, gsea_file)
+    output_dir        <- file.path(main_dir_2, output_dir)
+  }
+
+  # 1) Read and process GMT
+  gmt_data <- readLines(gmt_file) %>%
+    strsplit("\t") %>%
+    lapply(function(x) data.frame(
+      NAME        = x[1],
+      DESCRIPTION = x[2],
+      GENES       = paste(x[-c(1,2)], collapse = ","),
+      stringsAsFactors = FALSE
+    )) %>%
+    dplyr::bind_rows()
+
+  # 2) Read GSEA results and join genes
+  gsea_df <- readr::read_tsv(gsea_file, show_col_types = FALSE) %>%
+    dplyr::left_join(gmt_data %>% dplyr::select(NAME, GENES), by = "NAME")
+
+  # 3) Read ranked genes list
+  ranked_df     <- readr::read_tsv(ranked_genes_file, show_col_types = FALSE)
+  ranked_vector <- ranked_df[[1]]
+
+  # 4) Internal helper: extract top-n genes from leading edge
+  extract_top_n <- function(genes_str, n) {
+    if (is.na(genes_str) || n <= 0) return(NA_character_)
+    glist <- unlist(strsplit(genes_str, ","))
+    glist <- glist[order(match(glist, ranked_vector), na.last = TRUE)]
+    paste(head(glist, n), collapse = ",")
+  }
+
+  # 5) Compute leading edge size and genes
+  gsea_df <- gsea_df %>%
+    dplyr::mutate(
+      L.EDGE_size = ifelse(
+        is.na(SIZE * tags), NA,
+        ifelse((SIZE * tags) %% 1 <= 0.5,
+               floor(SIZE * tags),
+               ceiling(SIZE * tags))
+      )
+    ) %>%
+    dplyr::rowwise() %>%
+    dplyr::mutate(
+      LEADING_EDGE_GENES = extract_top_n(GENES, L.EDGE_size)
+    ) %>%
+    dplyr::ungroup()
+
+  # Save intermediate dataframe if requested
+  if (save_dataframe) {
+    if (!dir.exists(output_dir)) dir.create(output_dir, recursive = TRUE)
+    intermediate_file <- file.path(output_dir, "leading_edge_genes_df.tsv")
+    readr::write_tsv(gsea_df, intermediate_file)
+    message("Saved data frame to: ", intermediate_file)
+  }
+
+  # 6) Read metadata and prepare annotation
+  meta <- readxl::read_xlsx(metadata_file) %>%
+    dplyr::select(dplyr::all_of(c(sample_col, group_col))) %>%
+    dplyr::rename(Sample = dplyr::all_of(sample_col),
+                  Group  = dplyr::all_of(group_col)) %>%
+    as.data.frame()
+  rownames(meta) <- meta$Sample
+
+  # 7) Read expression data
+  expr_raw <- read.table(expression_file, header = TRUE, sep = "\t",
+                         stringsAsFactors = FALSE, check.names = FALSE)
+  # Determine gene-name column
+  if ("NAME" %in% colnames(expr_raw)) {
+    rownames(expr_raw) <- expr_raw$NAME
+    expr_mat <- expr_raw[, setdiff(colnames(expr_raw), "NAME"), drop = FALSE]
+  } else {
+    gene_col  <- colnames(expr_raw)[1]
+    rownames(expr_raw) <- expr_raw[[gene_col]]
+    expr_mat  <- expr_raw[, -1, drop = FALSE]
+  }
+  # Clean sample names
+  colnames(expr_mat) <- sub("^X", "", colnames(expr_mat))
+
+  # Ensure output directory exists
+  if (!dir.exists(output_dir)) dir.create(output_dir, recursive = TRUE)
+
+  # 8) Loop through each gene set and plot heatmap
+  for (i in seq_len(nrow(gsea_df))) {
+    geneset_name   <- gsea_df$NAME[i]
+    leading_genes  <- unlist(strsplit(gsea_df$LEADING_EDGE_GENES[i], ","))
+    genes_present  <- leading_genes[leading_genes %in% rownames(expr_mat)]
+    if (length(genes_present) == 0) next
+
+    heatmap_mat <- expr_mat[genes_present, , drop = FALSE]
+    common_samps <- intersect(colnames(heatmap_mat), rownames(meta))
+    if (length(common_samps) == 0) next
+
+    heatmap_mat   <- heatmap_mat[, common_samps, drop = FALSE]
+    annot_col     <- data.frame(Group = meta[common_samps, "Group"])
+    rownames(annot_col) <- common_samps
+
+    # Dynamic sizing
+    w <- 10
+    h <- max(5, nrow(heatmap_mat) * 0.1 + 2)
+
+    # PDF output
+    pdf(file.path(output_dir, paste0(geneset_name, "_heatmap.pdf")),
+        width = w, height = h)
+    pheatmap::pheatmap(
+      heatmap_mat,
+      main               = geneset_name,
+      color              = colorRampPalette(c("blue","white","red"))(30),
+      scale              = "row",
+      clustering_distance_rows = "euclidean",
+      cluster_cols       = FALSE,
+      clustering_method  = "complete",
+      fontsize_row       = 6,
+      fontsize_col       = 7,
+      annotation_col     = annot_col,
+      border_color       = NA,
+      cellheight         = 5,
+      cellwidth          = 8
+    )
+    dev.off()
+
+    # JPG output
+    jpeg(file.path(output_dir, paste0(geneset_name, "_heatmap.jpg")),
+         width = w * 100, height = h * 100, res = 150)
+    pheatmap::pheatmap(
+      heatmap_mat,
+      main               = geneset_name,
+      color              = colorRampPalette(c("blue","white","red"))(30),
+      scale              = "row",
+      clustering_distance_rows = "euclidean",
+      cluster_cols       = FALSE,
+      clustering_method  = "complete",
+      fontsize_row       = 6,
+      fontsize_col       = 7,
+      annotation_col     = annot_col,
+      border_color       = NA,
+      cellheight         = 5,
+      cellwidth          = 8
+    )
+    dev.off()
+  }
+
+  message("Heatmaps saved in: ", normalizePath(output_dir))
+}
diff --git a/man/plotheatmap_leadingedge_GSEA.Rd b/man/plotheatmap_leadingedge_GSEA.Rd
@@ -0,0 +1,65 @@
+% Generated by roxygen2; do not edit by hand
+\name{plotheatmap_leadingedge_GSEA}
+\alias{plotheatmap_leadingedge_GSEA}
+\title{Plot leading edge heatmaps from GSEA results}
+\usage{
+plotheatmap_leadingedge_GSEA(
+  main_dir_2 = NULL,
+  expression_file,
+  metadata_file,
+  gmt_file,
+  ranked_genes_file,
+  gsea_file,
+  output_dir = "leading_edge_heatmaps",
+  sample_col = "Sample",
+  group_col = "group",
+  save_dataframe = FALSE
+)
+}
+\arguments{
+  \item{main_dir_2}{Optional base directory to prepend to all file paths.}
+  \item{expression_file}{Path to the expression data file (tab-delimited) or relative to \code{main_dir_2}.}
+  \item{metadata_file}{Path to the metadata file (Excel) or relative to \code{main_dir_2}.}
+  \item{gmt_file}{Path to the GMT file defining gene sets or relative to \code{main_dir_2}.}
+  \item{ranked_genes_file}{Path to the ranked genes list file or relative to \code{main_dir_2}.}
+  \item{gsea_file}{Path to the GSEA results file with leading edge genes or relative to \code{main_dir_2}.}
+  \item{output_dir}{Directory to save heatmap outputs and optional TSV; default \code{"leading_edge_heatmaps"}.}
+  \item{sample_col}{Name of the sample ID column in metadata; default \code{"Sample"}.}
+  \item{group_col}{Name of the group column in metadata; default \code{"group"}.}
+  \item{save_dataframe}{Logical; if \code{TRUE}, saves the merged data frame as TSV before plotting.}
+}
+\description{
+Generates and saves per-geneset heatmaps of leading edge genes from GSEA results.  
+Reads input files, extracts top-ranked leading edge genes, and plots expression heatmaps.
+}
+\details{
+This function performs the following steps:
+\enumerate{
+  \item Reads a GMT file to map gene set names to member genes.
+  \item Reads GSEA output and merges in the full gene lists for each gene set.
+  \item Reads a ranked gene list and determines the order for leading edge extraction.
+  \item Calculates the leading-edge size (\code{SIZE * tags}) and extracts the top genes accordingly.
+  \item Optionally saves the intermediate merged data frame to a TSV file.
+  \item Reads expression and metadata tables, aligns sample names, and prepares annotation.
+  \item Iterates over each gene set, selecting expression rows for the leading edge genes, and writes a heatmap (PDF and JPG) to \code{output_dir}.
+}
+}
+\value{
+No return value; files are written to disk. A message indicates the final output directory.
+}
+\examples{
+\dontrun{
+plotheatmap_leadingedge_GSEA(
+  main_dir_2         = "C:/Projects/project1",
+  expression_file    = "Results/expression_data.txt",
+  metadata_file      = "sample_data.xlsx",
+  gmt_file           = "GSEA/genesets.gmt",
+  ranked_genes_file  = "Ranking/ranked_gene_list.tsv",
+  gsea_file          = "GSEA/gsea_results.tsv",
+  output_dir         = "LeadingEdgePlots",
+  sample_col         = "id",
+  group_col          = "group",
+  save_dataframe     = TRUE
+)
+}
+}