Skip to content
Merged
Show file tree
Hide file tree
Changes from all commits
Commits
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
1 change: 1 addition & 0 deletions NAMESPACE
Original file line number Diff line number Diff line change
Expand Up @@ -13,6 +13,7 @@ export(nice_tSNE)
export(power_analysis)
export(save_results)
export(split_cases)
export(plotheatmap_leadingedge_GSEA)
export(tpm)
import(ggplot2)
importFrom(magrittr,"%>%")
Expand Down
185 changes: 185 additions & 0 deletions R/plotheatmap_leadingedge_GSEA.R
Original file line number Diff line number Diff line change
@@ -0,0 +1,185 @@
#' Plot leading edge heatmaps from GSEA results
#'
#' Generates heatmaps for leading edge genes of each gene set from GSEA output.
#'
#' @param main_dir_2 Optional base directory to prepend to file paths.
#' @param expression_file Path to the expression data file (tab-delimited) or relative to main_dir_2.
#' @param metadata_file Path to the metadata file (Excel) or relative to main_dir_2.
#' @param gmt_file Path to the GMT file defining gene sets or relative to main_dir_2.
#' @param ranked_genes_file Path to the ranked genes list file or relative to main_dir_2.
#' @param gsea_file Path to the GSEA results file with leading edge genes or relative to main_dir_2.
#' @param output_dir Directory to save heatmaps and optional TSV; default "leading_edge_heatmaps".
#' @param sample_col Name of the sample ID column in metadata; default "Sample".
#' @param group_col Name of the group column in metadata; default "group".
#' @param save_dataframe Logical; if TRUE, saves the merged data frame as TSV before plotting.
#' @return Saves one PDF and one JPG heatmap per gene set under output_dir; optionally saves intermediate TSV.
#' @import dplyr readr tidyr readxl pheatmap
#' @export
plotheatmap_leadingedge_GSEA <- function(
main_dir_2 = NULL,
expression_file,
metadata_file,
gmt_file,
ranked_genes_file,
gsea_file,
output_dir = "leading_edge_heatmaps",
sample_col = "Sample",
group_col = "group",
save_dataframe = FALSE
) {
# Ensure required packages are installed
if (!requireNamespace("dplyr", quietly = TRUE)) stop("Package 'dplyr' is required.")
if (!requireNamespace("readr", quietly = TRUE)) stop("Package 'readr' is required.")
if (!requireNamespace("tidyr", quietly = TRUE)) stop("Package 'tidyr' is required.")
if (!requireNamespace("readxl", quietly = TRUE)) stop("Package 'readxl' is required.")
if (!requireNamespace("pheatmap", quietly = TRUE)) stop("Package 'pheatmap' is required.")

# Prepend base directory if provided
if (!is.null(main_dir_2)) {
expression_file <- file.path(main_dir_2, expression_file)
metadata_file <- file.path(main_dir_2, metadata_file)
gmt_file <- file.path(main_dir_2, gmt_file)
ranked_genes_file <- file.path(main_dir_2, ranked_genes_file)
gsea_file <- file.path(main_dir_2, gsea_file)
output_dir <- file.path(main_dir_2, output_dir)
}

# 1) Read and process GMT
gmt_data <- readLines(gmt_file) %>%
strsplit("\t") %>%
lapply(function(x) data.frame(
NAME = x[1],
DESCRIPTION = x[2],
GENES = paste(x[-c(1,2)], collapse = ","),
stringsAsFactors = FALSE
)) %>%
dplyr::bind_rows()

# 2) Read GSEA results and join genes
gsea_df <- readr::read_tsv(gsea_file, show_col_types = FALSE) %>%
dplyr::left_join(gmt_data %>% dplyr::select(NAME, GENES), by = "NAME")

# 3) Read ranked genes list
ranked_df <- readr::read_tsv(ranked_genes_file, show_col_types = FALSE)
ranked_vector <- ranked_df[[1]]

# 4) Internal helper: extract top-n genes from leading edge
extract_top_n <- function(genes_str, n) {
if (is.na(genes_str) || n <= 0) return(NA_character_)
glist <- unlist(strsplit(genes_str, ","))
glist <- glist[order(match(glist, ranked_vector), na.last = TRUE)]
paste(head(glist, n), collapse = ",")
}

# 5) Compute leading edge size and genes
gsea_df <- gsea_df %>%
dplyr::mutate(
L.EDGE_size = ifelse(
is.na(SIZE * tags), NA,
ifelse((SIZE * tags) %% 1 <= 0.5,
floor(SIZE * tags),
ceiling(SIZE * tags))
)
) %>%
dplyr::rowwise() %>%
dplyr::mutate(
LEADING_EDGE_GENES = extract_top_n(GENES, L.EDGE_size)
) %>%
dplyr::ungroup()

# Save intermediate dataframe if requested
if (save_dataframe) {
if (!dir.exists(output_dir)) dir.create(output_dir, recursive = TRUE)
intermediate_file <- file.path(output_dir, "leading_edge_genes_df.tsv")
readr::write_tsv(gsea_df, intermediate_file)
message("Saved data frame to: ", intermediate_file)
}

# 6) Read metadata and prepare annotation
meta <- readxl::read_xlsx(metadata_file) %>%
dplyr::select(dplyr::all_of(c(sample_col, group_col))) %>%
dplyr::rename(Sample = dplyr::all_of(sample_col),
Group = dplyr::all_of(group_col)) %>%
as.data.frame()
rownames(meta) <- meta$Sample

# 7) Read expression data
expr_raw <- read.table(expression_file, header = TRUE, sep = "\t",
stringsAsFactors = FALSE, check.names = FALSE)
# Determine gene-name column
if ("NAME" %in% colnames(expr_raw)) {
rownames(expr_raw) <- expr_raw$NAME
expr_mat <- expr_raw[, setdiff(colnames(expr_raw), "NAME"), drop = FALSE]
} else {
gene_col <- colnames(expr_raw)[1]
rownames(expr_raw) <- expr_raw[[gene_col]]
expr_mat <- expr_raw[, -1, drop = FALSE]
}
# Clean sample names
colnames(expr_mat) <- sub("^X", "", colnames(expr_mat))

# Ensure output directory exists
if (!dir.exists(output_dir)) dir.create(output_dir, recursive = TRUE)

# 8) Loop through each gene set and plot heatmap
for (i in seq_len(nrow(gsea_df))) {
geneset_name <- gsea_df$NAME[i]
leading_genes <- unlist(strsplit(gsea_df$LEADING_EDGE_GENES[i], ","))
genes_present <- leading_genes[leading_genes %in% rownames(expr_mat)]
if (length(genes_present) == 0) next

heatmap_mat <- expr_mat[genes_present, , drop = FALSE]
common_samps <- intersect(colnames(heatmap_mat), rownames(meta))
if (length(common_samps) == 0) next

heatmap_mat <- heatmap_mat[, common_samps, drop = FALSE]
annot_col <- data.frame(Group = meta[common_samps, "Group"])
rownames(annot_col) <- common_samps

# Dynamic sizing
w <- 10
h <- max(5, nrow(heatmap_mat) * 0.1 + 2)

# PDF output
pdf(file.path(output_dir, paste0(geneset_name, "_heatmap.pdf")),
width = w, height = h)
pheatmap::pheatmap(
heatmap_mat,
main = geneset_name,
color = colorRampPalette(c("blue","white","red"))(30),
scale = "row",
clustering_distance_rows = "euclidean",
cluster_cols = FALSE,
clustering_method = "complete",
fontsize_row = 6,
fontsize_col = 7,
annotation_col = annot_col,
border_color = NA,
cellheight = 5,
cellwidth = 8
)
dev.off()

# JPG output
jpeg(file.path(output_dir, paste0(geneset_name, "_heatmap.jpg")),
width = w * 100, height = h * 100, res = 150)
pheatmap::pheatmap(
heatmap_mat,
main = geneset_name,
color = colorRampPalette(c("blue","white","red"))(30),
scale = "row",
clustering_distance_rows = "euclidean",
cluster_cols = FALSE,
clustering_method = "complete",
fontsize_row = 6,
fontsize_col = 7,
annotation_col = annot_col,
border_color = NA,
cellheight = 5,
cellwidth = 8
)
dev.off()
}

message("Heatmaps saved in: ", normalizePath(output_dir))
}
65 changes: 65 additions & 0 deletions man/plotheatmap_leadingedge_GSEA.Rd
Original file line number Diff line number Diff line change
@@ -0,0 +1,65 @@
% Generated by roxygen2; do not edit by hand
\name{plotheatmap_leadingedge_GSEA}
\alias{plotheatmap_leadingedge_GSEA}
\title{Plot leading edge heatmaps from GSEA results}
\usage{
plotheatmap_leadingedge_GSEA(
main_dir_2 = NULL,
expression_file,
metadata_file,
gmt_file,
ranked_genes_file,
gsea_file,
output_dir = "leading_edge_heatmaps",
sample_col = "Sample",
group_col = "group",
save_dataframe = FALSE
)
}
\arguments{
\item{main_dir_2}{Optional base directory to prepend to all file paths.}
\item{expression_file}{Path to the expression data file (tab-delimited) or relative to \code{main_dir_2}.}
\item{metadata_file}{Path to the metadata file (Excel) or relative to \code{main_dir_2}.}
\item{gmt_file}{Path to the GMT file defining gene sets or relative to \code{main_dir_2}.}
\item{ranked_genes_file}{Path to the ranked genes list file or relative to \code{main_dir_2}.}
\item{gsea_file}{Path to the GSEA results file with leading edge genes or relative to \code{main_dir_2}.}
\item{output_dir}{Directory to save heatmap outputs and optional TSV; default \code{"leading_edge_heatmaps"}.}
\item{sample_col}{Name of the sample ID column in metadata; default \code{"Sample"}.}
\item{group_col}{Name of the group column in metadata; default \code{"group"}.}
\item{save_dataframe}{Logical; if \code{TRUE}, saves the merged data frame as TSV before plotting.}
}
\description{
Generates and saves per-geneset heatmaps of leading edge genes from GSEA results.
Reads input files, extracts top-ranked leading edge genes, and plots expression heatmaps.
}
\details{
This function performs the following steps:
\enumerate{
\item Reads a GMT file to map gene set names to member genes.
\item Reads GSEA output and merges in the full gene lists for each gene set.
\item Reads a ranked gene list and determines the order for leading edge extraction.
\item Calculates the leading-edge size (\code{SIZE * tags}) and extracts the top genes accordingly.
\item Optionally saves the intermediate merged data frame to a TSV file.
\item Reads expression and metadata tables, aligns sample names, and prepares annotation.
\item Iterates over each gene set, selecting expression rows for the leading edge genes, and writes a heatmap (PDF and JPG) to \code{output_dir}.
}
}
\value{
No return value; files are written to disk. A message indicates the final output directory.
}
\examples{
\dontrun{
plotheatmap_leadingedge_GSEA(
main_dir_2 = "C:/Projects/project1",
expression_file = "Results/expression_data.txt",
metadata_file = "sample_data.xlsx",
gmt_file = "GSEA/genesets.gmt",
ranked_genes_file = "Ranking/ranked_gene_list.tsv",
gsea_file = "GSEA/gsea_results.tsv",
output_dir = "LeadingEdgePlots",
sample_col = "id",
group_col = "group",
save_dataframe = TRUE
)
}
}
Loading