From f517979925e5154564a87bc0cb72a8e9ce8dc991 Mon Sep 17 00:00:00 2001 From: DanielGarbozo Date: Wed, 12 Feb 2025 12:58:27 -0500 Subject: [PATCH 1/6] Add merge_GSEA.R function --- R/merge_GSEA.R | 85 ++++++++++++++++++++++++++++++++++++++++++++++++++ 1 file changed, 85 insertions(+) create mode 100644 R/merge_GSEA.R diff --git a/R/merge_GSEA.R b/R/merge_GSEA.R new file mode 100644 index 0000000..6ff753a --- /dev/null +++ b/R/merge_GSEA.R @@ -0,0 +1,85 @@ + +# Merge GSEA DATA into a unique table # + +#' Merge collection results from multiple output GSEA files from broad institute into a single table. +#' After run GSEA_all.sh from GSEA.sh, move the .tsv files to a single directory +#' +#' @param input_directory The directory containing the GSEA collection results in TSV format. +#' @param output_file The output file to save the merged data. If not provided, the file will be saved in the input directory. +#' @importFrom dplyr bind_rows mutate select relocate rename filter if_any +#' @importFrom readr read_tsv write_tsv +#' @importFrom tidyr separate +#' @importFrom magrittr %>% +#' @importFrom base sub basename ifelse file.path dir.exists list.files stop commandArgs length print cat +#' @importFrom stringr sub grepl str_replace +#' @export + + +merge_GSEA <- function(input_directory, output_file) { + # Read command-line arguments + args <- commandArgs(trailingOnly = TRUE) + if (length(args) < 1) { + stop("Usage: merge_GSEA.R [output_file]\n") + } + input_directory <- args[1] + # If a second argument is provided, use it as the output file; otherwise, save the file in the input directory. + output_file <- ifelse(length(args) >= 2, args[2], file.path(input_directory, "collections_merged_gsea_data.tsv")) + + # Validate input directory and check for TSV files + if (!dir.exists(input_directory)) { + stop("The specified directory does not exist: ", input_directory) + } + files <- list.files(path = input_directory, pattern = "\\.tsv$", full.names = TRUE) + if (length(files) == 0) { + stop("No TSV files found in ", input_directory) + } + + # Function to read each file and add a column with the modified file name + read_file <- function(file) { + data <- read_tsv(file) + file_name <- basename(file) + file_name <- sub("_all.tsv$", "", file_name) # Change the pattern if necessary + # Convert all numeric columns explicitly + numeric_cols <- c("SIZE", "ES", "NES", "NOM p-val", "FDR q-val", "FWER p-val", "RANK AT MAX") + data <- data %>% + mutate(across(all_of(numeric_cols), as.numeric)) + data$COLLECTION <- file_name + return(data) + } + + # Read and combine all TSV files into a single data frame + gsea_data <- lapply(files, read_file) %>% bind_rows() + + gsea_data <- gsea_data %>% + select(-`...12`) + + # Define numeric columns + numeric_cols <- c("SIZE", "ES", "NES", "NOM p-val", "FDR q-val", "FWER p-val", "RANK AT MAX") + + # Find problematic values in numeric columns + gsea_data %>% + filter(if_any(all_of(numeric_cols), ~ !grepl("^-?[0-9.]+$", .))) %>% + print() + + # Data processing: selection, separation, mutation, and renaming of columns + gsea_data <- gsea_data %>% + select(-"GS
follow link to MSigDB", -"GS DETAILS") %>% #, -"...12" + separate(col = `LEADING EDGE`, into = c("tags", "list", "signal"), sep = ",", remove = FALSE) %>% + mutate( + tags = 0.01 * as.numeric(sub("%", "", sub("tags=", "", tags))), + list = 0.01 * as.numeric(sub("%", "", sub("list=", "", list))), + signal = 0.01 * as.numeric(sub("%", "", sub("signal=", "", signal))), + `FDR q-val` = ifelse(`FDR q-val` == 0, 0.001, `FDR q-val`), + `Log10FDR` = -log10(`FDR q-val`) + ) %>% + relocate(`Log10FDR`, .after = `FWER p-val`) %>% + rename(COMPARISON = Comparison, FDR = `FDR q-val`) + + # Save the processed data to a TSV file + write_tsv(gsea_data, output_file) + + cat("GSEA data saved to:", output_file, "\n") +} + + + From a8630593bba01de5fa447b9c8c23ede17dd587b5 Mon Sep 17 00:00:00 2001 From: DanielGarbozo Date: Thu, 13 Feb 2025 01:32:28 -0500 Subject: [PATCH 2/6] Add documentation for merge_GSEA --- NAMESPACE | 1 + R/merge_GSEA.R | 40 ++++++++++++++++++++++------------------ man/merge_GSEA.Rd | 16 ++++++++++++++++ 3 files changed, 39 insertions(+), 18 deletions(-) create mode 100644 man/merge_GSEA.Rd diff --git a/NAMESPACE b/NAMESPACE index b9af139..7e79e20 100644 --- a/NAMESPACE +++ b/NAMESPACE @@ -3,6 +3,7 @@ export(add_annotations) export(detect_filter) export(get_annotations) +export(merge_GSEA) export(nice_BSV) export(nice_PCA) export(nice_UMAP) diff --git a/R/merge_GSEA.R b/R/merge_GSEA.R index 6ff753a..1a0ae1f 100644 --- a/R/merge_GSEA.R +++ b/R/merge_GSEA.R @@ -1,21 +1,25 @@ -# Merge GSEA DATA into a unique table # +# Function merge_GSEA # -#' Merge collection results from multiple output GSEA files from broad institute into a single table. +#' Merge GSEA results data frames. +#' #' After run GSEA_all.sh from GSEA.sh, move the .tsv files to a single directory #' #' @param input_directory The directory containing the GSEA collection results in TSV format. #' @param output_file The output file to save the merged data. If not provided, the file will be saved in the input directory. -#' @importFrom dplyr bind_rows mutate select relocate rename filter if_any -#' @importFrom readr read_tsv write_tsv -#' @importFrom tidyr separate #' @importFrom magrittr %>% -#' @importFrom base sub basename ifelse file.path dir.exists list.files stop commandArgs length print cat -#' @importFrom stringr sub grepl str_replace #' @export merge_GSEA <- function(input_directory, output_file) { + + if (!requireNamespace("dplyr", quietly = TRUE)) { + stop( + "Package \"dplyr\" must be installed to use this function.", + call. = FALSE + ) + } + # Read command-line arguments args <- commandArgs(trailingOnly = TRUE) if (length(args) < 1) { @@ -36,47 +40,47 @@ merge_GSEA <- function(input_directory, output_file) { # Function to read each file and add a column with the modified file name read_file <- function(file) { - data <- read_tsv(file) + data <- readr::read_tsv(file) file_name <- basename(file) file_name <- sub("_all.tsv$", "", file_name) # Change the pattern if necessary # Convert all numeric columns explicitly numeric_cols <- c("SIZE", "ES", "NES", "NOM p-val", "FDR q-val", "FWER p-val", "RANK AT MAX") data <- data %>% - mutate(across(all_of(numeric_cols), as.numeric)) + dplyr::mutate(across(tidyselect::all_of(numeric_cols), as.numeric)) data$COLLECTION <- file_name return(data) } # Read and combine all TSV files into a single data frame - gsea_data <- lapply(files, read_file) %>% bind_rows() + gsea_data <- lapply(files, read_file) %>% dplyr::bind_rows() gsea_data <- gsea_data %>% - select(-`...12`) + dplyr::select(-`...12`) # Define numeric columns numeric_cols <- c("SIZE", "ES", "NES", "NOM p-val", "FDR q-val", "FWER p-val", "RANK AT MAX") # Find problematic values in numeric columns gsea_data %>% - filter(if_any(all_of(numeric_cols), ~ !grepl("^-?[0-9.]+$", .))) %>% + dplyr::filter(dplyr::if_any(all_of(numeric_cols), ~ !grepl("^-?[0-9.]+$", .))) %>% print() # Data processing: selection, separation, mutation, and renaming of columns gsea_data <- gsea_data %>% - select(-"GS
follow link to MSigDB", -"GS DETAILS") %>% #, -"...12" - separate(col = `LEADING EDGE`, into = c("tags", "list", "signal"), sep = ",", remove = FALSE) %>% - mutate( + dplyr::select(-"GS
follow link to MSigDB", -"GS DETAILS") %>% #, -"...12" + tidyr::separate(col = `LEADING EDGE`, into = c("tags", "list", "signal"), sep = ",", remove = FALSE) %>% + dplyr::mutate( tags = 0.01 * as.numeric(sub("%", "", sub("tags=", "", tags))), list = 0.01 * as.numeric(sub("%", "", sub("list=", "", list))), signal = 0.01 * as.numeric(sub("%", "", sub("signal=", "", signal))), `FDR q-val` = ifelse(`FDR q-val` == 0, 0.001, `FDR q-val`), `Log10FDR` = -log10(`FDR q-val`) ) %>% - relocate(`Log10FDR`, .after = `FWER p-val`) %>% - rename(COMPARISON = Comparison, FDR = `FDR q-val`) + dplyr::relocate(`Log10FDR`, .after = `FWER p-val`) %>% + dplyr::rename(COMPARISON = Comparison, FDR = `FDR q-val`) # Save the processed data to a TSV file - write_tsv(gsea_data, output_file) + readr::write_tsv(gsea_data, output_file) cat("GSEA data saved to:", output_file, "\n") } diff --git a/man/merge_GSEA.Rd b/man/merge_GSEA.Rd new file mode 100644 index 0000000..fab3e99 --- /dev/null +++ b/man/merge_GSEA.Rd @@ -0,0 +1,16 @@ +% Generated by roxygen2: do not edit by hand +% Please edit documentation in R/merge_GSEA.R +\name{merge_GSEA} +\alias{merge_GSEA} +\title{Merge GSEA results data frames.} +\usage{ +merge_GSEA(input_directory, output_file) +} +\arguments{ +\item{input_directory}{The directory containing the GSEA collection results in TSV format.} + +\item{output_file}{The output file to save the merged data. If not provided, the file will be saved in the input directory.} +} +\description{ +After run GSEA_all.sh from GSEA.sh, move the .tsv files to a single directory +} From 68b698f20eaf1a38f23a699a46e1f2fddbec50e6 Mon Sep 17 00:00:00 2001 From: Daniel Garbozo <158525441+DanielGarbozo@users.noreply.github.com> Date: Thu, 24 Jul 2025 13:49:45 -0400 Subject: [PATCH 3/6] Update merge_GSEA.R Refactored merge_GSEA to use function arguments instead of CLI parsing --- R/merge_GSEA.R | 39 ++++++++------------------------------- 1 file changed, 8 insertions(+), 31 deletions(-) diff --git a/R/merge_GSEA.R b/R/merge_GSEA.R index 1a0ae1f..a52618f 100644 --- a/R/merge_GSEA.R +++ b/R/merge_GSEA.R @@ -3,7 +3,7 @@ #' Merge GSEA results data frames. #' -#' After run GSEA_all.sh from GSEA.sh, move the .tsv files to a single directory +#' After run GSEA_all.sh from GSEA.sh, merge_GSEA function join .tsv files to a single file #' #' @param input_directory The directory containing the GSEA collection results in TSV format. #' @param output_file The output file to save the merged data. If not provided, the file will be saved in the input directory. @@ -11,24 +11,12 @@ #' @export -merge_GSEA <- function(input_directory, output_file) { +merge_GSEA <- function(input_directory, output_file = "collections_merged_gsea_data.tsv") { if (!requireNamespace("dplyr", quietly = TRUE)) { - stop( - "Package \"dplyr\" must be installed to use this function.", - call. = FALSE - ) + stop("Package \"dplyr\" must be installed to use this function.", call. = FALSE) } - # Read command-line arguments - args <- commandArgs(trailingOnly = TRUE) - if (length(args) < 1) { - stop("Usage: merge_GSEA.R [output_file]\n") - } - input_directory <- args[1] - # If a second argument is provided, use it as the output file; otherwise, save the file in the input directory. - output_file <- ifelse(length(args) >= 2, args[2], file.path(input_directory, "collections_merged_gsea_data.tsv")) - # Validate input directory and check for TSV files if (!dir.exists(input_directory)) { stop("The specified directory does not exist: ", input_directory) @@ -43,7 +31,6 @@ merge_GSEA <- function(input_directory, output_file) { data <- readr::read_tsv(file) file_name <- basename(file) file_name <- sub("_all.tsv$", "", file_name) # Change the pattern if necessary - # Convert all numeric columns explicitly numeric_cols <- c("SIZE", "ES", "NES", "NOM p-val", "FDR q-val", "FWER p-val", "RANK AT MAX") data <- data %>% dplyr::mutate(across(tidyselect::all_of(numeric_cols), as.numeric)) @@ -51,23 +38,17 @@ merge_GSEA <- function(input_directory, output_file) { return(data) } - # Read and combine all TSV files into a single data frame - gsea_data <- lapply(files, read_file) %>% dplyr::bind_rows() - - gsea_data <- gsea_data %>% - dplyr::select(-`...12`) - - # Define numeric columns - numeric_cols <- c("SIZE", "ES", "NES", "NOM p-val", "FDR q-val", "FWER p-val", "RANK AT MAX") - - # Find problematic values in numeric columns + # Read and combine all TSV files into a single data frame, remove empty columns + gsea_data <- lapply(files, read_file) %>% dplyr::bind_rows() %>% dplyr::select(-`...12`) + + # Find problematic values in numeric columns gsea_data %>% dplyr::filter(dplyr::if_any(all_of(numeric_cols), ~ !grepl("^-?[0-9.]+$", .))) %>% print() # Data processing: selection, separation, mutation, and renaming of columns gsea_data <- gsea_data %>% - dplyr::select(-"GS
follow link to MSigDB", -"GS DETAILS") %>% #, -"...12" + dplyr::select(-"GS
follow link to MSigDB", -"GS DETAILS") %>% tidyr::separate(col = `LEADING EDGE`, into = c("tags", "list", "signal"), sep = ",", remove = FALSE) %>% dplyr::mutate( tags = 0.01 * as.numeric(sub("%", "", sub("tags=", "", tags))), @@ -81,9 +62,5 @@ merge_GSEA <- function(input_directory, output_file) { # Save the processed data to a TSV file readr::write_tsv(gsea_data, output_file) - cat("GSEA data saved to:", output_file, "\n") } - - - From 9f5b76e056654bb78a5da808761786fcb252f635 Mon Sep 17 00:00:00 2001 From: Daniel Garbozo <158525441+DanielGarbozo@users.noreply.github.com> Date: Thu, 24 Jul 2025 16:18:10 -0400 Subject: [PATCH 4/6] Add new function: plot_GSEA --- R/plot_GSEA.R | 114 ++++++++++++++++++++++++++++++++++++++++++++++++++ 1 file changed, 114 insertions(+) create mode 100644 R/plot_GSEA.R diff --git a/R/plot_GSEA.R b/R/plot_GSEA.R new file mode 100644 index 0000000..1c79858 --- /dev/null +++ b/R/plot_GSEA.R @@ -0,0 +1,114 @@ +#' Plot global GSEA results +#' +#' Generates a composite plot displaying NES values, pathway labels, +#' and a \emph{logFDR} legend, organized by MSigDB collections. +#' +#' @param data Data frame containing the GSEA results. +#' @param geneset_col Name of the column containing the genesets. +#' @param collection_col Name of the column containing the collections. +#' @param nes_col Name of the column containing the NES values. +#' @param logfdr_col Name of the column containing \eqn{-\log_{10}(FDR)} values. +#' @param output_path_base Path and base name for the output files (without extension). +#' @param width_output Plot width in inches. +#' @param height_output Plot height in inches. +#' @param text_size_genesets Text size for the geneset labels. +#' @param text_size_collection Text size for the collection labels. +#' @return Saves two files: \code{output_path_base}.jpg and \code{output_path_base}.pdf. +#' @import ggplot2 patchwork cowplot scales +#' @export +plot_GSEA <- function(data, geneset_col, collection_col, nes_col, logfdr_col, output_path_base, width_output, height_output, text_size_genesets = 5, text_size_collection = 5) { + + # Rename columns dynamically + data <- data[, c(geneset_col, collection_col, nes_col, logfdr_col)] + colnames(data) <- c("Geneset", "Collection", "NES", "logFDR") + + # Order data by NES value (descending) + data <- data[order(data$NES, decreasing = TRUE), ] + + # Ensure Geneset and Collection are factors with ordered levels + data$Geneset <- factor(data$Geneset, levels = rev(unique(data$Geneset))) + data$Collection <- factor(data$Collection, levels = unique(data$Collection)) + + # Right-side label: "MSigDB" vertically centered, in bold and italic + plot_text_msigdb <- ggplot() + + annotate("text", label = "MSigDB", fontface = "bold.italic", angle = 90, size = 35, x = 0, y = 0.5)+ + theme_void() + + # Lef-side label: "Pathways" vertically centered, in bold and italic + plot_text_pathways <- ggplot() + + annotate("text", label = "Pathways", fontface = "bold.italic", angle = 90, size = 35, x = 0, y = 0.5)+ + theme_void() + + # Right panel: Collection labels (without repetition) + plot_right <- ggplot(data, aes(y = Geneset, x = 1.5, label = Collection)) + + geom_text(aes(label = ifelse(duplicated(Collection), "", Collection)), + hjust = 0.5, size = 0, fontface = "bold") + + facet_grid(Collection ~ ., scales = "free_y", space = "free", switch = "y") + + theme_void() + + theme(strip.text.y = element_text(angle = 0, hjust = 1, size = text_size_collection), + panel.spacing = unit(1, "lines")) + + # Center panel: NES bar plot + plot_center <- ggplot(data, aes(x = NES, y = Geneset, fill = logFDR)) + + geom_col(color = "black", size = 1) + + scale_fill_gradient(low = "white", high = "red", + limits = c(0,3), breaks = seq(0,3,1)) + + scale_y_discrete(position = "right") + + facet_grid(Collection ~ ., scales = "free_y", space = "free_y") + + theme_bw() + + labs(x = "NES", y = "") + + theme(axis.text.y = element_blank(), + strip.background = element_rect(fill = "white", color = "black",linewidth = 1 ), + axis.ticks.y = element_line(color = "black", size = 1.5), + axis.ticks.length = unit(0.3, "cm"), + strip.text.y = element_text(size = 1, margin = margin(0, 0, 0, 0)),# element_blank(), + legend.position = "none", + axis.title.x = element_text(size = 49), + axis.text.x = element_text(size = 45), + panel.spacing = unit(4, "lines") + ) + + # Left panel: Pathays labels + plot_left <- ggplot(data, aes(y = Geneset, x = 0, label = Geneset)) + + geom_text(hjust = 1, size = text_size_genesets) + + theme_void() + + theme(axis.text.y = element_blank(), + plot.margin = margin(0, 0, 0, -50)) + + # Legend panel + plot_legend <- ggplot(data, aes(x = NES, y = Geneset, fill = logFDR)) + + geom_tile() + + scale_fill_gradient(low = "white", high = "red", + name = expression(-log[10] ~ FDR), # log10FDR with subscrip, + limits = c(0,3), breaks = seq(0,3,1), + guide = guide_colorbar(ticks.colour = "black", # Make ticks black + ticks.linewidth = 1.5, # Make ticks thicker + draw.ulim = TRUE, # Draw upper limit tick + draw.llim = TRUE)) + # Draw lower limit tick + theme_bw() + + theme(legend.position = "right", + legend.box = "vertical", + legend.title = element_text(size = 44, hjust = 0.5, face = "bold"), # Bigger title + legend.text = element_text(size = 30), # Bigger legend text + legend.key.size = unit(1.5, "cm"), # Bigger color key size + legend.key.height = unit(2, "cm"), # Increase the height of the legend box + legend.spacing = unit(3.5, "cm"), # More space between title and legend + legend.box.margin = margin(10, 20, 10, 10))#5, 5, 10, 5)) # Adjust internal spacing + + plot_legend <- plot_legend + theme(legend.box = "vertical") + plot_right_legend <- get_legend(plot_legend) + + # Extract legend + #plot_right_legend <- get_legend(plot_legend) + + # Combine all plots + final_plot <- plot_text_pathways + plot_left + plot_center + plot_right + plot_text_msigdb + plot_right_legend + + plot_layout(ncol = 6, widths = c(4, 25, 15, 3, 10, 3)) + + # Display the plot + #print(final_plot) + + # Save the plot as JPG and PDF + ggsave(paste0(output_path_base, ".jpg"), final_plot, width = width_output, height = height_output, units = "in", dpi = 300) + ggsave(paste0(output_path_base, ".pdf"), final_plot, width = width_output, height = height_output, units = "in") +} From 29427bcb9fe2698ede6e60b3d4eb99608f0320ae Mon Sep 17 00:00:00 2001 From: Daniel Garbozo <158525441+DanielGarbozo@users.noreply.github.com> Date: Thu, 24 Jul 2025 16:20:28 -0400 Subject: [PATCH 5/6] Update NAMESPACE Adds plot_global_GSEA to the NAMESPACE file so it can be accessed when the package is loaded. --- NAMESPACE | 1 + 1 file changed, 1 insertion(+) diff --git a/NAMESPACE b/NAMESPACE index 7e79e20..d5d892d 100644 --- a/NAMESPACE +++ b/NAMESPACE @@ -11,6 +11,7 @@ export(nice_tSNE) export(save_results) export(split_cases) export(tpm) +export(plot_global_GSEA) import(ggplot2) importFrom(SummarizedExperiment,assay) importFrom(SummarizedExperiment,colData) From d8c1b6d3b6f72f253f4f107cbde08b397b1cfb6d Mon Sep 17 00:00:00 2001 From: Daniel Garbozo <158525441+DanielGarbozo@users.noreply.github.com> Date: Thu, 24 Jul 2025 16:22:59 -0400 Subject: [PATCH 6/6] Document plot_global_GSEA with roxygen2 Document plot_global_GSEA generated by roxygen2 --- man/plot_global_GSEA.Rd | 44 +++++++++++++++++++++++++++++++++++++++++ 1 file changed, 44 insertions(+) create mode 100644 man/plot_global_GSEA.Rd diff --git a/man/plot_global_GSEA.Rd b/man/plot_global_GSEA.Rd new file mode 100644 index 0000000..2504c0a --- /dev/null +++ b/man/plot_global_GSEA.Rd @@ -0,0 +1,44 @@ +% Generated by roxygen2: do not edit by hand +\name{plot_global_GSEA} +\alias{plot_global_GSEA} +\title{Plot global GSEA results} +\usage{ +plot_global_GSEA(data, geneset_col, collection_col, nes_col, + logfdr_col, output_path_base, width_output, + height_output, text_size_genesets = 5, + text_size_collection = 5) +} +\arguments{ + \item{data}{Data frame containing the GSEA results.} + \item{geneset_col}{Name of the column containing the genesets.} + \item{collection_col}{Name of the column containing the MSigDB collections.} + \item{nes_col}{Name of the column containing NES values.} + \item{logfdr_col}{Name of the column containing \eqn{-\log_{10}(FDR)} values.} + \item{output_path_base}{Path and base name for the output files (no extension).} + \item{width_output}{Width of the output plot in inches.} + \item{height_output}{Height of the output plot in inches.} + \item{text_size_genesets}{Text size for the geneset labels.} + \item{text_size_collection}{Text size for the collection labels.} +} +\description{ +Generates a multi-panel plot summarizing GSEA results, showing pathways, +normalized enrichment scores (NES), and log-transformed FDR values +grouped by MSigDB collection. +} +\details{ +This function creates a visual summary of GSEA results by combining: +\itemize{ + \item \strong{Pathway labels} on the left, + \item \strong{NES bars} filled by \emph{log10 FDR} in the center, + \item \strong{Collection labels} on the right, + \item Vertical titles on both ends ("Pathways" and "MSigDB"). +} +The final figure is saved as both PDF and JPG. +} +\examples{ +\dontrun{ +df <- readr::read_tsv("collections_merged_gsea_data.tsv") +plot_global_GSEA(df, "Geneset", "COLLECTION", "NES", "Log10FDR", + "output/plotGSEA", 12, 8) +} +}