diff --git a/NAMESPACE b/NAMESPACE index d8ad75f..052579b 100644 --- a/NAMESPACE +++ b/NAMESPACE @@ -3,6 +3,7 @@ export(add_annotations) export(detect_filter) export(get_annotations) +export(merge_GSEA) export(get_stars) export(gsea_barplot) export(nice_BSV) @@ -14,6 +15,7 @@ export(power_analysis) export(save_results) export(split_cases) export(tpm) +export(plot_global_GSEA) import(ggplot2) importFrom(magrittr,"%>%") importFrom(rlang,.data) diff --git a/R/merge_GSEA.R b/R/merge_GSEA.R new file mode 100644 index 0000000..a52618f --- /dev/null +++ b/R/merge_GSEA.R @@ -0,0 +1,66 @@ + +# Function merge_GSEA # + +#' Merge GSEA results data frames. +#' +#' After run GSEA_all.sh from GSEA.sh, merge_GSEA function join .tsv files to a single file +#' +#' @param input_directory The directory containing the GSEA collection results in TSV format. +#' @param output_file The output file to save the merged data. If not provided, the file will be saved in the input directory. +#' @importFrom magrittr %>% +#' @export + + +merge_GSEA <- function(input_directory, output_file = "collections_merged_gsea_data.tsv") { + + if (!requireNamespace("dplyr", quietly = TRUE)) { + stop("Package \"dplyr\" must be installed to use this function.", call. = FALSE) + } + + # Validate input directory and check for TSV files + if (!dir.exists(input_directory)) { + stop("The specified directory does not exist: ", input_directory) + } + files <- list.files(path = input_directory, pattern = "\\.tsv$", full.names = TRUE) + if (length(files) == 0) { + stop("No TSV files found in ", input_directory) + } + + # Function to read each file and add a column with the modified file name + read_file <- function(file) { + data <- readr::read_tsv(file) + file_name <- basename(file) + file_name <- sub("_all.tsv$", "", file_name) # Change the pattern if necessary + numeric_cols <- c("SIZE", "ES", "NES", "NOM p-val", "FDR q-val", "FWER p-val", "RANK AT MAX") + data <- data %>% + dplyr::mutate(across(tidyselect::all_of(numeric_cols), as.numeric)) + data$COLLECTION <- file_name + return(data) + } + + # Read and combine all TSV files into a single data frame, remove empty columns + gsea_data <- lapply(files, read_file) %>% dplyr::bind_rows() %>% dplyr::select(-`...12`) + + # Find problematic values in numeric columns + gsea_data %>% + dplyr::filter(dplyr::if_any(all_of(numeric_cols), ~ !grepl("^-?[0-9.]+$", .))) %>% + print() + + # Data processing: selection, separation, mutation, and renaming of columns + gsea_data <- gsea_data %>% + dplyr::select(-"GS
follow link to MSigDB", -"GS DETAILS") %>% + tidyr::separate(col = `LEADING EDGE`, into = c("tags", "list", "signal"), sep = ",", remove = FALSE) %>% + dplyr::mutate( + tags = 0.01 * as.numeric(sub("%", "", sub("tags=", "", tags))), + list = 0.01 * as.numeric(sub("%", "", sub("list=", "", list))), + signal = 0.01 * as.numeric(sub("%", "", sub("signal=", "", signal))), + `FDR q-val` = ifelse(`FDR q-val` == 0, 0.001, `FDR q-val`), + `Log10FDR` = -log10(`FDR q-val`) + ) %>% + dplyr::relocate(`Log10FDR`, .after = `FWER p-val`) %>% + dplyr::rename(COMPARISON = Comparison, FDR = `FDR q-val`) + + # Save the processed data to a TSV file + readr::write_tsv(gsea_data, output_file) + cat("GSEA data saved to:", output_file, "\n") +} diff --git a/R/plot_GSEA.R b/R/plot_GSEA.R new file mode 100644 index 0000000..1c79858 --- /dev/null +++ b/R/plot_GSEA.R @@ -0,0 +1,114 @@ +#' Plot global GSEA results +#' +#' Generates a composite plot displaying NES values, pathway labels, +#' and a \emph{logFDR} legend, organized by MSigDB collections. +#' +#' @param data Data frame containing the GSEA results. +#' @param geneset_col Name of the column containing the genesets. +#' @param collection_col Name of the column containing the collections. +#' @param nes_col Name of the column containing the NES values. +#' @param logfdr_col Name of the column containing \eqn{-\log_{10}(FDR)} values. +#' @param output_path_base Path and base name for the output files (without extension). +#' @param width_output Plot width in inches. +#' @param height_output Plot height in inches. +#' @param text_size_genesets Text size for the geneset labels. +#' @param text_size_collection Text size for the collection labels. +#' @return Saves two files: \code{output_path_base}.jpg and \code{output_path_base}.pdf. +#' @import ggplot2 patchwork cowplot scales +#' @export +plot_GSEA <- function(data, geneset_col, collection_col, nes_col, logfdr_col, output_path_base, width_output, height_output, text_size_genesets = 5, text_size_collection = 5) { + + # Rename columns dynamically + data <- data[, c(geneset_col, collection_col, nes_col, logfdr_col)] + colnames(data) <- c("Geneset", "Collection", "NES", "logFDR") + + # Order data by NES value (descending) + data <- data[order(data$NES, decreasing = TRUE), ] + + # Ensure Geneset and Collection are factors with ordered levels + data$Geneset <- factor(data$Geneset, levels = rev(unique(data$Geneset))) + data$Collection <- factor(data$Collection, levels = unique(data$Collection)) + + # Right-side label: "MSigDB" vertically centered, in bold and italic + plot_text_msigdb <- ggplot() + + annotate("text", label = "MSigDB", fontface = "bold.italic", angle = 90, size = 35, x = 0, y = 0.5)+ + theme_void() + + # Lef-side label: "Pathways" vertically centered, in bold and italic + plot_text_pathways <- ggplot() + + annotate("text", label = "Pathways", fontface = "bold.italic", angle = 90, size = 35, x = 0, y = 0.5)+ + theme_void() + + # Right panel: Collection labels (without repetition) + plot_right <- ggplot(data, aes(y = Geneset, x = 1.5, label = Collection)) + + geom_text(aes(label = ifelse(duplicated(Collection), "", Collection)), + hjust = 0.5, size = 0, fontface = "bold") + + facet_grid(Collection ~ ., scales = "free_y", space = "free", switch = "y") + + theme_void() + + theme(strip.text.y = element_text(angle = 0, hjust = 1, size = text_size_collection), + panel.spacing = unit(1, "lines")) + + # Center panel: NES bar plot + plot_center <- ggplot(data, aes(x = NES, y = Geneset, fill = logFDR)) + + geom_col(color = "black", size = 1) + + scale_fill_gradient(low = "white", high = "red", + limits = c(0,3), breaks = seq(0,3,1)) + + scale_y_discrete(position = "right") + + facet_grid(Collection ~ ., scales = "free_y", space = "free_y") + + theme_bw() + + labs(x = "NES", y = "") + + theme(axis.text.y = element_blank(), + strip.background = element_rect(fill = "white", color = "black",linewidth = 1 ), + axis.ticks.y = element_line(color = "black", size = 1.5), + axis.ticks.length = unit(0.3, "cm"), + strip.text.y = element_text(size = 1, margin = margin(0, 0, 0, 0)),# element_blank(), + legend.position = "none", + axis.title.x = element_text(size = 49), + axis.text.x = element_text(size = 45), + panel.spacing = unit(4, "lines") + ) + + # Left panel: Pathays labels + plot_left <- ggplot(data, aes(y = Geneset, x = 0, label = Geneset)) + + geom_text(hjust = 1, size = text_size_genesets) + + theme_void() + + theme(axis.text.y = element_blank(), + plot.margin = margin(0, 0, 0, -50)) + + # Legend panel + plot_legend <- ggplot(data, aes(x = NES, y = Geneset, fill = logFDR)) + + geom_tile() + + scale_fill_gradient(low = "white", high = "red", + name = expression(-log[10] ~ FDR), # log10FDR with subscrip, + limits = c(0,3), breaks = seq(0,3,1), + guide = guide_colorbar(ticks.colour = "black", # Make ticks black + ticks.linewidth = 1.5, # Make ticks thicker + draw.ulim = TRUE, # Draw upper limit tick + draw.llim = TRUE)) + # Draw lower limit tick + theme_bw() + + theme(legend.position = "right", + legend.box = "vertical", + legend.title = element_text(size = 44, hjust = 0.5, face = "bold"), # Bigger title + legend.text = element_text(size = 30), # Bigger legend text + legend.key.size = unit(1.5, "cm"), # Bigger color key size + legend.key.height = unit(2, "cm"), # Increase the height of the legend box + legend.spacing = unit(3.5, "cm"), # More space between title and legend + legend.box.margin = margin(10, 20, 10, 10))#5, 5, 10, 5)) # Adjust internal spacing + + plot_legend <- plot_legend + theme(legend.box = "vertical") + plot_right_legend <- get_legend(plot_legend) + + # Extract legend + #plot_right_legend <- get_legend(plot_legend) + + # Combine all plots + final_plot <- plot_text_pathways + plot_left + plot_center + plot_right + plot_text_msigdb + plot_right_legend + + plot_layout(ncol = 6, widths = c(4, 25, 15, 3, 10, 3)) + + # Display the plot + #print(final_plot) + + # Save the plot as JPG and PDF + ggsave(paste0(output_path_base, ".jpg"), final_plot, width = width_output, height = height_output, units = "in", dpi = 300) + ggsave(paste0(output_path_base, ".pdf"), final_plot, width = width_output, height = height_output, units = "in") +} diff --git a/man/merge_GSEA.Rd b/man/merge_GSEA.Rd new file mode 100644 index 0000000..fab3e99 --- /dev/null +++ b/man/merge_GSEA.Rd @@ -0,0 +1,16 @@ +% Generated by roxygen2: do not edit by hand +% Please edit documentation in R/merge_GSEA.R +\name{merge_GSEA} +\alias{merge_GSEA} +\title{Merge GSEA results data frames.} +\usage{ +merge_GSEA(input_directory, output_file) +} +\arguments{ +\item{input_directory}{The directory containing the GSEA collection results in TSV format.} + +\item{output_file}{The output file to save the merged data. If not provided, the file will be saved in the input directory.} +} +\description{ +After run GSEA_all.sh from GSEA.sh, move the .tsv files to a single directory +} diff --git a/man/plot_global_GSEA.Rd b/man/plot_global_GSEA.Rd new file mode 100644 index 0000000..2504c0a --- /dev/null +++ b/man/plot_global_GSEA.Rd @@ -0,0 +1,44 @@ +% Generated by roxygen2: do not edit by hand +\name{plot_global_GSEA} +\alias{plot_global_GSEA} +\title{Plot global GSEA results} +\usage{ +plot_global_GSEA(data, geneset_col, collection_col, nes_col, + logfdr_col, output_path_base, width_output, + height_output, text_size_genesets = 5, + text_size_collection = 5) +} +\arguments{ + \item{data}{Data frame containing the GSEA results.} + \item{geneset_col}{Name of the column containing the genesets.} + \item{collection_col}{Name of the column containing the MSigDB collections.} + \item{nes_col}{Name of the column containing NES values.} + \item{logfdr_col}{Name of the column containing \eqn{-\log_{10}(FDR)} values.} + \item{output_path_base}{Path and base name for the output files (no extension).} + \item{width_output}{Width of the output plot in inches.} + \item{height_output}{Height of the output plot in inches.} + \item{text_size_genesets}{Text size for the geneset labels.} + \item{text_size_collection}{Text size for the collection labels.} +} +\description{ +Generates a multi-panel plot summarizing GSEA results, showing pathways, +normalized enrichment scores (NES), and log-transformed FDR values +grouped by MSigDB collection. +} +\details{ +This function creates a visual summary of GSEA results by combining: +\itemize{ + \item \strong{Pathway labels} on the left, + \item \strong{NES bars} filled by \emph{log10 FDR} in the center, + \item \strong{Collection labels} on the right, + \item Vertical titles on both ends ("Pathways" and "MSigDB"). +} +The final figure is saved as both PDF and JPG. +} +\examples{ +\dontrun{ +df <- readr::read_tsv("collections_merged_gsea_data.tsv") +plot_global_GSEA(df, "Geneset", "COLLECTION", "NES", "Log10FDR", + "output/plotGSEA", 12, 8) +} +}