From b40d50c2bfa764394b29399f238c04b41e528114 Mon Sep 17 00:00:00 2001 From: danoguevara Date: Sat, 26 Jul 2025 22:08:14 -0500 Subject: [PATCH] Fix GSEA functions --- DESCRIPTION | 12 +- NAMESPACE | 12 +- R/{gsea_barplot.R => barplot_GSEA.R} | 10 +- R/heatmap_GSEA.R | 165 ++++++++++++++++ R/merge_GSEA.R | 39 ++-- R/{nice_BSV.R => nice_VSB.R} | 10 +- R/plot_GSEA.R | 231 ++++++++++++----------- R/plotheatmap_leadingedge_GSEA.R | 185 ------------------ man/{gsea_barplot.Rd => barplot_GSEA.Rd} | 19 +- man/heatmap_GSEA.Rd | 46 +++++ man/merge_GSEA.Rd | 4 +- man/{nice_BSV.Rd => nice_VSB.Rd} | 10 +- man/plot_GSEA.Rd | 38 ++++ man/plot_global_GSEA.Rd | 44 ----- man/plotheatmap_leadingedge_GSEA.Rd | 65 ------- 15 files changed, 422 insertions(+), 468 deletions(-) rename R/{gsea_barplot.R => barplot_GSEA.R} (85%) create mode 100644 R/heatmap_GSEA.R rename R/{nice_BSV.R => nice_VSB.R} (95%) delete mode 100644 R/plotheatmap_leadingedge_GSEA.R rename man/{gsea_barplot.Rd => barplot_GSEA.Rd} (69%) create mode 100644 man/heatmap_GSEA.Rd rename man/{nice_BSV.Rd => nice_VSB.Rd} (93%) create mode 100644 man/plot_GSEA.Rd delete mode 100644 man/plot_global_GSEA.Rd delete mode 100644 man/plotheatmap_leadingedge_GSEA.Rd diff --git a/DESCRIPTION b/DESCRIPTION index ea3882d..1dfc1a5 100644 --- a/DESCRIPTION +++ b/DESCRIPTION @@ -27,14 +27,22 @@ Imports: umap, tibble, scales, - grid + grid, + utils, + patchwork Suggests: biomaRt, + cowplot, dbscan, ggnewscale, ggrepel, + grDevices, matrixStats, openxlsx, + pheatmap, + readr, survival, - survminer + survminer, + tidyr, + tidyselect Roxygen: list(markdown = TRUE) diff --git a/NAMESPACE b/NAMESPACE index 9251469..821d6cc 100644 --- a/NAMESPACE +++ b/NAMESPACE @@ -1,23 +1,23 @@ # Generated by roxygen2: do not edit by hand export(add_annotations) +export(barplot_GSEA) export(detect_filter) export(get_annotations) -export(merge_GSEA) export(get_stars) -export(gsea_barplot) -export(nice_BSV) +export(heatmap_GSEA) +export(merge_GSEA) export(nice_KM) export(nice_PCA) export(nice_UMAP) +export(nice_VSB) export(nice_tSNE) -export(plot_global_GSEA) +export(plot_GSEA) export(power_analysis) export(save_results) export(split_cases) -export(plotheatmap_leadingedge_GSEA) export(tpm) -export(plot_global_GSEA) import(ggplot2) importFrom(magrittr,"%>%") +importFrom(patchwork,plot_layout) importFrom(rlang,.data) diff --git a/R/gsea_barplot.R b/R/barplot_GSEA.R similarity index 85% rename from R/gsea_barplot.R rename to R/barplot_GSEA.R index 69759f8..8339beb 100644 --- a/R/gsea_barplot.R +++ b/R/barplot_GSEA.R @@ -1,5 +1,5 @@ ######################### -# Function gsea_barplot # +# Function barplot_GSEA # ######################### #' Create and save a customized barplot for GSEA results @@ -13,15 +13,12 @@ #' @param data A data frame containing GSEA results with columns such as `datatype`, `NES`, `-Log10FDR`, and `New_name`. #' @param output_path The file path where the barplot will be saved (SVG format). #' @param custom_labels A named vector of custom expressions for x-axis labels. -#' @param height Height of the saved plot in inches. Default: 49. -#' @param width Width of the saved plot in inches. Default: 30. #' @param axis_y Name of the column to use for the y-axis aesthetic, as a string. Default: "NES". #' @import ggplot2 #' @importFrom rlang .data #' @export -gsea_barplot <- function(data, output_path, custom_labels, - height = 49, width = 30, axis_y = "NES") +barplot_GSEA <- function(data, output_path, custom_labels, axis_y = "NES") { # Generate the barplot @@ -56,6 +53,5 @@ gsea_barplot <- function(data, output_path, custom_labels, facet_wrap(~ .data$New_name, ncol = 2, strip.position = "left", scales = "free_y") + scale_x_discrete(labels = custom_labels) - # Save the barplot - ggsave(filename = output_path, plot = barplot, height = height, width = width, limitsize = FALSE) + return(barplot) } diff --git a/R/heatmap_GSEA.R b/R/heatmap_GSEA.R new file mode 100644 index 0000000..40ca338 --- /dev/null +++ b/R/heatmap_GSEA.R @@ -0,0 +1,165 @@ +######################### +# Function heatmap_GSEA # +######################### + +#' Plot leading edge heatmaps from GSEA results. +#' +#' Generates heatmaps of leading edge genes for each gene set from GSEA output. +#' +#' @param main_dir Optional base directory. If supplied, it will be prepended to all relative file paths. +#' @param expression_file Path to the expression data file (tab-delimited) or relative to main_dir. +#' @param metadata_file Path to the metadata file (Excel) or relative to main_dir. +#' @param gmt_file Path to the GMT file defining gene sets or relative to main_dir. +#' @param ranked_genes_file Path to the ranked genes list file or relative to main_dir. +#' @param gsea_file Path to the GSEA results file with leading edge genes or relative to main_dir. +#' @param output_dir Directory to save heatmaps and optional TSV; default "leading_edge_heatmaps". +#' @param sample_col Name of the sample ID column in metadata; default "Sample". +#' @param group_col Name of the group column in metadata; default "group". +#' @param save_dataframe Logical; if TRUE, saves the merged data frame as TSV before plotting. +#' @return Saves one PDF and one JPG heatmap per gene set under output_dir; optionally saves intermediate TSV. +#' @export + +heatmap_GSEA <- function(main_dir = NULL, expression_file, metadata_file, gmt_file, + ranked_genes_file, gsea_file, output_dir = "leading_edge_heatmaps", + sample_col = "Sample", group_col = "group", save_dataframe = FALSE) +{ + # Ensure required packages are installed + if (!requireNamespace("readr", quietly = TRUE)) stop("Package \"readr\" must be installed to use this function.", call. = FALSE) + if (!requireNamespace("grDevices", quietly = TRUE)) stop("Package \"grDevices\" must be installed to use this function.", call. = FALSE) + if (!requireNamespace("tidyselect", quietly = TRUE)) stop("Package \"tidyselect\" must be installed to use this function.", call. = FALSE) + if (!requireNamespace("openxlsx", quietly = TRUE)) stop("Package \"openxlsx\" must be installed to use this function.", call. = FALSE) + if (!requireNamespace("pheatmap", quietly = TRUE)) stop("Package \"pheatmap\" must be installed to use this function.", call. = FALSE) + + # Prepend base directory if provided + if (!is.null(main_dir)) { + expression_file <- file.path(main_dir, expression_file) + metadata_file <- file.path(main_dir, metadata_file) + gmt_file <- file.path(main_dir, gmt_file) + ranked_genes_file <- file.path(main_dir, ranked_genes_file) + gsea_file <- file.path(main_dir, gsea_file) + output_dir <- file.path(main_dir, output_dir) + } + + # 1) Read and process GMT + gmt_data <- readLines(gmt_file) %>% + strsplit("\t") %>% + lapply(function(x) data.frame(NAME = x[1], DESCRIPTION = x[2], GENES = paste(x[-c(1,2)], collapse = ","), stringsAsFactors = FALSE)) %>% + dplyr::bind_rows() + + # 2) Read GSEA results and join genes + gsea_df <- readr::read_tsv(gsea_file, show_col_types = FALSE) %>% + dplyr::left_join(gmt_data %>% dplyr::select(NAME, GENES), by = "NAME") + + # 3) Read ranked genes list + ranked_df <- readr::read_tsv(ranked_genes_file, show_col_types = FALSE) + ranked_vector <- ranked_df[[1]] + + # 4) Internal helper: extract top-n genes from leading edge + extract_top_n <- function(genes_str, n) { + if (is.na(genes_str) || n <= 0) return(NA_character_) + glist <- unlist(strsplit(genes_str, ",")) + glist <- glist[order(match(glist, ranked_vector), na.last = TRUE)] + paste(utils::head(glist, n), collapse = ",") + } + + # 5) Compute leading edge size and genes + gsea_df <- gsea_df %>% + dplyr::mutate(L.EDGE_size = ifelse(is.na(SIZE * tags), NA, ifelse((SIZE * tags) %% 1 <= 0.5, floor(SIZE * tags), ceiling(SIZE * tags)))) %>% + dplyr::rowwise() %>% dplyr::mutate(LEADING_EDGE_GENES = extract_top_n(GENES, L.EDGE_size)) %>% + dplyr::ungroup() + + # Save intermediate dataframe if requested + if (save_dataframe) { + if (!dir.exists(output_dir)) dir.create(output_dir, recursive = TRUE) + intermediate_file <- file.path(output_dir, "leading_edge_genes_df.tsv") + readr::write_tsv(gsea_df, intermediate_file) + message("Saved data frame to: ", intermediate_file) + } + + # 6) Read metadata and prepare annotation + meta <- openxlsx::read.xlsx(metadata_file) %>% + dplyr::select(tidyselect::all_of(c(sample_col, group_col))) %>% + dplyr::rename(Sample = tidyselect::all_of(sample_col), Group = tidyselect::all_of(group_col)) %>% + as.data.frame() + rownames(meta) <- meta$Sample + + # 7) Read expression data + expr_raw <- utils::read.table(expression_file, header = TRUE, sep = "\t", + stringsAsFactors = FALSE, check.names = FALSE) + # Determine gene-name column + if ("NAME" %in% colnames(expr_raw)) { + rownames(expr_raw) <- expr_raw$NAME + expr_mat <- expr_raw[, setdiff(colnames(expr_raw), "NAME"), drop = FALSE] + } else { + gene_col <- colnames(expr_raw)[1] + rownames(expr_raw) <- expr_raw[[gene_col]] + expr_mat <- expr_raw[, -1, drop = FALSE] + } + # Clean sample names + colnames(expr_mat) <- sub("^X", "", colnames(expr_mat)) + + # Ensure output directory exists + if (!dir.exists(output_dir)) dir.create(output_dir, recursive = TRUE) + + # 8) Loop through each gene set and plot heatmap + for (i in seq_len(nrow(gsea_df))) { + geneset_name <- gsea_df$NAME[i] + leading_genes <- unlist(strsplit(gsea_df$LEADING_EDGE_GENES[i], ",")) + genes_present <- leading_genes[leading_genes %in% rownames(expr_mat)] + if (length(genes_present) == 0) next + + heatmap_mat <- expr_mat[genes_present, , drop = FALSE] + common_samps <- intersect(colnames(heatmap_mat), rownames(meta)) + if (length(common_samps) == 0) next + + heatmap_mat <- heatmap_mat[, common_samps, drop = FALSE] + annot_col <- data.frame(Group = meta[common_samps, "Group"]) + rownames(annot_col) <- common_samps + + # Dynamic sizing + w <- 10 + h <- max(5, nrow(heatmap_mat) * 0.1 + 2) + + # PDF output + grDevices::pdf(file.path(output_dir, paste0(geneset_name, "_heatmap.pdf")), width = w, height = h) + pheatmap::pheatmap( + heatmap_mat, + main = geneset_name, + color = grDevices::colorRampPalette(c("blue","white","red"))(30), + scale = "row", + clustering_distance_rows = "euclidean", + cluster_cols = FALSE, + clustering_method = "complete", + fontsize_row = 6, + fontsize_col = 7, + annotation_col = annot_col, + border_color = NA, + cellheight = 5, + cellwidth = 8 + ) + grDevices::dev.off() + + # JPG output + grDevices::jpeg(file.path(output_dir, paste0(geneset_name, "_heatmap.jpg")), + width = w * 100, height = h * 100, res = 150) + pheatmap::pheatmap( + heatmap_mat, + main = geneset_name, + color = grDevices::colorRampPalette(c("blue","white","red"))(30), + scale = "row", + clustering_distance_rows = "euclidean", + cluster_cols = FALSE, + clustering_method = "complete", + fontsize_row = 6, + fontsize_col = 7, + annotation_col = annot_col, + border_color = NA, + cellheight = 5, + cellwidth = 8 + ) + grDevices::dev.off() + } + + message("Heatmaps saved in: ", normalizePath(output_dir)) + return(TRUE) +} diff --git a/R/merge_GSEA.R b/R/merge_GSEA.R index a52618f..990d6b7 100644 --- a/R/merge_GSEA.R +++ b/R/merge_GSEA.R @@ -1,22 +1,23 @@ - -# Function merge_GSEA # +####################### +# Function merge_GSEA # +####################### #' Merge GSEA results data frames. -#' -#' After run GSEA_all.sh from GSEA.sh, merge_GSEA function join .tsv files to a single file -#' +#' +#' After running GSEA_all.sh from GSEA.sh, merge_GSEA function joins .tsv files to a single file +#' #' @param input_directory The directory containing the GSEA collection results in TSV format. #' @param output_file The output file to save the merged data. If not provided, the file will be saved in the input directory. #' @importFrom magrittr %>% #' @export - + merge_GSEA <- function(input_directory, output_file = "collections_merged_gsea_data.tsv") { - - if (!requireNamespace("dplyr", quietly = TRUE)) { - stop("Package \"dplyr\" must be installed to use this function.", call. = FALSE) - } - + + if (!requireNamespace("dplyr", quietly = TRUE)) stop("Package \"dplyr\" must be installed to use this function.", call. = FALSE) + if (!requireNamespace("readr", quietly = TRUE)) stop("Package \"readr\" must be installed to use this function.", call. = FALSE) + if (!requireNamespace("tidyr", quietly = TRUE)) stop("Package \"tidyr\" must be installed to use this function.", call. = FALSE) + # Validate input directory and check for TSV files if (!dir.exists(input_directory)) { stop("The specified directory does not exist: ", input_directory) @@ -25,7 +26,7 @@ merge_GSEA <- function(input_directory, output_file = "collections_merged_gsea_d if (length(files) == 0) { stop("No TSV files found in ", input_directory) } - + # Function to read each file and add a column with the modified file name read_file <- function(file) { data <- readr::read_tsv(file) @@ -33,7 +34,7 @@ merge_GSEA <- function(input_directory, output_file = "collections_merged_gsea_d file_name <- sub("_all.tsv$", "", file_name) # Change the pattern if necessary numeric_cols <- c("SIZE", "ES", "NES", "NOM p-val", "FDR q-val", "FWER p-val", "RANK AT MAX") data <- data %>% - dplyr::mutate(across(tidyselect::all_of(numeric_cols), as.numeric)) + dplyr::mutate(dplyr::across(tidyselect::all_of(numeric_cols), as.numeric)) data$COLLECTION <- file_name return(data) } @@ -43,12 +44,12 @@ merge_GSEA <- function(input_directory, output_file = "collections_merged_gsea_d # Find problematic values in numeric columns gsea_data %>% - dplyr::filter(dplyr::if_any(all_of(numeric_cols), ~ !grepl("^-?[0-9.]+$", .))) %>% + dplyr::filter(dplyr::if_any(tidyselect::all_of(numeric_cols), ~ !grepl("^-?[0-9.]+$", .))) %>% print() - + # Data processing: selection, separation, mutation, and renaming of columns gsea_data <- gsea_data %>% - dplyr::select(-"GS
follow link to MSigDB", -"GS DETAILS") %>% + dplyr::select(-"GS
follow link to MSigDB", -"GS DETAILS") %>% tidyr::separate(col = `LEADING EDGE`, into = c("tags", "list", "signal"), sep = ",", remove = FALSE) %>% dplyr::mutate( tags = 0.01 * as.numeric(sub("%", "", sub("tags=", "", tags))), @@ -59,8 +60,10 @@ merge_GSEA <- function(input_directory, output_file = "collections_merged_gsea_d ) %>% dplyr::relocate(`Log10FDR`, .after = `FWER p-val`) %>% dplyr::rename(COMPARISON = Comparison, FDR = `FDR q-val`) - + # Save the processed data to a TSV file readr::write_tsv(gsea_data, output_file) - cat("GSEA data saved to:", output_file, "\n") + message("GSEA data saved to:", output_file, "\n") + + return(TRUE) } diff --git a/R/nice_BSV.R b/R/nice_VSB.R similarity index 95% rename from R/nice_BSV.R rename to R/nice_VSB.R index 0b80c81..97be5c6 100644 --- a/R/nice_BSV.R +++ b/R/nice_VSB.R @@ -1,8 +1,8 @@ -####################### -# Function nice_BSV.R # -####################### +##################### +# Function nice_VSB # +##################### -#' Function to make Box-Scatter-Violin plots. +#' Function to make Violin-Scatter-Box plots. #' #' This function will make a Boxplot, using a DEseq object. #' It will show the data points on top with a small deviation (jitter) for a better visualization. @@ -27,7 +27,7 @@ #' @importFrom rlang .data #' @export -nice_BSV <- function (object = NULL, annotations, variables = c(fill = "VarFill", shape = "VarShape"), +nice_VSB <- function (object = NULL, annotations, variables = c(fill = "VarFill", shape = "VarShape"), genename = NULL, symbol = NULL, labels = c("N", "P", "R", "M"), categories = c("normal", "primary", "recurrence", "metastasis"), colors = NULL, shapes = NULL, markersize = NULL, alpha = 0.8, jitter = 0.2, diff --git a/R/plot_GSEA.R b/R/plot_GSEA.R index 1c79858..3be4b21 100644 --- a/R/plot_GSEA.R +++ b/R/plot_GSEA.R @@ -1,114 +1,117 @@ -#' Plot global GSEA results -#' -#' Generates a composite plot displaying NES values, pathway labels, -#' and a \emph{logFDR} legend, organized by MSigDB collections. -#' -#' @param data Data frame containing the GSEA results. -#' @param geneset_col Name of the column containing the genesets. -#' @param collection_col Name of the column containing the collections. -#' @param nes_col Name of the column containing the NES values. -#' @param logfdr_col Name of the column containing \eqn{-\log_{10}(FDR)} values. -#' @param output_path_base Path and base name for the output files (without extension). -#' @param width_output Plot width in inches. -#' @param height_output Plot height in inches. -#' @param text_size_genesets Text size for the geneset labels. -#' @param text_size_collection Text size for the collection labels. -#' @return Saves two files: \code{output_path_base}.jpg and \code{output_path_base}.pdf. -#' @import ggplot2 patchwork cowplot scales -#' @export -plot_GSEA <- function(data, geneset_col, collection_col, nes_col, logfdr_col, output_path_base, width_output, height_output, text_size_genesets = 5, text_size_collection = 5) { - - # Rename columns dynamically - data <- data[, c(geneset_col, collection_col, nes_col, logfdr_col)] - colnames(data) <- c("Geneset", "Collection", "NES", "logFDR") - - # Order data by NES value (descending) - data <- data[order(data$NES, decreasing = TRUE), ] - - # Ensure Geneset and Collection are factors with ordered levels - data$Geneset <- factor(data$Geneset, levels = rev(unique(data$Geneset))) - data$Collection <- factor(data$Collection, levels = unique(data$Collection)) - - # Right-side label: "MSigDB" vertically centered, in bold and italic - plot_text_msigdb <- ggplot() + - annotate("text", label = "MSigDB", fontface = "bold.italic", angle = 90, size = 35, x = 0, y = 0.5)+ - theme_void() - - # Lef-side label: "Pathways" vertically centered, in bold and italic - plot_text_pathways <- ggplot() + - annotate("text", label = "Pathways", fontface = "bold.italic", angle = 90, size = 35, x = 0, y = 0.5)+ - theme_void() - - # Right panel: Collection labels (without repetition) - plot_right <- ggplot(data, aes(y = Geneset, x = 1.5, label = Collection)) + - geom_text(aes(label = ifelse(duplicated(Collection), "", Collection)), - hjust = 0.5, size = 0, fontface = "bold") + - facet_grid(Collection ~ ., scales = "free_y", space = "free", switch = "y") + - theme_void() + - theme(strip.text.y = element_text(angle = 0, hjust = 1, size = text_size_collection), - panel.spacing = unit(1, "lines")) - - # Center panel: NES bar plot - plot_center <- ggplot(data, aes(x = NES, y = Geneset, fill = logFDR)) + - geom_col(color = "black", size = 1) + - scale_fill_gradient(low = "white", high = "red", - limits = c(0,3), breaks = seq(0,3,1)) + - scale_y_discrete(position = "right") + - facet_grid(Collection ~ ., scales = "free_y", space = "free_y") + - theme_bw() + - labs(x = "NES", y = "") + - theme(axis.text.y = element_blank(), - strip.background = element_rect(fill = "white", color = "black",linewidth = 1 ), - axis.ticks.y = element_line(color = "black", size = 1.5), - axis.ticks.length = unit(0.3, "cm"), - strip.text.y = element_text(size = 1, margin = margin(0, 0, 0, 0)),# element_blank(), - legend.position = "none", - axis.title.x = element_text(size = 49), - axis.text.x = element_text(size = 45), - panel.spacing = unit(4, "lines") - ) - - # Left panel: Pathays labels - plot_left <- ggplot(data, aes(y = Geneset, x = 0, label = Geneset)) + - geom_text(hjust = 1, size = text_size_genesets) + - theme_void() + - theme(axis.text.y = element_blank(), - plot.margin = margin(0, 0, 0, -50)) - - # Legend panel - plot_legend <- ggplot(data, aes(x = NES, y = Geneset, fill = logFDR)) + - geom_tile() + - scale_fill_gradient(low = "white", high = "red", - name = expression(-log[10] ~ FDR), # log10FDR with subscrip, - limits = c(0,3), breaks = seq(0,3,1), - guide = guide_colorbar(ticks.colour = "black", # Make ticks black - ticks.linewidth = 1.5, # Make ticks thicker - draw.ulim = TRUE, # Draw upper limit tick - draw.llim = TRUE)) + # Draw lower limit tick - theme_bw() + - theme(legend.position = "right", - legend.box = "vertical", - legend.title = element_text(size = 44, hjust = 0.5, face = "bold"), # Bigger title - legend.text = element_text(size = 30), # Bigger legend text - legend.key.size = unit(1.5, "cm"), # Bigger color key size - legend.key.height = unit(2, "cm"), # Increase the height of the legend box - legend.spacing = unit(3.5, "cm"), # More space between title and legend - legend.box.margin = margin(10, 20, 10, 10))#5, 5, 10, 5)) # Adjust internal spacing - - plot_legend <- plot_legend + theme(legend.box = "vertical") - plot_right_legend <- get_legend(plot_legend) - - # Extract legend - #plot_right_legend <- get_legend(plot_legend) - - # Combine all plots - final_plot <- plot_text_pathways + plot_left + plot_center + plot_right + plot_text_msigdb + plot_right_legend + - plot_layout(ncol = 6, widths = c(4, 25, 15, 3, 10, 3)) - - # Display the plot - #print(final_plot) - - # Save the plot as JPG and PDF - ggsave(paste0(output_path_base, ".jpg"), final_plot, width = width_output, height = height_output, units = "in", dpi = 300) - ggsave(paste0(output_path_base, ".pdf"), final_plot, width = width_output, height = height_output, units = "in") -} +###################### +# Function plot_GSEA # +###################### + +#' Plot global GSEA results +#' +#' Generates a composite plot displaying NES values, pathway labels, +#' and a \emph{logFDR} legend, organized by MSigDB collections. +#' +#' @param data Data frame containing the GSEA results. +#' @param geneset_col Name of the column containing the genesets. +#' @param collection_col Name of the column containing the collections. +#' @param nes_col Name of the column containing the NES values. +#' @param logfdr_col Name of the column containing \eqn{-\log_{10}(FDR)} values. +#' @param text_size_genesets Text size for the geneset labels. +#' @param text_size_collection Text size for the collection labels. +#' @import ggplot2 +#' @importFrom patchwork plot_layout +#' @return GSEA barplots arranged in a grid. +#' @export + +plot_GSEA <- function(data, geneset_col, collection_col, nes_col, logfdr_col, + text_size_genesets = 5, text_size_collection = 5) +{ + + if (!requireNamespace("patchwork", quietly = TRUE)) stop("Package \"patchwork\" must be installed to use this function.", call. = FALSE) + if (!requireNamespace("cowplot", quietly = TRUE)) stop("Package \"cowplot\" must be installed to use this function.", call. = FALSE) + + # Rename columns dynamically + data <- data[, c(geneset_col, collection_col, nes_col, logfdr_col)] + colnames(data) <- c("Geneset", "Collection", "NES", "logFDR") + + # Order data by NES value (descending) + data <- data[order(data$NES, decreasing = TRUE), ] + + # Ensure Geneset and Collection are factors with ordered levels + data$Geneset <- factor(data$Geneset, levels = rev(unique(data$Geneset))) + data$Collection <- factor(data$Collection, levels = unique(data$Collection)) + + # Right-side label: "MSigDB" vertically centered, in bold and italic + plot_text_msigdb <- ggplot() + + annotate("text", label = "MSigDB", fontface = "bold.italic", angle = 90, size = 35, x = 0, y = 0.5)+ + theme_void() + + # Lef-side label: "Pathways" vertically centered, in bold and italic + plot_text_pathways <- ggplot() + + annotate("text", label = "Pathways", fontface = "bold.italic", angle = 90, size = 35, x = 0, y = 0.5)+ + theme_void() + + # Right panel: Collection labels (without repetition) + plot_right <- ggplot(data, aes(y = Geneset, x = 1.5, label = Collection)) + + geom_text(aes(label = ifelse(duplicated(Collection), "", Collection)), + hjust = 0.5, size = 0, fontface = "bold") + + facet_grid(Collection ~ ., scales = "free_y", space = "free", switch = "y") + + theme_void() + + theme(strip.text.y = element_text(angle = 0, hjust = 1, size = text_size_collection), + panel.spacing = grid::unit(1, "lines")) + + # Center panel: NES bar plot + plot_center <- ggplot(data, aes(x = NES, y = Geneset, fill = logFDR)) + + geom_col(color = "black", size = 1) + + scale_fill_gradient(low = "white", high = "red", + limits = c(0,3), breaks = seq(0,3,1)) + + scale_y_discrete(position = "right") + + facet_grid(Collection ~ ., scales = "free_y", space = "free_y") + + theme_bw() + + labs(x = "NES", y = "") + + theme(axis.text.y = element_blank(), + strip.background = element_rect(fill = "white", color = "black",linewidth = 1 ), + axis.ticks.y = element_line(color = "black", size = 1.5), + axis.ticks.length = grid::unit(0.3, "cm"), + strip.text.y = element_text(size = 1, margin = margin(0, 0, 0, 0)),# element_blank(), + legend.position = "none", + axis.title.x = element_text(size = 49), + axis.text.x = element_text(size = 45), + panel.spacing = grid::unit(4, "lines") + ) + + # Left panel: Pathays labels + plot_left <- ggplot(data, aes(y = Geneset, x = 0, label = Geneset)) + + geom_text(hjust = 1, size = text_size_genesets) + + theme_void() + + theme(axis.text.y = element_blank(), + plot.margin = margin(0, 0, 0, -50)) + + # Legend panel + plot_legend <- ggplot(data, aes(x = NES, y = Geneset, fill = logFDR)) + + geom_tile() + + scale_fill_gradient(low = "white", high = "red", + name = expression(-log[10] ~ FDR), # log10FDR with subscrip, + limits = c(0,3), breaks = seq(0,3,1), + guide = guide_colorbar(ticks.colour = "black", # Make ticks black + ticks.linewidth = 1.5, # Make ticks thicker + draw.ulim = TRUE, # Draw upper limit tick + draw.llim = TRUE)) + # Draw lower limit tick + theme_bw() + + theme(legend.position = "right", + legend.box = "vertical", + legend.title = element_text(size = 44, hjust = 0.5, face = "bold"), # Bigger title + legend.text = element_text(size = 30), # Bigger legend text + legend.key.size = grid::unit(1.5, "cm"), # Bigger color key size + legend.key.height = grid::unit(2, "cm"), # Increase the height of the legend box + legend.spacing = grid::unit(3.5, "cm"), # More space between title and legend + legend.box.margin = margin(10, 20, 10, 10)) # 5, 5, 10, 5)) # Adjust internal spacing + + plot_legend <- plot_legend + theme(legend.box = "vertical") + plot_right_legend <- cowplot::get_legend(plot_legend) + + # Extract legend + #plot_right_legend <- get_legend(plot_legend) + + # Combine all plots + final_plot <- plot_text_pathways + plot_left + plot_center + plot_right + plot_text_msigdb + plot_right_legend + + patchwork::plot_layout(ncol = 6, widths = c(4, 25, 15, 3, 10, 3)) + + return(final_plot) +} diff --git a/R/plotheatmap_leadingedge_GSEA.R b/R/plotheatmap_leadingedge_GSEA.R deleted file mode 100644 index c770b3e..0000000 --- a/R/plotheatmap_leadingedge_GSEA.R +++ /dev/null @@ -1,185 +0,0 @@ -#' Plot leading edge heatmaps from GSEA results -#' -#' Generates heatmaps for leading edge genes of each gene set from GSEA output. -#' -#' @param main_dir_2 Optional base directory to prepend to file paths. -#' @param expression_file Path to the expression data file (tab-delimited) or relative to main_dir_2. -#' @param metadata_file Path to the metadata file (Excel) or relative to main_dir_2. -#' @param gmt_file Path to the GMT file defining gene sets or relative to main_dir_2. -#' @param ranked_genes_file Path to the ranked genes list file or relative to main_dir_2. -#' @param gsea_file Path to the GSEA results file with leading edge genes or relative to main_dir_2. -#' @param output_dir Directory to save heatmaps and optional TSV; default "leading_edge_heatmaps". -#' @param sample_col Name of the sample ID column in metadata; default "Sample". -#' @param group_col Name of the group column in metadata; default "group". -#' @param save_dataframe Logical; if TRUE, saves the merged data frame as TSV before plotting. -#' @return Saves one PDF and one JPG heatmap per gene set under output_dir; optionally saves intermediate TSV. -#' @import dplyr readr tidyr readxl pheatmap -#' @export -plotheatmap_leadingedge_GSEA <- function( - main_dir_2 = NULL, - expression_file, - metadata_file, - gmt_file, - ranked_genes_file, - gsea_file, - output_dir = "leading_edge_heatmaps", - sample_col = "Sample", - group_col = "group", - save_dataframe = FALSE -) { - # Ensure required packages are installed - if (!requireNamespace("dplyr", quietly = TRUE)) stop("Package 'dplyr' is required.") - if (!requireNamespace("readr", quietly = TRUE)) stop("Package 'readr' is required.") - if (!requireNamespace("tidyr", quietly = TRUE)) stop("Package 'tidyr' is required.") - if (!requireNamespace("readxl", quietly = TRUE)) stop("Package 'readxl' is required.") - if (!requireNamespace("pheatmap", quietly = TRUE)) stop("Package 'pheatmap' is required.") - - # Prepend base directory if provided - if (!is.null(main_dir_2)) { - expression_file <- file.path(main_dir_2, expression_file) - metadata_file <- file.path(main_dir_2, metadata_file) - gmt_file <- file.path(main_dir_2, gmt_file) - ranked_genes_file <- file.path(main_dir_2, ranked_genes_file) - gsea_file <- file.path(main_dir_2, gsea_file) - output_dir <- file.path(main_dir_2, output_dir) - } - - # 1) Read and process GMT - gmt_data <- readLines(gmt_file) %>% - strsplit("\t") %>% - lapply(function(x) data.frame( - NAME = x[1], - DESCRIPTION = x[2], - GENES = paste(x[-c(1,2)], collapse = ","), - stringsAsFactors = FALSE - )) %>% - dplyr::bind_rows() - - # 2) Read GSEA results and join genes - gsea_df <- readr::read_tsv(gsea_file, show_col_types = FALSE) %>% - dplyr::left_join(gmt_data %>% dplyr::select(NAME, GENES), by = "NAME") - - # 3) Read ranked genes list - ranked_df <- readr::read_tsv(ranked_genes_file, show_col_types = FALSE) - ranked_vector <- ranked_df[[1]] - - # 4) Internal helper: extract top-n genes from leading edge - extract_top_n <- function(genes_str, n) { - if (is.na(genes_str) || n <= 0) return(NA_character_) - glist <- unlist(strsplit(genes_str, ",")) - glist <- glist[order(match(glist, ranked_vector), na.last = TRUE)] - paste(head(glist, n), collapse = ",") - } - - # 5) Compute leading edge size and genes - gsea_df <- gsea_df %>% - dplyr::mutate( - L.EDGE_size = ifelse( - is.na(SIZE * tags), NA, - ifelse((SIZE * tags) %% 1 <= 0.5, - floor(SIZE * tags), - ceiling(SIZE * tags)) - ) - ) %>% - dplyr::rowwise() %>% - dplyr::mutate( - LEADING_EDGE_GENES = extract_top_n(GENES, L.EDGE_size) - ) %>% - dplyr::ungroup() - - # Save intermediate dataframe if requested - if (save_dataframe) { - if (!dir.exists(output_dir)) dir.create(output_dir, recursive = TRUE) - intermediate_file <- file.path(output_dir, "leading_edge_genes_df.tsv") - readr::write_tsv(gsea_df, intermediate_file) - message("Saved data frame to: ", intermediate_file) - } - - # 6) Read metadata and prepare annotation - meta <- readxl::read_xlsx(metadata_file) %>% - dplyr::select(dplyr::all_of(c(sample_col, group_col))) %>% - dplyr::rename(Sample = dplyr::all_of(sample_col), - Group = dplyr::all_of(group_col)) %>% - as.data.frame() - rownames(meta) <- meta$Sample - - # 7) Read expression data - expr_raw <- read.table(expression_file, header = TRUE, sep = "\t", - stringsAsFactors = FALSE, check.names = FALSE) - # Determine gene-name column - if ("NAME" %in% colnames(expr_raw)) { - rownames(expr_raw) <- expr_raw$NAME - expr_mat <- expr_raw[, setdiff(colnames(expr_raw), "NAME"), drop = FALSE] - } else { - gene_col <- colnames(expr_raw)[1] - rownames(expr_raw) <- expr_raw[[gene_col]] - expr_mat <- expr_raw[, -1, drop = FALSE] - } - # Clean sample names - colnames(expr_mat) <- sub("^X", "", colnames(expr_mat)) - - # Ensure output directory exists - if (!dir.exists(output_dir)) dir.create(output_dir, recursive = TRUE) - - # 8) Loop through each gene set and plot heatmap - for (i in seq_len(nrow(gsea_df))) { - geneset_name <- gsea_df$NAME[i] - leading_genes <- unlist(strsplit(gsea_df$LEADING_EDGE_GENES[i], ",")) - genes_present <- leading_genes[leading_genes %in% rownames(expr_mat)] - if (length(genes_present) == 0) next - - heatmap_mat <- expr_mat[genes_present, , drop = FALSE] - common_samps <- intersect(colnames(heatmap_mat), rownames(meta)) - if (length(common_samps) == 0) next - - heatmap_mat <- heatmap_mat[, common_samps, drop = FALSE] - annot_col <- data.frame(Group = meta[common_samps, "Group"]) - rownames(annot_col) <- common_samps - - # Dynamic sizing - w <- 10 - h <- max(5, nrow(heatmap_mat) * 0.1 + 2) - - # PDF output - pdf(file.path(output_dir, paste0(geneset_name, "_heatmap.pdf")), - width = w, height = h) - pheatmap::pheatmap( - heatmap_mat, - main = geneset_name, - color = colorRampPalette(c("blue","white","red"))(30), - scale = "row", - clustering_distance_rows = "euclidean", - cluster_cols = FALSE, - clustering_method = "complete", - fontsize_row = 6, - fontsize_col = 7, - annotation_col = annot_col, - border_color = NA, - cellheight = 5, - cellwidth = 8 - ) - dev.off() - - # JPG output - jpeg(file.path(output_dir, paste0(geneset_name, "_heatmap.jpg")), - width = w * 100, height = h * 100, res = 150) - pheatmap::pheatmap( - heatmap_mat, - main = geneset_name, - color = colorRampPalette(c("blue","white","red"))(30), - scale = "row", - clustering_distance_rows = "euclidean", - cluster_cols = FALSE, - clustering_method = "complete", - fontsize_row = 6, - fontsize_col = 7, - annotation_col = annot_col, - border_color = NA, - cellheight = 5, - cellwidth = 8 - ) - dev.off() - } - - message("Heatmaps saved in: ", normalizePath(output_dir)) -} diff --git a/man/gsea_barplot.Rd b/man/barplot_GSEA.Rd similarity index 69% rename from man/gsea_barplot.Rd rename to man/barplot_GSEA.Rd index fd67b12..ba47ad0 100644 --- a/man/gsea_barplot.Rd +++ b/man/barplot_GSEA.Rd @@ -1,17 +1,10 @@ % Generated by roxygen2: do not edit by hand -% Please edit documentation in R/gsea_barplot.R -\name{gsea_barplot} -\alias{gsea_barplot} +% Please edit documentation in R/barplot_GSEA.R +\name{barplot_GSEA} +\alias{barplot_GSEA} \title{Create and save a customized barplot for GSEA results} \usage{ -gsea_barplot( - data, - output_path, - custom_labels, - height = 49, - width = 30, - axis_y = "NES" -) +barplot_GSEA(data, output_path, custom_labels, axis_y = "NES") } \arguments{ \item{data}{A data frame containing GSEA results with columns such as \code{datatype}, \code{NES}, \code{-Log10FDR}, and \code{New_name}.} @@ -20,10 +13,6 @@ gsea_barplot( \item{custom_labels}{A named vector of custom expressions for x-axis labels.} -\item{height}{Height of the saved plot in inches. Default: 49.} - -\item{width}{Width of the saved plot in inches. Default: 30.} - \item{axis_y}{Name of the column to use for the y-axis aesthetic, as a string. Default: "NES".} } \description{ diff --git a/man/heatmap_GSEA.Rd b/man/heatmap_GSEA.Rd new file mode 100644 index 0000000..788d4b0 --- /dev/null +++ b/man/heatmap_GSEA.Rd @@ -0,0 +1,46 @@ +% Generated by roxygen2: do not edit by hand +% Please edit documentation in R/heatmap_GSEA.R +\name{heatmap_GSEA} +\alias{heatmap_GSEA} +\title{Plot leading edge heatmaps from GSEA results.} +\usage{ +heatmap_GSEA( + main_dir = NULL, + expression_file, + metadata_file, + gmt_file, + ranked_genes_file, + gsea_file, + output_dir = "leading_edge_heatmaps", + sample_col = "Sample", + group_col = "group", + save_dataframe = FALSE +) +} +\arguments{ +\item{main_dir}{Optional base directory. If supplied, it will be prepended to all relative file paths.} + +\item{expression_file}{Path to the expression data file (tab-delimited) or relative to main_dir.} + +\item{metadata_file}{Path to the metadata file (Excel) or relative to main_dir.} + +\item{gmt_file}{Path to the GMT file defining gene sets or relative to main_dir.} + +\item{ranked_genes_file}{Path to the ranked genes list file or relative to main_dir.} + +\item{gsea_file}{Path to the GSEA results file with leading edge genes or relative to main_dir.} + +\item{output_dir}{Directory to save heatmaps and optional TSV; default "leading_edge_heatmaps".} + +\item{sample_col}{Name of the sample ID column in metadata; default "Sample".} + +\item{group_col}{Name of the group column in metadata; default "group".} + +\item{save_dataframe}{Logical; if TRUE, saves the merged data frame as TSV before plotting.} +} +\value{ +Saves one PDF and one JPG heatmap per gene set under output_dir; optionally saves intermediate TSV. +} +\description{ +Generates heatmaps of leading edge genes for each gene set from GSEA output. +} diff --git a/man/merge_GSEA.Rd b/man/merge_GSEA.Rd index fab3e99..d845bc8 100644 --- a/man/merge_GSEA.Rd +++ b/man/merge_GSEA.Rd @@ -4,7 +4,7 @@ \alias{merge_GSEA} \title{Merge GSEA results data frames.} \usage{ -merge_GSEA(input_directory, output_file) +merge_GSEA(input_directory, output_file = "collections_merged_gsea_data.tsv") } \arguments{ \item{input_directory}{The directory containing the GSEA collection results in TSV format.} @@ -12,5 +12,5 @@ merge_GSEA(input_directory, output_file) \item{output_file}{The output file to save the merged data. If not provided, the file will be saved in the input directory.} } \description{ -After run GSEA_all.sh from GSEA.sh, move the .tsv files to a single directory +After running GSEA_all.sh from GSEA.sh, merge_GSEA function joins .tsv files to a single file } diff --git a/man/nice_BSV.Rd b/man/nice_VSB.Rd similarity index 93% rename from man/nice_BSV.Rd rename to man/nice_VSB.Rd index ba5a925..c7ad3c4 100644 --- a/man/nice_BSV.Rd +++ b/man/nice_VSB.Rd @@ -1,10 +1,10 @@ % Generated by roxygen2: do not edit by hand -% Please edit documentation in R/nice_BSV.R -\name{nice_BSV} -\alias{nice_BSV} -\title{Function to make Box-Scatter-Violin plots.} +% Please edit documentation in R/nice_VSB.R +\name{nice_VSB} +\alias{nice_VSB} +\title{Function to make Violin-Scatter-Box plots.} \usage{ -nice_BSV( +nice_VSB( object = NULL, annotations, variables = c(fill = "VarFill", shape = "VarShape"), diff --git a/man/plot_GSEA.Rd b/man/plot_GSEA.Rd new file mode 100644 index 0000000..9e39ad3 --- /dev/null +++ b/man/plot_GSEA.Rd @@ -0,0 +1,38 @@ +% Generated by roxygen2: do not edit by hand +% Please edit documentation in R/plot_GSEA.R +\name{plot_GSEA} +\alias{plot_GSEA} +\title{Plot global GSEA results} +\usage{ +plot_GSEA( + data, + geneset_col, + collection_col, + nes_col, + logfdr_col, + text_size_genesets = 5, + text_size_collection = 5 +) +} +\arguments{ +\item{data}{Data frame containing the GSEA results.} + +\item{geneset_col}{Name of the column containing the genesets.} + +\item{collection_col}{Name of the column containing the collections.} + +\item{nes_col}{Name of the column containing the NES values.} + +\item{logfdr_col}{Name of the column containing \eqn{-\log_{10}(FDR)} values.} + +\item{text_size_genesets}{Text size for the geneset labels.} + +\item{text_size_collection}{Text size for the collection labels.} +} +\value{ +GSEA barplots arranged in a grid. +} +\description{ +Generates a composite plot displaying NES values, pathway labels, +and a \emph{logFDR} legend, organized by MSigDB collections. +} diff --git a/man/plot_global_GSEA.Rd b/man/plot_global_GSEA.Rd deleted file mode 100644 index 2504c0a..0000000 --- a/man/plot_global_GSEA.Rd +++ /dev/null @@ -1,44 +0,0 @@ -% Generated by roxygen2: do not edit by hand -\name{plot_global_GSEA} -\alias{plot_global_GSEA} -\title{Plot global GSEA results} -\usage{ -plot_global_GSEA(data, geneset_col, collection_col, nes_col, - logfdr_col, output_path_base, width_output, - height_output, text_size_genesets = 5, - text_size_collection = 5) -} -\arguments{ - \item{data}{Data frame containing the GSEA results.} - \item{geneset_col}{Name of the column containing the genesets.} - \item{collection_col}{Name of the column containing the MSigDB collections.} - \item{nes_col}{Name of the column containing NES values.} - \item{logfdr_col}{Name of the column containing \eqn{-\log_{10}(FDR)} values.} - \item{output_path_base}{Path and base name for the output files (no extension).} - \item{width_output}{Width of the output plot in inches.} - \item{height_output}{Height of the output plot in inches.} - \item{text_size_genesets}{Text size for the geneset labels.} - \item{text_size_collection}{Text size for the collection labels.} -} -\description{ -Generates a multi-panel plot summarizing GSEA results, showing pathways, -normalized enrichment scores (NES), and log-transformed FDR values -grouped by MSigDB collection. -} -\details{ -This function creates a visual summary of GSEA results by combining: -\itemize{ - \item \strong{Pathway labels} on the left, - \item \strong{NES bars} filled by \emph{log10 FDR} in the center, - \item \strong{Collection labels} on the right, - \item Vertical titles on both ends ("Pathways" and "MSigDB"). -} -The final figure is saved as both PDF and JPG. -} -\examples{ -\dontrun{ -df <- readr::read_tsv("collections_merged_gsea_data.tsv") -plot_global_GSEA(df, "Geneset", "COLLECTION", "NES", "Log10FDR", - "output/plotGSEA", 12, 8) -} -} diff --git a/man/plotheatmap_leadingedge_GSEA.Rd b/man/plotheatmap_leadingedge_GSEA.Rd deleted file mode 100644 index cf741da..0000000 --- a/man/plotheatmap_leadingedge_GSEA.Rd +++ /dev/null @@ -1,65 +0,0 @@ -% Generated by roxygen2; do not edit by hand -\name{plotheatmap_leadingedge_GSEA} -\alias{plotheatmap_leadingedge_GSEA} -\title{Plot leading edge heatmaps from GSEA results} -\usage{ -plotheatmap_leadingedge_GSEA( - main_dir_2 = NULL, - expression_file, - metadata_file, - gmt_file, - ranked_genes_file, - gsea_file, - output_dir = "leading_edge_heatmaps", - sample_col = "Sample", - group_col = "group", - save_dataframe = FALSE -) -} -\arguments{ - \item{main_dir_2}{Optional base directory to prepend to all file paths.} - \item{expression_file}{Path to the expression data file (tab-delimited) or relative to \code{main_dir_2}.} - \item{metadata_file}{Path to the metadata file (Excel) or relative to \code{main_dir_2}.} - \item{gmt_file}{Path to the GMT file defining gene sets or relative to \code{main_dir_2}.} - \item{ranked_genes_file}{Path to the ranked genes list file or relative to \code{main_dir_2}.} - \item{gsea_file}{Path to the GSEA results file with leading edge genes or relative to \code{main_dir_2}.} - \item{output_dir}{Directory to save heatmap outputs and optional TSV; default \code{"leading_edge_heatmaps"}.} - \item{sample_col}{Name of the sample ID column in metadata; default \code{"Sample"}.} - \item{group_col}{Name of the group column in metadata; default \code{"group"}.} - \item{save_dataframe}{Logical; if \code{TRUE}, saves the merged data frame as TSV before plotting.} -} -\description{ -Generates and saves per-geneset heatmaps of leading edge genes from GSEA results. -Reads input files, extracts top-ranked leading edge genes, and plots expression heatmaps. -} -\details{ -This function performs the following steps: -\enumerate{ - \item Reads a GMT file to map gene set names to member genes. - \item Reads GSEA output and merges in the full gene lists for each gene set. - \item Reads a ranked gene list and determines the order for leading edge extraction. - \item Calculates the leading-edge size (\code{SIZE * tags}) and extracts the top genes accordingly. - \item Optionally saves the intermediate merged data frame to a TSV file. - \item Reads expression and metadata tables, aligns sample names, and prepares annotation. - \item Iterates over each gene set, selecting expression rows for the leading edge genes, and writes a heatmap (PDF and JPG) to \code{output_dir}. -} -} -\value{ -No return value; files are written to disk. A message indicates the final output directory. -} -\examples{ -\dontrun{ -plotheatmap_leadingedge_GSEA( - main_dir_2 = "C:/Projects/project1", - expression_file = "Results/expression_data.txt", - metadata_file = "sample_data.xlsx", - gmt_file = "GSEA/genesets.gmt", - ranked_genes_file = "Ranking/ranked_gene_list.tsv", - gsea_file = "GSEA/gsea_results.tsv", - output_dir = "LeadingEdgePlots", - sample_col = "id", - group_col = "group", - save_dataframe = TRUE -) -} -}