diff --git a/DESCRIPTION b/DESCRIPTION index 1734a94..302ac39 100644 --- a/DESCRIPTION +++ b/DESCRIPTION @@ -16,7 +16,7 @@ Authors@R: c( Description: This package contains functions that help in manipulating tables and generating plots for multi-omics analysis including genomics, transcriptomics, proteomics, methylomics and immunoinformatics. License: CC BY-NC-SA 4.0 Encoding: UTF-8 -RoxygenNote: 7.3.2 +RoxygenNote: 7.3.3 Imports: dplyr, ggplot2, @@ -30,7 +30,10 @@ Imports: grid, utils, patchwork, - cowplot + cowplot, + igraph, + visNetwork, + cluster Suggests: biomaRt, dbscan, diff --git a/NAMESPACE b/NAMESPACE index cd794b9..ed9408b 100644 --- a/NAMESPACE +++ b/NAMESPACE @@ -1,6 +1,7 @@ # Generated by roxygen2: do not edit by hand export(add_annotations) +export(build_net) export(detect_filter) export(get_annotations) export(get_stars) @@ -17,9 +18,29 @@ export(power_analysis) export(save_results) export(split_cases) export(tpm) +export(view_net) import(ggplot2) +importFrom(cluster,silhouette) importFrom(cowplot,get_legend) +importFrom(dplyr,distinct) +importFrom(dplyr,filter) +importFrom(dplyr,left_join) +importFrom(dplyr,pull) +importFrom(dplyr,rename) +importFrom(igraph,E) +importFrom(igraph,V) +importFrom(igraph,as_data_frame) +importFrom(igraph,graph_from_adjacency_matrix) importFrom(magrittr,"%>%") importFrom(patchwork,plot_layout) importFrom(rlang,.data) +importFrom(stats,cutree) +importFrom(stats,dist) +importFrom(stats,hclust) +importFrom(tibble,tibble) importFrom(utils,modifyList) +importFrom(utils,read.delim) +importFrom(visNetwork,visLayout) +importFrom(visNetwork,visNetwork) +importFrom(visNetwork,visOptions) +importFrom(visNetwork,visSave) diff --git a/R/build_net.R b/R/build_net.R new file mode 100644 index 0000000..7c6bea5 --- /dev/null +++ b/R/build_net.R @@ -0,0 +1,238 @@ +############################### +# Build enrichment clustering # +############################### + +#' Calculate gene set clusters from GMT files and enrichment results +#' +#' This function parses one or more GMT files, filters an enrichment results +#' data frame by an FDR column, computes a Jaccard similarity matrix between +#' selected gene sets, performs hierarchical clustering, and determines the +#' optimal number of clusters using the silhouette index. +#' +#' @param results_df A \code{data.frame} or \code{tibble} containing enrichment +#' results. Must contain at least the columns specified by \code{fdr_col} and +#' \code{pathway_col}. +#' @param gmt_path A character scalar. Path to a directory containing one or +#' more \code{.gmt} files. +#' @param fdr_threshold A numeric scalar specifying the FDR cutoff used to +#' filter \code{results_df}. Only pathways with FDR values less than this +#' threshold will be considered. Default is \code{0.25}. +#' @param fdr_col A character scalar specifying the column name in +#' \code{results_df} that contains FDR values. +#' @param pathway_col A character scalar specifying the column name in +#' \code{results_df} that contains pathway identifiers (names matching the +#' GMT gene set names). +#' @param max_k_ratio A numeric scalar used to determine the maximum number of +#' clusters to evaluate with the silhouette index. The maximum k is computed +#' as \code{max(2, floor(n / max_k_ratio))}, where \code{n} is the number of +#' selected gene sets. Default is \code{2.5}. +#' +#' @details +#' GMT files are expected to be tab-delimited, with the first column being the +#' gene set name, the second column a description (which is ignored), and the +#' remaining columns listing gene identifiers. +#' +#' Jaccard similarity between gene sets \eqn{A} and \eqn{B} is defined as: +#' \deqn{J(A,B) = |A \cap B| / |A \cup B|}. +#' +#' Hierarchical clustering is performed using Euclidean distances derived from +#' \code{1 - Jaccard_similarity} and the \code{"ward.D2"} linkage method. +#' The optimal number of clusters is chosen as the \code{k} that maximizes the +#' mean silhouette width for \code{k} in \code{2:max_k}. +#' +#' This function does not perform any file I/O beyond reading GMT files; it is +#' intended to be used inside packages and downstream plotting functions. +#' +#' @return A named list with the following elements: +#' \itemize{ +#' \item \code{clusters}: A \code{tibble} with columns \code{pathway} and +#' \code{cluster} (integer cluster memberships). +#' \item \code{jaccard_matrix}: A symmetric numeric matrix of Jaccard +#' similarities between gene sets (rows and columns named by pathways). +#' \item \code{hclust_obj}: An \code{\link[stats]{hclust}} object containing +#' the hierarchical clustering result. +#' \item \code{optimal_k}: An integer giving the selected number of clusters. +#' \item \code{silhouette_data}: A \code{tibble} with columns \code{k} and +#' \code{silhouette}, suitable for plotting the silhouette index as a +#' function of \code{k}. +#' } +#' +#' @examples +#' \dontrun{ +#' res <- read.csv("pathway_results.csv") +#' clustering <- build_net( +#' results_df = res, +#' gmt_path = "gmt_dir/", +#' fdr_threshold = 0.25, +#' fdr_col = "FDR_gsea", +#' pathway_col = "Pathway", +#' max_k_ratio = 2.5 +#' ) +#' } +#' +#' @importFrom stats hclust dist cutree +#' @importFrom utils read.delim +#' @importFrom cluster silhouette +#' @importFrom dplyr filter pull distinct +#' @importFrom rlang .data +#' @importFrom tibble tibble +#' @export + +build_net <- function(results_df, + gmt_path, + fdr_threshold = 0.25, + fdr_col, + pathway_col, + max_k_ratio = 2.5) { + # Basic input checks + if (!dir.exists(gmt_path)) { + stop("The provided 'gmt_path' does not exist: ", gmt_path) + } + if (!is.data.frame(results_df)) { + stop("'results_df' must be a data.frame or tibble.") + } + if (!is.character(fdr_col) || length(fdr_col) != 1L) { + stop("'fdr_col' must be a single character string.") + } + if (!is.character(pathway_col) || length(pathway_col) != 1L) { + stop("'pathway_col' must be a single character string.") + } + if (!fdr_col %in% colnames(results_df)) { + stop("Column '", fdr_col, "' not found in 'results_df'.") + } + if (!pathway_col %in% colnames(results_df)) { + stop("Column '", pathway_col, "' not found in 'results_df'.") + } + if (!is.numeric(fdr_threshold) || length(fdr_threshold) != 1L || is.na(fdr_threshold)) { + stop("'fdr_threshold' must be a single numeric value.") + } + if (!is.numeric(max_k_ratio) || length(max_k_ratio) != 1L || is.na(max_k_ratio) || max_k_ratio <= 0) { + stop("'max_k_ratio' must be a single positive numeric value.") + } + + # Find GMT files + gmt_files <- list.files(gmt_path, pattern = "\\.gmt$", full.names = TRUE) + if (length(gmt_files) == 0L) { + stop("No .gmt files found in directory: ", gmt_path) + } + + # Read GMT files into a named list of gene vectors + geneset_list <- list() + for (f in gmt_files) { + gmt <- utils::read.delim(f, + header = FALSE, + stringsAsFactors = FALSE, + sep = "\t", + quote = "", + comment.char = "") + if (ncol(gmt) < 3L) { + next + } + for (i in seq_len(nrow(gmt))) { + name <- as.character(gmt[i, 1]) + genes <- as.character(gmt[i, 3:ncol(gmt)]) + genes <- genes[genes != "" & !is.na(genes)] + geneset_list[[name]] <- unique(genes) + } + } + + if (length(geneset_list) == 0L) { + stop("No gene sets could be parsed from the GMT files in '", gmt_path, "'.") + } + + # Filter enrichment results by FDR and select pathways + results_tbl <- if (inherits(results_df, "tbl_df")) results_df else tibble::as_tibble(results_df) + + results_filtered <- dplyr::filter( + results_tbl, + .data[[fdr_col]] < fdr_threshold + ) + + if (nrow(results_filtered) == 0L) { + stop("No pathways passed the FDR threshold (", fdr_threshold, ").") + } + + selected_sets <- dplyr::pull(results_filtered, .data[[pathway_col]]) + selected_sets <- unique(as.character(selected_sets)) + + # Intersect with gene sets available in GMTs + selected_sets <- intersect(selected_sets, names(geneset_list)) + if (length(selected_sets) == 0L) { + stop( + "No overlap between pathways in 'results_df' and gene set names in GMT files. ", + "Check that '", pathway_col, "' matches the GMT gene set names." + ) + } + + geneset_list <- geneset_list[selected_sets] + + # Build Jaccard similarity matrix + n <- length(geneset_list) + jaccard_sim <- matrix( + 0, + nrow = n, + ncol = n, + dimnames = list(names(geneset_list), names(geneset_list)) + ) + + for (i in seq_along(geneset_list)) { + for (j in seq_along(geneset_list)) { + a <- geneset_list[[i]] + b <- geneset_list[[j]] + inter <- length(intersect(a, b)) + union <- length(unique(c(a, b))) + if (union == 0) { + jaccard_sim[i, j] <- 0 + } else { + jaccard_sim[i, j] <- inter / union + } + } + } + + # Distance matrix for hierarchical clustering + dist_mat <- stats::as.dist(1 - jaccard_sim) + + # Hierarchical clustering using Ward.D2 + hc <- stats::hclust(dist_mat, method = "ward.D2") + + # Determine maximum k for silhouette evaluation + max_k <- max(2L, floor(n / max_k_ratio)) + max_k <- min(max_k, n - 1L) # cannot have k >= n for silhouette + + if (max_k < 2L) { + stop("Not enough gene sets to form at least two clusters.") + } + + ks <- 2:max_k + sil_scores <- numeric(length(ks)) + + for (idx in seq_along(ks)) { + k <- ks[idx] + cl <- stats::cutree(hc, k = k) + sil <- cluster::silhouette(cl, dist_mat) + sil_scores[idx] <- mean(sil[, "sil_width"]) + } + + optimal_k <- ks[which.max(sil_scores)] + + silhouette_data <- tibble::tibble( + k = ks, + silhouette = sil_scores + ) + + # Cluster membership at optimal k + cluster_assignments <- stats::cutree(hc, k = optimal_k) + clusters_tbl <- tibble::tibble( + pathway = names(cluster_assignments), + cluster = as.integer(cluster_assignments) + ) + + # Return structured output + list( + clusters = clusters_tbl, + jaccard_matrix = jaccard_sim, + hclust_obj = hc, + optimal_k = optimal_k, + silhouette_data = silhouette_data + ) +} \ No newline at end of file diff --git a/R/view_net.R b/R/view_net.R new file mode 100644 index 0000000..6fdc79c --- /dev/null +++ b/R/view_net.R @@ -0,0 +1,164 @@ +############################### +# Plot enrichment clustering # +############################### + +#' Plot a gene set network based on clustering output +#' +#' This function takes the output of [build_net()], +#' constructs a gene set similarity network using Jaccard similarity as edge +#' weights, and generates both a static igraph object and an interactive +#' \code{visNetwork} visualization. +#' +#' @param clustering_output A named list as returned by +#' [build_net()], containing at least +#' \code{clusters} (a tibble with columns \code{pathway} and \code{cluster}) +#' and \code{jaccard_matrix} (a symmetric numeric matrix). +#' @param edge_threshold A numeric scalar specifying the minimum Jaccard +#' similarity required for an edge to be drawn between two gene sets. +#' All similarities below this threshold are set to zero before network +#' construction. Default is \code{0.25}. +#' @param save_html_path Optional character scalar. If non-\code{NULL}, the +#' interactive \code{visNetwork} object will be saved as an HTML file at the +#' specified path using \code{visNetwork::visSave}. If \code{NULL}, no HTML +#' file is written. Default is \code{NULL}. +#' +#' @details +#' The static graph is built with \pkg{igraph} from a thresholded Jaccard +#' similarity matrix. Nodes represent gene sets (pathways), and edges represent +#' Jaccard similarity greater than or equal to \code{edge_threshold}. +#' +#' Cluster memberships are taken from \code{clustering_output$clusters} and +#' mapped to the node attribute \code{group} in the interactive network, which +#' can be used by \pkg{visNetwork} for coloring. +#' +#' This function does not perform any plotting side effects by default; it +#' returns objects that can be plotted or further customized by the caller. +#' +#' @return A named list with the following elements: +#' \itemize{ +#' \item \code{static_graph}: An \code{\link[igraph]{igraph}} object built +#' from the thresholded Jaccard similarity matrix. +#' \item \code{interactive_plot}: A \code{\link[visNetwork]{visNetworkProxy}} +#' or \code{visNetwork} object representing the interactive network +#' visualization. +#' } +#' +#' @examples +#' \dontrun{ +#' clustering <- build_net( +#' results_df = res, +#' gmt_path = "gmt_dir/", +#' fdr_threshold = 0.25, +#' fdr_col = "FDR_gsea", +#' pathway_col = "Pathway" +#' ) +#' +#' plots <- view_net( +#' clustering_output = clustering, +#' edge_threshold = 0.25, +#' save_html_path = "network.html" +#' ) +#' +#' # Static plot +#' plot(plots$static_graph, +#' vertex.label.cex = 0.7, +#' edge.width = igraph::E(plots$static_graph)$weight * 5) +#' +#' # Interactive plot (in RStudio viewer or browser) +#' plots$interactive_plot +#' } +#' +#' @importFrom igraph graph_from_adjacency_matrix V E as_data_frame +#' @importFrom visNetwork visNetwork visOptions visLayout visSave +#' @importFrom dplyr left_join rename +#' @importFrom tibble tibble +#' @export +view_net <- function(clustering_output, + edge_threshold = 0.25, + save_html_path = NULL) { + # Basic input checks + if (!is.list(clustering_output)) { + stop("'clustering_output' must be a list as returned by 'calculate_geneset_clusters'.") + } + required_elements <- c("clusters", "jaccard_matrix") + missing_elements <- setdiff(required_elements, names(clustering_output)) + if (length(missing_elements) > 0L) { + stop( + "'clustering_output' is missing required elements: ", + paste(missing_elements, collapse = ", ") + ) + } + + clusters_tbl <- clustering_output$clusters + jaccard_matrix <- clustering_output$jaccard_matrix + + if (!is.matrix(jaccard_matrix) || !is.numeric(jaccard_matrix)) { + stop("'clustering_output$jaccard_matrix' must be a numeric matrix.") + } + if (is.null(rownames(jaccard_matrix)) || is.null(colnames(jaccard_matrix))) { + stop("The Jaccard matrix must have row and column names corresponding to pathways.") + } + if (!is.data.frame(clusters_tbl)) { + stop("'clustering_output$clusters' must be a data.frame or tibble.") + } + if (!all(c("pathway", "cluster") %in% colnames(clusters_tbl))) { + stop("'clusters' must contain 'pathway' and 'cluster' columns.") + } + if (!is.numeric(edge_threshold) || length(edge_threshold) != 1L || is.na(edge_threshold)) { + stop("'edge_threshold' must be a single numeric value.") + } + + # Threshold the Jaccard matrix to define edges + adj <- jaccard_matrix + adj[adj < edge_threshold] <- 0 + diag(adj) <- 0 + + # Build igraph object + g <- igraph::graph_from_adjacency_matrix( + adjmatrix = adj, + mode = "undirected", + weighted = TRUE, + diag = FALSE + ) + + # Static graph (igraph object) + static_graph <- g + + # Prepare nodes and edges for visNetwork + pathways <- rownames(adj) + + nodes <- tibble::tibble( + id = pathways, + label = pathways + ) + + # Merge cluster membership + clusters_tbl <- dplyr::distinct(clusters_tbl, .data$pathway, .data$cluster) + nodes <- dplyr::left_join( + nodes, + dplyr::rename(clusters_tbl, id = .data$pathway, group = .data$cluster), + by = "id" + ) + + # Convert edges to data.frame + edges_df <- igraph::as_data_frame(g, what = "edges") + # visNetwork uses columns 'from' and 'to' + colnames(edges_df)[1:2] <- c("from", "to") + + interactive_plot <- visNetwork::visNetwork(nodes, edges_df) |> + visNetwork::visOptions(highlightNearest = TRUE) |> + visNetwork::visLayout(randomSeed = 174) + + # Save HTML if requested + if (!is.null(save_html_path)) { + if (!is.character(save_html_path) || length(save_html_path) != 1L) { + stop("'save_html_path' must be a single character string if provided.") + } + visNetwork::visSave(interactive_plot, file = save_html_path) + } + + list( + static_graph = static_graph, + interactive_plot = interactive_plot + ) +} \ No newline at end of file diff --git a/data/raw_counts.rda b/data/raw_counts.rda index ea791d6..082f445 100644 Binary files a/data/raw_counts.rda and b/data/raw_counts.rda differ diff --git a/data/sampledata.rda b/data/sampledata.rda index 6e956c6..d7d03fc 100644 Binary files a/data/sampledata.rda and b/data/sampledata.rda differ diff --git a/man/build_net.Rd b/man/build_net.Rd new file mode 100644 index 0000000..096eb7e --- /dev/null +++ b/man/build_net.Rd @@ -0,0 +1,90 @@ +% Generated by roxygen2: do not edit by hand +% Please edit documentation in R/build_net.R +\name{build_net} +\alias{build_net} +\title{Calculate gene set clusters from GMT files and enrichment results} +\usage{ +build_net( + results_df, + gmt_path, + fdr_threshold = 0.25, + fdr_col, + pathway_col, + max_k_ratio = 2.5 +) +} +\arguments{ +\item{results_df}{A \code{data.frame} or \code{tibble} containing enrichment +results. Must contain at least the columns specified by \code{fdr_col} and +\code{pathway_col}.} + +\item{gmt_path}{A character scalar. Path to a directory containing one or +more \code{.gmt} files.} + +\item{fdr_threshold}{A numeric scalar specifying the FDR cutoff used to +filter \code{results_df}. Only pathways with FDR values less than this +threshold will be considered. Default is \code{0.25}.} + +\item{fdr_col}{A character scalar specifying the column name in +\code{results_df} that contains FDR values.} + +\item{pathway_col}{A character scalar specifying the column name in +\code{results_df} that contains pathway identifiers (names matching the +GMT gene set names).} + +\item{max_k_ratio}{A numeric scalar used to determine the maximum number of +clusters to evaluate with the silhouette index. The maximum k is computed +as \code{max(2, floor(n / max_k_ratio))}, where \code{n} is the number of +selected gene sets. Default is \code{2.5}.} +} +\value{ +A named list with the following elements: +\itemize{ +\item \code{clusters}: A \code{tibble} with columns \code{pathway} and +\code{cluster} (integer cluster memberships). +\item \code{jaccard_matrix}: A symmetric numeric matrix of Jaccard +similarities between gene sets (rows and columns named by pathways). +\item \code{hclust_obj}: An \code{\link[stats]{hclust}} object containing +the hierarchical clustering result. +\item \code{optimal_k}: An integer giving the selected number of clusters. +\item \code{silhouette_data}: A \code{tibble} with columns \code{k} and +\code{silhouette}, suitable for plotting the silhouette index as a +function of \code{k}. +} +} +\description{ +This function parses one or more GMT files, filters an enrichment results +data frame by an FDR column, computes a Jaccard similarity matrix between +selected gene sets, performs hierarchical clustering, and determines the +optimal number of clusters using the silhouette index. +} +\details{ +GMT files are expected to be tab-delimited, with the first column being the +gene set name, the second column a description (which is ignored), and the +remaining columns listing gene identifiers. + +Jaccard similarity between gene sets \eqn{A} and \eqn{B} is defined as: +\deqn{J(A,B) = |A \cap B| / |A \cup B|}. + +Hierarchical clustering is performed using Euclidean distances derived from +\code{1 - Jaccard_similarity} and the \code{"ward.D2"} linkage method. +The optimal number of clusters is chosen as the \code{k} that maximizes the +mean silhouette width for \code{k} in \code{2:max_k}. + +This function does not perform any file I/O beyond reading GMT files; it is +intended to be used inside packages and downstream plotting functions. +} +\examples{ +\dontrun{ +res <- read.csv("pathway_results.csv") +clustering <- build_net( + results_df = res, + gmt_path = "gmt_dir/", + fdr_threshold = 0.25, + fdr_col = "FDR_gsea", + pathway_col = "Pathway", + max_k_ratio = 2.5 +) +} + +} diff --git a/man/view_net.Rd b/man/view_net.Rd new file mode 100644 index 0000000..b405b76 --- /dev/null +++ b/man/view_net.Rd @@ -0,0 +1,78 @@ +% Generated by roxygen2: do not edit by hand +% Please edit documentation in R/view_net.R +\name{view_net} +\alias{view_net} +\title{Plot a gene set network based on clustering output} +\usage{ +view_net(clustering_output, edge_threshold = 0.25, save_html_path = NULL) +} +\arguments{ +\item{clustering_output}{A named list as returned by +\code{\link[=build_net]{build_net()}}, containing at least +\code{clusters} (a tibble with columns \code{pathway} and \code{cluster}) +and \code{jaccard_matrix} (a symmetric numeric matrix).} + +\item{edge_threshold}{A numeric scalar specifying the minimum Jaccard +similarity required for an edge to be drawn between two gene sets. +All similarities below this threshold are set to zero before network +construction. Default is \code{0.25}.} + +\item{save_html_path}{Optional character scalar. If non-\code{NULL}, the +interactive \code{visNetwork} object will be saved as an HTML file at the +specified path using \code{visNetwork::visSave}. If \code{NULL}, no HTML +file is written. Default is \code{NULL}.} +} +\value{ +A named list with the following elements: +\itemize{ +\item \code{static_graph}: An \code{\link[igraph]{igraph}} object built +from the thresholded Jaccard similarity matrix. +\item \code{interactive_plot}: A \code{\link[visNetwork]{visNetworkProxy}} +or \code{visNetwork} object representing the interactive network +visualization. +} +} +\description{ +This function takes the output of \code{\link[=build_net]{build_net()}}, +constructs a gene set similarity network using Jaccard similarity as edge +weights, and generates both a static igraph object and an interactive +\code{visNetwork} visualization. +} +\details{ +The static graph is built with \pkg{igraph} from a thresholded Jaccard +similarity matrix. Nodes represent gene sets (pathways), and edges represent +Jaccard similarity greater than or equal to \code{edge_threshold}. + +Cluster memberships are taken from \code{clustering_output$clusters} and +mapped to the node attribute \code{group} in the interactive network, which +can be used by \pkg{visNetwork} for coloring. + +This function does not perform any plotting side effects by default; it +returns objects that can be plotted or further customized by the caller. +} +\examples{ +\dontrun{ +clustering <- build_net( + results_df = res, + gmt_path = "gmt_dir/", + fdr_threshold = 0.25, + fdr_col = "FDR_gsea", + pathway_col = "Pathway" +) + +plots <- view_net( + clustering_output = clustering, + edge_threshold = 0.25, + save_html_path = "network.html" +) + +# Static plot +plot(plots$static_graph, + vertex.label.cex = 0.7, + edge.width = igraph::E(plots$static_graph)$weight * 5) + +# Interactive plot (in RStudio viewer or browser) +plots$interactive_plot +} + +}