From 1e85163bfb9b7bdf70fc09d868887ee7ef220dce Mon Sep 17 00:00:00 2001 From: Zhezhen Wang Date: Fri, 6 Oct 2023 16:58:32 -0400 Subject: [PATCH 1/8] =?UTF-8?q?added=20gostats=20topgo=20to=20=1B[200~?= MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit --- ontology_goseq.R | 539 +++++++++++++++++++++++++++++++++++++++++++++++ 1 file changed, 539 insertions(+) create mode 100644 ontology_goseq.R diff --git a/ontology_goseq.R b/ontology_goseq.R new file mode 100644 index 00000000..1a2f3246 --- /dev/null +++ b/ontology_goseq.R @@ -0,0 +1,539 @@ +## ontology_goseq.r: Some methods to simplify the usage of goseq. goseq is the +## first GSEA tool I learned about. It is not particularly robust, this seeks +## to amend that. + +#' Filter a goseq significance search +#' +#' Given a goseq result, use some simple filters to pull out the +#' categories of likely interest. +#' +#' @param godata goseq result +#' @param expand_categories Extract GO terms from GO.db and add them +#' to the table +#' @param pvalue Significance filter. +#' @param minimum_interesting The category should have more than this +#' number of elements. +#' @param adjust Adjusted p-value filter. +#' @param padjust_method Method for adjusting the p-values. +extract_interesting_goseq <- function(godata, expand_categories = TRUE, pvalue = 0.05, + minimum_interesting = 1, adjust = 0.05, padjust_method = "BH") { + ## Add a little logic if we are using a non-GO input database. + ## This is relevant if you use something like msigdb/reactome/kegg + ## I should probably make the rest of this function generic so that this is not required. + if (is.null(godata[["ontology"]])) { + godata[["ontology"]] <- "MF" + } + + godata_interesting <- godata + if (isTRUE(expand_categories)) { + mesg("simple_goseq(): Filling godata with terms, this is slow.") + godata_interesting <- goseq_table(godata) + } else { + ## Set the 'term' category for plotting. + godata_interesting[["term"]] <- godata_interesting[["category"]] + godata[["term"]] <- godata[["category"]] + } + + if (is.null(adjust)) { + interesting_idx <- godata[["over_represented_pvalue"]] <= pvalue + godata_interesting <- godata[godata_interesting, ] + padjust_method <- "none" + } else { + ## There is a requested pvalue adjustment + interesting_idx <- godata[["over_represented_pvalue"]] <= adjust + godata_interesting <- godata[interesting_idx, ] + if (dim(godata_interesting)[1] < minimum_interesting) { + message("simple_goseq(): There are no genes with an adj.p < ", adjust, " using: ", + padjust_method, ".") + message("simple_goseq(): Providing genes with raw pvalue < ", pvalue, ".") + interesting_idx <- godata[["over_represented_pvalue"]] <= pvalue + godata_interesting <- godata_interesting[interesting_idx, ] + padjust_method <- "none" + } + } + + na_idx <- is.na(godata[["ontology"]]) + godata <- godata[!na_idx, ] + mf_idx <- godata[["ontology"]] == "MF" + mf_subset <- godata[mf_idx, ] + rownames(mf_subset) <- mf_subset[["category"]] + bp_idx <- godata[["ontology"]] == "BP" + bp_subset <- godata[bp_idx, ] + rownames(bp_subset) <- bp_subset[["category"]] + cc_idx <- godata[["ontology"]] == "CC" + cc_subset <- godata[cc_idx, ] + rownames(cc_subset) <- cc_subset[["category"]] + + na_idx <- is.na(godata_interesting[["ontology"]]) + godata_interesting <- godata_interesting[!na_idx, ] + mf_idx <- godata_interesting[["ontology"]] == "MF" + mf_interesting <- godata_interesting[mf_idx, ] + rownames(mf_interesting) <- mf_interesting[["category"]] + if (is.null(mf_interesting[["term"]])) { + mf_interesting <- mf_interesting[, c("ontology", "numDEInCat", "numInCat", + "over_represented_pvalue", "qvalue")] + } else { + mf_interesting <- mf_interesting[, c("ontology", "numDEInCat", "numInCat", + "over_represented_pvalue", "qvalue", "term")] + } + bp_idx <- godata_interesting[["ontology"]] == "BP" + bp_interesting <- godata_interesting[bp_idx, ] + rownames(bp_interesting) <- bp_interesting[["category"]] + if (is.null(bp_interesting[["term"]])) { + bp_interesting <- bp_interesting[, c("ontology", "numDEInCat", "numInCat", + "over_represented_pvalue", "qvalue")] + } else { + bp_interesting <- bp_interesting[, c("ontology", "numDEInCat", "numInCat", + "over_represented_pvalue", "qvalue", "term")] + } + cc_idx <- godata_interesting[["ontology"]] == "CC" + cc_interesting <- godata_interesting[cc_idx, ] + rownames(cc_interesting) <- cc_interesting[["category"]] + if (is.null(cc_interesting[["term"]])) { + cc_interesting <- cc_interesting[, c("ontology", "numDEInCat", "numInCat", + "over_represented_pvalue", "qvalue")] + } else { + cc_interesting <- cc_interesting[, c("ontology", "numDEInCat", "numInCat", + "over_represented_pvalue", "qvalue", "term")] + } + retlist <- list( + "godata" = godata, + "interesting" = godata_interesting, + "mf_subset" = mf_subset, + "bp_subset" = bp_subset, + "cc_subset" = cc_subset, + "MF" = mf_interesting, + "BP" = bp_interesting, + "CC" = cc_interesting) + return(retlist) +} + +#' Pass MSigDB categorical data to goseq and run it. +#' +#' goseq is probably the easiest method to push varying data types into. Thus +#' it was the first thing I thought of when looking to push MSigDB data into a +#' GSEA method. +#' +#' @param sig_genes Character list of genes deemed significant. I think in the +#' current implementation this must be just a list of IDs as opposed to the +#' full dataframe of interesting genes because we likely need to convert IDs. +#' @param signatures Used by load_gmt_signatures(), the signature file or set. +#' @param data_pkg Used by load_gmt_signatures(). +#' @param signature_category Ibid, but the name of the signatures group. +#' @param current_id Used by convert_msig_ids(), when converting IDs, the name +#' of the existing type. +#' @param required_id What type to convert to in convert_msig_ids(). +#' @param length_db Dataframe of lengths. It is worth noting that goseq +#' explicitly states that one might wish to use other potentially confounding +#' factors here, but they only examine lenghts in their paper. Starting with +#' this parameter, everything is just passed directly to simple_goseq() +#' @param doplot Print the prior plot? +#' @param adjust passed to simple_goseq() +#' @param pvalue passed to simple_goseq() +#' @param length_keytype passed to simple_goseq() +#' @param go_keytype passed to simple_goseq() +#' @param goseq_method passed to simple_goseq() +#' @param padjust_method passed to simple_goseq() +#' @param excel passed to simple_goseq() +#' @param orgdb Ideally used to help goseq collect lengths. +#' @return Some goseq data! +#' @seealso [gsva] [goseq] +#' @export +goseq_msigdb <- function(sig_genes, signatures = "c2BroadSets", data_pkg = "GSVAdata", + signature_category = "c2", current_id = "ENSEMBL", required_id = "ENTREZID", + length_db = NULL, doplot = TRUE, adjust = 0.1, pvalue = 0.1, + length_keytype = "transcripts", go_keytype = "entrezid", + goseq_method = "Wallenius", padjust_method = "BH", + excel = NULL, orgdb = "org.Hs.eg.db",method = 'gostat') { + sig_data <- load_gmt_signatures(signatures = signatures, data_pkg = data_pkg, + signature_category = signature_category) + mesg("Starting to coerce the msig data to the ontology format.") + go_db <- data.table::data.table() + for (i in seq_along(sig_data)) { + gsc <- sig_data[[i]] + gsc_id <- gsc@setName + gsc_genes <- gsc@geneIds + tmp_db <- data.table::data.table("ID" = gsc_genes, "GO" = rep(gsc_id, length(gsc_genes))) + go_db <- rbind(go_db, tmp_db) + } + mesg("Finished coercing the msig data into a df with ", nrow(go_db), " rows.") + + new_ids <- NULL + new_sig <- data.frame() + if ("character" %in% class(sig_genes)) { + new_ids <- convert_ids(sig_genes, from = current_id, to = required_id, orgdb = orgdb) + new_sig <- new_ids + colnames(new_sig) <- c(current_id, "ID") + new_sig <- new_sig[, c("ID", current_id)] + rownames(new_sig) <- make.names(new_sig[["ID"]], unique = TRUE) + } else if ("data.frame" %in% class(sig_genes)) { + new_ids <- convert_ids(rownames(sig_genes), from = current_id, + to = required_id, orgdb = orgdb) + new_sig <- merge(new_ids, sig_genes, by.x = current_id, + by.y = "row.names") + new_sig[["ID"]] <- new_sig[[required_id]] + } else { + stop("I do not understand this input data format for sig_genes.") + } + + new_lids <- convert_ids(rownames(length_db), from = current_id, to = required_id, orgdb = orgdb) + new_length <- merge(new_lids, length_db, by.x = current_id, by.y = "row.names") + new_length[["ID"]] <- NULL + new_length[["ID"]] <- new_length[[required_id]] + if (is.null(new_length[["length"]])) { + new_length <- new_length[, c("ID", "width")] + colnames(new_length) <- c("ID", "length") + } else { + new_length <- new_length[, c("ID", "length")] + } + + if(method == 'goseq'){ + go_result <- simple_goseq(new_sig, go_db = go_db, length_db = new_length, + doplot = doplot, adjust = adjust, pvalue = pvalue, + length_keytype = length_keytype, go_keytype = go_keytype, + goseq_method = goseq_method, padjust_method = padjust_method, + plot_title = plot_title, + expand_categories = expand_categories, + excel = excel, add_trees = add_trees, + gather_genes = gather_genes, width = width,...) + }elif(method == 'gostat'){ + go_result <- simple_gostats(new_sig, go_db = go_db, pcutoff = pvalue, + gff_id = go_keytype, excel = excel,...) + }elif(method == 'topgo'){ + go_result <- simple_topgo(new_sig, go_db = go_db, limit = pvalue, excel = excel,...) + } + return(go_result) +} + +#' Enhance the goseq table of gene ontology information. +#' +#' While goseq has some nice functionality, the table of outputs it provides is +#' somewhat lacking. This attempts to increase that with some extra helpful data +#' like ontology categories, definitions, etc. +#' +#' @param df Dataframe of ontology information. This is intended to be the +#' output from goseq including information like numbers/category, GOids, etc. +#' It requires a column 'category' +#' which contains: GO:000001 and such. +#' @param file Csv file to which to write the table. +#' @return Ontology table with annotation information included. +#' @seealso [goseq] [GO.db] +#' @examples +#' \dontrun{ +#' annotated_go = goseq_table(go_ids) +#' head(annotated_go, n = 1) +#' ## > category numDEInCat numInCat over_represented_pvalue +#' ## > 571 GO:0006364 9 26 4.655108e-08 +#' ## > under_represented_pvalue qvalue ontology +#' ## > 571 1.0000000 6.731286e-05 BP +#' ## > term +#' ## > 571 rRNA processing +#' ## > synonym +#' ## > 571 "35S primary transcript processing, GO:0006365" +#' ## > secondary definition +#' ## > 571 GO:0006365 Any process involved in the conversion of a primary ribosomal +#' ## RNA (rRNA) transcript into one or more mature rRNA molecules. +#' } +#' @export +goseq_table <- function(df, file = NULL) { + df[["term"]] <- goterm(df[["category"]]) + df[["ontology"]] <- goont(df[["category"]]) + message("Testing that go categories are defined.") + df[["good"]] <- gotest(df[["category"]]) + message("Removing undefined categories.") + good_idx <- df[["good"]] == 1 + df <- df[good_idx, ] + na_idx <- is.na(df[["category"]]) + df <- df[!na_idx, ] + na_idx <- is.na(df[["term"]]) + df <- df[!na_idx, ] + message("Gathering synonyms.") + df[["synonym"]] <- gosyn(df[["category"]]) + message("Gathering category definitions.") + df[["definition"]] <- godef(df[["category"]]) + df <- df[, c("category", "numDEInCat", "numInCat", "over_represented_pvalue", + "under_represented_pvalue", "qvalue", "ontology", "term", + "synonym", "definition")] + if (!is.null(file)) { + write.csv(df, file = file) + } + return(df) +} + +#' Perform a simplified goseq analysis. +#' +#' goseq can be pretty difficult to get set up for non-supported organisms. +#' This attempts to make that process a bit simpler as well as give some +#' standard outputs which should be similar to those returned by +#' clusterprofiler/topgo/gostats/gprofiler. +#' +#' @param sig_genes Data frame of differentially expressed genes, containing IDs etc. +#' @param go_db Database of go to gene mappings (OrgDb/OrganismDb) +#' @param length_db Database of gene lengths (gff/TxDb) +#' @param doplot Include pwf plots? +#' @param adjust Minimum adjusted pvalue for 'significant.' +#' @param pvalue Minimum pvalue for 'significant.' +#' @param plot_title Set a title for the pvalue plots. +#' @param length_keytype Keytype to provide to extract lengths +#' @param go_keytype Keytype to provide to extract go IDs +#' @param goseq_method Statistical test for goseq to use. +#' @param padjust_method Which method to use to adjust the pvalues. +#' @param expand_categories Expand the GO categories to make the results more readable? +#' @param excel Print the results to an excel file? +#' @param enrich Convert the goseq result to the clusterProfiler format? +#' @param ... Extra parameters which I do not recall +#' @return Big list including: +#' the pwd:pwf function, +#' alldata:the godata dataframe, +#' pvalue_histogram:p-value histograms, +#' godata_interesting:the ontology information of the enhanced groups, +#' term_table:the goterms with some information about them, +#' mf_subset:a plot of the MF enhanced groups, +#' mfp_plot:the pvalues of the MF group, +#' bp_subset:a plot of the BP enhanced groups, +#' bpp_plot, +#' cc_subset, +#' and ccp_plot +#' @seealso [goseq] [GO.db] [GenomicFeatures] [stats::p.adjust()] +#' @examples +#' \dontrun{ +#' lotsotables <- simple_goseq(gene_list, godb, lengthdb) +#' } +#' @export +simple_goseq <- function(sig_genes, go_db = NULL, length_db = NULL, doplot = TRUE, + adjust = 0.1, pvalue = 0.1, plot_title = NULL, + length_keytype = "transcripts", go_keytype = "entrezid", + goseq_method = "Wallenius", padjust_method = "BH", + expand_categories = TRUE, excel = NULL, enrich = TRUE, + ...) { + arglist <- list(...) + + minimum_interesting <- 1 + if (!is.null(arglist[["minimum_interesting"]])) { + minimum_interesting <- arglist[["minimum_interesting"]] + } + length_df <- data.frame() + length_vector <- vector() + de_vector <- vector() + gene_ids <- NULL + final_keytype <- NULL + goids_df <- NULL + ## sig_genes may be a list, character list, or data frame. + gene_list <- NULL + if (class(sig_genes)[1] == "data.table") { + sig_genes <- as.data.frame(sig_genes) + } + if (class(sig_genes) == "character") { + ## Then this is a character list of gene ids + gene_list <- sig_genes + sig_genes <- data.frame(row.names = sig_genes) + } else if (class(sig_genes) == "list") { + gene_list <- names(sig_genes) + } else if (class(sig_genes) == "data.frame") { + if (is.null(rownames(sig_genes)) && is.null(sig_genes[["ID"]])) { + stop("This requires a set of gene IDs from the rownames or a column named 'ID'.") + } else if (!is.null(sig_genes[["ID"]])) { + ## Use a column named 'ID' first because a bunch of annotation databases + ## use ENTREZ IDs which are just integers, which of course is not allowed + ## by data frame row names. + mesg("Using the ID column from your table rather than the row names.") + gene_list <- sig_genes[["ID"]] + } else if (!is.null(rownames(sig_genes))) { + mesg("Using the row names of your table.") + gene_list <- rownames(sig_genes) + } else { + gene_list <- sig_genes[["ID"]] + } + } else { + stop("Not sure how to handle your set of significant gene ids.") + } + ## At this point I should have a character list of gene ids named 'gene_list' + de_genelist <- as.data.frame(gene_list) + de_genelist[["DE"]] <- 1 + colnames(de_genelist) <- c("ID", "DE") + + ## Database of lengths may be a gff file, TxDb, or OrganismDb + metadf <- NULL + if (class(length_db)[1] == "data.table") { + length_db <- as.data.frame(length_db) + } + if (class(length_db)[[1]] == "character") { + ## Then this should be either a gff file or species name. + if (grepl(pattern = "\\.gff", x = length_db, perl = TRUE) || + grepl(pattern = "\\.gtf", x = length_db, perl = TRUE)) { + ## gff file + txdb <- GenomicFeatures::makeTxDbFromGFF(length_db) + metadf <- extract_lengths(db = txdb, gene_list = gene_list) + } else { + ## Then species name + message("A species name.") + } + } else if (class(length_db)[[1]] == "AnnotationDbi") { + stop("This currently requires an actual OrganismDb, not AnnotationDbi.") + } else if (class(length_db)[[1]] == "OrgDb") { + stop("OrgDb objects contain links to other databases, but no gene lengths.") + } else if (class(length_db)[[1]] == "OrganismDb" || + class(length_db)[[1]] == "AnnotationDbi") { + ##metadf <- extract_lengths(db = length_db, gene_list = gene_list) + metadf <- sm(extract_lengths(db = length_db, gene_list = gene_list, ...)) + } else if (class(length_db)[[1]] == "TxDb") { + metadf <- sm(extract_lengths(db = length_db, gene_list = gene_list, ...)) + } else if (class(length_db)[[1]] == "data.frame") { + metadf <- length_db + } else { + stop("Extracting lengths requires the name of a supported species or orgdb instance.") + } + + ## Sometimes the column with gene lengths is named 'width' + ## In that case, fix it. + if (is.null(metadf[["width"]]) && is.null(metadf[["length"]])) { + stop("The length db needs to have a length or width column.") + } else if (is.null(metadf[["length"]])) { + ## Then it is named 'width' and I want to rename it to length + colnames(metadf) <- gsub(x = colnames(metadf), pattern = "width", replacement = "length") + } + ## Now I should have the gene list and gene lengths + + godf <- data.frame() + if (class(go_db)[[1]] == "character") { + ## A text table or species name + if (grepl(pattern = "\\.csv", x = go_db, perl = TRUE) || + grepl(pattern = "\\.tab", x = go_db, perl = TRUE)) { + ## table + godf <- read.table(go_db, ...) + colnames(godf) <- c("ID", "GO") + } else { + ## Assume species name + supported <- TRUE + species <- go_db + } + } else if (class(go_db)[[1]] == "OrganismDb") { + godf <- extract_go(go_db, keytype = go_keytype) + } else if (class(go_db)[[1]] == "OrgDb") { + godf <- extract_go(go_db, keytype = go_keytype) + } else if (class(go_db)[[1]] == "data.table" || class(go_db)[[1]] == "tbl_df") { + godf <- as.data.frame(go_db) + } else if (class(go_db)[[1]] == "data.frame") { + godf <- go_db + if (!is.null(godf[["ID"]])) { + godf <- godf[, c("ID", "GO")] + } else if (!is.null(godf[["GID"]])) { + godf <- godf[, c("GID", "GO")] + colnames(godf) <- c("ID", "GO") + } else { + stop("Unable to read the gene ID/ GO columns from the go dataframe.") + } + } else { + stop("Unable to determine the input for creating a go dataframe.") + } + + ## entrez IDs are numeric. This is a problem when doing the pwf function + ## because it sets the rownames to the IDs. As a result, we need to call + ## make.names() on them. + colnames(godf) <- c("ID","GO") + godf[["ID"]] <- make.names(godf[["ID"]]) + metadf[["ID"]] <- make.names(metadf[["ID"]]) + de_genelist[["ID"]] <- make.names(de_genelist[["ID"]]) + ## Ok, now I have a df of GOids, all gene lengths, and DE gene list. That is + ## everything I am supposed to need for goseq. + + ## See how many entries from the godb are in the list of genes. + id_xref <- de_genelist[["ID"]] %in% godf[["ID"]] + meta_xref <- de_genelist[["ID"]] %in% metadf[["ID"]] + message("Found ", sum(id_xref), " go_db genes and ", sum(meta_xref), + " length_db genes out of ", nrow(de_genelist), ".") + ## So lets merge the de genes and gene lengths to ensure that they are + ## consistent. Then make the vectors expected by goseq. + merged_ids_lengths <- metadf + ## The following line was done in order to avoid + ## "'unimplemented type 'list' in 'orderVector1'" + merged_ids_lengths[["ID"]] <- as.character(merged_ids_lengths[["ID"]]) + merged_ids_lengths <- merge(merged_ids_lengths, + de_genelist, by.x = "ID", by.y = "ID", all.x = TRUE) + merged_ids_lengths[["length"]] <- suppressWarnings( + as.numeric(merged_ids_lengths[["length"]])) + merged_ids_lengths[is.na(merged_ids_lengths)] <- 0 + ## Not casing the next lines as character/numeric causes weird errors like 'names' attribute + ## must be the same length as the vector + de_vector <- as.vector(as.numeric(merged_ids_lengths[["DE"]])) + names(de_vector) <- make.names(as.character( + merged_ids_lengths[["ID"]]), unique = TRUE) + length_vector <- as.vector(as.numeric(merged_ids_lengths[["length"]])) + names(length_vector) <- make.names(as.character( + merged_ids_lengths[["ID"]]), unique = TRUE) + + pwf_plot <- NULL + tmp_file <- tempfile(pattern = "goseq", fileext = ".png") + this_plot <- png(filename = tmp_file) + controlled <- dev.control("enable") + pwf <- sm(suppressWarnings(goseq::nullp(DEgenes = de_vector, bias.data = length_vector, + plot.fit = doplot))) + if (isTRUE(doplot)) { + pwf_plot <- recordPlot() + } + dev.off() + removed <- file.remove(tmp_file) + + godata <- sm(goseq::goseq(pwf, gene2cat = godf, use_genes_without_cat = TRUE, + method = goseq_method)) + ## I want to limit the y-axis, but I think this is not the best way. + goseq_p <- try(plot_histogram(godata[["over_represented_pvalue"]], bins = 50)) + goseq_p_nearzero <- table(goseq_p[["data"]])[[1]] + goseq_y_limit <- goseq_p_nearzero * 2 + goseq_p <- goseq_p + + ggplot2::scale_y_continuous(limits = c(0, goseq_y_limit)) + godata[["qvalue"]] <- stats::p.adjust(godata[["over_represented_pvalue"]], + method = padjust_method) + + ## Subset the result for 'interesting' categories, which is defined + ## simply as the set of categories with full annotations. + interesting <- extract_interesting_goseq( + godata, expand_categories = expand_categories, pvalue = pvalue, adjust = adjust, + minimum_interesting = minimum_interesting, padjust_method = padjust_method) + + mesg("simple_goseq(): Making pvalue plots for the ontologies.") + pvalue_plots <- plot_goseq_pval(godata, plot_title = plot_title, + x_column = "num_cat", + ...) + pval_plots <- list( + "bpp_plot_over" = pvalue_plots[["bpp_plot_over"]], + "mfp_plot_over" = pvalue_plots[["mfp_plot_over"]], + "ccp_plot_over" = pvalue_plots[["ccp_plot_over"]]) + + retlist <- list( + "input" = sig_genes, + "pwf" = pwf, + "pwf_plot" = pwf_plot, + "all_data" = interesting[["godata"]], + "go_db" = godf, + "godata" = godata, + "pvalue_histogram" = goseq_p, + "godata_interesting" = interesting[["interesting"]], + "mf_interesting" = interesting[["MF"]], + "bp_interesting" = interesting[["BP"]], + "cc_interesting" = interesting[["CC"]], + "goadjust_method" = goseq_method, + "adjust_method" = padjust_method, + "mf_subset" = interesting[["mf_subset"]], + "bp_subset" = interesting[["bp_subset"]], + "cc_subset" = interesting[["cc_subset"]], + "pvalue_plots" = pval_plots) + class(retlist) <- c("goseq_result", "list") + if (isTRUE(enrich)) { + retlist[["mf_enrich"]] <- goseq2enrich( + retlist, ontology = "MF", cutoff = pvalue, padjust_method = padjust_method) + retlist[["bp_enrich"]] <- goseq2enrich( + retlist, ontology = "BP", cutoff = pvalue, padjust_method = padjust_method) + retlist[["cc_enrich"]] <- goseq2enrich( + retlist, ontology = "CC", cutoff = pvalue, padjust_method = padjust_method) + } + if (!is.null(excel)) { + excel_result <- write_goseq_data(retlist, excel = excel, ...) + } + return(retlist) +} + +## EOF + + + From 69f07ba16b20c105443d13c881db922d7d33026c Mon Sep 17 00:00:00 2001 From: Zhezhen Wang Date: Wed, 24 Apr 2024 14:11:22 -0400 Subject: [PATCH 2/8] updated to make sample id correct and autofil batches --- R/expt.R | 41 +++++++++++++++++++++++++++++++++++++---- R/normalize_batch.R | 8 ++++---- 2 files changed, 41 insertions(+), 8 deletions(-) diff --git a/R/expt.R b/R/expt.R index 9fcdde7b..f2a92073 100644 --- a/R/expt.R +++ b/R/expt.R @@ -46,7 +46,7 @@ combine_expts <- function(expt1, expt2, condition = "condition", all_x = TRUE, a exp2 <- expt2[["expressionset"]] fData(exp2) <- fData(exp1) - testthat::expect_equal(rownames(exprs(exp1)), rownames(exprs(exp2))) + ##testthat::expect_equal(rownames(exprs(exp1)), rownames(exprs(exp2))) if (isTRUE(merge_meta)) { design1 <- pData(exp1) @@ -62,7 +62,8 @@ combine_expts <- function(expt1, expt2, condition = "condition", all_x = TRUE, a pData(exp2) <- new_design2 } - new <- a4Base::combineTwoExpressionSet(exp1, exp2) + ## new <- a4Base::combineTwoExpressionSet(exp1, exp2) + new <- Biobase::combine(exp1, exp2) expt1[["expressionset"]] <- new expt1[["conditions"]] <- pData(expt1)[["condition"]] names(expt1[["conditions"]]) <- rownames(pData(expt1)) @@ -615,12 +616,16 @@ create_expt <- function(metadata = NULL, gene_info = NULL, count_dataframe = NUL ## not match after using tximport. if (!is.null(arglist[["tx_gene_map"]])) { tx_gene_map <- arglist[["tx_gene_map"]] - matched_rows <- sum(rownames(gene_info) %in% tx_gene_map[[2]]) + ## This is wrong, we recast gene_info as a data.table and thus it does not have rownames. + ## matched_rows <- sum(rownames(gene_info) %in% tx_gene_map[[2]]) + ## Because of the previous line, the following test always sees the match between the tx_gene_map + ## and gene annotations as 0, and therefore it messed up the rownames of the annotations. + matched_rows <- gene_info[["rownames"]] %in% tx_gene_map[[2]] if (matched_rows < 1) { message("The mapped IDs are not the rownames of your gene information, changing them now.") if (names(tx_gene_map)[2] %in% colnames(gene_info)) { new_name <- names(tx_gene_map)[2] - rownames(gene_info) <- make.names(tx_gene_map[[new_name]], unique = TRUE) + gene_info[["rownames"]] <- make.names(tx_gene_map[[new_name]], unique = TRUE) } else { warning("Cannot find an appropriate column in gene_info, refusing to use the tx_map.") } @@ -1562,11 +1567,13 @@ If this is not correctly performed, very few genes will be observed") import <- NULL import_scaled <- NULL if (is.null(tx_gene_map)) { + warning("Check that these count column names are correct, it may be the case that tximport failed and we need to set the colnames as per salmon.") import <- sm(tximport::tximport(files = files, type = "kallisto", txOut = txout)) import_scaled <- sm(tximport::tximport( files = files, type = "kallisto", txOut = txout, countsFromAbundance = "lengthScaledTPM")) } else { + warning("Check that these count column names are correct, it may be the case that tximport failed and we need to set the colnames as per salmon.") import <- sm(tximport::tximport( files = files, type = "kallisto", tx2gene = tx_gene_map, txOut = txout)) import_scaled <- sm(tximport::tximport( @@ -1585,6 +1592,7 @@ If this is not correctly performed, very few genes will be observed") import <- NULL import_scaled <- NULL if (is.null(tx_gene_map)) { + warning("Check that these count column names are correct, it may be the case that tximport failed and we need to set the colnames as per salmon.") import <- tximport::tximport(files = files, type = "rsem", txOut = txout) import_scaled <- tximport::tximport(files = files, type = "rsem", txOut = txout, countsFromAbundance = "lengthScaledTPM") @@ -1620,9 +1628,19 @@ If this is not correctly performed, very few genes will be observed") ## match those in the results from salmon. import <- tximport::tximport( files = as.character(files), type = "salmon", tx2gene = tx_gene_map, txOut = txout) + ## It appears that something in tximport recently changed which causes it to no longer + ## put the actual sampleIDs as the colnames of its return. + ## And, since I use the count table columns as the primary arbiter of the sample IDs + ## when merging the sample annotations (metadata) and the counts, this does not end well. + colnames(import[["abundance"]]) <- sample_ids + colnames(import[["counts"]]) <- sample_ids + colnames(import[["length"]]) <- sample_ids import_scaled <- sm(tximport::tximport( files = files, type = "salmon", tx2gene = tx_gene_map, txOut = txout, countsFromAbundance = "lengthScaledTPM")) + colnames(import_scaled[["abundance"]]) <- sample_ids + colnames(import_scaled[["counts"]]) <- sample_ids + colnames(import_scaled[["length"]]) <- sample_ids } retlist[["count_table"]] <- data.table::as.data.table( import[["counts"]], keep.rownames = "rownames") @@ -2494,6 +2512,21 @@ subset_expt <- function(expt, subset = NULL, ids = NULL, subset_design <- as.data.frame(subset_design, stringsAsFactors = FALSE) message("The samples (and read coverage) removed when filtering ", nonzero, " non-zero genes are: ") + dropped <- exprs(expt)[, remove_idx] + if (class(dropped)[1] == "numeric") { + print(sum(dropped)) + } else if (class(dropped)[1] == "matrix") { + print(colSums(dropped)) + } + dropped_names <- colnames(dropped) + colnames(dropped) <- dropped_names + num_genes <- rep(0, length(dropped_names)) + names(num_genes) <- dropped_names + for (sample in colnames(dropped)) { + num_genes[sample] <- sum(dropped[, sample] > 0) + } + message("by number of genes:") + print(num_genes) } else { stop("Unable to determine what is being subset.") } diff --git a/R/normalize_batch.R b/R/normalize_batch.R index f31e3ef2..c0ccb90a 100644 --- a/R/normalize_batch.R +++ b/R/normalize_batch.R @@ -310,11 +310,11 @@ all_adjusters <- function(input, design = NULL, estimate_type = "sva", batch1 = mesg("Attempting fsva surrogate estimation with ", chosen_surrogates, " ", sword, ".") type_color <- "darkred" - sva_object <- sm(sva::sva(log2_mtrx, conditional_model, - null_model, n.sv = chosen_surrogates)) - surrogate_result <- sva::fsva(log2_mtrx, conditional_model, + sva_object <- sink(sm(sva::sva(log2_mtrx, conditional_model, + null_model, n.sv = chosen_surrogates))) + surrogate_result <- sink(sva::fsva(log2_mtrx, conditional_model, sva_object, newdat = as.matrix(log2_mtrx), - method = "exact") + method = "exact")) model_adjust <- as.matrix(surrogate_result[["newsv"]]) source_counts <- surrogate_result[["new"]] }, From b35718197ae63747bb81d3da700df84983ce88cb Mon Sep 17 00:00:00 2001 From: Zhezhen Wang Date: Wed, 24 Apr 2024 14:57:29 -0400 Subject: [PATCH 3/8] updated to make sample id correct and autofil batches --- R/expt.R | 13 ++++++------- 1 file changed, 6 insertions(+), 7 deletions(-) diff --git a/R/expt.R b/R/expt.R index f2a92073..c7591eaf 100644 --- a/R/expt.R +++ b/R/expt.R @@ -364,7 +364,6 @@ create_expt <- function(metadata = NULL, gene_info = NULL, count_dataframe = NUL message("Reading the sample metadata.") sample_definitions <- extract_metadata(metadata, id_column = id_column, ...) - ## sample_definitions <- extract_metadata(metadata) ## Add an explicit removal of the file column if the option file_column is NULL. ## This is a just in case measure to avoid conflicts. if (is.null(file_column)) { @@ -1632,15 +1631,15 @@ If this is not correctly performed, very few genes will be observed") ## put the actual sampleIDs as the colnames of its return. ## And, since I use the count table columns as the primary arbiter of the sample IDs ## when merging the sample annotations (metadata) and the counts, this does not end well. - colnames(import[["abundance"]]) <- sample_ids - colnames(import[["counts"]]) <- sample_ids - colnames(import[["length"]]) <- sample_ids + colnames(import[["abundance"]]) <- ids + colnames(import[["counts"]]) <- ids + colnames(import[["length"]]) <- ids import_scaled <- sm(tximport::tximport( files = files, type = "salmon", tx2gene = tx_gene_map, txOut = txout, countsFromAbundance = "lengthScaledTPM")) - colnames(import_scaled[["abundance"]]) <- sample_ids - colnames(import_scaled[["counts"]]) <- sample_ids - colnames(import_scaled[["length"]]) <- sample_ids + colnames(import_scaled[["abundance"]]) <- ids + colnames(import_scaled[["counts"]]) <- ids + colnames(import_scaled[["length"]]) <- ids } retlist[["count_table"]] <- data.table::as.data.table( import[["counts"]], keep.rownames = "rownames") From b5f7a7b9fadb7a12287d1d78f4c4d32c27c5a572 Mon Sep 17 00:00:00 2001 From: Zhezhen Wang Date: Thu, 25 Apr 2024 12:53:52 -0400 Subject: [PATCH 4/8] print.pca_result --- R/zzz_print.R | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) diff --git a/R/zzz_print.R b/R/zzz_print.R index 248aaad2..adda9f19 100644 --- a/R/zzz_print.R +++ b/R/zzz_print.R @@ -538,8 +538,8 @@ print.pca_result <- function(x, ...) { cond_column <- x[["cond_column"]] batch_column <- x[["batch_column"]] - color_levels <- toString(levels(as.factor(pData(x)[[cond_column]]))) - batch_levels <- toString(levels(as.factor(pData(x)[[batch_column]]))) + color_levels <- toString(levels(as.factor(x[["design"]][[cond_column]]))) + batch_levels <- toString(levels(as.factor(x[["design"]][[batch_column]]))) message("The result of performing a ", x[["pc_method"]], " dimension reduction. The x-axis is PC", x[["x_pc"]], " and the y-axis is PC", x[["y_pc"]], " Colors are defined by ", color_levels, " From de56aa2506a9da15f388010c958f941f4c6e93dc Mon Sep 17 00:00:00 2001 From: Zhezhen Wang Date: Sun, 28 Apr 2024 14:13:11 -0400 Subject: [PATCH 5/8] copied expt.R from trey --- R/expt.R | 22 +++++++++++++++------- 1 file changed, 15 insertions(+), 7 deletions(-) diff --git a/R/expt.R b/R/expt.R index c7591eaf..06ddac18 100644 --- a/R/expt.R +++ b/R/expt.R @@ -305,7 +305,7 @@ create_expt <- function(metadata = NULL, gene_info = NULL, count_dataframe = NUL savefile = NULL, low_files = FALSE, handle_na = "drop", researcher = "elsayed", study_name = NULL, file_type = NULL, annotation_name = "org.Hs.eg.db", tx_gene_map = NULL, - feature_type = "gene", ...) { + feature_type = "gene", ignore_tx_version = TRUE, ...) { arglist <- list(...) ## pass stuff like sep=, header=, etc here if ("gene_tx_map" %in% names(arglist)) { @@ -418,7 +418,7 @@ create_expt <- function(metadata = NULL, gene_info = NULL, count_dataframe = NUL sample_ids <- rownames(sample_definitions) count_data <- read_counts_expt(sample_ids, filenames, countdir = countdir, tx_gene_map = tx_gene_map, file_type = file_type, - ...) + ignore_tx_version = ignore_tx_version, ...) if (count_data[["source"]] == "tximport") { tximport_data <- list("raw" = count_data[["tximport"]], "scaled" = count_data[["tximport_scaled"]]) @@ -1480,6 +1480,9 @@ make_pombe_expt <- function(annotation = TRUE) { #' @param tx_gene_map Dataframe which provides a mapping between #' transcript IDs and gene IDs. #' @param file_type Short circuit the file format autodetection. +#' @param ignore_tx_version Pass along TRUE to tximport's parameter +#' ignoreTxIds to alleviate the headaches associated with salmon's +#' stupid transcript ID .x suffix. #' @param ... More options for happy time! #' @return Data frame of count tables. #' @seealso [data.table] [create_expt()] [tximport] @@ -1491,7 +1494,7 @@ make_pombe_expt <- function(annotation = TRUE) { read_counts_expt <- function(ids, files, header = FALSE, include_summary_rows = FALSE, all.x = TRUE, all.y = FALSE, merge_type = "merge", suffix = NULL, countdir = NULL, tx_gene_map = NULL, - file_type = NULL, ...) { + file_type = NULL, ignore_tx_version = TRUE, ...) { ## load first sample arglist <- list(...) retlist <- list() @@ -1618,15 +1621,19 @@ If this is not correctly performed, very few genes will be observed") import <- NULL import_scaled <- NULL if (is.null(tx_gene_map)) { - import <- sm(tximport::tximport(files = files, type = "salmon", txOut = txout)) + import <- sm(tximport::tximport(files = files, type = "salmon", + txOut = txout, ignoreTxVersion = ignore_tx_version)) import_scaled <- sm(tximport::tximport( files = files, type = "salmon", - txOut = txout, countsFromAbundance = "lengthScaledTPM")) + txOut = txout, countsFromAbundance = "lengthScaledTPM", + ignoreTxVersion = ignore_tx_version)) } else { ## Add a little test to see how well the tx_gene_map versions ## match those in the results from salmon. import <- tximport::tximport( - files = as.character(files), type = "salmon", tx2gene = tx_gene_map, txOut = txout) + files = as.character(files), type = "salmon", + tx2gene = tx_gene_map, txOut = txout, + ignoreTxVersion = ignore_tx_version) ## It appears that something in tximport recently changed which causes it to no longer ## put the actual sampleIDs as the colnames of its return. ## And, since I use the count table columns as the primary arbiter of the sample IDs @@ -1636,7 +1643,8 @@ If this is not correctly performed, very few genes will be observed") colnames(import[["length"]]) <- ids import_scaled <- sm(tximport::tximport( files = files, type = "salmon", tx2gene = tx_gene_map, - txOut = txout, countsFromAbundance = "lengthScaledTPM")) + txOut = txout, countsFromAbundance = "lengthScaledTPM", + ignoreTxVersion = ignore_tx_version)) colnames(import_scaled[["abundance"]]) <- ids colnames(import_scaled[["counts"]]) <- ids colnames(import_scaled[["length"]]) <- ids From f5540b4f3b6dcb8f2561afbd0c574af93b05cf54 Mon Sep 17 00:00:00 2001 From: Zhezhen Wang Date: Tue, 30 Apr 2024 12:12:46 -0400 Subject: [PATCH 6/8] edit plot_pcs with shape number larger than 25 --- R/dimension_reduction.R | 32 ++++++++++++++++++++++++++++++-- 1 file changed, 30 insertions(+), 2 deletions(-) diff --git a/R/dimension_reduction.R b/R/dimension_reduction.R index 01006e08..2023c2ec 100644 --- a/R/dimension_reduction.R +++ b/R/dimension_reduction.R @@ -576,7 +576,7 @@ plot_pca <- function(data, design = NULL, plot_colors = NULL, plot_title = TRUE, plot_size = 5, plot_alpha = NULL, plot_labels = FALSE, size_column = NULL, pc_method = "fast_svd", x_pc = 1, y_pc = 2, max_overlaps = 20, num_pc = NULL, expt_names = NULL, label_chars = 10, - cond_column = "condition", batch_column = "batch", + cond_column = "condition", batch_column = "batch", hex = TRUE, ...) { arglist <- list(...) ## Set default columns in the experimental design for condition and batch @@ -1034,6 +1034,9 @@ plot_pca <- function(data, design = NULL, plot_colors = NULL, plot_title = TRUE, } else { ## Leave the title blank. } + + ## If want a hexagonal heatmap + #if(hexagonal) comp_plot + geom_hex() ## Finally, return a list of the interesting bits of what happened. pca_return <- list( @@ -1800,7 +1803,7 @@ plot_pcs <- function(pca_data, first = "PC1", second = "PC2", variances = NULL, ggplot2::scale_size_manual(name = size_column, labels = levels(pca_data[[size_column]]), values = as.numeric(levels(pca_data[["size"]]))) - } else if (!is.null(size_column) && num_batches > 5) { + } else if (!is.null(size_column) && num_batches > 5 && num_batches<19) { pca_plot <- pca_plot + ggplot2::geom_point(alpha = plot_alpha, aes(shape = .data[["batch"]], @@ -1816,6 +1819,31 @@ plot_pcs <- function(pca_data, first = "PC1", second = "PC2", variances = NULL, ggplot2::scale_size_manual(name = size_column, labels = levels(pca_data[[size_column]]), values = as.numeric(levels(pca_data[["size"]]))) + } else if (!is.null(size_column) && num_batches>19) { + symbols = c(0:18,20,'+','-','o','O','*','|','%','#') + if(num_batches Date: Wed, 1 May 2024 15:36:18 -0400 Subject: [PATCH 7/8] added black outline --- R/dimension_reduction.R | 57 +++++++++++++++++++---------------------- 1 file changed, 27 insertions(+), 30 deletions(-) diff --git a/R/dimension_reduction.R b/R/dimension_reduction.R index 2023c2ec..a579984c 100644 --- a/R/dimension_reduction.R +++ b/R/dimension_reduction.R @@ -1749,11 +1749,33 @@ plot_pcs <- function(pca_data, first = "PC1", second = "PC2", variances = NULL, guide = ggplot2::guide_legend(override.aes = list(size = plot_size, fill = "grey")), values = 21:25) } else if (is.null(size_column) && num_batches > 5) { + ### cannot mix integer with characters even using list + ### 19 removed because it looks like 16, which might not be needed since shapes are reused for batches more than 25 + symbols = c(0:18,20:25) + if(length(symbols) < num_batches){ + idx = c(rep(1:length(symbols), + num_batches %/% length(symbols)), + 1:(num_batches %% length(symbols))) + values = symbols[idx] + }else{ + values = symbols[1:num_batches] + } + + ## change outline color to black for batches 21+ + if(num_batches > 20){ + cond20 = unique(pca_data[['condition']])[1:20] + idx = which(pca_data[['condition']] %in% cond20) + othern = nrow(pca_data) - length(idx) + outline_col = c(pca_data[['condition']][idx],rep('black',othern)) + }else{ + outline_col = pca_data[['condition']] + } + pca_plot <- pca_plot + ggplot2::geom_point(size = plot_size, alpha = plot_alpha, aes(shape = .data[["batch"]], fill = .data[["condition"]], - colour = .data[["condition"]])) + + colour = outline_col)) + ggplot2::scale_color_manual(name = "Condition", ##guide = ggplot2::guide_legend(override.aes = list(size = plot_size)), guide = "legend", @@ -1765,7 +1787,7 @@ plot_pcs <- function(pca_data, first = "PC1", second = "PC2", variances = NULL, ggplot2::scale_shape_manual(name = "Batch", labels = levels(as.factor(pca_data[["batch"]])), guide = ggplot2::guide_legend(overwrite.aes = list(size = plot_size)), - values = 1:num_batches) + values = values) } else if (!is.null(size_column) && num_batches <= 5) { ## This will require the 6 steps above and one more pca_plot <- ggplot(data = as.data.frame(pca_data), @@ -1803,41 +1825,16 @@ plot_pcs <- function(pca_data, first = "PC1", second = "PC2", variances = NULL, ggplot2::scale_size_manual(name = size_column, labels = levels(pca_data[[size_column]]), values = as.numeric(levels(pca_data[["size"]]))) - } else if (!is.null(size_column) && num_batches > 5 && num_batches<19) { + } else if (!is.null(size_column) && num_batches > 5) { pca_plot <- pca_plot + - ggplot2::geom_point(alpha = plot_alpha, - aes(shape = .data[["batch"]], - colour = .data[["condition"]], - size = .data[["size"]])) + - ggplot2::scale_shape_manual(name = "Batch", - labels = levels(as.factor(pca_data[["batch"]])), - guide = ggplot2::guide_legend(overwrite.aes = list(size = plot_size)), - values = 1:num_batches) + - ggplot2::scale_color_manual(name = "Condition", - guide = "legend", - values = color_list) + - ggplot2::scale_size_manual(name = size_column, - labels = levels(pca_data[[size_column]]), - values = as.numeric(levels(pca_data[["size"]]))) - } else if (!is.null(size_column) && num_batches>19) { - symbols = c(0:18,20,'+','-','o','O','*','|','%','#') - if(num_batches Date: Mon, 6 May 2024 18:15:40 -0400 Subject: [PATCH 8/8] give up on pca color for dots over 20 --- R/dimension_reduction.R | 19 ++++++++++--------- 1 file changed, 10 insertions(+), 9 deletions(-) diff --git a/R/dimension_reduction.R b/R/dimension_reduction.R index 3f8aa8bc..46fab388 100644 --- a/R/dimension_reduction.R +++ b/R/dimension_reduction.R @@ -1753,7 +1753,8 @@ plot_pcs <- function(pca_data, first = "PC1", second = "PC2", variances = NULL, } else if (is.null(size_column) && num_batches > 5) { ### cannot mix integer with characters even using list ### 19 removed because it looks like 16, which might not be needed since shapes are reused for batches more than 25 - symbols = c(0:18,20:25) + #symbols = c(0:18,20:25) + symbols = c(0:25) if(length(symbols) < num_batches){ idx = c(rep(1:length(symbols), num_batches %/% length(symbols)), @@ -1764,14 +1765,14 @@ plot_pcs <- function(pca_data, first = "PC1", second = "PC2", variances = NULL, } ## change outline color to black for batches 21+ - if(num_batches > 20){ - cond20 = unique(pca_data[['condition']])[1:20] - idx = which(pca_data[['condition']] %in% cond20) - othern = nrow(pca_data) - length(idx) - outline_col = c(pca_data[['condition']][idx],rep('black',othern)) - }else{ - outline_col = pca_data[['condition']] - } + #if(num_batches > 20){ +# cond20 = unique(pca_data[['condition']])[1:20] +# idx = which(pca_data[['condition']] %in% cond20) +# othern = nrow(pca_data) - length(idx) +## outline_col = c(pca_data[['condition']][idx],rep('black',othern)) +# }else{ +# outline_col = pca_data[['condition']] +# } pca_plot <- pca_plot + ggplot2::geom_point(size = plot_size, alpha = plot_alpha,