From 1e85163bfb9b7bdf70fc09d868887ee7ef220dce Mon Sep 17 00:00:00 2001
From: Zhezhen Wang <zhezhenwang@gmail.com>
Date: Fri, 6 Oct 2023 16:58:32 -0400
Subject: [PATCH 1/8] =?UTF-8?q?added=20gostats=20topgo=20to=20=1B[200~?=
MIME-Version: 1.0
Content-Type: text/plain; charset=UTF-8
Content-Transfer-Encoding: 8bit

---
 ontology_goseq.R | 539 +++++++++++++++++++++++++++++++++++++++++++++++
 1 file changed, 539 insertions(+)
 create mode 100644 ontology_goseq.R

diff --git a/ontology_goseq.R b/ontology_goseq.R
new file mode 100644
index 00000000..1a2f3246
--- /dev/null
+++ b/ontology_goseq.R
@@ -0,0 +1,539 @@
+## ontology_goseq.r: Some methods to simplify the usage of goseq.  goseq is the
+## first GSEA tool I learned about.  It is not particularly robust, this seeks
+## to amend that.
+
+#' Filter a goseq significance search
+#'
+#' Given a goseq result, use some simple filters to pull out the
+#' categories of likely interest.
+#'
+#' @param godata goseq result
+#' @param expand_categories Extract GO terms from GO.db and add them
+#'  to the table
+#' @param pvalue Significance filter.
+#' @param minimum_interesting The category should have more than this
+#'  number of elements.
+#' @param adjust Adjusted p-value filter.
+#' @param padjust_method Method for adjusting the p-values.
+extract_interesting_goseq <- function(godata, expand_categories = TRUE, pvalue = 0.05,
+                                      minimum_interesting = 1, adjust = 0.05, padjust_method = "BH") {
+  ## Add a little logic if we are using a non-GO input database.
+  ## This is relevant if you use something like msigdb/reactome/kegg
+  ## I should probably make the rest of this function generic so that this is not required.
+  if (is.null(godata[["ontology"]])) {
+    godata[["ontology"]] <- "MF"
+  }
+
+  godata_interesting <- godata
+  if (isTRUE(expand_categories)) {
+    mesg("simple_goseq(): Filling godata with terms, this is slow.")
+    godata_interesting <- goseq_table(godata)
+  } else {
+    ## Set the 'term' category for plotting.
+    godata_interesting[["term"]] <- godata_interesting[["category"]]
+    godata[["term"]] <- godata[["category"]]
+  }
+
+  if (is.null(adjust)) {
+    interesting_idx <- godata[["over_represented_pvalue"]] <= pvalue
+    godata_interesting <- godata[godata_interesting, ]
+    padjust_method <- "none"
+  } else {
+    ## There is a requested pvalue adjustment
+    interesting_idx <- godata[["over_represented_pvalue"]] <= adjust
+    godata_interesting <- godata[interesting_idx, ]
+    if (dim(godata_interesting)[1] < minimum_interesting) {
+      message("simple_goseq(): There are no genes with an adj.p < ", adjust, " using: ",
+              padjust_method, ".")
+      message("simple_goseq(): Providing genes with raw pvalue < ", pvalue, ".")
+      interesting_idx <- godata[["over_represented_pvalue"]] <= pvalue
+      godata_interesting <- godata_interesting[interesting_idx, ]
+      padjust_method <- "none"
+    }
+  }
+
+  na_idx <- is.na(godata[["ontology"]])
+  godata <- godata[!na_idx, ]
+  mf_idx <- godata[["ontology"]] == "MF"
+  mf_subset <- godata[mf_idx, ]
+  rownames(mf_subset) <- mf_subset[["category"]]
+  bp_idx <- godata[["ontology"]] == "BP"
+  bp_subset <- godata[bp_idx, ]
+  rownames(bp_subset) <- bp_subset[["category"]]
+  cc_idx <- godata[["ontology"]] == "CC"
+  cc_subset <- godata[cc_idx, ]
+  rownames(cc_subset) <- cc_subset[["category"]]
+
+  na_idx <- is.na(godata_interesting[["ontology"]])
+  godata_interesting <- godata_interesting[!na_idx, ]
+  mf_idx <- godata_interesting[["ontology"]] == "MF"
+  mf_interesting <- godata_interesting[mf_idx, ]
+  rownames(mf_interesting) <- mf_interesting[["category"]]
+  if (is.null(mf_interesting[["term"]])) {
+    mf_interesting <- mf_interesting[, c("ontology", "numDEInCat", "numInCat",
+                                         "over_represented_pvalue", "qvalue")]
+  } else {
+    mf_interesting <- mf_interesting[, c("ontology", "numDEInCat", "numInCat",
+                                         "over_represented_pvalue", "qvalue", "term")]
+  }
+  bp_idx <- godata_interesting[["ontology"]] == "BP"
+  bp_interesting <- godata_interesting[bp_idx, ]
+  rownames(bp_interesting) <- bp_interesting[["category"]]
+  if (is.null(bp_interesting[["term"]])) {
+    bp_interesting <- bp_interesting[, c("ontology", "numDEInCat", "numInCat",
+                                         "over_represented_pvalue", "qvalue")]
+  } else {
+    bp_interesting <- bp_interesting[, c("ontology", "numDEInCat", "numInCat",
+                                         "over_represented_pvalue", "qvalue", "term")]
+  }
+  cc_idx <- godata_interesting[["ontology"]] == "CC"
+  cc_interesting <- godata_interesting[cc_idx, ]
+  rownames(cc_interesting) <- cc_interesting[["category"]]
+  if (is.null(cc_interesting[["term"]])) {
+    cc_interesting <- cc_interesting[, c("ontology", "numDEInCat", "numInCat",
+                                         "over_represented_pvalue", "qvalue")]
+  } else {
+    cc_interesting <- cc_interesting[, c("ontology", "numDEInCat", "numInCat",
+                                         "over_represented_pvalue", "qvalue", "term")]
+  }
+  retlist <- list(
+      "godata" = godata,
+      "interesting" = godata_interesting,
+      "mf_subset" = mf_subset,
+      "bp_subset" = bp_subset,
+      "cc_subset" = cc_subset,
+      "MF" = mf_interesting,
+      "BP" = bp_interesting,
+      "CC" = cc_interesting)
+  return(retlist)
+}
+
+#' Pass MSigDB categorical data to goseq and run it.
+#'
+#' goseq is probably the easiest method to push varying data types into.  Thus
+#' it was the first thing I thought of when looking to push MSigDB data into a
+#' GSEA method.
+#'
+#' @param sig_genes Character list of genes deemed significant.  I think in the
+#'  current implementation this must be just a list of IDs as opposed to the
+#'  full dataframe of interesting genes because we likely need to convert IDs.
+#' @param signatures Used by load_gmt_signatures(), the signature file or set.
+#' @param data_pkg Used by load_gmt_signatures().
+#' @param signature_category Ibid, but the name of the signatures group.
+#' @param current_id Used by convert_msig_ids(), when converting IDs, the name
+#'  of the existing type.
+#' @param required_id What type to convert to in convert_msig_ids().
+#' @param length_db Dataframe of lengths.  It is worth noting that goseq
+#'  explicitly states that one might wish to use other potentially confounding
+#'  factors here, but they only examine lenghts in their paper. Starting with
+#'  this parameter, everything is just passed directly to simple_goseq()
+#' @param doplot Print the prior plot?
+#' @param adjust passed to simple_goseq()
+#' @param pvalue passed to simple_goseq()
+#' @param length_keytype passed to simple_goseq()
+#' @param go_keytype passed to simple_goseq()
+#' @param goseq_method passed to simple_goseq()
+#' @param padjust_method passed to simple_goseq()
+#' @param excel passed to simple_goseq()
+#' @param orgdb Ideally used to help goseq collect lengths.
+#' @return Some goseq data!
+#' @seealso [gsva] [goseq]
+#' @export
+goseq_msigdb <- function(sig_genes, signatures = "c2BroadSets", data_pkg = "GSVAdata",
+                         signature_category = "c2", current_id = "ENSEMBL", required_id = "ENTREZID",
+                         length_db = NULL, doplot = TRUE, adjust = 0.1, pvalue = 0.1,
+                         length_keytype = "transcripts", go_keytype = "entrezid",
+                         goseq_method = "Wallenius", padjust_method = "BH",
+                         excel = NULL, orgdb = "org.Hs.eg.db",method = 'gostat') {
+  sig_data <- load_gmt_signatures(signatures = signatures, data_pkg = data_pkg,
+                                  signature_category = signature_category)
+  mesg("Starting to coerce the msig data to the ontology format.")
+  go_db <- data.table::data.table()
+  for (i in seq_along(sig_data)) {
+    gsc <- sig_data[[i]]
+    gsc_id <- gsc@setName
+    gsc_genes <- gsc@geneIds
+    tmp_db <- data.table::data.table("ID" = gsc_genes, "GO" = rep(gsc_id, length(gsc_genes)))
+    go_db <- rbind(go_db, tmp_db)
+  }
+  mesg("Finished coercing the msig data into a df with ", nrow(go_db), " rows.")
+
+  new_ids <- NULL
+  new_sig <- data.frame()
+  if ("character" %in% class(sig_genes)) {
+    new_ids <- convert_ids(sig_genes, from = current_id, to = required_id, orgdb = orgdb)
+    new_sig <- new_ids
+    colnames(new_sig) <- c(current_id, "ID")
+    new_sig <- new_sig[, c("ID", current_id)]
+    rownames(new_sig) <- make.names(new_sig[["ID"]], unique = TRUE)
+  } else if ("data.frame" %in% class(sig_genes)) {
+    new_ids <- convert_ids(rownames(sig_genes), from = current_id,
+                           to = required_id, orgdb = orgdb)
+    new_sig <- merge(new_ids, sig_genes, by.x = current_id,
+                     by.y = "row.names")
+    new_sig[["ID"]] <- new_sig[[required_id]]
+  } else {
+    stop("I do not understand this input data format for sig_genes.")
+  }
+
+  new_lids <- convert_ids(rownames(length_db), from = current_id, to = required_id, orgdb = orgdb)
+  new_length <- merge(new_lids, length_db, by.x = current_id, by.y = "row.names")
+  new_length[["ID"]] <- NULL
+  new_length[["ID"]] <- new_length[[required_id]]
+  if (is.null(new_length[["length"]])) {
+    new_length <- new_length[, c("ID", "width")]
+    colnames(new_length) <- c("ID", "length")
+  } else {
+    new_length <- new_length[, c("ID", "length")]
+  }
+
+  if(method == 'goseq'){	
+  go_result <- simple_goseq(new_sig, go_db = go_db, length_db = new_length,
+                            doplot = doplot, adjust = adjust, pvalue = pvalue,
+                            length_keytype = length_keytype, go_keytype = go_keytype,
+                            goseq_method = goseq_method, padjust_method = padjust_method,
+                            plot_title = plot_title,
+                            expand_categories = expand_categories,
+                            excel = excel, add_trees = add_trees, 
+			    gather_genes = gather_genes, width = width,...)
+  }elif(method == 'gostat'){
+  go_result <- simple_gostats(new_sig, go_db = go_db, pcutoff = pvalue, 
+			      gff_id = go_keytype, excel = excel,...)
+  }elif(method == 'topgo'){
+  go_result <- simple_topgo(new_sig, go_db = go_db, limit = pvalue, excel = excel,...)
+  }  
+  return(go_result)
+}
+
+#' Enhance the goseq table of gene ontology information.
+#'
+#' While goseq has some nice functionality, the table of outputs it provides is
+#' somewhat lacking. This attempts to increase that with some extra helpful data
+#' like ontology categories, definitions, etc.
+#'
+#' @param df Dataframe of ontology information.  This is intended to be the
+#'  output from goseq including information like numbers/category, GOids, etc.
+#'  It requires a column 'category'
+#'  which contains: GO:000001 and such.
+#' @param file Csv file to which to write the table.
+#' @return Ontology table with annotation information included.
+#' @seealso [goseq] [GO.db]
+#' @examples
+#' \dontrun{
+#'  annotated_go = goseq_table(go_ids)
+#'  head(annotated_go, n = 1)
+#'  ## >        category numDEInCat numInCat over_represented_pvalue
+#'  ## > 571  GO:0006364          9       26            4.655108e-08
+#'  ## >      under_represented_pvalue       qvalue ontology
+#'  ## > 571                 1.0000000 6.731286e-05       BP
+#'  ## >                                term
+#'  ## > 571                 rRNA processing
+#'  ## >                               synonym
+#'  ## > 571        "35S primary transcript processing, GO:0006365"
+#'  ## >        secondary    definition
+#'  ## > 571    GO:0006365   Any process involved in the conversion of a primary ribosomal
+#'  ##          RNA (rRNA) transcript into one or more mature rRNA molecules.
+#' }
+#' @export
+goseq_table <- function(df, file = NULL) {
+  df[["term"]] <- goterm(df[["category"]])
+  df[["ontology"]] <- goont(df[["category"]])
+  message("Testing that go categories are defined.")
+  df[["good"]] <- gotest(df[["category"]])
+  message("Removing undefined categories.")
+  good_idx <- df[["good"]] == 1
+  df <- df[good_idx, ]
+  na_idx <- is.na(df[["category"]])
+  df <- df[!na_idx, ]
+  na_idx <- is.na(df[["term"]])
+  df <- df[!na_idx, ]
+  message("Gathering synonyms.")
+  df[["synonym"]] <- gosyn(df[["category"]])
+  message("Gathering category definitions.")
+  df[["definition"]] <- godef(df[["category"]])
+  df <- df[, c("category", "numDEInCat", "numInCat", "over_represented_pvalue",
+               "under_represented_pvalue", "qvalue", "ontology", "term",
+               "synonym", "definition")]
+  if (!is.null(file)) {
+    write.csv(df, file = file)
+  }
+  return(df)
+}
+
+#' Perform a simplified goseq analysis.
+#'
+#' goseq can be pretty difficult to get set up for non-supported organisms.
+#' This attempts to make that process a bit simpler as well as give some
+#' standard outputs which should be similar to those returned by
+#' clusterprofiler/topgo/gostats/gprofiler.
+#'
+#' @param sig_genes Data frame of differentially expressed genes, containing IDs etc.
+#' @param go_db Database of go to gene mappings (OrgDb/OrganismDb)
+#' @param length_db Database of gene lengths (gff/TxDb)
+#' @param doplot Include pwf plots?
+#' @param adjust Minimum adjusted pvalue for 'significant.'
+#' @param pvalue Minimum pvalue for 'significant.'
+#' @param plot_title Set a title for the pvalue plots.
+#' @param length_keytype Keytype to provide to extract lengths
+#' @param go_keytype Keytype to provide to extract go IDs
+#' @param goseq_method Statistical test for goseq to use.
+#' @param padjust_method Which method to use to adjust the pvalues.
+#' @param expand_categories Expand the GO categories to make the results more readable?
+#' @param excel Print the results to an excel file?
+#' @param enrich Convert the goseq result to the clusterProfiler format?
+#' @param ... Extra parameters which I do not recall
+#' @return Big list including:
+#'  the pwd:pwf function,
+#'  alldata:the godata dataframe,
+#'  pvalue_histogram:p-value histograms,
+#'  godata_interesting:the ontology information of the enhanced groups,
+#'  term_table:the goterms with some information about them,
+#'  mf_subset:a plot of the MF enhanced groups,
+#'  mfp_plot:the pvalues of the MF group,
+#'  bp_subset:a plot of the BP enhanced groups,
+#'  bpp_plot,
+#'  cc_subset,
+#'  and ccp_plot
+#' @seealso [goseq] [GO.db] [GenomicFeatures] [stats::p.adjust()]
+#' @examples
+#' \dontrun{
+#'  lotsotables <- simple_goseq(gene_list, godb, lengthdb)
+#' }
+#' @export
+simple_goseq <- function(sig_genes, go_db = NULL, length_db = NULL, doplot = TRUE,
+                         adjust = 0.1, pvalue = 0.1, plot_title = NULL,
+                         length_keytype = "transcripts", go_keytype = "entrezid",
+                         goseq_method = "Wallenius", padjust_method = "BH",
+                         expand_categories = TRUE, excel = NULL, enrich = TRUE,
+                         ...) {
+  arglist <- list(...)
+
+  minimum_interesting <- 1
+  if (!is.null(arglist[["minimum_interesting"]])) {
+    minimum_interesting <- arglist[["minimum_interesting"]]
+  }
+  length_df <- data.frame()
+  length_vector <- vector()
+  de_vector <- vector()
+  gene_ids <- NULL
+  final_keytype <- NULL
+  goids_df <- NULL
+  ## sig_genes may be a list, character list, or data frame.
+  gene_list <- NULL
+  if (class(sig_genes)[1] == "data.table") {
+    sig_genes <- as.data.frame(sig_genes)
+  }
+  if (class(sig_genes) == "character") {
+    ## Then this is a character list of gene ids
+    gene_list <- sig_genes
+    sig_genes <- data.frame(row.names = sig_genes)
+  } else if (class(sig_genes) == "list") {
+    gene_list <- names(sig_genes)
+  } else if (class(sig_genes) == "data.frame") {
+    if (is.null(rownames(sig_genes)) && is.null(sig_genes[["ID"]])) {
+      stop("This requires a set of gene IDs from the rownames or a column named 'ID'.")
+    } else if (!is.null(sig_genes[["ID"]])) {
+      ## Use a column named 'ID' first because a bunch of annotation databases
+      ## use ENTREZ IDs which are just integers, which of course is not allowed
+      ## by data frame row names.
+      mesg("Using the ID column from your table rather than the row names.")
+      gene_list <- sig_genes[["ID"]]
+    } else if (!is.null(rownames(sig_genes))) {
+      mesg("Using the row names of your table.")
+      gene_list <- rownames(sig_genes)
+    } else {
+      gene_list <- sig_genes[["ID"]]
+    }
+  } else {
+    stop("Not sure how to handle your set of significant gene ids.")
+  }
+  ## At this point I should have a character list of gene ids named 'gene_list'
+  de_genelist <- as.data.frame(gene_list)
+  de_genelist[["DE"]] <- 1
+  colnames(de_genelist) <- c("ID", "DE")
+
+  ## Database of lengths may be a gff file, TxDb, or OrganismDb
+  metadf <- NULL
+  if (class(length_db)[1] == "data.table") {
+    length_db <- as.data.frame(length_db)
+  }
+  if (class(length_db)[[1]] == "character")  {
+    ## Then this should be either a gff file or species name.
+    if (grepl(pattern = "\\.gff", x = length_db, perl = TRUE) ||
+        grepl(pattern = "\\.gtf", x = length_db, perl = TRUE)) {
+      ## gff file
+      txdb <- GenomicFeatures::makeTxDbFromGFF(length_db)
+      metadf <- extract_lengths(db = txdb, gene_list = gene_list)
+    } else {
+      ## Then species name
+      message("A species name.")
+    }
+  } else if (class(length_db)[[1]] == "AnnotationDbi") {
+    stop("This currently requires an actual OrganismDb, not AnnotationDbi.")
+  } else if (class(length_db)[[1]] == "OrgDb") {
+    stop("OrgDb objects contain links to other databases, but no gene lengths.")
+  } else if (class(length_db)[[1]] == "OrganismDb" ||
+             class(length_db)[[1]] == "AnnotationDbi") {
+    ##metadf <- extract_lengths(db = length_db, gene_list = gene_list)
+    metadf <- sm(extract_lengths(db = length_db, gene_list = gene_list, ...))
+  } else if (class(length_db)[[1]] == "TxDb") {
+    metadf <- sm(extract_lengths(db = length_db, gene_list = gene_list, ...))
+  } else if (class(length_db)[[1]] == "data.frame") {
+    metadf <- length_db
+  } else {
+    stop("Extracting lengths requires the name of a supported species or orgdb instance.")
+  }
+
+  ## Sometimes the column with gene lengths is named 'width'
+  ## In that case, fix it.
+  if (is.null(metadf[["width"]]) && is.null(metadf[["length"]])) {
+    stop("The length db needs to have a length or width column.")
+  } else if (is.null(metadf[["length"]])) {
+    ## Then it is named 'width' and I want to rename it to length
+    colnames(metadf) <- gsub(x = colnames(metadf), pattern = "width", replacement = "length")
+  }
+  ## Now I should have the gene list and gene lengths
+
+  godf <- data.frame()
+  if (class(go_db)[[1]] == "character") {
+    ## A text table or species name
+    if (grepl(pattern = "\\.csv", x = go_db, perl = TRUE) ||
+        grepl(pattern = "\\.tab", x = go_db, perl = TRUE)) {
+      ## table
+      godf <- read.table(go_db, ...)
+      colnames(godf) <- c("ID", "GO")
+    } else {
+      ## Assume species name
+      supported <- TRUE
+      species <- go_db
+    }
+  } else if (class(go_db)[[1]] == "OrganismDb") {
+    godf <- extract_go(go_db, keytype = go_keytype)
+  } else if (class(go_db)[[1]] == "OrgDb") {
+    godf <- extract_go(go_db, keytype = go_keytype)
+  } else if (class(go_db)[[1]] == "data.table" || class(go_db)[[1]] == "tbl_df") {
+    godf <- as.data.frame(go_db)
+  } else if (class(go_db)[[1]] == "data.frame") {
+    godf <- go_db
+    if (!is.null(godf[["ID"]])) {
+      godf <- godf[, c("ID", "GO")]
+    } else if (!is.null(godf[["GID"]])) {
+      godf <- godf[, c("GID", "GO")]
+      colnames(godf) <- c("ID", "GO")
+    } else {
+      stop("Unable to read the gene ID/ GO columns from the go dataframe.")
+    }
+  } else {
+    stop("Unable to determine the input for creating a go dataframe.")
+  }
+
+  ## entrez IDs are numeric.  This is a problem when doing the pwf function
+  ## because it sets the rownames to the IDs.  As a result, we need to call
+  ## make.names() on them.
+  colnames(godf) <- c("ID","GO")
+  godf[["ID"]] <- make.names(godf[["ID"]])
+  metadf[["ID"]] <- make.names(metadf[["ID"]])
+  de_genelist[["ID"]] <- make.names(de_genelist[["ID"]])
+  ## Ok, now I have a df of GOids, all gene lengths, and DE gene list. That is
+  ## everything I am supposed to need for goseq.
+
+  ## See how many entries from the godb are in the list of genes.
+  id_xref <- de_genelist[["ID"]] %in% godf[["ID"]]
+  meta_xref <- de_genelist[["ID"]] %in% metadf[["ID"]]
+  message("Found ", sum(id_xref), " go_db genes and ", sum(meta_xref),
+          " length_db genes out of ", nrow(de_genelist), ".")
+  ## So lets merge the de genes and gene lengths to ensure that they are
+  ## consistent. Then make the vectors expected by goseq.
+  merged_ids_lengths <- metadf
+  ## The following line was done in order to avoid
+  ## "'unimplemented type 'list' in 'orderVector1'"
+  merged_ids_lengths[["ID"]] <- as.character(merged_ids_lengths[["ID"]])
+  merged_ids_lengths <- merge(merged_ids_lengths,
+                              de_genelist, by.x = "ID", by.y = "ID", all.x = TRUE)
+  merged_ids_lengths[["length"]] <- suppressWarnings(
+      as.numeric(merged_ids_lengths[["length"]]))
+  merged_ids_lengths[is.na(merged_ids_lengths)] <- 0
+  ## Not casing the next lines as character/numeric causes weird errors like 'names' attribute
+  ## must be the same length as the vector
+  de_vector <- as.vector(as.numeric(merged_ids_lengths[["DE"]]))
+  names(de_vector) <- make.names(as.character(
+      merged_ids_lengths[["ID"]]), unique = TRUE)
+  length_vector <- as.vector(as.numeric(merged_ids_lengths[["length"]]))
+  names(length_vector) <- make.names(as.character(
+      merged_ids_lengths[["ID"]]), unique = TRUE)
+
+  pwf_plot <- NULL
+  tmp_file <- tempfile(pattern = "goseq", fileext = ".png")
+  this_plot <- png(filename = tmp_file)
+  controlled <- dev.control("enable")
+  pwf <- sm(suppressWarnings(goseq::nullp(DEgenes = de_vector, bias.data = length_vector,
+                                          plot.fit = doplot)))
+  if (isTRUE(doplot)) {
+    pwf_plot <- recordPlot()
+  }
+  dev.off()
+  removed <- file.remove(tmp_file)
+
+  godata <- sm(goseq::goseq(pwf, gene2cat = godf, use_genes_without_cat = TRUE,
+                            method = goseq_method))
+  ## I want to limit the y-axis, but I think this is not the best way.
+  goseq_p <- try(plot_histogram(godata[["over_represented_pvalue"]], bins = 50))
+  goseq_p_nearzero <- table(goseq_p[["data"]])[[1]]
+  goseq_y_limit <- goseq_p_nearzero * 2
+  goseq_p <- goseq_p +
+    ggplot2::scale_y_continuous(limits = c(0, goseq_y_limit))
+  godata[["qvalue"]] <- stats::p.adjust(godata[["over_represented_pvalue"]],
+                                        method = padjust_method)
+
+  ## Subset the result for 'interesting' categories, which is defined
+  ## simply as the set of categories with full annotations.
+  interesting <- extract_interesting_goseq(
+    godata, expand_categories = expand_categories, pvalue = pvalue, adjust = adjust,
+    minimum_interesting = minimum_interesting, padjust_method = padjust_method)
+
+  mesg("simple_goseq(): Making pvalue plots for the ontologies.")
+  pvalue_plots <- plot_goseq_pval(godata, plot_title = plot_title,
+                                  x_column = "num_cat",
+                                  ...)
+  pval_plots <- list(
+      "bpp_plot_over" = pvalue_plots[["bpp_plot_over"]],
+      "mfp_plot_over" = pvalue_plots[["mfp_plot_over"]],
+      "ccp_plot_over" = pvalue_plots[["ccp_plot_over"]])
+
+  retlist <- list(
+      "input" = sig_genes,
+      "pwf" = pwf,
+      "pwf_plot" = pwf_plot,
+      "all_data" = interesting[["godata"]],
+      "go_db" = godf,
+      "godata" = godata,
+      "pvalue_histogram" = goseq_p,
+      "godata_interesting" = interesting[["interesting"]],
+      "mf_interesting" = interesting[["MF"]],
+      "bp_interesting" = interesting[["BP"]],
+      "cc_interesting" = interesting[["CC"]],
+      "goadjust_method" = goseq_method,
+      "adjust_method" = padjust_method,
+      "mf_subset" = interesting[["mf_subset"]],
+      "bp_subset" = interesting[["bp_subset"]],
+      "cc_subset" = interesting[["cc_subset"]],
+      "pvalue_plots" = pval_plots)
+  class(retlist) <- c("goseq_result", "list")
+  if (isTRUE(enrich)) {
+    retlist[["mf_enrich"]] <- goseq2enrich(
+      retlist, ontology = "MF", cutoff = pvalue, padjust_method = padjust_method)
+    retlist[["bp_enrich"]] <- goseq2enrich(
+      retlist, ontology = "BP", cutoff = pvalue, padjust_method = padjust_method)
+    retlist[["cc_enrich"]] <- goseq2enrich(
+      retlist, ontology = "CC", cutoff = pvalue, padjust_method = padjust_method)
+  }
+  if (!is.null(excel)) {
+    excel_result <- write_goseq_data(retlist, excel = excel, ...)
+  }
+  return(retlist)
+}
+
+## EOF
+
+
+

From 69f07ba16b20c105443d13c881db922d7d33026c Mon Sep 17 00:00:00 2001
From: Zhezhen Wang <zhezhenwang@gmail.com>
Date: Wed, 24 Apr 2024 14:11:22 -0400
Subject: [PATCH 2/8] updated to make sample id correct and autofil batches

---
 R/expt.R            | 41 +++++++++++++++++++++++++++++++++++++----
 R/normalize_batch.R |  8 ++++----
 2 files changed, 41 insertions(+), 8 deletions(-)

diff --git a/R/expt.R b/R/expt.R
index 9fcdde7b..f2a92073 100644
--- a/R/expt.R
+++ b/R/expt.R
@@ -46,7 +46,7 @@ combine_expts <- function(expt1, expt2, condition = "condition", all_x = TRUE, a
   exp2 <- expt2[["expressionset"]]
   fData(exp2) <- fData(exp1)
 
-  testthat::expect_equal(rownames(exprs(exp1)), rownames(exprs(exp2)))
+  ##testthat::expect_equal(rownames(exprs(exp1)), rownames(exprs(exp2)))
 
   if (isTRUE(merge_meta)) {
     design1 <- pData(exp1)
@@ -62,7 +62,8 @@ combine_expts <- function(expt1, expt2, condition = "condition", all_x = TRUE, a
     pData(exp2) <- new_design2
   }
 
-  new <- a4Base::combineTwoExpressionSet(exp1, exp2)
+  ## new <- a4Base::combineTwoExpressionSet(exp1, exp2)
+  new <- Biobase::combine(exp1, exp2)
   expt1[["expressionset"]] <- new
   expt1[["conditions"]] <- pData(expt1)[["condition"]]
   names(expt1[["conditions"]]) <- rownames(pData(expt1))
@@ -615,12 +616,16 @@ create_expt <- function(metadata = NULL, gene_info = NULL, count_dataframe = NUL
   ## not match after using tximport.
   if (!is.null(arglist[["tx_gene_map"]])) {
     tx_gene_map <- arglist[["tx_gene_map"]]
-    matched_rows <- sum(rownames(gene_info) %in% tx_gene_map[[2]])
+    ## This is wrong, we recast gene_info as a data.table and thus it does not have rownames.
+    ## matched_rows <- sum(rownames(gene_info) %in% tx_gene_map[[2]])
+    ## Because of the previous line, the following test always sees the match between the tx_gene_map
+    ## and gene annotations as 0, and therefore it messed up the rownames of the annotations.
+    matched_rows <- gene_info[["rownames"]] %in% tx_gene_map[[2]]
     if (matched_rows < 1) {
       message("The mapped IDs are not the rownames of your gene information, changing them now.")
       if (names(tx_gene_map)[2] %in% colnames(gene_info)) {
         new_name <- names(tx_gene_map)[2]
-        rownames(gene_info) <- make.names(tx_gene_map[[new_name]], unique = TRUE)
+        gene_info[["rownames"]] <- make.names(tx_gene_map[[new_name]], unique = TRUE)
       } else {
         warning("Cannot find an appropriate column in gene_info, refusing to use the tx_map.")
       }
@@ -1562,11 +1567,13 @@ If this is not correctly performed, very few genes will be observed")
     import <- NULL
     import_scaled <- NULL
     if (is.null(tx_gene_map)) {
+      warning("Check that these count column names are correct, it may be the case that tximport failed and we need to set the colnames as per salmon.")
       import <- sm(tximport::tximport(files = files, type = "kallisto", txOut = txout))
       import_scaled <- sm(tximport::tximport(
         files = files, type = "kallisto",
         txOut = txout, countsFromAbundance = "lengthScaledTPM"))
     } else {
+      warning("Check that these count column names are correct, it may be the case that tximport failed and we need to set the colnames as per salmon.")
       import <- sm(tximport::tximport(
         files = files, type = "kallisto", tx2gene = tx_gene_map, txOut = txout))
       import_scaled <- sm(tximport::tximport(
@@ -1585,6 +1592,7 @@ If this is not correctly performed, very few genes will be observed")
     import <- NULL
     import_scaled <- NULL
     if (is.null(tx_gene_map)) {
+      warning("Check that these count column names are correct, it may be the case that tximport failed and we need to set the colnames as per salmon.")
       import <- tximport::tximport(files = files, type = "rsem", txOut = txout)
       import_scaled <- tximport::tximport(files = files, type = "rsem",
                                           txOut = txout, countsFromAbundance = "lengthScaledTPM")
@@ -1620,9 +1628,19 @@ If this is not correctly performed, very few genes will be observed")
       ## match those in the results from salmon.
       import <- tximport::tximport(
         files = as.character(files), type = "salmon", tx2gene = tx_gene_map, txOut = txout)
+      ## It appears that something in tximport recently changed which causes it to no longer
+      ## put the actual sampleIDs as the colnames of its return.
+      ## And, since I use the count table columns as the primary arbiter of the sample IDs
+      ## when merging the sample annotations (metadata) and the counts, this does not end well.
+      colnames(import[["abundance"]]) <- sample_ids
+      colnames(import[["counts"]]) <- sample_ids
+      colnames(import[["length"]]) <- sample_ids
       import_scaled <- sm(tximport::tximport(
         files = files, type = "salmon", tx2gene = tx_gene_map,
         txOut = txout, countsFromAbundance = "lengthScaledTPM"))
+      colnames(import_scaled[["abundance"]]) <- sample_ids
+      colnames(import_scaled[["counts"]]) <- sample_ids
+      colnames(import_scaled[["length"]]) <- sample_ids
     }
     retlist[["count_table"]] <- data.table::as.data.table(
       import[["counts"]], keep.rownames = "rownames")
@@ -2494,6 +2512,21 @@ subset_expt <- function(expt, subset = NULL, ids = NULL,
     subset_design <- as.data.frame(subset_design, stringsAsFactors = FALSE)
     message("The samples (and read coverage) removed when filtering ",
             nonzero, " non-zero genes are: ")
+    dropped <- exprs(expt)[, remove_idx]
+    if (class(dropped)[1] == "numeric") {
+      print(sum(dropped))
+    } else if (class(dropped)[1] == "matrix") {
+      print(colSums(dropped))
+    }
+    dropped_names <- colnames(dropped)
+    colnames(dropped) <- dropped_names
+    num_genes <- rep(0, length(dropped_names))
+    names(num_genes) <- dropped_names
+    for (sample in colnames(dropped)) {
+      num_genes[sample] <- sum(dropped[, sample] > 0)
+    }
+    message("by number of genes:")
+    print(num_genes)
   } else {
     stop("Unable to determine what is being subset.")
   }
diff --git a/R/normalize_batch.R b/R/normalize_batch.R
index f31e3ef2..c0ccb90a 100644
--- a/R/normalize_batch.R
+++ b/R/normalize_batch.R
@@ -310,11 +310,11 @@ all_adjusters <- function(input, design = NULL, estimate_type = "sva", batch1 =
       mesg("Attempting fsva surrogate estimation with ",
            chosen_surrogates, " ", sword, ".")
       type_color <- "darkred"
-      sva_object <- sm(sva::sva(log2_mtrx, conditional_model,
-                                null_model, n.sv = chosen_surrogates))
-      surrogate_result <- sva::fsva(log2_mtrx, conditional_model,
+      sva_object <- sink(sm(sva::sva(log2_mtrx, conditional_model,
+                                null_model, n.sv = chosen_surrogates)))
+      surrogate_result <- sink(sva::fsva(log2_mtrx, conditional_model,
                                     sva_object, newdat = as.matrix(log2_mtrx),
-                                    method = "exact")
+                                    method = "exact"))
       model_adjust <- as.matrix(surrogate_result[["newsv"]])
       source_counts <- surrogate_result[["new"]]
     },

From b35718197ae63747bb81d3da700df84983ce88cb Mon Sep 17 00:00:00 2001
From: Zhezhen Wang <zhezhenwang@gmail.com>
Date: Wed, 24 Apr 2024 14:57:29 -0400
Subject: [PATCH 3/8] updated to make sample id correct and autofil batches

---
 R/expt.R | 13 ++++++-------
 1 file changed, 6 insertions(+), 7 deletions(-)

diff --git a/R/expt.R b/R/expt.R
index f2a92073..c7591eaf 100644
--- a/R/expt.R
+++ b/R/expt.R
@@ -364,7 +364,6 @@ create_expt <- function(metadata = NULL, gene_info = NULL, count_dataframe = NUL
   message("Reading the sample metadata.")
   sample_definitions <- extract_metadata(metadata, id_column = id_column,
                                          ...)
-  ## sample_definitions <- extract_metadata(metadata)
   ## Add an explicit removal of the file column if the option file_column is NULL.
   ## This is a just in case measure to avoid conflicts.
   if (is.null(file_column)) {
@@ -1632,15 +1631,15 @@ If this is not correctly performed, very few genes will be observed")
       ## put the actual sampleIDs as the colnames of its return.
       ## And, since I use the count table columns as the primary arbiter of the sample IDs
       ## when merging the sample annotations (metadata) and the counts, this does not end well.
-      colnames(import[["abundance"]]) <- sample_ids
-      colnames(import[["counts"]]) <- sample_ids
-      colnames(import[["length"]]) <- sample_ids
+      colnames(import[["abundance"]]) <- ids
+      colnames(import[["counts"]]) <- ids
+      colnames(import[["length"]]) <- ids
       import_scaled <- sm(tximport::tximport(
         files = files, type = "salmon", tx2gene = tx_gene_map,
         txOut = txout, countsFromAbundance = "lengthScaledTPM"))
-      colnames(import_scaled[["abundance"]]) <- sample_ids
-      colnames(import_scaled[["counts"]]) <- sample_ids
-      colnames(import_scaled[["length"]]) <- sample_ids
+      colnames(import_scaled[["abundance"]]) <- ids
+      colnames(import_scaled[["counts"]]) <- ids
+      colnames(import_scaled[["length"]]) <- ids
     }
     retlist[["count_table"]] <- data.table::as.data.table(
       import[["counts"]], keep.rownames = "rownames")

From b5f7a7b9fadb7a12287d1d78f4c4d32c27c5a572 Mon Sep 17 00:00:00 2001
From: Zhezhen Wang <zhezhenwang@gmail.com>
Date: Thu, 25 Apr 2024 12:53:52 -0400
Subject: [PATCH 4/8] print.pca_result

---
 R/zzz_print.R | 4 ++--
 1 file changed, 2 insertions(+), 2 deletions(-)

diff --git a/R/zzz_print.R b/R/zzz_print.R
index 248aaad2..adda9f19 100644
--- a/R/zzz_print.R
+++ b/R/zzz_print.R
@@ -538,8 +538,8 @@ print.pca_result <- function(x, ...) {
   cond_column <- x[["cond_column"]]
   batch_column <- x[["batch_column"]]
 
-  color_levels <- toString(levels(as.factor(pData(x)[[cond_column]])))
-  batch_levels <- toString(levels(as.factor(pData(x)[[batch_column]])))
+  color_levels <- toString(levels(as.factor(x[["design"]][[cond_column]])))
+  batch_levels <- toString(levels(as.factor(x[["design"]][[batch_column]])))
   message("The result of performing a ", x[["pc_method"]], " dimension reduction.
 The x-axis is PC", x[["x_pc"]], " and the y-axis is PC", x[["y_pc"]], "
 Colors are defined by ", color_levels, "

From de56aa2506a9da15f388010c958f941f4c6e93dc Mon Sep 17 00:00:00 2001
From: Zhezhen Wang <zhezhenwang@gmail.com>
Date: Sun, 28 Apr 2024 14:13:11 -0400
Subject: [PATCH 5/8] copied expt.R from trey

---
 R/expt.R | 22 +++++++++++++++-------
 1 file changed, 15 insertions(+), 7 deletions(-)

diff --git a/R/expt.R b/R/expt.R
index c7591eaf..06ddac18 100644
--- a/R/expt.R
+++ b/R/expt.R
@@ -305,7 +305,7 @@ create_expt <- function(metadata = NULL, gene_info = NULL, count_dataframe = NUL
                         savefile = NULL, low_files = FALSE, handle_na = "drop",
                         researcher = "elsayed", study_name = NULL, file_type = NULL,
                         annotation_name = "org.Hs.eg.db", tx_gene_map = NULL,
-                        feature_type = "gene", ...) {
+                        feature_type = "gene", ignore_tx_version = TRUE, ...) {
   arglist <- list(...)  ## pass stuff like sep=, header=, etc here
 
   if ("gene_tx_map" %in% names(arglist)) {
@@ -418,7 +418,7 @@ create_expt <- function(metadata = NULL, gene_info = NULL, count_dataframe = NUL
     sample_ids <- rownames(sample_definitions)
     count_data <- read_counts_expt(sample_ids, filenames, countdir = countdir,
                                    tx_gene_map = tx_gene_map, file_type = file_type,
-                                   ...)
+                                   ignore_tx_version = ignore_tx_version, ...)
     if (count_data[["source"]] == "tximport") {
       tximport_data <- list("raw" = count_data[["tximport"]],
                             "scaled" = count_data[["tximport_scaled"]])
@@ -1480,6 +1480,9 @@ make_pombe_expt <- function(annotation = TRUE) {
 #' @param tx_gene_map Dataframe which provides a mapping between
 #'  transcript IDs and gene IDs.
 #' @param file_type Short circuit the file format autodetection.
+#' @param ignore_tx_version Pass along TRUE to tximport's parameter
+#'  ignoreTxIds to alleviate the headaches associated with salmon's
+#'  stupid transcript ID .x suffix.
 #' @param ... More options for happy time!
 #' @return Data frame of count tables.
 #' @seealso [data.table] [create_expt()] [tximport]
@@ -1491,7 +1494,7 @@ make_pombe_expt <- function(annotation = TRUE) {
 read_counts_expt <- function(ids, files, header = FALSE, include_summary_rows = FALSE,
                              all.x = TRUE, all.y = FALSE, merge_type = "merge",
                              suffix = NULL, countdir = NULL, tx_gene_map = NULL,
-                             file_type = NULL, ...) {
+                             file_type = NULL, ignore_tx_version = TRUE, ...) {
   ## load first sample
   arglist <- list(...)
   retlist <- list()
@@ -1618,15 +1621,19 @@ If this is not correctly performed, very few genes will be observed")
     import <- NULL
     import_scaled <- NULL
     if (is.null(tx_gene_map)) {
-      import <- sm(tximport::tximport(files = files, type = "salmon", txOut = txout))
+      import <- sm(tximport::tximport(files = files, type = "salmon",
+                                      txOut = txout, ignoreTxVersion = ignore_tx_version))
       import_scaled <- sm(tximport::tximport(
         files = files, type = "salmon",
-        txOut = txout, countsFromAbundance = "lengthScaledTPM"))
+        txOut = txout, countsFromAbundance = "lengthScaledTPM",
+        ignoreTxVersion = ignore_tx_version))
     } else {
       ## Add a little test to see how well the tx_gene_map versions
       ## match those in the results from salmon.
       import <- tximport::tximport(
-        files = as.character(files), type = "salmon", tx2gene = tx_gene_map, txOut = txout)
+        files = as.character(files), type = "salmon",
+        tx2gene = tx_gene_map, txOut = txout,
+        ignoreTxVersion = ignore_tx_version)
       ## It appears that something in tximport recently changed which causes it to no longer
       ## put the actual sampleIDs as the colnames of its return.
       ## And, since I use the count table columns as the primary arbiter of the sample IDs
@@ -1636,7 +1643,8 @@ If this is not correctly performed, very few genes will be observed")
       colnames(import[["length"]]) <- ids
       import_scaled <- sm(tximport::tximport(
         files = files, type = "salmon", tx2gene = tx_gene_map,
-        txOut = txout, countsFromAbundance = "lengthScaledTPM"))
+        txOut = txout, countsFromAbundance = "lengthScaledTPM",
+        ignoreTxVersion = ignore_tx_version))
       colnames(import_scaled[["abundance"]]) <- ids
       colnames(import_scaled[["counts"]]) <- ids
       colnames(import_scaled[["length"]]) <- ids

From f5540b4f3b6dcb8f2561afbd0c574af93b05cf54 Mon Sep 17 00:00:00 2001
From: Zhezhen Wang <zhezhenwang@gmail.com>
Date: Tue, 30 Apr 2024 12:12:46 -0400
Subject: [PATCH 6/8] edit plot_pcs with shape number larger than 25

---
 R/dimension_reduction.R | 32 ++++++++++++++++++++++++++++++--
 1 file changed, 30 insertions(+), 2 deletions(-)

diff --git a/R/dimension_reduction.R b/R/dimension_reduction.R
index 01006e08..2023c2ec 100644
--- a/R/dimension_reduction.R
+++ b/R/dimension_reduction.R
@@ -576,7 +576,7 @@ plot_pca <- function(data, design = NULL, plot_colors = NULL, plot_title = TRUE,
                      plot_size = 5, plot_alpha = NULL, plot_labels = FALSE, size_column = NULL,
                      pc_method = "fast_svd", x_pc = 1, y_pc = 2, max_overlaps = 20,
                      num_pc = NULL, expt_names = NULL, label_chars = 10,
-                     cond_column = "condition", batch_column = "batch",
+                     cond_column = "condition", batch_column = "batch", hex = TRUE,
                      ...) {
   arglist <- list(...)
   ## Set default columns in the experimental design for condition and batch
@@ -1034,6 +1034,9 @@ plot_pca <- function(data, design = NULL, plot_colors = NULL, plot_title = TRUE,
   } else {
     ## Leave the title blank.
   }
+	
+  ## If want a hexagonal heatmap
+  #if(hexagonal) comp_plot + geom_hex()
 
   ## Finally, return a list of the interesting bits of what happened.
   pca_return <- list(
@@ -1800,7 +1803,7 @@ plot_pcs <- function(pca_data, first = "PC1", second = "PC2", variances = NULL,
       ggplot2::scale_size_manual(name = size_column,
         labels = levels(pca_data[[size_column]]),
         values = as.numeric(levels(pca_data[["size"]])))
-  } else if (!is.null(size_column) && num_batches > 5) {
+  } else if (!is.null(size_column) && num_batches > 5 && num_batches<19) {
     pca_plot <- pca_plot +
       ggplot2::geom_point(alpha = plot_alpha,
                           aes(shape = .data[["batch"]],
@@ -1816,6 +1819,31 @@ plot_pcs <- function(pca_data, first = "PC1", second = "PC2", variances = NULL,
       ggplot2::scale_size_manual(name = size_column,
                                  labels = levels(pca_data[[size_column]]),
                                  values = as.numeric(levels(pca_data[["size"]])))
+  } else if (!is.null(size_column) && num_batches>19) {
+	symbols = c(0:18,20,'+','-','o','O','*','|','%','#')
+  	if(num_batches<length(symbols)){
+		values = symbols[1:num_batches]
+	}else{
+		## wrap around the shapes and give a warning message
+		warning('')
+		idx = c(1:lenght(symbols),1:num_batches %% length(sumbols))
+		values = symbols[idx]
+	}
+    pca_plot <- pca_plot +
+      ggplot2::geom_point(alpha = plot_alpha,
+                          aes(shape = .data[["batch"]],
+                              colour = .data[["condition"]],
+                              size = .data[["size"]])) +
+      ggplot2::scale_shape_manual(name = "Batch",
+                                  labels = levels(as.factor(pca_data[["batch"]])),
+                                  guide = ggplot2::guide_legend(overwrite.aes = list(size = plot_size)),
+                                  values = symbols[1:]) +
+      ggplot2::scale_color_manual(name = "Condition",
+                                  guide = "legend",
+                                  values = color_list) +
+      ggplot2::scale_size_manual(name = size_column,
+                                 labels = levels(pca_data[[size_column]]),
+                                 values = as.numeric(levels(pca_data[["size"]])))
   } else {
     stop("This should be an impossible state.")
   }

From 28c2651a94e3146e1b23ab26f33018d439f9d374 Mon Sep 17 00:00:00 2001
From: Zhezhen Wang <zhezhenwang@gmail.com>
Date: Wed, 1 May 2024 15:36:18 -0400
Subject: [PATCH 7/8] added black outline

---
 R/dimension_reduction.R | 57 +++++++++++++++++++----------------------
 1 file changed, 27 insertions(+), 30 deletions(-)

diff --git a/R/dimension_reduction.R b/R/dimension_reduction.R
index 2023c2ec..a579984c 100644
--- a/R/dimension_reduction.R
+++ b/R/dimension_reduction.R
@@ -1749,11 +1749,33 @@ plot_pcs <- function(pca_data, first = "PC1", second = "PC2", variances = NULL,
         guide = ggplot2::guide_legend(override.aes = list(size = plot_size, fill = "grey")),
         values = 21:25)
   } else if (is.null(size_column) && num_batches > 5) {
+	### cannot mix integer with characters even using list
+	### 19 removed because it looks like 16, which might not be needed since shapes are reused for batches more than 25
+	symbols = c(0:18,20:25)
+	if(length(symbols) < num_batches){
+		idx = c(rep(1:length(symbols),
+			    num_batches %/% length(symbols)),
+			    1:(num_batches %% length(symbols)))
+		values = symbols[idx]
+	}else{
+		values = symbols[1:num_batches]
+	}
+	
+	## change outline color to black for batches 21+
+	if(num_batches > 20){
+		cond20 = unique(pca_data[['condition']])[1:20]
+		idx = which(pca_data[['condition']] %in% cond20)
+		othern = nrow(pca_data) - length(idx)
+	  	outline_col = c(pca_data[['condition']][idx],rep('black',othern))
+	}else{
+	  	outline_col = pca_data[['condition']]
+	}
+
     pca_plot <- pca_plot +
       ggplot2::geom_point(size = plot_size, alpha = plot_alpha,
                           aes(shape = .data[["batch"]],
                               fill = .data[["condition"]],
-                              colour = .data[["condition"]])) +
+                              colour = outline_col)) +
       ggplot2::scale_color_manual(name = "Condition",
                                   ##guide = ggplot2::guide_legend(override.aes = list(size = plot_size)),
                                   guide = "legend",
@@ -1765,7 +1787,7 @@ plot_pcs <- function(pca_data, first = "PC1", second = "PC2", variances = NULL,
       ggplot2::scale_shape_manual(name = "Batch",
                                   labels = levels(as.factor(pca_data[["batch"]])),
                                   guide = ggplot2::guide_legend(overwrite.aes = list(size = plot_size)),
-                                  values = 1:num_batches)
+                                  values = values)
   } else if (!is.null(size_column) && num_batches <= 5) {
     ## This will require the 6 steps above and one more
     pca_plot <- ggplot(data = as.data.frame(pca_data),
@@ -1803,41 +1825,16 @@ plot_pcs <- function(pca_data, first = "PC1", second = "PC2", variances = NULL,
       ggplot2::scale_size_manual(name = size_column,
         labels = levels(pca_data[[size_column]]),
         values = as.numeric(levels(pca_data[["size"]])))
-  } else if (!is.null(size_column) && num_batches > 5 && num_batches<19) {
+  } else if (!is.null(size_column) && num_batches > 5) {
     pca_plot <- pca_plot +
-      ggplot2::geom_point(alpha = plot_alpha,
-                          aes(shape = .data[["batch"]],
-                              colour = .data[["condition"]],
-                              size = .data[["size"]])) +
-      ggplot2::scale_shape_manual(name = "Batch",
-                                  labels = levels(as.factor(pca_data[["batch"]])),
-                                  guide = ggplot2::guide_legend(overwrite.aes = list(size = plot_size)),
-                                  values = 1:num_batches) +
-      ggplot2::scale_color_manual(name = "Condition",
-                                  guide = "legend",
-                                  values = color_list) +
-      ggplot2::scale_size_manual(name = size_column,
-                                 labels = levels(pca_data[[size_column]]),
-                                 values = as.numeric(levels(pca_data[["size"]])))
-  } else if (!is.null(size_column) && num_batches>19) {
-	symbols = c(0:18,20,'+','-','o','O','*','|','%','#')
-  	if(num_batches<length(symbols)){
-		values = symbols[1:num_batches]
-	}else{
-		## wrap around the shapes and give a warning message
-		warning('')
-		idx = c(1:lenght(symbols),1:num_batches %% length(sumbols))
-		values = symbols[idx]
-	}
-    pca_plot <- pca_plot +
-      ggplot2::geom_point(alpha = plot_alpha,
+      ggplot2::geom_point(alpha = plot_alpha, 
                           aes(shape = .data[["batch"]],
                               colour = .data[["condition"]],
                               size = .data[["size"]])) +
       ggplot2::scale_shape_manual(name = "Batch",
                                   labels = levels(as.factor(pca_data[["batch"]])),
                                   guide = ggplot2::guide_legend(overwrite.aes = list(size = plot_size)),
-                                  values = symbols[1:]) +
+                                  values = values) +
       ggplot2::scale_color_manual(name = "Condition",
                                   guide = "legend",
                                   values = color_list) +

From 3b28333e067778151288bd3265d1e5d23334f6ed Mon Sep 17 00:00:00 2001
From: Zhezhen Wang <zhezhenwang@gmail.com>
Date: Mon, 6 May 2024 18:15:40 -0400
Subject: [PATCH 8/8] give up on pca color for dots over 20

---
 R/dimension_reduction.R | 19 ++++++++++---------
 1 file changed, 10 insertions(+), 9 deletions(-)

diff --git a/R/dimension_reduction.R b/R/dimension_reduction.R
index 3f8aa8bc..46fab388 100644
--- a/R/dimension_reduction.R
+++ b/R/dimension_reduction.R
@@ -1753,7 +1753,8 @@ plot_pcs <- function(pca_data, first = "PC1", second = "PC2", variances = NULL,
   } else if (is.null(size_column) && num_batches > 5) {
 	### cannot mix integer with characters even using list
 	### 19 removed because it looks like 16, which might not be needed since shapes are reused for batches more than 25
-	symbols = c(0:18,20:25)
+	#symbols = c(0:18,20:25)
+	symbols = c(0:25)
 	if(length(symbols) < num_batches){
 		idx = c(rep(1:length(symbols),
 			    num_batches %/% length(symbols)),
@@ -1764,14 +1765,14 @@ plot_pcs <- function(pca_data, first = "PC1", second = "PC2", variances = NULL,
 	}
 	
 	## change outline color to black for batches 21+
-	if(num_batches > 20){
-		cond20 = unique(pca_data[['condition']])[1:20]
-		idx = which(pca_data[['condition']] %in% cond20)
-		othern = nrow(pca_data) - length(idx)
-	  	outline_col = c(pca_data[['condition']][idx],rep('black',othern))
-	}else{
-	  	outline_col = pca_data[['condition']]
-	}
+	#if(num_batches > 20){
+#		cond20 = unique(pca_data[['condition']])[1:20]
+#		idx = which(pca_data[['condition']] %in% cond20)
+#		othern = nrow(pca_data) - length(idx)
+##	  	outline_col = c(pca_data[['condition']][idx],rep('black',othern))
+#	}else{
+#	  	outline_col = pca_data[['condition']]
+#	}
 
     pca_plot <- pca_plot +
       ggplot2::geom_point(size = plot_size, alpha = plot_alpha,