From cd4a1a0717dd7681e15c431a9a32fba8ffa68c6d Mon Sep 17 00:00:00 2001
From: dtm2451 <daniel.bunis@ucsf.edu>
Date: Wed, 26 Jul 2023 14:19:07 -0700
Subject: [PATCH 1/5] add dgeMAST function script and update index

---
 README.md                             |   1 +
 single_cell/RNAseq/dgeMAST_function.R | 157 ++++++++++++++++++++++++++
 2 files changed, 158 insertions(+)
 create mode 100644 single_cell/RNAseq/dgeMAST_function.R

diff --git a/README.md b/README.md
index 267f898..d0c14dd 100644
--- a/README.md
+++ b/README.md
@@ -33,3 +33,4 @@ you would like to add:
 | File | Purpose | Maintainer | Library |
 | --- | --- | --- | --- |
 | single_cell/add_module_score_from_excel_gene_sets.R | R code for reading gene sets from an excel file, running Seurat::AddModuleScore, and visualizing the results | Dan | No |
+| single_cell/RNAseq/dgeMAST_function.R | R function for running DGE with MAST | Dan | No |
diff --git a/single_cell/RNAseq/dgeMAST_function.R b/single_cell/RNAseq/dgeMAST_function.R
new file mode 100644
index 0000000..ed84aa9
--- /dev/null
+++ b/single_cell/RNAseq/dgeMAST_function.R
@@ -0,0 +1,157 @@
+##### This script initializes a function called 'dgeMAST' for running differential gene expression
+# This is an adaptation by Dan Bunis of code originally formulated by Ravi Patel
+#
+# The function runs DGE between 2 groups, and can loop across multiple clusters, running DGE within each one.
+#
+# Some key notes:
+# - This is a method meant for droplet-based single-cell transcriptome data.
+# - Raw counts (integers) are expected to exist and to be:
+#   - in the 'counts' slot of a Seurat object's 'RNA' assay.
+#   - in an assay named 'counts' if using a SingleCellExperiment object.
+#
+# To use, `source("path/to/this/file")`
+# Then, something like:
+# dge <- dgeMAST_allImmStates(
+#   object = cd45_data, # Seurat or SCE object
+#   cells.group.by = 'full_annots', # metadata name holding clusters or annotations to explore within
+#   dge.group.by = 'sex_at_birth', # metadata name holding the samples'/cell's groupings to compare across
+#   dge.group.1 = 'M', # value of 'dge.group.by' metadata for cells in the primary group = when foldChange is positive, it means it's up in this population.
+#   dge.group.2 = 'F', # value of 'dge.group.by' metadata for cells in the secondary / control group 
+#   # Optional, but important to note!
+#   cells.group.targets = c('T4 naive', 'T8 em') # Without this, it will run within ALL cell types / clusters (a.k.a. all 'full_annots' per this example)
+# )
+# See comments in the function definition for details on remaining inputs -- batch.by, cells.use, log.prefix, mast.freq.expressed.min
+#
+# DGE methodology: To be filled before merge.
+#
+# Output: a data.frame containing the following columns... TBD by merge!
+#
+# Authors: Dan Bunis, Ravi Patel
+
+suppressPackageStartupMessages({
+  library(assertthat)
+  library(dplyr)
+  library(Seurat)
+  library(SingleCellExperiment)
+  library(MAST)
+  library(data.table)
+  library(dittoSeq)
+})
+
+log_message <- function(...) {
+  cat("[", format(Sys.time()), "]", ..., "\n")
+}
+
+##### This function loops through cell types, performing a single DGE comparison within each one. #####
+# You *might* never need to call this fucntion directly.  See it as just an "inner" function that's called by the main function that I want you to call. 
+dgeMAST <- function(
+  object, # Seurat or SCE object
+  cells.group.by, # metadata name holding clusters or annotations to explore within
+  cells.group.targets = NULL, # Optional but RECOMMENDED. String ("T4 naive" / "0") or strings ('c("T4 naive", "T8 naive")' / 'c(1,3)'). If not changed from the default here, will loop through all annotations/clusters given by 'cells.group.by'.
+  dge.group.by = "ImmState", # metadata column holding samples'/cell's groupings
+  dge.group.1, # The "primary" group = when foldChange is positive, it means it's up in this population. 
+  dge.group.2, # The "other" group
+  batch.by = "orig.ident", # String name of a metadata to treat as an independent variable in the DGE calculation.
+  cells.use = NULL, # NULL or Logical vector (length equal to the number of cells in object) indicating cells to keep (TRUE) or remove (FALSE) before calculating DGE.
+  # 'Inner' function optimization
+  log.prefix = '',
+  # MAST method tweaks
+  mast.freq.expressed.min = 0.2 # Fraction. A gene filter before dge calculation: Minimal percent of cells (among targets groups, within each targetted cell set) that must express a gene for it to be considered.
+) {
+
+  if (!isMeta(cells.group.by, object)) stop("'cells.group.by', ", cells.group.by,", is not a metadata of 'object'")
+  if (!isMeta(dge.group.by, object)) stop("'dge.group.by', ", dge.group.by,", is not a metadata of 'object'")
+  if (!dge.group.1 %in% object[[dge.group.by, drop = TRUE]]) stop("No cells have 'dge.group.1', ", dge.group.1, ", as their 'dge.group.by' value.")
+  if (!dge.group.2 %in% object[[dge.group.by, drop = TRUE]]) stop("No cells have 'dge.group.2', ", dge.group.2, ", as their 'dge.group.by' value.")
+  if (!is.null(cells.group.targets) && !all(cells.group.targets %in% object[[cells.group.by, drop = TRUE]])) stop("At least one target of 'cells.group.targets' does not exist within 'cells.group.by' values.")
+  
+  # Tranform minimal bits into an SCE
+  if (!is(object, "SingleCellExperiment")) {
+    sce <- as.SingleCellExperiment(DietSeurat(object, assays="RNA"), assay="RNA")
+  } else {
+    sce <- object
+  }
+
+  # Trim to only cells passing cells.use
+  if (!is.null(cells.use)) {
+    sce <- sce[, dittoSeq:::.which_cells(
+      cells.use,
+      sce)]
+  }
+
+  # Trim to only target dge.groups, and use factor levels to have it target the right groups.
+  sce <- sce[, dittoSeq:::.which_cells(
+    meta(dge.group.by, sce) %in% c(dge.group.1, dge.group.2),
+    sce)]
+  sce[[dge.group.by]] <- factor(
+    meta(dge.group.by, sce),
+    levels = c(dge.group.2, dge.group.1)
+  )
+
+  # Establish cell targets if not given
+  if (is.null(cells.group.targets)) {
+    cell.group.targets <- metaLevels(cells.group.by, sce)
+  }
+
+  # Loop through cell types, building dge output for each
+  dge_all_list <- lapply(
+    cells.group.targets,
+    function(cell_targ) {
+
+      log_message(log.prefix, "Running MAST dge for ", cell_targ, " cells")
+      # Trim to just these cells
+      sce_ct <- sce[, dittoSeq:::.which_cells(
+        meta(cells.group.by, sce) == cell_targ,
+        sce)]
+
+      counts <- assay(sce_ct, "counts")
+      libsizes <- colSums(counts)
+      size.factors <- libsizes/mean(libsizes)
+      logcounts(sce_ct) <- log2(t(t(counts)/size.factors) + 1)
+
+      ### Ensure logcounts comes first!  ...by simply removing all others.
+      for (assay in assayNames(sce_ct)) {
+        if (assay!="logcounts") {
+          assay(sce_ct, assay) <- NULL
+        }
+      }
+      stopifnot(assayNames(sce_ct)[1]=="logcounts")
+
+      # Transform into MAST's required SingleCellAssay structure.
+      sca_ct <- SceToSingleCellAssay(sce_ct)
+
+      # Trim to only genes expressed in enough cells
+      expressed_genes <- freq(sca_ct) > mast.freq.expressed.min
+      sca_ct <- sca_ct[expressed_genes,]
+      
+      # Add meta data for num genes captured per cell, and processing batch
+      cdr2_ct <- colSums(assay(sca_ct)>0)
+      colData(sca_ct)$ngeneson <- scale(cdr2_ct)
+      colData(sca_ct)$batch <- factor(meta(batch.by, sce_ct))
+      
+      # Run MAST
+      contrast_val <- paste0(dge.group.by, dge.group.1)
+      zlmCond <- zlm(
+        as.formula(paste0("~ ngeneson + ", dge.group.by, " + (1 | batch)")),
+        sca_ct, exprs_value = 'logcounts',
+        method='glmer', ebayes = F,
+        strictConvergence = FALSE,
+        fitArgsD = list(nAGQ = 0))
+      summaryCond <- summary(zlmCond, doLRT=contrast_val)$datatable
+      fcHurdle <- merge(
+        summaryCond[contrast==contrast_val & component=='H',.(primerid, `Pr(>Chisq)`)], #hurdle P values
+        summaryCond[contrast==contrast_val & component=='logFC', .(primerid, coef, ci.hi, ci.lo)], by='primerid') #logFC coefficients
+      fcHurdle[,fdr:=p.adjust(`Pr(>Chisq)`, 'fdr')]
+
+      fcHurdle$cell_group <- cell_targ
+      fcHurdle$up_in_pos_FC <- dge.group.1
+
+      # Return (Functions in R return the output of their last line, but the `return()` function can also be used!)
+      fcHurdle
+    }
+  )
+
+  # Collapse tables from all cell types into one, and return!
+  do.call(rbind, dge_all_list)
+}
+

From 206ed4a282d89a400e1551ff1ace9f3293015cc2 Mon Sep 17 00:00:00 2001
From: dtm2451 <daniel.bunis@ucsf.edu>
Date: Fri, 28 Jul 2023 11:45:01 -0700
Subject: [PATCH 2/5] minor cleanup

---
 single_cell/RNAseq/dgeMAST_function.R | 11 +++++------
 1 file changed, 5 insertions(+), 6 deletions(-)

diff --git a/single_cell/RNAseq/dgeMAST_function.R b/single_cell/RNAseq/dgeMAST_function.R
index ed84aa9..9b96048 100644
--- a/single_cell/RNAseq/dgeMAST_function.R
+++ b/single_cell/RNAseq/dgeMAST_function.R
@@ -38,12 +38,7 @@ suppressPackageStartupMessages({
   library(dittoSeq)
 })
 
-log_message <- function(...) {
-  cat("[", format(Sys.time()), "]", ..., "\n")
-}
-
-##### This function loops through cell types, performing a single DGE comparison within each one. #####
-# You *might* never need to call this fucntion directly.  See it as just an "inner" function that's called by the main function that I want you to call. 
+##### This function loops through cell types, performing a single DGE comparison within each one. ##### 
 dgeMAST <- function(
   object, # Seurat or SCE object
   cells.group.by, # metadata name holding clusters or annotations to explore within
@@ -59,6 +54,10 @@ dgeMAST <- function(
   mast.freq.expressed.min = 0.2 # Fraction. A gene filter before dge calculation: Minimal percent of cells (among targets groups, within each targetted cell set) that must express a gene for it to be considered.
 ) {
 
+  log_message <- function(...) {
+    cat("[", format(Sys.time()), "]", ..., "\n")
+  }
+
   if (!isMeta(cells.group.by, object)) stop("'cells.group.by', ", cells.group.by,", is not a metadata of 'object'")
   if (!isMeta(dge.group.by, object)) stop("'dge.group.by', ", dge.group.by,", is not a metadata of 'object'")
   if (!dge.group.1 %in% object[[dge.group.by, drop = TRUE]]) stop("No cells have 'dge.group.1', ", dge.group.1, ", as their 'dge.group.by' value.")

From 5c622ffd5cf4f9f486ee77dd75886807562e2289 Mon Sep 17 00:00:00 2001
From: dtm2451 <daniel.bunis@ucsf.edu>
Date: Thu, 22 Aug 2024 18:15:14 -0400
Subject: [PATCH 3/5] dgeMAST - improve docs and formula control

---
 single_cell/RNAseq/dgeMAST_function.R | 121 ++++++++++++++++++++------
 1 file changed, 92 insertions(+), 29 deletions(-)

diff --git a/single_cell/RNAseq/dgeMAST_function.R b/single_cell/RNAseq/dgeMAST_function.R
index 9b96048..6424f99 100644
--- a/single_cell/RNAseq/dgeMAST_function.R
+++ b/single_cell/RNAseq/dgeMAST_function.R
@@ -1,7 +1,8 @@
 ##### This script initializes a function called 'dgeMAST' for running differential gene expression
 # This is an adaptation by Dan Bunis of code originally formulated by Ravi Patel
 #
-# The function runs DGE between 2 groups, and can loop across multiple clusters, running DGE within each one.
+# The function loop across multiple clusters running DGE between 2 groups, gathering eveything into a single data.frame.
+# Think like FindAllMarkers, but for a biological question per each cluster.
 #
 # Some key notes:
 # - This is a method meant for droplet-based single-cell transcriptome data.
@@ -11,22 +12,15 @@
 #
 # To use, `source("path/to/this/file")`
 # Then, something like:
-# dge <- dgeMAST_allImmStates(
-#   object = cd45_data, # Seurat or SCE object
+# dge <- dgeMAST(
+#   object = sobj, # Seurat or SCE object
 #   cells.group.by = 'full_annots', # metadata name holding clusters or annotations to explore within
 #   dge.group.by = 'sex_at_birth', # metadata name holding the samples'/cell's groupings to compare across
-#   dge.group.1 = 'M', # value of 'dge.group.by' metadata for cells in the primary group = when foldChange is positive, it means it's up in this population.
-#   dge.group.2 = 'F', # value of 'dge.group.by' metadata for cells in the secondary / control group 
-#   # Optional, but important to note!
+#   dge.group.1 = 'M', # value of 'dge.group.by' metadata value of cells in the primary group.
+#   dge.group.2 = 'F', # value of 'dge.group.by' metadata value of cells in the secondary / control group.
 #   cells.group.targets = c('T4 naive', 'T8 em') # Without this, it will run within ALL cell types / clusters (a.k.a. all 'full_annots' per this example)
 # )
-# See comments in the function definition for details on remaining inputs -- batch.by, cells.use, log.prefix, mast.freq.expressed.min
-#
-# DGE methodology: To be filled before merge.
-#
-# Output: a data.frame containing the following columns... TBD by merge!
-#
-# Authors: Dan Bunis, Ravi Patel
+# See roxygen notes below for additional details!
 
 suppressPackageStartupMessages({
   library(assertthat)
@@ -38,20 +32,57 @@ suppressPackageStartupMessages({
   library(dittoSeq)
 })
 
-##### This function loops through cell types, performing a single DGE comparison within each one. ##### 
+#' Run DGE between 2 groups while looping through cell types
+#' @param object A Seurat or SingleCellExperiment (SCE) object
+#' @param cells.group.by String naming a metadata column of \code{object} which holds clusters, annotations, or other cell groupings to explore within
+#' @param cells.group.targets NULL or a string vector naming particular levels of the \code{cell.group.by}-data to run DGE within.  When left as \code{NULL}, all cell groupings will be targeted
+#' @param dge.group.by String naming a metadata column of \code{object} which holds the sample or cell groupings that you wish to compare between
+#' @param dge.group.1,dge.group.2 Strings naming the groups of \code{dge.group.by}-data to compare between. Directionality: Positive log2FC (foldChange) will mean upregulation in \code{dge.group.1} with respect to \code{dge.group.2}.
+#' @param cells.use NULL or a Logical vector (of length equal to the number of cells in \code{object}) indicating cells to keep (TRUE) or remove (FALSE) before calculating DGE. Can be useful for ignoring specific samples.
+#' @param log.prefix String to append at the beginning of log messages. Useful when wrapping this function inside an external loop.
+#' @param mast.freq.expressed.min A fraction between 0 and 1, default = 0.2, which sets the minimal percent of cells that must express a gene for it to be considered. MAST performs less well for genes with low percent expression. This cutoff is run per each considered \code{cell.group.targets} cell grouping, and only expression among cells of the targeted \code{dge.group.1} and \code{dge.group.2} groups are considered.
+#' @param add.random.effect,add.fixed.effect # NULL or String vectors naming metadata columns of \code{object} to treat as a random or fixed effects, respectively, in the mixed effect model used for DGE calculation. Additional details:
+#' \itemize{
+#' \item NOTE: \code{add.random.effect} defaults to "orig.ident" with the assumption that this column generally 1) exists and 2) holds batch information that is normally desired to be modeled as a random effect. Set to NULL, or something different, to turn this off.
+#' \item \code{add.random.effect} must target discrete metadata and will be turned into a factor before use.
+#' \item 'ngeneson' (the number of expressed genes per cell, which will be calculated internally) and \code{dge.group.by} are automatically added as fixed effects and do not need to be added here.
+#' }
+#' 
+#' @details This function loops across multiple clusters running DGE between 2 groups, gathering eveything into a single data.frame.
+#' Think like FindAllMarkers, but for a biological question per each cluster.
+#' 
+#' Raw counts (integers) are expected to exist and to be in the 'counts' slot of the 'RNA' assay if \code{object} is a Seurat object, or in an assay named 'counts' if \code{object} is a SingleCellExperiment object.
+#' 
+#' In preparation:
+#' 
+#' Seurat data is converted to a SingleCellObject (SCE).
+#' The SCE is trimmed to only the cells indicated by \code{cells.use}, and only those where \code{dge.group.by} is either \code{dge.group.1} or \code{dge.group.2}.
+#' And the set of cell targets given by \code{cells.group.targets} is trimmed of any targets that don't have any cells of code{dge.group.1} or \code{dge.group.2}
+#' 
+#' Then, the function loops through remaining \code{cells.group.targets}, performing these steps on just that subset of cells:
+#' 
+#' Log and read-depth normalization of the counts data,
+#' trimming genes expressed in fewer than \code{mast.freq.expressed.min} proportion of cells,
+#' calculation of the number of genes expressed per cell ('ngeneson'),
+#' the DGE calculation by MAST with its zlm function with \code{method='glmer', ebayes = F, strictConvergence = FALSE, fitArgsD = list(nAGQ = 0)} and a \code{formula} based on the contents of \code{add.random.effect} and \code{add.fixed.effect}, followed by running the liklihood ratio test on the contrast of interest.
+#' By default, the formula ammounts to \code{~ <dge.group.by> + ngeneson + (1 | orig.ident)}.
+#' FDR-correction is applied to p-values at this stage -- per each cell type.
+#' 
+#' It then renames columns, converts the column holding fold change information to be base2, and adds an "up_in" column to help explain which code{dge.group.1} or \code{dge.group.2} was found to have higher expression for each gene.
+#' @return a data.frame
+#' @author Daniel Bunis and Ravi Patel
 dgeMAST <- function(
-  object, # Seurat or SCE object
+  object,
   cells.group.by, # metadata name holding clusters or annotations to explore within
   cells.group.targets = NULL, # Optional but RECOMMENDED. String ("T4 naive" / "0") or strings ('c("T4 naive", "T8 naive")' / 'c(1,3)'). If not changed from the default here, will loop through all annotations/clusters given by 'cells.group.by'.
-  dge.group.by = "ImmState", # metadata column holding samples'/cell's groupings
+  dge.group.by, # metadata column holding samples'/cell's groupings
   dge.group.1, # The "primary" group = when foldChange is positive, it means it's up in this population. 
   dge.group.2, # The "other" group
-  batch.by = "orig.ident", # String name of a metadata to treat as an independent variable in the DGE calculation.
   cells.use = NULL, # NULL or Logical vector (length equal to the number of cells in object) indicating cells to keep (TRUE) or remove (FALSE) before calculating DGE.
-  # 'Inner' function optimization
-  log.prefix = '',
-  # MAST method tweaks
-  mast.freq.expressed.min = 0.2 # Fraction. A gene filter before dge calculation: Minimal percent of cells (among targets groups, within each targetted cell set) that must express a gene for it to be considered.
+  log.prefix = '', # String to append at the beginning of log messages.  Useful when wrapping this function inside an external loop.
+  mast.freq.expressed.min = 0.2, # Fraction. A gene filter before dge calculation: Minimal percent of cells (among targets groups, within each targetted cell set) that must express a gene for it to be considered.
+  add.random.effect = "orig.ident", # NULL or a String vector naming a metadata to treat as a random effects in the mixed effect model used for DGE calculation. These metadata must contain discrete data and will be turned into a factor before use. Providing batch information is highly recommended, and is what the default of "orig.ident" aims to capture. 
+  add.fixed.effect = NULL # NULL or a String vector naming a metadata to treat as fixed effects in the mixed effect model used for DGE calculation. Note: You do not need to provide 'dge.group.by' here, and the number of expressed genes will be calculated and stored/used as 'ngeneson'; Both will be added as fixed effects and do not need to be named here.
 ) {
 
   log_message <- function(...) {
@@ -63,7 +94,7 @@ dgeMAST <- function(
   if (!dge.group.1 %in% object[[dge.group.by, drop = TRUE]]) stop("No cells have 'dge.group.1', ", dge.group.1, ", as their 'dge.group.by' value.")
   if (!dge.group.2 %in% object[[dge.group.by, drop = TRUE]]) stop("No cells have 'dge.group.2', ", dge.group.2, ", as their 'dge.group.by' value.")
   if (!is.null(cells.group.targets) && !all(cells.group.targets %in% object[[cells.group.by, drop = TRUE]])) stop("At least one target of 'cells.group.targets' does not exist within 'cells.group.by' values.")
-  
+
   # Tranform minimal bits into an SCE
   if (!is(object, "SingleCellExperiment")) {
     sce <- as.SingleCellExperiment(DietSeurat(object, assays="RNA"), assay="RNA")
@@ -92,6 +123,19 @@ dgeMAST <- function(
     cell.group.targets <- metaLevels(cells.group.by, sce)
   }
 
+  # Check for any lack of cell groups, and trim cell targets if so
+  rms <- c()
+  grps <- dittoSeq::meta(dge.group.by, sce)
+  cts <- as.character(dittoSeq::meta(cells.group.by, sce))
+  for (cell_targ in cells.group.targets) {
+    if (any(table(grps[cts==cell_targ])==0)) {
+      log_message(log.prefix, "Skipping cell type ", cell_targ, " because a dge.group has no cells")
+      rms <- c(rms, cell_targ)
+    }
+  }
+  cells.group.targets <- cells.group.targets[!cells.group.targets %in% rms]
+  rm(cell_targ)
+
   # Loop through cell types, building dge output for each
   dge_all_list <- lapply(
     cells.group.targets,
@@ -122,16 +166,26 @@ dgeMAST <- function(
       # Trim to only genes expressed in enough cells
       expressed_genes <- freq(sca_ct) > mast.freq.expressed.min
       sca_ct <- sca_ct[expressed_genes,]
-      
-      # Add meta data for num genes captured per cell, and processing batch
+
+      # Add meta data for num genes captured per cell
       cdr2_ct <- colSums(assay(sca_ct)>0)
       colData(sca_ct)$ngeneson <- scale(cdr2_ct)
-      colData(sca_ct)$batch <- factor(meta(batch.by, sce_ct))
-      
+
+      # Build model
+      model <- paste0("~ ngeneson + ", dge.group.by)
+      # Ensure random effects, like batch, are factors too
+      for (rand in add.random.effect) {
+        colData(sca_ct)[,rand] <- as.factor(colData(sca_ct)[,rand])
+        model <- paste0(model, " + (1 | ", rand, ")")
+      }
+      for (fix in add.fixed.effect) {
+        model <- paste0(model, " + fixed")
+      }
+
       # Run MAST
       contrast_val <- paste0(dge.group.by, dge.group.1)
       zlmCond <- zlm(
-        as.formula(paste0("~ ngeneson + ", dge.group.by, " + (1 | batch)")),
+        as.formula(model),
         sca_ct, exprs_value = 'logcounts',
         method='glmer', ebayes = F,
         strictConvergence = FALSE,
@@ -142,10 +196,19 @@ dgeMAST <- function(
         summaryCond[contrast==contrast_val & component=='logFC', .(primerid, coef, ci.hi, ci.lo)], by='primerid') #logFC coefficients
       fcHurdle[,fdr:=p.adjust(`Pr(>Chisq)`, 'fdr')]
 
+      # Rebase coef toward log2() instead of ln()
+      fcHurdle$coef <- fcHurdle$coef / log(2, base = exp(1))
+
+      # Rename columns
+      names(fcHurdle)[1:3] <- c("gene", "p", "log2FC")
+
       fcHurdle$cell_group <- cell_targ
-      fcHurdle$up_in_pos_FC <- dge.group.1
+      fcHurdle$up_in <- ifelse(
+          fcHurdle$log2FC>0, dge.group.1,
+          ifelse(fcHurdle$log2FC<0, dge.group.2, NA)
+      )
 
-      # Return (Functions in R return the output of their last line, but the `return()` function can also be used!)
+      # Return
       fcHurdle
     }
   )

From 9b2b8d0ece51ad9c88f2dc76b4795610e28200b1 Mon Sep 17 00:00:00 2001
From: dtm2451 <daniel.bunis@ucsf.edu>
Date: Fri, 23 Aug 2024 13:09:45 -0400
Subject: [PATCH 4/5] fix for no random effects & update documentation

---
 single_cell/RNAseq/dgeMAST_function.R   |  83 ++++++++------
 single_cell/RNAseq/dgeMAST_function.txt | 141 ++++++++++++++++++++++++
 2 files changed, 192 insertions(+), 32 deletions(-)
 create mode 100644 single_cell/RNAseq/dgeMAST_function.txt

diff --git a/single_cell/RNAseq/dgeMAST_function.R b/single_cell/RNAseq/dgeMAST_function.R
index 6424f99..b9f5b59 100644
--- a/single_cell/RNAseq/dgeMAST_function.R
+++ b/single_cell/RNAseq/dgeMAST_function.R
@@ -47,50 +47,60 @@ suppressPackageStartupMessages({
 #' \item \code{add.random.effect} must target discrete metadata and will be turned into a factor before use.
 #' \item 'ngeneson' (the number of expressed genes per cell, which will be calculated internally) and \code{dge.group.by} are automatically added as fixed effects and do not need to be added here.
 #' }
-#' 
+#'
+#' @return a data.table
 #' @details This function loops across multiple clusters running DGE between 2 groups, gathering eveything into a single data.frame.
 #' Think like FindAllMarkers, but for a biological question per each cluster.
-#' 
+#'
 #' Raw counts (integers) are expected to exist and to be in the 'counts' slot of the 'RNA' assay if \code{object} is a Seurat object, or in an assay named 'counts' if \code{object} is a SingleCellExperiment object.
-#' 
+#'
 #' In preparation:
-#' 
+#'
 #' Seurat data is converted to a SingleCellObject (SCE).
 #' The SCE is trimmed to only the cells indicated by \code{cells.use}, and only those where \code{dge.group.by} is either \code{dge.group.1} or \code{dge.group.2}.
 #' And the set of cell targets given by \code{cells.group.targets} is trimmed of any targets that don't have any cells of code{dge.group.1} or \code{dge.group.2}
-#' 
-#' Then, the function loops through remaining \code{cells.group.targets}, performing these steps on just that subset of cells:
-#' 
+#'
+#' Then, the function loops through \code{cells.group.targets}, performing these steps per each subset of cells:
+#'
 #' Log and read-depth normalization of the counts data,
 #' trimming genes expressed in fewer than \code{mast.freq.expressed.min} proportion of cells,
 #' calculation of the number of genes expressed per cell ('ngeneson'),
-#' the DGE calculation by MAST with its zlm function with \code{method='glmer', ebayes = F, strictConvergence = FALSE, fitArgsD = list(nAGQ = 0)} and a \code{formula} based on the contents of \code{add.random.effect} and \code{add.fixed.effect}, followed by running the liklihood ratio test on the contrast of interest.
-#' By default, the formula ammounts to \code{~ <dge.group.by> + ngeneson + (1 | orig.ident)}.
+#' the DGE calculation with \code{formula} based on the contents of \code{add.random.effect} and \code{add.fixed.effect}.
+#' By default, the formula amounts to \code{~ <dge.group.by> + ngeneson + (1 | orig.ident)}.
 #' FDR-correction is applied to p-values at this stage -- per each cell type.
-#' 
+#'
 #' It then renames columns, converts the column holding fold change information to be base2, and adds an "up_in" column to help explain which code{dge.group.1} or \code{dge.group.2} was found to have higher expression for each gene.
-#' @return a data.frame
+#' @section DGE methodology details:
+#' DGE is run by first modeling the data using MAST's zlm function, followed by determining p-values using its likelihood ratio test methodology on the contrast of interest.
+#'
+#' Specifically, \code{zlm} is run with settings: \itemize{
+#' \item Always: \code{ebayes = FALSE, strictConvergence = FALSE}
+#' \item Additionally:\itemize{
+#'   \item when there is a random effect given (default) to \code{add.random.effect}: \code{method='glmer', fitArgsD = list(nAGQ = 0)}
+#'   \item when there is no random effect given to \code{add.random.effect}: \code{method='glm'}
+#' }
+#' }
 #' @author Daniel Bunis and Ravi Patel
 dgeMAST <- function(
   object,
-  cells.group.by, # metadata name holding clusters or annotations to explore within
-  cells.group.targets = NULL, # Optional but RECOMMENDED. String ("T4 naive" / "0") or strings ('c("T4 naive", "T8 naive")' / 'c(1,3)'). If not changed from the default here, will loop through all annotations/clusters given by 'cells.group.by'.
-  dge.group.by, # metadata column holding samples'/cell's groupings
-  dge.group.1, # The "primary" group = when foldChange is positive, it means it's up in this population. 
-  dge.group.2, # The "other" group
-  cells.use = NULL, # NULL or Logical vector (length equal to the number of cells in object) indicating cells to keep (TRUE) or remove (FALSE) before calculating DGE.
-  log.prefix = '', # String to append at the beginning of log messages.  Useful when wrapping this function inside an external loop.
-  mast.freq.expressed.min = 0.2, # Fraction. A gene filter before dge calculation: Minimal percent of cells (among targets groups, within each targetted cell set) that must express a gene for it to be considered.
-  add.random.effect = "orig.ident", # NULL or a String vector naming a metadata to treat as a random effects in the mixed effect model used for DGE calculation. These metadata must contain discrete data and will be turned into a factor before use. Providing batch information is highly recommended, and is what the default of "orig.ident" aims to capture. 
-  add.fixed.effect = NULL # NULL or a String vector naming a metadata to treat as fixed effects in the mixed effect model used for DGE calculation. Note: You do not need to provide 'dge.group.by' here, and the number of expressed genes will be calculated and stored/used as 'ngeneson'; Both will be added as fixed effects and do not need to be named here.
+  cells.group.by,
+  cells.group.targets = NULL,
+  dge.group.by,
+  dge.group.1,
+  dge.group.2,
+  cells.use = NULL,
+  log.prefix = '',
+  mast.freq.expressed.min = 0.2,
+  add.random.effect = "orig.ident",
+  add.fixed.effect = NULL
 ) {
 
   log_message <- function(...) {
-    cat("[", format(Sys.time()), "]", ..., "\n")
+    cat("[", format(Sys.time()), "]", ..., "\n", sep = "")
   }
 
-  if (!isMeta(cells.group.by, object)) stop("'cells.group.by', ", cells.group.by,", is not a metadata of 'object'")
-  if (!isMeta(dge.group.by, object)) stop("'dge.group.by', ", dge.group.by,", is not a metadata of 'object'")
+  if (!dittoSeq::isMeta(cells.group.by, object)) stop("'cells.group.by', ", cells.group.by,", is not a metadata of 'object'")
+  if (!dittoSeq::isMeta(dge.group.by, object)) stop("'dge.group.by', ", dge.group.by,", is not a metadata of 'object'")
   if (!dge.group.1 %in% object[[dge.group.by, drop = TRUE]]) stop("No cells have 'dge.group.1', ", dge.group.1, ", as their 'dge.group.by' value.")
   if (!dge.group.2 %in% object[[dge.group.by, drop = TRUE]]) stop("No cells have 'dge.group.2', ", dge.group.2, ", as their 'dge.group.by' value.")
   if (!is.null(cells.group.targets) && !all(cells.group.targets %in% object[[cells.group.by, drop = TRUE]])) stop("At least one target of 'cells.group.targets' does not exist within 'cells.group.by' values.")
@@ -103,7 +113,7 @@ dgeMAST <- function(
   }
 
   # Trim to only cells passing cells.use
-  if (!is.null(cells.use)) {
+  if (!identical(cells.use, NULL)) {
     sce <- sce[, dittoSeq:::.which_cells(
       cells.use,
       sce)]
@@ -120,7 +130,7 @@ dgeMAST <- function(
 
   # Establish cell targets if not given
   if (is.null(cells.group.targets)) {
-    cell.group.targets <- metaLevels(cells.group.by, sce)
+    cells.group.targets <- dittoSeq::metaLevels(cells.group.by, sce)
   }
 
   # Check for any lack of cell groups, and trim cell targets if so
@@ -183,13 +193,22 @@ dgeMAST <- function(
       }
 
       # Run MAST
+      zlm_args <- list(
+          formula = as.formula(model),
+          sca = sca_ct,
+          exprs_values = 'logcounts',
+          ebayes = FALSE,
+          strictConvergence = FALSE,
+          method = 'glm'
+      )
+      if (!identical(add.random.effect, NULL)) {
+          zlm_args$method = 'glmer'
+          zlm_args$fitArgsD = list(nAGQ = 0)
+      }
+
+      log_message(log.prefix, "\twith model formula: ", model)
       contrast_val <- paste0(dge.group.by, dge.group.1)
-      zlmCond <- zlm(
-        as.formula(model),
-        sca_ct, exprs_value = 'logcounts',
-        method='glmer', ebayes = F,
-        strictConvergence = FALSE,
-        fitArgsD = list(nAGQ = 0))
+      zlmCond <- do.call(zlm, zlm_args)
       summaryCond <- summary(zlmCond, doLRT=contrast_val)$datatable
       fcHurdle <- merge(
         summaryCond[contrast==contrast_val & component=='H',.(primerid, `Pr(>Chisq)`)], #hurdle P values
diff --git a/single_cell/RNAseq/dgeMAST_function.txt b/single_cell/RNAseq/dgeMAST_function.txt
new file mode 100644
index 0000000..0c3335d
--- /dev/null
+++ b/single_cell/RNAseq/dgeMAST_function.txt
@@ -0,0 +1,141 @@
+Run DGE between 2 groups while looping through cell types
+
+Description:
+
+     Run DGE between 2 groups while looping through cell types
+
+Usage:
+
+     dgeMAST(
+       object,
+       cells.group.by,
+       cells.group.targets = NULL,
+       dge.group.by,
+       dge.group.1,
+       dge.group.2,
+       cells.use = NULL,
+       log.prefix = "",
+       mast.freq.expressed.min = 0.2,
+       add.random.effect = "orig.ident",
+       add.fixed.effect = NULL
+     )
+     
+Arguments:
+
+  object: A Seurat or SingleCellExperiment (SCE) object
+
+cells.group.by: String naming a metadata column of ‘object’ which holds
+          clusters, annotations, or other cell groupings to explore
+          within
+
+cells.group.targets: NULL or a string vector naming particular levels
+          of the ‘cell.group.by’-data to run DGE within.  When left as
+          ‘NULL’, all cell groupings will be targeted
+
+dge.group.by: String naming a metadata column of ‘object’ which holds
+          the sample or cell groupings that you wish to compare between
+
+dge.group.1, dge.group.2: Strings naming the groups of
+          ‘dge.group.by’-data to compare between. Directionality:
+          Positive log2FC (foldChange) will mean upregulation in
+          ‘dge.group.1’ with respect to ‘dge.group.2’.
+
+cells.use: NULL or a Logical vector (of length equal to the number of
+          cells in ‘object’) indicating cells to keep (TRUE) or remove
+          (FALSE) before calculating DGE. Can be useful for ignoring
+          specific samples.
+
+log.prefix: String to append at the beginning of log messages. Useful
+          when wrapping this function inside an external loop.
+
+mast.freq.expressed.min: A fraction between 0 and 1, default = 0.2,
+          which sets the minimal percent of cells that must express a
+          gene for it to be considered. MAST performs less well for
+          genes with low percent expression. This cutoff is run per
+          each considered ‘cell.group.targets’ cell grouping, and only
+          expression among cells of the targeted ‘dge.group.1’ and
+          ‘dge.group.2’ groups are considered.
+
+add.random.effect, add.fixed.effect: # NULL or String vectors naming
+          metadata columns of ‘object’ to treat as a random or fixed
+          effects, respectively, in the mixed effect model used for DGE
+          calculation. Additional details:
+
+            • NOTE: ‘add.random.effect’ defaults to "orig.ident" with
+              the assumption that this column generally 1) exists and
+              2) holds batch information that is normally desired to be
+              modeled as a random effect. Set to NULL, or something
+              different, to turn this off.
+
+            • ‘add.random.effect’ must target discrete metadata and
+              will be turned into a factor before use.
+
+            • 'ngeneson' (the number of expressed genes per cell, which
+              will be calculated internally) and ‘dge.group.by’ are
+              automatically added as fixed effects and do not need to
+              be added here.
+
+Details:
+
+     This function loops across multiple clusters running DGE between 2
+     groups, gathering eveything into a single data.frame. Think like
+     FindAllMarkers, but for a biological question per each cluster.
+
+     Raw counts (integers) are expected to exist and to be in the
+     'counts' slot of the 'RNA' assay if ‘object’ is a Seurat object,
+     or in an assay named 'counts' if ‘object’ is a
+     SingleCellExperiment object.
+
+     In preparation:
+
+     Seurat data is converted to a SingleCellObject (SCE). The SCE is
+     trimmed to only the cells indicated by ‘cells.use’, and only those
+     where ‘dge.group.by’ is either ‘dge.group.1’ or ‘dge.group.2’. And
+     the set of cell targets given by ‘cells.group.targets’ is trimmed
+     of any targets that don't have any cells of codedge.group.1 or
+     ‘dge.group.2’
+
+     Then, the function loops through ‘cells.group.targets’, performing
+     these steps per each subset of cells:
+
+     Log and read-depth normalization of the counts data, trimming
+     genes expressed in fewer than ‘mast.freq.expressed.min’ proportion
+     of cells, calculation of the number of genes expressed per cell
+     ('ngeneson'), the DGE calculation with ‘formula’ based on the
+     contents of ‘add.random.effect’ and ‘add.fixed.effect’. By
+     default, the formula amounts to ‘~ <dge.group.by> + ngeneson + (1
+     | orig.ident)’. FDR-correction is applied to p-values at this
+     stage - per each cell type.
+
+     It then renames columns, converts the column holding fold change
+     information to be base2, and adds an "up_in" column to help
+     explain which codedge.group.1 or ‘dge.group.2’ was found to have
+     higher expression for each gene.
+
+Value:
+
+     a data.table
+
+DGE methodology details:
+
+     DGE is run by first modeling the data using MAST's zlm function,
+     followed by determining p-values using its likelihood ratio test
+     methodology on the contrast of interest.
+
+     Specifically, ‘zlm’ is run with settings:
+
+        • Always: ‘ebayes = FALSE, strictConvergence = FALSE’
+
+        • Additionally:
+
+            • when there is a random effect given (default) to
+              ‘add.random.effect’: ‘method='glmer', fitArgsD =
+              list(nAGQ = 0)’
+
+            • when there is no random effect given to
+              ‘add.random.effect’: ‘method='glm'’
+
+Author(s):
+
+     Daniel Bunis and Ravi Patel
+

From 5e0ec494900bf4f6cd785f9c8ec86fd941267185 Mon Sep 17 00:00:00 2001
From: dtm2451 <daniel.bunis@ucsf.edu>
Date: Fri, 23 Aug 2024 13:43:21 -0400
Subject: [PATCH 5/5] undo Index edit from the branch

---
 README.md | 1 -
 1 file changed, 1 deletion(-)

diff --git a/README.md b/README.md
index d0c14dd..267f898 100644
--- a/README.md
+++ b/README.md
@@ -33,4 +33,3 @@ you would like to add:
 | File | Purpose | Maintainer | Library |
 | --- | --- | --- | --- |
 | single_cell/add_module_score_from_excel_gene_sets.R | R code for reading gene sets from an excel file, running Seurat::AddModuleScore, and visualizing the results | Dan | No |
-| single_cell/RNAseq/dgeMAST_function.R | R function for running DGE with MAST | Dan | No |