diff --git a/.Rbuildignore b/.Rbuildignore new file mode 100644 index 0000000..bc67620 --- /dev/null +++ b/.Rbuildignore @@ -0,0 +1,9 @@ +^.*\.Rproj$ +^\.Rproj\.user$ +^\.git* +^\.idea* +^nimbleR.iml$ +^nimbleR.*\.tar\.gz$ +^nimbleR.*\.zip$ +^\.github$ +.dockerignore \ No newline at end of file diff --git a/.github/workflows/R-CMD-check.yaml b/.github/workflows/R-CMD-check.yaml new file mode 100644 index 0000000..5d02de0 --- /dev/null +++ b/.github/workflows/R-CMD-check.yaml @@ -0,0 +1,40 @@ +on: + workflow_dispatch: + push: + branches: [ main ] + pull_request: + +name: R Build and Checks + +jobs: + R-CMD-check: + runs-on: ubuntu-${{ matrix.config.os }} + + name: ubuntu-${{ matrix.config.os }} (${{ matrix.config.r }} / ${{ matrix.config.bioc }}) + + strategy: + fail-fast: false + matrix: + config: + - { os: 24.04, r: '4.4', bioc: '3.20' } + + env: + R_REMOTES_NO_ERRORS_FROM_WARNINGS: true + GITHUB_PAT: ${{ secrets.GITHUB_TOKEN }} + + steps: + - uses: actions/checkout@v4 + + - uses: bimberlabinternal/DevOps/githubActions/r-gh-setup@master + with: + r_version: ${{ matrix.config.r }} + bioc_version: ${{ matrix.config.bioc }} + cache_version: ${{ secrets.CACHE_VERSION }} + env: + GITHUB_PAT: ${{ secrets.GITHUB_TOKEN }} + + - uses: r-lib/actions/check-r-package@v2 + with: + args: 'c("--no-manual")' + env: + _R_CHECK_CRAN_INCOMING_: false diff --git a/.gitignore b/.gitignore index e75435c..57d7a7d 100644 --- a/.gitignore +++ b/.gitignore @@ -47,3 +47,7 @@ po/*~ # RStudio Connect folder rsconnect/ + +*.tar.gz +*.zip +install.bat \ No newline at end of file diff --git a/.idea/.gitignore b/.idea/.gitignore new file mode 100644 index 0000000..13566b8 --- /dev/null +++ b/.idea/.gitignore @@ -0,0 +1,8 @@ +# Default ignored files +/shelf/ +/workspace.xml +# Editor-based HTTP Client requests +/httpRequests/ +# Datasource local storage ignored files +/dataSources/ +/dataSources.local.xml diff --git a/.idea/misc.xml b/.idea/misc.xml new file mode 100644 index 0000000..07115cd --- /dev/null +++ b/.idea/misc.xml @@ -0,0 +1,6 @@ + + + + + + \ No newline at end of file diff --git a/.idea/modules.xml b/.idea/modules.xml new file mode 100644 index 0000000..5315fbb --- /dev/null +++ b/.idea/modules.xml @@ -0,0 +1,8 @@ + + + + + + + + \ No newline at end of file diff --git a/.idea/nimbleR.iml b/.idea/nimbleR.iml new file mode 100644 index 0000000..d6ebd48 --- /dev/null +++ b/.idea/nimbleR.iml @@ -0,0 +1,9 @@ + + + + + + + + + \ No newline at end of file diff --git a/.idea/vcs.xml b/.idea/vcs.xml new file mode 100644 index 0000000..35eb1dd --- /dev/null +++ b/.idea/vcs.xml @@ -0,0 +1,6 @@ + + + + + + \ No newline at end of file diff --git a/DESCRIPTION b/DESCRIPTION new file mode 100644 index 0000000..70a9809 --- /dev/null +++ b/DESCRIPTION @@ -0,0 +1,31 @@ +Package: nimbleR +Type: Package +Title: Tools to use nimble data with seurat objects +Version: 1.0.0 +Author: Bimber Lab +Maintainer: The package maintainer +Description: An R package to format and append nimble RNA-seq count data to seurat objects. +License: MIT +Encoding: UTF-8 +Depends: + R (>= 4.3.0) +Imports: + Seurat, + Matrix, + egg, + scales, + methods, + tidyr, + dplyr, + ggplot2 +Suggests: + RIRA, + SeuratData, + devtools, + testthat (>= 2.1.0) +Remotes: + SeuratData=satijalab/seurat-data, + bimberlab/RIRA +RoxygenNote: 7.3.2 +URL: https://github.com/BimberLab/nimbleR +BugReports: https://github.com/BimberLab/nimbleR/issues diff --git a/LICENSE b/LICENSE deleted file mode 100644 index 9403aa4..0000000 --- a/LICENSE +++ /dev/null @@ -1,21 +0,0 @@ -MIT License - -Copyright (c) 2025 Bimber Lab - -Permission is hereby granted, free of charge, to any person obtaining a copy -of this software and associated documentation files (the "Software"), to deal -in the Software without restriction, including without limitation the rights -to use, copy, modify, merge, publish, distribute, sublicense, and/or sell -copies of the Software, and to permit persons to whom the Software is -furnished to do so, subject to the following conditions: - -The above copyright notice and this permission notice shall be included in all -copies or substantial portions of the Software. - -THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR -IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, -FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE -AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER -LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, -OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE -SOFTWARE. diff --git a/NAMESPACE b/NAMESPACE new file mode 100644 index 0000000..ec00965 --- /dev/null +++ b/NAMESPACE @@ -0,0 +1,8 @@ +# Generated by roxygen2: do not edit by hand + +export(AppendNimbleCounts) +export(LogNormalizeUsingAlternateAssay) +import(Seurat) +import(dplyr) +import(ggplot2) +importFrom(tidyr,pivot_wider) diff --git a/R/NimbleAppend.R b/R/NimbleAppend.R new file mode 100644 index 0000000..637ac18 --- /dev/null +++ b/R/NimbleAppend.R @@ -0,0 +1,327 @@ +utils::globalVariables( + names = c('V1','V2','V3'), + package = 'nimbleR', + add = TRUE +) + +# @include Normalization.R + +#' @import Seurat +#' @importFrom tidyr pivot_wider + +#' @title AppendNimbleCounts +#' @description Reads a given seurat object and a nimble file, and appends the nimble data to the object. +#' +#' @param seuratObj A Seurat object. +#' @param nimbleFile A nimble file, which is a TSV of feature counts created by nimble +#' @param maxAmbiguityAllowed If provided, any features representing more than ths value will be discarded. For example, 'Feat1,Feat2,Feat3' represents 3 features. maxAmbiguityAllowed=1 results in removal of all ambiguous features. +#' @param targetAssayName The target assay. If this assay exists, features will be appended (and an error thrown if there are duplicates). Otherwise a new assay will be created. +#' @param renameConflictingFeatures If true, when appending to an existing assay, any conflicting feature names will be renamed, appending the value of duplicateFeatureSuffix +#' @param duplicateFeatureSuffix If renameConflictingFeatures is true, this string will be appended to duplicated feature names +#' @param normalizeData If true, data will be normalized after appending/creating the assay. This will default to CellMembrane::LogNormalizeUsingAlternateAssay; however, if assayForLibrarySize equals targetAssayName then Seurat::NormalizeData is used. +#' @param performDietSeurat If true, DietSeurat will be run, which removes existing reductions. This may or may not be required based on your usage, but the default is to perform this if the targetAssay exists. +#' @param assayForLibrarySize If normalizeData is true, then this is the assay used for librarySize when normalizing. If assayForLibrarySize equals targetAssayName, Seurat::NormalizeData is used. +#' @param maxLibrarySizeRatio If normalizeData is true, then this is passed to CellMembrane::LogNormalizeUsingAlternateAssay +#' @param doPlot If true, FeaturePlots will be generated for the appended features +#' @param maxFeaturesToPlot If doPlot is true, this is the maximum number of features to plot +#' @param replaceExistingAssayData If true, any existing data in the targetAssay will be deleted +#' @param featureRenameList An optional named list in the format = . If any are present, the will be renamed to . The intention of this is to recover specific ambiguous classes. +#' @return A modified Seurat object. +#' +#' @import dplyr +#' @export +AppendNimbleCounts <- function(seuratObj, nimbleFile, targetAssayName, maxAmbiguityAllowed = 0, renameConflictingFeatures = TRUE, duplicateFeatureSuffix = ".Nimble", normalizeData = TRUE, performDietSeurat = (targetAssayName %in% names(seuratObj@assays)), assayForLibrarySize = 'RNA', maxLibrarySizeRatio = 0.05, doPlot = FALSE, maxFeaturesToPlot = 40, replaceExistingAssayData = TRUE, featureRenameList = NULL) { + if (!file.exists(nimbleFile)) { + stop(paste0("Nimble file does not exist: ", nimbleFile)) + } + + # Read file and construct df + df <- NULL + tryCatch({ + df <- utils::read.table(nimbleFile, sep="\t", header=FALSE) + }, error = function(e){ + if (conditionMessage(e) != 'no lines available in input') { + stop(e) + } else { + print(paste0('No lines in nimble file: ', nimbleFile)) + df <- data.frame(V1 = character(), V2 = character(), V3 = character()) + } + }) + + # Indicates no data in TSV + if (all(is.null(df))){ + return(seuratObj) + } + + if (sum(df$V1 == "") > 0) { + stop("The nimble data contains blank feature names. This should not occur.") + } + + if (sum(grepl(df$V1, pattern = "^,")) > 0) { + stop("The nimble data contains features with leading commas. This should not occur.") + } + + if (sum(grepl(df$V1, pattern = ",$")) > 0) { + stop("The nimble data contains features with trailing commas. This should not occur.") + } + + d <- as.integer(df$V2) + if (any(is.na(d))){ + stop(paste0('Non-integer count values found, were: ', paste0(utils::head(df$V2[is.na(d)]), collapse = ','))) + } + + if (is.na(maxAmbiguityAllowed) || is.null(maxAmbiguityAllowed)){ + maxAmbiguityAllowed <- Inf + } else if (maxAmbiguityAllowed == 0) { + maxAmbiguityAllowed <- 1 + } + + # Ensure consistent sorting of ambiguous features, and re-group if needed: + if (any(grepl(df$V1, pattern = ','))) { + print('Ensuring consistent feature sort within ambiguous features:') + df$V1 <- unlist(sapply(df$V1, function(y){ + return(paste0(sort(unlist(strsplit(y, split = ','))), collapse = ',')) + })) + + df <- df %>% + group_by(V1, V3) %>% + summarize(V2 = sum(V2)) + + df <- df[c('V1', 'V2', 'V3')] + + paste0('Distinct features after re-grouping: ', length(unique(df$V1))) + } + + if (!all(is.null(featureRenameList))) { + print('Potentially renaming features:') + df$V1 <- as.character(df$V1) + totalRenamed <- 0 + for (featName in names(featureRenameList)) { + if (featName %in% df$V1) { + df$V1[df$V1 == featName] <- featureRenameList[[featName]] + totalRenamed <- totalRenamed + 1 + } + } + + print(paste0('Total features renamed: ', totalRenamed)) + } + + #Remove ambiguous features + totalHitsByRow <- sapply(df$V1, function(y){ + return(length(unlist(strsplit(y, split = ',')))) + }) + + ambigFeatRows <- totalHitsByRow > maxAmbiguityAllowed + if (sum(ambigFeatRows) > 0) { + print(paste0('Dropping ', sum(ambigFeatRows), ' rows with ambiguous features (>', maxAmbiguityAllowed, '), ', sum(ambigFeatRows),' of ', nrow(df))) + totalUMI <- sum(df$V2) + x <- df$V1[ambigFeatRows] + totalHitsByRow <- totalHitsByRow[ambigFeatRows] + x[totalHitsByRow > 3] <- 'ManyHits' + + x <- sort(table(x), decreasing = T) + x <- data.frame(Feature = names(x), Total = as.numeric(unname(x))) + + x$Fraction <- x$Total / totalUMI + x <- x[x$Fraction > 0.005,] + + if (nrow(x) > 0) { + x$Name <- substr(x$Feature, start = 1, stop = 40) + x$Name[x$Name != x$Feature] <- paste0(x$Name[x$Name != x$Feature], '..') + x$Total <- paste0(x$Total, ' (', scales::percent(x$Total / totalUMI, accuracy = 0.001), ')') + + print('Top ambiguous combinations:') + print(utils::head(x[c('Name', 'Total')], n = 100)) + } + + df <- df[!ambigFeatRows, , drop = F] + paste0('Distinct features after pruning: ', length(unique(df$V1))) + } + + if (any(duplicated(df[c('V1','V3')]))) { + print(paste0('Duplicate cell/features found. Rows at start: ', nrow(df))) + df <- df %>% + group_by(V1, V3) %>% + summarize(V2 = sum(V2)) + + df <- df[c('V1', 'V2', 'V3')] + + print(paste0('After re-grouping: ', nrow(df))) + } + + tryCatch({ + # Group to ensure we have one value per combination: + d <- as.integer(df$V2) + if (any(is.na(d))){ + stop(paste0('Non-integer count values found, were: ', paste0(df$V2[is.na(d)], collapse = ','))) + } + rm(d) + + paste0('Distinct features: ', length(unique(df$V1))) + + df <- tidyr::pivot_wider(df, names_from=V3, values_from=V2, values_fill=0) + }, error = function(e){ + utils::write.table(df, file = 'debug.nimble.txt.gz', sep = '\t', quote = F, row.names = F) + + print(paste0('Error pivoting input data for assay:', targetAssayName, ', results saved to: debug.nimble.txt.gz')) + print(conditionMessage(e)) + traceback() + e$message <- paste0('Error pivoting nimble data. target assay: ', targetAssayName) + stop(e) + }) + + if (replaceExistingAssayData && targetAssayName %in% names(seuratObj@assays)) { + print('Replacing existing assay') + seuratObj@assays[[targetAssayName]] <- NULL + } + + appendToExistingAssay <- targetAssayName %in% names(seuratObj@assays) + + # Remove barcodes from nimble that aren't in seurat + seuratBarcodes <- colnames(seuratObj@assays[[Seurat::DefaultAssay(seuratObj)]]) + barcodeDiff <- colnames(df) %in% seuratBarcodes + barcodeDiff[1] <- TRUE # retain feature name + numColsToDrop <- length(barcodeDiff) - sum(barcodeDiff) + if (numColsToDrop > 0) { + print(paste0('Dropping ', numColsToDrop, ' cell barcodes not in the seurat object (out of ', (ncol(df)-1), ')')) + } + df <- df[barcodeDiff] + + # Fill zeroed barcodes that are in seurat but not in nimble + zeroedBarcodes <- setdiff(seuratBarcodes, colnames(df)[-1]) + print(paste0('Total cells lacking nimble data: ', length(zeroedBarcodes), ' of ', length(seuratBarcodes), ' cells')) + for (barcode in zeroedBarcodes) { + df[barcode] <- 0 + } + + # Cast nimble df to matrix + featureNames <- df$V1 + if (any(duplicated(featureNames))) { + stop('Error, there were duplicate feature names') + } + + df <- subset(df, select=-(V1)) + m <- Seurat::as.sparse(df) + dimnames(m) <- list(featureNames, colnames(df)) + if (is.null(colnames(m))) { + stop(paste0('Error: no column names in nimble count matrix, size: ', paste0(dim(m), collapse = ' by '))) + } + + m <- m[,seuratBarcodes, drop=FALSE] # Ensure column order matches + if (appendToExistingAssay && ncol(m) != ncol(seuratObj@assays[[targetAssayName]])) { + stop(paste0('Error parsing nimble data, ncol not equal after subset, was ', ncol(m))) + } + + if (is.null(colnames(m))) { + stop(paste0('Error: no column names in matrix after subset, size: ', paste0(dim(m), collapse = ' by '))) + } + + if (appendToExistingAssay) { + if (any(rownames(m) %in% rownames(seuratObj@assays[[targetAssayName]]))) { + conflicting <- rownames(m)[rownames(m) %in% rownames(seuratObj@assays[[targetAssayName]])] + + if (renameConflictingFeatures) { + print(paste0('The following nimble features have conflicts in the existing assay and will be renamed: ', paste0(conflicting, collapse = ','))) + newNames <- rownames(m) + names(newNames) <- newNames + newNames[conflicting] <- paste0(conflicting, duplicateFeatureSuffix) + newNames <- unname(newNames) + rownames(m) <- newNames + } else { + stop(paste0('The following nimble features conflict with features in the seuratObj: ', paste0(conflicting, collapse = ','))) + } + } + + if (performDietSeurat) { + print('Running DietSeurat') + seuratObj <- Seurat::DietSeurat(seuratObj) + } + + # Append nimble matrix to seurat count matrix + existingBarcodes <- colnames(Seurat::GetAssayData(seuratObj, assay = targetAssayName, slot = 'counts')) + if (sum(colnames(m) != existingBarcodes) > 0) { + stop('cellbarcodes do not match on matrices') + } + + # If feature source exists, retain it. Otherwise assume these are from cellranger: + slotName <- .GetAssayMetaSlotName(seuratObj[[targetAssayName]]) + if ('FeatureSource' %in% names(methods::slot(seuratObj@assays[[targetAssayName]], slotName))) { + fs <- methods::slot(seuratObj@assays[[targetAssayName]], slotName)$FeatureSource + } else { + fs <- rep('CellRanger', nrow(seuratObj@assays[[targetAssayName]])) + } + + fs <- c(fs, rep('Nimble', nrow(m))) + + # perform in two steps to avoid warnings: + ad <- Seurat::CreateAssayObject(counts = Seurat::as.sparse(rbind(Seurat::GetAssayData(seuratObj, assay = targetAssayName, slot = 'counts'), m))) + if (targetAssayName != Seurat::DefaultAssay(seuratObj)) { + seuratObj[[targetAssayName]] <- NULL + } + seuratObj[[targetAssayName]] <- ad + + names(fs) <- rownames(seuratObj@assays[[targetAssayName]]) + seuratObj@assays[[targetAssayName]] <- Seurat::AddMetaData(seuratObj@assays[[targetAssayName]], metadata = fs, col.name = 'FeatureSource') + + if (sum(colnames(Seurat::GetAssayData(seuratObj, assay = targetAssayName, slot = 'counts')) != existingBarcodes) > 0) { + stop('cellbarcodes do not match on matrices after assay replacement') + } + } else { + # Add nimble as separate assay + if (any(duplicated(rownames(m)))) { + stop('Error: The count matrix had duplicate rownames') + } + seuratObj[[targetAssayName]] <- Seurat::CreateAssayObject(counts = m, min.features = 0, min.cells = 0) + + fs <- rep('Nimble', nrow(seuratObj@assays[[targetAssayName]])) + names(fs) <- rownames(seuratObj@assays[[targetAssayName]]) + seuratObj@assays[[targetAssayName]] <- Seurat::AddMetaData(seuratObj@assays[[targetAssayName]], metadata = fs, col.name = 'FeatureSource') + } + + if (normalizeData) { + if (targetAssayName == assayForLibrarySize) { + print('Normalizing using Seurat::NormalizeData') + seuratObj <- Seurat::NormalizeData(seuratObj, assay = targetAssayName, verbose = FALSE) + } else { + print(paste0('Normalizing using LogNormalizeUsingAlternateAssay with ', assayForLibrarySize)) + seuratObj <- LogNormalizeUsingAlternateAssay(seuratObj, assay = targetAssayName, assayForLibrarySize = assayForLibrarySize, maxLibrarySizeRatio = maxLibrarySizeRatio) + } + } + + if (doPlot) { + if (!requireNamespace("RIRA", quietly = TRUE)) { + warning("The RIRA package must be installed to perform plotting") + } else { + print('Plotting features') + reductions <- intersect(names(seuratObj@reductions), c('umap', 'tsne', 'wnn')) + if (length(reductions) == 0){ + print('No reductions, cannot plot') + } else { + feats <- paste0(seuratObj[[targetAssayName]]@key, rownames(seuratObj[[targetAssayName]])) + rowSums <- Matrix::rowSums(Seurat::GetAssayData(seuratObj, assay = targetAssayName, layer = 'counts')) + feats <- feats[rowSums > 0] + if (length(feats) == 0) { + print('All features are zero, skipping plot') + } else { + if (length(feats) > maxFeaturesToPlot){ + print(paste0('Too many features, will plot the first: ', maxFeaturesToPlot)) + feats <- feats[1:maxFeaturesToPlot] + } + + RIRA::PlotMarkerSeries(seuratObj, reductions = reductions, features = feats, title = targetAssayName) + } + } + } + } + + return(seuratObj) +} + +.GetAssayMetaSlotName <- function(assayObj) { + slotName <- ifelse('meta.features' %in% methods::slotNames(assayObj), yes = 'meta.features', no = 'meta.data') + if (! slotName %in% methods::slotNames(assayObj)) { + stop(paste0('Assay object lacks slot: ', slotName)) + } + + return(slotName) +} \ No newline at end of file diff --git a/R/Normalization.R b/R/Normalization.R new file mode 100644 index 0000000..607e7c1 --- /dev/null +++ b/R/Normalization.R @@ -0,0 +1,64 @@ +utils::globalVariables( + names = c('lsr'), + package = 'nimbleR', + add = TRUE +) + + +#' @title LogNormalizeUsingAlternateAssay +#' +#' @param seuratObj The seurat object +#' @param assayToNormalize The name of the assay to normalize +#' @param assayForLibrarySize The name of the assay from which to derive library sizes. This will be added to the library size of assayToNormalize. +#' @param scale.factor A scale factor to be applied in normalization +#' @param maxLibrarySizeRatio This normalization relies on the assumption that the library size of the assay being normalized in negligible relative to the assayForLibrarySize. To verify this holds true, the method will error if librarySize(assayToNormalize)/librarySize(assayForLibrarySize) exceeds this value +#' +#' @import ggplot2 +#' @export +LogNormalizeUsingAlternateAssay <- function(seuratObj, assayToNormalize, assayForLibrarySize = 'RNA', scale.factor = 1e4, maxLibrarySizeRatio = 0.01) { + toNormalize <- Seurat::GetAssayData(seuratObj, assayToNormalize, layer = 'counts') + assayForLibrarySizeObj <- Seurat::GetAssayData(seuratObj, assay = assayForLibrarySize, layer = 'counts') + + if (any(colnames(toNormalize) != colnames(assayForLibrarySize))) { + stop(paste0('The assayToNormalize and assayForLibrarySize do not have the same cell names!')) + } + + if (is.null(maxLibrarySizeRatio) || is.na(maxLibrarySizeRatio)) { + maxLibrarySizeRatio <- Inf + } + + margin <- 2 + ncells <- dim(x = toNormalize)[margin] + + start_time <- Sys.time() + assayForLibrarySizeData <- Matrix::colSums(assayForLibrarySizeObj) + ratios <- unlist(sapply(seq_len(length.out = ncells), function(i){ + x <- toNormalize[, i] + sumX <- sum(x) + librarySize <- sumX + assayForLibrarySizeData[i] + + lsr <- (sumX / librarySize) + if (lsr > maxLibrarySizeRatio) { + stop(paste0('The ratio of library sizes was above maxLibrarySizeRatio for cell: ', colnames(assayForLibrarySizeObj)[i], ', on assay: ', assayToNormalize, '. Ratio was: ', lsr, ' (', sumX, ' / ', librarySize, ')')) + } + + xnorm <- log1p(x = x / librarySize * scale.factor) + toNormalize[, i] <<- xnorm + + return(lsr) + })) + end_time <- Sys.time() + + print('Normalization time:') + print(end_time - start_time) + + print(ggplot(data.frame(lsr = ratios), aes(x = lsr)) + + geom_histogram() + + ggtitle(paste0("Library size ratios: ", assayToNormalize)) + + egg::theme_article() + ) + + seuratObj <- Seurat::SetAssayData(seuratObj, assay = assayToNormalize, layer = 'data', new.data = toNormalize) + + return(seuratObj) +} \ No newline at end of file diff --git a/README.md b/README.md index cbe2146..2be3f83 100644 --- a/README.md +++ b/README.md @@ -1,2 +1,28 @@ +![R Build and Checks](https://github.com/BimberLab/nimbleR/workflows/R%20Build%20and%20Checks/badge.svg) + # nimbleR -Contains helper functions to append nimble data to Seurat objects +The nimbleR package is a companion to the [nimble aligner](https://github.com/BimberLab/nimble/). The R package is primarily designed to process and append the per-cell alignment data produced by nimble (in TSV format), and append it to a seurat object either as a new assay, or merged with an existing assay. + +It also contains several specialized normalization functions, developed to work with nimble's supplemental alignment data and also MHC/HLA data. + +### Installation + +```{r} +# Install requirements. Other dependencies will be downloaded automatically +install.packages(c('devtools', 'BiocManager', 'remotes'), dependencies = TRUE, ask = FALSE) + +# Updating your Rprofile (i.e. ~/.Rprofile), with the following line will ensure install.packages() pulls from Bioconductor repos: +local({options(repos = BiocManager::repositories())}) + +# Install latest version: +devtools::install_github(repo = 'bimberlab/nimbleR', dependencies = TRUE) +``` + +### Basic Usage + +Append the output from nimble to a Seurat object as a new assay: +```{r} +library(nimbleR) + +seuratObj <- +``` \ No newline at end of file diff --git a/man/AppendNimbleCounts.Rd b/man/AppendNimbleCounts.Rd new file mode 100644 index 0000000..0821f43 --- /dev/null +++ b/man/AppendNimbleCounts.Rd @@ -0,0 +1,58 @@ +% Generated by roxygen2: do not edit by hand +% Please edit documentation in R/NimbleAppend.R +\name{AppendNimbleCounts} +\alias{AppendNimbleCounts} +\title{AppendNimbleCounts} +\usage{ +AppendNimbleCounts( + seuratObj, + nimbleFile, + targetAssayName, + maxAmbiguityAllowed = 0, + renameConflictingFeatures = TRUE, + duplicateFeatureSuffix = ".Nimble", + normalizeData = TRUE, + performDietSeurat = (targetAssayName \%in\% names(seuratObj@assays)), + assayForLibrarySize = "RNA", + maxLibrarySizeRatio = 0.05, + doPlot = FALSE, + maxFeaturesToPlot = 40, + replaceExistingAssayData = TRUE, + featureRenameList = NULL +) +} +\arguments{ +\item{seuratObj}{A Seurat object.} + +\item{nimbleFile}{A nimble file, which is a TSV of feature counts created by nimble} + +\item{targetAssayName}{The target assay. If this assay exists, features will be appended (and an error thrown if there are duplicates). Otherwise a new assay will be created.} + +\item{maxAmbiguityAllowed}{If provided, any features representing more than ths value will be discarded. For example, 'Feat1,Feat2,Feat3' represents 3 features. maxAmbiguityAllowed=1 results in removal of all ambiguous features.} + +\item{renameConflictingFeatures}{If true, when appending to an existing assay, any conflicting feature names will be renamed, appending the value of duplicateFeatureSuffix} + +\item{duplicateFeatureSuffix}{If renameConflictingFeatures is true, this string will be appended to duplicated feature names} + +\item{normalizeData}{If true, data will be normalized after appending/creating the assay. This will default to CellMembrane::LogNormalizeUsingAlternateAssay; however, if assayForLibrarySize equals targetAssayName then Seurat::NormalizeData is used.} + +\item{performDietSeurat}{If true, DietSeurat will be run, which removes existing reductions. This may or may not be required based on your usage, but the default is to perform this if the targetAssay exists.} + +\item{assayForLibrarySize}{If normalizeData is true, then this is the assay used for librarySize when normalizing. If assayForLibrarySize equals targetAssayName, Seurat::NormalizeData is used.} + +\item{maxLibrarySizeRatio}{If normalizeData is true, then this is passed to CellMembrane::LogNormalizeUsingAlternateAssay} + +\item{doPlot}{If true, FeaturePlots will be generated for the appended features} + +\item{maxFeaturesToPlot}{If doPlot is true, this is the maximum number of features to plot} + +\item{replaceExistingAssayData}{If true, any existing data in the targetAssay will be deleted} + +\item{featureRenameList}{An optional named list in the format = . If any are present, the will be renamed to . The intention of this is to recover specific ambiguous classes.} +} +\value{ +A modified Seurat object. +} +\description{ +Reads a given seurat object and a nimble file, and appends the nimble data to the object. +} diff --git a/man/LogNormalizeUsingAlternateAssay.Rd b/man/LogNormalizeUsingAlternateAssay.Rd new file mode 100644 index 0000000..c03cde0 --- /dev/null +++ b/man/LogNormalizeUsingAlternateAssay.Rd @@ -0,0 +1,28 @@ +% Generated by roxygen2: do not edit by hand +% Please edit documentation in R/Normalization.R +\name{LogNormalizeUsingAlternateAssay} +\alias{LogNormalizeUsingAlternateAssay} +\title{LogNormalizeUsingAlternateAssay} +\usage{ +LogNormalizeUsingAlternateAssay( + seuratObj, + assayToNormalize, + assayForLibrarySize = "RNA", + scale.factor = 10000, + maxLibrarySizeRatio = 0.01 +) +} +\arguments{ +\item{seuratObj}{The seurat object} + +\item{assayToNormalize}{The name of the assay to normalize} + +\item{assayForLibrarySize}{The name of the assay from which to derive library sizes. This will be added to the library size of assayToNormalize.} + +\item{scale.factor}{A scale factor to be applied in normalization} + +\item{maxLibrarySizeRatio}{This normalization relies on the assumption that the library size of the assay being normalized in negligible relative to the assayForLibrarySize. To verify this holds true, the method will error if librarySize(assayToNormalize)/librarySize(assayForLibrarySize) exceeds this value} +} +\description{ +LogNormalizeUsingAlternateAssay +} diff --git a/tests/testdata/12345_nimbleCounts.tsv b/tests/testdata/12345_nimbleCounts.tsv new file mode 100644 index 0000000..448500c --- /dev/null +++ b/tests/testdata/12345_nimbleCounts.tsv @@ -0,0 +1,17 @@ +E 3 12345_CCAGCGAAGTCCGTAT +F 3 12345_CCAGCGAAGTCCGTAT +G 3 12345_CCAGCGAAGTCCGTAT +H 3 12345_CCAGCGAAGTCCGTAT +Z,D 3 12345_CCAGCGAAGTCCGTAC +E 1 12345_AAAAAAAAAAAAAAAA +F 1 12345_AAAAAAAAAAAAAAAA +G 1 12345_AAAAAAAAAAAAAAAA +H 1 12345_AAAAAAAAAAAAAAAA +E 2 12345_CAAAAAAAAAAAAAA +F 2 12345_CAAAAAAAAAAAAAA +G 2 12345_CAAAAAAAAAAAAAA +H 2 12345_CAAAAAAAAAAAAAA +E 4 12345_GAAAAAAAAAAAAAAA +F 4 12345_GAAAAAAAAAAAAAAA +G 4 12345_GAAAAAAAAAAAAAAA +H 4 12345_GAAAAAAAAAAAAAAA \ No newline at end of file diff --git a/tests/testdata/nimbleTest.rds b/tests/testdata/nimbleTest.rds new file mode 100644 index 0000000..aac9698 Binary files /dev/null and b/tests/testdata/nimbleTest.rds differ diff --git a/tests/testthat.R b/tests/testthat.R new file mode 100644 index 0000000..b60d808 --- /dev/null +++ b/tests/testthat.R @@ -0,0 +1,4 @@ +library(testthat) +library(nimbleR) + +test_check("nimbleR", reporter = "progress") \ No newline at end of file diff --git a/tests/testthat/test-nimbleappend.R b/tests/testthat/test-nimbleappend.R new file mode 100644 index 0000000..9e52919 --- /dev/null +++ b/tests/testthat/test-nimbleappend.R @@ -0,0 +1,112 @@ +context("Nimble") + +test_that("Nimble Append detects missing input file", { + seuratObj <- Seurat::UpdateSeuratObject(readRDS("../testdata/nimbleTest.rds")) + expect_error(AppendNimbleCounts(seuratObj, "../testdata/nonexistent.tsv", targetAssayName = 'RNA'), "Nimble file does not exist: ../testdata/nonexistent.tsv", fixed=TRUE) +}) + +test_that("Nimble Append deletes blank feature names when appending", { + seuratObj <- Seurat::UpdateSeuratObject(readRDS("../testdata/nimbleTest.rds")) + seuratObj <- AppendNimbleCounts(seuratObj, "../testdata/12345_nimbleCounts.tsv", targetAssayName = 'Nimble', maxLibrarySizeRatio = 1) + expect_false('' %in% rownames(Seurat::GetAssayData(seuratObj, assay = 'RNA', layer = 'counts'))) + expect_true('FeatureSource' %in% names(seuratObj@assays$Nimble@meta.features)) + expect_equal(unique(seuratObj@assays$Nimble@meta.features$FeatureSource), c('Nimble')) +}) + +test_that("Nimble Append works with empty input", { + seuratObj <- Seurat::UpdateSeuratObject(readRDS("../testdata/nimbleTest.rds")) + fn <- tempfile(fileext = '.txt') + file.create(fn) + seuratObj <- AppendNimbleCounts(seuratObj, fn, targetAssayName = 'Nimble') + unlink(fn) + expect_false('Nimble' %in% names(seuratObj@assays)) +}) + +test_that("Nimble Append deletes all barcodes not in Seurat when appending", { + seuratObj <- Seurat::UpdateSeuratObject(readRDS("../testdata/nimbleTest.rds")) + nimbleExclusiveBarcodes <- c("12345_CCAGCGAAGTCCGTAT", "12345_CCAGCGAAGTCCGTAC") + seuratObj <- AppendNimbleCounts(seuratObj, "../testdata/12345_nimbleCounts.tsv", targetAssayName = 'RNA', maxLibrarySizeRatio = 1, replaceExistingAssayData = FALSE) + expect_equal(nimbleExclusiveBarcodes %in% colnames(Seurat::GetAssayData(seuratObj, assay = 'RNA', layer = 'counts')), c(FALSE, FALSE)) + + expect_true('FeatureSource' %in% names(seuratObj@assays$RNA@meta.features)) + expect_equal(seuratObj@assays$RNA@meta.features$FeatureSource, c("CellRanger", "CellRanger", "CellRanger", "CellRanger", "Nimble", "Nimble", "Nimble", "Nimble")) +}) + +test_that("Nimble Append supports rename", { + seuratObj <- Seurat::UpdateSeuratObject(readRDS("../testdata/nimbleTest.rds")) + nimbleExclusiveBarcodes <- c("12345_CCAGCGAAGTCCGTAT", "12345_CCAGCGAAGTCCGTAC") + seuratObj <- AppendNimbleCounts(seuratObj, "../testdata/12345_nimbleCounts.tsv", targetAssayName = 'RNA', + maxLibrarySizeRatio = 1, + featureRenameList = list( + 'E' = 'AA' + ), + replaceExistingAssayData = FALSE, + performDietSeurat = TRUE + ) + + expect_equal(nimbleExclusiveBarcodes %in% colnames(Seurat::GetAssayData(seuratObj, assay = 'RNA', layer = 'counts')), c(FALSE, FALSE)) + + expect_true('FeatureSource' %in% names(seuratObj@assays$RNA@meta.features)) + expect_equal(seuratObj@assays$RNA@meta.features$FeatureSource, c("CellRanger", "CellRanger", "CellRanger", "CellRanger", "Nimble", "Nimble", "Nimble", "Nimble")) + + expect_equal(rownames(Seurat::GetAssayData(seuratObj, assay = 'RNA')), c('A', 'B', 'C', 'D', 'AA', 'F', 'G', 'H')) +}) + +test_that("Nimble Append fills all barcodes in Seurat but not in Nimble when appending", { + seuratObj <- Seurat::UpdateSeuratObject(readRDS("../testdata/nimbleTest.rds")) + seuratExclusiveBarcode <- "12345_TAAAAAAAAAAAAAAA" + seuratObj <- AppendNimbleCounts(seuratObj, "../testdata/12345_nimbleCounts.tsv", targetAssayName = 'RNA', maxLibrarySizeRatio = 1, replaceExistingAssayData = FALSE) + nimbleFeatureCounts <- Seurat::GetAssayData(seuratObj, assay = 'RNA', layer = 'counts')[, seuratExclusiveBarcode][c('E', 'F', 'G', 'H')] + expect_false(FALSE %in% (nimbleFeatureCounts == c(0, 0, 0, 0))) +}) + +test_that("Nimble Append output Seurat object is valid when appending", { + seuratObj <- Seurat::UpdateSeuratObject(readRDS("../testdata/nimbleTest.rds")) + expectedBarcodes <- c("12345_AAAAAAAAAAAAAAAA", "12345_CAAAAAAAAAAAAAA", "12345_GAAAAAAAAAAAAAAA", "12345_TAAAAAAAAAAAAAAA") + expectedFeatures <- c("A", "B", "C", "D", "E", "F", "G", "H") + expectedValues <- list(c(1, 1, 1, 1, 1, 1, 1, 1), c(1, 1, 1, 1, 2, 2, 2, 2), c(1, 1, 1, 1, 4, 4, 4, 4), c(1, 1, 1, 1, 0, 0, 0, 0)) + seuratObj <- AppendNimbleCounts(seuratObj, "../testdata/12345_nimbleCounts.tsv", targetAssayName = 'RNA', maxLibrarySizeRatio = 1, replaceExistingAssayData = FALSE) + expect_equal(colnames(Seurat::GetAssayData(seuratObj, assay = 'RNA', layer = 'counts')), expectedBarcodes) + expect_equal(rownames(Seurat::GetAssayData(seuratObj, assay = 'RNA', layer = 'counts')), expectedFeatures) + + expect_equal(unname(Seurat::GetAssayData(seuratObj, assay = 'RNA', layer = 'counts')[, expectedBarcodes[1]]), expectedValues[[1]]) + expect_equal(unname(Seurat::GetAssayData(seuratObj, assay = 'RNA', layer = 'counts')[, expectedBarcodes[2]]), expectedValues[[2]]) + expect_equal(unname(Seurat::GetAssayData(seuratObj, assay = 'RNA', layer = 'counts')[, expectedBarcodes[3]]), expectedValues[[3]]) + expect_equal(unname(Seurat::GetAssayData(seuratObj, assay = 'RNA', layer = 'counts')[, expectedBarcodes[4]]), expectedValues[[4]]) +}) + +test_that("Nimble Append deletes blank feature names when creating new assay", { + seuratObj <- Seurat::UpdateSeuratObject(readRDS("../testdata/nimbleTest.rds")) + seuratObj <- AppendNimbleCounts(seuratObj, "../testdata/12345_nimbleCounts.tsv", targetAssayName = 'Nimble', maxLibrarySizeRatio = 1) + expect_false('' %in% rownames(Seurat::GetAssayData(seuratObj, assay = 'Nimble', layer = 'counts'))) +}) + +test_that("Nimble Append deletes all barcodes not in Seurat when creating new assay", { + seuratObj <- Seurat::UpdateSeuratObject(readRDS("../testdata/nimbleTest.rds")) + nimbleExclusiveBarcodes <- c("12345_CCAGCGAAGTCCGTAT", "12345_CCAGCGAAGTCCGTAC") + seuratObj <- AppendNimbleCounts(seuratObj, "../testdata/12345_nimbleCounts.tsv", targetAssayName = 'Nimble', maxLibrarySizeRatio = 1) + expect_equal(nimbleExclusiveBarcodes %in% colnames(Seurat::GetAssayData(seuratObj, assay = 'Nimble', layer = 'counts')), c(FALSE, FALSE)) +}) + +test_that("Nimble Append fills all barcodes in Seurat but not in Nimble when creating new assay", { + seuratObj <- Seurat::UpdateSeuratObject(readRDS("../testdata/nimbleTest.rds")) + seuratExclusiveBarcode <- "12345_TAAAAAAAAAAAAAAA" + seuratObj <- AppendNimbleCounts(seuratObj, "../testdata/12345_nimbleCounts.tsv", targetAssayName = 'Nimble', maxLibrarySizeRatio = 1) + nimbleFeatureCounts <- Seurat::GetAssayData(seuratObj, assay = 'Nimble', layer = 'counts')[, seuratExclusiveBarcode] + expect_equal(FALSE %in% (nimbleFeatureCounts == c(0, 0, 0, 0)), FALSE) +}) + +test_that("Nimble Append output Seurat object is valid when creating new assay", { + seuratObj <- Seurat::UpdateSeuratObject(readRDS("../testdata/nimbleTest.rds")) + expectedBarcodes <- c("12345_AAAAAAAAAAAAAAAA", "12345_CAAAAAAAAAAAAAA", "12345_GAAAAAAAAAAAAAAA", "12345_TAAAAAAAAAAAAAAA") + expectedFeatures <- c("E", "F", "G", "H") + expectedValues <- list(c(1, 1, 1, 1), c(2, 2, 2, 2), c(4, 4, 4, 4), c(0, 0, 0, 0)) + seuratObj <- AppendNimbleCounts(seuratObj, "../testdata/12345_nimbleCounts.tsv", targetAssayName = 'Nimble', maxLibrarySizeRatio = 1) + expect_equal(colnames(Seurat::GetAssayData(seuratObj, assay = 'Nimble', layer = 'counts')), expectedBarcodes) + expect_equal(rownames(Seurat::GetAssayData(seuratObj, assay = 'Nimble', layer = 'counts')), expectedFeatures) + + expect_equal(unname(Seurat::GetAssayData(seuratObj, assay = 'Nimble', layer = 'counts')[, expectedBarcodes[1]]), expectedValues[[1]]) + expect_equal(unname(Seurat::GetAssayData(seuratObj, assay = 'Nimble', layer = 'counts')[, expectedBarcodes[2]]), expectedValues[[2]]) + expect_equal(unname(Seurat::GetAssayData(seuratObj, assay = 'Nimble', layer = 'counts')[, expectedBarcodes[3]]), expectedValues[[3]]) + expect_equal(unname(Seurat::GetAssayData(seuratObj, assay = 'Nimble', layer = 'counts')[, expectedBarcodes[4]]), expectedValues[[4]]) +}) diff --git a/tests/testthat/test-normalization.R b/tests/testthat/test-normalization.R new file mode 100644 index 0000000..3b6a40b --- /dev/null +++ b/tests/testthat/test-normalization.R @@ -0,0 +1,28 @@ +library(SeuratData) + +context("Normalization") + +getBaseSeuratData <- function(){ + suppressWarnings(SeuratData::InstallData("pbmc3k")) + suppressWarnings(data("pbmc3k")) + seuratObj <- suppressWarnings(Seurat::UpdateSeuratObject(pbmc3k)) + + return(seuratObj) +} + +test_that("LogNormalizeUsingAlternateAssay works as expected", { + seuratObj <- getBaseSeuratData() + + assayToAdd <- Seurat::GetAssayData(seuratObj, assay = 'RNA', layer = 'counts') + assayToAdd <- floor(assayToAdd[c('H6PD', 'H6PD', 'JAK1', 'XAF1', 'TMEM52'),]) + + rownames(assayToAdd) <- paste0('Feature', LETTERS[1:nrow(assayToAdd)]) + + seuratObj[['Norm']] <- Seurat::CreateAssayObject(assayToAdd) + + seuratObj <- LogNormalizeUsingAlternateAssay(seuratObj, assayToNormalize = 'Norm', assayForLibrarySize = 'RNA', maxLibrarySizeRatio = 0.2) + + nd <- Seurat::GetAssayData(seuratObj, assay = 'Norm', layer = 'data') + expect_equal(max(nd[,2]), 1.111578, tolerance = 0.000001) + expect_equal(max(nd[,200]), 1.726143, tolerance = 0.000001) +}) \ No newline at end of file