diff --git a/R/simulator.R b/R/simulator.R index c9b0e1f..9351a8a 100755 --- a/R/simulator.R +++ b/R/simulator.R @@ -39,8 +39,9 @@ NULL #' @param remove_bias_in_counts_method 'read-number' (default) or 'gene-number'; method with which the mRNA bias in counts will be removed #' @param norm_counts boolean; if TRUE the samples simulated with counts will be normalized to CPMs, default is FALSE #' @param seed numeric; fix this value if you want the same cells to be sampled +#' @param generate_celltype_profiles boolean; if TRUE, generate cell-type specific expression profiles in addition to bulk profile, default is FALSE #' -#' @return returns two vectors (one based on counts, one based on tpm; depends on which matrices are present in data) with expression values for all genes in the provided dataset +#' @return returns a list containing simulated_count_vector, simulated_tpm_vector, and optionally celltype_count_profiles and celltype_tpm_profiles (when generate_celltype_profiles is TRUE) #' #' @keywords internal simulate_sample <- function(data, @@ -51,7 +52,8 @@ simulate_sample <- function(data, remove_bias_in_counts, remove_bias_in_counts_method, norm_counts, - seed) { + seed, + generate_celltype_profiles = FALSE) { if (!all(names(simulation_vector) %in% unique(SummarizedExperiment::colData(data)[["cell_type"]]))) { stop("Some cell-types in the provided simulation vector are not in the annotation.") } @@ -86,7 +88,7 @@ simulate_sample <- function(data, }) # annotation - # names(sampled_cells) <- names(simulation_vector) + names(sampled_cells) <- names(simulation_vector) # simulated_annotation <- utils::stack(sampled_cells) # get the corresponding columns from the data @@ -106,7 +108,7 @@ simulate_sample <- function(data, SummarizedExperiment::assays(data_sampled)[["counts"]] <- count_matrix_removed_bias } - # apply scaling vector on the sampled cells in the matrices + ## apply scaling vector on the sampled cells in the matrices scaling_vector <- scaling_vector[unlist(sampled_cells)] simulated_count_vector <- NULL @@ -124,7 +126,7 @@ simulate_sample <- function(data, if (sum_counts > total_read_counts) { tmp_phylo <- phyloseq::otu_table(data.frame(simulated_count_vector), taxa_are_rows = TRUE) - tmp_vec <- data.frame(phyloseq::rarefy_even_depth(physeq = tmp_phylo, sample.size = total_read_counts, trimOTUs = FALSE, ))[, 1] + tmp_vec <- data.frame(phyloseq::rarefy_even_depth(physeq = tmp_phylo, sample.size = total_read_counts, trimOTUs = FALSE, rngseed = seed))[, 1] names(tmp_vec) <- names(simulated_count_vector) simulated_count_vector <- tmp_vec } else if (sum_counts < total_read_counts) { @@ -138,15 +140,81 @@ simulate_sample <- function(data, } # only generate simulation based on TPM if tpm assay is present + simulated_tpm_vector <- NULL if ("tpm" %in% names(SummarizedExperiment::assays(data))) { - simulated_tpm_vector <- Matrix::rowSums(Matrix::t(Matrix::t(SummarizedExperiment::assays(data_sampled)[["tpm"]]) * scaling_vector)) + simulated_tpm_vector_raw <- Matrix::rowSums(Matrix::t(Matrix::t(SummarizedExperiment::assays(data_sampled)[["tpm"]]) * scaling_vector)) # normalize tpm based sample to 1e6 - simulated_tpm_vector <- (simulated_tpm_vector / sum(simulated_tpm_vector)) * 1e6 + simulated_tpm_vector <- (simulated_tpm_vector_raw / sum(simulated_tpm_vector_raw)) * 1e6 + } + + ## Generate cell-type specific profiles if requested + celltype_count_profiles <- NULL + celltype_tpm_profiles <- NULL + + if (generate_celltype_profiles) { + celltype_count_profiles <- list() + celltype_tpm_profiles <- list() + + for (celltype in names(sampled_cells)) { + if (length(sampled_cells[[celltype]]) > 0) { + # Get cells for this cell type + celltype_cells <- sampled_cells[[celltype]] + celltype_data <- data[, celltype_cells] + celltype_scaling <- scaling_vector[celltype_cells] + + # Generate cell-type specific count profile + celltype_count_profile <- Matrix::rowSums(Matrix::t(Matrix::t(SummarizedExperiment::assays(celltype_data)[["counts"]]) * celltype_scaling)) + + # Apply total_read_counts proportionally if specified + if (!is.null(total_read_counts)) { + # Calculate this cell type's proportion of total reads + celltype_fraction <- simulation_vector[celltype] + celltype_target_reads <- total_read_counts * celltype_fraction + + # Scale to target + current_sum <- sum(celltype_count_profile) + if (current_sum > 0) { + if (current_sum > celltype_target_reads) { + # Use rarefaction for downsampling + tmp_phylo <- phyloseq::otu_table(data.frame(celltype_count_profile), taxa_are_rows = TRUE) + tmp_vec <- data.frame(phyloseq::rarefy_even_depth(physeq = tmp_phylo, sample.size = celltype_target_reads, trimOTUs = FALSE, rngseed = seed))[, 1] + names(tmp_vec) <- names(celltype_count_profile) + celltype_count_profile <- tmp_vec + } else { + # Scale up proportionally + celltype_count_profile <- celltype_count_profile * (celltype_target_reads / current_sum) + } + } + } + + celltype_count_profiles[[celltype]] <- celltype_count_profile + + # Generate TPM profile if available + if ("tpm" %in% names(SummarizedExperiment::assays(data))) { + celltype_tpm_profile_raw <- Matrix::rowSums(Matrix::t(Matrix::t(SummarizedExperiment::assays(celltype_data)[["tpm"]]) * celltype_scaling)) + # Store raw TPM values + celltype_tpm_profiles[[celltype]] <- celltype_tpm_profile_raw + } + } + } + + # Scale cell-type TPM profiles + if ("tpm" %in% names(SummarizedExperiment::assays(data)) && !is.null(simulated_tpm_vector)) { + # Calculate the total raw TPM across all cell types + total_raw_tpm <- sum(sapply(celltype_tpm_profiles, sum)) + if (total_raw_tpm > 0) { + # Apply the same scaling to each cell-type profile + normalization_factor <- 1e6 / total_raw_tpm + celltype_tpm_profiles <- lapply(celltype_tpm_profiles, function(x) x * normalization_factor) + } + } } return(list( simulated_count_vector = simulated_count_vector, - simulated_tpm_vector = simulated_tpm_vector + simulated_tpm_vector = simulated_tpm_vector, + celltype_count_profiles = celltype_count_profiles, + celltype_tpm_profiles = celltype_tpm_profiles )) } @@ -178,11 +246,13 @@ simulate_sample <- function(data, #' @param seed numeric; specifiy a seed for RNG. This effects cell sampling; with a fixed seed you will always sample the same cells for each sample (seed value is incrased by 1 for each sample). Default = NA (two simulation runs will sample different cells). #' @param BPPARAM BiocParallel::bpparam() by default; if specific number of threads x want to be used, insert: BiocParallel::MulticoreParam(workers = x) #' @param run_parallel boolean, decide if multi-threaded calculation will be run. FALSE by default +#' @param generate_celltype_profiles boolean; if TRUE, generate cell-type specific expression profiles in addition to bulk profiles, default is FALSE #' #' #' @return named list; \code{bulk} a \link[SummarizedExperiment]{SummarizedExperiment} object, where the assays store the simulated bulk RNAseq datasets. Can hold either one or two assays, depending on how many matrices were present in the dataset #' \code{cell-fractions} is a dataframe with the simulated cell-fractions per sample; -#' \code{scaling_vector} scaling value for each cell in dataset +#' \code{scaling_vector} scaling value for each cell in dataset; +#' \code{celltype_profiles} (optional) when generate_celltype_profiles is TRUE, contains a list with 'counts' and 'tpm' elements, each containing cell-type specific expression profiles #' @export #' #' @examples @@ -279,7 +349,8 @@ simulate_bulk <- function(data, blacklist = NULL, seed = NA, BPPARAM = BiocParallel::bpparam(), - run_parallel = FALSE) { + run_parallel = FALSE, + generate_celltype_profiles = FALSE) { # switch multi-threading on/off if (!run_parallel) { BPPARAM <- BiocParallel::MulticoreParam(workers = 1) @@ -447,7 +518,8 @@ simulate_bulk <- function(data, remove_bias_in_counts = remove_bias_in_counts, remove_bias_in_counts_method = remove_bias_in_counts_method, norm_counts = norm_counts, - seed = ifelse(is.na(seed), seed, seed + i) # ensure that samples still are different if seed is set + seed = ifelse(is.na(seed), seed, seed + i), # ensure that samples still are different if seed is set + generate_celltype_profiles = generate_celltype_profiles ) return(sample) @@ -491,13 +563,79 @@ simulate_bulk <- function(data, # cell_fractions for all simulated samples cell_fractions <- data.frame(t(data.frame(simulation_vector_list)), check.names = FALSE) - message("Finished simulation.") - - return(list( + # Process cell-type specific profiles if generated + result <- list( bulk = se_bulk, cell_fractions = cell_fractions, scaling_vector = scaling_vector - )) + ) + + if (generate_celltype_profiles) { + # Extract all cell-type profiles from all samples + all_celltype_counts <- vapply(all_samples, function(x) list(x[["celltype_count_profiles"]]), list(NULL)) + all_celltype_tpm <- vapply(all_samples, function(x) list(x[["celltype_tpm_profiles"]]), list(NULL)) + + # Get all unique cell types across samples + all_celltypes <- unique(unlist(lapply(all_celltype_counts, names))) + all_celltypes <- all_celltypes[!is.null(all_celltypes)] + + if (length(all_celltypes) > 0) { + # Create matrices for each cell type - counts + celltype_bulk_counts <- list() + for (celltype in all_celltypes) { + # Extract profiles for this cell type across all samples + celltype_profiles_counts <- vapply(all_celltype_counts, function(x) { + if (is.null(x) || is.null(x[[celltype]])) { + # Return zero vector if cell type not present in this sample + zero_vec <- rep(0, nrow(se_bulk)) + names(zero_vec) <- rownames(se_bulk) + return(zero_vec) + } else { + return(x[[celltype]]) + } + }, double(nrow(se_bulk))) + + # Convert to matrix + celltype_matrix <- Matrix::Matrix(celltype_profiles_counts, sparse = TRUE) + colnames(celltype_matrix) <- sample_names + rownames(celltype_matrix) <- rownames(se_bulk) + celltype_bulk_counts[[celltype]] <- celltype_matrix + } + + # Create matrices for each cell type - TPM (if available) + celltype_bulk_tpm <- list() + if ("tpm" %in% names(SummarizedExperiment::assays(data))) { + for (celltype in all_celltypes) { + # Extract profiles for this cell type across all samples + celltype_profiles_tpm <- vapply(all_celltype_tpm, function(x) { + if (is.null(x) || is.null(x[[celltype]])) { + # Return zero vector if cell type not present in this sample + zero_vec <- rep(0, nrow(se_bulk)) + names(zero_vec) <- rownames(se_bulk) + return(zero_vec) + } else { + return(x[[celltype]]) + } + }, double(nrow(se_bulk))) + + # Convert to matrix + celltype_matrix <- Matrix::Matrix(celltype_profiles_tpm, sparse = TRUE) + colnames(celltype_matrix) <- sample_names + rownames(celltype_matrix) <- rownames(se_bulk) + celltype_bulk_tpm[[celltype]] <- celltype_matrix + } + } + + result$celltype_profiles <- list( + counts = celltype_bulk_counts, + tpm = celltype_bulk_tpm + ) + } + } + + message("Finished simulation.") + + return(result) } diff --git a/man/simulate_bulk.Rd b/man/simulate_bulk.Rd index 41576b3..d2d5e8a 100755 --- a/man/simulate_bulk.Rd +++ b/man/simulate_bulk.Rd @@ -26,7 +26,8 @@ simulate_bulk( blacklist = NULL, seed = NA, BPPARAM = BiocParallel::bpparam(), - run_parallel = FALSE + run_parallel = FALSE, + generate_celltype_profiles = FALSE ) } \arguments{ @@ -71,11 +72,14 @@ simulate_bulk( \item{BPPARAM}{BiocParallel::bpparam() by default; if specific number of threads x want to be used, insert: BiocParallel::MulticoreParam(workers = x)} \item{run_parallel}{boolean, decide if multi-threaded calculation will be run. FALSE by default} + +\item{generate_celltype_profiles}{boolean; if TRUE, generate cell-type specific expression profiles in addition to bulk profiles, default is FALSE} } \value{ named list; \code{bulk} a \link[SummarizedExperiment]{SummarizedExperiment} object, where the assays store the simulated bulk RNAseq datasets. Can hold either one or two assays, depending on how many matrices were present in the dataset \code{cell-fractions} is a dataframe with the simulated cell-fractions per sample; -\code{scaling_vector} scaling value for each cell in dataset +\code{scaling_vector} scaling value for each cell in dataset; +\code{celltype_profiles} (optional) when generate_celltype_profiles is TRUE, contains a list with 'counts' and 'tpm' elements, each containing cell-type specific expression profiles } \description{ This function allows you to create a full pseudo-bulk RNAseq dataset. You need to provide a \link[SummarizedExperiment]{SummarizedExperiment} from which the cells diff --git a/man/simulate_sample.Rd b/man/simulate_sample.Rd index 1ad32b5..64a8cd9 100755 --- a/man/simulate_sample.Rd +++ b/man/simulate_sample.Rd @@ -13,7 +13,8 @@ simulate_sample( remove_bias_in_counts, remove_bias_in_counts_method, norm_counts, - seed + seed, + generate_celltype_profiles = FALSE ) } \arguments{ @@ -34,9 +35,11 @@ simulate_sample( \item{norm_counts}{boolean; if TRUE the samples simulated with counts will be normalized to CPMs, default is FALSE} \item{seed}{numeric; fix this value if you want the same cells to be sampled} + +\item{generate_celltype_profiles}{boolean; if TRUE, generate cell-type specific expression profiles in addition to bulk profile, default is FALSE} } \value{ -returns two vectors (one based on counts, one based on tpm; depends on which matrices are present in data) with expression values for all genes in the provided dataset +returns a list containing simulated_count_vector, simulated_tpm_vector, and optionally celltype_count_profiles and celltype_tpm_profiles (when generate_celltype_profiles is TRUE) } \description{ function to sample cells according to given cell-type fractions. This creates a single pseudo-bulk sample by calculating the diff --git a/tests/testthat/test_celltype_profiles.R b/tests/testthat/test_celltype_profiles.R new file mode 100644 index 0000000..3af9a15 --- /dev/null +++ b/tests/testthat/test_celltype_profiles.R @@ -0,0 +1,171 @@ +library(testthat) +library(SummarizedExperiment) +library(Matrix) + +set.seed(123) +counts <- Matrix::Matrix(matrix(stats::rpois(3e5, 5), ncol = 300), sparse = TRUE) +tpm <- Matrix::Matrix(matrix(stats::rpois(3e5, 5), ncol = 300), sparse = TRUE) +tpm <- Matrix::t(1e6 * Matrix::t(tpm) / Matrix::colSums(tpm)) + +colnames(counts) <- paste0("cell_", rep(1:300)) +colnames(tpm) <- paste0("cell_", rep(1:300)) +rownames(counts) <- paste0("gene_", rep(1:1000)) +rownames(tpm) <- paste0("gene_", rep(1:1000)) + +annotation <- data.frame( + "ID" = paste0("cell_", rep(1:300)), + "cell_type" = c( + rep("T cells CD4", 50), + rep("T cells CD8", 50), + rep("Macrophages", 100), + rep("NK cells", 10), + rep("B cells", 70), + rep("Monocytes", 20) + ) +) + +dataset <- SimBu::dataset( + annotation = annotation, + count_matrix = counts, + tpm_matrix = tpm, + name = "test_dataset" +) + +# Test backward compatibility - simulate_bulk without cell-type profiles +test_that("simulate_bulk works without cell-type profiles (backward compatibility)", { + # Test without cell-type profiles (default behavior) + simulation <- SimBu::simulate_bulk( + dataset, + scenario = "even", + scaling_factor = "NONE", + nsamples = 3, + ncells = 100, + generate_celltype_profiles = FALSE + ) + + # Should have traditional output structure + expect_true("bulk" %in% names(simulation)) + expect_true("cell_fractions" %in% names(simulation)) + expect_true("scaling_vector" %in% names(simulation)) + expect_false("celltype_profiles" %in% names(simulation)) + + # Check bulk data structure + expect_s4_class(simulation$bulk, "SummarizedExperiment") + expect_equal(ncol(simulation$bulk), 3) + expect_equal(nrow(simulation$bulk), 1000) +}) + +# Test new feature - simulate_bulk with cell-type profiles +test_that("simulate_bulk works with cell-type profiles enabled", { + # Test with cell-type profiles enabled + simulation <- SimBu::simulate_bulk( + dataset, + scenario = "even", + scaling_factor = "NONE", + nsamples = 3, + ncells = 100, + generate_celltype_profiles = TRUE + ) + + # Should have extended output structure + expect_true("bulk" %in% names(simulation)) + expect_true("cell_fractions" %in% names(simulation)) + expect_true("scaling_vector" %in% names(simulation)) + expect_true("celltype_profiles" %in% names(simulation)) + + # Check cell-type profiles structure + expect_true("counts" %in% names(simulation$celltype_profiles)) + expect_true("tpm" %in% names(simulation$celltype_profiles)) + + # Check that we have profiles for each cell type + expected_celltypes <- c("T cells CD4", "T cells CD8", "Macrophages", "NK cells", "B cells", "Monocytes") + expect_true(all(expected_celltypes %in% names(simulation$celltype_profiles$counts))) + expect_true(all(expected_celltypes %in% names(simulation$celltype_profiles$tpm))) + + # Check dimensions of cell-type specific matrices + for (celltype in expected_celltypes) { + expect_equal(nrow(simulation$celltype_profiles$counts[[celltype]]), 1000) + expect_equal(ncol(simulation$celltype_profiles$counts[[celltype]]), 3) + expect_equal(nrow(simulation$celltype_profiles$tpm[[celltype]]), 1000) + expect_equal(ncol(simulation$celltype_profiles$tpm[[celltype]]), 3) + } +}) + +# Test mathematical consistency +test_that("Cell-type profiles sum to bulk expression", { + simulation <- SimBu::simulate_bulk( + dataset, + scenario = "even", + scaling_factor = "NONE", + nsamples = 2, + ncells = 100, + generate_celltype_profiles = TRUE + ) + + # Test that cell-type profiles sum to bulk expression (counts) + bulk_counts <- as.matrix(SummarizedExperiment::assays(simulation$bulk)[["bulk_counts"]]) + celltype_sum_counts <- Reduce(`+`, lapply(simulation$celltype_profiles$counts, as.matrix)) + + # Allow for small numerical differences due to floating point arithmetic + expect_true(all(abs(bulk_counts - celltype_sum_counts) < 1e-10)) + + # Test that cell-type profiles sum to bulk expression (TPM) + bulk_tpm <- as.matrix(SummarizedExperiment::assays(simulation$bulk)[["bulk_tpm"]]) + celltype_sum_tpm <- Reduce(`+`, lapply(simulation$celltype_profiles$tpm, as.matrix)) + + # Allow for small numerical differences due to floating point arithmetic and normalization + expect_true(all(abs(bulk_tpm - celltype_sum_tpm) < 1e-6)) +}) + +# Test with custom scenario +test_that("Cell-type profiles work with custom scenarios", { + # Custom scenario with specific fractions + fractions <- data.frame( + "T cells CD4" = c(0.3, 0.2), + "Macrophages" = c(0.5, 0.6), + "B cells" = c(0.2, 0.2), + check.names = FALSE + ) + + simulation <- SimBu::simulate_bulk( + dataset, + scenario = "custom", + custom_scenario_data = fractions, + scaling_factor = "NONE", + nsamples = 2, + ncells = 100, + generate_celltype_profiles = TRUE + ) + + # Check that only the specified cell types have non-zero profiles + specified_celltypes <- c("T cells CD4", "Macrophages", "B cells") + for (celltype in specified_celltypes) { + expect_true(celltype %in% names(simulation$celltype_profiles$counts)) + # Check that profiles are not all zeros + expect_true(any(as.matrix(simulation$celltype_profiles$counts[[celltype]]) > 0)) + } +}) + +# Test edge cases +test_that("Cell-type profiles handle edge cases correctly", { + # Test pure scenario (only one cell type) + simulation_pure <- SimBu::simulate_bulk( + dataset, + scenario = "pure", + pure_cell_type = "B cells", + scaling_factor = "NONE", + nsamples = 2, + ncells = 100, + generate_celltype_profiles = TRUE + ) + + # Only B cells should have non-zero profiles + expect_true("B cells" %in% names(simulation_pure$celltype_profiles$counts)) + expect_true(any(as.matrix(simulation_pure$celltype_profiles$counts[["B cells"]]) > 0)) + + # Other cell types should have zero profiles if they exist + other_celltypes <- setdiff(names(simulation_pure$celltype_profiles$counts), "B cells") + for (celltype in other_celltypes) { + expect_true(all(as.matrix(simulation_pure$celltype_profiles$counts[[celltype]]) == 0)) + } +})