From 45279bbacb30f2335089877a018725761e73c25d Mon Sep 17 00:00:00 2001 From: Habib Rehman Date: Mon, 26 Jan 2026 02:18:11 -0500 Subject: [PATCH 1/2] Added SPLIT, tentatively works --- .../split_correction/config.vsh.yaml | 54 +++++++++++ .../split_correction/script.R | 95 +++++++++++++++++++ 2 files changed, 149 insertions(+) create mode 100644 src/methods_expression_correction/split_correction/config.vsh.yaml create mode 100644 src/methods_expression_correction/split_correction/script.R diff --git a/src/methods_expression_correction/split_correction/config.vsh.yaml b/src/methods_expression_correction/split_correction/config.vsh.yaml new file mode 100644 index 00000000..7137954b --- /dev/null +++ b/src/methods_expression_correction/split_correction/config.vsh.yaml @@ -0,0 +1,54 @@ +__merge__: /src/api/comp_method_expression_correction.yaml + +name: split_correction +label: "SPLIT" +summary: "Correct doublet/misegmented cells using SPLIT" +description: "SPLIT (Spatial Purification of Layered Intracellular Transcripts) is a novel method that integrates snRNA-seq with RCTD deconvolution to enhance signal purity. SPLIT effectively resolves mixed transcriptomic signals, improving background correction and cell-type resolution." +links: + documentation: "https://github.com/bdsc-tds/SPLIT" + repository: "https://github.com/bdsc-tds/SPLIT" +references: + doi: "10.1101/2025.04.23.649965" + +# arguments: +# - name: --celltype_key +# required: false +# direction: input +# type: string +# default: cell_type + +resources: + - type: r_script + path: script.R + +engines: + - type: docker + image: openproblems/base_r:1 + setup: + #- type: docker + # run: | + # apt-get update && apt-get install -y wget + - type: r + bioc: [anndataR, rhdf5, devtools, scater] + #- type: r + # bioc: [SummarizedExperiment,SingleCellExperiment,SpatialExperiment] + # bioc_force_install: true + - type: docker + run: | + Rscript -e "BiocManager::install('SingleCellExperiment', type = 'source', force = TRUE, ask = FALSE); options(timeout = 600000000); devtools::install_github('dmcable/spacexr', build_vignettes = FALSE); devtools::install_github('bdsc-tds/SPLIT')" + + # SingleCellExperiment part can probably be left out again in the future. It currently fixes a bug described in these issues: + # https://github.com/drighelli/SpatialExperiment/issues/171 + # https://github.com/satijalab/seurat/issues/9889 + # The reinstall of SingleCellExperiment triggers the correct re-install of SpatialExperiment. + + # Is there a better way to install an r package from github? + # The 6 million timeout thing stops it from breaking + + - type: native + +runners: + - type: executable + - type: nextflow + directives: + label: [ hightime, highcpu, highmem ] \ No newline at end of file diff --git a/src/methods_expression_correction/split_correction/script.R b/src/methods_expression_correction/split_correction/script.R new file mode 100644 index 00000000..8a92c119 --- /dev/null +++ b/src/methods_expression_correction/split_correction/script.R @@ -0,0 +1,95 @@ +library(spacexr) +library(Matrix) +library(SingleCellExperiment) +library(anndataR) +library(SPLIT) +library(Seurat) +library(scuttle) + +## VIASH START +par <- list( + "input_spatial_with_cell_types" = "task_ist_preprocessing/resources_test/task_ist_preprocessing/mouse_brain_combined/spatial_aggregated_counts.h5ad", + "input_scrnaseq_reference"= "task_ist_preprocessing/resources_test/task_ist_preprocessing/mouse_brain_combined/scrnaseq_reference.h5ad", + "output" = "task_ist_preprocessing/tmp/split_corrected.h5ad" +) + +meta <- list( + 'cpus': 4, +) + +## VIASH END + +# Read the input h5ad file and convert to SingleCellExperiment and Seurat +sce <- read_h5ad(par$input_spatial_with_cell_types, as = "SingleCellExperiment") +xe <- read_h5ad(par$input_spatial_with_cell_types, as = "Seurat") + +# Extract spatial coordinates and counts matrix +centroid_x <- colData(sce)$centroid_x +centroid_y <- colData(sce)$centroid_y +coords <- data.frame(centroid_x, centroid_y) +counts <- assay(sce,"counts") +rownames(coords) <- colData(sce)$cell_id +puck <- SpatialRNA(coords, counts) + +# Read reference scrnaseq +ref <- read_h5ad(par$input_scrnaseq_reference, as = "SingleCellExperiment") + +#filter reference cell types to those with >25 cells +valid_celltypes <- names(table(colData(ref)$cell_type))[table(colData(ref)$cell_type) >= 25] +filtered_ref <- ref[,colData(ref)$cell_type %in% valid_celltypes] + +ref_counts <- assay(filtered_ref, "counts") +# factor to drop filtered cell types +colData(filtered_ref)$cell_type <- factor(colData(filtered_ref)$cell_type) +cell_types <- colData(filtered_ref)$cell_type +names(cell_types) <- colnames(ref_counts) +reference <- Reference(ref_counts, cell_types, min_UMI = 0) + +# check cores +cores <- 1 +if ("cpus" %in% names(meta) && !is.null(meta$cpus)) cores <- meta$cpus +cat(sprintf("Number of cores: %s\n", cores)) + +# Run the algorithm +myRCTD <- create.RCTD(puck, reference, max_cores = cores) +myRCTD <- run.RCTD(myRCTD, doublet_mode = "doublet") + +# Extract results +# results <- myRCTD@results +# spatial_cell_types <- results$results_df$first_type +# # Include None Spatial cell type for the "reject" cells +# levels(spatial_cell_types) <- c(levels(spatial_cell_types), "None_sp") +# spatial_cell_types[results$results_df$spot_class == "reject"] <- "None_sp" +# names(spatial_cell_types) <- rownames(results$results_df) + +# # +# colData(sce)$cell_type <- "None_sp" +# colData(sce)[names(spatial_cell_types),"cell_type"] <- as.character(spatial_cell_types) + + +# Post-process RCTD output +RCTD <- SPLIT::run_post_process_RCTD(myRCTD) + +# Run SPLIT purification +res_split <- SPLIT::purify( + counts = GetAssayData(xe, assay = 'RNA', layer = 'counts'), # or any gene x cells counts matrix + rctd = RCTD, + DO_purify_singlets = TRUE # optional +) + +# create corrected counts layer in original SingleCell object +assay(sce, "corrected_counts") <- assay(sce, "counts") +assay(sce, "corrected_counts")[rownames(res_split$purified_counts), colnames(res_split$purified_counts)] <- res_split$purified_counts + +#log normalized +size_factors <- librarySizeFactors(assay(sce, "corrected_counts")) + +size_factors[size_factors == 0] <- 1 #why + +#print(min(size_factors)) +assay(sce, "normalized") <- assay(logNormCounts(sce, size_factors=size_factors, assay.type = "corrected_counts"),"logcounts") + +# Write the final object to h5ad format +# set to 'w', is this ok? +dir.create(dirname(par$output), showWarnings = FALSE, recursive = TRUE) +write_h5ad(sce, par$output, mode = "w") From 6080a68c75c6816e1771338ea86368d4c9b6b70a Mon Sep 17 00:00:00 2001 From: Habib Rehman Date: Mon, 2 Feb 2026 00:23:56 -0500 Subject: [PATCH 2/2] Fixed filtering and container for SPLIT --- .../split_correction/config.vsh.yaml | 13 ++--- .../split_correction/script.R | 51 +++++++++++-------- 2 files changed, 36 insertions(+), 28 deletions(-) diff --git a/src/methods_expression_correction/split_correction/config.vsh.yaml b/src/methods_expression_correction/split_correction/config.vsh.yaml index 7137954b..3c865d11 100644 --- a/src/methods_expression_correction/split_correction/config.vsh.yaml +++ b/src/methods_expression_correction/split_correction/config.vsh.yaml @@ -10,12 +10,13 @@ links: references: doi: "10.1101/2025.04.23.649965" -# arguments: -# - name: --celltype_key -# required: false -# direction: input -# type: string -# default: cell_type +arguments: + - name: --keep_all_cells + required: false + direction: input + type: boolean + default: false + description: Whether to keep cells with 0 counts (may cause errors if set to TRUE) resources: - type: r_script diff --git a/src/methods_expression_correction/split_correction/script.R b/src/methods_expression_correction/split_correction/script.R index 8a92c119..1678d3ec 100644 --- a/src/methods_expression_correction/split_correction/script.R +++ b/src/methods_expression_correction/split_correction/script.R @@ -10,7 +10,8 @@ library(scuttle) par <- list( "input_spatial_with_cell_types" = "task_ist_preprocessing/resources_test/task_ist_preprocessing/mouse_brain_combined/spatial_aggregated_counts.h5ad", "input_scrnaseq_reference"= "task_ist_preprocessing/resources_test/task_ist_preprocessing/mouse_brain_combined/scrnaseq_reference.h5ad", - "output" = "task_ist_preprocessing/tmp/split_corrected.h5ad" + "output" = "task_ist_preprocessing/tmp/split_corrected.h5ad", + "keep_all_cells" = FALSE, ) meta <- list( @@ -23,25 +24,32 @@ meta <- list( sce <- read_h5ad(par$input_spatial_with_cell_types, as = "SingleCellExperiment") xe <- read_h5ad(par$input_spatial_with_cell_types, as = "Seurat") +# filter out 0 cells +if (!par$keep_all_cells) { + cat("Filtering cells with 0 counts\n") + sce <- sce[, colSums(counts(sce)) > 0] + xe <- subset(xe, subset = nCount_RNA > 0) +} + # Extract spatial coordinates and counts matrix centroid_x <- colData(sce)$centroid_x centroid_y <- colData(sce)$centroid_y coords <- data.frame(centroid_x, centroid_y) -counts <- assay(sce,"counts") +counts <- assay(sce, "counts") rownames(coords) <- colData(sce)$cell_id puck <- SpatialRNA(coords, counts) # Read reference scrnaseq ref <- read_h5ad(par$input_scrnaseq_reference, as = "SingleCellExperiment") -#filter reference cell types to those with >25 cells +#filter reference cell types to those with >25 cells (minimum for RCTD) valid_celltypes <- names(table(colData(ref)$cell_type))[table(colData(ref)$cell_type) >= 25] filtered_ref <- ref[,colData(ref)$cell_type %in% valid_celltypes] ref_counts <- assay(filtered_ref, "counts") # factor to drop filtered cell types -colData(filtered_ref)$cell_type <- factor(colData(filtered_ref)$cell_type) -cell_types <- colData(filtered_ref)$cell_type +colData(filtered_ref)$cell_type <- factor(colData(filtered_ref)$cell_type) +cell_types <- colData(filtered_ref)$cell_type names(cell_types) <- colnames(ref_counts) reference <- Reference(ref_counts, cell_types, min_UMI = 0) @@ -51,45 +59,44 @@ if ("cpus" %in% names(meta) && !is.null(meta$cpus)) cores <- meta$cpus cat(sprintf("Number of cores: %s\n", cores)) # Run the algorithm +cat("Running RCTD\n") myRCTD <- create.RCTD(puck, reference, max_cores = cores) myRCTD <- run.RCTD(myRCTD, doublet_mode = "doublet") -# Extract results +# Get the "spot_class" annotation from RCTD +# cat("Saving RCTD spot_class\n") # results <- myRCTD@results -# spatial_cell_types <- results$results_df$first_type -# # Include None Spatial cell type for the "reject" cells -# levels(spatial_cell_types) <- c(levels(spatial_cell_types), "None_sp") -# spatial_cell_types[results$results_df$spot_class == "reject"] <- "None_sp" -# names(spatial_cell_types) <- rownames(results$results_df) - -# # -# colData(sce)$cell_type <- "None_sp" -# colData(sce)[names(spatial_cell_types),"cell_type"] <- as.character(spatial_cell_types) - +# rctd_spot_class <- results$results_df$spot_class +# names(rctd_spot_class) <- rownames(results$results_df) +# colData(sce)$RCTD_class <- "not_included" +# colData(sce)[names(rctd_spot_class),"RCTD_class"] <- as.character(rctd_spot_class) # Post-process RCTD output RCTD <- SPLIT::run_post_process_RCTD(myRCTD) # Run SPLIT purification +cat("Running SPLIT\n") res_split <- SPLIT::purify( counts = GetAssayData(xe, assay = 'RNA', layer = 'counts'), # or any gene x cells counts matrix rctd = RCTD, DO_purify_singlets = TRUE # optional ) + # create corrected counts layer in original SingleCell object +cat("Normalizing counts\n") + +# First copy in counts assay(sce, "corrected_counts") <- assay(sce, "counts") + +# Then, replace only the updated cells assay(sce, "corrected_counts")[rownames(res_split$purified_counts), colnames(res_split$purified_counts)] <- res_split$purified_counts -#log normalized +# Library size normalization - see note in resolVI size_factors <- librarySizeFactors(assay(sce, "corrected_counts")) - -size_factors[size_factors == 0] <- 1 #why - -#print(min(size_factors)) assay(sce, "normalized") <- assay(logNormCounts(sce, size_factors=size_factors, assay.type = "corrected_counts"),"logcounts") # Write the final object to h5ad format -# set to 'w', is this ok? +cat("Writing to h5ad\n") dir.create(dirname(par$output), showWarnings = FALSE, recursive = TRUE) write_h5ad(sce, par$output, mode = "w")