From 45279bbacb30f2335089877a018725761e73c25d Mon Sep 17 00:00:00 2001
From: Habib Rehman <harehman@iu.edu>
Date: Mon, 26 Jan 2026 02:18:11 -0500
Subject: [PATCH 1/2] Added SPLIT, tentatively works

---
 .../split_correction/config.vsh.yaml          | 54 +++++++++++
 .../split_correction/script.R                 | 95 +++++++++++++++++++
 2 files changed, 149 insertions(+)
 create mode 100644 src/methods_expression_correction/split_correction/config.vsh.yaml
 create mode 100644 src/methods_expression_correction/split_correction/script.R

diff --git a/src/methods_expression_correction/split_correction/config.vsh.yaml b/src/methods_expression_correction/split_correction/config.vsh.yaml
new file mode 100644
index 00000000..7137954b
--- /dev/null
+++ b/src/methods_expression_correction/split_correction/config.vsh.yaml
@@ -0,0 +1,54 @@
+__merge__: /src/api/comp_method_expression_correction.yaml
+
+name: split_correction
+label: "SPLIT"
+summary: "Correct doublet/misegmented cells using SPLIT"
+description: "SPLIT (Spatial Purification of Layered Intracellular Transcripts) is a novel method that integrates snRNA-seq with RCTD deconvolution to enhance signal purity. SPLIT effectively resolves mixed transcriptomic signals, improving background correction and cell-type resolution."
+links:
+  documentation: "https://github.com/bdsc-tds/SPLIT"
+  repository: "https://github.com/bdsc-tds/SPLIT"
+references:
+  doi: "10.1101/2025.04.23.649965"
+
+# arguments:
+#   - name: --celltype_key
+#     required: false
+#     direction: input
+#     type: string
+#     default: cell_type
+
+resources:
+  - type: r_script
+    path: script.R
+
+engines:
+  - type: docker
+    image: openproblems/base_r:1
+    setup:
+      #- type: docker
+      #  run: |
+      #    apt-get update && apt-get install -y wget
+      - type: r
+        bioc: [anndataR, rhdf5, devtools, scater]
+      #- type: r
+      #  bioc: [SummarizedExperiment,SingleCellExperiment,SpatialExperiment]
+      #  bioc_force_install: true
+      - type: docker
+        run: |
+          Rscript -e "BiocManager::install('SingleCellExperiment', type = 'source', force = TRUE, ask = FALSE); options(timeout = 600000000); devtools::install_github('dmcable/spacexr', build_vignettes = FALSE); devtools::install_github('bdsc-tds/SPLIT')"
+
+      # SingleCellExperiment part can probably be left out again in the future. It currently fixes a bug described in these issues:
+      # https://github.com/drighelli/SpatialExperiment/issues/171
+      # https://github.com/satijalab/seurat/issues/9889
+      # The reinstall of SingleCellExperiment triggers the correct re-install of SpatialExperiment.
+
+      # Is there a better way to install an r package from github?
+      # The 6 million timeout thing stops it from breaking
+
+  - type: native
+
+runners:
+  - type: executable
+  - type: nextflow
+    directives:
+      label: [ hightime, highcpu, highmem ]
\ No newline at end of file
diff --git a/src/methods_expression_correction/split_correction/script.R b/src/methods_expression_correction/split_correction/script.R
new file mode 100644
index 00000000..8a92c119
--- /dev/null
+++ b/src/methods_expression_correction/split_correction/script.R
@@ -0,0 +1,95 @@
+library(spacexr)
+library(Matrix)
+library(SingleCellExperiment)
+library(anndataR)
+library(SPLIT)
+library(Seurat)
+library(scuttle)
+
+## VIASH START
+par <- list(
+  "input_spatial_with_cell_types" = "task_ist_preprocessing/resources_test/task_ist_preprocessing/mouse_brain_combined/spatial_aggregated_counts.h5ad",
+  "input_scrnaseq_reference"= "task_ist_preprocessing/resources_test/task_ist_preprocessing/mouse_brain_combined/scrnaseq_reference.h5ad",
+  "output" = "task_ist_preprocessing/tmp/split_corrected.h5ad"
+)
+
+meta <- list(
+  'cpus': 4,
+)
+
+## VIASH END
+
+# Read the input h5ad file and convert to SingleCellExperiment and Seurat
+sce <- read_h5ad(par$input_spatial_with_cell_types, as = "SingleCellExperiment")
+xe <- read_h5ad(par$input_spatial_with_cell_types, as = "Seurat")
+
+# Extract spatial coordinates and counts matrix
+centroid_x <- colData(sce)$centroid_x
+centroid_y <- colData(sce)$centroid_y
+coords <- data.frame(centroid_x, centroid_y)
+counts <- assay(sce,"counts")
+rownames(coords) <- colData(sce)$cell_id
+puck <- SpatialRNA(coords, counts)
+
+# Read reference scrnaseq
+ref <- read_h5ad(par$input_scrnaseq_reference, as = "SingleCellExperiment")
+
+#filter reference cell types to those with >25 cells
+valid_celltypes <- names(table(colData(ref)$cell_type))[table(colData(ref)$cell_type) >= 25] 
+filtered_ref <- ref[,colData(ref)$cell_type %in% valid_celltypes]
+
+ref_counts <- assay(filtered_ref, "counts")
+# factor to drop filtered cell types
+colData(filtered_ref)$cell_type <- factor(colData(filtered_ref)$cell_type) 
+cell_types <- colData(filtered_ref)$cell_type 
+names(cell_types) <- colnames(ref_counts)
+reference <- Reference(ref_counts, cell_types, min_UMI = 0)
+
+# check cores
+cores <- 1
+if ("cpus" %in% names(meta) && !is.null(meta$cpus)) cores <- meta$cpus
+cat(sprintf("Number of cores: %s\n", cores))
+
+# Run the algorithm
+myRCTD <- create.RCTD(puck, reference, max_cores = cores)
+myRCTD <- run.RCTD(myRCTD, doublet_mode = "doublet")
+
+# Extract results
+# results <- myRCTD@results
+# spatial_cell_types <- results$results_df$first_type 
+# # Include None Spatial cell type for the "reject" cells
+# levels(spatial_cell_types) <- c(levels(spatial_cell_types), "None_sp")
+# spatial_cell_types[results$results_df$spot_class == "reject"] <- "None_sp"
+# names(spatial_cell_types) <- rownames(results$results_df)
+
+# #
+# colData(sce)$cell_type <- "None_sp"
+# colData(sce)[names(spatial_cell_types),"cell_type"] <- as.character(spatial_cell_types)
+
+
+# Post-process RCTD output
+RCTD <- SPLIT::run_post_process_RCTD(myRCTD)
+
+# Run SPLIT purification
+res_split <- SPLIT::purify(
+  counts = GetAssayData(xe, assay = 'RNA', layer = 'counts'), # or any gene x cells counts matrix
+  rctd = RCTD,
+  DO_purify_singlets = TRUE # optional
+)
+
+# create corrected counts layer in original SingleCell object
+assay(sce, "corrected_counts") <- assay(sce, "counts")
+assay(sce, "corrected_counts")[rownames(res_split$purified_counts), colnames(res_split$purified_counts)] <- res_split$purified_counts
+
+#log normalized
+size_factors <- librarySizeFactors(assay(sce, "corrected_counts"))
+
+size_factors[size_factors == 0] <- 1 #why
+
+#print(min(size_factors))
+assay(sce, "normalized") <- assay(logNormCounts(sce, size_factors=size_factors, assay.type = "corrected_counts"),"logcounts")
+
+# Write the final object to h5ad format
+# set to 'w', is this ok?
+dir.create(dirname(par$output), showWarnings = FALSE, recursive = TRUE)
+write_h5ad(sce, par$output, mode = "w")

From 6080a68c75c6816e1771338ea86368d4c9b6b70a Mon Sep 17 00:00:00 2001
From: Habib Rehman <harehman@iu.edu>
Date: Mon, 2 Feb 2026 00:23:56 -0500
Subject: [PATCH 2/2] Fixed filtering and container for SPLIT

---
 .../split_correction/config.vsh.yaml          | 13 ++---
 .../split_correction/script.R                 | 51 +++++++++++--------
 2 files changed, 36 insertions(+), 28 deletions(-)

diff --git a/src/methods_expression_correction/split_correction/config.vsh.yaml b/src/methods_expression_correction/split_correction/config.vsh.yaml
index 7137954b..3c865d11 100644
--- a/src/methods_expression_correction/split_correction/config.vsh.yaml
+++ b/src/methods_expression_correction/split_correction/config.vsh.yaml
@@ -10,12 +10,13 @@ links:
 references:
   doi: "10.1101/2025.04.23.649965"
 
-# arguments:
-#   - name: --celltype_key
-#     required: false
-#     direction: input
-#     type: string
-#     default: cell_type
+arguments:
+  - name: --keep_all_cells
+    required: false
+    direction: input
+    type: boolean
+    default: false
+    description: Whether to keep cells with 0 counts (may cause errors if set to TRUE)
 
 resources:
   - type: r_script
diff --git a/src/methods_expression_correction/split_correction/script.R b/src/methods_expression_correction/split_correction/script.R
index 8a92c119..1678d3ec 100644
--- a/src/methods_expression_correction/split_correction/script.R
+++ b/src/methods_expression_correction/split_correction/script.R
@@ -10,7 +10,8 @@ library(scuttle)
 par <- list(
   "input_spatial_with_cell_types" = "task_ist_preprocessing/resources_test/task_ist_preprocessing/mouse_brain_combined/spatial_aggregated_counts.h5ad",
   "input_scrnaseq_reference"= "task_ist_preprocessing/resources_test/task_ist_preprocessing/mouse_brain_combined/scrnaseq_reference.h5ad",
-  "output" = "task_ist_preprocessing/tmp/split_corrected.h5ad"
+  "output" = "task_ist_preprocessing/tmp/split_corrected.h5ad",
+  "keep_all_cells" = FALSE,
 )
 
 meta <- list(
@@ -23,25 +24,32 @@ meta <- list(
 sce <- read_h5ad(par$input_spatial_with_cell_types, as = "SingleCellExperiment")
 xe <- read_h5ad(par$input_spatial_with_cell_types, as = "Seurat")
 
+# filter out 0 cells
+if (!par$keep_all_cells) {
+  cat("Filtering cells with 0 counts\n")
+  sce <- sce[, colSums(counts(sce)) > 0]
+  xe <- subset(xe, subset = nCount_RNA > 0)
+}
+
 # Extract spatial coordinates and counts matrix
 centroid_x <- colData(sce)$centroid_x
 centroid_y <- colData(sce)$centroid_y
 coords <- data.frame(centroid_x, centroid_y)
-counts <- assay(sce,"counts")
+counts <- assay(sce, "counts")
 rownames(coords) <- colData(sce)$cell_id
 puck <- SpatialRNA(coords, counts)
 
 # Read reference scrnaseq
 ref <- read_h5ad(par$input_scrnaseq_reference, as = "SingleCellExperiment")
 
-#filter reference cell types to those with >25 cells
+#filter reference cell types to those with >25 cells (minimum for RCTD)
 valid_celltypes <- names(table(colData(ref)$cell_type))[table(colData(ref)$cell_type) >= 25] 
 filtered_ref <- ref[,colData(ref)$cell_type %in% valid_celltypes]
 
 ref_counts <- assay(filtered_ref, "counts")
 # factor to drop filtered cell types
-colData(filtered_ref)$cell_type <- factor(colData(filtered_ref)$cell_type) 
-cell_types <- colData(filtered_ref)$cell_type 
+colData(filtered_ref)$cell_type <- factor(colData(filtered_ref)$cell_type)
+cell_types <- colData(filtered_ref)$cell_type
 names(cell_types) <- colnames(ref_counts)
 reference <- Reference(ref_counts, cell_types, min_UMI = 0)
 
@@ -51,45 +59,44 @@ if ("cpus" %in% names(meta) && !is.null(meta$cpus)) cores <- meta$cpus
 cat(sprintf("Number of cores: %s\n", cores))
 
 # Run the algorithm
+cat("Running RCTD\n")
 myRCTD <- create.RCTD(puck, reference, max_cores = cores)
 myRCTD <- run.RCTD(myRCTD, doublet_mode = "doublet")
 
-# Extract results
+# Get the "spot_class" annotation from RCTD
+# cat("Saving RCTD spot_class\n")
 # results <- myRCTD@results
-# spatial_cell_types <- results$results_df$first_type 
-# # Include None Spatial cell type for the "reject" cells
-# levels(spatial_cell_types) <- c(levels(spatial_cell_types), "None_sp")
-# spatial_cell_types[results$results_df$spot_class == "reject"] <- "None_sp"
-# names(spatial_cell_types) <- rownames(results$results_df)
-
-# #
-# colData(sce)$cell_type <- "None_sp"
-# colData(sce)[names(spatial_cell_types),"cell_type"] <- as.character(spatial_cell_types)
-
+# rctd_spot_class <- results$results_df$spot_class
+# names(rctd_spot_class) <- rownames(results$results_df)
+# colData(sce)$RCTD_class <- "not_included"
+# colData(sce)[names(rctd_spot_class),"RCTD_class"] <- as.character(rctd_spot_class)
 
 # Post-process RCTD output
 RCTD <- SPLIT::run_post_process_RCTD(myRCTD)
 
 # Run SPLIT purification
+cat("Running SPLIT\n")
 res_split <- SPLIT::purify(
   counts = GetAssayData(xe, assay = 'RNA', layer = 'counts'), # or any gene x cells counts matrix
   rctd = RCTD,
   DO_purify_singlets = TRUE # optional
 )
 
+
 # create corrected counts layer in original SingleCell object
+cat("Normalizing counts\n")
+
+# First copy in counts
 assay(sce, "corrected_counts") <- assay(sce, "counts")
+
+# Then, replace only the updated cells
 assay(sce, "corrected_counts")[rownames(res_split$purified_counts), colnames(res_split$purified_counts)] <- res_split$purified_counts
 
-#log normalized
+# Library size normalization - see note in resolVI
 size_factors <- librarySizeFactors(assay(sce, "corrected_counts"))
-
-size_factors[size_factors == 0] <- 1 #why
-
-#print(min(size_factors))
 assay(sce, "normalized") <- assay(logNormCounts(sce, size_factors=size_factors, assay.type = "corrected_counts"),"logcounts")
 
 # Write the final object to h5ad format
-# set to 'w', is this ok?
+cat("Writing to h5ad\n")
 dir.create(dirname(par$output), showWarnings = FALSE, recursive = TRUE)
 write_h5ad(sce, par$output, mode = "w")