EngreitzLab · kaybrand · Apr 16, 2026 · Apr 16, 2026 · Apr 16, 2026 · Apr 16, 2026
diff --git a/workflow/scripts/feature_computation/compute_arc_e2g.R b/workflow/scripts/feature_computation/compute_arc_e2g.R
@@ -1,7 +1,6 @@
 # Load required packages
 suppressPackageStartupMessages({
   library(GenomicRanges)
-  library(genomation)
   library(data.table)
 })
 
@@ -87,12 +86,14 @@ kendall_predictions_path = snakemake@input$kendall_predictions
 arc_predictions_path = snakemake@output$arc_predictions
 
 # Load ABC data
-pairs.E2G.ABC = readGeneric(abc_predictions_path,
-                            keep.all.metadata = T,
-                            header = T)
+pairs.E2G.ABC = makeGRangesFromDataFrame(fread(abc_predictions_path),
+                                         keep.extra.columns = TRUE,
+                                         starts.in.df.are.0based = TRUE)
 
 # Load Kendall data
-pairs.E2G.Kendall = GRanges(fread(kendall_predictions_path))
+pairs.E2G.Kendall = makeGRangesFromDataFrame(fread(kendall_predictions_path),
+                                              keep.extra.columns = TRUE,
+                                              starts.in.df.are.0based = TRUE)
 pairs.E2G.Kendall = 
   pairs.E2G.Kendall[!is.na(mcols(pairs.E2G.Kendall)[,"Kendall"])]
 
@@ -123,8 +124,11 @@ mcols(pairs.E2G.ABC)[,c("mean_log_normalized_rna",
                                          "RnaPseudobulkTPM")]
 
 # Write output to file
+# GRanges starts are 1-based inclusive; subtract 1 to restore the
+# 0-based BED convention of the input file.
 df.output = as.data.frame(pairs.E2G.ABC)
 colnames(df.output)[1] = "chr"
+df.output$start = df.output$start - 1L
 fwrite(df.output,
        file = arc_predictions_path,
        row.names = F,

diff --git a/workflow/scripts/feature_computation/compute_kendall.R b/workflow/scripts/feature_computation/compute_kendall.R
@@ -3,7 +3,6 @@
 # Load required packages
 suppressPackageStartupMessages({
   library(GenomicRanges)
-  library(genomation)
   library(foreach)
   library(Signac)
   library(Seurat)
@@ -215,9 +214,9 @@ gex_out_path = snakemake@output$all_gex
 cores = as.integer(snakemake@threads)
 
 # Load candidate E-G pairs
-pairs.E2G = readGeneric(kendall_pairs_path,
-                        keep.all.metadata = T,
-                        header = T)
+pairs.E2G = makeGRangesFromDataFrame(fread(kendall_pairs_path),
+                                     keep.extra.columns = TRUE,
+                                     starts.in.df.are.0based = TRUE)
 
 # Load scATAC matrix
 matrix.atac_count = readRDS(atac_matrix_path)
@@ -301,7 +300,7 @@ df.pairs.E2G =
                               "RnaDetectedPercent",
                               "RnaPseudobulkTPM",
                               "Kendall")]
-colnames(df.pairs.E2G) = 
+colnames(df.pairs.E2G) =
   c("chr",
     "start",
     "end",
@@ -312,6 +311,9 @@ colnames(df.pairs.E2G) =
     "RnaDetectedPercent",
     "RnaPseudobulkTPM",
     "Kendall")
+# GRanges starts are 1-based inclusive; subtract 1 to restore the
+# 0-based BED convention of the input kendall_pairs file.
+df.pairs.E2G$start = df.pairs.E2G$start - 1L
 fwrite(df.pairs.E2G,
        file = kendall_predictions_path,
        row.names = F,

diff --git a/workflow/scripts/feature_computation/generate_atac_matrix.R b/workflow/scripts/feature_computation/generate_atac_matrix.R
@@ -2,8 +2,8 @@
 
 # Load required packages
 suppressPackageStartupMessages({
-  library(genomation)
   library(GenomicRanges)
+  library(data.table)
   library(Signac)
   library(anndata)
   library(tools)
@@ -19,10 +19,13 @@ atac_matrix_path = snakemake@output[["atac_matrix_path"]]
 
 max_cell_count = snakemake@params[["max_cell_count"]]
 
-# Read the enhancer-gene pairs and extract unique peaks
-pairs.e2g = readGeneric(kendall_pairs_path,
-                        keep.all.metadata = T,
-                        header = T)
+# Read the enhancer-gene pairs and extract unique peaks.
+# Only chr/start/end (for FeatureMatrix) and PeakName (for dedup and
+# row naming) are used; skip TargetGene and PairName.
+pairs.e2g = makeGRangesFromDataFrame(
+  fread(kendall_pairs_path, select = c("chr","start","end","PeakName")),
+  keep.extra.columns = TRUE,
+  starts.in.df.are.0based = TRUE)
 bed.peaks = pairs.e2g[!duplicated(mcols(pairs.e2g)[,"PeakName"])]
 mcols(bed.peaks) = NULL
 
@@ -76,6 +79,15 @@ atac.matrix <- FeatureMatrix(
   features = bed.peaks,
   cells = cells.use
 )
+# Signac::FeatureMatrix names rows via GRangesToString (1-based start).
+# Rewrite rownames using the 0-based BED start so they match the
+# PeakName column in Pairs.Kendall.tsv.gz, which compute_kendall.R
+# uses to subset the matrix. FeatureMatrix preserves the order of
+# the input `features`, so this is a positional rename.
+rownames(atac.matrix) <- paste(seqnames(bed.peaks),
+                               start(bed.peaks) - 1L,
+                               end(bed.peaks),
+                               sep = "-")
 message("Example cell in ATAC matrix: ", colnames(atac.matrix)[1])
 
 

diff --git a/workflow/scripts/feature_computation/make_kendall_pairs.R b/workflow/scripts/feature_computation/make_kendall_pairs.R
@@ -3,7 +3,6 @@
 
 # Load required packages
 suppressPackageStartupMessages({
-  library(genomation)
   library(GenomicRanges)
   library(data.table)
 })
@@ -14,13 +13,18 @@ allPutative_path = snakemake@input[["allPutative"]]
 kendallPairs_path = snakemake@output[["kendallPairs"]]
 
 
-# Read the narrowPeak
-bed.narrowPeak = readGeneric(narrowPeak_path)
+# Read the narrowPeak (only chr/start/end are used downstream)
+bed.narrowPeak = makeGRangesFromDataFrame(
+  fread(narrowPeak_path, select = 1:3, col.names = c("chr","start","end")),
+  starts.in.df.are.0based = TRUE)
 
-# Read the Enhancer-Gene pairs in ABC prediction
-bed.allPutative = readGeneric(allPutative_path,
-                              keep.all.metadata = T,
-                              header = T)
+# Read the Enhancer-Gene pairs in ABC prediction.
+# Only chr/start/end (for findOverlaps) and TargetGene (assigned to
+# output pairs) are used; skip the ~20 other ABC columns.
+bed.allPutative = makeGRangesFromDataFrame(
+  fread(allPutative_path, select = c("chr","start","end","TargetGene")),
+  keep.extra.columns = TRUE,
+  starts.in.df.are.0based = TRUE)
 
 # Find overlaps between narrowPeak and enhancers in ABC prediction
 overlaps.res = findOverlaps(bed.narrowPeak,
@@ -30,10 +34,12 @@ overlaps.res = findOverlaps(bed.narrowPeak,
 pairs.e2g = bed.narrowPeak[overlaps.res@from]
 mcols(pairs.e2g)[,"TargetGene"] = mcols(bed.allPutative)[overlaps.res@to,"TargetGene"]
 
-# Generate peak name
-mcols(pairs.e2g)[,"PeakName"] = 
+# Generate peak name. GRanges starts are 1-based inclusive; subtract 1
+# so PeakName uses the original 0-based BED start (matches ATAC matrix
+# rownames written by generate_atac_matrix.R).
+mcols(pairs.e2g)[,"PeakName"] =
   paste(seqnames(pairs.e2g),
-        start(pairs.e2g),
+        start(pairs.e2g) - 1L,
         end(pairs.e2g),
         sep = "-")
 
@@ -57,14 +63,18 @@ df.pairs.e2g =
                               "PairName")]
 
 # Rename the columns for clarity
-colnames(df.pairs.e2g) = 
+colnames(df.pairs.e2g) =
   c("chr",
     "start",
     "end",
     "TargetGene",
     "PeakName",
     "PairName")
 
+# GRanges starts are 1-based inclusive; subtract 1 to restore the
+# 0-based BED convention of the input narrowPeak/AllPutative files.
+df.pairs.e2g$start = df.pairs.e2g$start - 1L
+
 # Write the enhancer-gene pairs to a file
 fwrite(df.pairs.e2g,
        file = kendallPairs_path,