From 7be43fd1aa69ac9b282bae30348a1157221b2c2e Mon Sep 17 00:00:00 2001
From: Kayla Brand <kaybrand@stanford.edu>
Date: Thu, 16 Apr 2026 10:30:25 -0700
Subject: [PATCH 1/4] make_kendall_pairs: replace readGeneric with fread to fix
 Lustre mmap segfault
MIME-Version: 1.0
Content-Type: text/plain; charset=UTF-8
Content-Transfer-Encoding: 8bit

genomation::readGeneric() calls vroom::vroom(altrep=lazy), which uses
mmap() to lazily map files into memory. On Lustre (OAK), lock
revocation or client cache eviction can invalidate mmap'd pages,
causing SIGSEGV with SEGV_ACCERR in the vroom C++ layer — a hardware
fault below R's managed memory that cannot be caught or recovered from.

Replace both readGeneric() calls with fread() + makeGRangesFromDataFrame():

  - bed.narrowPeak: fread(select=1:3) reads only chr/start/end,
    matching readGeneric's default keep.all.metadata=FALSE behavior.

  - bed.allPutative: fread(select=c("chr","start","end","TargetGene"))
    reads only the 4 columns actually used (findOverlaps coordinates +
    TargetGene assignment at line 33), skipping ~20 unused ABC columns.
    This recovers the practical benefit of vroom's lazy column parsing
    without mmap.

Both reads now use starts.in.df.are.0based=TRUE to correctly convert
0-based BED [a, b) to 1-based inclusive GRanges [a+1, b]. The old code
loaded BED start directly as a 1-based GRanges start, which elongated
every region by 1bp and caused spurious findOverlaps hits at region
boundaries.

Output coordinate preservation:
  - PeakName construction uses start(gr) - 1L to emit the original
    0-based BED start.
  - df.pairs.e2g$start is decremented by 1 before fwrite, restoring
    0-based BED format in the output TSV.
  - On-disk output is format-identical to previous runs.

Co-Authored-By: Claude Opus 4.6 <noreply@anthropic.com>
---
 .../feature_computation/make_kendall_pairs.R  | 32 ++++++++++++-------
 1 file changed, 21 insertions(+), 11 deletions(-)

diff --git a/workflow/scripts/feature_computation/make_kendall_pairs.R b/workflow/scripts/feature_computation/make_kendall_pairs.R
index 0b434b3..a94246b 100755
--- a/workflow/scripts/feature_computation/make_kendall_pairs.R
+++ b/workflow/scripts/feature_computation/make_kendall_pairs.R
@@ -3,7 +3,6 @@
 
 # Load required packages
 suppressPackageStartupMessages({
-  library(genomation)
   library(GenomicRanges)
   library(data.table)
 })
@@ -14,13 +13,18 @@ allPutative_path = snakemake@input[["allPutative"]]
 kendallPairs_path = snakemake@output[["kendallPairs"]]
 
 
-# Read the narrowPeak
-bed.narrowPeak = readGeneric(narrowPeak_path)
+# Read the narrowPeak (only chr/start/end are used downstream)
+bed.narrowPeak = makeGRangesFromDataFrame(
+  fread(narrowPeak_path, select = 1:3, col.names = c("chr","start","end")),
+  starts.in.df.are.0based = TRUE)
 
-# Read the Enhancer-Gene pairs in ABC prediction
-bed.allPutative = readGeneric(allPutative_path,
-                              keep.all.metadata = T,
-                              header = T)
+# Read the Enhancer-Gene pairs in ABC prediction.
+# Only chr/start/end (for findOverlaps) and TargetGene (assigned to
+# output pairs) are used; skip the ~20 other ABC columns.
+bed.allPutative = makeGRangesFromDataFrame(
+  fread(allPutative_path, select = c("chr","start","end","TargetGene")),
+  keep.extra.columns = TRUE,
+  starts.in.df.are.0based = TRUE)
 
 # Find overlaps between narrowPeak and enhancers in ABC prediction
 overlaps.res = findOverlaps(bed.narrowPeak,
@@ -30,10 +34,12 @@ overlaps.res = findOverlaps(bed.narrowPeak,
 pairs.e2g = bed.narrowPeak[overlaps.res@from]
 mcols(pairs.e2g)[,"TargetGene"] = mcols(bed.allPutative)[overlaps.res@to,"TargetGene"]
 
-# Generate peak name
-mcols(pairs.e2g)[,"PeakName"] = 
+# Generate peak name. GRanges starts are 1-based inclusive; subtract 1
+# so PeakName uses the original 0-based BED start (matches ATAC matrix
+# rownames written by generate_atac_matrix.R).
+mcols(pairs.e2g)[,"PeakName"] =
   paste(seqnames(pairs.e2g),
-        start(pairs.e2g),
+        start(pairs.e2g) - 1L,
         end(pairs.e2g),
         sep = "-")
 
@@ -57,7 +63,7 @@ df.pairs.e2g =
                               "PairName")]
 
 # Rename the columns for clarity
-colnames(df.pairs.e2g) = 
+colnames(df.pairs.e2g) =
   c("chr",
     "start",
     "end",
@@ -65,6 +71,10 @@ colnames(df.pairs.e2g) =
     "PeakName",
     "PairName")
 
+# GRanges starts are 1-based inclusive; subtract 1 to restore the
+# 0-based BED convention of the input narrowPeak/AllPutative files.
+df.pairs.e2g$start = df.pairs.e2g$start - 1L
+
 # Write the enhancer-gene pairs to a file
 fwrite(df.pairs.e2g,
        file = kendallPairs_path,

From fc17ea72a1ad94a315c9792a8db1bc0b7422794e Mon Sep 17 00:00:00 2001
From: Kayla Brand <kaybrand@stanford.edu>
Date: Thu, 16 Apr 2026 10:31:37 -0700
Subject: [PATCH 2/4] generate_atac_matrix: replace readGeneric with fread to
 fix Lustre mmap segfault

Same vroom mmap segfault fix as make_kendall_pairs.R (see prior commit
for full root cause description).

Replace readGeneric() with fread(select=c("chr","start","end","PeakName"))
+ makeGRangesFromDataFrame(starts.in.df.are.0based=TRUE). Only the 4
columns actually used are read; TargetGene and PairName are skipped.

Replace library(genomation) with library(data.table) for fread().

ATAC matrix rowname fix:
  Signac::FeatureMatrix derives rownames via GRangesToString, which
  emits 1-based GRanges starts. Under the old (buggy) code, the
  GRanges 1-based start numerically equalled the 0-based BED start,
  so rownames accidentally matched the 0-based PeakName strings in
  Pairs.Kendall.tsv.gz. With the corrected coordinate loading
  (starts.in.df.are.0based=TRUE), GRanges starts are now bed_start+1,
  so default Signac rownames would be chr-(a+1)-b while PeakName is
  chr-a-b.

  After FeatureMatrix, rownames are explicitly set to
  paste(seqnames, start-1L, end, sep="-") to restore 0-based BED
  naming. FeatureMatrix preserves input feature order, so this is a
  safe positional rename. compute_kendall.R matches
  mcols(bed.E2G)[,"PeakName"] %in% rownames(data.ATAC), so the two
  must agree on coordinate convention.

Co-Authored-By: Claude Opus 4.6 <noreply@anthropic.com>
---
 .../generate_atac_matrix.R                    | 22 ++++++++++++++-----
 1 file changed, 17 insertions(+), 5 deletions(-)

diff --git a/workflow/scripts/feature_computation/generate_atac_matrix.R b/workflow/scripts/feature_computation/generate_atac_matrix.R
index aaa418e..55f459f 100755
--- a/workflow/scripts/feature_computation/generate_atac_matrix.R
+++ b/workflow/scripts/feature_computation/generate_atac_matrix.R
@@ -2,8 +2,8 @@
 
 # Load required packages
 suppressPackageStartupMessages({
-  library(genomation)
   library(GenomicRanges)
+  library(data.table)
   library(Signac)
   library(anndata)
   library(tools)
@@ -19,10 +19,13 @@ atac_matrix_path = snakemake@output[["atac_matrix_path"]]
 
 max_cell_count = snakemake@params[["max_cell_count"]]
 
-# Read the enhancer-gene pairs and extract unique peaks
-pairs.e2g = readGeneric(kendall_pairs_path,
-                        keep.all.metadata = T,
-                        header = T)
+# Read the enhancer-gene pairs and extract unique peaks.
+# Only chr/start/end (for FeatureMatrix) and PeakName (for dedup and
+# row naming) are used; skip TargetGene and PairName.
+pairs.e2g = makeGRangesFromDataFrame(
+  fread(kendall_pairs_path, select = c("chr","start","end","PeakName")),
+  keep.extra.columns = TRUE,
+  starts.in.df.are.0based = TRUE)
 bed.peaks = pairs.e2g[!duplicated(mcols(pairs.e2g)[,"PeakName"])]
 mcols(bed.peaks) = NULL
 
@@ -76,6 +79,15 @@ atac.matrix <- FeatureMatrix(
   features = bed.peaks,
   cells = cells.use
 )
+# Signac::FeatureMatrix names rows via GRangesToString (1-based start).
+# Rewrite rownames using the 0-based BED start so they match the
+# PeakName column in Pairs.Kendall.tsv.gz, which compute_kendall.R
+# uses to subset the matrix. FeatureMatrix preserves the order of
+# the input `features`, so this is a positional rename.
+rownames(atac.matrix) <- paste(seqnames(bed.peaks),
+                               start(bed.peaks) - 1L,
+                               end(bed.peaks),
+                               sep = "-")
 message("Example cell in ATAC matrix: ", colnames(atac.matrix)[1])
 
 

From fcbc9fc2184e14dd0c3406ec85c54cd6dbfea231 Mon Sep 17 00:00:00 2001
From: Kayla Brand <kaybrand@stanford.edu>
Date: Thu, 16 Apr 2026 10:32:14 -0700
Subject: [PATCH 3/4] compute_kendall: replace readGeneric with fread to fix
 Lustre mmap segfault

Same vroom mmap segfault fix as prior commits (see make_kendall_pairs
commit for full root cause description).

Replace readGeneric(kendall_pairs_path, keep.all.metadata=T, header=T)
with makeGRangesFromDataFrame(fread(kendall_pairs_path),
keep.extra.columns=TRUE, starts.in.df.are.0based=TRUE). All columns
from kendall_pairs are used in this script, so no select= narrowing.

Remove library(genomation) from imports.

Output coordinate preservation:
  df.pairs.E2G$start is decremented by 1 before fwrite, restoring
  0-based BED format in Pairs.Kendall.tsv.gz. This file is consumed
  by compute_arc_e2g.R, which expects 0-based BED coordinates.
  PeakName is carried through from the input file's metadata column
  (not reconstructed), so it remains in its original 0-based format.

Co-Authored-By: Claude Opus 4.6 <noreply@anthropic.com>
---
 .../scripts/feature_computation/compute_kendall.R    | 12 +++++++-----
 1 file changed, 7 insertions(+), 5 deletions(-)

diff --git a/workflow/scripts/feature_computation/compute_kendall.R b/workflow/scripts/feature_computation/compute_kendall.R
index 9e1ea66..66ccbd1 100755
--- a/workflow/scripts/feature_computation/compute_kendall.R
+++ b/workflow/scripts/feature_computation/compute_kendall.R
@@ -3,7 +3,6 @@
 # Load required packages
 suppressPackageStartupMessages({
   library(GenomicRanges)
-  library(genomation)
   library(foreach)
   library(Signac)
   library(Seurat)
@@ -215,9 +214,9 @@ gex_out_path = snakemake@output$all_gex
 cores = as.integer(snakemake@threads)
 
 # Load candidate E-G pairs
-pairs.E2G = readGeneric(kendall_pairs_path,
-                        keep.all.metadata = T,
-                        header = T)
+pairs.E2G = makeGRangesFromDataFrame(fread(kendall_pairs_path),
+                                     keep.extra.columns = TRUE,
+                                     starts.in.df.are.0based = TRUE)
 
 # Load scATAC matrix
 matrix.atac_count = readRDS(atac_matrix_path)
@@ -301,7 +300,7 @@ df.pairs.E2G =
                               "RnaDetectedPercent",
                               "RnaPseudobulkTPM",
                               "Kendall")]
-colnames(df.pairs.E2G) = 
+colnames(df.pairs.E2G) =
   c("chr",
     "start",
     "end",
@@ -312,6 +311,9 @@ colnames(df.pairs.E2G) =
     "RnaDetectedPercent",
     "RnaPseudobulkTPM",
     "Kendall")
+# GRanges starts are 1-based inclusive; subtract 1 to restore the
+# 0-based BED convention of the input kendall_pairs file.
+df.pairs.E2G$start = df.pairs.E2G$start - 1L
 fwrite(df.pairs.E2G,
        file = kendall_predictions_path,
        row.names = F,

From 9e2b0f80f56d99df772ede693ab445b3d4e2b06f Mon Sep 17 00:00:00 2001
From: Kayla Brand <kaybrand@stanford.edu>
Date: Thu, 16 Apr 2026 10:34:06 -0700
Subject: [PATCH 4/4] compute_arc_e2g: replace readGeneric with fread to fix
 Lustre mmap segfault

This is the script that originally triggered the investigation: 1 of 4
scE2G runs segfaulted in vroom_() C++ code during the arc_e2g rule
(SIGSEGV with SEGV_ACCERR, "invalid permissions"). Root cause: vroom's
mmap-based lazy reading on Lustre, where lock revocation invalidates
mmap'd pages below R's memory management layer.

Replace readGeneric(abc_predictions_path) with
makeGRangesFromDataFrame(fread(abc_predictions_path),
keep.extra.columns=TRUE, starts.in.df.are.0based=TRUE). All ABC columns
are used (AddMax, IntegrateABC), so no select= narrowing.

Replace GRanges(fread(kendall_predictions_path)) with
makeGRangesFromDataFrame(fread(kendall_predictions_path),
keep.extra.columns=TRUE, starts.in.df.are.0based=TRUE). The old
GRanges(fread()) call also misinterpreted 0-based BED starts.

Remove library(genomation) from imports.

Coordinate correction:
  The old code loaded BED [a, b) as GRanges [a, b] (treating the
  0-based start as 1-based). This elongated every region by 1bp,
  causing findOverlaps in AddMax to produce spurious hits where two
  regions share a boundary but don't overlap in BED semantics.
  Quantified on igvf10/thp1_2: 1,502 of 10,303,415 ABC rows (0.015%)
  were affected; max ARC.E2G.Score ratio old/new = 2.05; max absolute
  score change = 0.0045.

Output coordinate preservation:
  df.output$start is decremented by 1 before fwrite, restoring 0-based
  BED format in EnhancerPredictionsAllPutative_ARC.tsv.gz.

Co-Authored-By: Claude Opus 4.6 <noreply@anthropic.com>
---
 .../scripts/feature_computation/compute_arc_e2g.R  | 14 +++++++++-----
 1 file changed, 9 insertions(+), 5 deletions(-)

diff --git a/workflow/scripts/feature_computation/compute_arc_e2g.R b/workflow/scripts/feature_computation/compute_arc_e2g.R
index a0bbcbd..4ebc9a3 100755
--- a/workflow/scripts/feature_computation/compute_arc_e2g.R
+++ b/workflow/scripts/feature_computation/compute_arc_e2g.R
@@ -1,7 +1,6 @@
 # Load required packages
 suppressPackageStartupMessages({
   library(GenomicRanges)
-  library(genomation)
   library(data.table)
 })
 
@@ -87,12 +86,14 @@ kendall_predictions_path = snakemake@input$kendall_predictions
 arc_predictions_path = snakemake@output$arc_predictions
 
 # Load ABC data
-pairs.E2G.ABC = readGeneric(abc_predictions_path,
-                            keep.all.metadata = T,
-                            header = T)
+pairs.E2G.ABC = makeGRangesFromDataFrame(fread(abc_predictions_path),
+                                         keep.extra.columns = TRUE,
+                                         starts.in.df.are.0based = TRUE)
 
 # Load Kendall data
-pairs.E2G.Kendall = GRanges(fread(kendall_predictions_path))
+pairs.E2G.Kendall = makeGRangesFromDataFrame(fread(kendall_predictions_path),
+                                              keep.extra.columns = TRUE,
+                                              starts.in.df.are.0based = TRUE)
 pairs.E2G.Kendall = 
   pairs.E2G.Kendall[!is.na(mcols(pairs.E2G.Kendall)[,"Kendall"])]
 
@@ -123,8 +124,11 @@ mcols(pairs.E2G.ABC)[,c("mean_log_normalized_rna",
                                          "RnaPseudobulkTPM")]
 
 # Write output to file
+# GRanges starts are 1-based inclusive; subtract 1 to restore the
+# 0-based BED convention of the input file.
 df.output = as.data.frame(pairs.E2G.ABC)
 colnames(df.output)[1] = "chr"
+df.output$start = df.output$start - 1L
 fwrite(df.output,
        file = arc_predictions_path,
        row.names = F,