From 7be43fd1aa69ac9b282bae30348a1157221b2c2e Mon Sep 17 00:00:00 2001 From: Kayla Brand Date: Thu, 16 Apr 2026 10:30:25 -0700 Subject: [PATCH 1/4] make_kendall_pairs: replace readGeneric with fread to fix Lustre mmap segfault MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit genomation::readGeneric() calls vroom::vroom(altrep=lazy), which uses mmap() to lazily map files into memory. On Lustre (OAK), lock revocation or client cache eviction can invalidate mmap'd pages, causing SIGSEGV with SEGV_ACCERR in the vroom C++ layer — a hardware fault below R's managed memory that cannot be caught or recovered from. Replace both readGeneric() calls with fread() + makeGRangesFromDataFrame(): - bed.narrowPeak: fread(select=1:3) reads only chr/start/end, matching readGeneric's default keep.all.metadata=FALSE behavior. - bed.allPutative: fread(select=c("chr","start","end","TargetGene")) reads only the 4 columns actually used (findOverlaps coordinates + TargetGene assignment at line 33), skipping ~20 unused ABC columns. This recovers the practical benefit of vroom's lazy column parsing without mmap. Both reads now use starts.in.df.are.0based=TRUE to correctly convert 0-based BED [a, b) to 1-based inclusive GRanges [a+1, b]. The old code loaded BED start directly as a 1-based GRanges start, which elongated every region by 1bp and caused spurious findOverlaps hits at region boundaries. Output coordinate preservation: - PeakName construction uses start(gr) - 1L to emit the original 0-based BED start. - df.pairs.e2g$start is decremented by 1 before fwrite, restoring 0-based BED format in the output TSV. - On-disk output is format-identical to previous runs. Co-Authored-By: Claude Opus 4.6 --- .../feature_computation/make_kendall_pairs.R | 32 ++++++++++++------- 1 file changed, 21 insertions(+), 11 deletions(-) diff --git a/workflow/scripts/feature_computation/make_kendall_pairs.R b/workflow/scripts/feature_computation/make_kendall_pairs.R index 0b434b3..a94246b 100755 --- a/workflow/scripts/feature_computation/make_kendall_pairs.R +++ b/workflow/scripts/feature_computation/make_kendall_pairs.R @@ -3,7 +3,6 @@ # Load required packages suppressPackageStartupMessages({ - library(genomation) library(GenomicRanges) library(data.table) }) @@ -14,13 +13,18 @@ allPutative_path = snakemake@input[["allPutative"]] kendallPairs_path = snakemake@output[["kendallPairs"]] -# Read the narrowPeak -bed.narrowPeak = readGeneric(narrowPeak_path) +# Read the narrowPeak (only chr/start/end are used downstream) +bed.narrowPeak = makeGRangesFromDataFrame( + fread(narrowPeak_path, select = 1:3, col.names = c("chr","start","end")), + starts.in.df.are.0based = TRUE) -# Read the Enhancer-Gene pairs in ABC prediction -bed.allPutative = readGeneric(allPutative_path, - keep.all.metadata = T, - header = T) +# Read the Enhancer-Gene pairs in ABC prediction. +# Only chr/start/end (for findOverlaps) and TargetGene (assigned to +# output pairs) are used; skip the ~20 other ABC columns. +bed.allPutative = makeGRangesFromDataFrame( + fread(allPutative_path, select = c("chr","start","end","TargetGene")), + keep.extra.columns = TRUE, + starts.in.df.are.0based = TRUE) # Find overlaps between narrowPeak and enhancers in ABC prediction overlaps.res = findOverlaps(bed.narrowPeak, @@ -30,10 +34,12 @@ overlaps.res = findOverlaps(bed.narrowPeak, pairs.e2g = bed.narrowPeak[overlaps.res@from] mcols(pairs.e2g)[,"TargetGene"] = mcols(bed.allPutative)[overlaps.res@to,"TargetGene"] -# Generate peak name -mcols(pairs.e2g)[,"PeakName"] = +# Generate peak name. GRanges starts are 1-based inclusive; subtract 1 +# so PeakName uses the original 0-based BED start (matches ATAC matrix +# rownames written by generate_atac_matrix.R). +mcols(pairs.e2g)[,"PeakName"] = paste(seqnames(pairs.e2g), - start(pairs.e2g), + start(pairs.e2g) - 1L, end(pairs.e2g), sep = "-") @@ -57,7 +63,7 @@ df.pairs.e2g = "PairName")] # Rename the columns for clarity -colnames(df.pairs.e2g) = +colnames(df.pairs.e2g) = c("chr", "start", "end", @@ -65,6 +71,10 @@ colnames(df.pairs.e2g) = "PeakName", "PairName") +# GRanges starts are 1-based inclusive; subtract 1 to restore the +# 0-based BED convention of the input narrowPeak/AllPutative files. +df.pairs.e2g$start = df.pairs.e2g$start - 1L + # Write the enhancer-gene pairs to a file fwrite(df.pairs.e2g, file = kendallPairs_path, From fc17ea72a1ad94a315c9792a8db1bc0b7422794e Mon Sep 17 00:00:00 2001 From: Kayla Brand Date: Thu, 16 Apr 2026 10:31:37 -0700 Subject: [PATCH 2/4] generate_atac_matrix: replace readGeneric with fread to fix Lustre mmap segfault Same vroom mmap segfault fix as make_kendall_pairs.R (see prior commit for full root cause description). Replace readGeneric() with fread(select=c("chr","start","end","PeakName")) + makeGRangesFromDataFrame(starts.in.df.are.0based=TRUE). Only the 4 columns actually used are read; TargetGene and PairName are skipped. Replace library(genomation) with library(data.table) for fread(). ATAC matrix rowname fix: Signac::FeatureMatrix derives rownames via GRangesToString, which emits 1-based GRanges starts. Under the old (buggy) code, the GRanges 1-based start numerically equalled the 0-based BED start, so rownames accidentally matched the 0-based PeakName strings in Pairs.Kendall.tsv.gz. With the corrected coordinate loading (starts.in.df.are.0based=TRUE), GRanges starts are now bed_start+1, so default Signac rownames would be chr-(a+1)-b while PeakName is chr-a-b. After FeatureMatrix, rownames are explicitly set to paste(seqnames, start-1L, end, sep="-") to restore 0-based BED naming. FeatureMatrix preserves input feature order, so this is a safe positional rename. compute_kendall.R matches mcols(bed.E2G)[,"PeakName"] %in% rownames(data.ATAC), so the two must agree on coordinate convention. Co-Authored-By: Claude Opus 4.6 --- .../generate_atac_matrix.R | 22 ++++++++++++++----- 1 file changed, 17 insertions(+), 5 deletions(-) diff --git a/workflow/scripts/feature_computation/generate_atac_matrix.R b/workflow/scripts/feature_computation/generate_atac_matrix.R index aaa418e..55f459f 100755 --- a/workflow/scripts/feature_computation/generate_atac_matrix.R +++ b/workflow/scripts/feature_computation/generate_atac_matrix.R @@ -2,8 +2,8 @@ # Load required packages suppressPackageStartupMessages({ - library(genomation) library(GenomicRanges) + library(data.table) library(Signac) library(anndata) library(tools) @@ -19,10 +19,13 @@ atac_matrix_path = snakemake@output[["atac_matrix_path"]] max_cell_count = snakemake@params[["max_cell_count"]] -# Read the enhancer-gene pairs and extract unique peaks -pairs.e2g = readGeneric(kendall_pairs_path, - keep.all.metadata = T, - header = T) +# Read the enhancer-gene pairs and extract unique peaks. +# Only chr/start/end (for FeatureMatrix) and PeakName (for dedup and +# row naming) are used; skip TargetGene and PairName. +pairs.e2g = makeGRangesFromDataFrame( + fread(kendall_pairs_path, select = c("chr","start","end","PeakName")), + keep.extra.columns = TRUE, + starts.in.df.are.0based = TRUE) bed.peaks = pairs.e2g[!duplicated(mcols(pairs.e2g)[,"PeakName"])] mcols(bed.peaks) = NULL @@ -76,6 +79,15 @@ atac.matrix <- FeatureMatrix( features = bed.peaks, cells = cells.use ) +# Signac::FeatureMatrix names rows via GRangesToString (1-based start). +# Rewrite rownames using the 0-based BED start so they match the +# PeakName column in Pairs.Kendall.tsv.gz, which compute_kendall.R +# uses to subset the matrix. FeatureMatrix preserves the order of +# the input `features`, so this is a positional rename. +rownames(atac.matrix) <- paste(seqnames(bed.peaks), + start(bed.peaks) - 1L, + end(bed.peaks), + sep = "-") message("Example cell in ATAC matrix: ", colnames(atac.matrix)[1]) From fcbc9fc2184e14dd0c3406ec85c54cd6dbfea231 Mon Sep 17 00:00:00 2001 From: Kayla Brand Date: Thu, 16 Apr 2026 10:32:14 -0700 Subject: [PATCH 3/4] compute_kendall: replace readGeneric with fread to fix Lustre mmap segfault Same vroom mmap segfault fix as prior commits (see make_kendall_pairs commit for full root cause description). Replace readGeneric(kendall_pairs_path, keep.all.metadata=T, header=T) with makeGRangesFromDataFrame(fread(kendall_pairs_path), keep.extra.columns=TRUE, starts.in.df.are.0based=TRUE). All columns from kendall_pairs are used in this script, so no select= narrowing. Remove library(genomation) from imports. Output coordinate preservation: df.pairs.E2G$start is decremented by 1 before fwrite, restoring 0-based BED format in Pairs.Kendall.tsv.gz. This file is consumed by compute_arc_e2g.R, which expects 0-based BED coordinates. PeakName is carried through from the input file's metadata column (not reconstructed), so it remains in its original 0-based format. Co-Authored-By: Claude Opus 4.6 --- .../scripts/feature_computation/compute_kendall.R | 12 +++++++----- 1 file changed, 7 insertions(+), 5 deletions(-) diff --git a/workflow/scripts/feature_computation/compute_kendall.R b/workflow/scripts/feature_computation/compute_kendall.R index 9e1ea66..66ccbd1 100755 --- a/workflow/scripts/feature_computation/compute_kendall.R +++ b/workflow/scripts/feature_computation/compute_kendall.R @@ -3,7 +3,6 @@ # Load required packages suppressPackageStartupMessages({ library(GenomicRanges) - library(genomation) library(foreach) library(Signac) library(Seurat) @@ -215,9 +214,9 @@ gex_out_path = snakemake@output$all_gex cores = as.integer(snakemake@threads) # Load candidate E-G pairs -pairs.E2G = readGeneric(kendall_pairs_path, - keep.all.metadata = T, - header = T) +pairs.E2G = makeGRangesFromDataFrame(fread(kendall_pairs_path), + keep.extra.columns = TRUE, + starts.in.df.are.0based = TRUE) # Load scATAC matrix matrix.atac_count = readRDS(atac_matrix_path) @@ -301,7 +300,7 @@ df.pairs.E2G = "RnaDetectedPercent", "RnaPseudobulkTPM", "Kendall")] -colnames(df.pairs.E2G) = +colnames(df.pairs.E2G) = c("chr", "start", "end", @@ -312,6 +311,9 @@ colnames(df.pairs.E2G) = "RnaDetectedPercent", "RnaPseudobulkTPM", "Kendall") +# GRanges starts are 1-based inclusive; subtract 1 to restore the +# 0-based BED convention of the input kendall_pairs file. +df.pairs.E2G$start = df.pairs.E2G$start - 1L fwrite(df.pairs.E2G, file = kendall_predictions_path, row.names = F, From 9e2b0f80f56d99df772ede693ab445b3d4e2b06f Mon Sep 17 00:00:00 2001 From: Kayla Brand Date: Thu, 16 Apr 2026 10:34:06 -0700 Subject: [PATCH 4/4] compute_arc_e2g: replace readGeneric with fread to fix Lustre mmap segfault This is the script that originally triggered the investigation: 1 of 4 scE2G runs segfaulted in vroom_() C++ code during the arc_e2g rule (SIGSEGV with SEGV_ACCERR, "invalid permissions"). Root cause: vroom's mmap-based lazy reading on Lustre, where lock revocation invalidates mmap'd pages below R's memory management layer. Replace readGeneric(abc_predictions_path) with makeGRangesFromDataFrame(fread(abc_predictions_path), keep.extra.columns=TRUE, starts.in.df.are.0based=TRUE). All ABC columns are used (AddMax, IntegrateABC), so no select= narrowing. Replace GRanges(fread(kendall_predictions_path)) with makeGRangesFromDataFrame(fread(kendall_predictions_path), keep.extra.columns=TRUE, starts.in.df.are.0based=TRUE). The old GRanges(fread()) call also misinterpreted 0-based BED starts. Remove library(genomation) from imports. Coordinate correction: The old code loaded BED [a, b) as GRanges [a, b] (treating the 0-based start as 1-based). This elongated every region by 1bp, causing findOverlaps in AddMax to produce spurious hits where two regions share a boundary but don't overlap in BED semantics. Quantified on igvf10/thp1_2: 1,502 of 10,303,415 ABC rows (0.015%) were affected; max ARC.E2G.Score ratio old/new = 2.05; max absolute score change = 0.0045. Output coordinate preservation: df.output$start is decremented by 1 before fwrite, restoring 0-based BED format in EnhancerPredictionsAllPutative_ARC.tsv.gz. Co-Authored-By: Claude Opus 4.6 --- .../scripts/feature_computation/compute_arc_e2g.R | 14 +++++++++----- 1 file changed, 9 insertions(+), 5 deletions(-) diff --git a/workflow/scripts/feature_computation/compute_arc_e2g.R b/workflow/scripts/feature_computation/compute_arc_e2g.R index a0bbcbd..4ebc9a3 100755 --- a/workflow/scripts/feature_computation/compute_arc_e2g.R +++ b/workflow/scripts/feature_computation/compute_arc_e2g.R @@ -1,7 +1,6 @@ # Load required packages suppressPackageStartupMessages({ library(GenomicRanges) - library(genomation) library(data.table) }) @@ -87,12 +86,14 @@ kendall_predictions_path = snakemake@input$kendall_predictions arc_predictions_path = snakemake@output$arc_predictions # Load ABC data -pairs.E2G.ABC = readGeneric(abc_predictions_path, - keep.all.metadata = T, - header = T) +pairs.E2G.ABC = makeGRangesFromDataFrame(fread(abc_predictions_path), + keep.extra.columns = TRUE, + starts.in.df.are.0based = TRUE) # Load Kendall data -pairs.E2G.Kendall = GRanges(fread(kendall_predictions_path)) +pairs.E2G.Kendall = makeGRangesFromDataFrame(fread(kendall_predictions_path), + keep.extra.columns = TRUE, + starts.in.df.are.0based = TRUE) pairs.E2G.Kendall = pairs.E2G.Kendall[!is.na(mcols(pairs.E2G.Kendall)[,"Kendall"])] @@ -123,8 +124,11 @@ mcols(pairs.E2G.ABC)[,c("mean_log_normalized_rna", "RnaPseudobulkTPM")] # Write output to file +# GRanges starts are 1-based inclusive; subtract 1 to restore the +# 0-based BED convention of the input file. df.output = as.data.frame(pairs.E2G.ABC) colnames(df.output)[1] = "chr" +df.output$start = df.output$start - 1L fwrite(df.output, file = arc_predictions_path, row.names = F,