diff --git a/workflow/scripts/feature_computation/compute_arc_e2g.R b/workflow/scripts/feature_computation/compute_arc_e2g.R index a0bbcbd..4ebc9a3 100755 --- a/workflow/scripts/feature_computation/compute_arc_e2g.R +++ b/workflow/scripts/feature_computation/compute_arc_e2g.R @@ -1,7 +1,6 @@ # Load required packages suppressPackageStartupMessages({ library(GenomicRanges) - library(genomation) library(data.table) }) @@ -87,12 +86,14 @@ kendall_predictions_path = snakemake@input$kendall_predictions arc_predictions_path = snakemake@output$arc_predictions # Load ABC data -pairs.E2G.ABC = readGeneric(abc_predictions_path, - keep.all.metadata = T, - header = T) +pairs.E2G.ABC = makeGRangesFromDataFrame(fread(abc_predictions_path), + keep.extra.columns = TRUE, + starts.in.df.are.0based = TRUE) # Load Kendall data -pairs.E2G.Kendall = GRanges(fread(kendall_predictions_path)) +pairs.E2G.Kendall = makeGRangesFromDataFrame(fread(kendall_predictions_path), + keep.extra.columns = TRUE, + starts.in.df.are.0based = TRUE) pairs.E2G.Kendall = pairs.E2G.Kendall[!is.na(mcols(pairs.E2G.Kendall)[,"Kendall"])] @@ -123,8 +124,11 @@ mcols(pairs.E2G.ABC)[,c("mean_log_normalized_rna", "RnaPseudobulkTPM")] # Write output to file +# GRanges starts are 1-based inclusive; subtract 1 to restore the +# 0-based BED convention of the input file. df.output = as.data.frame(pairs.E2G.ABC) colnames(df.output)[1] = "chr" +df.output$start = df.output$start - 1L fwrite(df.output, file = arc_predictions_path, row.names = F, diff --git a/workflow/scripts/feature_computation/compute_kendall.R b/workflow/scripts/feature_computation/compute_kendall.R index 9e1ea66..66ccbd1 100755 --- a/workflow/scripts/feature_computation/compute_kendall.R +++ b/workflow/scripts/feature_computation/compute_kendall.R @@ -3,7 +3,6 @@ # Load required packages suppressPackageStartupMessages({ library(GenomicRanges) - library(genomation) library(foreach) library(Signac) library(Seurat) @@ -215,9 +214,9 @@ gex_out_path = snakemake@output$all_gex cores = as.integer(snakemake@threads) # Load candidate E-G pairs -pairs.E2G = readGeneric(kendall_pairs_path, - keep.all.metadata = T, - header = T) +pairs.E2G = makeGRangesFromDataFrame(fread(kendall_pairs_path), + keep.extra.columns = TRUE, + starts.in.df.are.0based = TRUE) # Load scATAC matrix matrix.atac_count = readRDS(atac_matrix_path) @@ -301,7 +300,7 @@ df.pairs.E2G = "RnaDetectedPercent", "RnaPseudobulkTPM", "Kendall")] -colnames(df.pairs.E2G) = +colnames(df.pairs.E2G) = c("chr", "start", "end", @@ -312,6 +311,9 @@ colnames(df.pairs.E2G) = "RnaDetectedPercent", "RnaPseudobulkTPM", "Kendall") +# GRanges starts are 1-based inclusive; subtract 1 to restore the +# 0-based BED convention of the input kendall_pairs file. +df.pairs.E2G$start = df.pairs.E2G$start - 1L fwrite(df.pairs.E2G, file = kendall_predictions_path, row.names = F, diff --git a/workflow/scripts/feature_computation/generate_atac_matrix.R b/workflow/scripts/feature_computation/generate_atac_matrix.R index aaa418e..55f459f 100755 --- a/workflow/scripts/feature_computation/generate_atac_matrix.R +++ b/workflow/scripts/feature_computation/generate_atac_matrix.R @@ -2,8 +2,8 @@ # Load required packages suppressPackageStartupMessages({ - library(genomation) library(GenomicRanges) + library(data.table) library(Signac) library(anndata) library(tools) @@ -19,10 +19,13 @@ atac_matrix_path = snakemake@output[["atac_matrix_path"]] max_cell_count = snakemake@params[["max_cell_count"]] -# Read the enhancer-gene pairs and extract unique peaks -pairs.e2g = readGeneric(kendall_pairs_path, - keep.all.metadata = T, - header = T) +# Read the enhancer-gene pairs and extract unique peaks. +# Only chr/start/end (for FeatureMatrix) and PeakName (for dedup and +# row naming) are used; skip TargetGene and PairName. +pairs.e2g = makeGRangesFromDataFrame( + fread(kendall_pairs_path, select = c("chr","start","end","PeakName")), + keep.extra.columns = TRUE, + starts.in.df.are.0based = TRUE) bed.peaks = pairs.e2g[!duplicated(mcols(pairs.e2g)[,"PeakName"])] mcols(bed.peaks) = NULL @@ -76,6 +79,15 @@ atac.matrix <- FeatureMatrix( features = bed.peaks, cells = cells.use ) +# Signac::FeatureMatrix names rows via GRangesToString (1-based start). +# Rewrite rownames using the 0-based BED start so they match the +# PeakName column in Pairs.Kendall.tsv.gz, which compute_kendall.R +# uses to subset the matrix. FeatureMatrix preserves the order of +# the input `features`, so this is a positional rename. +rownames(atac.matrix) <- paste(seqnames(bed.peaks), + start(bed.peaks) - 1L, + end(bed.peaks), + sep = "-") message("Example cell in ATAC matrix: ", colnames(atac.matrix)[1]) diff --git a/workflow/scripts/feature_computation/make_kendall_pairs.R b/workflow/scripts/feature_computation/make_kendall_pairs.R index 0b434b3..a94246b 100755 --- a/workflow/scripts/feature_computation/make_kendall_pairs.R +++ b/workflow/scripts/feature_computation/make_kendall_pairs.R @@ -3,7 +3,6 @@ # Load required packages suppressPackageStartupMessages({ - library(genomation) library(GenomicRanges) library(data.table) }) @@ -14,13 +13,18 @@ allPutative_path = snakemake@input[["allPutative"]] kendallPairs_path = snakemake@output[["kendallPairs"]] -# Read the narrowPeak -bed.narrowPeak = readGeneric(narrowPeak_path) +# Read the narrowPeak (only chr/start/end are used downstream) +bed.narrowPeak = makeGRangesFromDataFrame( + fread(narrowPeak_path, select = 1:3, col.names = c("chr","start","end")), + starts.in.df.are.0based = TRUE) -# Read the Enhancer-Gene pairs in ABC prediction -bed.allPutative = readGeneric(allPutative_path, - keep.all.metadata = T, - header = T) +# Read the Enhancer-Gene pairs in ABC prediction. +# Only chr/start/end (for findOverlaps) and TargetGene (assigned to +# output pairs) are used; skip the ~20 other ABC columns. +bed.allPutative = makeGRangesFromDataFrame( + fread(allPutative_path, select = c("chr","start","end","TargetGene")), + keep.extra.columns = TRUE, + starts.in.df.are.0based = TRUE) # Find overlaps between narrowPeak and enhancers in ABC prediction overlaps.res = findOverlaps(bed.narrowPeak, @@ -30,10 +34,12 @@ overlaps.res = findOverlaps(bed.narrowPeak, pairs.e2g = bed.narrowPeak[overlaps.res@from] mcols(pairs.e2g)[,"TargetGene"] = mcols(bed.allPutative)[overlaps.res@to,"TargetGene"] -# Generate peak name -mcols(pairs.e2g)[,"PeakName"] = +# Generate peak name. GRanges starts are 1-based inclusive; subtract 1 +# so PeakName uses the original 0-based BED start (matches ATAC matrix +# rownames written by generate_atac_matrix.R). +mcols(pairs.e2g)[,"PeakName"] = paste(seqnames(pairs.e2g), - start(pairs.e2g), + start(pairs.e2g) - 1L, end(pairs.e2g), sep = "-") @@ -57,7 +63,7 @@ df.pairs.e2g = "PairName")] # Rename the columns for clarity -colnames(df.pairs.e2g) = +colnames(df.pairs.e2g) = c("chr", "start", "end", @@ -65,6 +71,10 @@ colnames(df.pairs.e2g) = "PeakName", "PairName") +# GRanges starts are 1-based inclusive; subtract 1 to restore the +# 0-based BED convention of the input narrowPeak/AllPutative files. +df.pairs.e2g$start = df.pairs.e2g$start - 1L + # Write the enhancer-gene pairs to a file fwrite(df.pairs.e2g, file = kendallPairs_path,