Skip to content
Draft
Show file tree
Hide file tree
Changes from all commits
Commits
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
14 changes: 9 additions & 5 deletions workflow/scripts/feature_computation/compute_arc_e2g.R
Original file line number Diff line number Diff line change
@@ -1,7 +1,6 @@
# Load required packages
suppressPackageStartupMessages({
library(GenomicRanges)
library(genomation)
library(data.table)
})

Expand Down Expand Up @@ -87,12 +86,14 @@ kendall_predictions_path = snakemake@input$kendall_predictions
arc_predictions_path = snakemake@output$arc_predictions

# Load ABC data
pairs.E2G.ABC = readGeneric(abc_predictions_path,
keep.all.metadata = T,
header = T)
pairs.E2G.ABC = makeGRangesFromDataFrame(fread(abc_predictions_path),
keep.extra.columns = TRUE,
starts.in.df.are.0based = TRUE)

# Load Kendall data
pairs.E2G.Kendall = GRanges(fread(kendall_predictions_path))
pairs.E2G.Kendall = makeGRangesFromDataFrame(fread(kendall_predictions_path),
keep.extra.columns = TRUE,
starts.in.df.are.0based = TRUE)
pairs.E2G.Kendall =
pairs.E2G.Kendall[!is.na(mcols(pairs.E2G.Kendall)[,"Kendall"])]

Expand Down Expand Up @@ -123,8 +124,11 @@ mcols(pairs.E2G.ABC)[,c("mean_log_normalized_rna",
"RnaPseudobulkTPM")]

# Write output to file
# GRanges starts are 1-based inclusive; subtract 1 to restore the
# 0-based BED convention of the input file.
df.output = as.data.frame(pairs.E2G.ABC)
colnames(df.output)[1] = "chr"
df.output$start = df.output$start - 1L
fwrite(df.output,
file = arc_predictions_path,
row.names = F,
Expand Down
12 changes: 7 additions & 5 deletions workflow/scripts/feature_computation/compute_kendall.R
Original file line number Diff line number Diff line change
Expand Up @@ -3,7 +3,6 @@
# Load required packages
suppressPackageStartupMessages({
library(GenomicRanges)
library(genomation)
library(foreach)
library(Signac)
library(Seurat)
Expand Down Expand Up @@ -215,9 +214,9 @@ gex_out_path = snakemake@output$all_gex
cores = as.integer(snakemake@threads)

# Load candidate E-G pairs
pairs.E2G = readGeneric(kendall_pairs_path,
keep.all.metadata = T,
header = T)
pairs.E2G = makeGRangesFromDataFrame(fread(kendall_pairs_path),
keep.extra.columns = TRUE,
starts.in.df.are.0based = TRUE)

# Load scATAC matrix
matrix.atac_count = readRDS(atac_matrix_path)
Expand Down Expand Up @@ -301,7 +300,7 @@ df.pairs.E2G =
"RnaDetectedPercent",
"RnaPseudobulkTPM",
"Kendall")]
colnames(df.pairs.E2G) =
colnames(df.pairs.E2G) =
c("chr",
"start",
"end",
Expand All @@ -312,6 +311,9 @@ colnames(df.pairs.E2G) =
"RnaDetectedPercent",
"RnaPseudobulkTPM",
"Kendall")
# GRanges starts are 1-based inclusive; subtract 1 to restore the
# 0-based BED convention of the input kendall_pairs file.
df.pairs.E2G$start = df.pairs.E2G$start - 1L
fwrite(df.pairs.E2G,
file = kendall_predictions_path,
row.names = F,
Expand Down
22 changes: 17 additions & 5 deletions workflow/scripts/feature_computation/generate_atac_matrix.R
Original file line number Diff line number Diff line change
Expand Up @@ -2,8 +2,8 @@

# Load required packages
suppressPackageStartupMessages({
library(genomation)
library(GenomicRanges)
library(data.table)
library(Signac)
library(anndata)
library(tools)
Expand All @@ -19,10 +19,13 @@ atac_matrix_path = snakemake@output[["atac_matrix_path"]]

max_cell_count = snakemake@params[["max_cell_count"]]

# Read the enhancer-gene pairs and extract unique peaks
pairs.e2g = readGeneric(kendall_pairs_path,
keep.all.metadata = T,
header = T)
# Read the enhancer-gene pairs and extract unique peaks.
# Only chr/start/end (for FeatureMatrix) and PeakName (for dedup and
# row naming) are used; skip TargetGene and PairName.
pairs.e2g = makeGRangesFromDataFrame(
fread(kendall_pairs_path, select = c("chr","start","end","PeakName")),
keep.extra.columns = TRUE,
starts.in.df.are.0based = TRUE)
bed.peaks = pairs.e2g[!duplicated(mcols(pairs.e2g)[,"PeakName"])]
mcols(bed.peaks) = NULL

Expand Down Expand Up @@ -76,6 +79,15 @@ atac.matrix <- FeatureMatrix(
features = bed.peaks,
cells = cells.use
)
# Signac::FeatureMatrix names rows via GRangesToString (1-based start).
# Rewrite rownames using the 0-based BED start so they match the
# PeakName column in Pairs.Kendall.tsv.gz, which compute_kendall.R
# uses to subset the matrix. FeatureMatrix preserves the order of
# the input `features`, so this is a positional rename.
rownames(atac.matrix) <- paste(seqnames(bed.peaks),
start(bed.peaks) - 1L,
end(bed.peaks),
sep = "-")
message("Example cell in ATAC matrix: ", colnames(atac.matrix)[1])


Expand Down
32 changes: 21 additions & 11 deletions workflow/scripts/feature_computation/make_kendall_pairs.R
Original file line number Diff line number Diff line change
Expand Up @@ -3,7 +3,6 @@

# Load required packages
suppressPackageStartupMessages({
library(genomation)
library(GenomicRanges)
library(data.table)
})
Expand All @@ -14,13 +13,18 @@ allPutative_path = snakemake@input[["allPutative"]]
kendallPairs_path = snakemake@output[["kendallPairs"]]


# Read the narrowPeak
bed.narrowPeak = readGeneric(narrowPeak_path)
# Read the narrowPeak (only chr/start/end are used downstream)
bed.narrowPeak = makeGRangesFromDataFrame(
fread(narrowPeak_path, select = 1:3, col.names = c("chr","start","end")),
starts.in.df.are.0based = TRUE)

# Read the Enhancer-Gene pairs in ABC prediction
bed.allPutative = readGeneric(allPutative_path,
keep.all.metadata = T,
header = T)
# Read the Enhancer-Gene pairs in ABC prediction.
# Only chr/start/end (for findOverlaps) and TargetGene (assigned to
# output pairs) are used; skip the ~20 other ABC columns.
bed.allPutative = makeGRangesFromDataFrame(
fread(allPutative_path, select = c("chr","start","end","TargetGene")),
keep.extra.columns = TRUE,
starts.in.df.are.0based = TRUE)

# Find overlaps between narrowPeak and enhancers in ABC prediction
overlaps.res = findOverlaps(bed.narrowPeak,
Expand All @@ -30,10 +34,12 @@ overlaps.res = findOverlaps(bed.narrowPeak,
pairs.e2g = bed.narrowPeak[overlaps.res@from]
mcols(pairs.e2g)[,"TargetGene"] = mcols(bed.allPutative)[overlaps.res@to,"TargetGene"]

# Generate peak name
mcols(pairs.e2g)[,"PeakName"] =
# Generate peak name. GRanges starts are 1-based inclusive; subtract 1
# so PeakName uses the original 0-based BED start (matches ATAC matrix
# rownames written by generate_atac_matrix.R).
mcols(pairs.e2g)[,"PeakName"] =
paste(seqnames(pairs.e2g),
start(pairs.e2g),
start(pairs.e2g) - 1L,
end(pairs.e2g),
sep = "-")

Expand All @@ -57,14 +63,18 @@ df.pairs.e2g =
"PairName")]

# Rename the columns for clarity
colnames(df.pairs.e2g) =
colnames(df.pairs.e2g) =
c("chr",
"start",
"end",
"TargetGene",
"PeakName",
"PairName")

# GRanges starts are 1-based inclusive; subtract 1 to restore the
# 0-based BED convention of the input narrowPeak/AllPutative files.
df.pairs.e2g$start = df.pairs.e2g$start - 1L

# Write the enhancer-gene pairs to a file
fwrite(df.pairs.e2g,
file = kendallPairs_path,
Expand Down