From e18fbaea041c1f3d6a60db1661e184bb5be21dc2 Mon Sep 17 00:00:00 2001 From: Jay Hesselberth Date: Thu, 26 Mar 2026 08:01:04 -0600 Subject: [PATCH] feat: add modkit integration for heatmap workflow Add read_bedmethyl() and summarize_mod_calls() to convert modkit output into formats compatible with the existing heatmap pipeline. Add vignette section demonstrating both workflows. --- NAMESPACE | 2 + NEWS.md | 4 ++ R/read-modkit.R | 108 ++++++++++++++++++++++++++++++ man/build_coldata.Rd | 2 +- man/read_bedmethyl.Rd | 25 +++++++ man/run_deseq.Rd | 2 +- man/summarize_mod_calls.Rd | 29 ++++++++ pkgdown/_pkgdown.yml | 1 + tests/testthat/test-read-modkit.R | 75 +++++++++++++++++++++ vignettes/articles/heatmap.Rmd | 94 ++++++++++++++++++++++++++ 10 files changed, 340 insertions(+), 2 deletions(-) create mode 100644 R/read-modkit.R create mode 100644 man/read_bedmethyl.Rd create mode 100644 man/summarize_mod_calls.Rd create mode 100644 tests/testthat/test-read-modkit.R diff --git a/NAMESPACE b/NAMESPACE index 6cb9a62..8ffc9a4 100644 --- a/NAMESPACE +++ b/NAMESPACE @@ -49,6 +49,7 @@ export(prepare_rewiring_matrix) export(propagate_error_diff) export(propagate_error_ratio) export(read_bcerror) +export(read_bedmethyl) export(read_charging) export(read_charging_multi) export(read_counts) @@ -65,6 +66,7 @@ export(structure_html) export(structure_organisms) export(structure_to_png) export(structure_trnas) +export(summarize_mod_calls) export(tabulate_deseq) export(tidy_deseq_results) export(trna_regions) diff --git a/NEWS.md b/NEWS.md index 48b772c..d4cfa50 100644 --- a/NEWS.md +++ b/NEWS.md @@ -1,5 +1,9 @@ # clover 0.0.0.9000 +* `read_bedmethyl()` reads per-position modification percentages from bedMethyl files produced by `modkit pileup`, enabling integration of nanopore modification data with the heatmap workflow. + +* `summarize_mod_calls()` summarizes per-read modification calls from `modkit extract` output into per-position modification frequencies, ready for delta computation and heatmap visualization. + * `plot_abundance_charging()` gains `shorten`, `source_col`, and `error_bars` parameters. Labels are auto-shortened via `shorten_trna_names()`, an optional faceting column separates host and phage tRNAs, and horizontal error bars show `lfcSE` on significant points when present. * `plot_charging_diffs()` gains `source_col`, `label_col`, and `shorten` parameters. Labels are auto-shortened via `shorten_trna_names()` and an optional faceting column separates host and phage tRNAs. diff --git a/R/read-modkit.R b/R/read-modkit.R new file mode 100644 index 0000000..7f5b29b --- /dev/null +++ b/R/read-modkit.R @@ -0,0 +1,108 @@ +#' Read a bedMethyl file from modkit pileup +#' +#' Read per-position modification percentages from a bedMethyl file +#' produced by `modkit pileup`. The 18-column bedMethyl format includes +#' per-position counts and percentages for each modification type. +#' +#' @param path Path to a bedMethyl file (may be gzipped). +#' @param min_cov Minimum valid coverage to retain a position. Default `1`. +#' +#' @return A tibble with columns: `ref`, `pos`, `strand`, `mod_code`, +#' `n_valid`, `percent_mod`, `n_mod`, `n_canonical`. +#' +#' @export +#' +#' @examples +#' # read_bedmethyl("sample.bedmethyl.gz") +read_bedmethyl <- function(path, min_cov = 1) { + bedmethyl_cols <- c( + "chrom", "start", "end", "name", "score", "strand", + "thickStart", "thickEnd", "itemRgb", "n_valid", + "percent_mod", "n_mod", "n_canonical", "n_other_mod", + "n_delete", "n_fail", "n_diff", "n_nocall" + ) + + raw <- readr::read_tsv( + path, + col_names = bedmethyl_cols, + col_types = readr::cols( + chrom = readr::col_character(), + start = readr::col_integer(), + end = readr::col_integer(), + name = readr::col_character(), + score = readr::col_integer(), + strand = readr::col_character(), + thickStart = readr::col_integer(), + thickEnd = readr::col_integer(), + itemRgb = readr::col_character(), + n_valid = readr::col_integer(), + percent_mod = readr::col_double(), + n_mod = readr::col_integer(), + n_canonical = readr::col_integer(), + n_other_mod = readr::col_integer(), + n_delete = readr::col_integer(), + n_fail = readr::col_integer(), + n_diff = readr::col_integer(), + n_nocall = readr::col_integer() + ), + comment = "#", + show_col_types = FALSE + ) + + raw |> + dplyr::filter(.data$n_valid >= min_cov) |> + dplyr::transmute( + ref = .data$chrom, + pos = .data$start, + strand = .data$strand, + mod_code = .data$name, + n_valid = .data$n_valid, + percent_mod = .data$percent_mod, + n_mod = .data$n_mod, + n_canonical = .data$n_canonical + ) +} + +#' Summarize per-position modification frequency from modkit extract +#' +#' Read a `mod_calls.tsv.gz` file from `modkit extract` and compute +#' per-position modification frequency (proportion of reads with a +#' non-canonical call code). +#' +#' @param path Path to a `mod_calls.tsv.gz` file. +#' @param refs Optional character vector of reference names to include. +#' @param min_reads Minimum number of reads at a position to retain. +#' Default `1`. +#' +#' @return A tibble with columns: `ref`, `pos`, `n_reads`, `n_mod`, +#' `mod_freq`. +#' +#' @export +#' +#' @examples +#' path <- clover_example("ecoli/mod_calls.tsv.gz") +#' summarize_mod_calls(path) +summarize_mod_calls <- function(path, refs = NULL, min_reads = 1) { + mc <- readr::read_tsv(path, show_col_types = FALSE) |> + dplyr::filter(.data$within_alignment == TRUE) + + if (!is.null(refs)) { + mc <- mc |> dplyr::filter(.data$chrom %in% refs) + } + + mc |> + dplyr::mutate(modified = as.integer(.data$call_code != "-")) |> + dplyr::summarize( + n_reads = dplyr::n(), + n_mod = sum(.data$modified), + .by = c("chrom", "ref_position") + ) |> + dplyr::filter(.data$n_reads >= min_reads) |> + dplyr::transmute( + ref = .data$chrom, + pos = .data$ref_position, + n_reads = .data$n_reads, + n_mod = .data$n_mod, + mod_freq = .data$n_mod / .data$n_reads + ) +} diff --git a/man/build_coldata.Rd b/man/build_coldata.Rd index 8bc2eb1..1ad3094 100644 --- a/man/build_coldata.Rd +++ b/man/build_coldata.Rd @@ -19,7 +19,7 @@ A data frame with row names matching \code{colnames(count_matrix)}. } \description{ Create a \code{data.frame} of sample metadata suitable for -\code{\link[DESeq2:DESeqDataSet]{DESeq2::DESeqDataSetFromMatrix()}}. For charging count matrices, +\code{\link[DESeq2:DESeqDataSetFromMatrix]{DESeq2::DESeqDataSetFromMatrix()}}. For charging count matrices, the \code{charge_status} factor is added automatically. } \examples{ diff --git a/man/read_bedmethyl.Rd b/man/read_bedmethyl.Rd new file mode 100644 index 0000000..e17ef3a --- /dev/null +++ b/man/read_bedmethyl.Rd @@ -0,0 +1,25 @@ +% Generated by roxygen2: do not edit by hand +% Please edit documentation in R/read-modkit.R +\name{read_bedmethyl} +\alias{read_bedmethyl} +\title{Read a bedMethyl file from modkit pileup} +\usage{ +read_bedmethyl(path, min_cov = 1) +} +\arguments{ +\item{path}{Path to a bedMethyl file (may be gzipped).} + +\item{min_cov}{Minimum valid coverage to retain a position. Default \code{1}.} +} +\value{ +A tibble with columns: \code{ref}, \code{pos}, \code{strand}, \code{mod_code}, +\code{n_valid}, \code{percent_mod}, \code{n_mod}, \code{n_canonical}. +} +\description{ +Read per-position modification percentages from a bedMethyl file +produced by \verb{modkit pileup}. The 18-column bedMethyl format includes +per-position counts and percentages for each modification type. +} +\examples{ +# read_bedmethyl("sample.bedmethyl.gz") +} diff --git a/man/run_deseq.Rd b/man/run_deseq.Rd index 7039a21..7b5f7d4 100644 --- a/man/run_deseq.Rd +++ b/man/run_deseq.Rd @@ -21,7 +21,7 @@ or \code{\link[=charging_count_matrix]{charging_count_matrix()}}).} A \code{DESeqDataSet} object. } \description{ -Wrapper around \code{\link[DESeq2:DESeqDataSet]{DESeq2::DESeqDataSetFromMatrix()}} and \code{\link[DESeq2:DESeq]{DESeq2::DESeq()}} +Wrapper around \code{\link[DESeq2:DESeqDataSetFromMatrix]{DESeq2::DESeqDataSetFromMatrix()}} and \code{\link[DESeq2:DESeq]{DESeq2::DESeq()}} for tRNA abundance or charging data. } \examples{ diff --git a/man/summarize_mod_calls.Rd b/man/summarize_mod_calls.Rd new file mode 100644 index 0000000..def7c49 --- /dev/null +++ b/man/summarize_mod_calls.Rd @@ -0,0 +1,29 @@ +% Generated by roxygen2: do not edit by hand +% Please edit documentation in R/read-modkit.R +\name{summarize_mod_calls} +\alias{summarize_mod_calls} +\title{Summarize per-position modification frequency from modkit extract} +\usage{ +summarize_mod_calls(path, refs = NULL, min_reads = 1) +} +\arguments{ +\item{path}{Path to a \code{mod_calls.tsv.gz} file.} + +\item{refs}{Optional character vector of reference names to include.} + +\item{min_reads}{Minimum number of reads at a position to retain. +Default \code{1}.} +} +\value{ +A tibble with columns: \code{ref}, \code{pos}, \code{n_reads}, \code{n_mod}, +\code{mod_freq}. +} +\description{ +Read a \code{mod_calls.tsv.gz} file from \verb{modkit extract} and compute +per-position modification frequency (proportion of reads with a +non-canonical call code). +} +\examples{ +path <- clover_example("ecoli/mod_calls.tsv.gz") +summarize_mod_calls(path) +} diff --git a/pkgdown/_pkgdown.yml b/pkgdown/_pkgdown.yml index 53f05c3..51cb3ef 100644 --- a/pkgdown/_pkgdown.yml +++ b/pkgdown/_pkgdown.yml @@ -39,6 +39,7 @@ reference: - compute_charging_diffs - filter_linkages - compute_odds_ratios + - summarize_mod_calls - compute_ror - compute_ror_isodecoder - perform_pcoa diff --git a/tests/testthat/test-read-modkit.R b/tests/testthat/test-read-modkit.R new file mode 100644 index 0000000..6459ef7 --- /dev/null +++ b/tests/testthat/test-read-modkit.R @@ -0,0 +1,75 @@ +test_that("summarize_mod_calls returns expected structure", { + path <- clover_example("ecoli/mod_calls.tsv.gz") + res <- summarize_mod_calls(path) + + expect_s3_class(res, "tbl_df") + expect_named(res, c("ref", "pos", "n_reads", "n_mod", "mod_freq")) + expect_true(all(res$mod_freq >= 0 & res$mod_freq <= 1)) + expect_true(all(res$n_mod <= res$n_reads)) +}) + +test_that("summarize_mod_calls filters by refs", { + path <- clover_example("ecoli/mod_calls.tsv.gz") + res <- summarize_mod_calls(path, refs = "host-tRNA-Glu-TTC-1-1") + + expect_equal(unique(res$ref), "host-tRNA-Glu-TTC-1-1") +}) + +test_that("summarize_mod_calls filters by min_reads", { + path <- clover_example("ecoli/mod_calls.tsv.gz") + res <- summarize_mod_calls(path, min_reads = 20) + + expect_true(all(res$n_reads >= 20)) +}) + +test_that("read_bedmethyl returns expected structure", { + bed_lines <- c( + "tRNA-Glu\t10\t11\tm\t50\t+\t10\t11\t255,0,0\t50\t40.0\t20\t30\t0\t0\t0\t0\t0", + "tRNA-Glu\t20\t21\tm\t5\t+\t20\t21\t255,0,0\t5\t60.0\t3\t2\t0\t0\t0\t0\t0" + ) + tmp <- withr::local_tempfile(fileext = ".bed") + writeLines(bed_lines, tmp) + + res <- read_bedmethyl(tmp) + + expect_s3_class(res, "tbl_df") + expect_named( + res, + c("ref", "pos", "strand", "mod_code", "n_valid", "percent_mod", + "n_mod", "n_canonical") + ) + expect_equal(nrow(res), 2) + expect_equal(res$ref, c("tRNA-Glu", "tRNA-Glu")) + expect_equal(res$pos, c(10, 20)) + expect_equal(res$mod_code, c("m", "m")) +}) + +test_that("read_bedmethyl filters by min_cov", { + bed_lines <- c( + "tRNA-Glu\t10\t11\tm\t50\t+\t10\t11\t255,0,0\t50\t40.0\t20\t30\t0\t0\t0\t0\t0", + "tRNA-Glu\t20\t21\tm\t5\t+\t20\t21\t255,0,0\t5\t60.0\t3\t2\t0\t0\t0\t0\t0" + ) + tmp <- withr::local_tempfile(fileext = ".bed") + writeLines(bed_lines, tmp) + + res <- read_bedmethyl(tmp, min_cov = 10) + + expect_equal(nrow(res), 1) + expect_equal(res$n_valid, 50L) +}) + +test_that("summarize_mod_calls output works with compute_bcerror_delta", { + path <- clover_example("ecoli/mod_calls.tsv.gz") + mod1 <- summarize_mod_calls(path) |> dplyr::mutate(condition = "wt") + mod2 <- summarize_mod_calls(path) |> dplyr::mutate(condition = "mut") + + combined <- dplyr::bind_rows(mod1, mod2) + delta <- compute_bcerror_delta( + combined, + delta = wt - mut, + value_col = "mod_freq" + ) + + expect_s3_class(delta, "tbl_df") + expect_true("delta" %in% names(delta)) +}) diff --git a/vignettes/articles/heatmap.Rmd b/vignettes/articles/heatmap.Rmd index 24d1c23..4fa5644 100644 --- a/vignettes/articles/heatmap.Rmd +++ b/vignettes/articles/heatmap.Rmd @@ -449,6 +449,100 @@ plot_mod_landscape( ) ``` +## Heatmaps from modkit data + +The heatmap workflow also supports modification data from Oxford Nanopore's +[modkit](https://nanoporetech.github.io/modkit/) tool. Two input formats are +supported: per-read calls from `modkit extract` and per-position summaries +from `modkit pileup`. + +### From modkit extract (per-read calls) + +`summarize_mod_calls()` reads a `mod_calls.tsv.gz` file (from +`modkit extract --call-code`) and computes per-position modification +frequency --- the proportion of reads carrying a non-canonical base call +at each position. + +```{r} +#| label: modkit-extract +#| eval: false +# Summarize per-position modification frequency from each sample +wt_mods <- summarize_mod_calls("wt_sample.mod_calls.tsv.gz") +mut_mods <- summarize_mod_calls("mut_sample.mod_calls.tsv.gz") + +# Bind with condition labels +mod_summary <- bind_rows( + mutate(wt_mods, condition = "wt"), + mutate(mut_mods, condition = "mut") +) + +# Compute delta (reuses compute_bcerror_delta with mod_freq as value) +mod_delta <- compute_bcerror_delta( + mod_summary, + delta = wt - mut, + value_col = "mod_freq" +) + +# Prepare and plot (same workflow as bcerror data) +mod_heatmap <- prep_mod_heatmap( + mod_delta, + value_col = "delta", + sprinzl_coords = sprinzl, + mods = mods +) + +plot_mod_heatmap( + mod_heatmap, + value_col = "delta", + ref_col = "trna_label", + color_limits = c(-0.5, 0.5), + fill_name = "\u0394 mod frequency" +) +``` + +### From modkit pileup (bedMethyl) + +`read_bedmethyl()` reads the 18-column bedMethyl format produced by +`modkit pileup`. The `percent_mod` column contains the percentage of reads +with a given modification at each position, which can be compared across +conditions. + +```{r} +#| label: modkit-pileup +#| eval: false +# Read bedMethyl files +wt_bed <- read_bedmethyl("wt_sample.bed.gz", min_cov = 10) +mut_bed <- read_bedmethyl("mut_sample.bed.gz", min_cov = 10) + +# Bind with condition labels +bed_summary <- bind_rows( + mutate(wt_bed, condition = "wt"), + mutate(mut_bed, condition = "mut") +) + +# Compute delta using percent_mod (0-100 scale) +bed_delta <- compute_bcerror_delta( + bed_summary, + delta = wt - mut, + value_col = "percent_mod" +) + +# Prepare and plot +bed_heatmap <- prep_mod_heatmap( + bed_delta, + value_col = "delta", + sprinzl_coords = sprinzl +) + +plot_mod_heatmap( + bed_heatmap, + value_col = "delta", + ref_col = "trna_label", + color_limits = c(-50, 50), + fill_name = "\u0394 % modified" +) +``` + ## Session info ```{r}