From e18fbaea041c1f3d6a60db1661e184bb5be21dc2 Mon Sep 17 00:00:00 2001
From: Jay Hesselberth <jay.hesselberth@gmail.com>
Date: Thu, 26 Mar 2026 08:01:04 -0600
Subject: [PATCH] feat: add modkit integration for heatmap workflow

Add read_bedmethyl() and summarize_mod_calls() to convert modkit output
into formats compatible with the existing heatmap pipeline. Add vignette
section demonstrating both workflows.
---
 NAMESPACE                         |   2 +
 NEWS.md                           |   4 ++
 R/read-modkit.R                   | 108 ++++++++++++++++++++++++++++++
 man/build_coldata.Rd              |   2 +-
 man/read_bedmethyl.Rd             |  25 +++++++
 man/run_deseq.Rd                  |   2 +-
 man/summarize_mod_calls.Rd        |  29 ++++++++
 pkgdown/_pkgdown.yml              |   1 +
 tests/testthat/test-read-modkit.R |  75 +++++++++++++++++++++
 vignettes/articles/heatmap.Rmd    |  94 ++++++++++++++++++++++++++
 10 files changed, 340 insertions(+), 2 deletions(-)
 create mode 100644 R/read-modkit.R
 create mode 100644 man/read_bedmethyl.Rd
 create mode 100644 man/summarize_mod_calls.Rd
 create mode 100644 tests/testthat/test-read-modkit.R

diff --git a/NAMESPACE b/NAMESPACE
index 6cb9a62..8ffc9a4 100644
--- a/NAMESPACE
+++ b/NAMESPACE
@@ -49,6 +49,7 @@ export(prepare_rewiring_matrix)
 export(propagate_error_diff)
 export(propagate_error_ratio)
 export(read_bcerror)
+export(read_bedmethyl)
 export(read_charging)
 export(read_charging_multi)
 export(read_counts)
@@ -65,6 +66,7 @@ export(structure_html)
 export(structure_organisms)
 export(structure_to_png)
 export(structure_trnas)
+export(summarize_mod_calls)
 export(tabulate_deseq)
 export(tidy_deseq_results)
 export(trna_regions)
diff --git a/NEWS.md b/NEWS.md
index 48b772c..d4cfa50 100644
--- a/NEWS.md
+++ b/NEWS.md
@@ -1,5 +1,9 @@
 # clover 0.0.0.9000
 
+* `read_bedmethyl()` reads per-position modification percentages from bedMethyl files produced by `modkit pileup`, enabling integration of nanopore modification data with the heatmap workflow.
+
+* `summarize_mod_calls()` summarizes per-read modification calls from `modkit extract` output into per-position modification frequencies, ready for delta computation and heatmap visualization.
+
 * `plot_abundance_charging()` gains `shorten`, `source_col`, and `error_bars` parameters. Labels are auto-shortened via `shorten_trna_names()`, an optional faceting column separates host and phage tRNAs, and horizontal error bars show `lfcSE` on significant points when present.
 
 * `plot_charging_diffs()` gains `source_col`, `label_col`, and `shorten` parameters. Labels are auto-shortened via `shorten_trna_names()` and an optional faceting column separates host and phage tRNAs.
diff --git a/R/read-modkit.R b/R/read-modkit.R
new file mode 100644
index 0000000..7f5b29b
--- /dev/null
+++ b/R/read-modkit.R
@@ -0,0 +1,108 @@
+#' Read a bedMethyl file from modkit pileup
+#'
+#' Read per-position modification percentages from a bedMethyl file
+#' produced by `modkit pileup`. The 18-column bedMethyl format includes
+#' per-position counts and percentages for each modification type.
+#'
+#' @param path Path to a bedMethyl file (may be gzipped).
+#' @param min_cov Minimum valid coverage to retain a position. Default `1`.
+#'
+#' @return A tibble with columns: `ref`, `pos`, `strand`, `mod_code`,
+#'   `n_valid`, `percent_mod`, `n_mod`, `n_canonical`.
+#'
+#' @export
+#'
+#' @examples
+#' # read_bedmethyl("sample.bedmethyl.gz")
+read_bedmethyl <- function(path, min_cov = 1) {
+  bedmethyl_cols <- c(
+    "chrom", "start", "end", "name", "score", "strand",
+    "thickStart", "thickEnd", "itemRgb", "n_valid",
+    "percent_mod", "n_mod", "n_canonical", "n_other_mod",
+    "n_delete", "n_fail", "n_diff", "n_nocall"
+  )
+
+  raw <- readr::read_tsv(
+    path,
+    col_names = bedmethyl_cols,
+    col_types = readr::cols(
+      chrom = readr::col_character(),
+      start = readr::col_integer(),
+      end = readr::col_integer(),
+      name = readr::col_character(),
+      score = readr::col_integer(),
+      strand = readr::col_character(),
+      thickStart = readr::col_integer(),
+      thickEnd = readr::col_integer(),
+      itemRgb = readr::col_character(),
+      n_valid = readr::col_integer(),
+      percent_mod = readr::col_double(),
+      n_mod = readr::col_integer(),
+      n_canonical = readr::col_integer(),
+      n_other_mod = readr::col_integer(),
+      n_delete = readr::col_integer(),
+      n_fail = readr::col_integer(),
+      n_diff = readr::col_integer(),
+      n_nocall = readr::col_integer()
+    ),
+    comment = "#",
+    show_col_types = FALSE
+  )
+
+  raw |>
+    dplyr::filter(.data$n_valid >= min_cov) |>
+    dplyr::transmute(
+      ref = .data$chrom,
+      pos = .data$start,
+      strand = .data$strand,
+      mod_code = .data$name,
+      n_valid = .data$n_valid,
+      percent_mod = .data$percent_mod,
+      n_mod = .data$n_mod,
+      n_canonical = .data$n_canonical
+    )
+}
+
+#' Summarize per-position modification frequency from modkit extract
+#'
+#' Read a `mod_calls.tsv.gz` file from `modkit extract` and compute
+#' per-position modification frequency (proportion of reads with a
+#' non-canonical call code).
+#'
+#' @param path Path to a `mod_calls.tsv.gz` file.
+#' @param refs Optional character vector of reference names to include.
+#' @param min_reads Minimum number of reads at a position to retain.
+#'   Default `1`.
+#'
+#' @return A tibble with columns: `ref`, `pos`, `n_reads`, `n_mod`,
+#'   `mod_freq`.
+#'
+#' @export
+#'
+#' @examples
+#' path <- clover_example("ecoli/mod_calls.tsv.gz")
+#' summarize_mod_calls(path)
+summarize_mod_calls <- function(path, refs = NULL, min_reads = 1) {
+  mc <- readr::read_tsv(path, show_col_types = FALSE) |>
+    dplyr::filter(.data$within_alignment == TRUE)
+
+  if (!is.null(refs)) {
+    mc <- mc |> dplyr::filter(.data$chrom %in% refs)
+  }
+
+  mc |>
+    dplyr::mutate(modified = as.integer(.data$call_code != "-")) |>
+    dplyr::summarize(
+      n_reads = dplyr::n(),
+      n_mod = sum(.data$modified),
+      .by = c("chrom", "ref_position")
+    ) |>
+    dplyr::filter(.data$n_reads >= min_reads) |>
+    dplyr::transmute(
+      ref = .data$chrom,
+      pos = .data$ref_position,
+      n_reads = .data$n_reads,
+      n_mod = .data$n_mod,
+      mod_freq = .data$n_mod / .data$n_reads
+    )
+}
diff --git a/man/build_coldata.Rd b/man/build_coldata.Rd
index 8bc2eb1..1ad3094 100644
--- a/man/build_coldata.Rd
+++ b/man/build_coldata.Rd
@@ -19,7 +19,7 @@ A data frame with row names matching \code{colnames(count_matrix)}.
 }
 \description{
 Create a \code{data.frame} of sample metadata suitable for
-\code{\link[DESeq2:DESeqDataSet]{DESeq2::DESeqDataSetFromMatrix()}}. For charging count matrices,
+\code{\link[DESeq2:DESeqDataSetFromMatrix]{DESeq2::DESeqDataSetFromMatrix()}}. For charging count matrices,
 the \code{charge_status} factor is added automatically.
 }
 \examples{
diff --git a/man/read_bedmethyl.Rd b/man/read_bedmethyl.Rd
new file mode 100644
index 0000000..e17ef3a
--- /dev/null
+++ b/man/read_bedmethyl.Rd
@@ -0,0 +1,25 @@
+% Generated by roxygen2: do not edit by hand
+% Please edit documentation in R/read-modkit.R
+\name{read_bedmethyl}
+\alias{read_bedmethyl}
+\title{Read a bedMethyl file from modkit pileup}
+\usage{
+read_bedmethyl(path, min_cov = 1)
+}
+\arguments{
+\item{path}{Path to a bedMethyl file (may be gzipped).}
+
+\item{min_cov}{Minimum valid coverage to retain a position. Default \code{1}.}
+}
+\value{
+A tibble with columns: \code{ref}, \code{pos}, \code{strand}, \code{mod_code},
+\code{n_valid}, \code{percent_mod}, \code{n_mod}, \code{n_canonical}.
+}
+\description{
+Read per-position modification percentages from a bedMethyl file
+produced by \verb{modkit pileup}. The 18-column bedMethyl format includes
+per-position counts and percentages for each modification type.
+}
+\examples{
+# read_bedmethyl("sample.bedmethyl.gz")
+}
diff --git a/man/run_deseq.Rd b/man/run_deseq.Rd
index 7039a21..7b5f7d4 100644
--- a/man/run_deseq.Rd
+++ b/man/run_deseq.Rd
@@ -21,7 +21,7 @@ or \code{\link[=charging_count_matrix]{charging_count_matrix()}}).}
 A \code{DESeqDataSet} object.
 }
 \description{
-Wrapper around \code{\link[DESeq2:DESeqDataSet]{DESeq2::DESeqDataSetFromMatrix()}} and \code{\link[DESeq2:DESeq]{DESeq2::DESeq()}}
+Wrapper around \code{\link[DESeq2:DESeqDataSetFromMatrix]{DESeq2::DESeqDataSetFromMatrix()}} and \code{\link[DESeq2:DESeq]{DESeq2::DESeq()}}
 for tRNA abundance or charging data.
 }
 \examples{
diff --git a/man/summarize_mod_calls.Rd b/man/summarize_mod_calls.Rd
new file mode 100644
index 0000000..def7c49
--- /dev/null
+++ b/man/summarize_mod_calls.Rd
@@ -0,0 +1,29 @@
+% Generated by roxygen2: do not edit by hand
+% Please edit documentation in R/read-modkit.R
+\name{summarize_mod_calls}
+\alias{summarize_mod_calls}
+\title{Summarize per-position modification frequency from modkit extract}
+\usage{
+summarize_mod_calls(path, refs = NULL, min_reads = 1)
+}
+\arguments{
+\item{path}{Path to a \code{mod_calls.tsv.gz} file.}
+
+\item{refs}{Optional character vector of reference names to include.}
+
+\item{min_reads}{Minimum number of reads at a position to retain.
+Default \code{1}.}
+}
+\value{
+A tibble with columns: \code{ref}, \code{pos}, \code{n_reads}, \code{n_mod},
+\code{mod_freq}.
+}
+\description{
+Read a \code{mod_calls.tsv.gz} file from \verb{modkit extract} and compute
+per-position modification frequency (proportion of reads with a
+non-canonical call code).
+}
+\examples{
+path <- clover_example("ecoli/mod_calls.tsv.gz")
+summarize_mod_calls(path)
+}
diff --git a/pkgdown/_pkgdown.yml b/pkgdown/_pkgdown.yml
index 53f05c3..51cb3ef 100644
--- a/pkgdown/_pkgdown.yml
+++ b/pkgdown/_pkgdown.yml
@@ -39,6 +39,7 @@ reference:
     - compute_charging_diffs
     - filter_linkages
     - compute_odds_ratios
+    - summarize_mod_calls
     - compute_ror
     - compute_ror_isodecoder
     - perform_pcoa
diff --git a/tests/testthat/test-read-modkit.R b/tests/testthat/test-read-modkit.R
new file mode 100644
index 0000000..6459ef7
--- /dev/null
+++ b/tests/testthat/test-read-modkit.R
@@ -0,0 +1,75 @@
+test_that("summarize_mod_calls returns expected structure", {
+  path <- clover_example("ecoli/mod_calls.tsv.gz")
+  res <- summarize_mod_calls(path)
+
+  expect_s3_class(res, "tbl_df")
+  expect_named(res, c("ref", "pos", "n_reads", "n_mod", "mod_freq"))
+  expect_true(all(res$mod_freq >= 0 & res$mod_freq <= 1))
+  expect_true(all(res$n_mod <= res$n_reads))
+})
+
+test_that("summarize_mod_calls filters by refs", {
+  path <- clover_example("ecoli/mod_calls.tsv.gz")
+  res <- summarize_mod_calls(path, refs = "host-tRNA-Glu-TTC-1-1")
+
+  expect_equal(unique(res$ref), "host-tRNA-Glu-TTC-1-1")
+})
+
+test_that("summarize_mod_calls filters by min_reads", {
+  path <- clover_example("ecoli/mod_calls.tsv.gz")
+  res <- summarize_mod_calls(path, min_reads = 20)
+
+  expect_true(all(res$n_reads >= 20))
+})
+
+test_that("read_bedmethyl returns expected structure", {
+  bed_lines <- c(
+    "tRNA-Glu\t10\t11\tm\t50\t+\t10\t11\t255,0,0\t50\t40.0\t20\t30\t0\t0\t0\t0\t0",
+    "tRNA-Glu\t20\t21\tm\t5\t+\t20\t21\t255,0,0\t5\t60.0\t3\t2\t0\t0\t0\t0\t0"
+  )
+  tmp <- withr::local_tempfile(fileext = ".bed")
+  writeLines(bed_lines, tmp)
+
+  res <- read_bedmethyl(tmp)
+
+  expect_s3_class(res, "tbl_df")
+  expect_named(
+    res,
+    c("ref", "pos", "strand", "mod_code", "n_valid", "percent_mod",
+      "n_mod", "n_canonical")
+  )
+  expect_equal(nrow(res), 2)
+  expect_equal(res$ref, c("tRNA-Glu", "tRNA-Glu"))
+  expect_equal(res$pos, c(10, 20))
+  expect_equal(res$mod_code, c("m", "m"))
+})
+
+test_that("read_bedmethyl filters by min_cov", {
+  bed_lines <- c(
+    "tRNA-Glu\t10\t11\tm\t50\t+\t10\t11\t255,0,0\t50\t40.0\t20\t30\t0\t0\t0\t0\t0",
+    "tRNA-Glu\t20\t21\tm\t5\t+\t20\t21\t255,0,0\t5\t60.0\t3\t2\t0\t0\t0\t0\t0"
+  )
+  tmp <- withr::local_tempfile(fileext = ".bed")
+  writeLines(bed_lines, tmp)
+
+  res <- read_bedmethyl(tmp, min_cov = 10)
+
+  expect_equal(nrow(res), 1)
+  expect_equal(res$n_valid, 50L)
+})
+
+test_that("summarize_mod_calls output works with compute_bcerror_delta", {
+  path <- clover_example("ecoli/mod_calls.tsv.gz")
+  mod1 <- summarize_mod_calls(path) |> dplyr::mutate(condition = "wt")
+  mod2 <- summarize_mod_calls(path) |> dplyr::mutate(condition = "mut")
+
+  combined <- dplyr::bind_rows(mod1, mod2)
+  delta <- compute_bcerror_delta(
+    combined,
+    delta = wt - mut,
+    value_col = "mod_freq"
+  )
+
+  expect_s3_class(delta, "tbl_df")
+  expect_true("delta" %in% names(delta))
+})
diff --git a/vignettes/articles/heatmap.Rmd b/vignettes/articles/heatmap.Rmd
index 24d1c23..4fa5644 100644
--- a/vignettes/articles/heatmap.Rmd
+++ b/vignettes/articles/heatmap.Rmd
@@ -449,6 +449,100 @@ plot_mod_landscape(
 )
 ```
 
+## Heatmaps from modkit data
+
+The heatmap workflow also supports modification data from Oxford Nanopore's
+[modkit](https://nanoporetech.github.io/modkit/) tool. Two input formats are
+supported: per-read calls from `modkit extract` and per-position summaries
+from `modkit pileup`.
+
+### From modkit extract (per-read calls)
+
+`summarize_mod_calls()` reads a `mod_calls.tsv.gz` file (from
+`modkit extract --call-code`) and computes per-position modification
+frequency --- the proportion of reads carrying a non-canonical base call
+at each position.
+
+```{r}
+#| label: modkit-extract
+#| eval: false
+# Summarize per-position modification frequency from each sample
+wt_mods <- summarize_mod_calls("wt_sample.mod_calls.tsv.gz")
+mut_mods <- summarize_mod_calls("mut_sample.mod_calls.tsv.gz")
+
+# Bind with condition labels
+mod_summary <- bind_rows(
+  mutate(wt_mods, condition = "wt"),
+  mutate(mut_mods, condition = "mut")
+)
+
+# Compute delta (reuses compute_bcerror_delta with mod_freq as value)
+mod_delta <- compute_bcerror_delta(
+  mod_summary,
+  delta = wt - mut,
+  value_col = "mod_freq"
+)
+
+# Prepare and plot (same workflow as bcerror data)
+mod_heatmap <- prep_mod_heatmap(
+  mod_delta,
+  value_col = "delta",
+  sprinzl_coords = sprinzl,
+  mods = mods
+)
+
+plot_mod_heatmap(
+  mod_heatmap,
+  value_col = "delta",
+  ref_col = "trna_label",
+  color_limits = c(-0.5, 0.5),
+  fill_name = "\u0394 mod frequency"
+)
+```
+
+### From modkit pileup (bedMethyl)
+
+`read_bedmethyl()` reads the 18-column bedMethyl format produced by
+`modkit pileup`. The `percent_mod` column contains the percentage of reads
+with a given modification at each position, which can be compared across
+conditions.
+
+```{r}
+#| label: modkit-pileup
+#| eval: false
+# Read bedMethyl files
+wt_bed <- read_bedmethyl("wt_sample.bed.gz", min_cov = 10)
+mut_bed <- read_bedmethyl("mut_sample.bed.gz", min_cov = 10)
+
+# Bind with condition labels
+bed_summary <- bind_rows(
+  mutate(wt_bed, condition = "wt"),
+  mutate(mut_bed, condition = "mut")
+)
+
+# Compute delta using percent_mod (0-100 scale)
+bed_delta <- compute_bcerror_delta(
+  bed_summary,
+  delta = wt - mut,
+  value_col = "percent_mod"
+)
+
+# Prepare and plot
+bed_heatmap <- prep_mod_heatmap(
+  bed_delta,
+  value_col = "delta",
+  sprinzl_coords = sprinzl
+)
+
+plot_mod_heatmap(
+  bed_heatmap,
+  value_col = "delta",
+  ref_col = "trna_label",
+  color_limits = c(-50, 50),
+  fill_name = "\u0394 % modified"
+)
+```
+
 ## Session info
 
 ```{r}