From 61ee44ae7cc559ba27636a6fc74a694a404f7c6d Mon Sep 17 00:00:00 2001
From: Joaquin Bejar
Date: Sat, 25 Apr 2026 13:24:51 +0200
Subject: [PATCH] feat(utils): add CountingAllocator behind alloc-counters
 feature

Wraps any inner GlobalAlloc and tracks four AtomicU64 counters
(allocs / deallocs / bytes_allocated / bytes_deallocated). Bench / test
binaries opt in via:

    use orderbook_rs::CountingAllocator;
    use std::alloc::System;

    #[global_allocator]
    static A: CountingAllocator = CountingAllocator::new(System);

The library rlib does not install a global allocator. The wrapper
exists so bench / budget-test binaries can measure hot-path allocation
without forcing a global choice.

New bench at benches/order_book/alloc_count.rs reports allocs_per_op +
bytes_alloc/op for the mixed 70/20/10 workload (200k warmup + 1M
measured) and writes a markdown summary to target/alloc-counters/.

New integration test at tests/alloc_budget.rs (its own [[test]] binary,
gated on alloc-counters) asserts allocs/op < 10 over 10 000 mixed ops
as a CI regression guard.

mod utils is now pub mod utils so the new types are reachable via the
canonical orderbook_rs::utils path. counting_allocator carries a
documented #[allow(unsafe_code)] exception confined to the GlobalAlloc
trait boundary. BENCH.md gains an Allocation profile section.
CHANGELOG and lib.rs updated.

Closes #58.
---
 BENCH.md                                 |  45 ++++++
 CHANGELOG.md                             |  36 +++++
 Cargo.toml                               |  12 ++
 README.md                                |  16 ++
 benches/order_book/alloc_count.rs        | 135 ++++++++++++++++
 benches/order_book/mixed_70_20_10_hdr.rs |  11 +-
 src/lib.rs                               |  20 ++-
 src/utils/counting_allocator.rs          | 186 +++++++++++++++++++++++
 src/utils/mod.rs                         |   6 +
 tests/alloc_budget.rs                    | 100 ++++++++++++
 tests/unit/replay_determinism.rs         |  10 +-
 11 files changed, 570 insertions(+), 7 deletions(-)
 create mode 100644 benches/order_book/alloc_count.rs
 create mode 100644 src/utils/counting_allocator.rs
 create mode 100644 tests/alloc_budget.rs

diff --git a/BENCH.md b/BENCH.md
index bc88217..96264f6 100644
--- a/BENCH.md
+++ b/BENCH.md
@@ -8,6 +8,51 @@ that Criterion does well.
 The HDR benches are the source of truth for the **tail** numbers
 (`p50` / `p99` / `p99.9` / `p99.99`) that tier-one electronic
 exchanges quote in SLOs.
 
+## Allocation profile (feature `alloc-counters`)
+
+Under the `alloc-counters` feature the crate exposes a
+`CountingAllocator` wrapper that tracks
+`allocs` / `deallocs` / `bytes_allocated` / `bytes_deallocated` as
+`AtomicU64` counters. Bench / test binaries opt in via:
+
+```rust
+use orderbook_rs::CountingAllocator;
+use std::alloc::System;
+
+#[global_allocator]
+static A: CountingAllocator = CountingAllocator::new(System);
+```
+
+`benches/order_book/alloc_count.rs` runs the same mixed 70 / 20 / 10
+workload as `mixed_70_20_10_hdr` but reports `allocs_per_op` and
+`bytes_alloc/op` over the measurement window (200 000 warmup +
+1 000 000 measured). A reference run on the same M4 Max host:
+
+| counter        | value         |
+|----------------|---------------|
+| allocs         | 17 757 222    |
+| deallocs       | 17 690 635    |
+| bytes_alloc    | 4 926 064 834 |
+| bytes_dealloc  | 4 897 062 482 |
+| **allocs/op**  | **17.76**     |
+| bytes_alloc/op | 4 926         |
+
+This is the headline number for "what does the matching engine cost
+in alloc pressure on a realistic workload" — far more useful as a
+regression signal than as an absolute target. The integration test
+`tests/alloc_budget.rs` runs a smaller 10 000-op slice and asserts
+`allocs/op < 10` to catch order-of-magnitude regressions in CI.
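+
+To budget your own slice of the hot path, diff two snapshots around
+the measured region. A minimal sketch, assuming `A` is the wrapper
+installed above; `run_my_workload()` and `OPS` are placeholders for
+your own measured loop:
+
+```rust
+// Counters are process-global and only ever increase, so a
+// before/after pair isolates exactly the measured region.
+let before = A.snapshot();
+run_my_workload(); // OPS operations
+let delta = A.snapshot().since(before);
+println!("allocs/op = {:.4}", delta.allocs as f64 / OPS as f64);
+```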
+
+Run yourself:
+
+```bash
+cargo bench --features alloc-counters --bench alloc_count
+cargo test --features alloc-counters alloc_budget
+```
+
+Per-run summaries land in `target/alloc-counters/<scenario>.md`.
+
 ## How to run
 
 ```bash
diff --git a/CHANGELOG.md b/CHANGELOG.md
index 0313335..6e39966 100644
--- a/CHANGELOG.md
+++ b/CHANGELOG.md
@@ -11,6 +11,42 @@ and this project adheres to [Semantic Versioning](https://semver.org/spec/v2.0.0
 > below group changes by feature; everything ships in the same
 > 0.7.0 publish.
 
+### Added — feature-gated allocation counter (#58)
+
+- **New feature `alloc-counters`** (default off). Exposes
+  `CountingAllocator` and `AllocSnapshot` at the
+  crate root, layering four `AtomicU64` counters (`allocs`,
+  `deallocs`, `bytes_allocated`, `bytes_deallocated`) on top of any
+  inner allocator. Bench / test binaries opt in by installing the
+  wrapper as `#[global_allocator]`.
+- **Bench `alloc_count`** at `benches/order_book/alloc_count.rs`
+  (also feature-gated) runs the mixed 70 / 20 / 10 workload, prints
+  `allocs_per_op` + `bytes_alloc/op` to stdout, and writes a small
+  markdown summary to `target/alloc-counters/<scenario>.md`.
+- **Integration test `alloc_budget`** at
+  `tests/alloc_budget.rs` runs 10 000 mixed ops and
+  asserts `allocs/op < 10` — a conservative ceiling tuned to catch
+  order-of-magnitude regressions in CI, not to certify zero.
+- **`BENCH.md`** gains an "Allocation profile" section with the
+  workflow + a reference number from a single M4 Max run.
+- **`mod utils` made `pub mod utils`** so the new types are
+  reachable via `orderbook_rs::utils::CountingAllocator` as well as
+  the crate-root re-export. Existing `pub use utils::current_time_millis`
+  unchanged.
+
+### Notes — alloc counter
+
+- The library `rlib` does **not** install a `#[global_allocator]` —
+  consumers pick their own (`jemalloc`, `mimalloc`, system, …). The
+  wrapper exists to give bench / test binaries a measurement hook
+  without forcing a global choice on the library.
+- `counting_allocator.rs` carries a documented
+  `#[allow(unsafe_code)]` exception to the crate's
+  `#![deny(unsafe_code)]` policy because Rust's `GlobalAlloc` trait
+  requires `unsafe impl`. The exception is gated on the feature flag
+  and confined to the wrapper module; every `unsafe` block
+  delegates immediately to the inner allocator.
+
 ### Added — HDR-histogram tail-latency bench suite (#56)
 
 - **Six new bench binaries** under `benches/order_book/*_hdr.rs` that
diff --git a/Cargo.toml b/Cargo.toml
index 0ad2112..4b93ac4 100644
--- a/Cargo.toml
+++ b/Cargo.toml
@@ -55,6 +55,7 @@ special_orders = []
 nats = ["dep:async-nats", "dep:bytes"]
 bincode = ["dep:bincode"]
 journal = ["dep:crc32fast", "dep:memmap2"]
+alloc-counters = []
 
 [dev-dependencies]
 criterion = { version = "0.8", features = ["html_reports"] }
@@ -98,10 +99,21 @@ name = "mass_cancel_burst_hdr"
 path = "benches/order_book/mass_cancel_burst_hdr.rs"
 harness = false
 
+[[bench]]
+name = "alloc_count"
+path = "benches/order_book/alloc_count.rs"
+harness = false
+required-features = ["alloc-counters"]
+
 [[test]]
 name = "tests"
 path = "tests/unit/mod.rs"
 
+[[test]]
+name = "alloc_budget"
+path = "tests/alloc_budget.rs"
+required-features = ["alloc-counters"]
+
 [lib]
 name = "orderbook_rs"

diff --git a/README.md b/README.md
index 00582b5..8e99024 100644
--- a/README.md
+++ b/README.md
@@ -48,6 +48,22 @@ This order book engine is built with the following design principles:
 
 ### What's New in Version 0.7.0
 
+#### v0.7.0 — Feature-gated allocation counter
+
+- **New feature `alloc-counters`** (default off). Exposes
+  `CountingAllocator` and `AllocSnapshot` at the crate root.
+  Wraps any inner `GlobalAlloc` (`std::alloc::GlobalAlloc`) and
+  tracks four `AtomicU64` counters: `allocs`, `deallocs`,
+  `bytes_allocated`, `bytes_deallocated`.
+- Bench / test binaries opt in via
+  `#[global_allocator] static A: CountingAllocator = ...`.
+  The library `rlib` does **not** install a global allocator.
+- **`alloc_count`** bench + **`alloc_budget`** integration
+  test run the mixed 70/20/10 workload; the bench reports
+  `allocs_per_op`, the test asserts a conservative ceiling for
+  regression detection.
+- **`BENCH.md`** gains an "Allocation profile" section.
+
 #### v0.7.0 — HDR-histogram tail-latency bench suite
 
 - **Six new `*_hdr` bench binaries** under
diff --git a/benches/order_book/alloc_count.rs b/benches/order_book/alloc_count.rs
new file mode 100644
index 0000000..c763ccd
--- /dev/null
+++ b/benches/order_book/alloc_count.rs
@@ -0,0 +1,135 @@
+// alloc_count — feature-gated allocation profile of the mixed
+// 70/20/10 hot-path workload. Reports `allocs_per_op` and a
+// per-counter delta over a measurement window.
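+//
+// Measurement is a plain snapshot delta around the measured loop in
+// `main` below:
+//
+//     let before = GLOBAL.snapshot();
+//     /* MEASURED_OPS mixed ops */
+//     let delta = GLOBAL.snapshot().since(before);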
+//
+// Build / run:
+//
+//     cargo bench --features alloc-counters --bench alloc_count
+
+#![cfg(feature = "alloc-counters")]
+
+#[path = "hdr_common.rs"]
+mod common;
+
+use orderbook_rs::utils::CountingAllocator;
+use std::alloc::System;
+
+#[global_allocator]
+static GLOBAL: CountingAllocator = CountingAllocator::new(System);
+
+use common::{Rng, pick_owner, pick_side};
+use pricelevel::{Id, TimeInForce};
+
+const SCENARIO: &str = "alloc_count_mixed_70_20_10";
+const WARMUP_OPS: u64 = 200_000;
+const MEASURED_OPS: u64 = 1_000_000;
+const SEED: u64 = 0xA5A5_A5A5_A5A5_A5A5;
+
+#[derive(Clone, Copy)]
+enum Op {
+    Submit,
+    Cancel,
+    Aggressive,
+}
+
+fn pick_op(rng: &mut Rng) -> Op {
+    let v = rng.next() % 100;
+    if v < 70 {
+        Op::Submit
+    } else if v < 90 {
+        Op::Cancel
+    } else {
+        Op::Aggressive
+    }
+}
+
+fn apply(book: &orderbook_rs::OrderBook<()>, rng: &mut Rng, next_id: &mut u64, op: Op) {
+    match op {
+        Op::Submit => {
+            let id = Id::from_u64(*next_id);
+            *next_id += 1;
+            let price = rng.range(common::PRICE_LO, common::PRICE_HI) as u128;
+            let qty = rng.range(common::QTY_LO, common::QTY_HI);
+            let _ = book.add_limit_order_with_user(
+                id,
+                price,
+                qty,
+                pick_side(rng),
+                TimeInForce::Gtc,
+                pick_owner(rng),
+                None,
+            );
+        }
+        Op::Cancel => {
+            if *next_id > 1 {
+                let target = rng.range(1, *next_id - 1);
+                let _ = book.cancel_order(Id::from_u64(target));
+            }
+        }
+        Op::Aggressive => {
+            let id = Id::from_u64(*next_id);
+            *next_id += 1;
+            let qty = rng.range(1, 10);
+            let _ = book.submit_market_order_with_user(id, qty, pick_side(rng), pick_owner(rng));
+        }
+    }
+}
+
+fn main() {
+    let book = common::fresh_book();
+    let mut rng = Rng::new(SEED);
+    let mut next_id: u64 = 1;
+
+    // Warmup — discarded.
+    for _ in 0..WARMUP_OPS {
+        let op = pick_op(&mut rng);
+        apply(&book, &mut rng, &mut next_id, op);
+    }
+
+    // Capture pre-measurement counters.
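+    // Warmup allocations (e.g. `DashMap` shard growth while the book
+    // fills) happened before this snapshot, so they drop out of the
+    // `after.since(before)` delta reported below.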
+    let before = GLOBAL.snapshot();
+
+    for _ in 0..MEASURED_OPS {
+        let op = pick_op(&mut rng);
+        apply(&book, &mut rng, &mut next_id, op);
+    }
+
+    let after = GLOBAL.snapshot();
+    let delta = after.since(before);
+
+    let allocs_per_op = delta.allocs as f64 / MEASURED_OPS as f64;
+    let bytes_per_op = delta.bytes_allocated as f64 / MEASURED_OPS as f64;
+
+    println!("scenario       : {SCENARIO}");
+    println!("warmup ops     : {WARMUP_OPS}");
+    println!("measured ops   : {MEASURED_OPS}");
+    println!("allocs         : {}", delta.allocs);
+    println!("deallocs       : {}", delta.deallocs);
+    println!("bytes_alloc    : {}", delta.bytes_allocated);
+    println!("bytes_dealloc  : {}", delta.bytes_deallocated);
+    println!("allocs/op      : {allocs_per_op:.4}");
+    println!("bytes_alloc/op : {bytes_per_op:.2}");
+
+    let summary = format!(
+        "# {SCENARIO}\n\
+         \n\
+         | counter         | value                |\n\
+         |-----------------|----------------------|\n\
+         | warmup_ops      | {WARMUP_OPS}         |\n\
+         | measured_ops    | {MEASURED_OPS}       |\n\
+         | allocs          | {}                   |\n\
+         | deallocs        | {}                   |\n\
+         | bytes_alloc     | {}                   |\n\
+         | bytes_dealloc   | {}                   |\n\
+         | allocs/op       | {allocs_per_op:.4}   |\n\
+         | bytes_alloc/op  | {bytes_per_op:.2}    |\n",
+        delta.allocs, delta.deallocs, delta.bytes_allocated, delta.bytes_deallocated,
+    );
+    let _ = std::fs::create_dir_all("target/alloc-counters");
+    let path = format!("target/alloc-counters/{SCENARIO}.md");
+    if let Err(e) = std::fs::write(&path, summary) {
+        eprintln!("could not write {path}: {e}");
+    } else {
+        eprintln!("wrote {path}");
+    }
+}
diff --git a/benches/order_book/mixed_70_20_10_hdr.rs b/benches/order_book/mixed_70_20_10_hdr.rs
index f783baf..7df1355 100644
--- a/benches/order_book/mixed_70_20_10_hdr.rs
+++ b/benches/order_book/mixed_70_20_10_hdr.rs
@@ -20,10 +20,13 @@ enum Op {
 }
 
 fn pick_op(rng: &mut Rng) -> Op {
-    match rng.next() % 100 {
-        0..70 => Op::Submit,
-        70..90 => Op::Cancel,
-        _ => Op::Aggressive,
+    let v = rng.next() % 100;
+    if v < 70 {
+        Op::Submit
+    } else if v < 90 {
+        Op::Cancel
+    } else {
+        Op::Aggressive
     }
 }
 
diff --git a/src/lib.rs b/src/lib.rs
index c61d0bc..f54cc90 100644
--- a/src/lib.rs
+++ b/src/lib.rs
@@ -34,6 +34,22 @@
 //!
 //! ## What's New in Version 0.7.0
 //!
+//! ### v0.7.0 — Feature-gated allocation counter
+//!
+//! - **New feature `alloc-counters`** (default off). Exposes
+//!   [`CountingAllocator`] and [`AllocSnapshot`] at the crate root.
+//!   Wraps any inner [`GlobalAlloc`](std::alloc::GlobalAlloc) and
+//!   tracks four `AtomicU64` counters: `allocs`, `deallocs`,
+//!   `bytes_allocated`, `bytes_deallocated`.
+//! - Bench / test binaries opt in via
+//!   `#[global_allocator] static A: CountingAllocator = ...`.
+//!   The library `rlib` does **not** install a global allocator.
+//! - **`alloc_count`** bench + **`alloc_budget`** integration
+//!   test run the mixed 70/20/10 workload; the bench reports
+//!   `allocs_per_op`, the test asserts a conservative ceiling for
+//!   regression detection.
+//! - **`BENCH.md`** gains an "Allocation profile" section.
+//!
 //! ### v0.7.0 — HDR-histogram tail-latency bench suite
 //!
 //! - **Six new `*_hdr` bench binaries** under
@@ -395,7 +411,7 @@ pub mod orderbook;
 
 pub mod prelude;
 
-mod utils;
+pub mod utils;
 
 #[cfg(feature = "bincode")]
 pub use orderbook::BincodeEventSerializer;
@@ -431,6 +447,8 @@ pub use orderbook::{
     FeeSchedule, ManagerError, MassCancelResult, OrderBook, OrderBookError, OrderBookSnapshot,
 };
 pub use utils::current_time_millis;
+#[cfg(feature = "alloc-counters")]
+pub use utils::{AllocSnapshot, CountingAllocator};
 
 /// Legacy type alias for `OrderBook<()>` to maintain backward compatibility.
 ///
diff --git a/src/utils/counting_allocator.rs b/src/utils/counting_allocator.rs
new file mode 100644
index 0000000..6991fec
--- /dev/null
+++ b/src/utils/counting_allocator.rs
@@ -0,0 +1,186 @@
+//! Process-global counting allocator for hot-path allocation budgeting.
+//!
+//! Behind the `alloc-counters` feature flag. Wraps an inner
+//! [`GlobalAlloc`] implementation (`std::alloc::System` by default) and
+//! tracks four `AtomicU64` counters: total allocations, total
+//! deallocations, total bytes allocated, total bytes deallocated.
+//!
+//! ## Usage
+//!
+//! Bench / test binaries opt in by installing the allocator at the
+//! crate root:
+//!
+//! ```ignore
+//! use orderbook_rs::utils::CountingAllocator;
+//! use std::alloc::System;
+//!
+//! #[global_allocator]
+//! static A: CountingAllocator = CountingAllocator::new(System);
+//! ```
+//!
+//! and read the counters via [`CountingAllocator::allocs`] etc.
+//!
+//! The library's `rlib` itself does **not** install the allocator —
+//! consumers pick their own (`jemalloc`, `mimalloc`, system, …). The
+//! wrapper exists to give bench and budget-test binaries a measurement
+//! hook without forcing a global choice on the library.
+//!
+//! ## Why `unsafe`
+//!
+//! Implementing [`GlobalAlloc`] requires `unsafe impl` per Rust's
+//! allocator protocol. The crate's top-level `#![deny(unsafe_code)]`
+//! attribute would otherwise reject this module; `#[allow(unsafe_code)]`
+//! is applied here as the documented exception. The `unsafe` blocks
+//! exist only at the `GlobalAlloc` trait boundary (`alloc`, `dealloc`,
+//! `alloc_zeroed`, `realloc`); every block delegates immediately to
+//! the inner allocator after updating the counters.
+
+#![allow(unsafe_code)]
+
+use std::alloc::{GlobalAlloc, Layout, System};
+use std::sync::atomic::{AtomicU64, Ordering};
+
+/// Snapshot of the counters at a point in time.
+#[derive(Debug, Clone, Copy, Default, PartialEq, Eq)]
+pub struct AllocSnapshot {
+    /// Total `alloc` / `alloc_zeroed` calls observed since process
+    /// start.
+    pub allocs: u64,
+    /// Total `dealloc` calls observed since process start.
+    pub deallocs: u64,
+    /// Sum of `Layout::size()` across every observed allocation.
+    pub bytes_allocated: u64,
+    /// Sum of `Layout::size()` across every observed deallocation.
+    pub bytes_deallocated: u64,
+}
+
+impl AllocSnapshot {
+    /// Return the per-counter delta from `earlier` to `self` (e.g.
+    /// "allocs after warmup → allocs at end of measurement window").
+    #[inline]
+    #[must_use]
+    pub fn since(self, earlier: Self) -> Self {
+        Self {
+            allocs: self.allocs.saturating_sub(earlier.allocs),
+            deallocs: self.deallocs.saturating_sub(earlier.deallocs),
+            bytes_allocated: self.bytes_allocated.saturating_sub(earlier.bytes_allocated),
+            bytes_deallocated: self
+                .bytes_deallocated
+                .saturating_sub(earlier.bytes_deallocated),
+        }
+    }
+}
+
+/// Wrapping allocator that increments per-call counters before
+/// delegating to the inner allocator.
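+///
+/// Counter updates and reads use `Ordering::Relaxed`: each total is
+/// exact, but a snapshot taken while other threads allocate is not an
+/// atomically consistent cut across the four counters.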
+///
+/// `Inner` is typically `std::alloc::System`. `CountingAllocator` is a
+/// generic wrapper so callers can layer it on top of any custom
+/// allocator they already use.
+pub struct CountingAllocator<Inner = System> {
+    inner: Inner,
+    allocs: AtomicU64,
+    deallocs: AtomicU64,
+    bytes_allocated: AtomicU64,
+    bytes_deallocated: AtomicU64,
+}
+
+impl<Inner> CountingAllocator<Inner> {
+    /// Construct a new counting allocator wrapping `inner`. `const fn`
+    /// so it works as the initialiser of a `static` `#[global_allocator]`.
+    pub const fn new(inner: Inner) -> Self {
+        Self {
+            inner,
+            allocs: AtomicU64::new(0),
+            deallocs: AtomicU64::new(0),
+            bytes_allocated: AtomicU64::new(0),
+            bytes_deallocated: AtomicU64::new(0),
+        }
+    }
+
+    /// Total number of allocations observed since process start.
+    #[inline]
+    pub fn allocs(&self) -> u64 {
+        self.allocs.load(Ordering::Relaxed)
+    }
+
+    /// Total number of deallocations observed since process start.
+    #[inline]
+    pub fn deallocs(&self) -> u64 {
+        self.deallocs.load(Ordering::Relaxed)
+    }
+
+    /// Total bytes allocated since process start.
+    #[inline]
+    pub fn bytes_allocated(&self) -> u64 {
+        self.bytes_allocated.load(Ordering::Relaxed)
+    }
+
+    /// Total bytes deallocated since process start.
+    #[inline]
+    pub fn bytes_deallocated(&self) -> u64 {
+        self.bytes_deallocated.load(Ordering::Relaxed)
+    }
+
+    /// Capture the four counters into a single struct.
+    #[inline]
+    pub fn snapshot(&self) -> AllocSnapshot {
+        AllocSnapshot {
+            allocs: self.allocs(),
+            deallocs: self.deallocs(),
+            bytes_allocated: self.bytes_allocated(),
+            bytes_deallocated: self.bytes_deallocated(),
+        }
+    }
+}
+
+// SAFETY: `GlobalAlloc` is an unsafe trait. Each method below is
+// implemented as: increment a counter with `Ordering::Relaxed`, then
+// delegate to the inner allocator. The inner allocator's safety
+// requirements are forwarded verbatim — every `unsafe` block here only
+// calls into the inner allocator's `alloc` / `dealloc` / `realloc` /
+// `alloc_zeroed` with the same `layout` / `ptr` the caller passed to
+// us. The atomic counter writes are safe (no `unsafe` needed for
+// `fetch_add`).
+unsafe impl<Inner: GlobalAlloc> GlobalAlloc for CountingAllocator<Inner> {
+    unsafe fn alloc(&self, layout: Layout) -> *mut u8 {
+        self.allocs.fetch_add(1, Ordering::Relaxed);
+        self.bytes_allocated
+            .fetch_add(layout.size() as u64, Ordering::Relaxed);
+        // SAFETY: forwarded `layout` is whatever the caller supplied to
+        // `<Self as GlobalAlloc>::alloc`; the inner
+        // allocator's safety contract is the same.
+        unsafe { self.inner.alloc(layout) }
+    }
+
+    unsafe fn dealloc(&self, ptr: *mut u8, layout: Layout) {
+        self.deallocs.fetch_add(1, Ordering::Relaxed);
+        self.bytes_deallocated
+            .fetch_add(layout.size() as u64, Ordering::Relaxed);
+        // SAFETY: the caller of `<Self as GlobalAlloc>::dealloc`
+        // already promised `ptr` was returned by a prior `alloc` /
+        // `alloc_zeroed` / `realloc` on the same allocator instance.
+        unsafe { self.inner.dealloc(ptr, layout) }
+    }
+
+    unsafe fn alloc_zeroed(&self, layout: Layout) -> *mut u8 {
+        self.allocs.fetch_add(1, Ordering::Relaxed);
+        self.bytes_allocated
+            .fetch_add(layout.size() as u64, Ordering::Relaxed);
+        // SAFETY: same as `alloc`.
+        unsafe { self.inner.alloc_zeroed(layout) }
+    }
+
+    unsafe fn realloc(&self, ptr: *mut u8, layout: Layout, new_size: usize) -> *mut u8 {
+        // Realloc counts as one alloc + one dealloc with size deltas.
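+        // Net effect: `allocs - deallocs` is unchanged, and
+        // `bytes_allocated - bytes_deallocated` moves by exactly
+        // `new_size - layout.size()`, so live-byte estimates stay
+        // consistent across reallocations.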
+        self.allocs.fetch_add(1, Ordering::Relaxed);
+        self.deallocs.fetch_add(1, Ordering::Relaxed);
+        self.bytes_allocated
+            .fetch_add(new_size as u64, Ordering::Relaxed);
+        self.bytes_deallocated
+            .fetch_add(layout.size() as u64, Ordering::Relaxed);
+        // SAFETY: forwarded `ptr` / `layout` / `new_size` are the
+        // caller's — the inner allocator's contract is the same.
+        unsafe { self.inner.realloc(ptr, layout, new_size) }
+    }
+}
diff --git a/src/utils/mod.rs b/src/utils/mod.rs
index bc8ccda..14bccc8 100644
--- a/src/utils/mod.rs
+++ b/src/utils/mod.rs
@@ -3,3 +3,9 @@ mod time;
 mod tests;
 
 pub use time::current_time_millis;
+
+#[cfg(feature = "alloc-counters")]
+pub mod counting_allocator;
+
+#[cfg(feature = "alloc-counters")]
+pub use counting_allocator::{AllocSnapshot, CountingAllocator};
diff --git a/tests/alloc_budget.rs b/tests/alloc_budget.rs
new file mode 100644
index 0000000..f9e7905
--- /dev/null
+++ b/tests/alloc_budget.rs
@@ -0,0 +1,100 @@
+//! Allocation-budget regression test for the mixed hot-path workload.
+//!
+//! Feature-gated on `alloc-counters`. Runs 10 000 mixed ops after a
+//! 1 000-op warmup and asserts the per-op allocation count stays
+//! below a conservative ceiling tuned to catch regressions, **not**
+//! to certify zero — `DashMap` + `SkipMap` allocate during bucket
+//! growth on early submissions and that is fine.
+//!
+//! The ceiling is intentionally loose so the test does not flake on
+//! shard-grow events or platform-specific allocator behaviour. A
+//! precise allocs-per-op floor belongs in the bench output; this
+//! integration test is the coarse CI guard.
+
+#![cfg(feature = "alloc-counters")]
+
+use orderbook_rs::OrderBook;
+use orderbook_rs::utils::CountingAllocator;
+use pricelevel::{Hash32, Id, Side, TimeInForce};
+use std::alloc::System;
+
+#[global_allocator]
+static GLOBAL: CountingAllocator = CountingAllocator::new(System);
+
+const WARMUP_OPS: u64 = 1_000;
+const MEASURED_OPS: u64 = 10_000;
+// Conservative ceiling. The mixed workload allocates per-op via
+// `DashMap` shard growth on early submissions plus per-resting-order
+// `Arc` allocations. Real engines hit ~1-2 allocs/op
+// amortised; this ceiling fires only on a 5x or worse regression.
+const ALLOCS_PER_OP_CEILING: f64 = 10.0;
+
+fn account(byte: u8) -> Hash32 {
+    let mut bytes = [0u8; 32];
+    bytes[0] = byte;
+    Hash32::new(bytes)
+}
+
+fn run_workload(book: &OrderBook<()>, count: u64, base: u64) {
+    let acct = account(1);
+    for i in 0..count {
+        let id = Id::from_u64(base + i);
+        let bucket = (base + i) % 5;
+        match bucket {
+            0..=2 => {
+                let _ = book.add_limit_order_with_user(
+                    id,
+                    100 + (bucket as u128),
+                    1 + (i % 10),
+                    Side::Buy,
+                    TimeInForce::Gtc,
+                    acct,
+                    None,
+                );
+            }
+            3 => {
+                let target = Id::from_u64(base + i.saturating_sub(1));
+                let _ = book.cancel_order(target);
+            }
+            _ => {
+                let _ = book.submit_market_order_with_user(id, 1, Side::Sell, acct);
+            }
+        }
+    }
+}
+
+#[test]
+fn alloc_budget_mixed_workload_stays_under_ceiling() {
+    let book = OrderBook::<()>::new("BUDGET");
+
+    // Seed liquidity so cancels and aggressive market orders find
+    // something to interact with.
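+    // The seeded asks sit at 100, at or below the workload's buy
+    // prices (100..=102), so submissions exercise the match path as
+    // well as pure inserts.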
+ for i in 0..50 { + let _ = book.add_limit_order_with_user( + Id::from_u64(1_000_000 + i), + 100, + 10, + Side::Sell, + TimeInForce::Gtc, + account(2), + None, + ); + } + + run_workload(&book, WARMUP_OPS, 1); + let before = GLOBAL.snapshot(); + run_workload(&book, MEASURED_OPS, WARMUP_OPS + 1); + let after = GLOBAL.snapshot(); + + let delta = after.since(before); + let allocs_per_op = delta.allocs as f64 / MEASURED_OPS as f64; + + assert!( + allocs_per_op < ALLOCS_PER_OP_CEILING, + "alloc-budget regression: {} allocs across {} ops = {:.4} allocs/op (ceiling {:.4})", + delta.allocs, + MEASURED_OPS, + allocs_per_op, + ALLOCS_PER_OP_CEILING, + ); +} diff --git a/tests/unit/replay_determinism.rs b/tests/unit/replay_determinism.rs index 65941db..897f728 100644 --- a/tests/unit/replay_determinism.rs +++ b/tests/unit/replay_determinism.rs @@ -5,7 +5,10 @@ #[cfg(feature = "journal")] mod replay_determinism { - use orderbook_rs::orderbook::sequencer::{InMemoryJournal, Journal, ReplayEngine, snapshots_match, SequencerCommand, SequencerEvent, SequencerResult}; + use orderbook_rs::orderbook::sequencer::{ + InMemoryJournal, Journal, ReplayEngine, SequencerCommand, SequencerEvent, SequencerResult, + snapshots_match, + }; use pricelevel::{Hash32, Id, OrderType, Price, Quantity, Side, TimeInForce, TimestampMs}; use proptest::prelude::*; @@ -59,7 +62,10 @@ mod replay_determinism { // Snapshots should match structurally (via snapshots_match oracle). let snap1 = book1.create_snapshot(usize::MAX); let snap2 = book2.create_snapshot(usize::MAX); - assert!(snapshots_match(&snap1, &snap2), "replayed snapshots should match"); + assert!( + snapshots_match(&snap1, &snap2), + "replayed snapshots should match" + ); } /// Proptest: random sequence of adds deterministically replays.