diff --git a/Cargo.lock b/Cargo.lock
index 455b1e9a..868e1c21 100644
--- a/Cargo.lock
+++ b/Cargo.lock
@@ -388,6 +388,15 @@ dependencies = [
  "crossbeam-utils",
 ]
 
+[[package]]
+name = "crossbeam-queue"
+version = "0.3.12"
+source = "registry+https://github.com/rust-lang/crates.io-index"
+checksum = "0f58bbc28f91df819d0aa2a2c00cd19754769c2fad90579b3592b1c9ba7a3115"
+dependencies = [
+ "crossbeam-utils",
+]
+
 [[package]]
 name = "crossbeam-utils"
 version = "0.8.21"
@@ -2979,6 +2988,7 @@ dependencies = [
  "byteorder",
  "bytes",
  "clap",
+ "crossbeam-queue",
  "dispatch2",
  "divan",
  "event-manager",
diff --git a/Cargo.toml b/Cargo.toml
index 50607e5f..af267aec 100644
--- a/Cargo.toml
+++ b/Cargo.toml
@@ -113,6 +113,11 @@ socket2 = { version = "0.5", features = ["all"] }
 # path of a NAT keyed by guest-side ports the guest itself chooses.
 rustc-hash = "2"
 
+# Lock-free MPMC queue used to hand virtio-net RX frames from the
+# net-poll thread to the vCPU thread without taking the
+# `Arc<Mutex<VirtioNetDevice>>` device lock on the hot path.
+crossbeam-queue = "0.3"
+
 # --- macOS-only dependencies ---
 [target.'cfg(target_os = "macos")'.dependencies]
 # Objective-C 2.0 bindings (auto-generated from Apple frameworks)
diff --git a/docs/passt-comparison.md b/docs/passt-comparison.md
new file mode 100644
index 00000000..89f21661
--- /dev/null
+++ b/docs/passt-comparison.md
@@ -0,0 +1,96 @@
+# passt head-to-head comparison harness
+
+Tools under `tools/perf-harness/` produce a side-by-side comparison of voidbox
+(real KVM VM + SLIRP) against passt's [`pasta`](https://passt.top/passt/about/)
+running in a network namespace.
+
+This is the deferred deliverable from
+[`docs/superpowers/plans/2026-04-27-smoltcp-passt-port.md`](superpowers/plans/2026-04-27-smoltcp-passt-port.md)
+§ "passt head-to-head methodology".
+
+## What the harness measures
+
+Both sides run the same workload shape — the same fields the
+`voidbox-network-bench` `Report` already emits:
+
+| Field | Workload |
+|---|---|
+| `tcp_throughput_g2h_mbps` | `dd if=/dev/zero bs=1M count=N \| nc HOST PORT` from inside the guest / netns; host TCP server times the drain |
+| `tcp_rr_latency_us_p50/p99` | Persistent connection, host-side echo loop bouncing one byte per round trip |
+| `tcp_crr_latency_us_p50` | Independent `nc` invocations in a tight loop; host-side timing of the full accept→read→write→close cycle |
+
+The pasta side uses `pasta -- COMMAND` to run the client inside a fresh
+network namespace.  Pasta maps its `--map-host-loopback` address (default:
+the host's gateway IP) to the host's loopback, so the client connects
+to `<host-gateway>:PORT` and reaches the host server bound on `127.0.0.1:PORT`.
+
+## What it's good for
+
+**CRR latency is the most apples-to-apples metric** — it's dominated by
+NAT-table operations and the round-trip path through the user-mode
+networking stack, which is the same code on both sides.  Per the spec:
+
+> Connect rate (CRR latency) is the most apples-to-apples metric —
+> dominated by NAT-table operations, not MMIO. If passt does CRR in 135 µs
+> and we do 600 µs, that's a meaningful "we have 4× more overhead per
+> connect" signal that this refactor should narrow.
+
+## What it's not
+
+**Throughput numbers are not directly comparable.**
+
+- voidbox runs a real KVM VM; every packet incurs `virtio-mmio`
+  exits, vCPU IPI overhead, and per-packet copy across the device
+  boundary.
+- pasta runs in a network namespace; the data path is just user-mode
+  socket forwarding, no VM, no MMIO.
+
+voidbox's per-packet cost is therefore the *user-mode overhead the two
+stacks share* plus *the VM transit cost only voidbox pays*.
+Use the throughput numbers as a sanity bound, not a parity target.
+
+A proper VM-vs-VM comparison would run passt under
+`qemu-system-x86_64` with a guest image carrying `nc` / `iperf3`.
+That is documented as a separate follow-up; the harness here is the
+quick, low-friction sibling that exercises the apples-to-apples
+metric (CRR) without requiring an extra guest image.
+
+## Usage
+
+```bash
+# Generate voidbox numbers (requires VOID_BOX_KERNEL/VOID_BOX_INITRAMFS).
+cargo run --release --bin voidbox-network-bench -- \
+    --iterations 3 --output /tmp/voidbox-bench.json
+
+# Generate pasta numbers (requires pasta on PATH or via $PASTA).
+tools/perf-harness/bench-pasta.py --output /tmp/pasta-bench.json
+
+# Side-by-side markdown.
+tools/perf-harness/bench-compare-pasta.py /tmp/voidbox-bench.json /tmp/pasta-bench.json \
+    --output /tmp/voidbox-vs-pasta.md
+
+# qemu+libslirp / qemu+passt CRR (apples-to-apples SLIRP-vs-SLIRP).
+gcc -O2 -static -o /tmp/crr-client tools/perf-harness/crr-client.c
+tools/perf-harness/bench-qemu-slirp.sh --backend libslirp --iterations 30
+tools/perf-harness/bench-qemu-slirp.sh --backend passt    --iterations 30
+
+# Voidbox single-process CRR (no per-iteration nc fork).
+cargo run --release --example crr_singleproc_bench -- --iterations 30
+```
+
+`tools/perf-harness/bench-pasta.py --help` lists tunables (iterations,
+transfer size, sample counts).
+
+## Reading the report
+
+| Δ column | Meaning |
+|---|---|
+| `voidbox N× faster`  (throughput) | voidbox has the higher Mbps number |
+| `voidbox N× slower`  (throughput) | pasta has the higher Mbps number — expected, since pasta has no VM |
+| `voidbox N× faster`  (latency)    | voidbox has the lower µs number |
+| `voidbox N× slower`  (latency)    | pasta has the lower µs number — large multiples here mean voidbox spends much of its CRR time outside the NAT path (poll-thread cadence, vCPU exits, virtio handling) |
+
+A useful CRR signal: if `voidbox N× slower on CRR p50` is much larger
+than `voidbox N× slower on RR p50`, the per-connection overhead is the
+bottleneck, not the data path.  RR p50 captures the data path; CRR
+captures the connect path.
diff --git a/examples/crr_singleproc_bench.rs b/examples/crr_singleproc_bench.rs
new file mode 100644
index 00000000..0b109f8d
--- /dev/null
+++ b/examples/crr_singleproc_bench.rs
@@ -0,0 +1,157 @@
+//! crr_singleproc_bench — voidbox-side N-iteration TCP CRR loop in a
+//! single guest process, isolating voidbox's NAT-path cost from the
+//! existing bench's per-iteration `nc` fork+exec overhead.
+//!
+//! NOT meant for the production bench surface; this is a one-off
+//! diagnostic that pairs with `tools/perf-harness/crr-client.c` + the
+//! pasta side of the head-to-head.  Compile and run directly:
+//!
+//!     gcc -O2 -static -o /tmp/crr-client tools/perf-harness/crr-client.c
+//!     cargo run --release --example crr_singleproc_bench -- \
+//!         --iterations 100 --bench-binary /tmp/crr-client
+//!
+//! Requires the same env vars as voidbox-network-bench:
+//!   VOID_BOX_KERNEL, VOID_BOX_INITRAMFS
+
+use std::net::TcpListener;
+use std::thread;
+use std::time::Duration;
+
+use clap::Parser;
+use void_box::backend::MountConfig;
+use void_box::sandbox::Sandbox;
+
+#[derive(Parser)]
+#[command(version, about)]
+struct Cli {
+    /// Number of CRR iterations.
+    #[arg(long, default_value_t = 100)]
+    iterations: u32,
+    /// Host path to the static crr-client binary.
+    #[arg(long, default_value = "/tmp/crr-client")]
+    bench_binary: String,
+    /// Memory size for the guest VM (MB).
+    #[arg(long, default_value_t = 1024)]
+    memory_mb: usize,
+}
+
+const HOST_LOOPBACK_FROM_GUEST: &str = "10.0.2.2";
+
+#[tokio::main(flavor = "multi_thread")]
+async fn main() -> Result<(), Box<dyn std::error::Error>> {
+    let cli = Cli::parse();
+    let bench_binary = std::path::PathBuf::from(&cli.bench_binary);
+    if !bench_binary.exists() {
+        return Err(format!(
+            "bench binary not found: {} (compile with `gcc -static -o /tmp/crr-client tools/perf-harness/crr-client.c`)",
+            cli.bench_binary
+        )
+        .into());
+    }
+    let bench_binary_dir = bench_binary
+        .parent()
+        .ok_or("bench-binary has no parent dir")?
+        .to_string_lossy()
+        .into_owned();
+    let bench_binary_name = bench_binary
+        .file_name()
+        .ok_or("bench-binary has no file name")?
+        .to_string_lossy()
+        .into_owned();
+
+    let listener = TcpListener::bind("127.0.0.1:0")?;
+    let host_port = listener.local_addr()?.port();
+    listener.set_nonblocking(true)?;
+
+    let iterations = cli.iterations;
+    let server_thread = thread::spawn(move || {
+        // Non-blocking accept with a tight poll, deadline-checked.  With
+        // a blocking accept the deadline never fires if the guest never
+        // connects (boot failure, SLIRP rate limit, etc.) and the
+        // example's later `server_thread.join()` would hang forever.
+        // The accept-pickup latency directly inflates each guest CRR
+        // sample, so the wait is kept short — `from_micros(50)` adds
+        // up to one sleep of jitter (~50 µs nominal; `sleep` may overshoot)
+        // on a ~280 µs baseline, with the deadline re-checked per wakeup.
+        let mut accepted = 0u32;
+        let deadline = std::time::Instant::now() + Duration::from_secs(120);
+        while accepted < iterations && std::time::Instant::now() < deadline {
+            match listener.accept() {
+                Ok((mut conn, _)) => {
+                    conn.set_nonblocking(false).ok();
+                    let mut buf = [0u8; 1];
+                    let _ = std::io::Read::read(&mut conn, &mut buf);
+                    let _ = std::io::Write::write_all(&mut conn, b"x");
+                    accepted += 1;
+                }
+                Err(ref e) if e.kind() == std::io::ErrorKind::WouldBlock => {
+                    thread::sleep(Duration::from_micros(50));
+                }
+                Err(_) => break,
+            }
+        }
+        accepted
+    });
+
+    let sandbox = Sandbox::local()
+        .from_env()?
+        .memory_mb(cli.memory_mb)
+        .network(true)
+        // Production SLIRP defaults (50/s rate, 64 concurrent) are
+        // sized to throttle a guest-side flood — far below what a
+        // CRR microbench wants.  Lift both ceilings so the bench
+        // exercises the steady-state NAT path, not the rate limiter.
+        .network_max_connections_per_second(u32::MAX)
+        .network_max_concurrent_connections(usize::MAX)
+        .mount(MountConfig {
+            host_path: bench_binary_dir.clone(),
+            guest_path: "/tmp/host".into(),
+            read_only: true,
+        })
+        .build()?;
+
+    eprintln!(
+        "VM booted; running {} CRRs in a single guest process...",
+        iterations
+    );
+    let probe = sandbox.exec("sh", &["-c", ":"]).await?;
+    if !probe.success() {
+        return Err("VM probe exec failed".into());
+    }
+
+    let cmd = format!(
+        "/tmp/host/{name} {host} {port} {n}",
+        name = bench_binary_name,
+        host = HOST_LOOPBACK_FROM_GUEST,
+        port = host_port,
+        n = iterations,
+    );
+    let output = sandbox.exec("sh", &["-c", &cmd]).await?;
+    let stdout = output.stdout_str().to_string();
+    let stderr = output.stderr_str().to_string();
+    if !output.success() {
+        eprintln!("guest stderr: {stderr}");
+        return Err(format!("guest exec failed: {:?}", output.exit_code).into());
+    }
+
+    let server_thread_count = server_thread.join().unwrap_or(0);
+    eprintln!("host accepts: {server_thread_count}/{iterations}");
+
+    let line = stdout.lines().next().ok_or("empty guest stdout")?;
+    let parts: Vec<&str> = line.split_whitespace().collect();
+    if parts.len() != 4 {
+        return Err(format!("unexpected guest stdout: {line:?}").into());
+    }
+    let n: u32 = parts[0].parse()?;
+    let p50_ns: u64 = parts[1].parse()?;
+    let p99_ns: u64 = parts[2].parse()?;
+    let mean_ns: u64 = parts[3].parse()?;
+
+    println!();
+    println!("voidbox single-process CRR over {n} iterations:");
+    println!("  p50:  {} µs", p50_ns / 1000);
+    println!("  p99:  {} µs", p99_ns / 1000);
+    println!("  mean: {} µs", mean_ns / 1000);
+
+    Ok(())
+}
diff --git a/src/bin/voidbox-network-bench/main.rs b/src/bin/voidbox-network-bench/main.rs
index a18ac09e..9a2cc434 100644
--- a/src/bin/voidbox-network-bench/main.rs
+++ b/src/bin/voidbox-network-bench/main.rs
@@ -192,6 +192,14 @@ FAST SMOKE RUN\n\
             .from_env()?
             .memory_mb(BENCH_MEMORY_MB)
             .network(true)
+            // Production SLIRP defaults (50 connect/s, 64 concurrent)
+            // are anti-DoS limits sized for real workloads.  The CRR
+            // bench intentionally opens hundreds of connections per
+            // second; without this lift it gets RST'd at the 51st
+            // connect, which manifests as a 2 s `crr echo channel
+            // receive error` instead of a real number.
+            .network_max_connections_per_second(u32::MAX)
+            .network_max_concurrent_connections(usize::MAX)
             .build()?;
 
         // Prime the VM (triggers boot + vsock handshake) before any timed work.
diff --git a/src/devices/virtio_net.rs b/src/devices/virtio_net.rs
index 71214d47..2c94e1c7 100644
--- a/src/devices/virtio_net.rs
+++ b/src/devices/virtio_net.rs
@@ -8,8 +8,10 @@
 //! - Integration with SLIRP stack for NAT
 //! - No root/TAP required
 
+use std::sync::atomic::{AtomicU32, Ordering};
 use std::sync::{Arc, Mutex};
 
+use crossbeam_queue::SegQueue;
 use tracing::{debug, trace, warn};
 use vm_memory::{Address, Bytes, GuestAddress, GuestMemory};
 
@@ -157,8 +159,16 @@ pub struct VirtioNetDevice {
     queue_sel: u32,
     /// Device status
     status: u32,
-    /// Interrupt status
-    interrupt_status: u32,
+    /// Interrupt status, accessed concurrently from the vCPU thread
+    /// (MMIO read of `INTERRUPT_STATUS`, MMIO write of `INTERRUPT_ACK`)
+    /// and the net-poll thread (sets bit 0 when new RX frames are
+    /// queued, polls on idle cycles).
+    ///
+    /// Wrapped in [`Arc<AtomicU32>`] so the net-poll thread can hold
+    /// its own clone and read/update the value without taking the
+    /// device mutex.  The vCPU thread accesses it via the device
+    /// guard during MMIO dispatch; both sides see the same atomic.
+    interrupt_status: Arc<AtomicU32>,
     /// Configuration generation counter
     config_generation: u32,
     /// Receive queue state
@@ -181,6 +191,21 @@ pub struct VirtioNetDevice {
     rx_avail_idx: u16,
     /// RX queue: next used index we'll write
     rx_used_idx: u16,
+    /// Lock-free queue of frames waiting to be written into the guest's
+    /// RX descriptors.  The net-poll thread pushes frames here without
+    /// taking the device lock; the vCPU thread drains them on its next
+    /// MMIO exit (via [`Self::flush_pending_rx`]) and writes the
+    /// descriptors in its own context.
+    ///
+    /// Eliminates the `Arc<Mutex<VirtioNetDevice>>` contention that
+    /// previously serialised every net-poll-side `try_inject_rx` call
+    /// against vCPU MMIO exits.
+    pending_rx: Arc<SegQueue<Vec<u8>>>,
+    /// Scratch buffer reused across `flush_pending_rx` calls so the
+    /// per-MMIO-exit `Vec<Vec<u8>>` doesn't grow from cap=0 every
+    /// time.  Heaptrack measured the previous local-Vec allocation as
+    /// 173 calls / 108 MB peak on the CRR microbench.
+    flush_scratch: Vec<Vec<u8>>,
 }
 
 impl VirtioNetDevice {
@@ -200,7 +225,7 @@ impl VirtioNetDevice {
             features_sel: 0,
             queue_sel: 0,
             status: 0,
-            interrupt_status: 0,
+            interrupt_status: Arc::new(AtomicU32::new(0)),
             config_generation: 0,
             rx_queue: QueueState {
                 num_max: 256,
@@ -218,9 +243,38 @@ impl VirtioNetDevice {
             tx_used_idx: 0,
             rx_avail_idx: 0,
             rx_used_idx: 0,
+            pending_rx: Arc::new(SegQueue::new()),
+            flush_scratch: Vec::new(),
         })
     }
 
+    /// Returns a clone of the lock-free RX frame queue Arc.
+    ///
+    /// The net-poll thread holds this clone and pushes frames to it
+    /// without ever taking the [`VirtioNetDevice`] mutex.  The vCPU
+    /// thread (which already holds the device mutex during MMIO
+    /// dispatch) drains it via [`Self::flush_pending_rx`].
+    pub fn pending_rx(&self) -> Arc<SegQueue<Vec<u8>>> {
+        Arc::clone(&self.pending_rx)
+    }
+
+    /// Returns a clone of the [`NetworkBackend`] arc.
+    ///
+    /// Lets the net-poll thread call `drain_to_guest` directly without
+    /// going through the device mutex.  Combined with [`Self::pending_rx`],
+    /// this removes the `Arc<Mutex<VirtioNetDevice>>` contention point
+    /// from the per-packet RX hot path.
+    pub fn slirp_arc(&self) -> Arc<Mutex<dyn NetworkBackend>> {
+        Arc::clone(&self.slirp)
+    }
+
+    /// Returns a clone of the [`Arc<AtomicU32>`] backing
+    /// `interrupt_status`.  The net-poll thread holds this clone and
+    /// reads/updates the ISR without ever taking the device mutex.
+    pub fn interrupt_status_arc(&self) -> Arc<AtomicU32> {
+        Arc::clone(&self.interrupt_status)
+    }
+
     /// Set the MMIO base address
     pub fn set_mmio_base(&mut self, base: u64) {
         self.mmio_base = base;
@@ -264,7 +318,7 @@ impl VirtioNetDevice {
                 let queue = self.current_queue();
                 queue.ready as u32
             }
-            mmio::INTERRUPT_STATUS => self.interrupt_status,
+            mmio::INTERRUPT_STATUS => self.interrupt_status.load(Ordering::Relaxed),
             mmio::STATUS => self.status,
             mmio::CONFIG_GENERATION => self.config_generation,
             // Device config (MAC address at offset 0x100)
@@ -339,7 +393,7 @@ impl VirtioNetDevice {
                 self.handle_queue_notify(value, guest_memory);
             }
             mmio::INTERRUPT_ACK => {
-                self.interrupt_status &= !value;
+                self.interrupt_status.fetch_and(!value, Ordering::Relaxed);
             }
             mmio::STATUS => {
                 self.status = value;
@@ -434,6 +488,17 @@ impl VirtioNetDevice {
         }
     }
 
+    /// Process the TX queue from outside the vCPU thread.
+    ///
+    /// Called by `net_poll_thread` when the KVM_IOEVENTFD registered for
+    /// the virtio-net QUEUE_NOTIFY MMIO fires.  Same body as the
+    /// synchronous TX-queue handler used from the MMIO write path,
+    /// just exposed under a different name so callers outside this
+    /// module can drive it.
+    pub fn process_tx_queue_external<M: GuestMemory + ?Sized>(&mut self, mem: &M) -> Result<()> {
+        self.process_tx_queue(mem)
+    }
+
     /// Process TX queue: read descriptor chains from guest, send frames to SLIRP, update used ring.
     fn process_tx_queue<M: GuestMemory + ?Sized>(&mut self, mem: &M) -> Result<()> {
         let q = &self.tx_queue;
@@ -451,6 +516,16 @@ impl VirtioNetDevice {
             .map_err(|e| crate::Error::Memory(e.to_string()))?;
         let avail_idx = u16::from_le_bytes(idx_buf);
 
+        let initial_tx_used_idx = self.tx_used_idx;
+
+        // Reusable per-call packet buffer.  Capacity carried across
+        // iterations within this call so chained-descriptor frames don't
+        // re-grow the buffer; cleared between frames so each
+        // process_tx_frame sees only this frame's bytes.  Pre-size to
+        // a typical MTU + virtio-net header so the common single-segment
+        // path needs no realloc.
+        let mut packet: Vec<u8> = Vec::with_capacity(1600);
+
         while self.tx_avail_idx != avail_idx {
             // Ring entry: 2 bytes, at avail_addr + 4 + (tx_avail_idx % queue_size)*2
             let ring_offset = 4 + ((self.tx_avail_idx as usize) % queue_size) * 2;
@@ -462,8 +537,7 @@ impl VirtioNetDevice {
             .map_err(|e| crate::Error::Memory(e.to_string()))?;
             let head_idx = u16::from_le_bytes(desc_id_buf) as usize;
 
-            // Walk descriptor chain and collect packet
-            let mut packet = Vec::new();
+            packet.clear();
             let mut next = head_idx;
             loop {
                 if next >= queue_size {
@@ -478,10 +552,14 @@ impl VirtioNetDevice {
                 let flags = u16::from_le_bytes(desc[12..14].try_into().unwrap());
                 let next_desc = u16::from_le_bytes(desc[14..16].try_into().unwrap()) as usize;
                 if len > 0 && addr != 0 {
-                    let mut buf = vec![0u8; len];
-                    mem.read(&mut buf, GuestAddress(addr))
+                    // Read directly into the packet's tail instead of
+                    // allocating an intermediate `Vec<u8>` and then
+                    // `extend_from_slice`-ing it in.  Saves one alloc
+                    // and one full memcpy per descriptor segment.
+                    let off = packet.len();
+                    packet.resize(off + len, 0);
+                    mem.read(&mut packet[off..off + len], GuestAddress(addr))
                         .map_err(|e| crate::Error::Memory(e.to_string()))?;
-                    packet.extend_from_slice(&buf);
                 }
                 if (flags & VIRTQ_DESC_F_NEXT) == 0 {
                     break;
@@ -493,36 +571,96 @@ impl VirtioNetDevice {
                 self.process_tx_frame(&packet)?;
             }
 
-            // Write used ring: used->ring[tx_used_idx % queue_size] = { id: head_idx, len: 0 }
+            // Used-ring entry: 8 bytes (head_idx as u32, 0 as u32).
+            // Built on the stack to avoid heap-alloc-per-frame from
+            // `[...].concat()`.  TX descriptors carry no return data
+            // so the length field is always 0.
             let used_ring_off = 4 + ((self.tx_used_idx as usize) % queue_size) * 8;
-            let used_elem = [
-                (head_idx as u32).to_le_bytes(),
-                0u32.to_le_bytes(), // len for TX typically 0
-            ]
-            .concat();
+            let mut used_elem = [0u8; 8];
+            used_elem[0..4].copy_from_slice(&(head_idx as u32).to_le_bytes());
+            // bytes [4..8] stay zero (the length field).
             mem.write(&used_elem, used_addr.unchecked_add(used_ring_off as u64))
                 .map_err(|e| crate::Error::Memory(e.to_string()))?;
 
             self.tx_used_idx = self.tx_used_idx.wrapping_add(1);
             self.tx_avail_idx = self.tx_avail_idx.wrapping_add(1);
+        }
 
-            // Update used.idx so guest sees progress
+        // Publish used.idx ONCE per batch instead of after every frame.
+        // virtio spec: the device updates the used-ring entries first,
+        // then bumps used.idx; the guest reads used.idx with a memory
+        // barrier and iterates new entries.  Per-frame writes are
+        // redundant for correctness and waste one mem.write per frame.
+        if self.tx_used_idx != initial_tx_used_idx {
             let used_idx_bytes = self.tx_used_idx.to_le_bytes();
             mem.write(&used_idx_bytes, used_addr.unchecked_add(2u64))
                 .map_err(|e| crate::Error::Memory(e.to_string()))?;
         }
 
-        self.interrupt_status |= 1;
+        self.interrupt_status.fetch_or(1, Ordering::Relaxed);
         Ok(())
     }
 
+    /// Drain frames pushed into [`Self::pending_rx`] by the net-poll
+    /// thread and write them into the guest's RX descriptors.
+    ///
+    /// Same descriptor-walking shape as [`Self::try_inject_rx`], but
+    /// the input frames come from the lock-free SegQueue instead of
+    /// going through the (locked) network backend.  The vCPU thread
+    /// calls this on every MMIO entry to virtio-net, materialising any
+    /// frames the net-poll thread queued since the last MMIO exit.
+    ///
+    /// Returns the number of frames written to the RX ring this call.
+    pub fn flush_pending_rx<M: GuestMemory + ?Sized>(&mut self, mem: &M) -> Result<usize> {
+        // Move the scratch out so we can mutate self while populating
+        // it.  The post-write `clear()` keeps capacity, so subsequent
+        // calls reuse the buffer instead of growing from cap=0.
+        let mut frames = std::mem::take(&mut self.flush_scratch);
+        frames.clear();
+        while let Some(frame) = self.pending_rx.pop() {
+            frames.push(frame);
+        }
+        let result = if !frames.is_empty() {
+            self.write_frames_to_rx_ring(&mut frames, mem)
+        } else {
+            Ok(0)
+        };
+        frames.clear();
+        self.flush_scratch = frames;
+        result
+    }
+
     /// Try to inject received frames from SLIRP into guest RX queue. Call from vCPU loop or after RX notify.
-    pub fn try_inject_rx<M: GuestMemory + ?Sized>(&mut self, mem: &M) -> Result<()> {
-        let frames = self.get_rx_frames();
+    ///
+    /// Returns the number of frames written into the guest's RX ring by
+    /// this call (0 if none).  Callers can use this to decide whether to
+    /// raise an IRQ — pulsing the line is only useful when the guest
+    /// has new work to do, not on every poll cycle while interrupt_status
+    /// is still set from an earlier (un-acked) injection.
+    pub fn try_inject_rx<M: GuestMemory + ?Sized>(&mut self, mem: &M) -> Result<usize> {
+        let mut frames = self.get_rx_frames();
         if frames.is_empty() {
-            return Ok(());
+            return Ok(0);
         }
+        let result = self.write_frames_to_rx_ring(&mut frames, mem);
+        // Stash drained Vec back as scratch so the next call reuses
+        // its capacity instead of allocating from cap=0.
+        frames.clear();
+        self.rx_scratch = frames;
+        result
+    }
 
+    /// Write a batch of fully-formed frames (already including the
+    /// virtio-net header) into the guest's RX descriptor ring.
+    ///
+    /// Shared between [`Self::try_inject_rx`] (frames pulled from the
+    /// network backend) and [`Self::flush_pending_rx`] (frames pushed
+    /// by the net-poll thread into the lock-free SegQueue).
+    fn write_frames_to_rx_ring<M: GuestMemory + ?Sized>(
+        &mut self,
+        frames: &mut Vec<Vec<u8>>,
+        mem: &M,
+    ) -> Result<usize> {
         let q = &self.rx_queue;
         if !q.ready || q.num == 0 {
             // Queue not ready - buffer frames for later
@@ -532,31 +670,32 @@ impl VirtioNetDevice {
                 q.num,
                 frames.len()
             );
-            self.rx_buffer.extend(frames);
-            return Ok(());
+            self.rx_buffer.append(frames);
+            return Ok(0);
         }
         let desc_addr = GuestAddress(q.desc_addr);
         let avail_addr = GuestAddress(q.driver_addr);
         let used_addr = GuestAddress(q.device_addr);
         let queue_size = q.num as usize;
 
-        for frame in frames {
-            // Read available ring: how many buffers has driver given us?
-            let mut idx_buf = [0u8; 2];
-            mem.read(&mut idx_buf, avail_addr.unchecked_add(2u64))
-                .map_err(|e| crate::Error::Memory(e.to_string()))?;
-            let avail_idx = u16::from_le_bytes(idx_buf);
+        // avail_idx is monotonically increasing; the driver bumps it
+        // whenever it adds new buffers.  Read it once per call (i.e. per
+        // batch) rather than per frame — saves one mem.read per frame in
+        // the hot path.  If the device runs out of available buffers
+        // mid-batch the remaining frames are buffered for the next
+        // call, which is the same correctness contract as before.
+        let mut idx_buf = [0u8; 2];
+        mem.read(&mut idx_buf, avail_addr.unchecked_add(2u64))
+            .map_err(|e| crate::Error::Memory(e.to_string()))?;
+        let avail_idx = u16::from_le_bytes(idx_buf);
+
+        let mut frames_injected: u16 = 0;
+
+        for frame in frames.drain(..) {
             if self.rx_avail_idx == avail_idx {
-                debug!("virtio-net: RX no available buffers (avail_idx={}, our_idx={}), buffering frame ({} bytes)",
-                    avail_idx, self.rx_avail_idx, frame.len());
                 self.rx_buffer.push(frame);
                 continue;
             }
-            debug!(
-                "virtio-net: RX injecting frame ({} bytes), avail_idx={}",
-                frame.len(),
-                avail_idx
-            );
 
             let ring_offset = 4 + ((self.rx_avail_idx as usize) % queue_size) * 2;
             let mut desc_id_buf = [0u8; 2];
@@ -599,32 +738,45 @@ impl VirtioNetDevice {
                 next = next_desc;
             }
 
+            // Used-ring entry is exactly 8 bytes (2x u32, little-endian).
+            // Build it on the stack instead of allocating a Vec via
+            // `[...].concat()` — the previous code did a heap alloc per
+            // frame in the hot path.
             let used_ring_off = 4 + ((self.rx_used_idx as usize) % queue_size) * 8;
-            let used_elem = [
-                (head_idx as u32).to_le_bytes(),
-                (written as u32).to_le_bytes(),
-            ]
-            .concat();
+            let mut used_elem = [0u8; 8];
+            used_elem[0..4].copy_from_slice(&(head_idx as u32).to_le_bytes());
+            used_elem[4..8].copy_from_slice(&(written as u32).to_le_bytes());
             mem.write(&used_elem, used_addr.unchecked_add(used_ring_off as u64))
                 .map_err(|e| crate::Error::Memory(e.to_string()))?;
 
             self.rx_used_idx = self.rx_used_idx.wrapping_add(1);
             self.rx_avail_idx = self.rx_avail_idx.wrapping_add(1);
+            frames_injected = frames_injected.wrapping_add(1);
+        }
 
+        // Publish the new used.idx ONCE at the end of the batch.  The
+        // virtio spec only requires the device to update used.idx after
+        // it has written all corresponding used-ring entries; the guest
+        // reads used.idx with a memory barrier and then iterates new
+        // entries.  Per-frame writes are redundant — saves one
+        // mem.write per frame on the hot path.
+        if frames_injected > 0 {
             let used_idx_bytes = self.rx_used_idx.to_le_bytes();
             mem.write(&used_idx_bytes, used_addr.unchecked_add(2u64))
                 .map_err(|e| crate::Error::Memory(e.to_string()))?;
         }
 
-        self.interrupt_status |= 1;
-        Ok(())
+        if frames_injected > 0 {
+            self.interrupt_status.fetch_or(1, Ordering::Relaxed);
+        }
+        Ok(frames_injected as usize)
     }
 
     /// Reset device to initial state
     fn reset(&mut self) {
         debug!("virtio-net: device reset");
         self.status = 0;
-        self.interrupt_status = 0;
+        self.interrupt_status.store(0, Ordering::Relaxed);
         self.driver_features = 0;
         self.tx_avail_idx = 0;
         self.tx_used_idx = 0;
@@ -691,7 +843,7 @@ impl VirtioNetDevice {
         self.rx_buffer.push(packet);
 
         // Set interrupt
-        self.interrupt_status |= 1;
+        self.interrupt_status.fetch_or(1, Ordering::Relaxed);
     }
 
     /// Capture device state for snapshot.
@@ -726,7 +878,7 @@ impl VirtioNetDevice {
             features_sel: self.features_sel,
             queue_sel: self.queue_sel,
             status: self.status,
-            interrupt_status: self.interrupt_status,
+            interrupt_status: self.interrupt_status.load(Ordering::Relaxed),
             config_generation: self.config_generation,
             mac: self.mac,
             queues,
@@ -740,7 +892,8 @@ impl VirtioNetDevice {
         self.features_sel = state.features_sel;
         self.queue_sel = state.queue_sel;
         self.status = state.status;
-        self.interrupt_status = state.interrupt_status;
+        self.interrupt_status
+            .store(state.interrupt_status, Ordering::Relaxed);
         self.config_generation = state.config_generation;
         self.mac = state.mac;
 
@@ -776,9 +929,13 @@ impl VirtioNetDevice {
         );
     }
 
-    /// Check if there are pending interrupts
+    /// Check if there are pending interrupts.
+    ///
+    /// Relaxed atomic load of `interrupt_status`.  The net-poll
+    /// thread's IRQ decision does not go through this method; it loads
+    /// the atomic it obtained from `interrupt_status_arc()` at startup.
     pub fn has_pending_interrupt(&self) -> bool {
-        self.interrupt_status != 0
+        self.interrupt_status.load(Ordering::Relaxed) != 0
     }
 
     /// Get the MAC address
diff --git a/src/network/slirp.rs b/src/network/slirp.rs
index 1e452880..7c7930c3 100644
--- a/src/network/slirp.rs
+++ b/src/network/slirp.rs
@@ -59,6 +59,12 @@ struct PendingDnsQuery {
 /// while keeping the implementation simple.
 const DNS_CACHE_TTL_SECS: u64 = 60;
 
+/// Initial capacity for the ready-event scratch buffers.  Sized to
+/// `EpollDispatch`'s typical per-wait batch so the buffers fit a
+/// busy-loop wakeup without reallocating; oversized batches grow
+/// once and stabilize.
+const EVENTS_PRESIZE: usize = 128;
+
 use ipnet::Ipv4Net;
 
 use smoltcp::iface::{Config, Interface, SocketSet};
@@ -689,6 +695,26 @@ pub struct SlirpBackend {
     /// keep the fallback so synthetic harnesses still observe
     /// readiness.
     has_external_poller: AtomicBool,
+    /// Per-call scratch buffer for the events `drain_to_guest`
+    /// processes.  Owned by `SlirpBackend` so its capacity persists
+    /// across calls — `mem::take`-into-local would discard the
+    /// allocation and force the next round to grow from cap=0,
+    /// which heaptrack measured as ~half of all per-CRR
+    /// allocations.
+    ready_scratch: Vec<EpollEvent>,
+    /// Per-call scratch for `relay_tcp_nat_data`'s deferred frame
+    /// pushes.  The relay can't push directly to `inject_to_guest`
+    /// while iterating `flow_table` (borrow conflict); reusing
+    /// this buffer keeps the per-cycle Vec from growing from cap=0.
+    relay_frames_scratch: Vec<Vec<u8>>,
+    /// Shared scratch for the per-cycle `Vec<FlowKey>` snapshots
+    /// that `relay_tcp_nat_data`, `relay_icmp_echo`, and
+    /// `relay_udp_flows` build to side-step `&mut self` /
+    /// `flow_table` borrow conflicts.  All three relays run
+    /// sequentially inside `drain_to_guest`, so one buffer
+    /// suffices — each callsite takes it, fills it, drains it,
+    /// and stashes it back via `clear()` (capacity preserved).
+    flow_keys_scratch: Vec<FlowKey>,
 }
 
 impl SlirpBackend {
@@ -793,9 +819,12 @@ impl SlirpBackend {
             accept_sender: accept_tx,
             epoll,
             epoll_waker,
-            pending_events: Mutex::new(Vec::new()),
+            pending_events: Mutex::new(Vec::with_capacity(EVENTS_PRESIZE)),
             pending_close: Vec::new(),
             has_external_poller: AtomicBool::new(false),
+            ready_scratch: Vec::with_capacity(EVENTS_PRESIZE),
+            relay_frames_scratch: Vec::new(),
+            flow_keys_scratch: Vec::new(),
         })
     }
 
@@ -1033,26 +1062,33 @@ impl SlirpBackend {
         //
         // Then, only if no net-poll thread has populated the queue
         // (unit tests / benches), fall back to a non-blocking poll on
-        // the epoll FD ourselves. `try_lock` keeps that fallback safe
-        // under contention.
-        let ready: Vec<EpollEvent> = {
-            let mut events: Vec<EpollEvent> = {
-                let mut queue = self.pending_events.lock().unwrap();
-                std::mem::take(&mut *queue)
-            };
-            // Fallback non-blocking poll only when no external poller
-            // (net_poll_thread) is feeding us events — otherwise we'd
-            // pay one mutex op + one epoll_wait syscall per call
-            // (~310 ns) for nothing. The flag is one-way: set by the
-            // first push_ready_events and stays set for the backend's
-            // lifetime.
-            if events.is_empty() && !self.has_external_poller.load(Ordering::Relaxed) {
-                let _ = self
-                    .epoll
-                    .wait_with_timeout(&mut events, std::time::Duration::ZERO);
-            }
-            events
-        };
+        // the epoll FD ourselves.
+        //
+        // The local `ready` Vec is taken from `self.ready_scratch`,
+        // populated by copying out of the locked queue (which is
+        // `clear()`-ed in place to keep its capacity), processed,
+        // then cleared and stashed back.  The previous `mem::take`
+        // pattern dropped the queue's allocation every cycle —
+        // heaptrack measured that as ~half of all per-CRR
+        // allocations on this hot path.
+        let mut ready: Vec<EpollEvent> = std::mem::take(&mut self.ready_scratch);
+        ready.clear();
+        {
+            let mut queue = self.pending_events.lock().unwrap();
+            ready.extend_from_slice(&queue);
+            queue.clear();
+        }
+        // Fallback non-blocking poll only when no external poller
+        // (net_poll_thread) is feeding us events — otherwise we'd
+        // pay one mutex op + one epoll_wait syscall per call
+        // (~310 ns) for nothing. The flag is one-way: set by the
+        // first push_ready_events and stays set for the backend's
+        // lifetime.
+        if ready.is_empty() && !self.has_external_poller.load(Ordering::Relaxed) {
+            let _ = self
+                .epoll
+                .wait_with_timeout(&mut ready, std::time::Duration::ZERO);
+        }
 
         // 0a. Accept any newly-ready listener connections (may push into
         //     accept_sender for the next step).
@@ -1091,6 +1127,12 @@ impl SlirpBackend {
             out.append(&mut q.tx_queue);
         }
         out.append(&mut self.inject_to_guest);
+
+        // Stash the local `ready` Vec back as scratch.  `clear()`
+        // preserves capacity, so the next `drain_to_guest` reuses
+        // the buffer instead of allocating from cap=0.
+        ready.clear();
+        self.ready_scratch = ready;
     }
 
     /// Poll the stack and return ethernet frames to send to the guest.
@@ -2321,8 +2363,13 @@ impl SlirpBackend {
     /// only the flow table entries directly, avoiding a separate Vec allocation.
     /// Data relay is restricted to flows with an EPOLLIN event in `ready`.
     fn relay_tcp_nat_data(&mut self, ready: &[EpollEvent]) {
-        // Collect frames to inject (built separately to avoid borrow issues)
-        let mut frames_to_inject: Vec<Vec<u8>> = Vec::new();
+        // Collect frames to inject in the SlirpBackend-owned scratch
+        // so the buffer's capacity carries across calls.  Pushes
+        // can't go straight to `inject_to_guest` because we're
+        // about to iterate `flow_table` and `inject_to_guest` is
+        // also `&mut self`.
+        let mut frames_to_inject = std::mem::take(&mut self.relay_frames_scratch);
+        frames_to_inject.clear();
 
         // Seed removal set from flows already marked Closed by handle_tcp_frame
         // (FIN/RST path) via the pending_close queue. HashSet gives O(1)
@@ -2382,7 +2429,8 @@ impl SlirpBackend {
             }
         }
 
-        let mut tcp_flow_keys: Vec<FlowKey> = Vec::new();
+        let mut tcp_flow_keys = std::mem::take(&mut self.flow_keys_scratch);
+        tcp_flow_keys.clear();
         for event in ready {
             if !event.readable || event.token & PROTO_TAG_MASK != PROTO_TAG_TCP {
                 continue;
@@ -2396,7 +2444,7 @@ impl SlirpBackend {
             tcp_flow_keys.push(flow_key);
         }
 
-        for flow_key in tcp_flow_keys {
+        for flow_key in tcp_flow_keys.drain(..) {
             let FlowKey::Tcp(key) = flow_key else {
                 continue;
             };
@@ -2607,6 +2655,12 @@ impl SlirpBackend {
             self.flow_table.remove(&flow_key);
         }
         self.inject_to_guest.append(&mut frames_to_inject);
+        // Both `append` calls drained `frames_to_inject` but
+        // preserved its capacity; restore the buffer to the
+        // backend so the next cycle reuses it.  The flow-key
+        // buffer was already drained by the iteration above.
+        self.relay_frames_scratch = frames_to_inject;
+        self.flow_keys_scratch = tcp_flow_keys;
     }
 
     /// Drain replies from each active ICMP echo socket and emit echo-reply
@@ -2619,7 +2673,8 @@ impl SlirpBackend {
         const ICMP_IDLE_TIMEOUT: Duration = Duration::from_secs(60);
         let now = Instant::now();
 
-        let mut ready_flow_keys: Vec<FlowKey> = Vec::new();
+        let mut ready_flow_keys = std::mem::take(&mut self.flow_keys_scratch);
+        ready_flow_keys.clear();
         for event in ready {
             if !event.readable || event.token & PROTO_TAG_MASK != PROTO_TAG_ICMP {
                 continue;
@@ -2681,6 +2736,8 @@ impl SlirpBackend {
             }
             self.flow_table.remove(&flow_key);
         }
+        ready_flow_keys.clear();
+        self.flow_keys_scratch = ready_flow_keys;
     }
 
     /// Build an Ethernet/IPv4/ICMP echo-reply frame addressed to the guest.
@@ -2751,18 +2808,20 @@ impl SlirpBackend {
     fn relay_udp_flows(&mut self, ready: &[EpollEvent]) {
         let now = Instant::now();
         // Per-flow connected sockets are closed by Drop when the entry leaves
-        // flow_table.
-        let mut stale: Vec<FlowKey> = Vec::new();
+        // flow_table.  The two flow-key Vecs here share `flow_keys_scratch`:
+        // the stale-sweep drains it, then the readiness loop refills it.
+        let mut flow_keys = std::mem::take(&mut self.flow_keys_scratch);
+        flow_keys.clear();
         for (flow_key, entry) in &self.flow_table {
             let FlowKey::Udp(_) = flow_key else { continue };
             let FlowEntry::Udp(udp_entry) = entry else {
                 continue;
             };
             if now.duration_since(udp_entry.last_activity) > UDP_IDLE_TIMEOUT {
-                stale.push(*flow_key);
+                flow_keys.push(*flow_key);
             }
         }
-        for flow_key in stale {
+        for flow_key in flow_keys.drain(..) {
             if let Some(FlowEntry::Udp(entry)) = self.flow_table.get(&flow_key) {
                 self.token_to_key.remove(&entry.flow_token);
                 self.epoll.unregister(entry.sock.as_raw_fd()).ok();
@@ -2770,7 +2829,6 @@ impl SlirpBackend {
             self.flow_table.remove(&flow_key);
         }
 
-        let mut flow_keys: Vec<FlowKey> = Vec::new();
         for event in ready {
             if !event.readable || event.token & PROTO_TAG_MASK != PROTO_TAG_UDP {
                 continue;
@@ -2780,7 +2838,7 @@ impl SlirpBackend {
             };
             flow_keys.push(flow_key);
         }
-        for flow_key in flow_keys {
+        for flow_key in flow_keys.drain(..) {
             let FlowKey::Udp(key) = flow_key else {
                 continue;
             };
@@ -2807,6 +2865,7 @@ impl SlirpBackend {
                 self.inject_to_guest.push(frame_bytes);
             }
         }
+        self.flow_keys_scratch = flow_keys;
     }
 
     /// Build an Ethernet/IPv4/UDP frame addressed to the guest, carrying a
diff --git a/src/sandbox/local.rs b/src/sandbox/local.rs
index 69f9f240..a7b82bfe 100644
--- a/src/sandbox/local.rs
+++ b/src/sandbox/local.rs
@@ -91,8 +91,14 @@ impl LocalSandbox {
                 session_secret: SessionSecret::new(session_secret_bytes),
                 command_allowlist: Vec::new(), // Set via provisioning
                 network_deny_list: default_network_deny_list(),
-                max_connections_per_second: DEFAULT_MAX_CONNECTIONS_PER_SECOND,
-                max_concurrent_connections: DEFAULT_MAX_CONCURRENT_CONNECTIONS,
+                max_connections_per_second: self
+                    .config
+                    .network_max_connections_per_second
+                    .unwrap_or(DEFAULT_MAX_CONNECTIONS_PER_SECOND),
+                max_concurrent_connections: self
+                    .config
+                    .network_max_concurrent_connections
+                    .unwrap_or(DEFAULT_MAX_CONCURRENT_CONNECTIONS),
                 seccomp: true,
             },
             snapshot: self.config.snapshot.clone(),
diff --git a/src/sandbox/mod.rs b/src/sandbox/mod.rs
index b2c820c0..9066e478 100644
--- a/src/sandbox/mod.rs
+++ b/src/sandbox/mod.rs
@@ -86,6 +86,15 @@ pub struct SandboxConfig {
     /// validate save/restore support at cold boot instead of deferring a
     /// cryptic failure to save time.
     pub enable_snapshots: bool,
+    /// Optional override for the network backend's
+    /// `max_connections_per_second` rate limit.  `None` keeps the
+    /// production default (50/s); benches that intentionally exceed
+    /// the anti-DoS limit raise it explicitly.
+    pub network_max_connections_per_second: Option<u32>,
+    /// Optional override for the network backend's
+    /// `max_concurrent_connections` ceiling.  `None` keeps the
+    /// production default (64).
+    pub network_max_concurrent_connections: Option<usize>,
 }
 
 impl Default for SandboxConfig {
@@ -108,6 +117,8 @@ impl Default for SandboxConfig {
             env: Vec::new(),
             snapshot: None,
             enable_snapshots: false,
+            network_max_connections_per_second: None,
+            network_max_concurrent_connections: None,
         }
     }
 }
@@ -815,6 +826,41 @@ impl SandboxBuilder {
         self
     }
 
+    /// Overrides the SLIRP backend's per-second new-connection rate
+    /// limit.  The production default (50/s) protects the host from
+    /// guest-side connection floods; benches that intentionally
+    /// exceed it call this to disable the limit.
+    ///
+    /// # Examples
+    ///
+    /// ```no_run
+    /// use void_box::sandbox::Sandbox;
+    /// let _ = Sandbox::local()
+    ///     .network(true)
+    ///     .network_max_connections_per_second(u32::MAX);
+    /// ```
+    pub fn network_max_connections_per_second(mut self, rate: u32) -> Self {
+        self.config.network_max_connections_per_second = Some(rate);
+        self
+    }
+
+    /// Overrides the SLIRP backend's concurrent-connection ceiling.
+    /// Production default is 64; raise for sustained-throughput
+    /// benches.
+    ///
+    /// # Examples
+    ///
+    /// ```no_run
+    /// use void_box::sandbox::Sandbox;
+    /// let _ = Sandbox::local()
+    ///     .network(true)
+    ///     .network_max_concurrent_connections(1024);
+    /// ```
+    pub fn network_max_concurrent_connections(mut self, count: usize) -> Self {
+        self.config.network_max_concurrent_connections = Some(count);
+        self
+    }
+
     /// Set the kernel path
     pub fn kernel(mut self, path: impl Into<PathBuf>) -> Self {
         self.config.kernel = Some(path.into());
diff --git a/src/vmm/cpu.rs b/src/vmm/cpu.rs
index 61008a0a..41f86920 100644
--- a/src/vmm/cpu.rs
+++ b/src/vmm/cpu.rs
@@ -249,8 +249,14 @@ fn vcpu_run_loop(
                     }
                     VcpuExit::MmioRead(addr, data) => {
                         let handled = if let Some(ref dev) = mmio_devices.virtio_net {
-                            let guard = dev.lock().unwrap();
+                            let mut guard = dev.lock().unwrap();
                             if guard.handles_mmio(addr) {
+                                // Materialise any frames the net-poll thread
+                                // pushed into pending_rx since our last MMIO
+                                // entry — writes them into the guest's RX
+                                // descriptors in our context, no cross-thread
+                                // lock contention.
+                                let _ = guard.flush_pending_rx(guest_memory);
                                 let offset = addr - guard.mmio_base();
                                 guard.mmio_read(offset, data);
                                 true
@@ -305,6 +311,11 @@ fn vcpu_run_loop(
                         let handled = if let Some(ref dev) = mmio_devices.virtio_net {
                             let mut guard = dev.lock().unwrap();
                             if guard.handles_mmio(addr) {
+                                // Same pre-flush as the MMIO-read path: the
+                                // guest may write INTERRUPT_ACK or another
+                                // register before reading INTERRUPT_STATUS,
+                                // so we materialise pending frames here too.
+                                let _ = guard.flush_pending_rx(guest_memory);
                                 let offset = addr - guard.mmio_base();
                                 guard.mmio_write(offset, data, Some(guest_memory));
                                 true
diff --git a/src/vmm/mod.rs b/src/vmm/mod.rs
index 97fe2d0f..e1a485e1 100644
--- a/src/vmm/mod.rs
+++ b/src/vmm/mod.rs
@@ -1607,6 +1607,77 @@ fn vsock_irq_thread(
 ///
 /// When the network backend does not provide an epoll instance
 /// (non-SlirpBackend), the thread falls back to a fixed 5 ms sleep.
+// NOTE(review): the `///` lines just above this insertion look like the
+// tail of `net_poll_thread`'s docs; placed here they attach to this
+// helper instead.  Confirm, and move the helper above that doc block.
+///
+/// Registers a host eventfd with KVM via `KVM_IOEVENTFD` for the
+/// virtio-net TX-queue notify MMIO and adds it to the supplied
+/// [`EpollDispatch`] under `token` so the net-poll thread can drain
+/// it.  `mmio_addr` is the register address handed to KVM and
+/// `queue_idx` the value KVM matches guest writes against.
+///
+/// Both pieces (epoll registration and `KVM_IOEVENTFD`
+/// registration) must succeed together: if KVM consumes the guest's
+/// TX MMIO writes in-kernel but no userspace path drains the
+/// eventfd, guest TX hangs silently.  This helper rolls back the
+/// epoll registration if the `KVM_IOEVENTFD` half fails.
+///
+/// Returns the eventfd once both registrations succeed.  Returns
+/// `None` when the dispatcher is absent or when eventfd creation,
+/// epoll registration, or `KVM_IOEVENTFD` registration fails; each
+/// case logs at `debug!` and callers fall back to the MMIO-exit TX path.
+fn setup_tx_notify_ioeventfd(
+    vm: &Vm,
+    epoll_arc: Option<&Arc<crate::network::epoll_dispatch::EpollDispatch>>,
+    mmio_addr: u64,
+    queue_idx: u32,
+    token: u64,
+) -> Option<vmm_sys_util::eventfd::EventFd> {
+    let Some(ep_arc) = epoll_arc else {
+        debug!(
+            "net-poll: no epoll dispatcher; falling back to MMIO-exit TX path (KVM_IOEVENTFD requires an async drain)"
+        );
+        return None;
+    };
+    let fd = match vmm_sys_util::eventfd::EventFd::new(libc::EFD_NONBLOCK) {
+        Ok(fd) => fd,
+        Err(e) => {
+            debug!(
+                "net-poll: eventfd create for tx-notify failed; falling back to MMIO-exit TX path: {}",
+                e
+            );
+            return None;
+        }
+    };
+    if let Err(e) = ep_arc.register(
+        fd.as_raw_fd(),
+        token,
+        crate::network::epoll_dispatch::RegisterMode::Read,
+    ) {
+        debug!(
+            "net-poll: failed to register tx-notify eventfd with epoll dispatch ({e}); falling back to MMIO-exit TX path"
+        );
+        return None;
+    }
+    let kvm_addr = kvm_ioctls::IoEventAddress::Mmio(mmio_addr);
+    if let Err(e) = vm.vm_fd().register_ioevent(&fd, &kvm_addr, queue_idx) {
+        // KVM didn't take the ioevent.  Roll the epoll registration
+        // back so the eventfd doesn't stay armed without a service
+        // path on it.
+        let _ = ep_arc.unregister(fd.as_raw_fd());
+        debug!(
+            "net-poll: KVM_IOEVENTFD register failed ({e}); TX notifies will continue to take MMIO exits"
+        );
+        return None;
+    }
+    debug!(
+        "net-poll: KVM_IOEVENTFD active for TX notify @ MMIO {:#x} queue_idx={queue_idx}",
+        mmio_addr,
+    );
+    Some(fd)
+}
+
 fn net_poll_thread(net_dev: Arc<Mutex<VirtioNetDevice>>, vm: Arc<Vm>, running: Arc<AtomicBool>) {
     #[repr(C)]
     struct KvmIrqLevel {
@@ -1649,6 +1720,98 @@ fn net_poll_thread(net_dev: Arc<Mutex<VirtioNetDevice>>, vm: Arc<Vm>, running: A
 
     let mut epoll_events: Vec<crate::network::epoll_dispatch::EpollEvent> = Vec::new();
 
+    // Tracks whether the device's interrupt_status was non-zero on the
+    // previous cycle.  Used to decide whether to pulse the IRQ line:
+    // we pulse only on transitions clear→pending (or when new RX frames
+    // are injected this cycle), not on every cycle where pending is
+    // still set from an un-acked earlier pulse.
+    let mut prev_pending: bool = false;
+
+    // KVM_IRQFD: register an eventfd that asserts IRQ 10 when written.
+    // Writing 8 bytes to the eventfd is one syscall; the kernel signals
+    // the in-kernel irqchip directly.  This replaces the pair of
+    // KVM_IRQ_LINE ioctls (assert level=1 / deassert level=0) with a
+    // single write.  If setup fails (kernel without irqfd, broken irqchip
+    // routing) we fall back to the ioctl path below.
+    let irq_eventfd: Option<vmm_sys_util::eventfd::EventFd> =
+        match vmm_sys_util::eventfd::EventFd::new(libc::EFD_NONBLOCK) {
+            Ok(fd) => match vm.vm_fd().register_irqfd(&fd, 10) {
+                Ok(()) => Some(fd),
+                Err(e) => {
+                    debug!(
+                        "net-poll: KVM_IRQFD register failed; falling back to KVM_IRQ_LINE: {}",
+                        e
+                    );
+                    None
+                }
+            },
+            Err(e) => {
+                debug!(
+                    "net-poll: eventfd create failed; falling back to KVM_IRQ_LINE: {}",
+                    e
+                );
+                None
+            }
+        };
+
+    // KVM_IOEVENTFD for the virtio-net TX queue notify.
+    //
+    // Without this, every guest TX (write to QUEUE_NOTIFY MMIO with value=1)
+    // forces a KVM_RUN exit, the vCPU thread dispatches into virtio-net's
+    // MMIO write handler, then calls process_tx_queue and re-enters KVM_RUN.
+    // ~1–5 µs per packet of pure VM-exit overhead.
+    //
+    // With KVM_IOEVENTFD: the guest's MMIO write is consumed in-kernel,
+    // KVM signals the eventfd, and the vCPU thread continues running.
+    // The net-poll thread sees the eventfd as another epoll source, drains
+    // it, and calls process_tx_queue asynchronously.  No vCPU exit.
+    //
+    // Address: virtio-net mmio_base (0xd000_0000) + QUEUE_NOTIFY offset
+    // (0x050) = 0xd000_0050.  Datamatch=1 triggers only on TX queue
+    // notifies (value=1 → queue index 1 = transmit queue).  Notifies for
+    // queue 0 (RX) still take the slow path through MMIO; they're rare
+    // (only when guest adds new RX buffers) so the optimisation isn't
+    // needed there.
+    const VIRTIO_NET_MMIO_BASE: u64 = 0xd000_0000;
+    const VIRTIO_NET_QUEUE_NOTIFY_OFFSET: u64 = 0x050;
+    const TX_NOTIFY_QUEUE_IDX: u32 = 1;
+    // Token used to identify the TX-notify eventfd in epoll readiness
+    // events.  Lives in a tag space that doesn't collide with the
+    // PROTO_TAG_* values SlirpBackend uses for flow tokens.
+    const TX_NOTIFY_TOKEN: u64 = 0x4000_0000_0000_0000;
+
+    let tx_notify_eventfd = setup_tx_notify_ioeventfd(
+        vm.as_ref(),
+        epoll_arc.as_ref(),
+        VIRTIO_NET_MMIO_BASE + VIRTIO_NET_QUEUE_NOTIFY_OFFSET,
+        TX_NOTIFY_QUEUE_IDX,
+        TX_NOTIFY_TOKEN,
+    );
+
+    // Lock-free hand-off queue + direct backend Arc, pulled out of the
+    // device once at thread startup so the per-cycle hot path doesn't
+    // need to acquire the VirtioNetDevice mutex just to read backend
+    // frames.  The vCPU thread drains `pending_rx` on each MMIO entry
+    // (see vmm/cpu.rs), so this thread only needs to push frames.
+    type PendingRxArc = std::sync::Arc<crossbeam_queue::SegQueue<Vec<u8>>>;
+    type BackendArc = std::sync::Arc<Mutex<dyn crate::network::NetworkBackend>>;
+    type InterruptStatusArc = std::sync::Arc<std::sync::atomic::AtomicU32>;
+    let (pending_rx_arc, slirp_arc, interrupt_status_arc): (
+        Option<PendingRxArc>,
+        Option<BackendArc>,
+        Option<InterruptStatusArc>,
+    ) = match net_dev.lock() {
+        Ok(g) => (
+            Some(g.pending_rx()),
+            Some(g.slirp_arc()),
+            Some(g.interrupt_status_arc()),
+        ),
+        Err(_) => (None, None, None),
+    };
+
+    // Reusable buffer for frames pulled from the backend each cycle.
+    let mut rx_scratch: Vec<Vec<u8>> = Vec::new();
+
     while running.load(Ordering::Relaxed) {
         // Block outside the device lock: either on epoll readiness or a short
         // sleep.  This lets the vCPU thread acquire the device lock without
@@ -1681,39 +1844,115 @@ fn net_poll_thread(net_dev: Arc<Mutex<VirtioNetDevice>>, vm: Arc<Vm>, running: A
             IDLE_TIMEOUT
         };
 
-        // Push ready events into the backend's queue before acquiring the
-        // device lock for inject/IRQ work. drain_to_guest will consume them
-        // without re-locking EpollDispatch, eliminating mutex contention
-        // between the net-poll thread's 50 ms blocking wait and the vCPU
-        // thread's process_guest_frame → drain_to_guest path.
+        // Filter out the TX-notify eventfd event (if any) before pushing
+        // the rest to the SLIRP backend.  When the guest writes to the
+        // virtio-net QUEUE_NOTIFY MMIO with value=1, KVM consumes it
+        // in-kernel and signals our eventfd; we drain it here and call
+        // process_tx_queue ourselves — the vCPU thread never exits for
+        // that MMIO write.
+        let mut tx_notify_fired = false;
+        if tx_notify_eventfd.is_some() {
+            epoll_events.retain(|e| {
+                if e.token == TX_NOTIFY_TOKEN {
+                    tx_notify_fired = true;
+                    false
+                } else {
+                    true
+                }
+            });
+        }
+        if tx_notify_fired {
+            if let Some(ref efd) = tx_notify_eventfd {
+                let _ = efd.read();
+            }
+            if let Ok(mut guard) = net_dev.lock() {
+                let _ = guard.process_tx_queue_external(guest_memory);
+            }
+        }
+
+        // Push remaining (flow) events into the backend's queue before
+        // acquiring the device lock for inject/IRQ work.  drain_to_guest
+        // will consume them without re-locking EpollDispatch, eliminating
+        // mutex contention between the net-poll thread's blocking wait and
+        // the vCPU thread's process_guest_frame → drain_to_guest path.
         if !epoll_events.is_empty() {
             if let Ok(guard) = net_dev.lock() {
                 guard.push_events_to_backend(&epoll_events);
             }
         }
 
-        let has_interrupt = {
-            let mut guard = match net_dev.lock() {
-                Ok(g) => g,
-                Err(_) => continue,
-            };
-            let _ = guard.try_inject_rx(guest_memory);
-            guard.has_pending_interrupt()
-        };
-
-        // Always pulse IRQ10 while pending; this prevents RX stalls if
-        // an earlier edge was missed by the guest.
-        if has_interrupt {
-            let assert_irq = KvmIrqLevel { irq: 10, level: 1 };
-            // SAFETY: KVM_IRQ_LINE ioctl writes the KvmIrqLevel struct into
-            // the in-kernel APIC; the struct is #[repr(C)] and the fd is valid
-            // for the lifetime of `vm`.
-            unsafe {
-                libc::ioctl(vm_fd, KVM_IRQ_LINE as _, &assert_irq);
+        // Drain backend frames into the pending_rx SegQueue WITHOUT
+        // touching the VirtioNetDevice mutex.  The vCPU thread will
+        // materialise them into RX descriptors on its next MMIO entry
+        // via VirtioNetDevice::flush_pending_rx (see vmm/cpu.rs).
+        //
+        // This breaks the old contention pattern where the net-poll
+        // thread held the VirtioNetDevice lock for the duration of
+        // try_inject_rx (descriptor walk + memory writes), forcing the
+        // vCPU thread to wait on every MMIO exit that overlapped with
+        // a poll cycle.
+        let frames_pushed: usize = match (&pending_rx_arc, &slirp_arc) {
+            (Some(pending_rx), Some(slirp)) => {
+                rx_scratch.clear();
+                if let Ok(mut backend) = slirp.lock() {
+                    backend.drain_to_guest(&mut rx_scratch);
+                }
+                let n = rx_scratch.len();
+                for frame in rx_scratch.drain(..) {
+                    let mut packet = Vec::with_capacity(
+                        crate::devices::virtio_net::VirtioNetHeader::SIZE + frame.len(),
+                    );
+                    packet.extend_from_slice(
+                        &crate::devices::virtio_net::VirtioNetHeader::new().to_bytes(),
+                    );
+                    packet.extend_from_slice(&frame);
+                    pending_rx.push(packet);
+                }
+                n
             }
-            let deassert_irq = KvmIrqLevel { irq: 10, level: 0 };
-            unsafe {
-                libc::ioctl(vm_fd, KVM_IRQ_LINE as _, &deassert_irq);
+            _ => 0,
+        };
+        // Lock-free check: read interrupt_status via the AtomicU32 we
+        // cached at thread startup.  Avoids one device-mutex acquisition
+        // per cycle on idle paths (the hot RX path skips this branch
+        // because frames_pushed > 0 already implies interrupt_status
+        // is about to be set when the vCPU drains pending_rx).
+        let has_interrupt = frames_pushed > 0
+            || match interrupt_status_arc {
+                Some(ref isr) => isr.load(std::sync::atomic::Ordering::Relaxed) != 0,
+                None => false,
+            };
+        let frames_injected = frames_pushed;
+
+        // Pulse IRQ10 only when there is *new* work for the guest:
+        //   - frames just injected this cycle, OR
+        //   - interrupt_status went from clear → pending (TX completion
+        //     by the vCPU thread between cycles).
+        // Skipping pulses when the guest hasn't acknowledged a previous
+        // pulse saves two ioctl(KVM_IRQ_LINE) calls per cycle (~5–10 µs
+        // on the CRR hot path).  If we pulse once and the guest's
+        // ISR services the queue, has_pending_interrupt will be false
+        // on the next cycle and `prev_pending` resets.
+        let now_pending = has_interrupt;
+        let pulse = frames_injected > 0 || (now_pending && !prev_pending);
+        prev_pending = now_pending;
+        if pulse {
+            if let Some(ref efd) = irq_eventfd {
+                // Fast path: KVM_IRQFD.  One 8-byte write to the eventfd;
+                // the kernel asserts IRQ 10 directly.  No ioctl pair.
+                let _ = efd.write(1);
+            } else {
+                let assert_irq = KvmIrqLevel { irq: 10, level: 1 };
+                // SAFETY: KVM_IRQ_LINE ioctl writes the KvmIrqLevel struct into
+                // the in-kernel APIC; the struct is #[repr(C)] and the fd is valid
+                // for the lifetime of `vm`.
+                unsafe {
+                    libc::ioctl(vm_fd, KVM_IRQ_LINE as _, &assert_irq);
+                }
+                let deassert_irq = KvmIrqLevel { irq: 10, level: 0 };
+                unsafe {
+                    libc::ioctl(vm_fd, KVM_IRQ_LINE as _, &deassert_irq);
+                }
             }
         }
     }
diff --git a/tools/perf-harness/bench-compare-pasta.py b/tools/perf-harness/bench-compare-pasta.py
new file mode 100755
index 00000000..ac6af588
--- /dev/null
+++ b/tools/perf-harness/bench-compare-pasta.py
@@ -0,0 +1,127 @@
+#!/usr/bin/env python3
+# bench-compare-pasta.py — produce a markdown side-by-side comparing
+# voidbox-network-bench output against bench-pasta.py output.
+#
+# Both inputs are JSON files with the same field names (the shared
+# voidbox-network-bench Report shape).  Either argument can be the
+# voidbox or pasta side; the script auto-detects via the `backend`
+# field if present, otherwise positional.
+
+from __future__ import annotations
+
+import argparse
+import json
+import sys
+from typing import Any
+
+
+METRICS = [
+    ("tcp_throughput_g2h_mbps", "TCP throughput g2h", "Mbps", False),
+    ("tcp_bulk_throughput_g2h_mbps", "TCP bulk g2h (constrained)", "Mbps", False),
+    ("tcp_rr_latency_us_p50", "TCP RR latency p50", "µs", True),
+    ("tcp_rr_latency_us_p99", "TCP RR latency p99", "µs", True),
+    ("tcp_crr_latency_us_p50", "TCP CRR latency p50", "µs", True),
+    ("udp_dns_qps", "UDP DNS qps", "qps", False),
+    ("icmp_rr_latency_us_p50", "ICMP RR p50", "µs", True),
+    ("tcp_rx_latency_us_p50", "TCP RX latency p50", "µs", True),
+]
+
+
+def fmt(value: Any, latency: bool) -> str:
+    """Render one table cell: "n/a" for None, a formatted number, or str() of anything else."""
+    if value is None:
+        return "n/a"
+    if not isinstance(value, (int, float)):
+        return str(value)
+    big = value >= 1000
+    if latency:
+        # Latency metrics arrive in µs (per the *_us_* keys); show ms from 1000 µs upward.
+        return f"{value / 1000:.2f} ms" if big else f"{value:.1f} µs"
+    # Everything else: whole numbers from 1000 upward, two decimals below.
+    return f"{value:.0f}" if big else f"{value:.2f}"
+
+
+def fmt_delta(voidbox: Any, pasta: Any, latency: bool) -> str:
+    """Describe voidbox relative to pasta ("voidbox N× faster/slower"); "—" when not comparable.
+
+    `latency` marks lower-is-better metrics.  A missing or zero value on either side has
+    no usable ratio (a zero voidbox reading used to raise ZeroDivisionError on `1 / ratio`).
+    """
+    if voidbox is None or pasta is None or voidbox == 0 or pasta == 0:
+        return "—"
+    ratio = voidbox / pasta
+    # For latency a ratio of 1 or more means voidbox is worse; for throughput it means better.
+    worse = (ratio >= 1) == bool(latency)
+    factor = ratio if ratio >= 1 else 1 / ratio
+    return f"voidbox {factor:.1f}× slower" if worse else f"voidbox {factor:.2f}× faster"
+
+
+def load(path: str) -> dict[str, Any]:  # parse one benchmark report file
+    with open(path, encoding="utf-8") as f:
+        return json.load(f)  # no shape check here; main() goes on to call .get() on the result
+
+
+def detect_role(data: dict[str, Any], default: str) -> str:
+    """Return the report's own `backend` tag when it is "pasta"/"voidbox", else `default`."""
+    tag = data.get("backend")
+    # Any other value (missing key, unknown backend name) falls back to the caller's guess.
+    return tag if tag in ("pasta", "voidbox") else default
+
+
+def main() -> int:
+    """CLI entry point: load both reports, print or write the markdown comparison, return 0."""
+    p = argparse.ArgumentParser(description="voidbox vs pasta head-to-head comparison")
+    p.add_argument("voidbox_json", help="path to voidbox-network-bench JSON output")
+    p.add_argument("pasta_json", help="path to bench-pasta.py JSON output")
+    p.add_argument("--output", help="write markdown to file instead of stdout")
+    args = p.parse_args()
+    voidbox = load(args.voidbox_json)
+    pasta = load(args.pasta_json)
+
+    # Auto-detect from either file's `backend` tag, so the positional order is only a fallback.
+    swapped = detect_role(pasta, "pasta") == "voidbox" and voidbox.get("backend") != "voidbox"
+    if swapped or detect_role(voidbox, "voidbox") == "pasta":
+        voidbox, pasta = pasta, voidbox
+
+    lines: list[str] = []
+    lines.append("# voidbox vs pasta head-to-head\n")
+    lines.append("Methodology per `docs/superpowers/plans/2026-04-27-smoltcp-passt-port.md` §")
+    lines.append("\"passt head-to-head methodology\": same host, same workload (`nc`-based g2h /")
+    lines.append("RR / CRR), same metric names. **CRR latency is the most apples-to-apples**")
+    lines.append("metric — dominated by NAT-table operations on both sides. Throughput numbers")
+    lines.append("are not directly comparable: voidbox runs in a real KVM VM (virtio-mmio exit")
+    lines.append("overhead); pasta runs in a network namespace (no VM).\n")
+    lines.append("| Metric | voidbox (KVM + SLIRP) | pasta (netns) | Δ |")
+    lines.append("|---|---:|---:|---|")
+
+    for key, label, _unit, latency in METRICS:
+        v = voidbox.get(key)
+        pa = pasta.get(key)
+        if v is None and pa is None:
+            continue
+        lines.append(f"| {label} | {fmt(v, latency)} | {fmt(pa, latency)} | {fmt_delta(v, pa, latency)} |")
+
+    lines.append("")
+    pasta_version = pasta.get("pasta_version")
+    if pasta_version:
+        lines.append(f"_pasta version: `{pasta_version}`_")
+    lines.append("")
+    notes = pasta.get("notes")
+    if isinstance(notes, list) and notes:
+        lines.append("**Notes from pasta side:**")
+        for note in notes:
+            lines.append(f"- {note}")
+        lines.append("")
+
+    md = "\n".join(lines)
+    if args.output:
+        with open(args.output, "w", encoding="utf-8") as f:
+            f.write(md)
+        print(f"Report written to {args.output}", file=sys.stderr)
+    else:
+        print(md)
+    return 0
+
+
+if __name__ == "__main__":
+    sys.exit(main())
diff --git a/tools/perf-harness/bench-pasta.py b/tools/perf-harness/bench-pasta.py
new file mode 100755
index 00000000..264e808d
--- /dev/null
+++ b/tools/perf-harness/bench-pasta.py
@@ -0,0 +1,389 @@
+#!/usr/bin/env python3
+# bench-pasta.py — passt/pasta side of the head-to-head comparison.
+#
+# Drives the same workload shape as `voidbox-network-bench`:
+#   - tcp_throughput_g2h_mbps     (sustained guest→host throughput)
+#   - tcp_rr_latency_us_p50/p99   (persistent-connection round-trip)
+#   - tcp_crr_latency_us_p50      (connect-request-response latency)
+#
+# The "guest" is a process running inside a pasta-managed network
+# namespace.  Pasta forwards the host's gateway address into the netns
+# as a translation for the host's loopback (its --map-host-loopback
+# default), so connecting to the host gateway IP from inside the netns
+# reaches the host's 127.0.0.1.  This mirrors voidbox's SLIRP
+# convention (10.0.2.2 → 127.0.0.1) closely enough for the metric
+# comparison to be apples-to-apples on the NAT path.
+#
+# Methodology aligns with docs/superpowers/plans/2026-04-27-smoltcp-passt-port.md
+# § "passt head-to-head methodology": same host, same workload, same
+# metric names, focus on CRR latency (dominated by NAT-table ops, not
+# MMIO exit overhead).
+
+from __future__ import annotations
+
+import argparse
+import json
+import os
+import socket
+import statistics
+import subprocess
+import sys
+import threading
+import time
+from dataclasses import asdict, dataclass, field
+from typing import Optional
+
+
+@dataclass
+class Report:  # serialised with asdict() as this script's JSON output
+    tcp_bulk_throughput_g2h_mbps: Optional[float] = None  # not measured by this script
+    tcp_throughput_g2h_mbps: Optional[float] = None  # mean Mbps from measure_g2h_throughput()
+    tcp_throughput_h2g_mbps: Optional[float] = None  # not measured by this script
+    tcp_rr_latency_us_p50: Optional[float] = None  # µs, from measure_rr_latency()
+    tcp_rr_latency_us_p99: Optional[float] = None  # µs, from measure_rr_latency()
+    tcp_crr_latency_us_p50: Optional[float] = None  # µs, from measure_crr_latency()
+    udp_dns_qps: Optional[float] = None  # not measured by this script
+    icmp_rr_latency_us_p50: Optional[float] = None  # not measured by this script
+    tcp_rx_latency_us_p50: Optional[float] = None  # not measured by this script
+    backend: str = "pasta"  # role tag that bench-compare-pasta.py's detect_role() looks for
+    pasta_version: Optional[str] = None  # first line of `pasta --version`
+    notes: list[str] = field(default_factory=list)  # free-form caveats; main() appends one
+
+
+def _resolve_pasta() -> str:
+    """Pick the pasta executable: first match on $PATH, otherwise /usr/bin/pasta.
+
+    The fallback path is returned as-is; this function does not check that it exists.
+    """
+    import shutil
+    return shutil.which("pasta") or "/usr/bin/pasta"
+
+
+def detect_host_gateway() -> str:
+    """Find the IPv4 address of the host's default gateway.
+
+    Runs ``ip -4 route show default`` and returns the token that follows
+    ``via`` on the first ``default ...`` line that has one.  Lines shaped
+    like ``default dev <IFACE> ...`` name no gateway IP and are passed over.
+    Raises RuntimeError when no line yields an address.
+    """
+    routes = subprocess.check_output(["ip", "-4", "route", "show", "default"], text=True)
+    for route in routes.splitlines():
+        fields = route.split()
+        # Only "default ..." entries count; blank lines split to [] and fail this test too.
+        if fields[:1] != ["default"]:
+            continue
+        if "via" not in fields:
+            continue
+        hop = fields.index("via") + 1
+        if hop < len(fields):
+            return fields[hop]
+    raise RuntimeError(
+        "no IPv4 default gateway with a 'via' field found in `ip route show default` output"
+    )
+
+
+def pasta_version(pasta: str) -> str:
+    """Return the first stdout line of `<pasta> --version`, stripped ("" when stdout is empty)."""
+    proc = subprocess.run([pasta, "--version"], capture_output=True, text=True, check=False)
+    return next(iter(proc.stdout.splitlines()), "").strip()
+
+
+def free_port() -> int:
+    """Return a loopback TCP port the kernel reports as free (released again before returning)."""
+    # `with` closes the probe socket even when bind()/getsockname() raises.
+    with socket.socket() as s:
+        s.bind(("127.0.0.1", 0))
+        return s.getsockname()[1]
+
+
+def run_in_netns(pasta: str, cmd: str, *, timeout: float) -> subprocess.CompletedProcess[str]:
+    """Execute the shell snippet `cmd` through ``bash -c`` under a new pasta instance.
+
+    Output is captured as text and a non-zero exit status is not raised (check=False);
+    subprocess.TimeoutExpired propagates once `timeout` seconds have passed.
+    """
+    # "--" ends pasta's own options; everything after it is the command pasta runs.
+    argv = [pasta, "-q", "--config-net", "--", "bash", "-c", cmd]
+    return subprocess.run(argv, capture_output=True, text=True, timeout=timeout, check=False)
+
+
+def measure_g2h_throughput(pasta: str, gw: str, iterations: int, transfer_mb: int) -> Optional[float]:
+    """Mean guest→host TCP throughput in Mbps over the successful runs, or None if all failed.
+
+    Per run: `dd | nc` inside a pasta netns sends `transfer_mb` MiB to a 127.0.0.1 listener.
+    """
+    samples_mbps: list[float] = []
+    for i in range(iterations):
+        port = free_port()
+        result_box: dict[str, object] = {}
+
+        srv = socket.socket()
+        srv.setsockopt(socket.SOL_SOCKET, socket.SO_REUSEADDR, 1)
+        srv.bind(("127.0.0.1", port))
+        srv.listen(1)
+        srv.settimeout(30.0)
+        # Default arguments pin this run's socket and result dict.  A plain closure would follow
+        # the loop's rebinding, letting a worker left over from a timed-out run touch a later one.
+        def host_drain(srv=srv, result_box=result_box) -> None:
+            try:
+                conn, _ = srv.accept()
+            except socket.timeout:
+                result_box["error"] = "accept timeout"
+                return
+            start = time.perf_counter()
+            total = 0
+            with conn:
+                while True:
+                    buf = conn.recv(1 << 16)
+                    if not buf:
+                        break
+                    total += len(buf)
+            result_box["bytes"] = total
+            result_box["elapsed"] = time.perf_counter() - start
+
+        worker = threading.Thread(target=host_drain, daemon=True)
+        worker.start()
+        time.sleep(0.2)
+
+        cmd = f"dd if=/dev/zero bs=1M count={transfer_mb} 2>/dev/null | nc {gw} {port}"
+        try:
+            run_in_netns(pasta, cmd, timeout=60)
+        except subprocess.TimeoutExpired:
+            print(f"g2h[{i:>2}]: client timeout; skipping", file=sys.stderr)
+            srv.close()
+            continue
+
+        worker.join(timeout=10)
+        srv.close()
+
+        if "error" in result_box:
+            print(f"g2h[{i:>2}]: {result_box['error']}; skipping", file=sys.stderr)
+            continue
+        bytes_received = int(result_box.get("bytes", 0))
+        elapsed = float(result_box.get("elapsed", 0.0))
+        if bytes_received <= 0 or elapsed < 1e-4:
+            print(f"g2h[{i:>2}]: bytes={bytes_received} elapsed={elapsed}s; skipping", file=sys.stderr)
+            continue
+        mbps = bytes_received * 8 / elapsed / 1_000_000
+        print(
+            f"g2h[{i:>2}]: {bytes_received} B in {elapsed:.3f}s = {mbps:.1f} Mbps",
+            file=sys.stderr,
+        )
+        samples_mbps.append(mbps)
+
+    if not samples_mbps:
+        return None
+    return sum(samples_mbps) / len(samples_mbps)
+
+
+def measure_rr_latency(
+    pasta: str, gw: str, iterations: int, samples_per_iter: int
+) -> tuple[Optional[float], Optional[float]]:
+    """Return (p50, p99) in µs over the samples pooled from every run, or (None, None).
+
+    A sample is the host's time from starting a 1-byte recv until its echo is sent.
+    """
+    all_samples_us: list[float] = []
+    for i in range(iterations):
+        port = free_port()
+        srv = socket.socket()
+        srv.setsockopt(socket.SOL_SOCKET, socket.SO_REUSEADDR, 1)
+        srv.bind(("127.0.0.1", port))
+        srv.listen(1)
+        srv.settimeout(30.0)
+        result_box: dict[str, object] = {}
+        # Defaults pin this run's srv/result_box so a leftover worker cannot touch a later run's.
+        def host_echo(srv=srv, result_box=result_box) -> None:
+            try:
+                conn, _ = srv.accept()
+            except socket.timeout:
+                result_box["error"] = "accept timeout"
+                return
+            samples: list[float] = []
+            with conn:
+                buf = bytearray(1)
+                for _ in range(samples_per_iter):
+                    start = time.perf_counter_ns()
+                    nrecv = conn.recv_into(buf, 1)
+                    if nrecv == 0:
+                        break
+                    conn.sendall(bytes(buf[:1]))
+                    samples.append((time.perf_counter_ns() - start) / 1000.0)
+            result_box["samples"] = samples
+
+        worker = threading.Thread(target=host_echo, daemon=True)
+        worker.start()
+        time.sleep(0.2)
+
+        # Send `samples_per_iter` zero bytes.  The guest doesn't read
+        # the echoed bytes back; host-side timing is the ground truth.
+        cmd = f"dd if=/dev/zero bs=1 count={samples_per_iter} 2>/dev/null | nc {gw} {port} >/dev/null"
+        try:
+            run_in_netns(pasta, cmd, timeout=60)
+        except subprocess.TimeoutExpired:
+            print(f"rr[{i:>2}]: client timeout; skipping", file=sys.stderr)
+            srv.close()
+            continue
+
+        worker.join(timeout=10)
+        srv.close()
+
+        if "error" in result_box:
+            print(f"rr[{i:>2}]: {result_box['error']}; skipping", file=sys.stderr)
+            continue
+        iter_samples = list(result_box.get("samples", []))
+        if len(iter_samples) > 1:
+            iter_samples.pop(0)
+        if not iter_samples:
+            print(f"rr[{i:>2}]: no samples; skipping", file=sys.stderr)
+            continue
+        p50 = statistics.median(iter_samples)
+        print(f"rr[{i:>2}]: {len(iter_samples)} samples, p50={p50:.1f} µs", file=sys.stderr)
+        all_samples_us.extend(iter_samples)
+
+    if not all_samples_us:
+        return None, None
+    sorted_s = sorted(all_samples_us)
+    n = len(sorted_s)
+    p50 = sorted_s[n // 2]
+    p99_idx = max(0, int(round(0.99 * (n - 1))))
+    p99 = sorted_s[p99_idx]
+    return p50, p99
+
+
+def measure_crr_latency(pasta: str, gw: str, iterations: int, samples_per_iter: int) -> Optional[float]:
+    """p50 (sorted sample at index n // 2) of host-side CRR times in µs, or None without samples.
+
+    The guest side opens one `nc` connection per sample from inside a pasta netns.
+    """
+    all_samples_us: list[float] = []
+    for i in range(iterations):
+        port = free_port()
+        srv = socket.socket()
+        srv.setsockopt(socket.SOL_SOCKET, socket.SO_REUSEADDR, 1)
+        srv.bind(("127.0.0.1", port))
+        srv.listen(64)
+        srv.settimeout(30.0)
+
+        result_box: dict[str, object] = {}
+
+        # Defaults pin this run's srv/result_box so a leftover worker cannot touch a later run's.
+        def host_accept_loop(srv=srv, result_box=result_box) -> None:
+            samples: list[float] = []
+            for _ in range(samples_per_iter):
+                # Start the timer BEFORE accept() so each sample includes
+                # the TCP connect + accept latency, matching
+                # voidbox-network-bench's measure_crr_latency semantics
+                # (its crr_echo_server starts the timer before
+                # accept_with_deadline).  Without this, the two
+                # harnesses report different metrics under the same
+                # name and the side-by-side comparison becomes
+                # meaningless.
+                start = time.perf_counter_ns()
+                try:
+                    conn, _ = srv.accept()
+                except socket.timeout:
+                    break
+                with conn:
+                    # one read + one write keeps it a true CRR round-trip
+                    try:
+                        conn.recv(1)
+                        conn.sendall(b"x")
+                    except OSError:
+                        pass
+                samples.append((time.perf_counter_ns() - start) / 1000.0)
+            result_box["samples"] = samples
+
+        worker = threading.Thread(target=host_accept_loop, daemon=True)
+        worker.start()
+        time.sleep(0.2)
+
+        # Guest: a tight loop of independent nc invocations
+        cmd = (
+            f"for _ in $(seq 1 {samples_per_iter}); do "
+            f"echo y | nc {gw} {port} >/dev/null; done"
+        )
+        try:
+            run_in_netns(pasta, cmd, timeout=120)
+        except subprocess.TimeoutExpired:
+            print(f"crr[{i:>2}]: client timeout; skipping", file=sys.stderr)
+            srv.close()
+            continue
+
+        worker.join(timeout=15)
+        srv.close()
+
+        iter_samples = list(result_box.get("samples", []))
+        if not iter_samples:
+            print(f"crr[{i:>2}]: no samples; skipping", file=sys.stderr)
+            continue
+        p50 = statistics.median(iter_samples)
+        print(f"crr[{i:>2}]: {len(iter_samples)} samples, p50={p50:.0f} µs", file=sys.stderr)
+        all_samples_us.extend(iter_samples)
+
+    if not all_samples_us:
+        return None
+    sorted_s = sorted(all_samples_us)
+    return sorted_s[len(sorted_s) // 2]
+
+
+def main() -> int:
+    """CLI entry point: run the three pasta benchmarks and emit the Report as JSON (0 ok, 2 setup)."""
+    parser = argparse.ArgumentParser(description="passt/pasta head-to-head bench harness")
+    parser.add_argument(
+        "--pasta",
+        default=os.environ.get("PASTA") or _resolve_pasta(),
+        help="path to the pasta binary; default $PASTA, or `pasta` on PATH, or system /usr/bin/pasta",
+    )
+    parser.add_argument("--iterations", type=int, default=3)
+    parser.add_argument("--transfer-mb", type=int, default=50)
+    parser.add_argument("--rr-samples", type=int, default=100)
+    parser.add_argument("--crr-samples", type=int, default=30)
+    parser.add_argument("--output", default=None, help="path to write JSON; default stdout")
+    args = parser.parse_args()
+
+    if not os.access(args.pasta, os.X_OK):
+        print(f"pasta not executable: {args.pasta}", file=sys.stderr)
+        return 2
+
+    try:
+        gw = detect_host_gateway()
+    except (RuntimeError, OSError, subprocess.SubprocessError) as exc:
+        print(f"cannot determine host gateway: {exc}", file=sys.stderr)
+        return 2
+    version = pasta_version(args.pasta)
+    print(f"pasta: {version}", file=sys.stderr)
+    print(f"host gateway (acts as host-loopback inside netns): {gw}", file=sys.stderr)
+
+    report = Report(backend="pasta", pasta_version=version)
+    report.notes.append(
+        "pasta runs in a network namespace (no VM); excludes the MMIO/virtio-mmio overhead "
+        "that voidbox-network-bench includes.  CRR latency is the most apples-to-apples metric "
+        "because it is dominated by NAT-table operations on both sides."
+    )
+
+    print("\n--- TCP throughput g2h ---", file=sys.stderr)
+    report.tcp_throughput_g2h_mbps = measure_g2h_throughput(args.pasta, gw, args.iterations, args.transfer_mb)
+
+    print("\n--- TCP RR latency ---", file=sys.stderr)
+    p50, p99 = measure_rr_latency(args.pasta, gw, args.iterations, args.rr_samples)
+    report.tcp_rr_latency_us_p50 = p50
+    report.tcp_rr_latency_us_p99 = p99
+
+    print("\n--- TCP CRR latency ---", file=sys.stderr)
+    report.tcp_crr_latency_us_p50 = measure_crr_latency(args.pasta, gw, args.iterations, args.crr_samples)
+
+    payload = json.dumps(asdict(report), indent=2)
+    if args.output:
+        with open(args.output, "w", encoding="utf-8") as f:
+            f.write(payload + "\n")
+        print(f"\nReport written to {args.output}", file=sys.stderr)
+    else:
+        print()
+        print(payload)
+    return 0
+
+
+if __name__ == "__main__":
+    sys.exit(main())
diff --git a/tools/perf-harness/bench-qemu-slirp.sh b/tools/perf-harness/bench-qemu-slirp.sh
new file mode 100755
index 00000000..eacb3f6a
--- /dev/null
+++ b/tools/perf-harness/bench-qemu-slirp.sh
@@ -0,0 +1,235 @@
+#!/usr/bin/env bash
+# bench-qemu-slirp.sh — qemu-side of the proper SLIRP-vs-SLIRP head-to-head.
+#
+# Boots a minimal qemu guest with the static crr-client baked in, runs N
+# TCP CRRs against a host TCP server, and prints `n p50_ns p99_ns mean_ns`.
+#
+# Two backends:
+#   --backend libslirp    qemu's built-in -netdev user (libslirp)
+#   --backend passt       qemu -netdev stream + a passt(1) instance over UNIX socket
+#
+# Both produce a number directly comparable to tools/perf-harness/bench-pasta.py's
+# pasta-side number AND to examples/crr_singleproc_bench.rs's voidbox-side
+# number — same workload, same C client, same iteration count.
+#
+# Why this exists:  voidbox-vs-pasta comparisons mix two different
+# architectures (a real VM vs a netns).  The right SLIRP-vs-SLIRP comparison
+# is voidbox+voidbox-SLIRP vs qemu+passt vs qemu+libslirp — all VM-attached.
+# See docs/passt-comparison.md.
+
+set -euo pipefail
+
+BACKEND=libslirp
+ITERATIONS=30
+KERNEL=${KERNEL:-/boot/vmlinuz-$(uname -r)}
+# NB: must be the `passt` binary (VM/socket mode), NOT the `pasta` symlink
+# (namespace mode).  The two modes are the same code keyed on argv[0].
+# Default discovery order: $PASST env var → `passt` on $PATH → /usr/bin/passt.
+# Print the passt binary to use: whatever `command -v passt` resolves, or the
+# literal /usr/bin/passt when that finds nothing.  The fallback is not checked
+# for existence here.
+default_passt() {
+  # command -v prints nothing and fails when passt is absent, so || supplies the fallback.
+  command -v passt 2>/dev/null || echo /usr/bin/passt
+}
+PASST=${PASST:-$(default_passt)}
+HOST_PORT=${HOST_PORT:-18877}
+GUEST_ADDR=${GUEST_ADDR:-10.0.2.15}
+GUEST_GATEWAY=${GUEST_GATEWAY:-10.0.2.2}
+CRR_CLIENT_BIN=${CRR_CLIENT_BIN:-/tmp/crr-client}
+ROOTFS_DIR=${ROOTFS_DIR:-}
+KEEP_ROOTFS=${KEEP_ROOTFS:-0}
+
+usage() {  # print the CLI help on stdout; lists every flag the option loop accepts
+  cat <<EOF
+Usage: $0 [--backend libslirp|passt] [--iterations N] [--kernel PATH] [--port PORT] [--rootfs-dir DIR] [--keep]
+
+Env vars:
+  KERNEL          path to a Linux bzImage (default: host distro kernel)
+  PASST           path to the passt binary (default: \`passt\` on \$PATH, falling back to /usr/bin/passt)
+  CRR_CLIENT_BIN  path to the static crr-client binary (default: /tmp/crr-client)
+  HOST_PORT       TCP port for the host listener (default: 18877)
+  GUEST_ADDR      IPv4 to assign to the guest (default: 10.0.2.15)
+  GUEST_GATEWAY   IPv4 the guest treats as host loopback (default: 10.0.2.2)
+
+Output: one line "n p50_ns p99_ns mean_ns" on stdout.
+EOF
+}
+
+while [[ $# -gt 0 ]]; do
+  case "$1" in
+    --backend)    BACKEND="$2"; shift 2 ;;
+    --iterations) ITERATIONS="$2"; shift 2 ;;
+    --kernel)     KERNEL="$2"; shift 2 ;;
+    --port)       HOST_PORT="$2"; shift 2 ;;
+    --rootfs-dir) ROOTFS_DIR="$2"; shift 2 ;;
+    --keep)       KEEP_ROOTFS=1; shift ;;
+    -h|--help)    usage; exit 0 ;;
+    *)            echo "unknown arg: $1" >&2; usage; exit 1 ;;
+  esac
+done
+
+case "$BACKEND" in
+  libslirp|passt) : ;;
+  *) echo "unknown backend: $BACKEND" >&2; exit 1 ;;
+esac
+
+[[ -x "$CRR_CLIENT_BIN" ]] || {
+  echo "ERROR: crr-client not found at $CRR_CLIENT_BIN" >&2
+  echo "       compile it with: gcc -O2 -static -o $CRR_CLIENT_BIN tools/perf-harness/crr-client.c" >&2
+  exit 2
+}
+
+[[ -r "$KERNEL" ]] || { echo "ERROR: kernel not readable: $KERNEL" >&2; exit 2; }
+
+SCRIPT_DIR="$(cd "$(dirname "$0")" && pwd)"
+INIT_TEMPLATE="$SCRIPT_DIR/qemu-init.sh"
+[[ -r "$INIT_TEMPLATE" ]] || { echo "ERROR: missing $INIT_TEMPLATE" >&2; exit 2; }
+
+# ---------------------------------------------------------------------------
+# Build the initramfs.  Keep it on tmpfs so it doesn't pollute the workspace.
+# ---------------------------------------------------------------------------
+if [[ -z "$ROOTFS_DIR" ]]; then
+  ROOTFS_DIR=$(mktemp -d -t voidbox-qemu-rootfs.XXXXXX)
+  cleanup_rootfs() { if [[ "$KEEP_ROOTFS" -eq 0 ]]; then rm -rf -- "${ROOTFS_DIR:?}"; fi; }
+  # Later traps expand "${cleanup_rootfs:-true}" (a variable, not the function), so set it too.
+  cleanup_rootfs=cleanup_rootfs
+  trap cleanup_rootfs EXIT
+fi
+
+mkdir -p "$ROOTFS_DIR"/{bin,sbin,proc,sys,dev,tmp}
+
+# Static busybox: prefer host /usr/bin/busybox (Fedora ships static); fall back
+# to extracting from voidbox's claude rootfs if needed.
+if [[ -x /usr/bin/busybox ]] && file /usr/bin/busybox 2>/dev/null | grep -q "statically linked"; then
+  cp /usr/bin/busybox "$ROOTFS_DIR/bin/busybox"
+elif [[ -r "$SCRIPT_DIR/../../target/void-box-claude.cpio.gz" ]]; then
+  (cd "$ROOTFS_DIR" && zcat "$SCRIPT_DIR/../../target/void-box-claude.cpio.gz" | cpio -idm bin/busybox 2>/dev/null)
+else
+  echo "ERROR: no static busybox found; install busybox-static or build target/void-box-claude.cpio.gz" >&2
+  exit 2
+fi
+
+cp "$INIT_TEMPLATE" "$ROOTFS_DIR/init"
+chmod +x "$ROOTFS_DIR/init"
+cp "$CRR_CLIENT_BIN" "$ROOTFS_DIR/tmp/crr-client"
+
+for cmd in sh ifconfig route poweroff cat sleep echo mount find ls insmod; do
+  ln -sf busybox "$ROOTFS_DIR/bin/$cmd"
+done
+
+# Stage virtio_net + failover modules from the host kernel so the distro-kernel
+# path can probe the qemu virtio-net-pci device.  Voidbox's slim kernel has
+# them built-in and ignores these.
+KMOD_DIR="/lib/modules/$(uname -r)/kernel"
+if [[ -d "$KMOD_DIR" ]]; then
+  KGUEST_DIR="$ROOTFS_DIR/lib/modules/$(uname -r)"
+  mkdir -p "$KGUEST_DIR"
+  for mod in net/core/failover.ko.xz net/core/failover.ko \
+             drivers/net/net_failover.ko.xz drivers/net/net_failover.ko \
+             drivers/net/virtio_net.ko.xz drivers/net/virtio_net.ko; do
+    [[ -r "$KMOD_DIR/$mod" ]] && cp "$KMOD_DIR/$mod" "$KGUEST_DIR/"
+  done
+fi
+
+INITRD=$(mktemp -t voidbox-qemu-initrd.XXXXXX.cpio.gz)
+trap "rm -f $INITRD; ${cleanup_rootfs:-true}" EXIT
+(cd "$ROOTFS_DIR" && find . | cpio -H newc -o 2>/dev/null | gzip > "$INITRD")
+
+# ---------------------------------------------------------------------------
+# Host-side echo server.  The script's outer EXIT trap kills it, so the
+# server stays alive for the entire qemu run rather than racing against a
+# fixed-duration sleep.  HOST_PORT must be free; the script fails fast if
+# bind() refuses (no fallback to ephemeral — the guest's kernel cmdline
+# carries the port and changing it after launch isn't useful).
+# ---------------------------------------------------------------------------
+SERVER_PIDFILE=$(mktemp)
+python3 - <<PY &
+import os, signal, socket, sys, threading
+port = int(os.environ.get("HOST_PORT", "$HOST_PORT"))
+s = socket.socket()
+s.setsockopt(socket.SOL_SOCKET, socket.SO_REUSEADDR, 1)
+try:
+    s.bind(("127.0.0.1", port))
+except OSError as e:
+    sys.stderr.write(f"echo-server: bind 127.0.0.1:{port} failed: {e}\n")
+    sys.exit(2)
+s.listen(64)
+sys.stderr.write(f"echo-server: bound 127.0.0.1:{port}\n"); sys.stderr.flush()
+def loop():
+    while True:
+        try: c, _ = s.accept()
+        except OSError: return
+        try:
+            c.recv(1); c.sendall(b"x")
+        except OSError: pass
+        finally: c.close()
+threading.Thread(target=loop, daemon=True).start()
+# Park the main thread on an event nothing sets; the parent script's EXIT
+# trap kills this process, so the listener cannot time out before qemu ends.
+threading.Event().wait()
+PY
+SERVER_PID=$!
+echo "$SERVER_PID" > "$SERVER_PIDFILE"
+trap "kill $SERVER_PID 2>/dev/null || true; rm -f $INITRD $SERVER_PIDFILE; ${cleanup_rootfs:-true}" EXIT
+sleep 0.3
+# Fail fast if the listener already exited (for example bind() refused because HOST_PORT is busy).
+kill -0 "$SERVER_PID" 2>/dev/null || { echo "ERROR: host echo server failed to start on port $HOST_PORT" >&2; exit 2; }
+
+# ---------------------------------------------------------------------------
+# Backend: spin up passt if requested.
+# ---------------------------------------------------------------------------
+PASST_PID=""
+PASST_SOCK=""
+NETDEV_ARGS=""
+case "$BACKEND" in
+  libslirp)
+    NETDEV_ARGS="-netdev user,id=n0 -device virtio-net-pci,netdev=n0"
+    ;;
+  passt)
+    [[ -x "$PASST" ]] || { echo "ERROR: passt not executable: $PASST" >&2; exit 2; }
+    PASST_SOCK=$(mktemp -u -t voidbox-passt.XXXXXX.sock)
+    rm -f "$PASST_SOCK"
+    "$PASST" -f -s "$PASST_SOCK" \
+      -a "$GUEST_ADDR" -n 24 -g "$GUEST_GATEWAY" \
+      --map-host-loopback "$GUEST_GATEWAY" \
+      -q >/tmp/passt.log 2>&1 &
+    PASST_PID=$!
+    sleep 0.4
+    [[ -S "$PASST_SOCK" ]] || { echo "ERROR: passt socket not created" >&2; exit 3; }
+    NETDEV_ARGS="-netdev stream,id=n0,addr.type=unix,addr.path=$PASST_SOCK -device virtio-net-pci,netdev=n0"
+    trap "kill $SERVER_PID $PASST_PID 2>/dev/null; rm -f $INITRD $SERVER_PIDFILE $PASST_SOCK; ${cleanup_rootfs:-true}" EXIT
+    ;;
+esac
+
+# ---------------------------------------------------------------------------
+# Boot qemu, capture serial output.
+# ---------------------------------------------------------------------------
+QEMU_LOG=$(mktemp -t voidbox-qemu.XXXXXX.log)
+trap "kill ${SERVER_PID} ${PASST_PID:-} 2>/dev/null; rm -f $INITRD $SERVER_PIDFILE $QEMU_LOG ${PASST_SOCK:-}; ${cleanup_rootfs:-true}" EXIT
+
+# shellcheck disable=SC2086
+HOST_PORT="$HOST_PORT" timeout 60 qemu-system-x86_64 \
+  -enable-kvm -cpu host -m 512 -smp 1 \
+  -kernel "$KERNEL" \
+  -initrd "$INITRD" \
+  -nographic -no-reboot \
+  -append "console=ttyS0 reboot=t panic=1 quiet crr_target=${GUEST_GATEWAY}:${HOST_PORT}:${ITERATIONS} crr_net=${GUEST_ADDR}/24,${GUEST_GATEWAY}" \
+  $NETDEV_ARGS \
+  > "$QEMU_LOG" 2>&1 || true
+
+# Extract the one-line crr-client output between sentinels.
+RESULT=$(sed -n '/===CRR-START===/,/===CRR-END/p' "$QEMU_LOG" | grep -E '^[0-9]+ [0-9]+ [0-9]+ [0-9]+$' | head -1 || true)
+
+if [[ -z "$RESULT" ]]; then
+  echo "ERROR: no result from guest (qemu log tail follows):" >&2
+  tail -20 "$QEMU_LOG" >&2
+  exit 4
+fi
+
+read -r N P50_NS P99_NS MEAN_NS <<<"$RESULT"
+P50_US=$((P50_NS / 1000))
+P99_US=$((P99_NS / 1000))
+MEAN_US=$((MEAN_NS / 1000))
+echo "qemu+${BACKEND} CRR over $N iterations: p50=${P50_US} µs, p99=${P99_US} µs, mean=${MEAN_US} µs" >&2
+echo "$RESULT"
diff --git a/tools/perf-harness/crr-client.c b/tools/perf-harness/crr-client.c
new file mode 100644
index 00000000..df9ee70d
--- /dev/null
+++ b/tools/perf-harness/crr-client.c
@@ -0,0 +1,85 @@
+// crr-client.c — N-iteration TCP CRR loop inside a single process.
+//
+// Usage: crr-client HOST PORT N
+// Output: one line "n p50_ns p99_ns mean_ns" to stdout.
+//
+// Each iteration: socket → connect → write 1 byte → read 1 byte → close.
+// Times the full cycle with CLOCK_MONOTONIC.  No fork, no exec, no
+// per-iteration interpreter overhead — isolates the user-mode TCP /
+// NAT path from the bench's outer process-spawning loop.
+
+#include <arpa/inet.h>
+#include <errno.h>
+#include <limits.h>
+#include <netinet/in.h>
+#include <stdio.h>
+#include <stdlib.h>
+#include <string.h>
+#include <sys/socket.h>
+#include <time.h>
+#include <unistd.h>
+
+static int cmp_long(const void *a, const void *b) {  // qsort comparator: ascending longs
+    const long *x = a, *y = b;
+    return *x < *y ? -1 : (*x > *y ? 1 : 0);  // compares instead of subtracting
+}
+
+// Entry point: run N connect → 1-byte write → 1-byte read → close cycles against HOST:PORT.
+// Exit status: 0 ok, 1 bad arguments, 2 allocation failure, 3 socket/connect/exchange error.
+int main(int argc, char **argv) {
+    if (argc != 4) {
+        fprintf(stderr, "usage: %s HOST PORT N\n", argv[0]);
+        return 1;
+    }
+    const char *host = argv[1];
+    int port = atoi(argv[2]);
+    int n = atoi(argv[3]);
+    // htons() would silently truncate a port outside 1..65535, so reject it along with N.
+    if (port <= 0 || port > 65535 || n <= 0 || n > 1000000) {
+        fprintf(stderr, "PORT or N out of range\n");
+        return 1;
+    }
+
+    struct sockaddr_in addr;
+    memset(&addr, 0, sizeof addr);
+    addr.sin_family = AF_INET;
+    addr.sin_port = htons(port);
+    if (inet_pton(AF_INET, host, &addr.sin_addr) != 1) {
+        fprintf(stderr, "bad host %s\n", host);
+        return 1;
+    }
+
+    long *samples = calloc((size_t)n, sizeof(long));
+    if (!samples) return 2;
+
+    for (int i = 0; i < n; i++) {
+        struct timespec t0, t1;
+        clock_gettime(CLOCK_MONOTONIC, &t0);
+
+        int fd = socket(AF_INET, SOCK_STREAM, 0);
+        if (fd < 0) { perror("socket"); return 3; }
+        if (connect(fd, (struct sockaddr *)&addr, sizeof addr) < 0) { perror("connect"); return 3; }
+        char buf;
+        // A failed or short exchange is not a round trip: abort rather than record its time.
+        if (write(fd, "y", 1) != 1 || read(fd, &buf, 1) != 1) {
+            fprintf(stderr, "iteration %d: 1-byte exchange failed\n", i);
+            return 3;
+        }
+        close(fd);
+
+        clock_gettime(CLOCK_MONOTONIC, &t1);
+        long ns = (t1.tv_sec - t0.tv_sec) * 1000000000L + (t1.tv_nsec - t0.tv_nsec);
+        samples[i] = ns;
+    }
+
+    qsort(samples, (size_t)n, sizeof(long), cmp_long);
+    long sum = 0;
+    for (int i = 0; i < n; i++) sum += samples[i];
+    long p50  = samples[n / 2];
+    long p99  = samples[(n * 99) / 100];
+    long mean = sum / n;
+    printf("%d %ld %ld %ld\n", n, p50, p99, mean);
+
+    free(samples);
+    return 0;
+}
diff --git a/tools/perf-harness/qemu-init.sh b/tools/perf-harness/qemu-init.sh
new file mode 100755
index 00000000..e32da047
--- /dev/null
+++ b/tools/perf-harness/qemu-init.sh
@@ -0,0 +1,105 @@
+#!/bin/sh
+# tools/perf-harness/qemu-init.sh — /init for the SLIRP-vs-SLIRP comparison guest.
+#
+# Used by tools/perf-harness/bench-qemu-slirp.sh.  Read /proc/cmdline for:
+#   crr_target=HOST:PORT:N      target server + iteration count
+#   crr_net=ADDR/MASK,GW        static network config
+#
+# Bring up eth0 with the static IP, run /tmp/crr-client, and halt.
+# The script is paranoid about busybox-vs-distro variations: virtio-net
+# is loaded as a module if present (Fedora-style), or assumed built-in
+# (voidbox's slim kernel).
+
+# Print the dotted-quad netmask for a CIDR prefix length.
+#   $1  prefix length; only 0..32 (no leading zeros) is honoured
+# Anything else (empty, non-numeric, out of range) yields the /24 mask,
+# so a crr_net value without a usable /N suffix still configures eth0.
+# Call it inside $(...) so the ptn_* scratch variables do not leak.
+prefix_to_netmask() {
+  ptn_prefix="$1"
+  case "$ptn_prefix" in
+    [0-9] | [12][0-9] | 3[0-2]) ;;
+    *) ptn_prefix=24 ;;
+  esac
+  ptn_mask=""
+  for ptn_octet_idx in 0 1 2 3; do
+    # Prefix bits that land in this octet (may be <= 0 or >= 8).
+    ptn_bits=$((ptn_prefix - 8 * ptn_octet_idx))
+    if [ "$ptn_bits" -ge 8 ]; then
+      ptn_octet=255
+    elif [ "$ptn_bits" -le 0 ]; then
+      ptn_octet=0
+    else
+      # Partial octet: set the top ptn_bits bits, e.g. 5 -> 248.
+      ptn_octet=$((256 - (1 << (8 - ptn_bits))))
+    fi
+    ptn_mask="${ptn_mask}${ptn_mask:+.}${ptn_octet}"
+  done
+  echo "$ptn_mask"
+}
+
+# Keep going when a step fails: some steps below are expected to fail
+# on certain guests (e.g. insmod when the drivers are built in).
+set +e
+mount -t proc proc /proc 2>/dev/null
+mount -t sysfs sysfs /sys 2>/dev/null
+
+# Pick the crr_* tokens out of the kernel command line.  $cmdline is
+# left unquoted on purpose so the shell splits it on whitespace.
+cmdline="$(cat /proc/cmdline)"
+target=""
+net=""
+for tok in $cmdline; do
+  case "$tok" in
+    crr_target=*) target="${tok#crr_target=}" ;;
+    crr_net=*)    net="${tok#crr_net=}" ;;
+  esac
+done
+
+if [ -z "$target" ] || [ -z "$net" ]; then
+  echo "ERROR: missing crr_target or crr_net on cmdline"
+  echo "cmdline: $cmdline"
+  poweroff -f
+fi
+
+# Split crr_net=ADDR/MASK,GW and crr_target=HOST:PORT:N with parameter
+# expansion only.
+addr_mask="${net%,*}"
+gw="${net#*,}"
+host="${target%%:*}"
+rest="${target#*:}"
+port="${rest%%:*}"
+n="${rest#*:}"
+
+busybox ifconfig lo up
+
+# Load virtio modules if shipped in the rootfs (distro-kernel case).
+# Voidbox's slim kernel has them built-in so insmod fails harmlessly.
+for mod in failover net_failover virtio_net; do
+  busybox find /lib/modules -name "${mod}.ko*" -exec busybox insmod {} \; 2>/dev/null
+done
+
+# Wait for eth0 to show up: at most 30 polls, 0.1 s apart.
+i=0
+while [ $i -lt 30 ] && ! busybox ifconfig eth0 >/dev/null 2>&1; do
+  sleep 0.1
+  i=$((i+1))
+done
+
+# Derive the netmask from the /N suffix instead of hard-coding /24:
+# crr_net is documented as ADDR/MASK,GW and a call site might
+# reasonably use any prefix length (/16, /25, /29, ...).  Anything
+# unparseable falls back to /24 so existing setups keep working.
+addr="${addr_mask%/*}"
+prefix="${addr_mask#*/}"
+mask="$(prefix_to_netmask "$prefix")"
+busybox ifconfig eth0 "$addr" netmask "$mask" up
+busybox route add default gw "$gw"
+
+echo "===CRR-START==="
+echo "addr=${addr_mask} gw=${gw} target=${host}:${port} n=${n}"
+/tmp/crr-client "$host" "$port" "$n"
+rc=$?
+echo "===CRR-END (rc=$rc)==="
+
+poweroff -f