10 changes: 10 additions & 0 deletions Cargo.lock

Some generated files are not rendered by default.

5 changes: 5 additions & 0 deletions Cargo.toml
@@ -113,6 +113,11 @@ socket2 = { version = "0.5", features = ["all"] }
# path of a NAT keyed by guest-side ports the guest itself chooses.
rustc-hash = "2"

# Lock-free MPMC queue used to hand virtio-net RX frames from the
# net-poll thread to the vCPU thread without taking the
# `Arc<Mutex<VirtioNetDevice>>` device lock on the hot path.
crossbeam-queue = "0.3"

# --- macOS-only dependencies ---
[target.'cfg(target_os = "macos")'.dependencies]
# Objective-C 2.0 bindings (auto-generated from Apple frameworks)
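The comment above on `crossbeam-queue` describes the intended hot-path hand-off: the net-poll thread produces virtio-net RX frames, the vCPU thread consumes them, and neither side takes the `Arc<Mutex<VirtioNetDevice>>` lock. A minimal sketch of that pattern with `crossbeam_queue::ArrayQueue` follows; the frame type and the queue capacity are illustrative, not the PR's actual device code:

```rust
use std::sync::Arc;
use crossbeam_queue::ArrayQueue;

/// Illustrative frame type; the real RX buffer layout may differ.
struct RxFrame(Vec<u8>);

fn main() {
    // Bounded lock-free MPMC queue shared by the net-poll and vCPU threads.
    let rx_queue: Arc<ArrayQueue<RxFrame>> = Arc::new(ArrayQueue::new(256));

    // Net-poll side: `push` returns Err(frame) instead of blocking when the
    // queue is full, so the producer never waits on the consumer.
    let producer = Arc::clone(&rx_queue);
    let poll_thread = std::thread::spawn(move || {
        let frame = RxFrame(vec![0u8; 1514]);
        if let Err(_dropped) = producer.push(frame) {
            // Queue full: drop (or requeue) the frame; no lock, no block.
        }
    });
    poll_thread.join().unwrap();

    // vCPU side: drain pending frames on the hot path without locking.
    while let Some(RxFrame(bytes)) = rx_queue.pop() {
        // ... copy `bytes` into the virtio-net RX virtqueue here ...
        assert!(!bytes.is_empty());
    }
}
```

The property that matters for the hot path is that `push` fails rather than blocks, so a slow consumer can never stall the producer behind the device mutex.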
96 changes: 96 additions & 0 deletions docs/passt-comparison.md
@@ -0,0 +1,96 @@
# passt head-to-head comparison harness

Tools under `tools/perf-harness/` produce a side-by-side comparison of voidbox
(real KVM VM + SLIRP) against passt's [`pasta`](https://passt.top/passt/about/)
running in a network namespace.

This is the deferred deliverable from
[`docs/superpowers/plans/2026-04-27-smoltcp-passt-port.md`](superpowers/plans/2026-04-27-smoltcp-passt-port.md)
§ "passt head-to-head methodology".

## What the harness measures

Both sides run the same workload shape — the same fields the
`voidbox-network-bench` `Report` already emits:

| Field | Workload |
|---|---|
| `tcp_throughput_g2h_mbps` | `dd if=/dev/zero bs=1M count=N \| nc HOST PORT` from inside the guest / netns; host TCP server times the drain |
| `tcp_rr_latency_us_p50/p99` | Persistent connection, host-side echo loop bouncing one byte per round trip |
| `tcp_crr_latency_us_p50` | Independent `nc` invocations in a tight loop; host-side timing of the full accept→read→write→close cycle |
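
For the `tcp_rr_latency` row above, the host-side echo loop is the whole server: one byte in, one byte out, over a single persistent connection. A minimal sketch, with an illustrative bind address and port (the real server lives in `voidbox-network-bench`):

```rust
use std::io::{Read, Write};
use std::net::TcpListener;

fn main() -> std::io::Result<()> {
    // One persistent connection; each loop iteration is one round trip
    // that the guest-side client timestamps.
    let listener = TcpListener::bind("127.0.0.1:9000")?; // illustrative port
    let (mut conn, _peer) = listener.accept()?;
    let mut buf = [0u8; 1];
    while conn.read(&mut buf)? == 1 {
        conn.write_all(&buf)?;
    }
    Ok(())
}
```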

The pasta side uses `pasta -- COMMAND` to run the client inside a fresh
network namespace. Pasta's `--map-host-loopback` address (by default the
guest-visible gateway IP) is translated to the host's loopback, so a client
that connects to `<host-gateway>:PORT` reaches the host server bound on
`127.0.0.1:PORT`.

## What it's good for

**CRR latency is the most apples-to-apples metric** — it's dominated by
NAT-table operations and the round-trip path through the user-mode
networking stack, which is the same code on both sides. Per the spec:

> Connect rate (CRR latency) is the most apples-to-apples metric —
> dominated by NAT-table operations, not MMIO. If passt does CRR in 135 µs
> and we do 600 µs, that's a meaningful "we have 4× more overhead per
> connect" signal that this refactor should narrow.

## What it's not

**Throughput numbers are not directly comparable.**

- voidbox runs a real KVM VM; every packet incurs `virtio-mmio`
exits, vCPU IPI overhead, and per-packet copy across the device
boundary.
- pasta runs in a network namespace; the data path is just user-mode
socket forwarding, no VM, no MMIO.

Voidbox's total cost is therefore the *user-mode overhead the two stacks
share* plus the *VM transit cost only voidbox pays*; the throughput gap
between them mostly reflects the latter. Use the throughput numbers as a
sanity bound, not a parity target.

A proper VM-vs-VM comparison would run passt under
`qemu-system-x86_64` with a guest image carrying `nc` / `iperf3`.
That is documented as a separate follow-up; the harness here is the
quick, low-friction sibling that exercises the apples-to-apples
metric (CRR) without requiring an extra guest image.

## Usage

```bash
# Generate voidbox numbers (requires VOID_BOX_KERNEL/VOID_BOX_INITRAMFS).
cargo run --release --bin voidbox-network-bench -- \
--iterations 3 --output /tmp/voidbox-bench.json

# Generate pasta numbers (requires pasta on PATH or via $PASTA).
tools/perf-harness/bench-pasta.py --output /tmp/pasta-bench.json

# Side-by-side markdown.
tools/perf-harness/bench-compare-pasta.py /tmp/voidbox-bench.json /tmp/pasta-bench.json \
--output /tmp/voidbox-vs-pasta.md

# qemu+libslirp / qemu+passt CRR (apples-to-apples SLIRP-vs-SLIRP).
gcc -O2 -static -o /tmp/crr-client tools/perf-harness/crr-client.c
tools/perf-harness/bench-qemu-slirp.sh --backend libslirp --iterations 30
tools/perf-harness/bench-qemu-slirp.sh --backend passt --iterations 30

# Voidbox single-process CRR (no per-iteration nc fork).
cargo run --release --example crr_singleproc_bench -- --iterations 30
```

`tools/perf-harness/bench-pasta.py --help` lists tunables (iterations,
transfer size, sample counts).

## Reading the report

| Δ column | Meaning |
|---|---|
| `voidbox N× faster` (throughput) | voidbox has the higher Mbps number |
| `voidbox N× slower` (throughput) | pasta has the higher Mbps number — expected, since pasta has no VM |
| `voidbox N× faster` (latency) | voidbox has the lower µs number |
| `voidbox N× slower` (latency) | pasta has the lower µs number — large multiples here mean voidbox spends much of its CRR time outside the NAT path (poll-thread cadence, vCPU exits, virtio handling) |

A useful CRR signal: if `voidbox N× slower on CRR p50` is much larger
than `voidbox N× slower on RR p50`, the per-connection overhead is the
bottleneck, not the data path. For example, 2× slower on RR p50 but 8×
slower on CRR p50 would mean most of the extra cost lives in connection
setup, not in moving bytes. RR p50 captures the data path; CRR captures
the connect path.
157 changes: 157 additions & 0 deletions examples/crr_singleproc_bench.rs
@@ -0,0 +1,157 @@
//! crr_singleproc_bench — voidbox-side N-iteration TCP CRR loop in a
//! single guest process, isolating voidbox's NAT-path cost from the
//! existing bench's per-iteration `nc` fork+exec overhead.
//!
//! NOT meant for the production bench surface; this is a one-off
//! diagnostic that pairs with `tools/perf-harness/crr-client.c` + the
//! pasta side of the head-to-head. Compile and run directly:
//!
//! gcc -O2 -static -o /tmp/crr-client tools/perf-harness/crr-client.c
//! cargo run --release --example crr_singleproc_bench -- \
//! --iterations 100 --bench-binary /tmp/crr-client
//!
//! Requires the same env vars as voidbox-network-bench:
//! VOID_BOX_KERNEL, VOID_BOX_INITRAMFS

use std::net::TcpListener;
use std::thread;
use std::time::Duration;

use clap::Parser;
use void_box::backend::MountConfig;
use void_box::sandbox::Sandbox;

#[derive(Parser)]
#[command(version, about)]
struct Cli {
/// Number of CRR iterations.
#[arg(long, default_value_t = 100)]
iterations: u32,
/// Host path to the static crr-client binary.
#[arg(long, default_value = "/tmp/crr-client")]
bench_binary: String,
/// Memory size for the guest VM (MB).
#[arg(long, default_value_t = 1024)]
memory_mb: usize,
}

const HOST_LOOPBACK_FROM_GUEST: &str = "10.0.2.2";

#[tokio::main(flavor = "multi_thread")]
async fn main() -> Result<(), Box<dyn std::error::Error>> {
let cli = Cli::parse();
let bench_binary = std::path::PathBuf::from(&cli.bench_binary);
if !bench_binary.exists() {
return Err(format!(
"bench binary not found: {} (compile with `gcc -static -o /tmp/crr-client tools/perf-harness/crr-client.c`)",
cli.bench_binary
)
.into());
}
let bench_binary_dir = bench_binary
.parent()
.ok_or("bench-binary has no parent dir")?
.to_string_lossy()
.into_owned();
let bench_binary_name = bench_binary
.file_name()
.ok_or("bench-binary has no file name")?
.to_string_lossy()
.into_owned();

let listener = TcpListener::bind("127.0.0.1:0")?;
let host_port = listener.local_addr()?.port();
listener.set_nonblocking(true)?;

let iterations = cli.iterations;
let server_thread = thread::spawn(move || {
// Non-blocking accept with a tight poll, deadline-checked. With
// a blocking accept the deadline never fires if the guest never
// connects (boot failure, SLIRP rate limit, etc.) and the
// example's later `server_thread.join()` would hang forever.
// The accept-pickup latency directly inflates each guest CRR
// sample, so the wait is kept short — `from_micros(50)` adds
// at most ~50 µs of jitter on top of a ~280 µs baseline, while
// still letting the deadline check fire every ~50 µs.
let mut accepted = 0u32;
let deadline = std::time::Instant::now() + Duration::from_secs(120);
while accepted < iterations && std::time::Instant::now() < deadline {
match listener.accept() {
Ok((mut conn, _)) => {
conn.set_nonblocking(false).ok();
let mut buf = [0u8; 1];
let _ = std::io::Read::read(&mut conn, &mut buf);
let _ = std::io::Write::write_all(&mut conn, b"x");
accepted += 1;
}
Err(ref e) if e.kind() == std::io::ErrorKind::WouldBlock => {
thread::sleep(Duration::from_micros(50));
}
Err(_) => break,
}
}
accepted
});

let sandbox = Sandbox::local()
.from_env()?
.memory_mb(cli.memory_mb)
.network(true)
// Production SLIRP defaults (50/s rate, 64 concurrent) are
// sized to throttle a guest-side flood — far below what a
// CRR microbench wants. Lift both ceilings so the bench
// exercises the steady-state NAT path, not the rate limiter.
.network_max_connections_per_second(u32::MAX)
.network_max_concurrent_connections(usize::MAX)
.mount(MountConfig {
host_path: bench_binary_dir.clone(),
guest_path: "/tmp/host".into(),
read_only: true,
})
.build()?;

eprintln!(
"VM booted; running {} CRRs in a single guest process...",
iterations
);
let probe = sandbox.exec("sh", &["-c", ":"]).await?;
if !probe.success() {
return Err("VM probe exec failed".into());
}

let cmd = format!(
"/tmp/host/{name} {host} {port} {n}",
name = bench_binary_name,
host = HOST_LOOPBACK_FROM_GUEST,
port = host_port,
n = iterations,
);
let output = sandbox.exec("sh", &["-c", &cmd]).await?;
let stdout = output.stdout_str().to_string();
let stderr = output.stderr_str().to_string();
if !output.success() {
eprintln!("guest stderr: {stderr}");
return Err(format!("guest exec failed: {:?}", output.exit_code).into());
}

let server_thread_count = server_thread.join().unwrap_or(0);
eprintln!("host accepts: {server_thread_count}/{iterations}");

let line = stdout.lines().next().ok_or("empty guest stdout")?;
let parts: Vec<&str> = line.split_whitespace().collect();
if parts.len() != 4 {
return Err(format!("unexpected guest stdout: {line:?}").into());
}
let n: u32 = parts[0].parse()?;
let p50_ns: u64 = parts[1].parse()?;
let p99_ns: u64 = parts[2].parse()?;
let mean_ns: u64 = parts[3].parse()?;

println!();
println!("voidbox single-process CRR over {n} iterations:");
println!(" p50: {} µs", p50_ns / 1000);
println!(" p99: {} µs", p99_ns / 1000);
println!(" mean: {} µs", mean_ns / 1000);

Ok(())
}
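
For reference, the `n p50_ns p99_ns mean_ns` line parsed above comes from `tools/perf-harness/crr-client.c`. A hypothetical Rust equivalent of the reporting step, using nearest-rank percentiles (the C client's exact method may differ):

```rust
// Turn raw per-connect latencies (nanoseconds) into the single
// "n p50 p99 mean" stdout line crr_singleproc_bench parses.
fn report(mut samples_ns: Vec<u64>) {
    samples_ns.sort_unstable();
    let n = samples_ns.len();
    // Nearest-rank percentile: index n*p/100, clamped to the last sample.
    let pct = |p: usize| samples_ns[(n * p / 100).min(n - 1)];
    let mean = samples_ns.iter().sum::<u64>() / n as u64;
    println!("{} {} {} {}", n, pct(50), pct(99), mean);
}

fn main() {
    // Illustrative samples: ~280 µs baseline with one 900 µs outlier.
    report(vec![280_000, 310_000, 295_000, 900_000]);
}
```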
8 changes: 8 additions & 0 deletions src/bin/voidbox-network-bench/main.rs
@@ -192,6 +192,14 @@ FAST SMOKE RUN\n\
.from_env()?
.memory_mb(BENCH_MEMORY_MB)
.network(true)
// Production SLIRP defaults (50 connect/s, 64 concurrent)
// are anti-DoS limits sized for real workloads. The CRR
// bench intentionally opens hundreds of connections per
// second; without this lift it gets RST'd at the 51st
// connect, which manifests as a 2 s `crr echo channel
// receive error` instead of a real number.
.network_max_connections_per_second(u32::MAX)
.network_max_concurrent_connections(usize::MAX)
.build()?;

// Prime the VM (triggers boot + vsock handshake) before any timed work.