diff --git a/Cargo.lock b/Cargo.lock index 455b1e9a..868e1c21 100644 --- a/Cargo.lock +++ b/Cargo.lock @@ -388,6 +388,15 @@ dependencies = [ "crossbeam-utils", ] +[[package]] +name = "crossbeam-queue" +version = "0.3.12" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "0f58bbc28f91df819d0aa2a2c00cd19754769c2fad90579b3592b1c9ba7a3115" +dependencies = [ + "crossbeam-utils", +] + [[package]] name = "crossbeam-utils" version = "0.8.21" @@ -2979,6 +2988,7 @@ dependencies = [ "byteorder", "bytes", "clap", + "crossbeam-queue", "dispatch2", "divan", "event-manager", diff --git a/Cargo.toml b/Cargo.toml index 50607e5f..af267aec 100644 --- a/Cargo.toml +++ b/Cargo.toml @@ -113,6 +113,11 @@ socket2 = { version = "0.5", features = ["all"] } # path of a NAT keyed by guest-side ports the guest itself chooses. rustc-hash = "2" +# Lock-free MPMC queue used to hand virtio-net RX frames from the +# net-poll thread to the vCPU thread without taking the +# `Arc<Mutex<VirtioNetDevice>>` device lock on the hot path. +crossbeam-queue = "0.3" + # --- macOS-only dependencies --- [target.'cfg(target_os = "macos")'.dependencies] # Objective-C 2.0 bindings (auto-generated from Apple frameworks) diff --git a/docs/passt-comparison.md b/docs/passt-comparison.md new file mode 100644 index 00000000..89f21661 --- /dev/null +++ b/docs/passt-comparison.md @@ -0,0 +1,96 @@ +# passt head-to-head comparison harness + +Tools under `tools/perf-harness/` produce a side-by-side comparison of voidbox +(real KVM VM + SLIRP) against passt's [`pasta`](https://passt.top/passt/about/) +running in a network namespace. + +This is the deferred deliverable from +[`docs/superpowers/plans/2026-04-27-smoltcp-passt-port.md`](superpowers/plans/2026-04-27-smoltcp-passt-port.md) +§ "passt head-to-head methodology". 
+ +## What the harness measures + +Both sides run the same workload shape — the same fields the +`voidbox-network-bench` `Report` already emits: + +| Field | Workload | +|---|---| +| `tcp_throughput_g2h_mbps` | `dd if=/dev/zero bs=1M count=N \| nc HOST PORT` from inside the guest / netns; host TCP server times the drain | +| `tcp_rr_latency_us_p50/p99` | Persistent connection, host-side echo loop bouncing one byte per round trip | +| `tcp_crr_latency_us_p50` | Independent `nc` invocations in a tight loop; host-side timing of the full accept→read→write→close cycle | + +The pasta side uses `pasta -- COMMAND` to run the client inside a fresh +network namespace. Pasta's `--map-host-loopback` (default: the host's +gateway IP) translates to the host's loopback, so the client connects +to `GATEWAY_IP:PORT` and reaches the host server bound on `127.0.0.1:PORT`. + +## What it's good for + +**CRR latency is the most apples-to-apples metric** — it's dominated by +NAT-table operations and the round-trip path through the user-mode +networking stack, which is the same code on both sides. Per the spec: + +> Connect rate (CRR latency) is the most apples-to-apples metric — +> dominated by NAT-table operations, not MMIO. If passt does CRR in 135 µs +> and we do 600 µs, that's a meaningful "we have 4× more overhead per +> connect" signal that this refactor should narrow. + +## What it's not + +**Throughput numbers are not directly comparable.** + +- voidbox runs a real KVM VM; every packet incurs `virtio-mmio` + exits, vCPU IPI overhead, and per-packet copy across the device + boundary. +- pasta runs in a network namespace; the data path is just user-mode + socket forwarding, no VM, no MMIO. + +The throughput gap is therefore a *sum of the user-mode overhead the +two stacks share* plus *the VM transit cost only voidbox pays*. +Use the throughput numbers as a sanity bound, not a parity target. 
+ +A proper VM-vs-VM comparison would run passt under +`qemu-system-x86_64` with a guest image carrying `nc` / `iperf3`. +That is documented as a separate follow-up; the harness here is the +quick, low-friction sibling that exercises the apples-to-apples +metric (CRR) without requiring an extra guest image. + +## Usage + +```bash +# Generate voidbox numbers (requires VOID_BOX_KERNEL/VOID_BOX_INITRAMFS). +cargo run --release --bin voidbox-network-bench -- \ + --iterations 3 --output /tmp/voidbox-bench.json + +# Generate pasta numbers (requires pasta on PATH or via $PASTA). +tools/perf-harness/bench-pasta.py --output /tmp/pasta-bench.json + +# Side-by-side markdown. +tools/perf-harness/bench-compare-pasta.py /tmp/voidbox-bench.json /tmp/pasta-bench.json \ + --output /tmp/voidbox-vs-pasta.md + +# qemu+libslirp / qemu+passt CRR (apples-to-apples SLIRP-vs-SLIRP). +gcc -O2 -static -o /tmp/crr-client tools/perf-harness/crr-client.c +tools/perf-harness/bench-qemu-slirp.sh --backend libslirp --iterations 30 +tools/perf-harness/bench-qemu-slirp.sh --backend passt --iterations 30 + +# Voidbox single-process CRR (no per-iteration nc fork). +cargo run --release --example crr_singleproc_bench -- --iterations 30 +``` + +`tools/perf-harness/bench-pasta.py --help` lists tunables (iterations, +transfer size, sample counts). 
+ +## Reading the report + +| Δ column | Meaning | +|---|---| +| `voidbox N× faster` (throughput) | voidbox has the higher Mbps number | +| `voidbox N× slower` (throughput) | pasta has the higher Mbps number — expected, since pasta has no VM | +| `voidbox N× faster` (latency) | voidbox has the lower µs number | +| `voidbox N× slower` (latency) | pasta has the lower µs number — large multiples here mean voidbox spends much of its CRR time outside the NAT path (poll-thread cadence, vCPU exits, virtio handling) | + +A useful CRR signal: if `voidbox N× slower on CRR p50` is much larger +than `voidbox N× slower on RR p50`, the per-connection overhead is the +bottleneck, not the data path. RR p50 captures the data path; CRR +captures the connect path. diff --git a/examples/crr_singleproc_bench.rs b/examples/crr_singleproc_bench.rs new file mode 100644 index 00000000..0b109f8d --- /dev/null +++ b/examples/crr_singleproc_bench.rs @@ -0,0 +1,157 @@ +//! crr_singleproc_bench — voidbox-side N-iteration TCP CRR loop in a +//! single guest process, isolating voidbox's NAT-path cost from the +//! existing bench's per-iteration `nc` fork+exec overhead. +//! +//! NOT meant for the production bench surface; this is a one-off +//! diagnostic that pairs with `tools/perf-harness/crr-client.c` + the +//! pasta side of the head-to-head. Compile and run directly: +//! +//! gcc -O2 -static -o /tmp/crr-client tools/perf-harness/crr-client.c +//! cargo run --release --example crr_singleproc_bench -- \ +//! --iterations 100 --bench-binary /tmp/crr-client +//! +//! Requires the same env vars as voidbox-network-bench: +//! VOID_BOX_KERNEL, VOID_BOX_INITRAMFS + +use std::net::TcpListener; +use std::thread; +use std::time::Duration; + +use clap::Parser; +use void_box::backend::MountConfig; +use void_box::sandbox::Sandbox; + +#[derive(Parser)] +#[command(version, about)] +struct Cli { + /// Number of CRR iterations. 
+ #[arg(long, default_value_t = 100)] + iterations: u32, + /// Host path to the static crr-client binary. + #[arg(long, default_value = "/tmp/crr-client")] + bench_binary: String, + /// Memory size for the guest VM (MB). + #[arg(long, default_value_t = 1024)] + memory_mb: usize, +} + +const HOST_LOOPBACK_FROM_GUEST: &str = "10.0.2.2"; + +#[tokio::main(flavor = "multi_thread")] +async fn main() -> Result<(), Box> { + let cli = Cli::parse(); + let bench_binary = std::path::PathBuf::from(&cli.bench_binary); + if !bench_binary.exists() { + return Err(format!( + "bench binary not found: {} (compile with `gcc -static -o /tmp/crr-client tools/perf-harness/crr-client.c`)", + cli.bench_binary + ) + .into()); + } + let bench_binary_dir = bench_binary + .parent() + .ok_or("bench-binary has no parent dir")? + .to_string_lossy() + .into_owned(); + let bench_binary_name = bench_binary + .file_name() + .ok_or("bench-binary has no file name")? + .to_string_lossy() + .into_owned(); + + let listener = TcpListener::bind("127.0.0.1:0")?; + let host_port = listener.local_addr()?.port(); + listener.set_nonblocking(true)?; + + let iterations = cli.iterations; + let server_thread = thread::spawn(move || { + // Non-blocking accept with a tight poll, deadline-checked. With + // a blocking accept the deadline never fires if the guest never + // connects (boot failure, SLIRP rate limit, etc.) and the + // example's later `server_thread.join()` would hang forever. + // The accept-pickup latency directly inflates each guest CRR + // sample, so the wait is kept short — `from_micros(50)` adds + // at most ~50 µs of jitter on top of a ~280 µs baseline, while + // still letting the deadline check fire every ~50 µs. 
+ let mut accepted = 0u32; + let deadline = std::time::Instant::now() + Duration::from_secs(120); + while accepted < iterations && std::time::Instant::now() < deadline { + match listener.accept() { + Ok((mut conn, _)) => { + conn.set_nonblocking(false).ok(); + let mut buf = [0u8; 1]; + let _ = std::io::Read::read(&mut conn, &mut buf); + let _ = std::io::Write::write_all(&mut conn, b"x"); + accepted += 1; + } + Err(ref e) if e.kind() == std::io::ErrorKind::WouldBlock => { + thread::sleep(Duration::from_micros(50)); + } + Err(_) => break, + } + } + accepted + }); + + let sandbox = Sandbox::local() + .from_env()? + .memory_mb(cli.memory_mb) + .network(true) + // Production SLIRP defaults (50/s rate, 64 concurrent) are + // sized to throttle a guest-side flood — far below what a + // CRR microbench wants. Lift both ceilings so the bench + // exercises the steady-state NAT path, not the rate limiter. + .network_max_connections_per_second(u32::MAX) + .network_max_concurrent_connections(usize::MAX) + .mount(MountConfig { + host_path: bench_binary_dir.clone(), + guest_path: "/tmp/host".into(), + read_only: true, + }) + .build()?; + + eprintln!( + "VM booted; running {} CRRs in a single guest process...", + iterations + ); + let probe = sandbox.exec("sh", &["-c", ":"]).await?; + if !probe.success() { + return Err("VM probe exec failed".into()); + } + + let cmd = format!( + "/tmp/host/{name} {host} {port} {n}", + name = bench_binary_name, + host = HOST_LOOPBACK_FROM_GUEST, + port = host_port, + n = iterations, + ); + let output = sandbox.exec("sh", &["-c", &cmd]).await?; + let stdout = output.stdout_str().to_string(); + let stderr = output.stderr_str().to_string(); + if !output.success() { + eprintln!("guest stderr: {stderr}"); + return Err(format!("guest exec failed: {:?}", output.exit_code).into()); + } + + let server_thread_count = server_thread.join().unwrap_or(0); + eprintln!("host accepts: {server_thread_count}/{iterations}"); + + let line = 
stdout.lines().next().ok_or("empty guest stdout")?; + let parts: Vec<&str> = line.split_whitespace().collect(); + if parts.len() != 4 { + return Err(format!("unexpected guest stdout: {line:?}").into()); + } + let n: u32 = parts[0].parse()?; + let p50_ns: u64 = parts[1].parse()?; + let p99_ns: u64 = parts[2].parse()?; + let mean_ns: u64 = parts[3].parse()?; + + println!(); + println!("voidbox single-process CRR over {n} iterations:"); + println!(" p50: {} µs", p50_ns / 1000); + println!(" p99: {} µs", p99_ns / 1000); + println!(" mean: {} µs", mean_ns / 1000); + + Ok(()) +} diff --git a/src/bin/voidbox-network-bench/main.rs b/src/bin/voidbox-network-bench/main.rs index a18ac09e..9a2cc434 100644 --- a/src/bin/voidbox-network-bench/main.rs +++ b/src/bin/voidbox-network-bench/main.rs @@ -192,6 +192,14 @@ FAST SMOKE RUN\n\ .from_env()? .memory_mb(BENCH_MEMORY_MB) .network(true) + // Production SLIRP defaults (50 connect/s, 64 concurrent) + // are anti-DoS limits sized for real workloads. The CRR + // bench intentionally opens hundreds of connections per + // second; without this lift it gets RST'd at the 51st + // connect, which manifests as a 2 s `crr echo channel + // receive error` instead of a real number. + .network_max_connections_per_second(u32::MAX) + .network_max_concurrent_connections(usize::MAX) .build()?; // Prime the VM (triggers boot + vsock handshake) before any timed work. diff --git a/src/devices/virtio_net.rs b/src/devices/virtio_net.rs index 71214d47..2c94e1c7 100644 --- a/src/devices/virtio_net.rs +++ b/src/devices/virtio_net.rs @@ -8,8 +8,10 @@ //! - Integration with SLIRP stack for NAT //! 
- No root/TAP required +use std::sync::atomic::{AtomicU32, Ordering}; use std::sync::{Arc, Mutex}; +use crossbeam_queue::SegQueue; use tracing::{debug, trace, warn}; use vm_memory::{Address, Bytes, GuestAddress, GuestMemory}; @@ -157,8 +159,16 @@ pub struct VirtioNetDevice { queue_sel: u32, /// Device status status: u32, - /// Interrupt status - interrupt_status: u32, + /// Interrupt status, accessed concurrently from the vCPU thread + /// (MMIO read of `INTERRUPT_STATUS`, MMIO write of `INTERRUPT_ACK`) + /// and the net-poll thread (sets bit 0 when new RX frames are + /// queued, polls on idle cycles). + /// + /// Wrapped in [`Arc`] so the net-poll thread can hold + /// its own clone and read/update the value without taking the + /// device mutex. The vCPU thread accesses it via the device + /// guard during MMIO dispatch; both sides see the same atomic. + interrupt_status: Arc, /// Configuration generation counter config_generation: u32, /// Receive queue state @@ -181,6 +191,21 @@ pub struct VirtioNetDevice { rx_avail_idx: u16, /// RX queue: next used index we'll write rx_used_idx: u16, + /// Lock-free queue of frames waiting to be written into the guest's + /// RX descriptors. The net-poll thread pushes frames here without + /// taking the device lock; the vCPU thread drains them on its next + /// MMIO exit (via [`Self::flush_pending_rx`]) and writes the + /// descriptors in its own context. + /// + /// Eliminates the `Arc>` contention that + /// previously serialised every net-poll-side `try_inject_rx` call + /// against vCPU MMIO exits. + pending_rx: Arc>>, + /// Scratch buffer reused across `flush_pending_rx` calls so the + /// per-MMIO-exit `Vec>` doesn't grow from cap=0 every + /// time. Heaptrack measured the previous local-Vec allocation as + /// 173 calls / 108 MB peak on the CRR microbench. 
+ flush_scratch: Vec>, } impl VirtioNetDevice { @@ -200,7 +225,7 @@ impl VirtioNetDevice { features_sel: 0, queue_sel: 0, status: 0, - interrupt_status: 0, + interrupt_status: Arc::new(AtomicU32::new(0)), config_generation: 0, rx_queue: QueueState { num_max: 256, @@ -218,9 +243,38 @@ impl VirtioNetDevice { tx_used_idx: 0, rx_avail_idx: 0, rx_used_idx: 0, + pending_rx: Arc::new(SegQueue::new()), + flush_scratch: Vec::new(), }) } + /// Returns a clone of the lock-free RX frame queue Arc. + /// + /// The net-poll thread holds this clone and pushes frames to it + /// without ever taking the [`VirtioNetDevice`] mutex. The vCPU + /// thread (which already holds the device mutex during MMIO + /// dispatch) drains it via [`Self::flush_pending_rx`]. + pub fn pending_rx(&self) -> Arc>> { + Arc::clone(&self.pending_rx) + } + + /// Returns a clone of the [`NetworkBackend`] arc. + /// + /// Lets the net-poll thread call `drain_to_guest` directly without + /// going through the device mutex. Combined with [`Self::pending_rx`], + /// this removes the `Arc>` contention point + /// from the per-packet RX hot path. + pub fn slirp_arc(&self) -> Arc> { + Arc::clone(&self.slirp) + } + + /// Returns a clone of the [`Arc`] backing + /// `interrupt_status`. The net-poll thread holds this clone and + /// reads/updates the ISR without ever taking the device mutex. 
+ pub fn interrupt_status_arc(&self) -> Arc { + Arc::clone(&self.interrupt_status) + } + /// Set the MMIO base address pub fn set_mmio_base(&mut self, base: u64) { self.mmio_base = base; @@ -264,7 +318,7 @@ impl VirtioNetDevice { let queue = self.current_queue(); queue.ready as u32 } - mmio::INTERRUPT_STATUS => self.interrupt_status, + mmio::INTERRUPT_STATUS => self.interrupt_status.load(Ordering::Relaxed), mmio::STATUS => self.status, mmio::CONFIG_GENERATION => self.config_generation, // Device config (MAC address at offset 0x100) @@ -339,7 +393,7 @@ impl VirtioNetDevice { self.handle_queue_notify(value, guest_memory); } mmio::INTERRUPT_ACK => { - self.interrupt_status &= !value; + self.interrupt_status.fetch_and(!value, Ordering::Relaxed); } mmio::STATUS => { self.status = value; @@ -434,6 +488,17 @@ impl VirtioNetDevice { } } + /// Process the TX queue from outside the vCPU thread. + /// + /// Called by `net_poll_thread` when the KVM_IOEVENTFD registered for + /// the virtio-net QUEUE_NOTIFY MMIO fires. Same body as the + /// synchronous TX-queue handler used from the MMIO write path, + /// just exposed under a different name so callers outside this + /// module can drive it. + pub fn process_tx_queue_external(&mut self, mem: &M) -> Result<()> { + self.process_tx_queue(mem) + } + /// Process TX queue: read descriptor chains from guest, send frames to SLIRP, update used ring. fn process_tx_queue(&mut self, mem: &M) -> Result<()> { let q = &self.tx_queue; @@ -451,6 +516,16 @@ impl VirtioNetDevice { .map_err(|e| crate::Error::Memory(e.to_string()))?; let avail_idx = u16::from_le_bytes(idx_buf); + let initial_tx_used_idx = self.tx_used_idx; + + // Reusable per-call packet buffer. Capacity carried across + // iterations within this call so chained-descriptor frames don't + // re-grow the buffer; cleared between frames so each + // process_tx_frame sees only this frame's bytes. 
Pre-size to + // a typical MTU + virtio-net header so the common single-segment + // path needs no realloc. + let mut packet: Vec = Vec::with_capacity(1600); + while self.tx_avail_idx != avail_idx { // Ring entry: 2 bytes, at avail_addr + 4 + (tx_avail_idx % queue_size)*2 let ring_offset = 4 + ((self.tx_avail_idx as usize) % queue_size) * 2; @@ -462,8 +537,7 @@ impl VirtioNetDevice { .map_err(|e| crate::Error::Memory(e.to_string()))?; let head_idx = u16::from_le_bytes(desc_id_buf) as usize; - // Walk descriptor chain and collect packet - let mut packet = Vec::new(); + packet.clear(); let mut next = head_idx; loop { if next >= queue_size { @@ -478,10 +552,14 @@ impl VirtioNetDevice { let flags = u16::from_le_bytes(desc[12..14].try_into().unwrap()); let next_desc = u16::from_le_bytes(desc[14..16].try_into().unwrap()) as usize; if len > 0 && addr != 0 { - let mut buf = vec![0u8; len]; - mem.read(&mut buf, GuestAddress(addr)) + // Read directly into the packet's tail instead of + // allocating an intermediate `Vec` and then + // `extend_from_slice`-ing it in. Saves one alloc + // and one full memcpy per descriptor segment. + let off = packet.len(); + packet.resize(off + len, 0); + mem.read(&mut packet[off..off + len], GuestAddress(addr)) .map_err(|e| crate::Error::Memory(e.to_string()))?; - packet.extend_from_slice(&buf); } if (flags & VIRTQ_DESC_F_NEXT) == 0 { break; @@ -493,36 +571,96 @@ impl VirtioNetDevice { self.process_tx_frame(&packet)?; } - // Write used ring: used->ring[tx_used_idx % queue_size] = { id: head_idx, len: 0 } + // Used-ring entry: 8 bytes (head_idx as u32, 0 as u32). + // Built on the stack to avoid heap-alloc-per-frame from + // `[...].concat()`. TX descriptors carry no return data + // so the length field is always 0. 
let used_ring_off = 4 + ((self.tx_used_idx as usize) % queue_size) * 8; - let used_elem = [ - (head_idx as u32).to_le_bytes(), - 0u32.to_le_bytes(), // len for TX typically 0 - ] - .concat(); + let mut used_elem = [0u8; 8]; + used_elem[0..4].copy_from_slice(&(head_idx as u32).to_le_bytes()); + // bytes [4..8] stay zero (the length field). mem.write(&used_elem, used_addr.unchecked_add(used_ring_off as u64)) .map_err(|e| crate::Error::Memory(e.to_string()))?; self.tx_used_idx = self.tx_used_idx.wrapping_add(1); self.tx_avail_idx = self.tx_avail_idx.wrapping_add(1); + } - // Update used.idx so guest sees progress + // Publish used.idx ONCE per batch instead of after every frame. + // virtio spec: the device updates the used-ring entries first, + // then bumps used.idx; the guest reads used.idx with a memory + // barrier and iterates new entries. Per-frame writes are + // redundant for correctness and waste one mem.write per frame. + if self.tx_used_idx != initial_tx_used_idx { let used_idx_bytes = self.tx_used_idx.to_le_bytes(); mem.write(&used_idx_bytes, used_addr.unchecked_add(2u64)) .map_err(|e| crate::Error::Memory(e.to_string()))?; } - self.interrupt_status |= 1; + self.interrupt_status.fetch_or(1, Ordering::Relaxed); Ok(()) } + /// Drain frames pushed into [`Self::pending_rx`] by the net-poll + /// thread and write them into the guest's RX descriptors. + /// + /// Same descriptor-walking shape as [`Self::try_inject_rx`], but + /// the input frames come from the lock-free SegQueue instead of + /// going through the (locked) network backend. The vCPU thread + /// calls this on every MMIO entry to virtio-net, materialising any + /// frames the net-poll thread queued since the last MMIO exit. + /// + /// Returns the number of frames written to the RX ring this call. + pub fn flush_pending_rx(&mut self, mem: &M) -> Result { + // Move the scratch out so we can mutate self while populating + // it. 
The post-write `clear()` keeps capacity, so subsequent + // calls reuse the buffer instead of growing from cap=0. + let mut frames = std::mem::take(&mut self.flush_scratch); + frames.clear(); + while let Some(frame) = self.pending_rx.pop() { + frames.push(frame); + } + let result = if !frames.is_empty() { + self.write_frames_to_rx_ring(&mut frames, mem) + } else { + Ok(0) + }; + frames.clear(); + self.flush_scratch = frames; + result + } + /// Try to inject received frames from SLIRP into guest RX queue. Call from vCPU loop or after RX notify. - pub fn try_inject_rx(&mut self, mem: &M) -> Result<()> { - let frames = self.get_rx_frames(); + /// + /// Returns the number of frames the guest now has visible in its RX + /// ring after this call. Callers can use this to decide whether to + /// raise an IRQ — pulsing the line is only useful when the guest + /// has new work to do, not on every poll cycle while interrupt_status + /// is still set from an earlier (un-acked) injection. + pub fn try_inject_rx(&mut self, mem: &M) -> Result { + let mut frames = self.get_rx_frames(); if frames.is_empty() { - return Ok(()); + return Ok(0); } + let result = self.write_frames_to_rx_ring(&mut frames, mem); + // Stash drained Vec back as scratch so the next call reuses + // its capacity instead of allocating from cap=0. + frames.clear(); + self.rx_scratch = frames; + result + } + /// Write a batch of fully-formed frames (already including the + /// virtio-net header) into the guest's RX descriptor ring. + /// + /// Shared between [`Self::try_inject_rx`] (frames pulled from the + /// network backend) and [`Self::flush_pending_rx`] (frames pushed + /// by the net-poll thread into the lock-free SegQueue). 
+ fn write_frames_to_rx_ring( + &mut self, + frames: &mut Vec>, + mem: &M, + ) -> Result { let q = &self.rx_queue; if !q.ready || q.num == 0 { // Queue not ready - buffer frames for later @@ -532,31 +670,32 @@ impl VirtioNetDevice { q.num, frames.len() ); - self.rx_buffer.extend(frames); - return Ok(()); + self.rx_buffer.append(frames); + return Ok(0); } let desc_addr = GuestAddress(q.desc_addr); let avail_addr = GuestAddress(q.driver_addr); let used_addr = GuestAddress(q.device_addr); let queue_size = q.num as usize; - for frame in frames { - // Read available ring: how many buffers has driver given us? - let mut idx_buf = [0u8; 2]; - mem.read(&mut idx_buf, avail_addr.unchecked_add(2u64)) - .map_err(|e| crate::Error::Memory(e.to_string()))?; - let avail_idx = u16::from_le_bytes(idx_buf); + // avail_idx is monotonically increasing; the driver bumps it + // whenever it adds new buffers. Read it once per try_inject_rx + // call rather than per frame — saves one mem.read per frame in + // the hot path. If the device runs out of available buffers + // mid-batch the remaining frames are buffered for the next + // call, which is the same correctness contract as before. + let mut idx_buf = [0u8; 2]; + mem.read(&mut idx_buf, avail_addr.unchecked_add(2u64)) + .map_err(|e| crate::Error::Memory(e.to_string()))?; + let avail_idx = u16::from_le_bytes(idx_buf); + + let mut frames_injected: u16 = 0; + + for frame in frames.drain(..) 
{ if self.rx_avail_idx == avail_idx { - debug!("virtio-net: RX no available buffers (avail_idx={}, our_idx={}), buffering frame ({} bytes)", - avail_idx, self.rx_avail_idx, frame.len()); self.rx_buffer.push(frame); continue; } - debug!( - "virtio-net: RX injecting frame ({} bytes), avail_idx={}", - frame.len(), - avail_idx - ); let ring_offset = 4 + ((self.rx_avail_idx as usize) % queue_size) * 2; let mut desc_id_buf = [0u8; 2]; @@ -599,32 +738,45 @@ impl VirtioNetDevice { next = next_desc; } + // Used-ring entry is exactly 8 bytes (2x u32, little-endian). + // Build it on the stack instead of allocating a Vec via + // `[...].concat()` — the previous code did a heap alloc per + // frame in the hot path. let used_ring_off = 4 + ((self.rx_used_idx as usize) % queue_size) * 8; - let used_elem = [ - (head_idx as u32).to_le_bytes(), - (written as u32).to_le_bytes(), - ] - .concat(); + let mut used_elem = [0u8; 8]; + used_elem[0..4].copy_from_slice(&(head_idx as u32).to_le_bytes()); + used_elem[4..8].copy_from_slice(&(written as u32).to_le_bytes()); mem.write(&used_elem, used_addr.unchecked_add(used_ring_off as u64)) .map_err(|e| crate::Error::Memory(e.to_string()))?; self.rx_used_idx = self.rx_used_idx.wrapping_add(1); self.rx_avail_idx = self.rx_avail_idx.wrapping_add(1); + frames_injected = frames_injected.wrapping_add(1); + } + // Publish the new used.idx ONCE at the end of the batch. The + // virtio spec only requires the device to update used.idx after + // it has written all corresponding used-ring entries; the guest + // reads used.idx with a memory barrier and then iterates new + // entries. Per-frame writes are redundant — saves one + // mem.write per frame on the hot path. 
+ if frames_injected > 0 { let used_idx_bytes = self.rx_used_idx.to_le_bytes(); mem.write(&used_idx_bytes, used_addr.unchecked_add(2u64)) .map_err(|e| crate::Error::Memory(e.to_string()))?; } - self.interrupt_status |= 1; - Ok(()) + if frames_injected > 0 { + self.interrupt_status.fetch_or(1, Ordering::Relaxed); + } + Ok(frames_injected as usize) } /// Reset device to initial state fn reset(&mut self) { debug!("virtio-net: device reset"); self.status = 0; - self.interrupt_status = 0; + self.interrupt_status.store(0, Ordering::Relaxed); self.driver_features = 0; self.tx_avail_idx = 0; self.tx_used_idx = 0; @@ -691,7 +843,7 @@ impl VirtioNetDevice { self.rx_buffer.push(packet); // Set interrupt - self.interrupt_status |= 1; + self.interrupt_status.fetch_or(1, Ordering::Relaxed); } /// Capture device state for snapshot. @@ -726,7 +878,7 @@ impl VirtioNetDevice { features_sel: self.features_sel, queue_sel: self.queue_sel, status: self.status, - interrupt_status: self.interrupt_status, + interrupt_status: self.interrupt_status.load(Ordering::Relaxed), config_generation: self.config_generation, mac: self.mac, queues, @@ -740,7 +892,8 @@ impl VirtioNetDevice { self.features_sel = state.features_sel; self.queue_sel = state.queue_sel; self.status = state.status; - self.interrupt_status = state.interrupt_status; + self.interrupt_status + .store(state.interrupt_status, Ordering::Relaxed); self.config_generation = state.config_generation; self.mac = state.mac; @@ -776,9 +929,13 @@ impl VirtioNetDevice { ); } - /// Check if there are pending interrupts + /// Check if there are pending interrupts. + /// + /// Atomic load — safe to call from any thread without holding the + /// device mutex. The net-poll thread uses this to decide whether + /// to pulse the IRQ line. 
pub fn has_pending_interrupt(&self) -> bool { - self.interrupt_status != 0 + self.interrupt_status.load(Ordering::Relaxed) != 0 } /// Get the MAC address diff --git a/src/network/slirp.rs b/src/network/slirp.rs index 1e452880..7c7930c3 100644 --- a/src/network/slirp.rs +++ b/src/network/slirp.rs @@ -59,6 +59,12 @@ struct PendingDnsQuery { /// while keeping the implementation simple. const DNS_CACHE_TTL_SECS: u64 = 60; +/// Initial capacity for the ready-event scratch buffers. Sized to +/// `EpollDispatch`'s typical per-wait batch so the buffers fit a +/// busy-loop wakeup without reallocating; oversized batches grow +/// once and stabilize. +const EVENTS_PRESIZE: usize = 128; + use ipnet::Ipv4Net; use smoltcp::iface::{Config, Interface, SocketSet}; @@ -689,6 +695,26 @@ pub struct SlirpBackend { /// keep the fallback so synthetic harnesses still observe /// readiness. has_external_poller: AtomicBool, + /// Per-call scratch buffer for the events `drain_to_guest` + /// processes. Owned by `SlirpBackend` so its capacity persists + /// across calls — `mem::take`-into-local would discard the + /// allocation and force the next round to grow from cap=0, + /// which heaptrack measured as ~half of all per-CRR + /// allocations. + ready_scratch: Vec, + /// Per-call scratch for `relay_tcp_nat_data`'s deferred frame + /// pushes. The relay can't push directly to `inject_to_guest` + /// while iterating `flow_table` (borrow conflict); reusing + /// this buffer keeps the per-cycle Vec from growing from cap=0. + relay_frames_scratch: Vec>, + /// Shared scratch for the per-cycle `Vec` snapshots + /// that `relay_tcp_nat_data`, `relay_icmp_echo`, and + /// `relay_udp_flows` build to side-step `&mut self` / + /// `flow_table` borrow conflicts. All three relays run + /// sequentially inside `drain_to_guest`, so one buffer + /// suffices — each callsite takes it, fills it, drains it, + /// and stashes it back via `clear()` (capacity preserved). 
+ flow_keys_scratch: Vec, } impl SlirpBackend { @@ -793,9 +819,12 @@ impl SlirpBackend { accept_sender: accept_tx, epoll, epoll_waker, - pending_events: Mutex::new(Vec::new()), + pending_events: Mutex::new(Vec::with_capacity(EVENTS_PRESIZE)), pending_close: Vec::new(), has_external_poller: AtomicBool::new(false), + ready_scratch: Vec::with_capacity(EVENTS_PRESIZE), + relay_frames_scratch: Vec::new(), + flow_keys_scratch: Vec::new(), }) } @@ -1033,26 +1062,33 @@ impl SlirpBackend { // // Then, only if no net-poll thread has populated the queue // (unit tests / benches), fall back to a non-blocking poll on - // the epoll FD ourselves. `try_lock` keeps that fallback safe - // under contention. - let ready: Vec = { - let mut events: Vec = { - let mut queue = self.pending_events.lock().unwrap(); - std::mem::take(&mut *queue) - }; - // Fallback non-blocking poll only when no external poller - // (net_poll_thread) is feeding us events — otherwise we'd - // pay one mutex op + one epoll_wait syscall per call - // (~310 ns) for nothing. The flag is one-way: set by the - // first push_ready_events and stays set for the backend's - // lifetime. - if events.is_empty() && !self.has_external_poller.load(Ordering::Relaxed) { - let _ = self - .epoll - .wait_with_timeout(&mut events, std::time::Duration::ZERO); - } - events - }; + // the epoll FD ourselves. + // + // The local `ready` Vec is taken from `self.ready_scratch`, + // populated by copying out of the locked queue (which is + // `clear()`-ed in place to keep its capacity), processed, + // then cleared and stashed back. The previous `mem::take` + // pattern dropped the queue's allocation every cycle — + // heaptrack measured that as ~half of all per-CRR + // allocations on this hot path. 
+ let mut ready: Vec = std::mem::take(&mut self.ready_scratch); + ready.clear(); + { + let mut queue = self.pending_events.lock().unwrap(); + ready.extend_from_slice(&queue); + queue.clear(); + } + // Fallback non-blocking poll only when no external poller + // (net_poll_thread) is feeding us events — otherwise we'd + // pay one mutex op + one epoll_wait syscall per call + // (~310 ns) for nothing. The flag is one-way: set by the + // first push_ready_events and stays set for the backend's + // lifetime. + if ready.is_empty() && !self.has_external_poller.load(Ordering::Relaxed) { + let _ = self + .epoll + .wait_with_timeout(&mut ready, std::time::Duration::ZERO); + } // 0a. Accept any newly-ready listener connections (may push into // accept_sender for the next step). @@ -1091,6 +1127,12 @@ impl SlirpBackend { out.append(&mut q.tx_queue); } out.append(&mut self.inject_to_guest); + + // Stash the local `ready` Vec back as scratch. `clear()` + // preserves capacity, so the next `drain_to_guest` reuses + // the buffer instead of allocating from cap=0. + ready.clear(); + self.ready_scratch = ready; } /// Poll the stack and return ethernet frames to send to the guest. @@ -2321,8 +2363,13 @@ impl SlirpBackend { /// only the flow table entries directly, avoiding a separate Vec allocation. /// Data relay is restricted to flows with an EPOLLIN event in `ready`. fn relay_tcp_nat_data(&mut self, ready: &[EpollEvent]) { - // Collect frames to inject (built separately to avoid borrow issues) - let mut frames_to_inject: Vec> = Vec::new(); + // Collect frames to inject in the SlirpBackend-owned scratch + // so the buffer's capacity carries across calls. Pushes + // can't go straight to `inject_to_guest` because we're + // about to iterate `flow_table` and `inject_to_guest` is + // also `&mut self`. 
+ let mut frames_to_inject = std::mem::take(&mut self.relay_frames_scratch); + frames_to_inject.clear(); // Seed removal set from flows already marked Closed by handle_tcp_frame // (FIN/RST path) via the pending_close queue. HashSet gives O(1) @@ -2382,7 +2429,8 @@ impl SlirpBackend { } } - let mut tcp_flow_keys: Vec = Vec::new(); + let mut tcp_flow_keys = std::mem::take(&mut self.flow_keys_scratch); + tcp_flow_keys.clear(); for event in ready { if !event.readable || event.token & PROTO_TAG_MASK != PROTO_TAG_TCP { continue; @@ -2396,7 +2444,7 @@ impl SlirpBackend { tcp_flow_keys.push(flow_key); } - for flow_key in tcp_flow_keys { + for flow_key in tcp_flow_keys.drain(..) { let FlowKey::Tcp(key) = flow_key else { continue; }; @@ -2607,6 +2655,12 @@ impl SlirpBackend { self.flow_table.remove(&flow_key); } self.inject_to_guest.append(&mut frames_to_inject); + // Both `append` calls drained `frames_to_inject` but + // preserved its capacity; restore the buffer to the + // backend so the next cycle reuses it. The flow-key + // buffer was already drained by the iteration above. + self.relay_frames_scratch = frames_to_inject; + self.flow_keys_scratch = tcp_flow_keys; } /// Drain replies from each active ICMP echo socket and emit echo-reply @@ -2619,7 +2673,8 @@ impl SlirpBackend { const ICMP_IDLE_TIMEOUT: Duration = Duration::from_secs(60); let now = Instant::now(); - let mut ready_flow_keys: Vec = Vec::new(); + let mut ready_flow_keys = std::mem::take(&mut self.flow_keys_scratch); + ready_flow_keys.clear(); for event in ready { if !event.readable || event.token & PROTO_TAG_MASK != PROTO_TAG_ICMP { continue; @@ -2681,6 +2736,8 @@ impl SlirpBackend { } self.flow_table.remove(&flow_key); } + ready_flow_keys.clear(); + self.flow_keys_scratch = ready_flow_keys; } /// Build an Ethernet/IPv4/ICMP echo-reply frame addressed to the guest. 
@@ -2751,18 +2808,20 @@ impl SlirpBackend { fn relay_udp_flows(&mut self, ready: &[EpollEvent]) { let now = Instant::now(); // Per-flow connected sockets are closed by Drop when the entry leaves - // flow_table. - let mut stale: Vec = Vec::new(); + // flow_table. The two flow-key Vecs here share `flow_keys_scratch`: + // the stale-sweep drains it, then the readiness loop refills it. + let mut flow_keys = std::mem::take(&mut self.flow_keys_scratch); + flow_keys.clear(); for (flow_key, entry) in &self.flow_table { let FlowKey::Udp(_) = flow_key else { continue }; let FlowEntry::Udp(udp_entry) = entry else { continue; }; if now.duration_since(udp_entry.last_activity) > UDP_IDLE_TIMEOUT { - stale.push(*flow_key); + flow_keys.push(*flow_key); } } - for flow_key in stale { + for flow_key in flow_keys.drain(..) { if let Some(FlowEntry::Udp(entry)) = self.flow_table.get(&flow_key) { self.token_to_key.remove(&entry.flow_token); self.epoll.unregister(entry.sock.as_raw_fd()).ok(); @@ -2770,7 +2829,6 @@ impl SlirpBackend { self.flow_table.remove(&flow_key); } - let mut flow_keys: Vec = Vec::new(); for event in ready { if !event.readable || event.token & PROTO_TAG_MASK != PROTO_TAG_UDP { continue; @@ -2780,7 +2838,7 @@ impl SlirpBackend { }; flow_keys.push(flow_key); } - for flow_key in flow_keys { + for flow_key in flow_keys.drain(..) 
{ let FlowKey::Udp(key) = flow_key else { continue; }; @@ -2807,6 +2865,7 @@ impl SlirpBackend { self.inject_to_guest.push(frame_bytes); } } + self.flow_keys_scratch = flow_keys; } /// Build an Ethernet/IPv4/UDP frame addressed to the guest, carrying a diff --git a/src/sandbox/local.rs b/src/sandbox/local.rs index 69f9f240..a7b82bfe 100644 --- a/src/sandbox/local.rs +++ b/src/sandbox/local.rs @@ -91,8 +91,14 @@ impl LocalSandbox { session_secret: SessionSecret::new(session_secret_bytes), command_allowlist: Vec::new(), // Set via provisioning network_deny_list: default_network_deny_list(), - max_connections_per_second: DEFAULT_MAX_CONNECTIONS_PER_SECOND, - max_concurrent_connections: DEFAULT_MAX_CONCURRENT_CONNECTIONS, + max_connections_per_second: self + .config + .network_max_connections_per_second + .unwrap_or(DEFAULT_MAX_CONNECTIONS_PER_SECOND), + max_concurrent_connections: self + .config + .network_max_concurrent_connections + .unwrap_or(DEFAULT_MAX_CONCURRENT_CONNECTIONS), seccomp: true, }, snapshot: self.config.snapshot.clone(), diff --git a/src/sandbox/mod.rs b/src/sandbox/mod.rs index b2c820c0..9066e478 100644 --- a/src/sandbox/mod.rs +++ b/src/sandbox/mod.rs @@ -86,6 +86,15 @@ pub struct SandboxConfig { /// validate save/restore support at cold boot instead of deferring a /// cryptic failure to save time. pub enable_snapshots: bool, + /// Optional override for the network backend's + /// `max_connections_per_second` rate limit. `None` keeps the + /// production default (50/s); benches that intentionally exceed + /// the anti-DoS limit raise it explicitly. + pub network_max_connections_per_second: Option, + /// Optional override for the network backend's + /// `max_concurrent_connections` ceiling. `None` keeps the + /// production default (64). 
+ pub network_max_concurrent_connections: Option, } impl Default for SandboxConfig { @@ -108,6 +117,8 @@ impl Default for SandboxConfig { env: Vec::new(), snapshot: None, enable_snapshots: false, + network_max_connections_per_second: None, + network_max_concurrent_connections: None, } } } @@ -815,6 +826,41 @@ impl SandboxBuilder { self } + /// Overrides the SLIRP backend's per-second new-connection rate + /// limit. The production default (50/s) protects the host from + /// guest-side connection floods; benches that intentionally + /// exceed it call this to disable the limit. + /// + /// # Examples + /// + /// ```no_run + /// use void_box::sandbox::Sandbox; + /// let _ = Sandbox::local() + /// .network(true) + /// .network_max_connections_per_second(u32::MAX); + /// ``` + pub fn network_max_connections_per_second(mut self, rate: u32) -> Self { + self.config.network_max_connections_per_second = Some(rate); + self + } + + /// Overrides the SLIRP backend's concurrent-connection ceiling. + /// Production default is 64; raise for sustained-throughput + /// benches. 
+ /// + /// # Examples + /// + /// ```no_run + /// use void_box::sandbox::Sandbox; + /// let _ = Sandbox::local() + /// .network(true) + /// .network_max_concurrent_connections(1024); + /// ``` + pub fn network_max_concurrent_connections(mut self, count: usize) -> Self { + self.config.network_max_concurrent_connections = Some(count); + self + } + /// Set the kernel path pub fn kernel(mut self, path: impl Into) -> Self { self.config.kernel = Some(path.into()); diff --git a/src/vmm/cpu.rs b/src/vmm/cpu.rs index 61008a0a..41f86920 100644 --- a/src/vmm/cpu.rs +++ b/src/vmm/cpu.rs @@ -249,8 +249,14 @@ fn vcpu_run_loop( } VcpuExit::MmioRead(addr, data) => { let handled = if let Some(ref dev) = mmio_devices.virtio_net { - let guard = dev.lock().unwrap(); + let mut guard = dev.lock().unwrap(); if guard.handles_mmio(addr) { + // Materialise any frames the net-poll thread + // pushed into pending_rx since our last MMIO + // entry — writes them into the guest's RX + // descriptors in our context, no cross-thread + // lock contention. + let _ = guard.flush_pending_rx(guest_memory); let offset = addr - guard.mmio_base(); guard.mmio_read(offset, data); true @@ -305,6 +311,11 @@ fn vcpu_run_loop( let handled = if let Some(ref dev) = mmio_devices.virtio_net { let mut guard = dev.lock().unwrap(); if guard.handles_mmio(addr) { + // Same pre-flush as the MMIO-read path: the + // guest may write INTERRUPT_ACK or another + // register before reading INTERRUPT_STATUS, + // so we materialise pending frames here too. + let _ = guard.flush_pending_rx(guest_memory); let offset = addr - guard.mmio_base(); guard.mmio_write(offset, data, Some(guest_memory)); true diff --git a/src/vmm/mod.rs b/src/vmm/mod.rs index 97fe2d0f..e1a485e1 100644 --- a/src/vmm/mod.rs +++ b/src/vmm/mod.rs @@ -1607,6 +1607,77 @@ fn vsock_irq_thread( /// /// When the network backend does not provide an epoll instance /// (non-SlirpBackend), the thread falls back to a fixed 5 ms sleep. 
+/// Registers a host eventfd with KVM via `KVM_IOEVENTFD` for the +/// virtio-net TX-queue notify MMIO and adds it to the supplied +/// [`EpollDispatch`] under `token` so the net-poll thread can drain +/// it. Returns the eventfd on success; on any failure (missing +/// epoll dispatcher, eventfd creation, epoll registration, or +/// `KVM_IOEVENTFD` registration) logs a `debug!` message and returns +/// `None`, in which case callers fall back to the MMIO-exit TX path. +/// +/// Both pieces (epoll registration and `KVM_IOEVENTFD` +/// registration) must succeed together: if KVM consumes the guest's +/// TX MMIO writes in-kernel but no userspace path drains the +/// eventfd, guest TX hangs silently. This helper rolls back the +/// epoll registration if the `KVM_IOEVENTFD` half fails. +/// +/// # Errors +/// +/// Returns `None` on any of: missing epoll dispatcher, eventfd +/// creation failure, epoll registration failure, or +/// `KVM_IOEVENTFD` registration failure. Each failure is logged at +/// `debug!` level, with the underlying error where one exists.
+fn setup_tx_notify_ioeventfd( + vm: &Vm, + epoll_arc: Option<&Arc>, + mmio_addr: u64, + queue_idx: u32, + token: u64, +) -> Option { + let Some(ep_arc) = epoll_arc else { + debug!( + "net-poll: no epoll dispatcher; falling back to MMIO-exit TX path (KVM_IOEVENTFD requires an async drain)" + ); + return None; + }; + let fd = match vmm_sys_util::eventfd::EventFd::new(libc::EFD_NONBLOCK) { + Ok(fd) => fd, + Err(e) => { + debug!( + "net-poll: eventfd create for tx-notify failed; falling back to MMIO-exit TX path: {}", + e + ); + return None; + } + }; + if let Err(e) = ep_arc.register( + fd.as_raw_fd(), + token, + crate::network::epoll_dispatch::RegisterMode::Read, + ) { + debug!( + "net-poll: failed to register tx-notify eventfd with epoll dispatch ({e}); falling back to MMIO-exit TX path" + ); + return None; + } + let kvm_addr = kvm_ioctls::IoEventAddress::Mmio(mmio_addr); + if let Err(e) = vm.vm_fd().register_ioevent(&fd, &kvm_addr, queue_idx) { + // KVM didn't take the ioevent. Roll the epoll registration + // back so the eventfd doesn't stay armed without a service + // path on it. + let _ = ep_arc.unregister(fd.as_raw_fd()); + debug!( + "net-poll: KVM_IOEVENTFD register failed ({e}); TX notifies will continue to take MMIO exits" + ); + return None; + } + debug!( + "net-poll: KVM_IOEVENTFD active for TX notify @ MMIO {:#x} queue_idx={queue_idx}", + mmio_addr, + ); + Some(fd) +} + fn net_poll_thread(net_dev: Arc>, vm: Arc, running: Arc) { #[repr(C)] struct KvmIrqLevel { @@ -1649,6 +1720,98 @@ fn net_poll_thread(net_dev: Arc>, vm: Arc, running: A let mut epoll_events: Vec = Vec::new(); + // Tracks whether the device's interrupt_status was non-zero on the + // previous cycle. Used to decide whether to pulse the IRQ line: + // we pulse only on transitions clear→pending (or when new RX frames + // are injected this cycle), not on every cycle where pending is + // still set from an un-acked earlier pulse. 
+ let mut prev_pending: bool = false; + + // KVM_IRQFD: register an eventfd that asserts IRQ 10 when written. + // Writing 8 bytes to the eventfd is one syscall; the kernel signals + // the in-kernel irqchip directly. This replaces the pair of + // KVM_IRQ_LINE ioctls (assert level=1 / deassert level=0) with a + // single write. If setup fails (kernel without irqfd, broken irqchip + // routing) we fall back to the ioctl path below. + let irq_eventfd: Option = + match vmm_sys_util::eventfd::EventFd::new(libc::EFD_NONBLOCK) { + Ok(fd) => match vm.vm_fd().register_irqfd(&fd, 10) { + Ok(()) => Some(fd), + Err(e) => { + debug!( + "net-poll: KVM_IRQFD register failed; falling back to KVM_IRQ_LINE: {}", + e + ); + None + } + }, + Err(e) => { + debug!( + "net-poll: eventfd create failed; falling back to KVM_IRQ_LINE: {}", + e + ); + None + } + }; + + // KVM_IOEVENTFD for the virtio-net TX queue notify. + // + // Without this, every guest TX (write to QUEUE_NOTIFY MMIO with value=1) + // forces a KVM_RUN exit, the vCPU thread dispatches into virtio-net's + // MMIO write handler, then calls process_tx_queue and re-enters KVM_RUN. + // ~1–5 µs per packet of pure VM-exit overhead. + // + // With KVM_IOEVENTFD: the guest's MMIO write is consumed in-kernel, + // KVM signals the eventfd, and the vCPU thread continues running. + // The net-poll thread sees the eventfd as another epoll source, drains + // it, and calls process_tx_queue asynchronously. No vCPU exit. + // + // Address: virtio-net mmio_base (0xd000_0000) + QUEUE_NOTIFY offset + // (0x050) = 0xd000_0050. Datamatch=1 triggers only on TX queue + // notifies (value=1 → queue index 1 = transmit queue). Notifies for + // queue 0 (RX) still take the slow path through MMIO; they're rare + // (only when guest adds new RX buffers) so the optimisation isn't + // needed there. 
+ const VIRTIO_NET_MMIO_BASE: u64 = 0xd000_0000; + const VIRTIO_NET_QUEUE_NOTIFY_OFFSET: u64 = 0x050; + const TX_NOTIFY_QUEUE_IDX: u32 = 1; + // Token used to identify the TX-notify eventfd in epoll readiness + // events. Lives in a tag space that doesn't collide with the + // PROTO_TAG_* values SlirpBackend uses for flow tokens. + const TX_NOTIFY_TOKEN: u64 = 0x4000_0000_0000_0000; + + let tx_notify_eventfd = setup_tx_notify_ioeventfd( + vm.as_ref(), + epoll_arc.as_ref(), + VIRTIO_NET_MMIO_BASE + VIRTIO_NET_QUEUE_NOTIFY_OFFSET, + TX_NOTIFY_QUEUE_IDX, + TX_NOTIFY_TOKEN, + ); + + // Lock-free hand-off queue + direct backend Arc, pulled out of the + // device once at thread startup so the per-cycle hot path doesn't + // need to acquire the VirtioNetDevice mutex just to read backend + // frames. The vCPU thread drains `pending_rx` on each MMIO entry + // (see vmm/cpu.rs), so this thread only needs to push frames. + type PendingRxArc = std::sync::Arc>>; + type BackendArc = std::sync::Arc>; + type InterruptStatusArc = std::sync::Arc; + let (pending_rx_arc, slirp_arc, interrupt_status_arc): ( + Option, + Option, + Option, + ) = match net_dev.lock() { + Ok(g) => ( + Some(g.pending_rx()), + Some(g.slirp_arc()), + Some(g.interrupt_status_arc()), + ), + Err(_) => (None, None, None), + }; + + // Reusable buffer for frames pulled from the backend each cycle. + let mut rx_scratch: Vec> = Vec::new(); + while running.load(Ordering::Relaxed) { // Block outside the device lock: either on epoll readiness or a short // sleep. This lets the vCPU thread acquire the device lock without @@ -1681,39 +1844,115 @@ fn net_poll_thread(net_dev: Arc>, vm: Arc, running: A IDLE_TIMEOUT }; - // Push ready events into the backend's queue before acquiring the - // device lock for inject/IRQ work. 
drain_to_guest will consume them - // without re-locking EpollDispatch, eliminating mutex contention - // between the net-poll thread's 50 ms blocking wait and the vCPU - // thread's process_guest_frame → drain_to_guest path. + // Filter out the TX-notify eventfd event (if any) before pushing + // the rest to the SLIRP backend. When the guest writes to the + // virtio-net QUEUE_NOTIFY MMIO with value=1, KVM consumes it + // in-kernel and signals our eventfd; we drain it here and call + // process_tx_queue ourselves — the vCPU thread never exits for + // that MMIO write. + let mut tx_notify_fired = false; + if tx_notify_eventfd.is_some() { + epoll_events.retain(|e| { + if e.token == TX_NOTIFY_TOKEN { + tx_notify_fired = true; + false + } else { + true + } + }); + } + if tx_notify_fired { + if let Some(ref efd) = tx_notify_eventfd { + let _ = efd.read(); + } + if let Ok(mut guard) = net_dev.lock() { + let _ = guard.process_tx_queue_external(guest_memory); + } + } + + // Push remaining (flow) events into the backend's queue before + // acquiring the device lock for inject/IRQ work. drain_to_guest + // will consume them without re-locking EpollDispatch, eliminating + // mutex contention between the net-poll thread's blocking wait and + // the vCPU thread's process_guest_frame → drain_to_guest path. if !epoll_events.is_empty() { if let Ok(guard) = net_dev.lock() { guard.push_events_to_backend(&epoll_events); } } - let has_interrupt = { - let mut guard = match net_dev.lock() { - Ok(g) => g, - Err(_) => continue, - }; - let _ = guard.try_inject_rx(guest_memory); - guard.has_pending_interrupt() - }; - - // Always pulse IRQ10 while pending; this prevents RX stalls if - // an earlier edge was missed by the guest. - if has_interrupt { - let assert_irq = KvmIrqLevel { irq: 10, level: 1 }; - // SAFETY: KVM_IRQ_LINE ioctl writes the KvmIrqLevel struct into - // the in-kernel APIC; the struct is #[repr(C)] and the fd is valid - // for the lifetime of `vm`. 
- unsafe { - libc::ioctl(vm_fd, KVM_IRQ_LINE as _, &assert_irq); + // Drain backend frames into the pending_rx SegQueue WITHOUT + // touching the VirtioNetDevice mutex. The vCPU thread will + // materialise them into RX descriptors on its next MMIO entry + // via VirtioNetDevice::flush_pending_rx (see vmm/cpu.rs). + // + // This breaks the old contention pattern where the net-poll + // thread held the VirtioNetDevice lock for the duration of + // try_inject_rx (descriptor walk + memory writes), forcing the + // vCPU thread to wait on every MMIO exit that overlapped with + // a poll cycle. + let frames_pushed: usize = match (&pending_rx_arc, &slirp_arc) { + (Some(pending_rx), Some(slirp)) => { + rx_scratch.clear(); + if let Ok(mut backend) = slirp.lock() { + backend.drain_to_guest(&mut rx_scratch); + } + let n = rx_scratch.len(); + for frame in rx_scratch.drain(..) { + let mut packet = Vec::with_capacity( + crate::devices::virtio_net::VirtioNetHeader::SIZE + frame.len(), + ); + packet.extend_from_slice( + &crate::devices::virtio_net::VirtioNetHeader::new().to_bytes(), + ); + packet.extend_from_slice(&frame); + pending_rx.push(packet); + } + n } - let deassert_irq = KvmIrqLevel { irq: 10, level: 0 }; - unsafe { - libc::ioctl(vm_fd, KVM_IRQ_LINE as _, &deassert_irq); + _ => 0, + }; + // Lock-free check: read interrupt_status via the AtomicU32 we + // cached at thread startup. Avoids one device-mutex acquisition + // per cycle on idle paths (the hot RX path skips this branch + // because frames_pushed > 0 already implies interrupt_status + // is about to be set when the vCPU drains pending_rx). 
+ let has_interrupt = frames_pushed > 0 + || match interrupt_status_arc { + Some(ref isr) => isr.load(std::sync::atomic::Ordering::Relaxed) != 0, + None => false, + }; + let frames_injected = frames_pushed; + + // Pulse IRQ10 only when there is *new* work for the guest: + // - frames just injected this cycle, OR + // - interrupt_status went from clear → pending (TX completion + // by the vCPU thread between cycles). + // Skipping pulses when the guest hasn't acknowledged a previous + // pulse saves two ioctl(KVM_IRQ_LINE) calls per cycle (~5–10 µs + // on the CRR hot path). If we pulse once and the guest's + // ISR services the queue, has_pending_interrupt will be false + // on the next cycle and `prev_pending` resets. + let now_pending = has_interrupt; + let pulse = frames_injected > 0 || (now_pending && !prev_pending); + prev_pending = now_pending; + if pulse { + if let Some(ref efd) = irq_eventfd { + // Fast path: KVM_IRQFD. One 8-byte write to the eventfd; + // the kernel asserts IRQ 10 directly. No ioctl pair. + let _ = efd.write(1); + } else { + let assert_irq = KvmIrqLevel { irq: 10, level: 1 }; + // SAFETY: KVM_IRQ_LINE ioctl writes the KvmIrqLevel struct into + // the in-kernel APIC; the struct is #[repr(C)] and the fd is valid + // for the lifetime of `vm`. + unsafe { + libc::ioctl(vm_fd, KVM_IRQ_LINE as _, &assert_irq); + } + let deassert_irq = KvmIrqLevel { irq: 10, level: 0 }; + unsafe { + libc::ioctl(vm_fd, KVM_IRQ_LINE as _, &deassert_irq); + } } } } diff --git a/tools/perf-harness/bench-compare-pasta.py b/tools/perf-harness/bench-compare-pasta.py new file mode 100755 index 00000000..ac6af588 --- /dev/null +++ b/tools/perf-harness/bench-compare-pasta.py @@ -0,0 +1,127 @@ +#!/usr/bin/env python3 +# bench-compare-pasta.py — produce a markdown side-by-side comparing +# voidbox-network-bench output against bench-pasta.py output. +# +# Both inputs are JSON files with the same field names (the shared +# voidbox-network-bench Report shape). 
Either argument can be the +# voidbox or pasta side; the script auto-detects via the `backend` +# field if present, otherwise positional. + +from __future__ import annotations + +import argparse +import json +import sys +from typing import Any + + +METRICS = [ + ("tcp_throughput_g2h_mbps", "TCP throughput g2h", "Mbps", False), + ("tcp_bulk_throughput_g2h_mbps", "TCP bulk g2h (constrained)", "Mbps", False), + ("tcp_rr_latency_us_p50", "TCP RR latency p50", "µs", True), + ("tcp_rr_latency_us_p99", "TCP RR latency p99", "µs", True), + ("tcp_crr_latency_us_p50", "TCP CRR latency p50", "µs", True), + ("udp_dns_qps", "UDP DNS qps", "qps", False), + ("icmp_rr_latency_us_p50", "ICMP RR p50", "µs", True), + ("tcp_rx_latency_us_p50", "TCP RX latency p50", "µs", True), +] + + +def fmt(value: Any, latency: bool) -> str: + if value is None: + return "n/a" + if isinstance(value, (int, float)): + if latency: + if value >= 1000: + return f"{value / 1000:.2f} ms" + return f"{value:.1f} µs" + if value >= 1000: + return f"{value:.0f}" + return f"{value:.2f}" + return str(value) + + +def fmt_delta(voidbox: Any, pasta: Any, latency: bool) -> str: + if voidbox is None or pasta is None: + return "—" + if pasta == 0: + return "—" + ratio = voidbox / pasta + if latency: + if ratio >= 1: + return f"voidbox {ratio:.1f}× slower" + return f"voidbox {1 / ratio:.2f}× faster" + if ratio >= 1: + return f"voidbox {ratio:.2f}× faster" + return f"voidbox {1 / ratio:.1f}× slower" + + +def load(path: str) -> dict[str, Any]: + with open(path, encoding="utf-8") as f: + return json.load(f) + + +def detect_role(data: dict[str, Any], default: str) -> str: + backend = data.get("backend") + if backend in ("pasta", "voidbox"): + return backend + return default + + +def main() -> int: + p = argparse.ArgumentParser(description="voidbox vs pasta head-to-head comparison") + p.add_argument("voidbox_json", help="path to voidbox-network-bench JSON output") + p.add_argument("pasta_json", help="path to bench-pasta.py 
JSON output") + p.add_argument("--output", help="write markdown to file instead of stdout") + args = p.parse_args() + + voidbox = load(args.voidbox_json) + pasta = load(args.pasta_json) + + if detect_role(voidbox, "voidbox") == "pasta": + voidbox, pasta = pasta, voidbox + + lines: list[str] = [] + lines.append("# voidbox vs pasta head-to-head\n") + lines.append("Methodology per `docs/superpowers/plans/2026-04-27-smoltcp-passt-port.md` §") + lines.append("\"passt head-to-head methodology\": same host, same workload (`nc`-based g2h /") + lines.append("RR / CRR), same metric names. **CRR latency is the most apples-to-apples**") + lines.append("metric — dominated by NAT-table operations on both sides. Throughput numbers") + lines.append("are not directly comparable: voidbox runs in a real KVM VM (virtio-mmio exit") + lines.append("overhead); pasta runs in a network namespace (no VM).\n") + lines.append("| Metric | voidbox (KVM + SLIRP) | pasta (netns) | Δ |") + lines.append("|---|---:|---:|---|") + + for key, label, _unit, latency in METRICS: + v = voidbox.get(key) + pa = pasta.get(key) + if v is None and pa is None: + continue + lines.append( + f"| {label} | {fmt(v, latency)} | {fmt(pa, latency)} | {fmt_delta(v, pa, latency)} |" + ) + + lines.append("") + pasta_version = pasta.get("pasta_version") + if pasta_version: + lines.append(f"_pasta version: `{pasta_version}`_") + lines.append("") + notes = pasta.get("notes") + if isinstance(notes, list) and notes: + lines.append("**Notes from pasta side:**") + for note in notes: + lines.append(f"- {note}") + lines.append("") + + md = "\n".join(lines) + if args.output: + with open(args.output, "w", encoding="utf-8") as f: + f.write(md) + print(f"Report written to {args.output}", file=sys.stderr) + else: + print(md) + return 0 + + +if __name__ == "__main__": + sys.exit(main()) diff --git a/tools/perf-harness/bench-pasta.py b/tools/perf-harness/bench-pasta.py new file mode 100755 index 00000000..264e808d --- /dev/null +++ 
b/tools/perf-harness/bench-pasta.py @@ -0,0 +1,389 @@ +#!/usr/bin/env python3 +# bench-pasta.py — passt/pasta side of the head-to-head comparison. +# +# Drives the same workload shape as `voidbox-network-bench`: +# - tcp_throughput_g2h_mbps (sustained guest→host throughput) +# - tcp_rr_latency_us_p50/p99 (persistent-connection round-trip) +# - tcp_crr_latency_us_p50 (connect-request-response latency) +# +# The "guest" is a process running inside a pasta-managed network +# namespace. Pasta forwards the host's gateway address into the netns +# as a translation for the host's loopback (its --map-host-loopback +# default), so connecting to the host gateway IP from inside the netns +# reaches the host's 127.0.0.1. This mirrors voidbox's SLIRP +# convention (10.0.2.2 → 127.0.0.1) closely enough for the metric +# comparison to be apples-to-apples on the NAT path. +# +# Methodology aligns with docs/superpowers/plans/2026-04-27-smoltcp-passt-port.md +# § "passt head-to-head methodology": same host, same workload, same +# metric names, focus on CRR latency (dominated by NAT-table ops, not +# MMIO exit overhead). 
+ +from __future__ import annotations + +import argparse +import json +import os +import socket +import statistics +import subprocess +import sys +import threading +import time +from dataclasses import asdict, dataclass, field +from typing import Optional + + +@dataclass +class Report: + tcp_bulk_throughput_g2h_mbps: Optional[float] = None + tcp_throughput_g2h_mbps: Optional[float] = None + tcp_throughput_h2g_mbps: Optional[float] = None + tcp_rr_latency_us_p50: Optional[float] = None + tcp_rr_latency_us_p99: Optional[float] = None + tcp_crr_latency_us_p50: Optional[float] = None + udp_dns_qps: Optional[float] = None + icmp_rr_latency_us_p50: Optional[float] = None + tcp_rx_latency_us_p50: Optional[float] = None + backend: str = "pasta" + pasta_version: Optional[str] = None + notes: list[str] = field(default_factory=list) + + +def _resolve_pasta() -> str: + """Find a pasta binary in $PATH or fall back to /usr/bin/pasta.""" + import shutil + found = shutil.which("pasta") + if found: + return found + return "/usr/bin/pasta" + + +def detect_host_gateway() -> str: + """Return the host's IPv4 default-route gateway address. + + Parses ``ip -4 route show default`` for ``default via ...`` lines + and returns the address after ``via``. Routes of the form + ``default dev ...`` (no ``via``) are skipped — they don't + name a usable IP for pasta's ``--map-host-loopback`` translation. 
+ """ + out = subprocess.check_output(["ip", "-4", "route", "show", "default"], text=True) + for line in out.splitlines(): + parts = line.split() + if not parts or parts[0] != "default": + continue + try: + via_index = parts.index("via") + except ValueError: + continue + if via_index + 1 < len(parts): + return parts[via_index + 1] + raise RuntimeError( + "no IPv4 default gateway with a 'via' field found in `ip route show default` output" + ) + + +def pasta_version(pasta: str) -> str: + out = subprocess.run([pasta, "--version"], capture_output=True, text=True, check=False) + first = out.stdout.splitlines() or [""] + return first[0].strip() + + +def free_port() -> int: + s = socket.socket() + s.bind(("127.0.0.1", 0)) + port = s.getsockname()[1] + s.close() + return port + + +def run_in_netns(pasta: str, cmd: str, *, timeout: float) -> subprocess.CompletedProcess[str]: + """Run `cmd` inside a fresh pasta-managed network namespace.""" + return subprocess.run( + [pasta, "-q", "--config-net", "--", "bash", "-c", cmd], + capture_output=True, + text=True, + timeout=timeout, + check=False, + ) + + +def measure_g2h_throughput( + pasta: str, + gw: str, + iterations: int, + transfer_mb: int, +) -> Optional[float]: + samples_mbps: list[float] = [] + for i in range(iterations): + port = free_port() + result_box: dict[str, object] = {} + + srv = socket.socket() + srv.setsockopt(socket.SOL_SOCKET, socket.SO_REUSEADDR, 1) + srv.bind(("127.0.0.1", port)) + srv.listen(1) + srv.settimeout(30.0) + + def host_drain() -> None: + try: + conn, _ = srv.accept() + except socket.timeout: + result_box["error"] = "accept timeout" + return + start = time.perf_counter() + total = 0 + with conn: + while True: + buf = conn.recv(1 << 16) + if not buf: + break + total += len(buf) + result_box["bytes"] = total + result_box["elapsed"] = time.perf_counter() - start + + worker = threading.Thread(target=host_drain, daemon=True) + worker.start() + time.sleep(0.2) + + cmd = f"dd if=/dev/zero bs=1M 
count={transfer_mb} 2>/dev/null | nc {gw} {port}" + try: + run_in_netns(pasta, cmd, timeout=60) + except subprocess.TimeoutExpired: + print(f"g2h[{i:>2}]: client timeout; skipping", file=sys.stderr) + srv.close() + continue + + worker.join(timeout=10) + srv.close() + + if "error" in result_box: + print(f"g2h[{i:>2}]: {result_box['error']}; skipping", file=sys.stderr) + continue + bytes_received = int(result_box.get("bytes", 0)) + elapsed = float(result_box.get("elapsed", 0.0)) + if bytes_received <= 0 or elapsed < 1e-4: + print(f"g2h[{i:>2}]: bytes={bytes_received} elapsed={elapsed}s; skipping", file=sys.stderr) + continue + mbps = bytes_received * 8 / elapsed / 1_000_000 + print( + f"g2h[{i:>2}]: {bytes_received} B in {elapsed:.3f}s = {mbps:.1f} Mbps", + file=sys.stderr, + ) + samples_mbps.append(mbps) + + if not samples_mbps: + return None + return sum(samples_mbps) / len(samples_mbps) + + +def measure_rr_latency( + pasta: str, + gw: str, + iterations: int, + samples_per_iter: int, +) -> tuple[Optional[float], Optional[float]]: + all_samples_us: list[float] = [] + for i in range(iterations): + port = free_port() + srv = socket.socket() + srv.setsockopt(socket.SOL_SOCKET, socket.SO_REUSEADDR, 1) + srv.bind(("127.0.0.1", port)) + srv.listen(1) + srv.settimeout(30.0) + + result_box: dict[str, object] = {} + + def host_echo() -> None: + try: + conn, _ = srv.accept() + except socket.timeout: + result_box["error"] = "accept timeout" + return + samples: list[float] = [] + with conn: + buf = bytearray(1) + for _ in range(samples_per_iter): + start = time.perf_counter_ns() + nrecv = conn.recv_into(buf, 1) + if nrecv == 0: + break + conn.sendall(bytes(buf[:1])) + samples.append((time.perf_counter_ns() - start) / 1000.0) + result_box["samples"] = samples + + worker = threading.Thread(target=host_echo, daemon=True) + worker.start() + time.sleep(0.2) + + # Send `samples_per_iter` zero bytes. 
The guest doesn't read + # the echoed bytes back; host-side timing is the ground truth. + cmd = f"dd if=/dev/zero bs=1 count={samples_per_iter} 2>/dev/null | nc {gw} {port} >/dev/null" + try: + run_in_netns(pasta, cmd, timeout=60) + except subprocess.TimeoutExpired: + print(f"rr[{i:>2}]: client timeout; skipping", file=sys.stderr) + srv.close() + continue + + worker.join(timeout=10) + srv.close() + + if "error" in result_box: + print(f"rr[{i:>2}]: {result_box['error']}; skipping", file=sys.stderr) + continue + iter_samples = list(result_box.get("samples", [])) + if len(iter_samples) > 1: + iter_samples.pop(0) + if not iter_samples: + print(f"rr[{i:>2}]: no samples; skipping", file=sys.stderr) + continue + p50 = statistics.median(iter_samples) + print(f"rr[{i:>2}]: {len(iter_samples)} samples, p50={p50:.1f} µs", file=sys.stderr) + all_samples_us.extend(iter_samples) + + if not all_samples_us: + return None, None + sorted_s = sorted(all_samples_us) + n = len(sorted_s) + p50 = sorted_s[n // 2] + p99_idx = max(0, int(round(0.99 * (n - 1)))) + p99 = sorted_s[p99_idx] + return p50, p99 + + +def measure_crr_latency( + pasta: str, + gw: str, + iterations: int, + samples_per_iter: int, +) -> Optional[float]: + all_samples_us: list[float] = [] + for i in range(iterations): + port = free_port() + srv = socket.socket() + srv.setsockopt(socket.SOL_SOCKET, socket.SO_REUSEADDR, 1) + srv.bind(("127.0.0.1", port)) + srv.listen(64) + srv.settimeout(30.0) + + result_box: dict[str, object] = {} + + def host_accept_loop() -> None: + samples: list[float] = [] + for _ in range(samples_per_iter): + # Start the timer BEFORE accept() so each sample includes + # the TCP connect + accept latency, matching + # voidbox-network-bench's measure_crr_latency semantics + # (its crr_echo_server starts the timer before + # accept_with_deadline). Without this, the two + # harnesses report different metrics under the same + # name and the side-by-side comparison becomes + # meaningless. 
+ start = time.perf_counter_ns() + try: + conn, _ = srv.accept() + except socket.timeout: + break + with conn: + # one read + one write keeps it a true CRR round-trip + try: + conn.recv(1) + conn.sendall(b"x") + except OSError: + pass + samples.append((time.perf_counter_ns() - start) / 1000.0) + result_box["samples"] = samples + + worker = threading.Thread(target=host_accept_loop, daemon=True) + worker.start() + time.sleep(0.2) + + # Guest: a tight loop of independent nc invocations + cmd = ( + f"for _ in $(seq 1 {samples_per_iter}); do " + f"echo y | nc {gw} {port} >/dev/null; done" + ) + try: + run_in_netns(pasta, cmd, timeout=120) + except subprocess.TimeoutExpired: + print(f"crr[{i:>2}]: client timeout; skipping", file=sys.stderr) + srv.close() + continue + + worker.join(timeout=15) + srv.close() + + iter_samples = list(result_box.get("samples", [])) + if not iter_samples: + print(f"crr[{i:>2}]: no samples; skipping", file=sys.stderr) + continue + p50 = statistics.median(iter_samples) + print(f"crr[{i:>2}]: {len(iter_samples)} samples, p50={p50:.0f} µs", file=sys.stderr) + all_samples_us.extend(iter_samples) + + if not all_samples_us: + return None + sorted_s = sorted(all_samples_us) + return sorted_s[len(sorted_s) // 2] + + +def main() -> int: + parser = argparse.ArgumentParser(description="passt/pasta head-to-head bench harness") + parser.add_argument( + "--pasta", + default=os.environ.get("PASTA") or _resolve_pasta(), + help="path to the pasta binary; default $PASTA, or `pasta` on PATH, or system /usr/bin/pasta", + ) + parser.add_argument("--iterations", type=int, default=3) + parser.add_argument("--transfer-mb", type=int, default=50) + parser.add_argument("--rr-samples", type=int, default=100) + parser.add_argument("--crr-samples", type=int, default=30) + parser.add_argument("--output", default=None, help="path to write JSON; default stdout") + args = parser.parse_args() + + if not os.access(args.pasta, os.X_OK): + print(f"pasta not executable: 
{args.pasta}", file=sys.stderr) + return 2 + + gw = detect_host_gateway() + version = pasta_version(args.pasta) + print(f"pasta: {version}", file=sys.stderr) + print(f"host gateway (acts as host-loopback inside netns): {gw}", file=sys.stderr) + + report = Report(backend="pasta", pasta_version=version) + report.notes.append( + "pasta runs in a network namespace (no VM); excludes the MMIO/virtio-mmio overhead " + "that voidbox-network-bench includes. CRR latency is the most apples-to-apples metric " + "because it is dominated by NAT-table operations on both sides." + ) + + print("\n--- TCP throughput g2h ---", file=sys.stderr) + report.tcp_throughput_g2h_mbps = measure_g2h_throughput( + args.pasta, gw, args.iterations, args.transfer_mb + ) + + print("\n--- TCP RR latency ---", file=sys.stderr) + p50, p99 = measure_rr_latency(args.pasta, gw, args.iterations, args.rr_samples) + report.tcp_rr_latency_us_p50 = p50 + report.tcp_rr_latency_us_p99 = p99 + + print("\n--- TCP CRR latency ---", file=sys.stderr) + report.tcp_crr_latency_us_p50 = measure_crr_latency( + args.pasta, gw, args.iterations, args.crr_samples + ) + + payload = json.dumps(asdict(report), indent=2) + if args.output: + with open(args.output, "w", encoding="utf-8") as f: + f.write(payload) + f.write("\n") + print(f"\nReport written to {args.output}", file=sys.stderr) + else: + print() + print(payload) + return 0 + + +if __name__ == "__main__": + sys.exit(main()) diff --git a/tools/perf-harness/bench-qemu-slirp.sh b/tools/perf-harness/bench-qemu-slirp.sh new file mode 100755 index 00000000..eacb3f6a --- /dev/null +++ b/tools/perf-harness/bench-qemu-slirp.sh @@ -0,0 +1,235 @@ +#!/usr/bin/env bash +# bench-qemu-slirp.sh — qemu-side of the proper SLIRP-vs-SLIRP head-to-head. +# +# Boots a minimal qemu guest with the static crr-client baked in, runs N +# TCP CRRs against a host TCP server, and prints `n p50_ns p99_ns mean_ns`. 
+# +# Two backends: +# --backend libslirp qemu's built-in -netdev user (libslirp) +# --backend passt qemu -netdev stream + a passt(1) instance over UNIX socket +# +# Both produce a number directly comparable to tools/perf-harness/bench-pasta.py's +# pasta-side number AND to examples/crr_singleproc_bench.rs's voidbox-side +# number — same workload, same C client, same iteration count. +# +# Why this exists: voidbox-vs-pasta comparisons mix two different +# architectures (a real VM vs a netns). The right SLIRP-vs-SLIRP comparison +# is voidbox+voidbox-SLIRP vs qemu+passt vs qemu+libslirp — all VM-attached. +# See docs/passt-comparison.md. + +set -euo pipefail + +BACKEND=libslirp +ITERATIONS=30 +KERNEL=${KERNEL:-/boot/vmlinuz-$(uname -r)} +# NB: must be the `passt` binary (VM/socket mode), NOT the `pasta` symlink +# (namespace mode). The two modes are the same code keyed on argv[0]. +# Default discovery order: $PASST env var → `passt` on $PATH → /usr/bin/passt. +default_passt() { + if command -v passt >/dev/null 2>&1; then + command -v passt + else + echo /usr/bin/passt + fi +} +PASST=${PASST:-$(default_passt)} +HOST_PORT=${HOST_PORT:-18877} +GUEST_ADDR=${GUEST_ADDR:-10.0.2.15} +GUEST_GATEWAY=${GUEST_GATEWAY:-10.0.2.2} +CRR_CLIENT_BIN=${CRR_CLIENT_BIN:-/tmp/crr-client} +ROOTFS_DIR=${ROOTFS_DIR:-} +KEEP_ROOTFS=${KEEP_ROOTFS:-0} + +usage() { + cat <&2; usage; exit 1 ;; + esac +done + +case "$BACKEND" in + libslirp|passt) : ;; + *) echo "unknown backend: $BACKEND" >&2; exit 1 ;; +esac + +[[ -x "$CRR_CLIENT_BIN" ]] || { + echo "ERROR: crr-client not found at $CRR_CLIENT_BIN" >&2 + echo " compile it with: gcc -O2 -static -o $CRR_CLIENT_BIN tools/perf-harness/crr-client.c" >&2 + exit 2 +} + +[[ -r "$KERNEL" ]] || { echo "ERROR: kernel not readable: $KERNEL" >&2; exit 2; } + +SCRIPT_DIR="$(cd "$(dirname "$0")" && pwd)" +INIT_TEMPLATE="$SCRIPT_DIR/qemu-init.sh" +[[ -r "$INIT_TEMPLATE" ]] || { echo "ERROR: missing $INIT_TEMPLATE" >&2; exit 2; } + +# 
--------------------------------------------------------------------------- +# Build the initramfs. Keep it on tmpfs so it doesn't pollute the workspace. +# --------------------------------------------------------------------------- +if [[ -z "$ROOTFS_DIR" ]]; then + ROOTFS_DIR=$(mktemp -d -t voidbox-qemu-rootfs.XXXXXX) + cleanup_rootfs() { + if [[ "$KEEP_ROOTFS" -eq 0 ]]; then rm -rf "$ROOTFS_DIR"; fi + } + trap cleanup_rootfs EXIT +fi + +mkdir -p "$ROOTFS_DIR"/{bin,sbin,proc,sys,dev,tmp} + +# Static busybox: prefer host /usr/bin/busybox (Fedora ships static); fall back +# to extracting from voidbox's claude rootfs if needed. +if [[ -x /usr/bin/busybox ]] && file /usr/bin/busybox 2>/dev/null | grep -q "statically linked"; then + cp /usr/bin/busybox "$ROOTFS_DIR/bin/busybox" +elif [[ -r "$SCRIPT_DIR/../../target/void-box-claude.cpio.gz" ]]; then + (cd "$ROOTFS_DIR" && zcat "$SCRIPT_DIR/../../target/void-box-claude.cpio.gz" | cpio -idm bin/busybox 2>/dev/null) +else + echo "ERROR: no static busybox found; install busybox-static or build target/void-box-claude.cpio.gz" >&2 + exit 2 +fi + +cp "$INIT_TEMPLATE" "$ROOTFS_DIR/init" +chmod +x "$ROOTFS_DIR/init" +cp "$CRR_CLIENT_BIN" "$ROOTFS_DIR/tmp/crr-client" + +for cmd in sh ifconfig route poweroff cat sleep echo mount find ls insmod; do + ln -sf busybox "$ROOTFS_DIR/bin/$cmd" +done + +# Stage virtio_net + failover modules from the host kernel so the distro-kernel +# path can probe the qemu virtio-net-pci device. Voidbox's slim kernel has +# them built-in and ignores these. 
+KMOD_DIR="/lib/modules/$(uname -r)/kernel" +if [[ -d "$KMOD_DIR" ]]; then + KGUEST_DIR="$ROOTFS_DIR/lib/modules/$(uname -r)" + mkdir -p "$KGUEST_DIR" + for mod in net/core/failover.ko.xz net/core/failover.ko \ + drivers/net/net_failover.ko.xz drivers/net/net_failover.ko \ + drivers/net/virtio_net.ko.xz drivers/net/virtio_net.ko; do + [[ -r "$KMOD_DIR/$mod" ]] && cp "$KMOD_DIR/$mod" "$KGUEST_DIR/" + done +fi + +INITRD=$(mktemp -t voidbox-qemu-initrd.XXXXXX.cpio.gz) +trap "rm -f $INITRD; ${cleanup_rootfs:-true}" EXIT +(cd "$ROOTFS_DIR" && find . | cpio -H newc -o 2>/dev/null | gzip > "$INITRD") + +# --------------------------------------------------------------------------- +# Host-side echo server. The script's outer EXIT trap kills it, so the +# server stays alive for the entire qemu run rather than racing against a +# fixed-duration sleep. HOST_PORT must be free; the script fails fast if +# bind() refuses (no fallback to ephemeral — the guest's kernel cmdline +# carries the port and changing it after launch isn't useful). +# --------------------------------------------------------------------------- +SERVER_PIDFILE=$(mktemp) +python3 - < "$SERVER_PIDFILE" +trap "kill $SERVER_PID 2>/dev/null; rm -f $INITRD $SERVER_PIDFILE; ${cleanup_rootfs:-true}" EXIT +sleep 0.3 + +# --------------------------------------------------------------------------- +# Backend: spin up passt if requested. +# --------------------------------------------------------------------------- +PASST_PID="" +PASST_SOCK="" +NETDEV_ARGS="" +case "$BACKEND" in + libslirp) + NETDEV_ARGS="-netdev user,id=n0 -device virtio-net-pci,netdev=n0" + ;; + passt) + [[ -x "$PASST" ]] || { echo "ERROR: passt not executable: $PASST" >&2; exit 2; } + PASST_SOCK=$(mktemp -u -t voidbox-passt.XXXXXX.sock) + rm -f "$PASST_SOCK" + "$PASST" -f -s "$PASST_SOCK" \ + -a "$GUEST_ADDR" -n 24 -g "$GUEST_GATEWAY" \ + --map-host-loopback "$GUEST_GATEWAY" \ + -q >/tmp/passt.log 2>&1 & + PASST_PID=$! 
+ sleep 0.4 + [[ -S "$PASST_SOCK" ]] || { echo "ERROR: passt socket not created" >&2; exit 3; } + NETDEV_ARGS="-netdev stream,id=n0,addr.type=unix,addr.path=$PASST_SOCK -device virtio-net-pci,netdev=n0" + trap "kill $SERVER_PID $PASST_PID 2>/dev/null; rm -f $INITRD $SERVER_PIDFILE $PASST_SOCK; ${cleanup_rootfs:-true}" EXIT + ;; +esac + +# --------------------------------------------------------------------------- +# Boot qemu, capture serial output. +# --------------------------------------------------------------------------- +QEMU_LOG=$(mktemp -t voidbox-qemu.XXXXXX.log) +trap "kill ${SERVER_PID} ${PASST_PID:-} 2>/dev/null; rm -f $INITRD $SERVER_PIDFILE $QEMU_LOG ${PASST_SOCK:-}; ${cleanup_rootfs:-true}" EXIT + +# shellcheck disable=SC2086 +HOST_PORT="$HOST_PORT" timeout 60 qemu-system-x86_64 \ + -enable-kvm -cpu host -m 512 -smp 1 \ + -kernel "$KERNEL" \ + -initrd "$INITRD" \ + -nographic -no-reboot \ + -append "console=ttyS0 reboot=t panic=1 quiet crr_target=${GUEST_GATEWAY}:${HOST_PORT}:${ITERATIONS} crr_net=${GUEST_ADDR}/24,${GUEST_GATEWAY}" \ + $NETDEV_ARGS \ + > "$QEMU_LOG" 2>&1 || true + +# Extract the one-line crr-client output between sentinels. +RESULT=$(sed -n '/===CRR-START===/,/===CRR-END/p' "$QEMU_LOG" | grep -E '^[0-9]+ [0-9]+ [0-9]+ [0-9]+$' | head -1 || true) + +if [[ -z "$RESULT" ]]; then + echo "ERROR: no result from guest (qemu log tail follows):" >&2 + tail -20 "$QEMU_LOG" >&2 + exit 4 +fi + +read -r N P50_NS P99_NS MEAN_NS <<<"$RESULT" +P50_US=$((P50_NS / 1000)) +P99_US=$((P99_NS / 1000)) +MEAN_US=$((MEAN_NS / 1000)) +echo "qemu+${BACKEND} CRR over $N iterations: p50=${P50_US} µs, p99=${P99_US} µs, mean=${MEAN_US} µs" >&2 +echo "$RESULT" diff --git a/tools/perf-harness/crr-client.c b/tools/perf-harness/crr-client.c new file mode 100644 index 00000000..df9ee70d --- /dev/null +++ b/tools/perf-harness/crr-client.c @@ -0,0 +1,85 @@ +// crr-client.c — N-iteration TCP CRR loop inside a single process. 
+// +// Usage: crr-client HOST PORT N +// Output: one line "n p50_ns p99_ns mean_ns" to stdout. +// +// Each iteration: socket → connect → write 1 byte → read 1 byte → close. +// Times the full cycle with CLOCK_MONOTONIC. No fork, no exec, no +// per-iteration interpreter overhead — isolates the user-mode TCP / +// NAT path from the bench's outer process-spawning loop. + +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include + +static int cmp_long(const void *a, const void *b) { + long la = *(const long *)a, lb = *(const long *)b; + return (la > lb) - (la < lb); +} + +int main(int argc, char **argv) { + if (argc != 4) { + fprintf(stderr, "usage: %s HOST PORT N\n", argv[0]); + return 1; + } + const char *host = argv[1]; + int port = atoi(argv[2]); + int n = atoi(argv[3]); + if (n <= 0 || n > 1000000) { + fprintf(stderr, "N out of range\n"); + return 1; + } + + struct sockaddr_in addr; + memset(&addr, 0, sizeof addr); + addr.sin_family = AF_INET; + addr.sin_port = htons(port); + if (inet_pton(AF_INET, host, &addr.sin_addr) != 1) { + fprintf(stderr, "bad host %s\n", host); + return 1; + } + + long *samples = calloc((size_t)n, sizeof(long)); + if (!samples) return 2; + + for (int i = 0; i < n; i++) { + struct timespec t0, t1; + clock_gettime(CLOCK_MONOTONIC, &t0); + + int fd = socket(AF_INET, SOCK_STREAM, 0); + if (fd < 0) { perror("socket"); return 3; } + if (connect(fd, (struct sockaddr *)&addr, sizeof addr) < 0) { + perror("connect"); + return 3; + } + ssize_t w = write(fd, "y", 1); + (void)w; + char buf; + ssize_t r = read(fd, &buf, 1); + (void)r; + close(fd); + + clock_gettime(CLOCK_MONOTONIC, &t1); + long ns = (t1.tv_sec - t0.tv_sec) * 1000000000L + + (t1.tv_nsec - t0.tv_nsec); + samples[i] = ns; + } + + qsort(samples, (size_t)n, sizeof(long), cmp_long); + long sum = 0; + for (int i = 0; i < n; i++) sum += samples[i]; + long p50 = samples[n / 2]; + long p99 = samples[(n * 99) / 100]; + long mean = sum / n; + printf("%d 
%ld %ld %ld\n", n, p50, p99, mean); + + free(samples); + return 0; +} diff --git a/tools/perf-harness/qemu-init.sh b/tools/perf-harness/qemu-init.sh new file mode 100755 index 00000000..e32da047 --- /dev/null +++ b/tools/perf-harness/qemu-init.sh @@ -0,0 +1,77 @@ +#!/bin/sh +# tools/perf-harness/qemu-init.sh — /init for the SLIRP-vs-SLIRP comparison guest. +# +# Used by tools/perf-harness/bench-qemu-slirp.sh. Read /proc/cmdline for: +# crr_target=HOST:PORT:N target server + iteration count +# crr_net=ADDR/MASK,GW static network config +# +# Bring up eth0 with the static IP, run /tmp/crr-client, and halt. +# The script is paranoid about busybox-vs-distro variations: virtio-net +# is loaded as a module if present (Fedora-style), or assumed built-in +# (voidbox's slim kernel). + +set +e +mount -t proc proc /proc 2>/dev/null +mount -t sysfs sysfs /sys 2>/dev/null + +cmdline="$(cat /proc/cmdline)" +target="" +net="" +for tok in $cmdline; do + case "$tok" in + crr_target=*) target="${tok#crr_target=}" ;; + crr_net=*) net="${tok#crr_net=}" ;; + esac +done + +if [ -z "$target" ] || [ -z "$net" ]; then + echo "ERROR: missing crr_target or crr_net on cmdline" + echo "cmdline: $cmdline" + poweroff -f +fi + +addr_mask="${net%,*}" +gw="${net#*,}" +host="${target%%:*}" +rest="${target#*:}" +port="${rest%%:*}" +n="${rest#*:}" + +busybox ifconfig lo up + +# Load virtio modules if shipped in the rootfs (distro-kernel case). +# Voidbox's slim kernel has them built-in so insmod fails harmlessly. +for mod in failover net_failover virtio_net; do + busybox find /lib/modules -name "${mod}.ko*" -exec busybox insmod {} \; 2>/dev/null +done + +i=0 +while [ $i -lt 30 ] && ! busybox ifconfig eth0 >/dev/null 2>&1; do + sleep 0.1 + i=$((i+1)) +done + +# Derive the netmask from the /N suffix instead of hard-coding /24: +# crr_net is documented as ADDR/MASK,GW and a future call site might +# reasonably use /16 or /29. 
Falls back to /24 if the suffix isn't +# parseable so existing setups keep working. +addr="${addr_mask%/*}" +prefix="${addr_mask#*/}" +case "$prefix" in + 8) mask=255.0.0.0 ;; + 16) mask=255.255.0.0 ;; + 24) mask=255.255.255.0 ;; + 29) mask=255.255.255.248 ;; + 30) mask=255.255.255.252 ;; + *) mask=255.255.255.0 ;; +esac +busybox ifconfig eth0 "$addr" netmask "$mask" up +busybox route add default gw "$gw" + +echo "===CRR-START===" +echo "addr=${addr_mask} gw=${gw} target=${host}:${port} n=${n}" +/tmp/crr-client "$host" "$port" "$n" +rc=$? +echo "===CRR-END (rc=$rc)===" + +poweroff -f