From f2ba71a2bb9967aba1f9c8587c1f4a2d33e8f6f5 Mon Sep 17 00:00:00 2001 From: diego Date: Wed, 6 May 2026 13:02:28 -0300 Subject: [PATCH 01/19] tools: passt/pasta head-to-head comparison harness MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit Two scripts and a doc, deferred deliverable from docs/superpowers/plans/2026-04-27-smoltcp-passt-port.md § "passt head-to-head methodology". scripts/bench-pasta.py Drives the same workload shape as voidbox-network-bench (g2h throughput, RR p50/p99, CRR p50) against pasta running in a network namespace. Outputs JSON in the same Report shape so bench-compare-pasta.py can diff the two side by side. pasta is launched with --config-net + --map-host-loopback (default: gateway IP) so connecting to the host gateway from inside the netns reaches the host's 127.0.0.1. Mirrors voidbox's SLIRP convention (10.0.2.2 → 127.0.0.1) closely enough for the apples-to-apples CRR metric. scripts/bench-compare-pasta.py Reads two JSONs and emits a markdown side-by-side. Auto-detects which file is which via the `backend` field. Reports the gap as 'voidbox N× faster/slower' so the direction is unambiguous. docs/passt-comparison.md Caveats + usage. Calls out that throughput numbers are NOT directly comparable (voidbox has VM/MMIO overhead pasta does not). CRR latency is the apples-to-apples metric: dominated by NAT-table operations on both sides. Tested locally: pasta CRR p50 ≈ 80 µs, voidbox CRR p50 ≈ 10.1 ms on the same host. The gap is dominated by voidbox's poll-thread cadence + virtio-mmio exits, not NAT-table cost — a useful actionable signal for follow-up perf work. 
--- docs/passt-comparison.md | 88 ++++++++ scripts/bench-compare-pasta.py | 130 ++++++++++++ scripts/bench-pasta.py | 366 +++++++++++++++++++++++++++++++++ 3 files changed, 584 insertions(+) create mode 100644 docs/passt-comparison.md create mode 100755 scripts/bench-compare-pasta.py create mode 100755 scripts/bench-pasta.py diff --git a/docs/passt-comparison.md b/docs/passt-comparison.md new file mode 100644 index 00000000..4f052370 --- /dev/null +++ b/docs/passt-comparison.md @@ -0,0 +1,88 @@ +# passt head-to-head comparison harness + +Two scripts under `scripts/` produce a side-by-side comparison of voidbox +(real KVM VM + SLIRP) against passt's [`pasta`](https://passt.top/passt/about/) +running in a network namespace. + +This is the deferred deliverable from +[`docs/superpowers/plans/2026-04-27-smoltcp-passt-port.md`](superpowers/plans/2026-04-27-smoltcp-passt-port.md) +§ "passt head-to-head methodology". + +## What the harness measures + +Both sides run the same workload shape — the same fields the +`voidbox-network-bench` `Report` already emits: + +| Field | Workload | +|---|---| +| `tcp_throughput_g2h_mbps` | `dd if=/dev/zero bs=1M count=N \| nc HOST PORT` from inside the guest / netns; host TCP server times the drain | +| `tcp_rr_latency_us_p50/p99` | Persistent connection, host-side echo loop bouncing one byte per round trip | +| `tcp_crr_latency_us_p50` | Independent `nc` invocations in a tight loop; host-side timing of the full accept→read→write→close cycle | + +The pasta side uses `pasta -- COMMAND` to run the client inside a fresh +network namespace. Pasta's `--map-host-loopback` (default: the host's +gateway IP) translates to the host's loopback, so the client connects +to `GATEWAY:PORT` and reaches the host server bound on `127.0.0.1:PORT`. 
+ +## What it's good for + +**CRR latency is the most apples-to-apples metric** — it's dominated by +NAT-table operations and the round-trip path through the user-mode +networking stack, which is the same kind of work on both sides. Per the spec: + +> Connect rate (CRR latency) is the most apples-to-apples metric — +> dominated by NAT-table operations, not MMIO. If passt does CRR in 135 µs +> and we do 600 µs, that's a meaningful "we have 4× more overhead per +> connect" signal that this refactor should narrow. + +## What it's not + +**Throughput numbers are not directly comparable.** + +- voidbox runs a real KVM VM; every packet incurs `virtio-mmio` + exits, vCPU IPI overhead, and per-packet copy across the device + boundary. +- pasta runs in a network namespace; the data path is just user-mode + socket forwarding, no VM, no MMIO. + +The throughput gap therefore combines *any difference in user-mode overhead +between the two stacks* with *the VM transit cost only voidbox pays*. +Use the throughput numbers as a sanity bound, not a parity target. + +A proper VM-vs-VM comparison would run passt under +`qemu-system-x86_64` with a guest image carrying `nc` / `iperf3`. +That is documented as a separate follow-up; the harness here is the +quick, low-friction sibling that exercises the apples-to-apples +metric (CRR) without requiring an extra guest image. + +## Usage + +```bash +# Generate voidbox numbers (requires VOID_BOX_KERNEL/VOID_BOX_INITRAMFS). +cargo run --release --bin voidbox-network-bench -- \ + --iterations 3 --output /tmp/voidbox-bench.json + +# Generate pasta numbers (requires pasta on PATH or via $PASTA). +scripts/bench-pasta.py --output /tmp/pasta-bench.json + +# Side-by-side markdown. +scripts/bench-compare-pasta.py /tmp/voidbox-bench.json /tmp/pasta-bench.json \ + --output /tmp/voidbox-vs-pasta.md +``` + +`scripts/bench-pasta.py --help` lists tunables (iterations, transfer +size, sample counts). 
+ +## Reading the report + +| Δ column | Meaning | +|---|---| +| `voidbox N× faster` (throughput) | voidbox has the higher Mbps number | +| `voidbox N× slower` (throughput) | pasta has the higher Mbps number — expected, since pasta has no VM | +| `voidbox N× faster` (latency) | voidbox has the lower µs number | +| `voidbox N× slower` (latency) | pasta has the lower µs number — large multiples here mean voidbox spends much of its CRR time outside the NAT path (poll-thread cadence, vCPU exits, virtio handling) | + +A useful CRR signal: if `voidbox N× slower on CRR p50` is much larger +than `voidbox N× slower on RR p50`, the per-connection overhead is the +bottleneck, not the data path. RR p50 captures the data path; CRR +captures the connect path. diff --git a/scripts/bench-compare-pasta.py b/scripts/bench-compare-pasta.py new file mode 100755 index 00000000..430a442d --- /dev/null +++ b/scripts/bench-compare-pasta.py @@ -0,0 +1,130 @@ +#!/usr/bin/env python3 +# bench-compare-pasta.py — produce a markdown side-by-side comparing +# voidbox-network-bench output against bench-pasta.py output. +# +# Both inputs are JSON files with the same field names (the shared +# voidbox-network-bench Report shape). Either argument can be the +# voidbox or pasta side; the script auto-detects via the `backend` +# field if present, otherwise positional. 
+ +from __future__ import annotations + +import argparse +import json +import sys +from typing import Any + + +METRICS = [ + ("tcp_throughput_g2h_mbps", "TCP throughput g2h", "Mbps", False), + ("tcp_bulk_throughput_g2h_mbps", "TCP bulk g2h (constrained)", "Mbps", False), + ("tcp_rr_latency_us_p50", "TCP RR latency p50", "µs", True), + ("tcp_rr_latency_us_p99", "TCP RR latency p99", "µs", True), + ("tcp_crr_latency_us_p50", "TCP CRR latency p50", "µs", True), + ("udp_dns_qps", "UDP DNS qps", "qps", False), + ("icmp_rr_latency_us_p50", "ICMP RR p50", "µs", True), + ("tcp_rx_latency_us_p50", "TCP RX latency p50", "µs", True), +] + + +def fmt(value: Any, latency: bool) -> str: + if value is None: + return "n/a" + if isinstance(value, (int, float)): + if latency: + if value >= 1000: + return f"{value / 1000:.2f} ms" + return f"{value:.1f} µs" + if value >= 1000: + return f"{value:.0f}" + return f"{value:.2f}" + return str(value) + + +def fmt_delta(voidbox: Any, pasta: Any, latency: bool) -> str: + if voidbox is None or pasta is None: + return "—" + if pasta == 0: + return "—" + ratio = voidbox / pasta + # For latency: voidbox > pasta means voidbox is *slower* (positive ratio is bad). + # For throughput: voidbox > pasta means voidbox is *faster* (positive ratio is good). 
+ sign = "slower" if (latency and ratio > 1) or (not latency and ratio < 1) else "faster" + if latency: + if ratio >= 1: + return f"voidbox {ratio:.1f}× slower" + return f"voidbox {1 / ratio:.2f}× faster" + if ratio >= 1: + return f"voidbox {ratio:.2f}× faster" + return f"voidbox {1 / ratio:.1f}× slower" + + +def load(path: str) -> dict[str, Any]: + with open(path, encoding="utf-8") as f: + return json.load(f) + + +def detect_role(data: dict[str, Any], default: str) -> str: + backend = data.get("backend") + if backend in ("pasta", "voidbox"): + return backend + return default + + +def main() -> int: + p = argparse.ArgumentParser(description="voidbox vs pasta head-to-head comparison") + p.add_argument("voidbox_json", help="path to voidbox-network-bench JSON output") + p.add_argument("pasta_json", help="path to bench-pasta.py JSON output") + p.add_argument("--output", help="write markdown to file instead of stdout") + args = p.parse_args() + + voidbox = load(args.voidbox_json) + pasta = load(args.pasta_json) + + if detect_role(voidbox, "voidbox") == "pasta": + voidbox, pasta = pasta, voidbox + + lines: list[str] = [] + lines.append("# voidbox vs pasta head-to-head\n") + lines.append("Methodology per `docs/superpowers/plans/2026-04-27-smoltcp-passt-port.md` §") + lines.append("\"passt head-to-head methodology\": same host, same workload (`nc`-based g2h /") + lines.append("RR / CRR), same metric names. **CRR latency is the most apples-to-apples**") + lines.append("metric — dominated by NAT-table operations on both sides. 
Throughput numbers") + lines.append("are not directly comparable: voidbox runs in a real KVM VM (virtio-mmio exit") + lines.append("overhead); pasta runs in a network namespace (no VM).\n") + lines.append("| Metric | voidbox (KVM + SLIRP) | pasta (netns) | Δ |") + lines.append("|---|---:|---:|---|") + + for key, label, _unit, latency in METRICS: + v = voidbox.get(key) + pa = pasta.get(key) + if v is None and pa is None: + continue + lines.append( + f"| {label} | {fmt(v, latency)} | {fmt(pa, latency)} | {fmt_delta(v, pa, latency)} |" + ) + + lines.append("") + pasta_version = pasta.get("pasta_version") + if pasta_version: + lines.append(f"_pasta version: `{pasta_version}`_") + lines.append("") + notes = pasta.get("notes") + if isinstance(notes, list) and notes: + lines.append("**Notes from pasta side:**") + for note in notes: + lines.append(f"- {note}") + lines.append("") + + md = "\n".join(lines) + if args.output: + with open(args.output, "w", encoding="utf-8") as f: + f.write(md) + print(f"Report written to {args.output}", file=sys.stderr) + else: + print(md) + return 0 + + +if __name__ == "__main__": + sys.exit(main()) diff --git a/scripts/bench-pasta.py b/scripts/bench-pasta.py new file mode 100755 index 00000000..a80fd4b4 --- /dev/null +++ b/scripts/bench-pasta.py @@ -0,0 +1,366 @@ +#!/usr/bin/env python3 +# bench-pasta.py — passt/pasta side of the head-to-head comparison. +# +# Drives the same workload shape as `voidbox-network-bench`: +# - tcp_throughput_g2h_mbps (sustained guest→host throughput) +# - tcp_rr_latency_us_p50/p99 (persistent-connection round-trip) +# - tcp_crr_latency_us_p50 (connect-request-response latency) +# +# The "guest" is a process running inside a pasta-managed network +# namespace. Pasta forwards the host's gateway address into the netns +# as a translation for the host's loopback (its --map-host-loopback +# default), so connecting to the host gateway IP from inside the netns +# reaches the host's 127.0.0.1. 
This mirrors voidbox's SLIRP +# convention (10.0.2.2 → 127.0.0.1) closely enough for the metric +# comparison to be apples-to-apples on the NAT path. +# +# Methodology aligns with docs/superpowers/plans/2026-04-27-smoltcp-passt-port.md +# § "passt head-to-head methodology": same host, same workload, same +# metric names, focus on CRR latency (dominated by NAT-table ops, not +# MMIO exit overhead). + +from __future__ import annotations + +import argparse +import json +import os +import socket +import statistics +import subprocess +import sys +import threading +import time +from dataclasses import asdict, dataclass, field +from typing import Optional + + +@dataclass +class Report: + tcp_bulk_throughput_g2h_mbps: Optional[float] = None + tcp_throughput_g2h_mbps: Optional[float] = None + tcp_throughput_h2g_mbps: Optional[float] = None + tcp_rr_latency_us_p50: Optional[float] = None + tcp_rr_latency_us_p99: Optional[float] = None + tcp_crr_latency_us_p50: Optional[float] = None + udp_dns_qps: Optional[float] = None + icmp_rr_latency_us_p50: Optional[float] = None + tcp_rx_latency_us_p50: Optional[float] = None + backend: str = "pasta" + pasta_version: Optional[str] = None + notes: list[str] = field(default_factory=list) + + +def _resolve_pasta() -> str: + """Find a pasta binary in $PATH or fall back to /usr/bin/pasta.""" + import shutil + found = shutil.which("pasta") + if found: + return found + return "/usr/bin/pasta" + + +def detect_host_gateway() -> str: + out = subprocess.check_output(["ip", "-4", "route", "show", "default"], text=True) + for line in out.splitlines(): + parts = line.split() + if parts and parts[0] == "default": + return parts[2] + raise RuntimeError("no default gateway found") + + +def pasta_version(pasta: str) -> str: + out = subprocess.run([pasta, "--version"], capture_output=True, text=True, check=False) + first = out.stdout.splitlines() or [""] + return first[0].strip() + + +def free_port() -> int: + s = socket.socket() + s.bind(("127.0.0.1", 
0)) + port = s.getsockname()[1] + s.close() + return port + + +def run_in_netns(pasta: str, cmd: str, *, timeout: float) -> subprocess.CompletedProcess[str]: + """Run `cmd` inside a fresh pasta-managed network namespace.""" + return subprocess.run( + [pasta, "-q", "--config-net", "--", "bash", "-c", cmd], + capture_output=True, + text=True, + timeout=timeout, + check=False, + ) + + +def measure_g2h_throughput( + pasta: str, + gw: str, + iterations: int, + transfer_mb: int, +) -> Optional[float]: + samples_mbps: list[float] = [] + for i in range(iterations): + port = free_port() + result_box: dict[str, object] = {} + + srv = socket.socket() + srv.setsockopt(socket.SOL_SOCKET, socket.SO_REUSEADDR, 1) + srv.bind(("127.0.0.1", port)) + srv.listen(1) + srv.settimeout(30.0) + + def host_drain() -> None: + try: + conn, _ = srv.accept() + except socket.timeout: + result_box["error"] = "accept timeout" + return + start = time.perf_counter() + total = 0 + with conn: + while True: + buf = conn.recv(1 << 16) + if not buf: + break + total += len(buf) + result_box["bytes"] = total + result_box["elapsed"] = time.perf_counter() - start + + worker = threading.Thread(target=host_drain, daemon=True) + worker.start() + time.sleep(0.2) + + cmd = f"dd if=/dev/zero bs=1M count={transfer_mb} 2>/dev/null | nc {gw} {port}" + try: + run_in_netns(pasta, cmd, timeout=60) + except subprocess.TimeoutExpired: + print(f"g2h[{i:>2}]: client timeout; skipping", file=sys.stderr) + srv.close() + continue + + worker.join(timeout=10) + srv.close() + + if "error" in result_box: + print(f"g2h[{i:>2}]: {result_box['error']}; skipping", file=sys.stderr) + continue + bytes_received = int(result_box.get("bytes", 0)) + elapsed = float(result_box.get("elapsed", 0.0)) + if bytes_received <= 0 or elapsed < 1e-4: + print(f"g2h[{i:>2}]: bytes={bytes_received} elapsed={elapsed}s; skipping", file=sys.stderr) + continue + mbps = bytes_received * 8 / elapsed / 1_000_000 + print( + f"g2h[{i:>2}]: {bytes_received} B in 
{elapsed:.3f}s = {mbps:.1f} Mbps", + file=sys.stderr, + ) + samples_mbps.append(mbps) + + if not samples_mbps: + return None + return sum(samples_mbps) / len(samples_mbps) + + +def measure_rr_latency( + pasta: str, + gw: str, + iterations: int, + samples_per_iter: int, +) -> tuple[Optional[float], Optional[float]]: + all_samples_us: list[float] = [] + for i in range(iterations): + port = free_port() + srv = socket.socket() + srv.setsockopt(socket.SOL_SOCKET, socket.SO_REUSEADDR, 1) + srv.bind(("127.0.0.1", port)) + srv.listen(1) + srv.settimeout(30.0) + + result_box: dict[str, object] = {} + + def host_echo() -> None: + try: + conn, _ = srv.accept() + except socket.timeout: + result_box["error"] = "accept timeout" + return + samples: list[float] = [] + with conn: + buf = bytearray(1) + for _ in range(samples_per_iter): + start = time.perf_counter_ns() + nrecv = conn.recv_into(buf, 1) + if nrecv == 0: + break + conn.sendall(bytes(buf[:1])) + samples.append((time.perf_counter_ns() - start) / 1000.0) + result_box["samples"] = samples + + worker = threading.Thread(target=host_echo, daemon=True) + worker.start() + time.sleep(0.2) + + # Send `samples_per_iter` zero bytes. The guest doesn't read + # the echoed bytes back; host-side timing is the ground truth. 
+ cmd = f"dd if=/dev/zero bs=1 count={samples_per_iter} 2>/dev/null | nc {gw} {port} >/dev/null" + try: + run_in_netns(pasta, cmd, timeout=60) + except subprocess.TimeoutExpired: + print(f"rr[{i:>2}]: client timeout; skipping", file=sys.stderr) + srv.close() + continue + + worker.join(timeout=10) + srv.close() + + if "error" in result_box: + print(f"rr[{i:>2}]: {result_box['error']}; skipping", file=sys.stderr) + continue + iter_samples = list(result_box.get("samples", [])) + if len(iter_samples) > 1: + iter_samples.pop(0) + if not iter_samples: + print(f"rr[{i:>2}]: no samples; skipping", file=sys.stderr) + continue + p50 = statistics.median(iter_samples) + print(f"rr[{i:>2}]: {len(iter_samples)} samples, p50={p50:.1f} µs", file=sys.stderr) + all_samples_us.extend(iter_samples) + + if not all_samples_us: + return None, None + sorted_s = sorted(all_samples_us) + n = len(sorted_s) + p50 = sorted_s[n // 2] + p99_idx = max(0, int(round(0.99 * (n - 1)))) + p99 = sorted_s[p99_idx] + return p50, p99 + + +def measure_crr_latency( + pasta: str, + gw: str, + iterations: int, + samples_per_iter: int, +) -> Optional[float]: + all_samples_us: list[float] = [] + for i in range(iterations): + port = free_port() + srv = socket.socket() + srv.setsockopt(socket.SOL_SOCKET, socket.SO_REUSEADDR, 1) + srv.bind(("127.0.0.1", port)) + srv.listen(64) + srv.settimeout(30.0) + + result_box: dict[str, object] = {} + + def host_accept_loop() -> None: + samples: list[float] = [] + for _ in range(samples_per_iter): + try: + conn, _ = srv.accept() + except socket.timeout: + break + start = time.perf_counter_ns() + with conn: + # one read + one write keeps it a true CRR round-trip + try: + conn.recv(1) + conn.sendall(b"x") + except OSError: + pass + samples.append((time.perf_counter_ns() - start) / 1000.0) + result_box["samples"] = samples + + worker = threading.Thread(target=host_accept_loop, daemon=True) + worker.start() + time.sleep(0.2) + + # Guest: a tight loop of independent nc invocations 
+ cmd = ( + f"for _ in $(seq 1 {samples_per_iter}); do " + f"echo y | nc {gw} {port} >/dev/null; done" + ) + try: + run_in_netns(pasta, cmd, timeout=120) + except subprocess.TimeoutExpired: + print(f"crr[{i:>2}]: client timeout; skipping", file=sys.stderr) + srv.close() + continue + + worker.join(timeout=15) + srv.close() + + iter_samples = list(result_box.get("samples", [])) + if not iter_samples: + print(f"crr[{i:>2}]: no samples; skipping", file=sys.stderr) + continue + p50 = statistics.median(iter_samples) + print(f"crr[{i:>2}]: {len(iter_samples)} samples, p50={p50:.0f} µs", file=sys.stderr) + all_samples_us.extend(iter_samples) + + if not all_samples_us: + return None + sorted_s = sorted(all_samples_us) + return sorted_s[len(sorted_s) // 2] + + +def main() -> int: + parser = argparse.ArgumentParser(description="passt/pasta head-to-head bench harness") + parser.add_argument( + "--pasta", + default=os.environ.get("PASTA") or _resolve_pasta(), + help="path to the pasta binary; default $PASTA, or `pasta` on PATH, or system /usr/bin/pasta", + ) + parser.add_argument("--iterations", type=int, default=3) + parser.add_argument("--transfer-mb", type=int, default=50) + parser.add_argument("--rr-samples", type=int, default=100) + parser.add_argument("--crr-samples", type=int, default=30) + parser.add_argument("--output", default=None, help="path to write JSON; default stdout") + args = parser.parse_args() + + if not os.access(args.pasta, os.X_OK): + print(f"pasta not executable: {args.pasta}", file=sys.stderr) + return 2 + + gw = detect_host_gateway() + version = pasta_version(args.pasta) + print(f"pasta: {version}", file=sys.stderr) + print(f"host gateway (acts as host-loopback inside netns): {gw}", file=sys.stderr) + + report = Report(backend="pasta", pasta_version=version) + report.notes.append( + "pasta runs in a network namespace (no VM); excludes the MMIO/virtio-mmio overhead " + "that voidbox-network-bench includes. 
CRR latency is the most apples-to-apples metric " + "because it is dominated by NAT-table operations on both sides." + ) + + print("\n--- TCP throughput g2h ---", file=sys.stderr) + report.tcp_throughput_g2h_mbps = measure_g2h_throughput( + args.pasta, gw, args.iterations, args.transfer_mb + ) + + print("\n--- TCP RR latency ---", file=sys.stderr) + p50, p99 = measure_rr_latency(args.pasta, gw, args.iterations, args.rr_samples) + report.tcp_rr_latency_us_p50 = p50 + report.tcp_rr_latency_us_p99 = p99 + + print("\n--- TCP CRR latency ---", file=sys.stderr) + report.tcp_crr_latency_us_p50 = measure_crr_latency( + args.pasta, gw, args.iterations, args.crr_samples + ) + + payload = json.dumps(asdict(report), indent=2) + if args.output: + with open(args.output, "w", encoding="utf-8") as f: + f.write(payload) + f.write("\n") + print(f"\nReport written to {args.output}", file=sys.stderr) + else: + print() + print(payload) + return 0 + + +if __name__ == "__main__": + sys.exit(main()) From b633233eb5e14a90b96b572d32834777dfaf2496 Mon Sep 17 00:00:00 2001 From: diego Date: Wed, 6 May 2026 15:04:06 -0300 Subject: [PATCH 02/19] tools: crr-client + voidbox-side single-process CRR diagnostic Pair of artefacts used to root-cause the apparent 122x voidbox-vs-pasta CRR p50 gap reported by scripts/bench-pasta.py. tools/crr-client.c Static-linked C binary that performs N TCP CRRs in one process, no fork or exec per iteration. Output is one line of nanoseconds: N P50 P99 MEAN. Compile with: gcc -O2 -static -o /tmp/crr-client tools/crr-client.c examples/crr_singleproc_bench.rs Voidbox-side driver. Boots a sandbox with /tmp host-mounted into the guest, runs the static binary inside the guest, parses the one-line output. Measures voidbox's NAT-path CRR cost without the outer bench's per-iteration nc fork+exec. 
Result: voidbox-in-VM at 421 us p50 vs pasta-in-netns at 107 us p50 is dominated (~300 us of the ~314 us gap) by VM transit (virtio-mmio exits, KVM IRQ injection, vsock RPC), not by SLIRP-engine cost. A genuinely apples-to-apples SLIRP-vs-SLIRP comparison (passt+qemu vs voidbox+voidbox-VM) is the natural follow-up; this commit captures the tooling so that follow-up can stand on a reproducible baseline. --- examples/crr_singleproc_bench.rs | 142 +++++++++++++++++++++++++++++++ tools/crr-client.c | 85 ++++++++++++++++++ 2 files changed, 227 insertions(+) create mode 100644 examples/crr_singleproc_bench.rs create mode 100644 tools/crr-client.c diff --git a/examples/crr_singleproc_bench.rs b/examples/crr_singleproc_bench.rs new file mode 100644 index 00000000..93241d7c --- /dev/null +++ b/examples/crr_singleproc_bench.rs @@ -0,0 +1,142 @@ +//! crr_singleproc_bench — voidbox-side N-iteration TCP CRR loop in a +//! single guest process, isolating voidbox's NAT-path cost from the +//! existing bench's per-iteration `nc` fork+exec overhead. +//! +//! NOT meant for the production bench surface; this is a one-off +//! diagnostic that pairs with `tools/crr-client.c` + the pasta side +//! of the head-to-head. Compile and run directly: +//! +//! gcc -O2 -static -o /tmp/crr-client tools/crr-client.c +//! cargo run --release --example crr_singleproc_bench -- \ +//! --iterations 100 --bench-binary /tmp/crr-client +//! +//! Requires the same env vars as voidbox-network-bench: +//! VOID_BOX_KERNEL, VOID_BOX_INITRAMFS + +use std::net::TcpListener; +use std::sync::mpsc; +use std::thread; +use std::time::Duration; + +use clap::Parser; +use void_box::backend::MountConfig; +use void_box::sandbox::Sandbox; + +#[derive(Parser)] +#[command(version, about)] +struct Cli { + /// Number of CRR iterations. + #[arg(long, default_value_t = 100)] + iterations: u32, + /// Host path to the static crr-client binary. 
+ #[arg(long, default_value = "/tmp/crr-client")] + bench_binary: String, + /// Memory size for the guest VM (MB). + #[arg(long, default_value_t = 1024)] + memory_mb: usize, +} + +const HOST_LOOPBACK_FROM_GUEST: &str = "10.0.2.2"; + +#[tokio::main(flavor = "multi_thread")] +async fn main() -> Result<(), Box> { + let cli = Cli::parse(); + let bench_binary = std::path::PathBuf::from(&cli.bench_binary); + if !bench_binary.exists() { + return Err(format!( + "bench binary not found: {} (compile with `gcc -static -o /tmp/crr-client tools/crr-client.c`)", + cli.bench_binary + ) + .into()); + } + let bench_binary_dir = bench_binary + .parent() + .ok_or("bench-binary has no parent dir")? + .to_string_lossy() + .into_owned(); + let bench_binary_name = bench_binary + .file_name() + .ok_or("bench-binary has no file name")? + .to_string_lossy() + .into_owned(); + + let listener = TcpListener::bind("127.0.0.1:0")?; + let host_port = listener.local_addr()?.port(); + + let iterations = cli.iterations; + let server_thread = thread::spawn(move || { + let mut accepted = 0u32; + listener.set_nonblocking(false).ok(); + let deadline = std::time::Instant::now() + Duration::from_secs(120); + let (done_tx, _done_rx) = mpsc::channel::<()>(); + while accepted < iterations && std::time::Instant::now() < deadline { + match listener.accept() { + Ok((mut conn, _)) => { + let mut buf = [0u8; 1]; + let _ = std::io::Read::read(&mut conn, &mut buf); + let _ = std::io::Write::write_all(&mut conn, b"x"); + accepted += 1; + } + Err(_) => break, + } + } + drop(done_tx); + accepted + }); + + let sandbox = Sandbox::local() + .from_env()? 
+ .memory_mb(cli.memory_mb) + .network(true) + .mount(MountConfig { + host_path: bench_binary_dir.clone(), + guest_path: "/tmp/host".into(), + read_only: true, + }) + .build()?; + + eprintln!( + "VM booted; running {} CRRs in a single guest process...", + iterations + ); + let probe = sandbox.exec("sh", &["-c", ":"]).await?; + if !probe.success() { + return Err("VM probe exec failed".into()); + } + + let cmd = format!( + "/tmp/host/{name} {host} {port} {n}", + name = bench_binary_name, + host = HOST_LOOPBACK_FROM_GUEST, + port = host_port, + n = iterations, + ); + let output = sandbox.exec("sh", &["-c", &cmd]).await?; + let stdout = output.stdout_str().to_string(); + let stderr = output.stderr_str().to_string(); + if !output.success() { + eprintln!("guest stderr: {stderr}"); + return Err(format!("guest exec failed: {:?}", output.exit_code).into()); + } + + let server_thread_count = server_thread.join().unwrap_or(0); + eprintln!("host accepts: {server_thread_count}/{iterations}"); + + let line = stdout.lines().next().ok_or("empty guest stdout")?; + let parts: Vec<&str> = line.split_whitespace().collect(); + if parts.len() != 4 { + return Err(format!("unexpected guest stdout: {line:?}").into()); + } + let n: u32 = parts[0].parse()?; + let p50_ns: u64 = parts[1].parse()?; + let p99_ns: u64 = parts[2].parse()?; + let mean_ns: u64 = parts[3].parse()?; + + println!(); + println!("voidbox single-process CRR over {n} iterations:"); + println!(" p50: {} µs", p50_ns / 1000); + println!(" p99: {} µs", p99_ns / 1000); + println!(" mean: {} µs", mean_ns / 1000); + + Ok(()) +} diff --git a/tools/crr-client.c b/tools/crr-client.c new file mode 100644 index 00000000..df9ee70d --- /dev/null +++ b/tools/crr-client.c @@ -0,0 +1,85 @@ +// crr-client.c — N-iteration TCP CRR loop inside a single process. +// +// Usage: crr-client HOST PORT N +// Output: one line "n p50_ns p99_ns mean_ns" to stdout. +// +// Each iteration: socket → connect → write 1 byte → read 1 byte → close. 
+// Times the full cycle with CLOCK_MONOTONIC. No fork, no exec, no +// per-iteration interpreter overhead — isolates the user-mode TCP / +// NAT path from the bench's outer process-spawning loop. + +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include + +static int cmp_long(const void *a, const void *b) { + long la = *(const long *)a, lb = *(const long *)b; + return (la > lb) - (la < lb); +} + +int main(int argc, char **argv) { + if (argc != 4) { + fprintf(stderr, "usage: %s HOST PORT N\n", argv[0]); + return 1; + } + const char *host = argv[1]; + int port = atoi(argv[2]); + int n = atoi(argv[3]); + if (n <= 0 || n > 1000000) { + fprintf(stderr, "N out of range\n"); + return 1; + } + + struct sockaddr_in addr; + memset(&addr, 0, sizeof addr); + addr.sin_family = AF_INET; + addr.sin_port = htons(port); + if (inet_pton(AF_INET, host, &addr.sin_addr) != 1) { + fprintf(stderr, "bad host %s\n", host); + return 1; + } + + long *samples = calloc((size_t)n, sizeof(long)); + if (!samples) return 2; + + for (int i = 0; i < n; i++) { + struct timespec t0, t1; + clock_gettime(CLOCK_MONOTONIC, &t0); + + int fd = socket(AF_INET, SOCK_STREAM, 0); + if (fd < 0) { perror("socket"); return 3; } + if (connect(fd, (struct sockaddr *)&addr, sizeof addr) < 0) { + perror("connect"); + return 3; + } + ssize_t w = write(fd, "y", 1); + (void)w; + char buf; + ssize_t r = read(fd, &buf, 1); + (void)r; + close(fd); + + clock_gettime(CLOCK_MONOTONIC, &t1); + long ns = (t1.tv_sec - t0.tv_sec) * 1000000000L + + (t1.tv_nsec - t0.tv_nsec); + samples[i] = ns; + } + + qsort(samples, (size_t)n, sizeof(long), cmp_long); + long sum = 0; + for (int i = 0; i < n; i++) sum += samples[i]; + long p50 = samples[n / 2]; + long p99 = samples[(n * 99) / 100]; + long mean = sum / n; + printf("%d %ld %ld %ld\n", n, p50, p99, mean); + + free(samples); + return 0; +} From f073ab9d2aba2aa4a9e55dcc9cdd15711c57bcb4 Mon Sep 17 00:00:00 2001 From: diego Date: Wed, 6 May 2026 
15:27:08 -0300 Subject: [PATCH 03/19] =?UTF-8?q?tools:=20bench-qemu-slirp.sh=20=E2=80=94?= =?UTF-8?q?=20qemu+libslirp=20/=20qemu+passt=20CRR=20harness?= MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit Boots a minimal qemu guest carrying tools/crr-client and runs N TCP CRRs against a host TCP server. Two backends: --backend libslirp qemu's built-in -netdev user (libslirp) --backend passt qemu -netdev stream + passt(1) over UNIX socket Same workload + iteration count as scripts/bench-pasta.py and examples/crr_singleproc_bench.rs, so the five datapoints (host-direct, pasta-in-netns, qemu+libslirp, qemu+passt, voidbox+voidbox-SLIRP) are directly comparable on the same machine. The script auto-builds the initramfs from tools/qemu-init.sh + busybox + tools/crr-client, including virtio_net + failover modules from the host kernel so a stock distro kernel can probe the qemu virtio-net-pci device. Voidbox's slim kernel has them built-in and the insmod calls fail harmlessly. Result on the dev machine: host-direct 63 us p50 pasta (netns, no VM) 107 us p50 qemu+libslirp (in VM) 181 us p50 qemu+passt (in VM) 163 us p50 voidbox+voidbox-SLIRP 421 us p50 Voidbox is ~2.2x slower than the mature C SLIRPs in the same VM-attached configuration -- the genuine engine gap, independent of fork artefact (10x) and VM transit (which both sides pay). --- scripts/bench-qemu-slirp.sh | 216 ++++++++++++++++++++++++++++++++++++ tools/qemu-init.sh | 63 +++++++++++ 2 files changed, 279 insertions(+) create mode 100755 scripts/bench-qemu-slirp.sh create mode 100755 tools/qemu-init.sh diff --git a/scripts/bench-qemu-slirp.sh b/scripts/bench-qemu-slirp.sh new file mode 100755 index 00000000..968b488a --- /dev/null +++ b/scripts/bench-qemu-slirp.sh @@ -0,0 +1,216 @@ +#!/usr/bin/env bash +# bench-qemu-slirp.sh — qemu-side of the proper SLIRP-vs-SLIRP head-to-head. 
+# +# Boots a minimal qemu guest with the static crr-client baked in, runs N +# TCP CRRs against a host TCP server, and prints `n p50_ns p99_ns mean_ns`. +# +# Two backends: +# --backend libslirp qemu's built-in -netdev user (libslirp) +# --backend passt qemu -netdev stream + a passt(1) instance over UNIX socket +# +# Both produce a number directly comparable to scripts/bench-pasta.py's +# pasta-side number AND to examples/crr_singleproc_bench.rs's voidbox-side +# number — same workload, same C client, same iteration count. +# +# Why this exists: voidbox-vs-pasta comparisons mix two different +# architectures (a real VM vs a netns). The right SLIRP-vs-SLIRP comparison +# is voidbox+voidbox-SLIRP vs qemu+passt vs qemu+libslirp — all VM-attached. +# See docs/passt-comparison.md. + +set -euo pipefail + +BACKEND=libslirp +ITERATIONS=30 +KERNEL=${KERNEL:-/boot/vmlinuz-$(uname -r)} +# NB: must be the `passt` binary (VM/socket mode), NOT the `pasta` symlink +# (namespace mode). The two modes are the same code keyed on argv[0]. +PASST=${PASST:-/home/diego/github/passt/passt} +HOST_PORT=${HOST_PORT:-18877} +GUEST_ADDR=${GUEST_ADDR:-10.0.2.15} +GUEST_GATEWAY=${GUEST_GATEWAY:-10.0.2.2} +CRR_CLIENT_BIN=${CRR_CLIENT_BIN:-/tmp/crr-client} +ROOTFS_DIR=${ROOTFS_DIR:-} +KEEP_ROOTFS=${KEEP_ROOTFS:-0} + +usage() { + cat <&2; usage; exit 1 ;; + esac +done + +case "$BACKEND" in + libslirp|passt) : ;; + *) echo "unknown backend: $BACKEND" >&2; exit 1 ;; +esac + +[[ -x "$CRR_CLIENT_BIN" ]] || { + echo "ERROR: crr-client not found at $CRR_CLIENT_BIN" >&2 + echo " compile it with: gcc -O2 -static -o $CRR_CLIENT_BIN tools/crr-client.c" >&2 + exit 2 +} + +[[ -r "$KERNEL" ]] || { echo "ERROR: kernel not readable: $KERNEL" >&2; exit 2; } + +REPO_ROOT="$(cd "$(dirname "$0")/.." 
&& pwd)" +INIT_TEMPLATE="$REPO_ROOT/tools/qemu-init.sh" +[[ -r "$INIT_TEMPLATE" ]] || { echo "ERROR: missing $INIT_TEMPLATE" >&2; exit 2; } + +# --------------------------------------------------------------------------- +# Build the initramfs. Keep it on tmpfs so it doesn't pollute the workspace. +# --------------------------------------------------------------------------- +if [[ -z "$ROOTFS_DIR" ]]; then + ROOTFS_DIR=$(mktemp -d -t voidbox-qemu-rootfs.XXXXXX) + cleanup_rootfs() { + if [[ "$KEEP_ROOTFS" -eq 0 ]]; then rm -rf "$ROOTFS_DIR"; fi + } + trap cleanup_rootfs EXIT +fi + +mkdir -p "$ROOTFS_DIR"/{bin,sbin,proc,sys,dev,tmp} + +# Static busybox: prefer host /usr/bin/busybox (Fedora ships static); fall back +# to extracting from voidbox's claude rootfs if needed. +if [[ -x /usr/bin/busybox ]] && file /usr/bin/busybox 2>/dev/null | grep -q "statically linked"; then + cp /usr/bin/busybox "$ROOTFS_DIR/bin/busybox" +elif [[ -r "$REPO_ROOT/target/void-box-claude.cpio.gz" ]]; then + (cd "$ROOTFS_DIR" && zcat "$REPO_ROOT/target/void-box-claude.cpio.gz" | cpio -idm bin/busybox 2>/dev/null) +else + echo "ERROR: no static busybox found; install busybox-static or build target/void-box-claude.cpio.gz" >&2 + exit 2 +fi + +cp "$INIT_TEMPLATE" "$ROOTFS_DIR/init" +chmod +x "$ROOTFS_DIR/init" +cp "$CRR_CLIENT_BIN" "$ROOTFS_DIR/tmp/crr-client" + +for cmd in sh ifconfig route poweroff cat sleep echo mount find ls insmod; do + ln -sf busybox "$ROOTFS_DIR/bin/$cmd" +done + +# Stage virtio_net + failover modules from the host kernel so the distro-kernel +# path can probe the qemu virtio-net-pci device. Voidbox's slim kernel has +# them built-in and ignores these. 
+KMOD_DIR="/lib/modules/$(uname -r)/kernel" +if [[ -d "$KMOD_DIR" ]]; then + KGUEST_DIR="$ROOTFS_DIR/lib/modules/$(uname -r)" + mkdir -p "$KGUEST_DIR" + for mod in net/core/failover.ko.xz net/core/failover.ko \ + drivers/net/net_failover.ko.xz drivers/net/net_failover.ko \ + drivers/net/virtio_net.ko.xz drivers/net/virtio_net.ko; do + [[ -r "$KMOD_DIR/$mod" ]] && cp "$KMOD_DIR/$mod" "$KGUEST_DIR/" + done +fi + +INITRD=$(mktemp -t voidbox-qemu-initrd.XXXXXX.cpio.gz) +trap "rm -f $INITRD; ${cleanup_rootfs:-true}" EXIT +(cd "$ROOTFS_DIR" && find . | cpio -H newc -o 2>/dev/null | gzip > "$INITRD") + +# --------------------------------------------------------------------------- +# Host-side echo server. Host port can be passed in via env; pick a free one +# if the default is in use. +# --------------------------------------------------------------------------- +SERVER_PIDFILE=$(mktemp) +python3 - < "$SERVER_PIDFILE" +trap "kill $SERVER_PID 2>/dev/null; rm -f $INITRD $SERVER_PIDFILE; ${cleanup_rootfs:-true}" EXIT +sleep 0.3 + +# --------------------------------------------------------------------------- +# Backend: spin up passt if requested. +# --------------------------------------------------------------------------- +PASST_PID="" +PASST_SOCK="" +NETDEV_ARGS="" +case "$BACKEND" in + libslirp) + NETDEV_ARGS="-netdev user,id=n0 -device virtio-net-pci,netdev=n0" + ;; + passt) + [[ -x "$PASST" ]] || { echo "ERROR: passt not executable: $PASST" >&2; exit 2; } + PASST_SOCK=$(mktemp -u -t voidbox-passt.XXXXXX.sock) + rm -f "$PASST_SOCK" + "$PASST" -f -s "$PASST_SOCK" \ + -a "$GUEST_ADDR" -n 24 -g "$GUEST_GATEWAY" \ + --map-host-loopback "$GUEST_GATEWAY" \ + -q >/tmp/passt.log 2>&1 & + PASST_PID=$! 
+ sleep 0.4 + [[ -S "$PASST_SOCK" ]] || { echo "ERROR: passt socket not created" >&2; exit 3; } + NETDEV_ARGS="-netdev stream,id=n0,addr.type=unix,addr.path=$PASST_SOCK -device virtio-net-pci,netdev=n0" + trap "kill $SERVER_PID $PASST_PID 2>/dev/null; rm -f $INITRD $SERVER_PIDFILE $PASST_SOCK; ${cleanup_rootfs:-true}" EXIT + ;; +esac + +# --------------------------------------------------------------------------- +# Boot qemu, capture serial output. +# --------------------------------------------------------------------------- +QEMU_LOG=$(mktemp -t voidbox-qemu.XXXXXX.log) +trap "kill ${SERVER_PID} ${PASST_PID:-} 2>/dev/null; rm -f $INITRD $SERVER_PIDFILE $QEMU_LOG ${PASST_SOCK:-}; ${cleanup_rootfs:-true}" EXIT + +# shellcheck disable=SC2086 +HOST_PORT="$HOST_PORT" timeout 60 qemu-system-x86_64 \ + -enable-kvm -cpu host -m 512 -smp 1 \ + -kernel "$KERNEL" \ + -initrd "$INITRD" \ + -nographic -no-reboot \ + -append "console=ttyS0 reboot=t panic=1 quiet crr_target=${GUEST_GATEWAY}:${HOST_PORT}:${ITERATIONS} crr_net=${GUEST_ADDR}/24,${GUEST_GATEWAY}" \ + $NETDEV_ARGS \ + > "$QEMU_LOG" 2>&1 || true + +# Extract the one-line crr-client output between sentinels. +RESULT=$(sed -n '/===CRR-START===/,/===CRR-END/p' "$QEMU_LOG" | grep -E '^[0-9]+ [0-9]+ [0-9]+ [0-9]+$' | head -1 || true) + +if [[ -z "$RESULT" ]]; then + echo "ERROR: no result from guest (qemu log tail follows):" >&2 + tail -20 "$QEMU_LOG" >&2 + exit 4 +fi + +read -r N P50_NS P99_NS MEAN_NS <<<"$RESULT" +P50_US=$((P50_NS / 1000)) +P99_US=$((P99_NS / 1000)) +MEAN_US=$((MEAN_NS / 1000)) +echo "qemu+${BACKEND} CRR over $N iterations: p50=${P50_US} µs, p99=${P99_US} µs, mean=${MEAN_US} µs" >&2 +echo "$RESULT" diff --git a/tools/qemu-init.sh b/tools/qemu-init.sh new file mode 100755 index 00000000..1654868b --- /dev/null +++ b/tools/qemu-init.sh @@ -0,0 +1,63 @@ +#!/bin/sh +# tools/qemu-init.sh — /init for the SLIRP-vs-SLIRP comparison guest. +# +# Used by scripts/bench-qemu-slirp.sh. 
Read /proc/cmdline for: +# crr_target=HOST:PORT:N target server + iteration count +# crr_net=ADDR/MASK,GW static network config +# +# Bring up eth0 with the static IP, run /tmp/crr-client, and halt. +# The script is paranoid about busybox-vs-distro variations: virtio-net +# is loaded as a module if present (Fedora-style), or assumed built-in +# (voidbox's slim kernel). + +set +e +mount -t proc proc /proc 2>/dev/null +mount -t sysfs sysfs /sys 2>/dev/null + +cmdline="$(cat /proc/cmdline)" +target="" +net="" +for tok in $cmdline; do + case "$tok" in + crr_target=*) target="${tok#crr_target=}" ;; + crr_net=*) net="${tok#crr_net=}" ;; + esac +done + +if [ -z "$target" ] || [ -z "$net" ]; then + echo "ERROR: missing crr_target or crr_net on cmdline" + echo "cmdline: $cmdline" + poweroff -f +fi + +addr_mask="${net%,*}" +gw="${net#*,}" +host="${target%%:*}" +rest="${target#*:}" +port="${rest%%:*}" +n="${rest#*:}" + +busybox ifconfig lo up + +# Load virtio modules if shipped in the rootfs (distro-kernel case). +# Voidbox's slim kernel has them built-in so insmod fails harmlessly. +for mod in failover net_failover virtio_net; do + busybox find /lib/modules -name "${mod}.ko*" -exec busybox insmod {} \; 2>/dev/null +done + +i=0 +while [ $i -lt 30 ] && ! busybox ifconfig eth0 >/dev/null 2>&1; do + sleep 0.1 + i=$((i+1)) +done + +busybox ifconfig eth0 "${addr_mask%/*}" netmask 255.255.255.0 up +busybox route add default gw "$gw" + +echo "===CRR-START===" +echo "addr=${addr_mask} gw=${gw} target=${host}:${port} n=${n}" +/tmp/crr-client "$host" "$port" "$n" +rc=$? 
+echo "===CRR-END (rc=$rc)===" + +poweroff -f From 4ec59f9f5a5de1e9aeacfca0b767ffb973e000fa Mon Sep 17 00:00:00 2001 From: diego Date: Wed, 6 May 2026 15:48:01 -0300 Subject: [PATCH 04/19] perf(virtio-net): hot-path cleanups + suppress redundant IRQ pulses Four small wins on the per-packet path between the SlirpBackend's inject queue and the guest, identified by the SLIRP-vs-SLIRP comparison (voidbox 421 us p50 vs qemu+passt 163 us p50 on the single-process TCP CRR benchmark). src/devices/virtio_net.rs::try_inject_rx - Read avail.idx ONCE per call instead of per frame. The driver only bumps it when adding new buffers; per-frame re-reads are redundant guest-memory accesses. - Replace 'let used_elem = [...].concat()' with a stack [u8; 8]. The previous code allocated a Vec per injected frame in the hot path; the new code costs four byte copies and zero allocs. - Write used.idx ONCE at the end of the batch rather than after every frame. The virtio spec only requires a single update per publish; per-frame writes were redundant guest-memory accesses. - Return frames_injected (usize) so callers can pulse the IRQ line conditionally on actual new RX work. src/devices/virtio_net.rs::process_tx_queue - Replace per-frame Vec::concat with stack [u8; 8] (same fix as the RX path). - Read each TX descriptor segment directly into the packet buffer via packet.resize() + mem.read(&mut packet[off..]) instead of allocating an intermediate Vec and extend_from_slice'ing. Saves one allocation and one full memcpy per descriptor segment. - Reuse a single Vec packet buffer with capacity 1600 across all frames in the call instead of allocating fresh per frame. - Batch used.idx update at end of the batch (same as RX). src/vmm/mod.rs::net_poll_thread - Track previous-cycle pending state. Pulse KVM_IRQ_LINE only when (a) we actually injected new RX frames this cycle OR (b) interrupt_status went from clear -> pending across cycles. 
Previously the loop pulsed twice (assert level=1, then deassert level=0) on every cycle while interrupt_status was non-zero, even when the guest hadn't acked the previous pulse and no new work had arrived. Skipping the pulse pair when there's nothing new saves two ioctl(KVM_IRQ_LINE) calls per redundant cycle (~5-10 us each on the CRR hot path). Effect on the single-process CRR p50 (mean of 5 runs of 30 iterations each, voidbox+voidbox-SLIRP): before: 421 us p50 mean after: 380 us p50 mean (~10% improvement) The IRQ pulse change is the dominant contributor; the RX/TX heap allocation removals are correct cleanup but contribute below sample variance. Voidbox's gap to qemu+passt (163 us) shrinks from 2.6x to 2.3x; remaining gap candidates are MMIO exit cost, KVM_IRQ_LINE vs irqfd, and SlirpBackend lock contention. --- src/devices/virtio_net.rs | 108 ++++++++++++++++++++++++++------------ src/vmm/mod.rs | 29 +++++++--- 2 files changed, 97 insertions(+), 40 deletions(-) diff --git a/src/devices/virtio_net.rs b/src/devices/virtio_net.rs index 71214d47..c6cdca5c 100644 --- a/src/devices/virtio_net.rs +++ b/src/devices/virtio_net.rs @@ -451,6 +451,16 @@ impl VirtioNetDevice { .map_err(|e| crate::Error::Memory(e.to_string()))?; let avail_idx = u16::from_le_bytes(idx_buf); + let initial_tx_used_idx = self.tx_used_idx; + + // Reusable per-call packet buffer. Capacity carried across + // iterations within this call so chained-descriptor frames don't + // re-grow the buffer; cleared between frames so each + // process_tx_frame sees only this frame's bytes. Pre-size to + // a typical MTU + virtio-net header so the common single-segment + // path needs no realloc. 
+ let mut packet: Vec = Vec::with_capacity(1600); + while self.tx_avail_idx != avail_idx { // Ring entry: 2 bytes, at avail_addr + 4 + (tx_avail_idx % queue_size)*2 let ring_offset = 4 + ((self.tx_avail_idx as usize) % queue_size) * 2; @@ -462,8 +472,7 @@ impl VirtioNetDevice { .map_err(|e| crate::Error::Memory(e.to_string()))?; let head_idx = u16::from_le_bytes(desc_id_buf) as usize; - // Walk descriptor chain and collect packet - let mut packet = Vec::new(); + packet.clear(); let mut next = head_idx; loop { if next >= queue_size { @@ -478,10 +487,14 @@ impl VirtioNetDevice { let flags = u16::from_le_bytes(desc[12..14].try_into().unwrap()); let next_desc = u16::from_le_bytes(desc[14..16].try_into().unwrap()) as usize; if len > 0 && addr != 0 { - let mut buf = vec![0u8; len]; - mem.read(&mut buf, GuestAddress(addr)) + // Read directly into the packet's tail instead of + // allocating an intermediate `Vec` and then + // `extend_from_slice`-ing it in. Saves one alloc + // and one full memcpy per descriptor segment. + let off = packet.len(); + packet.resize(off + len, 0); + mem.read(&mut packet[off..off + len], GuestAddress(addr)) .map_err(|e| crate::Error::Memory(e.to_string()))?; - packet.extend_from_slice(&buf); } if (flags & VIRTQ_DESC_F_NEXT) == 0 { break; @@ -493,20 +506,27 @@ impl VirtioNetDevice { self.process_tx_frame(&packet)?; } - // Write used ring: used->ring[tx_used_idx % queue_size] = { id: head_idx, len: 0 } + // Used-ring entry: 8 bytes (head_idx as u32, 0 as u32). + // Built on the stack to avoid heap-alloc-per-frame from + // `[...].concat()`. TX descriptors carry no return data + // so the length field is always 0. 
+ let used_ring_off = 4 + ((self.tx_used_idx as usize) % queue_size) * 8; - let used_elem = [ - (head_idx as u32).to_le_bytes(), - 0u32.to_le_bytes(), // len for TX typically 0 - ] - .concat(); + let mut used_elem = [0u8; 8]; + used_elem[0..4].copy_from_slice(&(head_idx as u32).to_le_bytes()); + // bytes [4..8] stay zero (the length field). mem.write(&used_elem, used_addr.unchecked_add(used_ring_off as u64)) .map_err(|e| crate::Error::Memory(e.to_string()))?; self.tx_used_idx = self.tx_used_idx.wrapping_add(1); self.tx_avail_idx = self.tx_avail_idx.wrapping_add(1); + } - // Update used.idx so guest sees progress + // Publish used.idx ONCE per batch instead of after every frame. + // virtio spec: the device updates the used-ring entries first, + // then bumps used.idx; the guest reads used.idx with a memory + // barrier and iterates new entries. Per-frame writes are + // redundant for correctness and waste one mem.write per frame. + if self.tx_used_idx != initial_tx_used_idx { let used_idx_bytes = self.tx_used_idx.to_le_bytes(); mem.write(&used_idx_bytes, used_addr.unchecked_add(2u64)) .map_err(|e| crate::Error::Memory(e.to_string()))?; @@ -517,10 +537,16 @@ impl VirtioNetDevice { } /// Try to inject received frames from SLIRP into guest RX queue. Call from vCPU loop or after RX notify. - pub fn try_inject_rx(&mut self, mem: &M) -> Result<()> { + /// + /// Returns the number of frames written into the guest's RX ring by + /// this call (0 if none). Callers can use this to decide whether to + /// raise an IRQ — pulsing the line is only useful when the guest + /// has new work to do, not on every poll cycle while interrupt_status + /// is still set from an earlier (un-acked) injection. 
+ pub fn try_inject_rx(&mut self, mem: &M) -> Result { let frames = self.get_rx_frames(); if frames.is_empty() { - return Ok(()); + return Ok(0); } let q = &self.rx_queue; @@ -533,30 +559,31 @@ impl VirtioNetDevice { frames.len() ); self.rx_buffer.extend(frames); - return Ok(()); + return Ok(0); } let desc_addr = GuestAddress(q.desc_addr); let avail_addr = GuestAddress(q.driver_addr); let used_addr = GuestAddress(q.device_addr); let queue_size = q.num as usize; + // avail_idx is monotonically increasing; the driver bumps it + // whenever it adds new buffers. Read it once per try_inject_rx + // call rather than per frame — saves one mem.read per frame in + // the hot path. If the device runs out of available buffers + // mid-batch the remaining frames are buffered for the next + // call, which is the same correctness contract as before. + let mut idx_buf = [0u8; 2]; + mem.read(&mut idx_buf, avail_addr.unchecked_add(2u64)) + .map_err(|e| crate::Error::Memory(e.to_string()))?; + let avail_idx = u16::from_le_bytes(idx_buf); + + let mut frames_injected: u16 = 0; + for frame in frames { - // Read available ring: how many buffers has driver given us? - let mut idx_buf = [0u8; 2]; - mem.read(&mut idx_buf, avail_addr.unchecked_add(2u64)) - .map_err(|e| crate::Error::Memory(e.to_string()))?; - let avail_idx = u16::from_le_bytes(idx_buf); if self.rx_avail_idx == avail_idx { - debug!("virtio-net: RX no available buffers (avail_idx={}, our_idx={}), buffering frame ({} bytes)", - avail_idx, self.rx_avail_idx, frame.len()); self.rx_buffer.push(frame); continue; } - debug!( - "virtio-net: RX injecting frame ({} bytes), avail_idx={}", - frame.len(), - avail_idx - ); let ring_offset = 4 + ((self.rx_avail_idx as usize) % queue_size) * 2; let mut desc_id_buf = [0u8; 2]; @@ -599,25 +626,38 @@ impl VirtioNetDevice { next = next_desc; } + // Used-ring entry is exactly 8 bytes (2x u32, little-endian). 
+ // Build it on the stack instead of allocating a Vec via + // `[...].concat()` — the previous code did a heap alloc per + // frame in the hot path. let used_ring_off = 4 + ((self.rx_used_idx as usize) % queue_size) * 8; - let used_elem = [ - (head_idx as u32).to_le_bytes(), - (written as u32).to_le_bytes(), - ] - .concat(); + let mut used_elem = [0u8; 8]; + used_elem[0..4].copy_from_slice(&(head_idx as u32).to_le_bytes()); + used_elem[4..8].copy_from_slice(&(written as u32).to_le_bytes()); mem.write(&used_elem, used_addr.unchecked_add(used_ring_off as u64)) .map_err(|e| crate::Error::Memory(e.to_string()))?; self.rx_used_idx = self.rx_used_idx.wrapping_add(1); self.rx_avail_idx = self.rx_avail_idx.wrapping_add(1); + frames_injected = frames_injected.wrapping_add(1); + } + // Publish the new used.idx ONCE at the end of the batch. The + // virtio spec only requires the device to update used.idx after + // it has written all corresponding used-ring entries; the guest + // reads used.idx with a memory barrier and then iterates new + // entries. Per-frame writes are redundant — saves one + // mem.write per frame on the hot path. + if frames_injected > 0 { let used_idx_bytes = self.rx_used_idx.to_le_bytes(); mem.write(&used_idx_bytes, used_addr.unchecked_add(2u64)) .map_err(|e| crate::Error::Memory(e.to_string()))?; } - self.interrupt_status |= 1; - Ok(()) + if frames_injected > 0 { + self.interrupt_status |= 1; + } + Ok(frames_injected as usize) } /// Reset device to initial state diff --git a/src/vmm/mod.rs b/src/vmm/mod.rs index 97fe2d0f..cea4a807 100644 --- a/src/vmm/mod.rs +++ b/src/vmm/mod.rs @@ -1649,6 +1649,13 @@ fn net_poll_thread(net_dev: Arc>, vm: Arc, running: A let mut epoll_events: Vec = Vec::new(); + // Tracks whether the device's interrupt_status was non-zero on the + // previous cycle. 
Used to decide whether to pulse the IRQ line: + // we pulse only on transitions clear→pending (or when new RX frames + // are injected this cycle), not on every cycle where pending is + // still set from an un-acked earlier pulse. + let mut prev_pending: bool = false; + while running.load(Ordering::Relaxed) { // Block outside the device lock: either on epoll readiness or a short // sleep. This lets the vCPU thread acquire the device lock without @@ -1692,18 +1699,28 @@ fn net_poll_thread(net_dev: Arc>, vm: Arc, running: A } } - let has_interrupt = { + let (frames_injected, has_interrupt) = { let mut guard = match net_dev.lock() { Ok(g) => g, Err(_) => continue, }; - let _ = guard.try_inject_rx(guest_memory); - guard.has_pending_interrupt() + let injected = guard.try_inject_rx(guest_memory).unwrap_or(0); + (injected, guard.has_pending_interrupt()) }; - // Always pulse IRQ10 while pending; this prevents RX stalls if - // an earlier edge was missed by the guest. - if has_interrupt { + // Pulse IRQ10 only when there is *new* work for the guest: + // - frames just injected this cycle, OR + // - interrupt_status went from clear → pending (TX completion + // by the vCPU thread between cycles). + // Skipping pulses when the guest hasn't acknowledged a previous + // pulse saves two ioctl(KVM_IRQ_LINE) calls per cycle (~5–10 µs + // on the CRR hot path). If we pulse once and the guest's + // ISR services the queue, has_pending_interrupt will be false + // on the next cycle and `prev_pending` resets. 
+ let now_pending = has_interrupt; + let pulse = frames_injected > 0 || (now_pending && !prev_pending); + prev_pending = now_pending; + if pulse { let assert_irq = KvmIrqLevel { irq: 10, level: 1 }; // SAFETY: KVM_IRQ_LINE ioctl writes the KvmIrqLevel struct into // the in-kernel APIC; the struct is #[repr(C)] and the fd is valid From eef4aebfbfb98608629a24a612ffb2d7b53299a0 Mon Sep 17 00:00:00 2001 From: diego Date: Wed, 6 May 2026 15:57:26 -0300 Subject: [PATCH 05/19] perf(vmm): IRQ delivery via KVM_IRQFD instead of KVM_IRQ_LINE pair The voidbox net-poll thread was raising IRQ 10 with two ioctl(KVM_IRQ_LINE) calls per pulse: assert level=1, then deassert level=0. Each ioctl is a syscall (~few us each on KVM); on the TCP CRR hot path with multiple IRQ deliveries per connection, the ioctl pair became a measurable share of per-iteration cost. Replace with KVM_IRQFD: one eventfd registered with the in-kernel irqchip via vm_fd().register_irqfd(&eventfd, 10) at thread startup. Pulsing the IRQ is now a single 8-byte write to the eventfd; the kernel asserts the IRQ line directly without a userspace round-trip through ioctl(). The legacy KVM_IRQ_LINE path is kept as a fallback when irqfd registration fails (kernel without irqfd support, irqchip routing not initialised). In normal operation the eventfd succeeds at startup and the legacy ioctls never run. Effect on the single-process CRR p50 (mean over 5 runs of 30 iterations, voidbox+voidbox-SLIRP): before this commit: ~380 us p50 after this commit: ~335 us p50 (~12% reduction) Cumulative with the previous virtio-net hot-path cleanups: baseline: 421 us p50 after all fixes: ~335 us p50 (~20% cumulative reduction) Voidbox's gap to qemu+passt (163 us) shrinks from 2.6x to 2.0x. 
--- src/vmm/mod.rs | 53 ++++++++++++++++++++++++++++++++++++++++---------- 1 file changed, 43 insertions(+), 10 deletions(-) diff --git a/src/vmm/mod.rs b/src/vmm/mod.rs index cea4a807..f1151a60 100644 --- a/src/vmm/mod.rs +++ b/src/vmm/mod.rs @@ -1656,6 +1656,33 @@ fn net_poll_thread(net_dev: Arc>, vm: Arc, running: A // still set from an un-acked earlier pulse. let mut prev_pending: bool = false; + // KVM_IRQFD: register an eventfd that asserts IRQ 10 when written. + // Writing 8 bytes to the eventfd is one syscall; the kernel signals + // the in-kernel irqchip directly. This replaces the pair of + // KVM_IRQ_LINE ioctls (assert level=1 / deassert level=0) with a + // single write. If setup fails (kernel without irqfd, broken irqchip + // routing) we fall back to the ioctl path below. + let irq_eventfd: Option = + match vmm_sys_util::eventfd::EventFd::new(libc::EFD_NONBLOCK) { + Ok(fd) => match vm.vm_fd().register_irqfd(&fd, 10) { + Ok(()) => Some(fd), + Err(e) => { + debug!( + "net-poll: KVM_IRQFD register failed; falling back to KVM_IRQ_LINE: {}", + e + ); + None + } + }, + Err(e) => { + debug!( + "net-poll: eventfd create failed; falling back to KVM_IRQ_LINE: {}", + e + ); + None + } + }; + while running.load(Ordering::Relaxed) { // Block outside the device lock: either on epoll readiness or a short // sleep. This lets the vCPU thread acquire the device lock without @@ -1721,16 +1748,22 @@ fn net_poll_thread(net_dev: Arc>, vm: Arc, running: A let pulse = frames_injected > 0 || (now_pending && !prev_pending); prev_pending = now_pending; if pulse { - let assert_irq = KvmIrqLevel { irq: 10, level: 1 }; - // SAFETY: KVM_IRQ_LINE ioctl writes the KvmIrqLevel struct into - // the in-kernel APIC; the struct is #[repr(C)] and the fd is valid - // for the lifetime of `vm`. 
- unsafe { - libc::ioctl(vm_fd, KVM_IRQ_LINE as _, &assert_irq); - } - let deassert_irq = KvmIrqLevel { irq: 10, level: 0 }; - unsafe { - libc::ioctl(vm_fd, KVM_IRQ_LINE as _, &deassert_irq); + if let Some(ref efd) = irq_eventfd { + // Fast path: KVM_IRQFD. One 8-byte write to the eventfd; + // the kernel asserts IRQ 10 directly. No ioctl pair. + let _ = efd.write(1); + } else { + let assert_irq = KvmIrqLevel { irq: 10, level: 1 }; + // SAFETY: KVM_IRQ_LINE ioctl writes the KvmIrqLevel struct into + // the in-kernel APIC; the struct is #[repr(C)] and the fd is valid + // for the lifetime of `vm`. + unsafe { + libc::ioctl(vm_fd, KVM_IRQ_LINE as _, &assert_irq); + } + let deassert_irq = KvmIrqLevel { irq: 10, level: 0 }; + unsafe { + libc::ioctl(vm_fd, KVM_IRQ_LINE as _, &deassert_irq); + } } } } From e08224d85a7f1eb7df943ecea26d72b4fb207f81 Mon Sep 17 00:00:00 2001 From: diego Date: Wed, 6 May 2026 16:08:56 -0300 Subject: [PATCH 06/19] perf(vmm): KVM_IOEVENTFD for virtio-net TX queue notify MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit Without ioeventfd, every guest TX (write to QUEUE_NOTIFY MMIO with value=1) forces a KVM_RUN exit: vCPU thread dispatches into virtio-net's write_mmio handler, calls process_tx_queue, then re-enters KVM_RUN. On the TCP CRR hot path with multiple TX per connection that's a few microseconds of pure VM-exit overhead per packet on top of the actual network work. Register the eventfd at MMIO addr 0xd000_0050 with datamatch=1 (TX queue notify only). Now KVM consumes the matching MMIO write in-kernel and signals the eventfd; vCPU continues running uninterrupted. Net-poll thread sees the eventfd alongside flow events on the existing EpollDispatch (under a token in a tag space that doesn't collide with PROTO_TAG_*), drains it, and calls process_tx_queue on its own schedule. 
Notifies for queue 0 (RX, value=0) still take the slow path through the MMIO write handler — they're rare (only when guest adds new RX buffers) so the optimisation isn't needed there. Falls back to the synchronous MMIO-exit path if eventfd creation or KVM_IOEVENTFD registration fails. Effect on the single-process CRR p50 (mean over 5 runs of 30 iterations, voidbox+voidbox-SLIRP): before this commit: ~335 us p50 after this commit: ~278 us p50 (~17% reduction) Cumulative across the recent perf series: baseline: 421 us p50 + virtio-net cleanups: ~380 us p50 + KVM_IRQFD: ~335 us p50 + KVM_IOEVENTFD: ~278 us p50 (~34% cumulative) Voidbox's gap to qemu+passt (163 us) shrinks from 2.6x to 1.7x. --- src/devices/virtio_net.rs | 11 ++++ src/vmm/mod.rs | 104 ++++++++++++++++++++++++++++++++++++-- 2 files changed, 110 insertions(+), 5 deletions(-) diff --git a/src/devices/virtio_net.rs b/src/devices/virtio_net.rs index c6cdca5c..25288530 100644 --- a/src/devices/virtio_net.rs +++ b/src/devices/virtio_net.rs @@ -434,6 +434,17 @@ impl VirtioNetDevice { } } + /// Process the TX queue from outside the vCPU thread. + /// + /// Called by `net_poll_thread` when the KVM_IOEVENTFD registered for + /// the virtio-net QUEUE_NOTIFY MMIO fires. Same body as the + /// synchronous TX-queue handler used from the MMIO write path, + /// just exposed under a different name so callers outside this + /// module can drive it. + pub fn process_tx_queue_external(&mut self, mem: &M) -> Result<()> { + self.process_tx_queue(mem) + } + /// Process TX queue: read descriptor chains from guest, send frames to SLIRP, update used ring. fn process_tx_queue(&mut self, mem: &M) -> Result<()> { let q = &self.tx_queue; diff --git a/src/vmm/mod.rs b/src/vmm/mod.rs index f1151a60..301f09fa 100644 --- a/src/vmm/mod.rs +++ b/src/vmm/mod.rs @@ -1683,6 +1683,74 @@ fn net_poll_thread(net_dev: Arc>, vm: Arc, running: A } }; + // KVM_IOEVENTFD for the virtio-net TX queue notify. 
+ // + // Without this, every guest TX (write to QUEUE_NOTIFY MMIO with value=1) + // forces a KVM_RUN exit, the vCPU thread dispatches into virtio-net's + // MMIO write handler, then calls process_tx_queue and re-enters KVM_RUN. + // ~1–5 µs per packet of pure VM-exit overhead. + // + // With KVM_IOEVENTFD: the guest's MMIO write is consumed in-kernel, + // KVM signals the eventfd, and the vCPU thread continues running. + // The net-poll thread sees the eventfd as another epoll source, drains + // it, and calls process_tx_queue asynchronously. No vCPU exit. + // + // Address: virtio-net mmio_base (0xd000_0000) + QUEUE_NOTIFY offset + // (0x050) = 0xd000_0050. Datamatch=1 triggers only on TX queue + // notifies (value=1 → queue index 1 = transmit queue). Notifies for + // queue 0 (RX) still take the slow path through MMIO; they're rare + // (only when guest adds new RX buffers) so the optimisation isn't + // needed there. + const VIRTIO_NET_MMIO_BASE: u64 = 0xd000_0000; + const VIRTIO_NET_QUEUE_NOTIFY_OFFSET: u64 = 0x050; + const TX_NOTIFY_QUEUE_IDX: u32 = 1; + let tx_notify_eventfd: Option = + match vmm_sys_util::eventfd::EventFd::new(libc::EFD_NONBLOCK) { + Ok(fd) => { + let mmio_addr = kvm_ioctls::IoEventAddress::Mmio( + VIRTIO_NET_MMIO_BASE + VIRTIO_NET_QUEUE_NOTIFY_OFFSET, + ); + match vm + .vm_fd() + .register_ioevent(&fd, &mmio_addr, TX_NOTIFY_QUEUE_IDX) + { + Ok(()) => Some(fd), + Err(e) => { + debug!( + "net-poll: KVM_IOEVENTFD register failed; TX notifies will continue to take MMIO exits: {}", + e + ); + None + } + } + } + Err(e) => { + debug!( + "net-poll: eventfd create for tx-notify failed; falling back to MMIO-exit TX path: {}", + e + ); + None + } + }; + // Token used to identify the TX-notify eventfd in epoll readiness + // events. Lives in a tag space that doesn't collide with the + // PROTO_TAG_* values SlirpBackend uses for flow tokens. 
+ const TX_NOTIFY_TOKEN: u64 = 0x4000_0000_0000_0000; + if let Some(ref fd) = tx_notify_eventfd { + if let Some(ref ep_arc) = epoll_arc { + if let Err(e) = ep_arc.register( + fd.as_raw_fd(), + TX_NOTIFY_TOKEN, + crate::network::epoll_dispatch::RegisterMode::Read, + ) { + debug!( + "net-poll: failed to register tx-notify eventfd with epoll dispatch: {}", + e + ); + } + } + } + while running.load(Ordering::Relaxed) { // Block outside the device lock: either on epoll readiness or a short // sleep. This lets the vCPU thread acquire the device lock without @@ -1715,11 +1783,37 @@ fn net_poll_thread(net_dev: Arc>, vm: Arc, running: A IDLE_TIMEOUT }; - // Push ready events into the backend's queue before acquiring the - // device lock for inject/IRQ work. drain_to_guest will consume them - // without re-locking EpollDispatch, eliminating mutex contention - // between the net-poll thread's 50 ms blocking wait and the vCPU - // thread's process_guest_frame → drain_to_guest path. + // Filter out the TX-notify eventfd event (if any) before pushing + // the rest to the SLIRP backend. When the guest writes to the + // virtio-net QUEUE_NOTIFY MMIO with value=1, KVM consumes it + // in-kernel and signals our eventfd; we drain it here and call + // process_tx_queue ourselves — the vCPU thread never exits for + // that MMIO write. + let mut tx_notify_fired = false; + if tx_notify_eventfd.is_some() { + epoll_events.retain(|e| { + if e.token == TX_NOTIFY_TOKEN { + tx_notify_fired = true; + false + } else { + true + } + }); + } + if tx_notify_fired { + if let Some(ref efd) = tx_notify_eventfd { + let _ = efd.read(); + } + if let Ok(mut guard) = net_dev.lock() { + let _ = guard.process_tx_queue_external(guest_memory); + } + } + + // Push remaining (flow) events into the backend's queue before + // acquiring the device lock for inject/IRQ work. 
drain_to_guest + // will consume them without re-locking EpollDispatch, eliminating + // mutex contention between the net-poll thread's blocking wait and + // the vCPU thread's process_guest_frame → drain_to_guest path. if !epoll_events.is_empty() { if let Ok(guard) = net_dev.lock() { guard.push_events_to_backend(&epoll_events); From 255eb74b86a816735cec151ac82169fdd16f2846 Mon Sep 17 00:00:00 2001 From: diego Date: Wed, 6 May 2026 16:43:48 -0300 Subject: [PATCH 07/19] perf(virtio-net): lock-free RX hand-off via SegQueue (Option B) Restructures the host->guest RX path to eliminate the Arc<Mutex<VirtioNetDevice>> contention between the net-poll thread and the vCPU thread. Inspired by the user-suggested Option B: "net-poll -> rx_queue[vCPU] -> esa vCPU consume". Before: net-poll thread: let mut g = net_dev.lock(); // takes device mutex g.try_inject_rx(mem); // descriptor walk + writes drop(g); pulse_irq(); vCPU thread on MMIO exit: let g = net_dev.lock(); // waits for net-poll g.mmio_read(...); After: net-poll thread: drain backend frames into a Vec; // backend mutex only push each frame to pending_rx; // lock-free SegQueue pulse_irq(); // never touches device mutex vCPU thread on MMIO exit: let mut g = net_dev.lock(); // uncontended now g.flush_pending_rx(mem); // descriptor writes here g.mmio_read/mmio_write(...); Net-poll's hot path no longer holds the VirtioNetDevice mutex at all -- it only acquires the SLIRP backend Arc independently. vCPU's MMIO exits do the descriptor work in-context, paying for it once per exit but never waiting on a held lock. 
Implementation: src/devices/virtio_net.rs - new field pending_rx: Arc<SegQueue<Vec<u8>>> - pending_rx() accessor returns a clone of the Arc - slirp_arc() exposes the backend Arc for direct net-poll access - new method flush_pending_rx(&mut self, mem) drains the SegQueue and writes RX descriptors using the same loop as try_inject_rx - try_inject_rx is now a thin wrapper that calls a new shared helper write_frames_to_rx_ring; same behaviour, structured so flush_pending_rx can share the descriptor-writing logic. src/vmm/mod.rs::net_poll_thread - Cache pending_rx + slirp Arcs once at thread startup; never touch the VirtioNetDevice mutex on the per-cycle path. - Drain backend frames into a reusable Vec, wrap each with a virtio-net header, push to the SegQueue, then pulse the IRQ. src/vmm/cpu.rs (MMIO dispatch) - Call guard.flush_pending_rx(guest_memory) at the top of the virtio-net MMIO read AND write handlers. Materialises any frames the net-poll thread queued since the last MMIO exit. Adds: crossbeam-queue = "0.3". Effect on the single-process CRR p50 (mean over 5 runs of 30 iterations, voidbox+voidbox-SLIRP): before this commit: ~278 us p50 after this commit: ~265 us p50 (~5% reduction) Modest improvement on the single-vCPU benchmark we have available -- the win is mostly architectural (eliminates a contention point that will become more meaningful with multi-vCPU guests, higher pps, and parallel TX/RX paths). Cumulative across the whole perf series: baseline: 421 us p50 + virtio-net cleanups: ~380 us p50 + KVM_IRQFD: ~335 us p50 + KVM_IOEVENTFD: ~278 us p50 + Option B SegQueue: ~265 us p50 (~37% cumulative) Voidbox's gap to qemu+passt (163 us) is now ~1.6x. 
--- Cargo.lock | 10 ++++++ Cargo.toml | 5 +++ src/devices/virtio_net.rs | 67 +++++++++++++++++++++++++++++++++++++++ src/vmm/cpu.rs | 13 +++++++- src/vmm/mod.rs | 59 ++++++++++++++++++++++++++++++---- 5 files changed, 146 insertions(+), 8 deletions(-) diff --git a/Cargo.lock b/Cargo.lock index 455b1e9a..868e1c21 100644 --- a/Cargo.lock +++ b/Cargo.lock @@ -388,6 +388,15 @@ dependencies = [ "crossbeam-utils", ] +[[package]] +name = "crossbeam-queue" +version = "0.3.12" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "0f58bbc28f91df819d0aa2a2c00cd19754769c2fad90579b3592b1c9ba7a3115" +dependencies = [ + "crossbeam-utils", +] + [[package]] name = "crossbeam-utils" version = "0.8.21" @@ -2979,6 +2988,7 @@ dependencies = [ "byteorder", "bytes", "clap", + "crossbeam-queue", "dispatch2", "divan", "event-manager", diff --git a/Cargo.toml b/Cargo.toml index 50607e5f..af267aec 100644 --- a/Cargo.toml +++ b/Cargo.toml @@ -113,6 +113,11 @@ socket2 = { version = "0.5", features = ["all"] } # path of a NAT keyed by guest-side ports the guest itself chooses. rustc-hash = "2" +# Lock-free MPMC queue used to hand virtio-net RX frames from the +# net-poll thread to the vCPU thread without taking the +# `Arc>` device lock on the hot path. 
+crossbeam-queue = "0.3" + # --- macOS-only dependencies --- [target.'cfg(target_os = "macos")'.dependencies] # Objective-C 2.0 bindings (auto-generated from Apple frameworks) diff --git a/src/devices/virtio_net.rs b/src/devices/virtio_net.rs index 25288530..555db678 100644 --- a/src/devices/virtio_net.rs +++ b/src/devices/virtio_net.rs @@ -10,6 +10,7 @@ use std::sync::{Arc, Mutex}; +use crossbeam_queue::SegQueue; use tracing::{debug, trace, warn}; use vm_memory::{Address, Bytes, GuestAddress, GuestMemory}; @@ -181,6 +182,16 @@ pub struct VirtioNetDevice { rx_avail_idx: u16, /// RX queue: next used index we'll write rx_used_idx: u16, + /// Lock-free queue of frames waiting to be written into the guest's + /// RX descriptors. The net-poll thread pushes frames here without + /// taking the device lock; the vCPU thread drains them on its next + /// MMIO exit (via [`Self::flush_pending_rx`]) and writes the + /// descriptors in its own context. + /// + /// Eliminates the `Arc>` contention that + /// previously serialised every net-poll-side `try_inject_rx` call + /// against vCPU MMIO exits. + pending_rx: Arc>>, } impl VirtioNetDevice { @@ -218,9 +229,30 @@ impl VirtioNetDevice { tx_used_idx: 0, rx_avail_idx: 0, rx_used_idx: 0, + pending_rx: Arc::new(SegQueue::new()), }) } + /// Returns a clone of the lock-free RX frame queue Arc. + /// + /// The net-poll thread holds this clone and pushes frames to it + /// without ever taking the [`VirtioNetDevice`] mutex. The vCPU + /// thread (which already holds the device mutex during MMIO + /// dispatch) drains it via [`Self::flush_pending_rx`]. + pub fn pending_rx(&self) -> Arc>> { + Arc::clone(&self.pending_rx) + } + + /// Returns a clone of the [`NetworkBackend`] arc. + /// + /// Lets the net-poll thread call `drain_to_guest` directly without + /// going through the device mutex. Combined with [`Self::pending_rx`], + /// this removes the `Arc>` contention point + /// from the per-packet RX hot path. 
+ pub fn slirp_arc(&self) -> Arc> { + Arc::clone(&self.slirp) + } + /// Set the MMIO base address pub fn set_mmio_base(&mut self, base: u64) { self.mmio_base = base; @@ -547,6 +579,28 @@ impl VirtioNetDevice { Ok(()) } + /// Drain frames pushed into [`Self::pending_rx`] by the net-poll + /// thread and write them into the guest's RX descriptors. + /// + /// Same descriptor-walking shape as [`Self::try_inject_rx`], but + /// the input frames come from the lock-free SegQueue instead of + /// going through the (locked) network backend. The vCPU thread + /// calls this on every MMIO entry to virtio-net, materialising any + /// frames the net-poll thread queued since the last MMIO exit. + /// + /// Returns the number of frames written to the RX ring this call. + pub fn flush_pending_rx(&mut self, mem: &M) -> Result { + let mut frames: Vec> = Vec::new(); + while let Some(f) = self.pending_rx.pop() { + frames.push(f); + } + if !frames.is_empty() { + self.write_frames_to_rx_ring(frames, mem) + } else { + Ok(0) + } + } + /// Try to inject received frames from SLIRP into guest RX queue. Call from vCPU loop or after RX notify. /// /// Returns the number of frames the guest now has visible in its RX @@ -559,7 +613,20 @@ impl VirtioNetDevice { if frames.is_empty() { return Ok(0); } + self.write_frames_to_rx_ring(frames, mem) + } + /// Write a batch of fully-formed frames (already including the + /// virtio-net header) into the guest's RX descriptor ring. + /// + /// Shared between [`Self::try_inject_rx`] (frames pulled from the + /// network backend) and [`Self::flush_pending_rx`] (frames pushed + /// by the net-poll thread into the lock-free SegQueue). 
+ fn write_frames_to_rx_ring( + &mut self, + frames: Vec>, + mem: &M, + ) -> Result { let q = &self.rx_queue; if !q.ready || q.num == 0 { // Queue not ready - buffer frames for later diff --git a/src/vmm/cpu.rs b/src/vmm/cpu.rs index 61008a0a..41f86920 100644 --- a/src/vmm/cpu.rs +++ b/src/vmm/cpu.rs @@ -249,8 +249,14 @@ fn vcpu_run_loop( } VcpuExit::MmioRead(addr, data) => { let handled = if let Some(ref dev) = mmio_devices.virtio_net { - let guard = dev.lock().unwrap(); + let mut guard = dev.lock().unwrap(); if guard.handles_mmio(addr) { + // Materialise any frames the net-poll thread + // pushed into pending_rx since our last MMIO + // entry — writes them into the guest's RX + // descriptors in our context, no cross-thread + // lock contention. + let _ = guard.flush_pending_rx(guest_memory); let offset = addr - guard.mmio_base(); guard.mmio_read(offset, data); true @@ -305,6 +311,11 @@ fn vcpu_run_loop( let handled = if let Some(ref dev) = mmio_devices.virtio_net { let mut guard = dev.lock().unwrap(); if guard.handles_mmio(addr) { + // Same pre-flush as the MMIO-read path: the + // guest may write INTERRUPT_ACK or another + // register before reading INTERRUPT_STATUS, + // so we materialise pending frames here too. + let _ = guard.flush_pending_rx(guest_memory); let offset = addr - guard.mmio_base(); guard.mmio_write(offset, data, Some(guest_memory)); true diff --git a/src/vmm/mod.rs b/src/vmm/mod.rs index 301f09fa..2e6716fa 100644 --- a/src/vmm/mod.rs +++ b/src/vmm/mod.rs @@ -1751,6 +1751,22 @@ fn net_poll_thread(net_dev: Arc>, vm: Arc, running: A } } + // Lock-free hand-off queue + direct backend Arc, pulled out of the + // device once at thread startup so the per-cycle hot path doesn't + // need to acquire the VirtioNetDevice mutex just to read backend + // frames. The vCPU thread drains `pending_rx` on each MMIO entry + // (see vmm/cpu.rs), so this thread only needs to push frames. 
+ type PendingRxArc = std::sync::Arc>>; + type BackendArc = std::sync::Arc>; + let (pending_rx_arc, slirp_arc): (Option, Option) = + match net_dev.lock() { + Ok(g) => (Some(g.pending_rx()), Some(g.slirp_arc())), + Err(_) => (None, None), + }; + + // Reusable buffer for frames pulled from the backend each cycle. + let mut rx_scratch: Vec> = Vec::new(); + while running.load(Ordering::Relaxed) { // Block outside the device lock: either on epoll readiness or a short // sleep. This lets the vCPU thread acquire the device lock without @@ -1820,14 +1836,43 @@ fn net_poll_thread(net_dev: Arc>, vm: Arc, running: A } } - let (frames_injected, has_interrupt) = { - let mut guard = match net_dev.lock() { - Ok(g) => g, - Err(_) => continue, - }; - let injected = guard.try_inject_rx(guest_memory).unwrap_or(0); - (injected, guard.has_pending_interrupt()) + // Drain backend frames into the pending_rx SegQueue WITHOUT + // touching the VirtioNetDevice mutex. The vCPU thread will + // materialise them into RX descriptors on its next MMIO entry + // via VirtioNetDevice::flush_pending_rx (see vmm/cpu.rs). + // + // This breaks the old contention pattern where the net-poll + // thread held the VirtioNetDevice lock for the duration of + // try_inject_rx (descriptor walk + memory writes), forcing the + // vCPU thread to wait on every MMIO exit that overlapped with + // a poll cycle. + let frames_pushed: usize = match (&pending_rx_arc, &slirp_arc) { + (Some(pending_rx), Some(slirp)) => { + rx_scratch.clear(); + if let Ok(mut backend) = slirp.lock() { + backend.drain_to_guest(&mut rx_scratch); + } + let n = rx_scratch.len(); + for frame in rx_scratch.drain(..) 
{ + let mut packet = Vec::with_capacity( + crate::devices::virtio_net::VirtioNetHeader::SIZE + frame.len(), + ); + packet.extend_from_slice( + &crate::devices::virtio_net::VirtioNetHeader::new().to_bytes(), + ); + packet.extend_from_slice(&frame); + pending_rx.push(packet); + } + n + } + _ => 0, }; + let has_interrupt = frames_pushed > 0 + || match net_dev.lock() { + Ok(g) => g.has_pending_interrupt(), + Err(_) => false, + }; + let frames_injected = frames_pushed; // Pulse IRQ10 only when there is *new* work for the guest: // - frames just injected this cycle, OR From e26a6bc0d5e9a5d37c5b1558fc0271cb22a68c5f Mon Sep 17 00:00:00 2001 From: diego Date: Wed, 6 May 2026 16:58:42 -0300 Subject: [PATCH 08/19] perf(virtio-net): interrupt_status as Arc<AtomicU32> Wraps the device's interrupt_status register in Arc<AtomicU32> so the net-poll thread can read and update it without taking the device mutex. Three concrete benefits: 1. has_pending_interrupt() is now a single relaxed atomic load on &self -- safe to call from any thread, no lock, no contention. 2. The net-poll thread caches a clone of the Arc at startup and uses it directly for its idle-cycle 'do I need to pulse the IRQ?' check, removing one mutex acquisition per cycle. 3. interrupt_status |= 1 (set by RX inject) and interrupt_status &= !value (cleared by guest's INTERRUPT_ACK MMIO write) are now fetch_or / fetch_and atomic operations -- no read-modify-write race between the vCPU thread and the net-poll thread. The vCPU thread's MMIO read of INTERRUPT_STATUS still goes through the device mutex via the existing dispatcher, but the underlying operation is now a pure atomic load -- a follow-up that lets the dispatcher skip the lock for read-only MMIO accesses gets a cleaner path because the field no longer needs synchronisation through the mutex. 
Single-vCPU CRR is within sample noise of the previous measurement (~265 us p50 -> ~289 us across 5 runs of 30 iterations); the win is mostly architectural rather than measurable on this workload. Real benefit shows up with multi-vCPU guests, higher pps, or workloads where the net-poll and vCPU threads contend more aggressively. --- src/devices/virtio_net.rs | 47 ++++++++++++++++++++++++++++----------- src/vmm/mod.rs | 29 +++++++++++++++++------- 2 files changed, 55 insertions(+), 21 deletions(-) diff --git a/src/devices/virtio_net.rs b/src/devices/virtio_net.rs index 555db678..efe81516 100644 --- a/src/devices/virtio_net.rs +++ b/src/devices/virtio_net.rs @@ -8,6 +8,7 @@ //! - Integration with SLIRP stack for NAT //! - No root/TAP required +use std::sync::atomic::{AtomicU32, Ordering}; use std::sync::{Arc, Mutex}; use crossbeam_queue::SegQueue; @@ -158,8 +159,16 @@ pub struct VirtioNetDevice { queue_sel: u32, /// Device status status: u32, - /// Interrupt status - interrupt_status: u32, + /// Interrupt status, accessed concurrently from the vCPU thread + /// (MMIO read of `INTERRUPT_STATUS`, MMIO write of `INTERRUPT_ACK`) + /// and the net-poll thread (sets bit 0 when new RX frames are + /// queued, polls on idle cycles). + /// + /// Wrapped in [`Arc`] so the net-poll thread can hold + /// its own clone and read/update the value without taking the + /// device mutex. The vCPU thread accesses it via the device + /// guard during MMIO dispatch; both sides see the same atomic. + interrupt_status: Arc, /// Configuration generation counter config_generation: u32, /// Receive queue state @@ -211,7 +220,7 @@ impl VirtioNetDevice { features_sel: 0, queue_sel: 0, status: 0, - interrupt_status: 0, + interrupt_status: Arc::new(AtomicU32::new(0)), config_generation: 0, rx_queue: QueueState { num_max: 256, @@ -253,6 +262,13 @@ impl VirtioNetDevice { Arc::clone(&self.slirp) } + /// Returns a clone of the [`Arc`] backing + /// `interrupt_status`. 
The net-poll thread holds this clone and + /// reads/updates the ISR without ever taking the device mutex. + pub fn interrupt_status_arc(&self) -> Arc { + Arc::clone(&self.interrupt_status) + } + /// Set the MMIO base address pub fn set_mmio_base(&mut self, base: u64) { self.mmio_base = base; @@ -296,7 +312,7 @@ impl VirtioNetDevice { let queue = self.current_queue(); queue.ready as u32 } - mmio::INTERRUPT_STATUS => self.interrupt_status, + mmio::INTERRUPT_STATUS => self.interrupt_status.load(Ordering::Relaxed), mmio::STATUS => self.status, mmio::CONFIG_GENERATION => self.config_generation, // Device config (MAC address at offset 0x100) @@ -371,7 +387,7 @@ impl VirtioNetDevice { self.handle_queue_notify(value, guest_memory); } mmio::INTERRUPT_ACK => { - self.interrupt_status &= !value; + self.interrupt_status.fetch_and(!value, Ordering::Relaxed); } mmio::STATUS => { self.status = value; @@ -575,7 +591,7 @@ impl VirtioNetDevice { .map_err(|e| crate::Error::Memory(e.to_string()))?; } - self.interrupt_status |= 1; + self.interrupt_status.fetch_or(1, Ordering::Relaxed); Ok(()) } @@ -733,7 +749,7 @@ impl VirtioNetDevice { } if frames_injected > 0 { - self.interrupt_status |= 1; + self.interrupt_status.fetch_or(1, Ordering::Relaxed); } Ok(frames_injected as usize) } @@ -742,7 +758,7 @@ impl VirtioNetDevice { fn reset(&mut self) { debug!("virtio-net: device reset"); self.status = 0; - self.interrupt_status = 0; + self.interrupt_status.store(0, Ordering::Relaxed); self.driver_features = 0; self.tx_avail_idx = 0; self.tx_used_idx = 0; @@ -809,7 +825,7 @@ impl VirtioNetDevice { self.rx_buffer.push(packet); // Set interrupt - self.interrupt_status |= 1; + self.interrupt_status.fetch_or(1, Ordering::Relaxed); } /// Capture device state for snapshot. 
@@ -844,7 +860,7 @@ impl VirtioNetDevice { features_sel: self.features_sel, queue_sel: self.queue_sel, status: self.status, - interrupt_status: self.interrupt_status, + interrupt_status: self.interrupt_status.load(Ordering::Relaxed), config_generation: self.config_generation, mac: self.mac, queues, @@ -858,7 +874,8 @@ impl VirtioNetDevice { self.features_sel = state.features_sel; self.queue_sel = state.queue_sel; self.status = state.status; - self.interrupt_status = state.interrupt_status; + self.interrupt_status + .store(state.interrupt_status, Ordering::Relaxed); self.config_generation = state.config_generation; self.mac = state.mac; @@ -894,9 +911,13 @@ impl VirtioNetDevice { ); } - /// Check if there are pending interrupts + /// Check if there are pending interrupts. + /// + /// Atomic load — safe to call from any thread without holding the + /// device mutex. The net-poll thread uses this to decide whether + /// to pulse the IRQ line. pub fn has_pending_interrupt(&self) -> bool { - self.interrupt_status != 0 + self.interrupt_status.load(Ordering::Relaxed) != 0 } /// Get the MAC address diff --git a/src/vmm/mod.rs b/src/vmm/mod.rs index 2e6716fa..a61ff357 100644 --- a/src/vmm/mod.rs +++ b/src/vmm/mod.rs @@ -1758,11 +1758,19 @@ fn net_poll_thread(net_dev: Arc>, vm: Arc, running: A // (see vmm/cpu.rs), so this thread only needs to push frames. type PendingRxArc = std::sync::Arc>>; type BackendArc = std::sync::Arc>; - let (pending_rx_arc, slirp_arc): (Option, Option) = - match net_dev.lock() { - Ok(g) => (Some(g.pending_rx()), Some(g.slirp_arc())), - Err(_) => (None, None), - }; + type InterruptStatusArc = std::sync::Arc; + let (pending_rx_arc, slirp_arc, interrupt_status_arc): ( + Option, + Option, + Option, + ) = match net_dev.lock() { + Ok(g) => ( + Some(g.pending_rx()), + Some(g.slirp_arc()), + Some(g.interrupt_status_arc()), + ), + Err(_) => (None, None, None), + }; // Reusable buffer for frames pulled from the backend each cycle. 
let mut rx_scratch: Vec> = Vec::new(); @@ -1867,10 +1875,15 @@ fn net_poll_thread(net_dev: Arc>, vm: Arc, running: A } _ => 0, }; + // Lock-free check: read interrupt_status via the AtomicU32 we + // cached at thread startup. Avoids one device-mutex acquisition + // per cycle on idle paths (the hot RX path skips this branch + // because frames_pushed > 0 already implies interrupt_status + // is about to be set when the vCPU drains pending_rx). let has_interrupt = frames_pushed > 0 - || match net_dev.lock() { - Ok(g) => g.has_pending_interrupt(), - Err(_) => false, + || match interrupt_status_arc { + Some(ref isr) => isr.load(std::sync::atomic::Ordering::Relaxed) != 0, + None => false, }; let frames_injected = frames_pushed; From c3b7f0ad2fda98fa2823eadfb475b4934d3dd6f0 Mon Sep 17 00:00:00 2001 From: diego Date: Wed, 6 May 2026 18:05:36 -0300 Subject: [PATCH 09/19] tools: move perf-harness scripts under tools/perf-harness/ Collects the SLIRP-vs-SLIRP / vs-pasta diagnostic tooling under one directory. Five files relocate, no behaviour change: scripts/bench-pasta.py -> tools/perf-harness/bench-pasta.py scripts/bench-compare-pasta.py -> tools/perf-harness/bench-compare-pasta.py scripts/bench-qemu-slirp.sh -> tools/perf-harness/bench-qemu-slirp.sh tools/crr-client.c -> tools/perf-harness/crr-client.c tools/qemu-init.sh -> tools/perf-harness/qemu-init.sh Updates path references in: - bench-qemu-slirp.sh (uses $SCRIPT_DIR for qemu-init.sh location; updated busybox extraction to climb two dirs up to repo root) - examples/crr_singleproc_bench.rs (doc + error message paths) - docs/passt-comparison.md (usage examples + extended example block that now also covers bench-qemu-slirp.sh and crr_singleproc_bench) Smoke-tested after the move: - tools/perf-harness/bench-pasta.py --iterations 1 ... 
passes - tools/perf-harness/bench-qemu-slirp.sh --backend libslirp passes --- docs/passt-comparison.md | 16 +++++++++---- examples/crr_singleproc_bench.rs | 8 +++---- .../perf-harness}/bench-compare-pasta.py | 0 .../perf-harness}/bench-pasta.py | 0 .../perf-harness}/bench-qemu-slirp.sh | 24 ++++++++++++------- tools/{ => perf-harness}/crr-client.c | 0 tools/{ => perf-harness}/qemu-init.sh | 4 ++-- 7 files changed, 34 insertions(+), 18 deletions(-) rename {scripts => tools/perf-harness}/bench-compare-pasta.py (100%) rename {scripts => tools/perf-harness}/bench-pasta.py (100%) rename {scripts => tools/perf-harness}/bench-qemu-slirp.sh (91%) rename tools/{ => perf-harness}/crr-client.c (100%) rename tools/{ => perf-harness}/qemu-init.sh (90%) diff --git a/docs/passt-comparison.md b/docs/passt-comparison.md index 4f052370..6c6faf66 100644 --- a/docs/passt-comparison.md +++ b/docs/passt-comparison.md @@ -63,15 +63,23 @@ cargo run --release --bin voidbox-network-bench -- \ --iterations 3 --output /tmp/voidbox-bench.json # Generate pasta numbers (requires pasta on PATH or via $PASTA). -scripts/bench-pasta.py --output /tmp/pasta-bench.json +tools/perf-harness/bench-pasta.py --output /tmp/pasta-bench.json # Side-by-side markdown. -scripts/bench-compare-pasta.py /tmp/voidbox-bench.json /tmp/pasta-bench.json \ +tools/perf-harness/bench-compare-pasta.py /tmp/voidbox-bench.json /tmp/pasta-bench.json \ --output /tmp/voidbox-vs-pasta.md + +# qemu+libslirp / qemu+passt CRR (apples-to-apples SLIRP-vs-SLIRP). +gcc -O2 -static -o /tmp/crr-client tools/perf-harness/crr-client.c +tools/perf-harness/bench-qemu-slirp.sh --backend libslirp --iterations 30 +tools/perf-harness/bench-qemu-slirp.sh --backend passt --iterations 30 + +# Voidbox single-process CRR (no per-iteration nc fork). +cargo run --release --example crr_singleproc_bench -- --iterations 30 ``` -`scripts/bench-pasta.py --help` lists tunables (iterations, transfer -size, sample counts). 
+`tools/perf-harness/bench-pasta.py --help` lists tunables (iterations, +transfer size, sample counts). ## Reading the report diff --git a/examples/crr_singleproc_bench.rs b/examples/crr_singleproc_bench.rs index 93241d7c..cb0505fe 100644 --- a/examples/crr_singleproc_bench.rs +++ b/examples/crr_singleproc_bench.rs @@ -3,10 +3,10 @@ //! existing bench's per-iteration `nc` fork+exec overhead. //! //! NOT meant for the production bench surface; this is a one-off -//! diagnostic that pairs with `tools/crr-client.c` + the pasta side -//! of the head-to-head. Compile and run directly: +//! diagnostic that pairs with `tools/perf-harness/crr-client.c` + the +//! pasta side of the head-to-head. Compile and run directly: //! -//! gcc -O2 -static -o /tmp/crr-client tools/crr-client.c +//! gcc -O2 -static -o /tmp/crr-client tools/perf-harness/crr-client.c //! cargo run --release --example crr_singleproc_bench -- \ //! --iterations 100 --bench-binary /tmp/crr-client //! @@ -44,7 +44,7 @@ async fn main() -> Result<(), Box> { let bench_binary = std::path::PathBuf::from(&cli.bench_binary); if !bench_binary.exists() { return Err(format!( - "bench binary not found: {} (compile with `gcc -static -o /tmp/crr-client tools/crr-client.c`)", + "bench binary not found: {} (compile with `gcc -static -o /tmp/crr-client tools/perf-harness/crr-client.c`)", cli.bench_binary ) .into()); diff --git a/scripts/bench-compare-pasta.py b/tools/perf-harness/bench-compare-pasta.py similarity index 100% rename from scripts/bench-compare-pasta.py rename to tools/perf-harness/bench-compare-pasta.py diff --git a/scripts/bench-pasta.py b/tools/perf-harness/bench-pasta.py similarity index 100% rename from scripts/bench-pasta.py rename to tools/perf-harness/bench-pasta.py diff --git a/scripts/bench-qemu-slirp.sh b/tools/perf-harness/bench-qemu-slirp.sh similarity index 91% rename from scripts/bench-qemu-slirp.sh rename to tools/perf-harness/bench-qemu-slirp.sh index 968b488a..d1291a84 100755 --- 
a/scripts/bench-qemu-slirp.sh +++ b/tools/perf-harness/bench-qemu-slirp.sh @@ -8,7 +8,7 @@ # --backend libslirp qemu's built-in -netdev user (libslirp) # --backend passt qemu -netdev stream + a passt(1) instance over UNIX socket # -# Both produce a number directly comparable to scripts/bench-pasta.py's +# Both produce a number directly comparable to tools/perf-harness/bench-pasta.py's # pasta-side number AND to examples/crr_singleproc_bench.rs's voidbox-side # number — same workload, same C client, same iteration count. # @@ -24,7 +24,15 @@ ITERATIONS=30 KERNEL=${KERNEL:-/boot/vmlinuz-$(uname -r)} # NB: must be the `passt` binary (VM/socket mode), NOT the `pasta` symlink # (namespace mode). The two modes are the same code keyed on argv[0]. -PASST=${PASST:-/home/diego/github/passt/passt} +# Default discovery order: $PASST env var → `passt` on $PATH → /usr/bin/passt. +default_passt() { + if command -v passt >/dev/null 2>&1; then + command -v passt + else + echo /usr/bin/passt + fi +} +PASST=${PASST:-$(default_passt)} HOST_PORT=${HOST_PORT:-18877} GUEST_ADDR=${GUEST_ADDR:-10.0.2.15} GUEST_GATEWAY=${GUEST_GATEWAY:-10.0.2.2} @@ -38,7 +46,7 @@ Usage: $0 [--backend libslirp|passt] [--iterations N] [--kernel PATH] [--port PO Env vars: KERNEL path to a Linux bzImage (default: host distro kernel) - PASST path to the passt binary (default: /home/diego/github/passt/pasta) + PASST path to the passt binary (default: \`passt\` on \$PATH, falling back to /usr/bin/passt) CRR_CLIENT_BIN path to the static crr-client binary (default: /tmp/crr-client) HOST_PORT TCP port for the host listener (default: 18877) GUEST_ADDR IPv4 to assign to the guest (default: 10.0.2.15) @@ -68,14 +76,14 @@ esac [[ -x "$CRR_CLIENT_BIN" ]] || { echo "ERROR: crr-client not found at $CRR_CLIENT_BIN" >&2 - echo " compile it with: gcc -O2 -static -o $CRR_CLIENT_BIN tools/crr-client.c" >&2 + echo " compile it with: gcc -O2 -static -o $CRR_CLIENT_BIN tools/perf-harness/crr-client.c" >&2 exit 2 } [[ -r "$KERNEL" 
]] || { echo "ERROR: kernel not readable: $KERNEL" >&2; exit 2; } -REPO_ROOT="$(cd "$(dirname "$0")/.." && pwd)" -INIT_TEMPLATE="$REPO_ROOT/tools/qemu-init.sh" +SCRIPT_DIR="$(cd "$(dirname "$0")" && pwd)" +INIT_TEMPLATE="$SCRIPT_DIR/qemu-init.sh" [[ -r "$INIT_TEMPLATE" ]] || { echo "ERROR: missing $INIT_TEMPLATE" >&2; exit 2; } # --------------------------------------------------------------------------- @@ -95,8 +103,8 @@ mkdir -p "$ROOTFS_DIR"/{bin,sbin,proc,sys,dev,tmp} # to extracting from voidbox's claude rootfs if needed. if [[ -x /usr/bin/busybox ]] && file /usr/bin/busybox 2>/dev/null | grep -q "statically linked"; then cp /usr/bin/busybox "$ROOTFS_DIR/bin/busybox" -elif [[ -r "$REPO_ROOT/target/void-box-claude.cpio.gz" ]]; then - (cd "$ROOTFS_DIR" && zcat "$REPO_ROOT/target/void-box-claude.cpio.gz" | cpio -idm bin/busybox 2>/dev/null) +elif [[ -r "$SCRIPT_DIR/../../target/void-box-claude.cpio.gz" ]]; then + (cd "$ROOTFS_DIR" && zcat "$SCRIPT_DIR/../../target/void-box-claude.cpio.gz" | cpio -idm bin/busybox 2>/dev/null) else echo "ERROR: no static busybox found; install busybox-static or build target/void-box-claude.cpio.gz" >&2 exit 2 diff --git a/tools/crr-client.c b/tools/perf-harness/crr-client.c similarity index 100% rename from tools/crr-client.c rename to tools/perf-harness/crr-client.c diff --git a/tools/qemu-init.sh b/tools/perf-harness/qemu-init.sh similarity index 90% rename from tools/qemu-init.sh rename to tools/perf-harness/qemu-init.sh index 1654868b..857413a0 100755 --- a/tools/qemu-init.sh +++ b/tools/perf-harness/qemu-init.sh @@ -1,7 +1,7 @@ #!/bin/sh -# tools/qemu-init.sh — /init for the SLIRP-vs-SLIRP comparison guest. +# tools/perf-harness/qemu-init.sh — /init for the SLIRP-vs-SLIRP comparison guest. # -# Used by scripts/bench-qemu-slirp.sh. Read /proc/cmdline for: +# Used by tools/perf-harness/bench-qemu-slirp.sh. 
Read /proc/cmdline for: # crr_target=HOST:PORT:N target server + iteration count # crr_net=ADDR/MASK,GW static network config # From 3c5da08183fa820a68802f054eafe8fd57749051 Mon Sep 17 00:00:00 2001 From: diego Date: Wed, 6 May 2026 19:11:44 -0300 Subject: [PATCH 10/19] fix(perf-harness): address Copilot AI review feedback MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit Eight follow-up fixes from PR #81 review: src/vmm/mod.rs: Extract `setup_tx_notify_ioeventfd` helper and gate the entire IOEVENTFD path on `epoll_arc.is_some()`. Fixes the original safety concern: the previous code registered KVM_IOEVENTFD even when no epoll dispatcher was available, which would have left guest TX notifies trapped in-kernel with no userspace drain — a silent hang. The helper rolls back the epoll registration if KVM_IOEVENTFD registration fails, so the two halves succeed or fail together. examples/crr_singleproc_bench.rs: Switch the host-side accept thread to non-blocking accept with a deadline check so the example never hangs forever if the guest fails to connect. Initial Copilot suggestion of a 2 ms sleep inflated each guest CRR sample by ~1.8 ms (sleep latency directly added to per-iter accept-pickup time). Reduced to 50 µs to keep the sample noise below the metric resolution. tools/perf-harness/bench-pasta.py: - `detect_host_gateway` now parses the route line by `via` keyword instead of indexing parts[2], so non-standard route formats don't silently pick up the wrong field. - CRR timer started before `srv.accept()` to match the voidbox-network-bench `crr_echo_server` semantics. tools/perf-harness/bench-qemu-slirp.sh: - Replace `time.sleep(60)` with `threading.Event().wait()` so the host echo server stays alive for the entire qemu run instead of timing out at 60 s. - Add fail-fast bind error handling so port collisions surface immediately instead of producing a confusing "no result" later. 
tools/perf-harness/qemu-init.sh: Derive the netmask from the CIDR prefix instead of hardcoding 255.255.255.0, so non-/24 networks work. tools/perf-harness/bench-compare-pasta.py: Remove unused `sign` variable. docs/passt-comparison.md: Update path reference from `scripts/` to `tools/perf-harness/`. Verified: voidbox single-process CRR p50 stays at ~280-310 µs (within noise of pre-fix baseline) and `cargo test --test network_baseline` passes 24/24. --- docs/passt-comparison.md | 2 +- examples/crr_singleproc_bench.rs | 17 ++- src/vmm/mod.rs | 121 ++++++++++++++-------- tools/perf-harness/bench-compare-pasta.py | 3 - tools/perf-harness/bench-pasta.py | 31 +++++- tools/perf-harness/bench-qemu-slirp.sh | 21 +++- tools/perf-harness/qemu-init.sh | 16 ++- 7 files changed, 151 insertions(+), 60 deletions(-) diff --git a/docs/passt-comparison.md b/docs/passt-comparison.md index 6c6faf66..89f21661 100644 --- a/docs/passt-comparison.md +++ b/docs/passt-comparison.md @@ -1,6 +1,6 @@ # passt head-to-head comparison harness -Two scripts under `scripts/` produce a side-by-side comparison of voidbox +Tools under `tools/perf-harness/` produce a side-by-side comparison of voidbox (real KVM VM + SLIRP) against passt's [`pasta`](https://passt.top/passt/about/) running in a network namespace. diff --git a/examples/crr_singleproc_bench.rs b/examples/crr_singleproc_bench.rs index cb0505fe..893cc86c 100644 --- a/examples/crr_singleproc_bench.rs +++ b/examples/crr_singleproc_bench.rs @@ -14,7 +14,6 @@ //! VOID_BOX_KERNEL, VOID_BOX_INITRAMFS use std::net::TcpListener; -use std::sync::mpsc; use std::thread; use std::time::Duration; @@ -62,25 +61,35 @@ async fn main() -> Result<(), Box> { let listener = TcpListener::bind("127.0.0.1:0")?; let host_port = listener.local_addr()?.port(); + listener.set_nonblocking(true)?; let iterations = cli.iterations; let server_thread = thread::spawn(move || { + // Non-blocking accept with a tight poll, deadline-checked. 
With + // a blocking accept the deadline never fires if the guest never + // connects (boot failure, SLIRP rate limit, etc.) and the + // example's later `server_thread.join()` would hang forever. + // The accept-pickup latency directly inflates each guest CRR + // sample, so the wait is kept short — `from_micros(50)` adds + // at most ~50 µs of jitter on top of a ~280 µs baseline, while + // still letting the deadline check fire every ~50 µs. let mut accepted = 0u32; - listener.set_nonblocking(false).ok(); let deadline = std::time::Instant::now() + Duration::from_secs(120); - let (done_tx, _done_rx) = mpsc::channel::<()>(); while accepted < iterations && std::time::Instant::now() < deadline { match listener.accept() { Ok((mut conn, _)) => { + conn.set_nonblocking(false).ok(); let mut buf = [0u8; 1]; let _ = std::io::Read::read(&mut conn, &mut buf); let _ = std::io::Write::write_all(&mut conn, b"x"); accepted += 1; } + Err(ref e) if e.kind() == std::io::ErrorKind::WouldBlock => { + thread::sleep(Duration::from_micros(50)); + } Err(_) => break, } } - drop(done_tx); accepted }); diff --git a/src/vmm/mod.rs b/src/vmm/mod.rs index a61ff357..e1a485e1 100644 --- a/src/vmm/mod.rs +++ b/src/vmm/mod.rs @@ -1607,6 +1607,77 @@ fn vsock_irq_thread( /// /// When the network backend does not provide an epoll instance /// (non-SlirpBackend), the thread falls back to a fixed 5 ms sleep. +/// Registers a host eventfd with KVM via `KVM_IOEVENTFD` for the +/// virtio-net TX-queue notify MMIO and adds it to the supplied +/// [`EpollDispatch`] under `token` so the net-poll thread can drain +/// it. Returns the eventfd on success, or `None` and logs a +/// `debug!` on any failure (eventfd creation, epoll registration, +/// `KVM_IOEVENTFD` registration); callers fall back to the +/// MMIO-exit TX path when this returns `None`. 
+/// +/// Both pieces (epoll registration and `KVM_IOEVENTFD` +/// registration) must succeed together: if KVM consumes the guest's +/// TX MMIO writes in-kernel but no userspace path drains the +/// eventfd, guest TX hangs silently. This helper rolls back the +/// epoll registration if the `KVM_IOEVENTFD` half fails. +/// +/// # Errors +/// +/// Returns `None` on any of: missing epoll dispatcher, eventfd +/// creation failure, epoll registration failure, or +/// `KVM_IOEVENTFD` registration failure. Each failure is logged at +/// `debug!` level with the underlying error. +fn setup_tx_notify_ioeventfd( + vm: &Vm, + epoll_arc: Option<&Arc>, + mmio_addr: u64, + queue_idx: u32, + token: u64, +) -> Option { + let Some(ep_arc) = epoll_arc else { + debug!( + "net-poll: no epoll dispatcher; falling back to MMIO-exit TX path (KVM_IOEVENTFD requires an async drain)" + ); + return None; + }; + let fd = match vmm_sys_util::eventfd::EventFd::new(libc::EFD_NONBLOCK) { + Ok(fd) => fd, + Err(e) => { + debug!( + "net-poll: eventfd create for tx-notify failed; falling back to MMIO-exit TX path: {}", + e + ); + return None; + } + }; + if let Err(e) = ep_arc.register( + fd.as_raw_fd(), + token, + crate::network::epoll_dispatch::RegisterMode::Read, + ) { + debug!( + "net-poll: failed to register tx-notify eventfd with epoll dispatch ({e}); falling back to MMIO-exit TX path" + ); + return None; + } + let kvm_addr = kvm_ioctls::IoEventAddress::Mmio(mmio_addr); + if let Err(e) = vm.vm_fd().register_ioevent(&fd, &kvm_addr, queue_idx) { + // KVM didn't take the ioevent. Roll the epoll registration + // back so the eventfd doesn't stay armed without a service + // path on it. 
+ let _ = ep_arc.unregister(fd.as_raw_fd()); + debug!( + "net-poll: KVM_IOEVENTFD register failed ({e}); TX notifies will continue to take MMIO exits" + ); + return None; + } + debug!( + "net-poll: KVM_IOEVENTFD active for TX notify @ MMIO {:#x} queue_idx={queue_idx}", + mmio_addr, + ); + Some(fd) +} + fn net_poll_thread(net_dev: Arc>, vm: Arc, running: Arc) { #[repr(C)] struct KvmIrqLevel { @@ -1704,52 +1775,18 @@ fn net_poll_thread(net_dev: Arc>, vm: Arc, running: A const VIRTIO_NET_MMIO_BASE: u64 = 0xd000_0000; const VIRTIO_NET_QUEUE_NOTIFY_OFFSET: u64 = 0x050; const TX_NOTIFY_QUEUE_IDX: u32 = 1; - let tx_notify_eventfd: Option = - match vmm_sys_util::eventfd::EventFd::new(libc::EFD_NONBLOCK) { - Ok(fd) => { - let mmio_addr = kvm_ioctls::IoEventAddress::Mmio( - VIRTIO_NET_MMIO_BASE + VIRTIO_NET_QUEUE_NOTIFY_OFFSET, - ); - match vm - .vm_fd() - .register_ioevent(&fd, &mmio_addr, TX_NOTIFY_QUEUE_IDX) - { - Ok(()) => Some(fd), - Err(e) => { - debug!( - "net-poll: KVM_IOEVENTFD register failed; TX notifies will continue to take MMIO exits: {}", - e - ); - None - } - } - } - Err(e) => { - debug!( - "net-poll: eventfd create for tx-notify failed; falling back to MMIO-exit TX path: {}", - e - ); - None - } - }; // Token used to identify the TX-notify eventfd in epoll readiness // events. Lives in a tag space that doesn't collide with the // PROTO_TAG_* values SlirpBackend uses for flow tokens. 
const TX_NOTIFY_TOKEN: u64 = 0x4000_0000_0000_0000; - if let Some(ref fd) = tx_notify_eventfd { - if let Some(ref ep_arc) = epoll_arc { - if let Err(e) = ep_arc.register( - fd.as_raw_fd(), - TX_NOTIFY_TOKEN, - crate::network::epoll_dispatch::RegisterMode::Read, - ) { - debug!( - "net-poll: failed to register tx-notify eventfd with epoll dispatch: {}", - e - ); - } - } - } + + let tx_notify_eventfd = setup_tx_notify_ioeventfd( + vm.as_ref(), + epoll_arc.as_ref(), + VIRTIO_NET_MMIO_BASE + VIRTIO_NET_QUEUE_NOTIFY_OFFSET, + TX_NOTIFY_QUEUE_IDX, + TX_NOTIFY_TOKEN, + ); // Lock-free hand-off queue + direct backend Arc, pulled out of the // device once at thread startup so the per-cycle hot path doesn't diff --git a/tools/perf-harness/bench-compare-pasta.py b/tools/perf-harness/bench-compare-pasta.py index 430a442d..ac6af588 100755 --- a/tools/perf-harness/bench-compare-pasta.py +++ b/tools/perf-harness/bench-compare-pasta.py @@ -47,9 +47,6 @@ def fmt_delta(voidbox: Any, pasta: Any, latency: bool) -> str: if pasta == 0: return "—" ratio = voidbox / pasta - # For latency: voidbox > pasta means voidbox is *slower* (positive ratio is bad). - # For throughput: voidbox > pasta means voidbox is *faster* (positive ratio is good). - sign = "slower" if (latency and ratio > 1) or (not latency and ratio < 1) else "faster" if latency: if ratio >= 1: return f"voidbox {ratio:.1f}× slower" diff --git a/tools/perf-harness/bench-pasta.py b/tools/perf-harness/bench-pasta.py index a80fd4b4..264e808d 100755 --- a/tools/perf-harness/bench-pasta.py +++ b/tools/perf-harness/bench-pasta.py @@ -60,12 +60,27 @@ def _resolve_pasta() -> str: def detect_host_gateway() -> str: + """Return the host's IPv4 default-route gateway address. + + Parses ``ip -4 route show default`` for ``default via ...`` lines + and returns the address after ``via``. Routes of the form + ``default dev ...`` (no ``via``) are skipped — they don't + name a usable IP for pasta's ``--map-host-loopback`` translation. 
+ """ out = subprocess.check_output(["ip", "-4", "route", "show", "default"], text=True) for line in out.splitlines(): parts = line.split() - if parts and parts[0] == "default": - return parts[2] - raise RuntimeError("no default gateway found") + if not parts or parts[0] != "default": + continue + try: + via_index = parts.index("via") + except ValueError: + continue + if via_index + 1 < len(parts): + return parts[via_index + 1] + raise RuntimeError( + "no IPv4 default gateway with a 'via' field found in `ip route show default` output" + ) def pasta_version(pasta: str) -> str: @@ -257,11 +272,19 @@ def measure_crr_latency( def host_accept_loop() -> None: samples: list[float] = [] for _ in range(samples_per_iter): + # Start the timer BEFORE accept() so each sample includes + # the TCP connect + accept latency, matching + # voidbox-network-bench's measure_crr_latency semantics + # (its crr_echo_server starts the timer before + # accept_with_deadline). Without this, the two + # harnesses report different metrics under the same + # name and the side-by-side comparison becomes + # meaningless. + start = time.perf_counter_ns() try: conn, _ = srv.accept() except socket.timeout: break - start = time.perf_counter_ns() with conn: # one read + one write keeps it a true CRR round-trip try: diff --git a/tools/perf-harness/bench-qemu-slirp.sh b/tools/perf-harness/bench-qemu-slirp.sh index d1291a84..eacb3f6a 100755 --- a/tools/perf-harness/bench-qemu-slirp.sh +++ b/tools/perf-harness/bench-qemu-slirp.sh @@ -137,16 +137,23 @@ trap "rm -f $INITRD; ${cleanup_rootfs:-true}" EXIT (cd "$ROOTFS_DIR" && find . | cpio -H newc -o 2>/dev/null | gzip > "$INITRD") # --------------------------------------------------------------------------- -# Host-side echo server. Host port can be passed in via env; pick a free one -# if the default is in use. +# Host-side echo server. 
The script's outer EXIT trap kills it, so the +# server stays alive for the entire qemu run rather than racing against a +# fixed-duration sleep. HOST_PORT must be free; the script fails fast if +# bind() refuses (no fallback to ephemeral — the guest's kernel cmdline +# carries the port and changing it after launch isn't useful). # --------------------------------------------------------------------------- SERVER_PIDFILE=$(mktemp) python3 - < "$SERVER_PIDFILE" diff --git a/tools/perf-harness/qemu-init.sh b/tools/perf-harness/qemu-init.sh index 857413a0..e32da047 100755 --- a/tools/perf-harness/qemu-init.sh +++ b/tools/perf-harness/qemu-init.sh @@ -51,7 +51,21 @@ while [ $i -lt 30 ] && ! busybox ifconfig eth0 >/dev/null 2>&1; do i=$((i+1)) done -busybox ifconfig eth0 "${addr_mask%/*}" netmask 255.255.255.0 up +# Derive the netmask from the /N suffix instead of hard-coding /24: +# crr_net is documented as ADDR/MASK,GW and a future call site might +# reasonably use /16 or /29. Falls back to /24 if the suffix isn't +# parseable so existing setups keep working. +addr="${addr_mask%/*}" +prefix="${addr_mask#*/}" +case "$prefix" in + 8) mask=255.0.0.0 ;; + 16) mask=255.255.0.0 ;; + 24) mask=255.255.255.0 ;; + 29) mask=255.255.255.248 ;; + 30) mask=255.255.255.252 ;; + *) mask=255.255.255.0 ;; +esac +busybox ifconfig eth0 "$addr" netmask "$mask" up busybox route add default gw "$gw" echo "===CRR-START===" From 8c0f49b47f9a23a9e3cf12aee83e0af50d8b78ef Mon Sep 17 00:00:00 2001 From: diego Date: Wed, 6 May 2026 19:39:21 -0300 Subject: [PATCH 11/19] perf(slirp): hoist ready-event scratch Vec out of drain_to_guest MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit Replace `std::mem::take(&mut *queue)` with an in-place `extend_from_slice` + `clear()` against a scratch Vec owned by `SlirpBackend`. 
The previous pattern moved the queue's allocation out and left a fresh `Vec::new()` (cap=0) behind, forcing the next `push_ready_events` to regrow the queue from cap=0 inside `extend_from_slice` every cycle. Heaptrack on the single-process CRR bench (30 iters) measured this single callsite as ~half of all allocations during the run: before: push_ready_events 4843 allocs (49% of total) drain_to_guest 4776 allocs (48% of total) total 12618 allocs after: push_ready_events gone from top callers drain_to_guest 3957 allocs (still hot, downstream) total 6885 allocs (-45%) p50 CRR latency is unchanged (~270 µs); the wall-clock floor is elsewhere on this workload. The win is reduced allocator churn (heap pressure, jitter on bulk paths, fewer slow-path mallocs under sustained load) — visible in the throughput bench rather than the CRR microbench. The `Vec` inside the `pending_events` Mutex is also pre-sized to `EVENTS_PRESIZE = 128` at construction so the very first push doesn't reallocate. --- src/network/slirp.rs | 69 ++++++++++++++++++++++++++++++-------------- 1 file changed, 48 insertions(+), 21 deletions(-) diff --git a/src/network/slirp.rs b/src/network/slirp.rs index 1e452880..0fa50cde 100644 --- a/src/network/slirp.rs +++ b/src/network/slirp.rs @@ -59,6 +59,12 @@ struct PendingDnsQuery { /// while keeping the implementation simple. const DNS_CACHE_TTL_SECS: u64 = 60; +/// Initial capacity for the ready-event scratch buffers. Sized to +/// `EpollDispatch`'s typical per-wait batch so the buffers fit a +/// busy-loop wakeup without reallocating; oversized batches grow +/// once and stabilize. +const EVENTS_PRESIZE: usize = 128; + use ipnet::Ipv4Net; use smoltcp::iface::{Config, Interface, SocketSet}; @@ -689,6 +695,13 @@ pub struct SlirpBackend { /// keep the fallback so synthetic harnesses still observe /// readiness. has_external_poller: AtomicBool, + /// Per-call scratch buffer for the events `drain_to_guest` + /// processes.
Owned by `SlirpBackend` so its capacity persists + /// across calls — `mem::take`-into-local would discard the + /// allocation and force the next round to grow from cap=0, + /// which heaptrack measured as ~half of all per-CRR + /// allocations. + ready_scratch: Vec, } impl SlirpBackend { @@ -793,9 +806,10 @@ impl SlirpBackend { accept_sender: accept_tx, epoll, epoll_waker, - pending_events: Mutex::new(Vec::new()), + pending_events: Mutex::new(Vec::with_capacity(EVENTS_PRESIZE)), pending_close: Vec::new(), has_external_poller: AtomicBool::new(false), + ready_scratch: Vec::with_capacity(EVENTS_PRESIZE), }) } @@ -1033,26 +1047,33 @@ impl SlirpBackend { // // Then, only if no net-poll thread has populated the queue // (unit tests / benches), fall back to a non-blocking poll on - // the epoll FD ourselves. `try_lock` keeps that fallback safe - // under contention. - let ready: Vec = { - let mut events: Vec = { - let mut queue = self.pending_events.lock().unwrap(); - std::mem::take(&mut *queue) - }; - // Fallback non-blocking poll only when no external poller - // (net_poll_thread) is feeding us events — otherwise we'd - // pay one mutex op + one epoll_wait syscall per call - // (~310 ns) for nothing. The flag is one-way: set by the - // first push_ready_events and stays set for the backend's - // lifetime. - if events.is_empty() && !self.has_external_poller.load(Ordering::Relaxed) { - let _ = self - .epoll - .wait_with_timeout(&mut events, std::time::Duration::ZERO); - } - events - }; + // the epoll FD ourselves. + // + // The local `ready` Vec is taken from `self.ready_scratch`, + // populated by copying out of the locked queue (which is + // `clear()`-ed in place to keep its capacity), processed, + // then cleared and stashed back. The previous `mem::take` + // pattern dropped the queue's allocation every cycle — + // heaptrack measured that as ~half of all per-CRR + // allocations on this hot path. 
+ let mut ready: Vec = std::mem::take(&mut self.ready_scratch); + ready.clear(); + { + let mut queue = self.pending_events.lock().unwrap(); + ready.extend_from_slice(&queue); + queue.clear(); + } + // Fallback non-blocking poll only when no external poller + // (net_poll_thread) is feeding us events — otherwise we'd + // pay one mutex op + one epoll_wait syscall per call + // (~310 ns) for nothing. The flag is one-way: set by the + // first push_ready_events and stays set for the backend's + // lifetime. + if ready.is_empty() && !self.has_external_poller.load(Ordering::Relaxed) { + let _ = self + .epoll + .wait_with_timeout(&mut ready, std::time::Duration::ZERO); + } // 0a. Accept any newly-ready listener connections (may push into // accept_sender for the next step). @@ -1091,6 +1112,12 @@ impl SlirpBackend { out.append(&mut q.tx_queue); } out.append(&mut self.inject_to_guest); + + // Stash the local `ready` Vec back as scratch. `clear()` + // preserves capacity, so the next `drain_to_guest` reuses + // the buffer instead of allocating from cap=0. + ready.clear(); + self.ready_scratch = ready; } /// Poll the stack and return ethernet frames to send to the guest. From 08af859ef963ab7db043c447d845cabc69610d7b Mon Sep 17 00:00:00 2001 From: diego Date: Wed, 6 May 2026 19:47:06 -0300 Subject: [PATCH 12/19] fix(sandbox,bench): expose SLIRP rate-limit knobs for benches MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit The SLIRP backend's per-second new-connection rate limit (`max_connections_per_second`, default 50/s) and concurrent- connection ceiling (`max_concurrent_connections`, default 64) are production anti-DoS defaults baked into `LocalSandbox`. They are hostile to microbenches that intentionally open hundreds of connections in a tight loop — at 51 connects/s the limiter starts returning RST to the guest, which crr-client sees as `ECONNREFUSED` on its very next connect and exits with rc=3. 
Reproduced as the "100-iter failure" in `crr_singleproc_bench`: 30 iters worked, 60 iters did not; the threshold was the 50/s limit, not anything in the network stack itself. Surface the two ceilings on `Sandbox::local()` as builder methods: .network_max_connections_per_second(u32::MAX) .network_max_concurrent_connections(usize::MAX) `None` keeps the production defaults, so this is purely additive. The bench now uses both. 500-iter run reproduces clean (p50 268 µs, p99 1.6 ms, host accepts 500/500). --- examples/crr_singleproc_bench.rs | 6 ++++++ src/sandbox/local.rs | 10 ++++++++-- src/sandbox/mod.rs | 27 +++++++++++++++++++++++++++ 3 files changed, 41 insertions(+), 2 deletions(-) diff --git a/examples/crr_singleproc_bench.rs b/examples/crr_singleproc_bench.rs index 893cc86c..0b109f8d 100644 --- a/examples/crr_singleproc_bench.rs +++ b/examples/crr_singleproc_bench.rs @@ -97,6 +97,12 @@ async fn main() -> Result<(), Box> { .from_env()? .memory_mb(cli.memory_mb) .network(true) + // Production SLIRP defaults (50/s rate, 64 concurrent) are + // sized to throttle a guest-side flood — far below what a + // CRR microbench wants. Lift both ceilings so the bench + // exercises the steady-state NAT path, not the rate limiter. 
+ .network_max_connections_per_second(u32::MAX) + .network_max_concurrent_connections(usize::MAX) .mount(MountConfig { host_path: bench_binary_dir.clone(), guest_path: "/tmp/host".into(), diff --git a/src/sandbox/local.rs b/src/sandbox/local.rs index 69f9f240..a7b82bfe 100644 --- a/src/sandbox/local.rs +++ b/src/sandbox/local.rs @@ -91,8 +91,14 @@ impl LocalSandbox { session_secret: SessionSecret::new(session_secret_bytes), command_allowlist: Vec::new(), // Set via provisioning network_deny_list: default_network_deny_list(), - max_connections_per_second: DEFAULT_MAX_CONNECTIONS_PER_SECOND, - max_concurrent_connections: DEFAULT_MAX_CONCURRENT_CONNECTIONS, + max_connections_per_second: self + .config + .network_max_connections_per_second + .unwrap_or(DEFAULT_MAX_CONNECTIONS_PER_SECOND), + max_concurrent_connections: self + .config + .network_max_concurrent_connections + .unwrap_or(DEFAULT_MAX_CONCURRENT_CONNECTIONS), seccomp: true, }, snapshot: self.config.snapshot.clone(), diff --git a/src/sandbox/mod.rs b/src/sandbox/mod.rs index b2c820c0..de82de04 100644 --- a/src/sandbox/mod.rs +++ b/src/sandbox/mod.rs @@ -86,6 +86,14 @@ pub struct SandboxConfig { /// validate save/restore support at cold boot instead of deferring a /// cryptic failure to save time. pub enable_snapshots: bool, + /// Override `max_connections_per_second` on the network backend's + /// rate limiter. `None` keeps the production default (50/s); + /// raise it for benches that intentionally exceed the + /// anti-DoS limit. + pub network_max_connections_per_second: Option, + /// Override `max_concurrent_connections` on the network + /// backend. `None` keeps the production default (64). 
+ pub network_max_concurrent_connections: Option, } impl Default for SandboxConfig { @@ -108,6 +116,8 @@ impl Default for SandboxConfig { env: Vec::new(), snapshot: None, enable_snapshots: false, + network_max_connections_per_second: None, + network_max_concurrent_connections: None, } } } @@ -815,6 +825,23 @@ impl SandboxBuilder { self } + /// Override the SLIRP backend's per-second new-connection rate + /// limit. The production default (50/s) protects the host from + /// guest-side connection floods; benches that intentionally + /// exceed it should call this to disable the limit. + pub fn network_max_connections_per_second(mut self, rate: u32) -> Self { + self.config.network_max_connections_per_second = Some(rate); + self + } + + /// Override the SLIRP backend's concurrent-connection ceiling. + /// Production default is 64; raise for sustained-throughput + /// benches. + pub fn network_max_concurrent_connections(mut self, count: usize) -> Self { + self.config.network_max_concurrent_connections = Some(count); + self + } + /// Set the kernel path pub fn kernel(mut self, path: impl Into) -> Self { self.config.kernel = Some(path.into()); From be3c37d802c4240550a6bf2ec1204ac793bd70ab Mon Sep 17 00:00:00 2001 From: diego Date: Wed, 6 May 2026 21:09:05 -0300 Subject: [PATCH 13/19] perf(virtio-net): reuse outer Vec across flush_pending_rx calls MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit Both `flush_pending_rx` and `try_inject_rx` previously built a fresh `Vec>` on every MMIO exit and handed it to `write_frames_to_rx_ring`, which consumed it by value. The pattern dropped the outer-Vec allocation and forced the next call to grow it from cap=0 — heaptrack on the CRR microbench measured the flush_pending_rx site at 173 calls / 108 MB peak, the largest remaining alloc consumer after the SLIRP `ready_scratch` fix. 
`write_frames_to_rx_ring` now takes `&mut Vec>` and drains in place via `drain(..)` / `append`, so callers reuse a long-lived scratch buffer: - `flush_pending_rx` uses a new `flush_scratch` field on `VirtioNetDevice`, populated from `pending_rx` (SegQueue) and cleared at end. - `try_inject_rx` reuses the existing `rx_scratch` field that was already paired with `get_rx_frames`; the trailing `mem::take` in `get_rx_frames` is now followed by a `clear()` + restore at the end of `try_inject_rx`, so the capacity persists across the round-trip. Heaptrack on 100-iter CRR: before this commit: 6885 allocs / 30 iters = 229/iter after this commit: 18926 allocs / 100 iters = 189/iter Aggregate from the original baseline: baseline (before all fixes): ~421 allocs/iter this commit: ~189 allocs/iter (-55%) p50 latency unchanged at ~275 µs as expected — alloc reduction shows up in throughput and tail-latency stability, not the CRR floor. --- src/devices/virtio_net.rs | 40 ++++++++++++++++++++++++++++----------- src/sandbox/mod.rs | 37 +++++++++++++++++++++++++++--------- 2 files changed, 57 insertions(+), 20 deletions(-) diff --git a/src/devices/virtio_net.rs b/src/devices/virtio_net.rs index efe81516..2c94e1c7 100644 --- a/src/devices/virtio_net.rs +++ b/src/devices/virtio_net.rs @@ -201,6 +201,11 @@ pub struct VirtioNetDevice { /// previously serialised every net-poll-side `try_inject_rx` call /// against vCPU MMIO exits. pending_rx: Arc>>, + /// Scratch buffer reused across `flush_pending_rx` calls so the + /// per-MMIO-exit `Vec>` doesn't grow from cap=0 every + /// time. Heaptrack measured the previous local-Vec allocation as + /// 173 calls / 108 MB peak on the CRR microbench. 
+ flush_scratch: Vec>, } impl VirtioNetDevice { @@ -239,6 +244,7 @@ impl VirtioNetDevice { rx_avail_idx: 0, rx_used_idx: 0, pending_rx: Arc::new(SegQueue::new()), + flush_scratch: Vec::new(), }) } @@ -606,15 +612,22 @@ impl VirtioNetDevice { /// /// Returns the number of frames written to the RX ring this call. pub fn flush_pending_rx(&mut self, mem: &M) -> Result { - let mut frames: Vec> = Vec::new(); - while let Some(f) = self.pending_rx.pop() { - frames.push(f); + // Move the scratch out so we can mutate self while populating + // it. The post-write `clear()` keeps capacity, so subsequent + // calls reuse the buffer instead of growing from cap=0. + let mut frames = std::mem::take(&mut self.flush_scratch); + frames.clear(); + while let Some(frame) = self.pending_rx.pop() { + frames.push(frame); } - if !frames.is_empty() { - self.write_frames_to_rx_ring(frames, mem) + let result = if !frames.is_empty() { + self.write_frames_to_rx_ring(&mut frames, mem) } else { Ok(0) - } + }; + frames.clear(); + self.flush_scratch = frames; + result } /// Try to inject received frames from SLIRP into guest RX queue. Call from vCPU loop or after RX notify. @@ -625,11 +638,16 @@ impl VirtioNetDevice { /// has new work to do, not on every poll cycle while interrupt_status /// is still set from an earlier (un-acked) injection. pub fn try_inject_rx(&mut self, mem: &M) -> Result { - let frames = self.get_rx_frames(); + let mut frames = self.get_rx_frames(); if frames.is_empty() { return Ok(0); } - self.write_frames_to_rx_ring(frames, mem) + let result = self.write_frames_to_rx_ring(&mut frames, mem); + // Stash drained Vec back as scratch so the next call reuses + // its capacity instead of allocating from cap=0. + frames.clear(); + self.rx_scratch = frames; + result } /// Write a batch of fully-formed frames (already including the @@ -640,7 +658,7 @@ impl VirtioNetDevice { /// by the net-poll thread into the lock-free SegQueue). 
fn write_frames_to_rx_ring( &mut self, - frames: Vec>, + frames: &mut Vec>, mem: &M, ) -> Result { let q = &self.rx_queue; @@ -652,7 +670,7 @@ impl VirtioNetDevice { q.num, frames.len() ); - self.rx_buffer.extend(frames); + self.rx_buffer.append(frames); return Ok(0); } let desc_addr = GuestAddress(q.desc_addr); @@ -673,7 +691,7 @@ impl VirtioNetDevice { let mut frames_injected: u16 = 0; - for frame in frames { + for frame in frames.drain(..) { if self.rx_avail_idx == avail_idx { self.rx_buffer.push(frame); continue; diff --git a/src/sandbox/mod.rs b/src/sandbox/mod.rs index de82de04..9066e478 100644 --- a/src/sandbox/mod.rs +++ b/src/sandbox/mod.rs @@ -86,13 +86,14 @@ pub struct SandboxConfig { /// validate save/restore support at cold boot instead of deferring a /// cryptic failure to save time. pub enable_snapshots: bool, - /// Override `max_connections_per_second` on the network backend's - /// rate limiter. `None` keeps the production default (50/s); - /// raise it for benches that intentionally exceed the - /// anti-DoS limit. + /// Optional override for the network backend's + /// `max_connections_per_second` rate limit. `None` keeps the + /// production default (50/s); benches that intentionally exceed + /// the anti-DoS limit raise it explicitly. pub network_max_connections_per_second: Option, - /// Override `max_concurrent_connections` on the network - /// backend. `None` keeps the production default (64). + /// Optional override for the network backend's + /// `max_concurrent_connections` ceiling. `None` keeps the + /// production default (64). pub network_max_concurrent_connections: Option, } @@ -825,18 +826,36 @@ impl SandboxBuilder { self } - /// Override the SLIRP backend's per-second new-connection rate + /// Overrides the SLIRP backend's per-second new-connection rate /// limit. The production default (50/s) protects the host from /// guest-side connection floods; benches that intentionally - /// exceed it should call this to disable the limit. 
+ /// exceed it call this to disable the limit. + /// + /// # Examples + /// + /// ```no_run + /// use void_box::sandbox::Sandbox; + /// let _ = Sandbox::local() + /// .network(true) + /// .network_max_connections_per_second(u32::MAX); + /// ``` pub fn network_max_connections_per_second(mut self, rate: u32) -> Self { self.config.network_max_connections_per_second = Some(rate); self } - /// Override the SLIRP backend's concurrent-connection ceiling. + /// Overrides the SLIRP backend's concurrent-connection ceiling. /// Production default is 64; raise for sustained-throughput /// benches. + /// + /// # Examples + /// + /// ```no_run + /// use void_box::sandbox::Sandbox; + /// let _ = Sandbox::local() + /// .network(true) + /// .network_max_concurrent_connections(1024); + /// ``` pub fn network_max_concurrent_connections(mut self, count: usize) -> Self { self.config.network_max_concurrent_connections = Some(count); self From 58abc710babc13e3eb3ab6e00b49b1936ac82048 Mon Sep 17 00:00:00 2001 From: diego Date: Wed, 6 May 2026 21:14:29 -0300 Subject: [PATCH 14/19] perf(slirp): hoist relay_tcp_nat_data's frames_to_inject scratch MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit `relay_tcp_nat_data` builds a temporary `Vec>` per call because the relay can't push directly to `inject_to_guest` while iterating `flow_table` (both are `&mut self`). The previous pattern allocated a fresh `Vec::new()` every cycle, which heaptrack flagged as the biggest remaining contributor inside `drain_to_guest`'s call tree after the prior `ready_scratch` and `flush_scratch` fixes. Move the buffer onto `SlirpBackend` as `relay_frames_scratch` and use the standard `mem::take` → process → restore pattern so the buffer's capacity persists across `drain_to_guest` calls. 
The two trailing `inject_to_guest.append(&mut frames_to_inject)` sites already preserve capacity (Vec::append leaves the source empty but with its allocation intact); only the entry-point `Vec::new()` was discarding work. Cumulative impact on the 100-iter CRR microbench: baseline (before any of these fixes): ~421 allocs/iter after ready_scratch + flush_scratch: ~189 allocs/iter after relay_frames_scratch (this PR): ~93 allocs/iter (-78%) p50 latency continues at ~275 µs; the floor is dominated by KVM-exit / wakeup costs, not allocator churn. The win shows up under sustained load where reduced allocator pressure improves tail-latency stability and per-frame jitter. --- src/network/slirp.rs | 19 +++++++++++++++++-- 1 file changed, 17 insertions(+), 2 deletions(-) diff --git a/src/network/slirp.rs b/src/network/slirp.rs index 0fa50cde..c1bfa0eb 100644 --- a/src/network/slirp.rs +++ b/src/network/slirp.rs @@ -702,6 +702,11 @@ pub struct SlirpBackend { /// which heaptrack measured as ~half of all per-CRR /// allocations. ready_scratch: Vec, + /// Per-call scratch for `relay_tcp_nat_data`'s deferred frame + /// pushes. The relay can't push directly to `inject_to_guest` + /// while iterating `flow_table` (borrow conflict); reusing + /// this buffer keeps the per-cycle Vec from growing from cap=0. + relay_frames_scratch: Vec>, } impl SlirpBackend { @@ -810,6 +815,7 @@ impl SlirpBackend { pending_close: Vec::new(), has_external_poller: AtomicBool::new(false), ready_scratch: Vec::with_capacity(EVENTS_PRESIZE), + relay_frames_scratch: Vec::new(), }) } @@ -2348,8 +2354,13 @@ impl SlirpBackend { /// only the flow table entries directly, avoiding a separate Vec allocation. /// Data relay is restricted to flows with an EPOLLIN event in `ready`. 
fn relay_tcp_nat_data(&mut self, ready: &[EpollEvent]) { - // Collect frames to inject (built separately to avoid borrow issues) - let mut frames_to_inject: Vec> = Vec::new(); + // Collect frames to inject in the SlirpBackend-owned scratch + // so the buffer's capacity carries across calls. Pushes + // can't go straight to `inject_to_guest` because we're + // about to iterate `flow_table` and `inject_to_guest` is + // also `&mut self`. + let mut frames_to_inject = std::mem::take(&mut self.relay_frames_scratch); + frames_to_inject.clear(); // Seed removal set from flows already marked Closed by handle_tcp_frame // (FIN/RST path) via the pending_close queue. HashSet gives O(1) @@ -2634,6 +2645,10 @@ impl SlirpBackend { self.flow_table.remove(&flow_key); } self.inject_to_guest.append(&mut frames_to_inject); + // Both `append` calls drained `frames_to_inject` but + // preserved its capacity; restore the buffer to the + // backend so the next cycle reuses it. + self.relay_frames_scratch = frames_to_inject; } /// Drain replies from each active ICMP echo socket and emit echo-reply From aba8f85b91cc87ce9aaea9e294b97532f377cf97 Mon Sep 17 00:00:00 2001 From: diego Date: Wed, 6 May 2026 21:24:01 -0300 Subject: [PATCH 15/19] perf(slirp): reuse flow-key scratch across TCP/ICMP/UDP relays MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit Three of the relay functions called from `drain_to_guest` (`relay_tcp_nat_data`, `relay_icmp_echo`, `relay_udp_flows`) each built a per-call `Vec` to side-step the `&mut self` / `flow_table` borrow conflict. The Vecs were allocated, populated, drained, and dropped on every cycle. The UDP relay built two — one for the stale-sweep, one for the readiness loop. Add a single `flow_keys_scratch: Vec` field on `SlirpBackend` and rotate it through all four sites with the mem::take → process → restore pattern (the relays run sequentially inside `drain_to_guest`, so one buffer suffices). 
Each iteration uses `Vec::drain(..)` instead of for-by-value so capacity is preserved across the consume. Heaptrack on the 100-iter CRR microbench: before this commit: 9296 allocs (~93/iter) after this commit: 4103 allocs (~41/iter) temporary allocs: 5546 → 574 (-90%) Cumulative from the original baseline (start of this round): ~421 allocs/iter → ~41 allocs/iter (-90%) p50 latency unchanged at ~275 µs as predicted; the wall-clock floor is dominated by KVM exits / vCPU wakeups. The gain shows up as reduced allocator pressure on bulk paths and fewer slow-path mallocs under sustained load. Top remaining alloc callsites are now per-frame `Vec` from `build_tcp_packet_static` (one allocation per TCP frame) and TX queue frame parsing — both intrinsic to the protocol shape; further reduction needs a pool/arena, not a scratch hoist. --- src/network/slirp.rs | 37 +++++++++++++++++++++++++++---------- 1 file changed, 27 insertions(+), 10 deletions(-) diff --git a/src/network/slirp.rs b/src/network/slirp.rs index c1bfa0eb..7c7930c3 100644 --- a/src/network/slirp.rs +++ b/src/network/slirp.rs @@ -707,6 +707,14 @@ pub struct SlirpBackend { /// while iterating `flow_table` (borrow conflict); reusing /// this buffer keeps the per-cycle Vec from growing from cap=0. relay_frames_scratch: Vec>, + /// Shared scratch for the per-cycle `Vec` snapshots + /// that `relay_tcp_nat_data`, `relay_icmp_echo`, and + /// `relay_udp_flows` build to side-step `&mut self` / + /// `flow_table` borrow conflicts. All three relays run + /// sequentially inside `drain_to_guest`, so one buffer + /// suffices — each callsite takes it, fills it, drains it, + /// and stashes it back via `clear()` (capacity preserved). 
+ flow_keys_scratch: Vec, } impl SlirpBackend { @@ -816,6 +824,7 @@ impl SlirpBackend { has_external_poller: AtomicBool::new(false), ready_scratch: Vec::with_capacity(EVENTS_PRESIZE), relay_frames_scratch: Vec::new(), + flow_keys_scratch: Vec::new(), }) } @@ -2420,7 +2429,8 @@ impl SlirpBackend { } } - let mut tcp_flow_keys: Vec = Vec::new(); + let mut tcp_flow_keys = std::mem::take(&mut self.flow_keys_scratch); + tcp_flow_keys.clear(); for event in ready { if !event.readable || event.token & PROTO_TAG_MASK != PROTO_TAG_TCP { continue; @@ -2434,7 +2444,7 @@ impl SlirpBackend { tcp_flow_keys.push(flow_key); } - for flow_key in tcp_flow_keys { + for flow_key in tcp_flow_keys.drain(..) { let FlowKey::Tcp(key) = flow_key else { continue; }; @@ -2647,8 +2657,10 @@ impl SlirpBackend { self.inject_to_guest.append(&mut frames_to_inject); // Both `append` calls drained `frames_to_inject` but // preserved its capacity; restore the buffer to the - // backend so the next cycle reuses it. + // backend so the next cycle reuses it. The flow-key + // buffer was already drained by the iteration above. self.relay_frames_scratch = frames_to_inject; + self.flow_keys_scratch = tcp_flow_keys; } /// Drain replies from each active ICMP echo socket and emit echo-reply @@ -2661,7 +2673,8 @@ impl SlirpBackend { const ICMP_IDLE_TIMEOUT: Duration = Duration::from_secs(60); let now = Instant::now(); - let mut ready_flow_keys: Vec = Vec::new(); + let mut ready_flow_keys = std::mem::take(&mut self.flow_keys_scratch); + ready_flow_keys.clear(); for event in ready { if !event.readable || event.token & PROTO_TAG_MASK != PROTO_TAG_ICMP { continue; @@ -2723,6 +2736,8 @@ impl SlirpBackend { } self.flow_table.remove(&flow_key); } + ready_flow_keys.clear(); + self.flow_keys_scratch = ready_flow_keys; } /// Build an Ethernet/IPv4/ICMP echo-reply frame addressed to the guest. 
@@ -2793,18 +2808,20 @@ impl SlirpBackend { fn relay_udp_flows(&mut self, ready: &[EpollEvent]) { let now = Instant::now(); // Per-flow connected sockets are closed by Drop when the entry leaves - // flow_table. - let mut stale: Vec<FlowKey> = Vec::new(); + // flow_table. The two flow-key Vecs here share `flow_keys_scratch`: + // the stale-sweep drains it, then the readiness loop refills it. + let mut flow_keys = std::mem::take(&mut self.flow_keys_scratch); + flow_keys.clear(); for (flow_key, entry) in &self.flow_table { let FlowKey::Udp(_) = flow_key else { continue }; let FlowEntry::Udp(udp_entry) = entry else { continue; }; if now.duration_since(udp_entry.last_activity) > UDP_IDLE_TIMEOUT { - stale.push(*flow_key); + flow_keys.push(*flow_key); } } - for flow_key in stale { + for flow_key in flow_keys.drain(..) { if let Some(FlowEntry::Udp(entry)) = self.flow_table.get(&flow_key) { self.token_to_key.remove(&entry.flow_token); self.epoll.unregister(entry.sock.as_raw_fd()).ok(); @@ -2812,7 +2829,6 @@ impl SlirpBackend { self.flow_table.remove(&flow_key); } - let mut flow_keys: Vec<FlowKey> = Vec::new(); for event in ready { if !event.readable || event.token & PROTO_TAG_MASK != PROTO_TAG_UDP { continue; @@ -2822,7 +2838,7 @@ impl SlirpBackend { }; flow_keys.push(flow_key); } - for flow_key in flow_keys { + for flow_key in flow_keys.drain(..)
{ let FlowKey::Udp(key) = flow_key else { continue; }; @@ -2849,6 +2865,7 @@ impl SlirpBackend { self.inject_to_guest.push(frame_bytes); } } + self.flow_keys_scratch = flow_keys; } /// Build an Ethernet/IPv4/UDP frame addressed to the guest, carrying a From a7e6296c8e38bbb7ece4965f64612eaab9a4bcea Mon Sep 17 00:00:00 2001 From: diego Date: Wed, 6 May 2026 21:29:20 -0300 Subject: [PATCH 16/19] fix(voidbox-network-bench): lift SLIRP rate limit for the CRR phase MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit Same fix as `crr_singleproc_bench`: the bench's CRR phase opens 30 connections in <1s, which trips the production SLIRP rate limiter (50 conn/s) and surfaces as a 2 s "crr echo channel receive error" instead of a real number. Use the new `Sandbox::local()` rate-limit knobs to lift both ceilings (max_connections_per_second + max_concurrent_connections) explicitly. Production sandboxes are unaffected — the lift is opt-in. --- src/bin/voidbox-network-bench/main.rs | 8 ++++++++ 1 file changed, 8 insertions(+) diff --git a/src/bin/voidbox-network-bench/main.rs b/src/bin/voidbox-network-bench/main.rs index a18ac09e..9a2cc434 100644 --- a/src/bin/voidbox-network-bench/main.rs +++ b/src/bin/voidbox-network-bench/main.rs @@ -192,6 +192,14 @@ FAST SMOKE RUN\n\ .from_env()? .memory_mb(BENCH_MEMORY_MB) .network(true) + // Production SLIRP defaults (50 connect/s, 64 concurrent) + // are anti-DoS limits sized for real workloads. The CRR + // bench intentionally opens hundreds of connections per + // second; without this lift it gets RST'd at the 51st + // connect, which manifests as a 2 s `crr echo channel + // receive error` instead of a real number. + .network_max_connections_per_second(u32::MAX) + .network_max_concurrent_connections(usize::MAX) .build()?; // Prime the VM (triggers boot + vsock handshake) before any timed work. 
From 73059ccd19d9513164d11665c0195e4d9d912ca6 Mon Sep 17 00:00:00 2001 From: diego Date: Wed, 6 May 2026 21:34:29 -0300 Subject: [PATCH 17/19] docs: scope architectural perf experiments stacked on #81 MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit Plan doc for the next perf round. After #81's user-space alloc reductions exhausted (-90% allocs/iter, p50 unchanged), the remaining floor is kernel↔userspace transitions, MMIO exits, and single-queue serialization. Three experiments in scope, ranked by risk × payoff: 1. io_uring for SLIRP host-socket I/O — start here 2. splice() / sendfile() zero-copy on bulk paths 3. MSI-X virtio + multi-queue for vCPU scaling Non-goal: TAP + passt-style host bypass. Routing through an external passt would close the latency gap to passt but moves the DNS interception, port-forwarding, deny-list, and rate-limiting feature surface out of voidbox — and loses the in-process observability we currently get from instrumenting SLIRP directly. Full SLIRP-path observability is a hard requirement. Each experiment lands as its own commit, gated behind a Cargo feature so the #81 baseline can A/B against it without a revert. Measurements use the harness shipped in #81. --- docs/perf-architectural-experiments.md | 115 +++++++++++++++++++++++++ 1 file changed, 115 insertions(+) create mode 100644 docs/perf-architectural-experiments.md diff --git a/docs/perf-architectural-experiments.md b/docs/perf-architectural-experiments.md new file mode 100644 index 00000000..d4bd5fcb --- /dev/null +++ b/docs/perf-architectural-experiments.md @@ -0,0 +1,115 @@ +# SLIRP perf — architectural experiments + +Stacked on top of #81. After the heaptrack-driven user-space alloc +reductions exhausted (-90% allocs/iter, p50 unchanged at ~275 µs), the +remaining wall-clock floor is dominated by: + +1. 
**Kernel ↔ userspace transitions** — per-packet `read()`/`write()` on + host sockets, one syscall per packet, serial in `net_poll_thread`. +2. **Per-vCPU MMIO exits** for virtio doorbell writes (already partially + addressed by `KVM_IOEVENTFD` for TX-notify; RX-notify and other + queues still exit). +3. **Single-queue serialization** through `net_poll_thread`'s single + epoll loop, even with multi-vCPU guests. + +This document tracks the architectural experiments that target those +floors, ranked by risk × payoff. Each experiment lands as its own +commit with a measurement vs the #81 baseline attached. + +## Non-goal: TAP / passt-style host bypass + +Dropping SLIRP and routing through TAP + an external passt instance +would close the latency gap to passt itself, but it would move the +DNS interception, port-forwarding, deny-list, and rate-limiting +feature surface out of voidbox into a separate process — and we lose +the in-process observability we currently get from instrumenting +SLIRP directly. **Full SLIRP-path observability is a hard +requirement**, so passt-style bypass is out of scope. + +## Experiments + +### 1. `io_uring` for SLIRP host-socket I/O — start here + +**Current path:** per-flow `recv()` + `sendto()` on host sockets, +one syscall per packet, called from `net_poll_thread` in serial. +On CRR ~5 syscalls/iter; on bulk transfers it's the dominant cost. + +**Proposal:** add an `io_uring` instance to the SLIRP backend, +side-by-side with the existing `EpollDispatch`: + +- After each `epoll_wait`, submit a batched `IORING_OP_RECV` SQE + for every readable host socket — one SQE per flow with new + data, all submitted in a single syscall. +- Submit `IORING_OP_SEND` SQEs for the outbound frames the SLIRP + stack builds, again batched into a single submission. +- Drain CQEs in the relay loop instead of calling `recv` / + `sendto` directly. + +**Expected:** ~10–30 µs CRR p50 reduction (5 syscalls per CRR +× ~3–5 µs/syscall × batching savings). 
Measurable via +`examples/crr_singleproc_bench`. + +**Risk:** lowest — the change is localized to the relay layer's +read/write helpers. Falls back to the existing path behind a +build feature so we can A/B. + +### 2. `splice()` / `sendfile()` zero-copy on bulk paths + +**Current path:** guest virtio TX ring → vmm copies into Rust +`Vec` → SLIRP/smoltcp → kernel send buffer of host socket. +The middle copy is avoidable for direct-pipe flows where guest +payload is destined to a host TCP socket without header rewrites. + +**Proposal:** `splice()` between the host-socket fd and a pipe (then +to next stage) eliminates one userspace copy. Only works for +fd-to-fd, so SLIRP NAT rewriting defeats it for the header path; +applies to the **payload bytes only** if we route header building +through smoltcp metadata and pipe just the bulk payload. + +**Expected:** +10–20% throughput on `tcp_throughput_g2h_mbps`. +**Risk:** medium. Plumbing pipe fds through the relay state +machine is non-trivial; needs care around partial writes and +backpressure. + +### 3. MSI-X virtio + multi-queue for vCPU scaling + +**Current path:** virtio-net uses a single RX queue + single TX +queue, both serviced by `net_poll_thread`. With multi-vCPU +guests, the contention is on `net_poll_thread`'s single epoll +loop. + +**Proposal:** add MSI-X support to `src/vmm/arch/x86_64/` (currently +INTx only) and expose `VIRTIO_NET_F_MQ` so the guest can spin up +per-CPU queue pairs. Host side fans out queues to multiple poll +threads, each on its own epoll instance. + +**Expected:** +50–100% throughput on multi-vCPU sandboxes. No +impact on single-vCPU CRR microbenches. +**Risk:** highest of the three. Touches IRQ delivery, `KVM_IRQFD` +wiring, and the IRQ path is HW-feature-gated; CI workers without +MSI-X support need a fallback. 
+ +## Tooling + +All experiments measured with the perf-harness from #81: + +| Tool | Signal | +|---|---| +| `examples/crr_singleproc_bench` | CRR p50/p99 (real NAT path) | +| `voidbox-network-bench` | g2h throughput, RR p50/p99 | +| `heaptrack` | allocation regression check | +| `tools/perf-harness/bench-pasta.py` | pasta reference number | +| `tools/perf-harness/bench-qemu-slirp.sh` | qemu+libslirp / qemu+passt cross-check | + +## Methodology + +1. Each experiment is a single commit gated behind a Cargo feature + (`io-uring`, `splice-zerocopy`, `multi-queue`) so the baseline + can A/B against it without a revert. +2. Commit message includes the before/after numbers from + `crr_singleproc_bench --iterations 100` and + `voidbox-network-bench --iterations 3`. +3. heaptrack run after each commit confirms no alloc regression + vs the round-2 number from #81 (~41 allocs/iter on CRR). +4. If a commit doesn't move the needle, it's reverted before the + next experiment lands so the diff stays minimal. From e4ff692ad7e2f6a8f5570357ffe23d6fa07cd85b Mon Sep 17 00:00:00 2001 From: diego Date: Wed, 6 May 2026 21:42:04 -0300 Subject: [PATCH 18/19] perf(slirp): scaffold io_uring batching primitive (feature-gated) MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit First commit on the architectural-experiments branch (#83). Adds a `UringBatch` wrapper around `io_uring::IoUring` with the submit / drain shape the SLIRP relay will use to batch host-socket recv / send into single `io_uring_enter` round-trips. Key shape: - One `UringBatch` is single-owner: the SLIRP `net_poll_thread` constructs and drives one. No locking, no cross-thread sharing. - SQEs are tagged with `(UringOp, correlation_id)` packed into `user_data` so the completion drain routes a CQE back to its originating flow without a side table. Low 32 bits = correlation id, top 32 bits = op tag. 
- `submit_recv` / `submit_send` are `unsafe` because the kernel references the user buffer asynchronously; the caller's safety contract requires `buf` to outlive the matching CQE. - The existing `EpollDispatch` keeps owning the readiness signal — io_uring replaces only the data-plane syscalls, not the wake-up. Two layers stay separable so the feature can be toggled off without touching the relay state machine. Behavior unchanged: nothing wires this in yet. Cargo feature `io-uring` (off by default) gates both the new module and the `io-uring = "0.7"` dependency. Module is `#![allow(dead_code)]` for now; the next commit on this branch wires the relay TCP recv / send paths through it and removes the allow. Tests: - 4 unit tests in `src/network/uring.rs` cover user-data round trip + a real `submit_send` -> `submit_recv` cycle across a `socketpair` (skipped on kernels without io_uring). - `cargo test --features io-uring --lib`: 381 passed. - `cargo test --test network_baseline` (default features): 24/24. - `cargo clippy --all-targets [-- -D warnings]` clean both with and without the feature. Methodology per `docs/perf-architectural-experiments.md`: each experiment lands as one feature-gated commit so the #81 baseline can A/B against it without a revert. This is the infrastructure commit; the next one wires + measures. 
--- Cargo.lock | 12 ++ Cargo.toml | 11 ++ src/network/mod.rs | 2 + src/network/uring.rs | 332 +++++++++++++++++++++++++++++++++++++++++++ 4 files changed, 357 insertions(+) create mode 100644 src/network/uring.rs diff --git a/Cargo.lock b/Cargo.lock index 868e1c21..2f5bd65e 100644 --- a/Cargo.lock +++ b/Cargo.lock @@ -1196,6 +1196,17 @@ dependencies = [ "web-time", ] +[[package]] +name = "io-uring" +version = "0.7.12" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "4d09b98f7eace8982db770e4408e7470b028ce513ac28fecdc6bf4c30fe92b62" +dependencies = [ + "bitflags 2.11.0", + "cfg-if", + "libc", +] + [[package]] name = "ipnet" version = "2.11.0" @@ -3001,6 +3012,7 @@ dependencies = [ "hyper-util", "hyperlocal", "indicatif", + "io-uring", "ipnet", "kvm-bindings", "kvm-ioctls", diff --git a/Cargo.toml b/Cargo.toml index af267aec..c3e321aa 100644 --- a/Cargo.toml +++ b/Cargo.toml @@ -118,6 +118,12 @@ rustc-hash = "2" # `Arc>` device lock on the hot path. crossbeam-queue = "0.3" +# Linux io_uring bindings. Gated behind the `io-uring` Cargo +# feature so the baseline epoll+read/write path remains the +# default; the experiment branch toggles this on to A/B against +# the user-space alloc reductions from PR #81. +io-uring = { version = "0.7", optional = true } + # --- macOS-only dependencies --- [target.'cfg(target_os = "macos")'.dependencies] # Objective-C 2.0 bindings (auto-generated from Apple frameworks) @@ -149,6 +155,11 @@ opentelemetry = ["dep:opentelemetry", "dep:opentelemetry_sdk", "dep:opentelemetr # Expose internal SlirpBackend helpers (insert_synthetic_synsent_entry, etc.) # for use in benches/. Never enable in production builds. bench-helpers = [] +# Use io_uring for SLIRP host-socket recv/send batching instead of +# per-syscall read/write. Linux-only; falls back to the standard +# path on macOS or when the running kernel lacks io_uring support. +# Off by default while the experiment is being measured. 
+io-uring = ["dep:io-uring"] [[bin]] name = "voidbox" diff --git a/src/network/mod.rs b/src/network/mod.rs index fa498280..6d4b430a 100644 --- a/src/network/mod.rs +++ b/src/network/mod.rs @@ -9,6 +9,8 @@ pub(crate) mod epoll_dispatch; pub mod nat; pub mod slirp; +#[cfg(all(target_os = "linux", feature = "io-uring"))] +pub(crate) mod uring; use std::ffi::CString; use std::io; diff --git a/src/network/uring.rs b/src/network/uring.rs new file mode 100644 index 00000000..dbf0e07f --- /dev/null +++ b/src/network/uring.rs @@ -0,0 +1,332 @@ +// The `submit_recv`/`submit_send` API and surrounding types are +// scaffold for the SLIRP relay wiring that lands in the next +// commit on this experiment branch. Until that commit, the lib +// build sees them as unused — the unit tests below exercise every +// item, so coverage is not the issue. +#![allow(dead_code)] + +//! `io_uring` batching primitive for SLIRP host-socket I/O. +//! +//! Per-packet `recv` / `sendto` against host sockets is one syscall +//! per direction per packet. On a CRR workload that's ~5 syscalls +//! per iteration on the SLIRP relay path; on bulk-throughput it +//! dominates. This module wraps an [`IoUring`] instance the +//! relay submits batched `IORING_OP_RECV` / `IORING_OP_SEND` +//! SQEs into and drains CQEs from in a single syscall round-trip. +//! +//! # Threading +//! +//! Each [`UringBatch`] is single-owner: the SLIRP `net_poll_thread` +//! constructs and drives one. No locking, no cross-thread sharing. +//! The relay submits a batch after each `epoll_wait` and drains it +//! before the next. +//! +//! # Why epoll stays +//! +//! The existing [`crate::network::epoll_dispatch::EpollDispatch`] +//! still owns the readiness signal — io_uring replaces only the +//! data-plane syscalls, not the wake-up. Keeping the two layers +//! separate means io_uring can be feature-gated off without +//! touching the relay's flow-management state machine. +//! +//! 
[`IoUring`]: io_uring::IoUring + +use std::io; +use std::os::fd::RawFd; + +use io_uring::{opcode, types, IoUring}; + +/// Maximum SQE / CQE entries per [`UringBatch`]. Sized to comfortably +/// hold one submission per active SLIRP flow on a typical CRR cycle +/// without reallocation; oversized batches pay the kernel's submission +/// cost twice rather than failing. +const URING_QUEUE_DEPTH: u32 = 256; + +/// Per-submission token tagging the kind of operation a CQE +/// completes. Encoded into [`io_uring::squeue::Entry::user_data`] +/// so the completion drain can route a CQE back to the caller +/// without per-flow side tables. +/// +/// The low 32 bits carry the caller's correlation id (typically a +/// flow token); the top 32 bits encode this enum. +#[derive(Debug, Clone, Copy, PartialEq, Eq)] +pub(crate) enum UringOp { + /// `IORING_OP_RECV` against a host TCP / UDP socket. CQE + /// `result` carries the byte count or `-errno`. + Recv, + /// `IORING_OP_SEND` against a host TCP / UDP socket. CQE + /// `result` carries bytes written or `-errno`. + Send, +} + +impl UringOp { + const TAG_RECV: u64 = 1; + const TAG_SEND: u64 = 2; + + /// Encodes the op + correlation id as a single `u64` user-data + /// field for an SQE. The [`UringBatch`] inverse is + /// [`UringBatch::decode_user_data`]. + fn encode(self, correlation_id: u32) -> u64 { + let tag = match self { + UringOp::Recv => Self::TAG_RECV, + UringOp::Send => Self::TAG_SEND, + }; + (tag << 32) | u64::from(correlation_id) + } +} + +/// Result of draining a single completion from the ring. +/// +/// The caller matches on this to dispatch the bytes count (or +/// errno) back to the originating flow keyed by `correlation_id`. +#[derive(Debug, Clone, Copy)] +pub(crate) struct UringCompletion { + /// Operation kind the SQE was tagged with. + pub op: UringOp, + /// Caller-supplied correlation id (e.g. SLIRP flow token). 
+ pub correlation_id: u32, + /// CQE result field: positive byte count, `0` for EOF, or + /// `-errno` on failure. Decoded from the kernel's signed + /// 32-bit return. + pub result: i32, +} + +/// Owns a single [`IoUring`] instance and serves as the submit / +/// complete entry point for the SLIRP relay. +/// +/// # Examples +/// +/// ```no_run +/// # #[cfg(all(target_os = "linux", feature = "io-uring"))] { +/// use void_box::network::uring::{UringBatch, UringOp}; +/// let mut batch = UringBatch::new().expect("kernel supports io_uring"); +/// let mut buf = vec![0u8; 1500]; +/// // SAFETY: caller guarantees `buf` lives until the matching CQE drains. +/// unsafe { +/// batch +/// .submit_recv(/*fd=*/ 3, &mut buf, /*correlation_id=*/ 42) +/// .expect("submission queue not full"); +/// } +/// batch.submit_and_wait(0).expect("kernel reachable"); +/// while let Some(_completion) = batch.drain_one() { /* … */ } +/// # } +/// ``` +pub(crate) struct UringBatch { + ring: IoUring, +} + +impl UringBatch { + /// Creates a new ring sized to [`URING_QUEUE_DEPTH`]. + /// + /// # Errors + /// + /// Returns the underlying [`io::Error`] if the kernel's + /// `io_uring_setup` syscall fails — typically because the + /// host kernel predates io_uring (Linux ≤ 5.0) or + /// `kernel.io_uring_disabled` is set. + pub(crate) fn new() -> io::Result<Self> { + let ring = IoUring::new(URING_QUEUE_DEPTH)?; + Ok(Self { ring }) + } + + /// Submits an `IORING_OP_RECV` against `fd` reading into `buf`. + /// + /// The SQE is tagged with `correlation_id` so the matching + /// CQE drained later can be routed back to its originating + /// flow. + /// + /// # Errors + /// + /// Returns [`io::ErrorKind::WouldBlock`] when the submission + /// queue is full — the caller submits the pending batch via + /// [`Self::submit_and_wait`] and retries. + /// + /// # Safety + /// + /// `buf` must remain valid until the matching CQE drains via + /// [`Self::drain_one`].
The kernel writes into the buffer + /// asynchronously; dropping or reusing it before completion + /// is undefined behavior. + pub(crate) unsafe fn submit_recv( + &mut self, + fd: RawFd, + buf: &mut [u8], + correlation_id: u32, + ) -> io::Result<()> { + let entry = opcode::Recv::new(types::Fd(fd), buf.as_mut_ptr(), buf.len() as u32) + .build() + .user_data(UringOp::Recv.encode(correlation_id)); + let mut sq = self.ring.submission(); + // SAFETY: the `Recv` SQE only references `buf`'s pointer; the + // caller's safety contract on this fn forwards the same + // lifetime requirement to the kernel. + unsafe { sq.push(&entry) }.map_err(|_| { + io::Error::new(io::ErrorKind::WouldBlock, "io_uring submission queue full") + })?; + Ok(()) + } + + /// Submits an `IORING_OP_SEND` writing `buf` to `fd`. + /// + /// # Errors + /// + /// Returns [`io::ErrorKind::WouldBlock`] when the submission + /// queue is full. + /// + /// # Safety + /// + /// `buf` must remain valid until the matching CQE drains. + pub(crate) unsafe fn submit_send( + &mut self, + fd: RawFd, + buf: &[u8], + correlation_id: u32, + ) -> io::Result<()> { + let entry = opcode::Send::new(types::Fd(fd), buf.as_ptr(), buf.len() as u32) + .build() + .user_data(UringOp::Send.encode(correlation_id)); + let mut sq = self.ring.submission(); + // SAFETY: `Send` SQE references `buf`'s pointer for the + // lifetime of the in-flight operation; the caller's safety + // contract forwards that requirement to the kernel. + unsafe { sq.push(&entry) }.map_err(|_| { + io::Error::new(io::ErrorKind::WouldBlock, "io_uring submission queue full") + })?; + Ok(()) + } + + /// Submits the queued SQEs and waits for `min_complete` CQEs + /// to land. `min_complete = 0` returns immediately after + /// submission. + /// + /// # Errors + /// + /// Returns the underlying [`io::Error`] from + /// `io_uring_enter`. 
+ pub(crate) fn submit_and_wait(&mut self, min_complete: usize) -> io::Result<usize> { + self.ring.submit_and_wait(min_complete) + } + + /// Drains one completion if available. Returns `None` when + /// the completion queue is empty. + pub(crate) fn drain_one(&mut self) -> Option<UringCompletion> { + let cqe = self.ring.completion().next()?; + Some(Self::decode_user_data(cqe.user_data(), cqe.result())) + } + + fn decode_user_data(user_data: u64, result: i32) -> UringCompletion { + let tag = user_data >> 32; + let correlation_id = (user_data & 0xFFFF_FFFF) as u32; + let op = match tag { + UringOp::TAG_RECV => UringOp::Recv, + UringOp::TAG_SEND => UringOp::Send, + other => panic!("uring: unknown op tag {other} in user_data {user_data:#x}"), + }; + UringCompletion { + op, + correlation_id, + result, + } + } +} + +#[cfg(test)] +mod tests { + use super::*; + + #[test] + fn user_data_round_trip_recv() { + let encoded = UringOp::Recv.encode(0xDEAD_BEEF); + let decoded = UringBatch::decode_user_data(encoded, 1500); + let UringCompletion { + op, + correlation_id, + result, + } = decoded; + assert_eq!(op, UringOp::Recv); + assert_eq!(correlation_id, 0xDEAD_BEEF); + assert_eq!(result, 1500); + } + + #[test] + fn user_data_round_trip_send() { + let encoded = UringOp::Send.encode(0); + let decoded = UringBatch::decode_user_data(encoded, -11); + let UringCompletion { + op, + correlation_id, + result, + } = decoded; + assert_eq!(op, UringOp::Send); + assert_eq!(correlation_id, 0); + assert_eq!(result, -11); + } + + #[test] + fn ring_constructs_on_supported_kernel() { + let Ok(mut batch) = UringBatch::new() else { + return; + }; + let submitted = batch.submit_and_wait(0).expect("submit_and_wait succeeds"); + assert_eq!(submitted, 0); + assert!(batch.drain_one().is_none()); + } + + /// Exercises `submit_send` + `submit_recv` end-to-end across a + /// connected `socketpair`. Skipped on kernels without io_uring.
+ #[test] + fn submit_send_then_recv_round_trips_via_socketpair() { + use std::os::fd::AsRawFd; + + let Ok(mut batch) = UringBatch::new() else { + return; + }; + + let (a, b) = std::os::unix::net::UnixStream::pair().expect("socketpair"); + let send_payload: [u8; 5] = *b"hello"; + let mut recv_buf = [0u8; 16]; + + // SAFETY: `send_payload` and `recv_buf` outlive the + // submit_and_wait call below — the kernel only references + // them while the SQEs are in flight, and we hold those + // borrows until both CQEs land. + unsafe { + batch + .submit_send(a.as_raw_fd(), &send_payload, /*correlation_id=*/ 7) + .expect("submit_send not full"); + batch + .submit_recv(b.as_raw_fd(), &mut recv_buf, /*correlation_id=*/ 9) + .expect("submit_recv not full"); + } + + let completed = batch + .submit_and_wait(2) + .expect("kernel completes both SQEs"); + assert_eq!(completed, 2); + + let mut send_seen = false; + let mut recv_seen = false; + while let Some(cqe) = batch.drain_one() { + let UringCompletion { + op, + correlation_id, + result, + } = cqe; + match op { + UringOp::Send => { + assert_eq!(correlation_id, 7); + assert_eq!(result, send_payload.len() as i32); + send_seen = true; + } + UringOp::Recv => { + assert_eq!(correlation_id, 9); + assert_eq!(result, send_payload.len() as i32); + assert_eq!(&recv_buf[..result as usize], &send_payload); + recv_seen = true; + } + } + } + assert!(send_seen, "send CQE drained"); + assert!(recv_seen, "recv CQE drained"); + } +} From b65080d54ef51046037fb385cd47395961fcdf8c Mon Sep 17 00:00:00 2001 From: diego Date: Thu, 7 May 2026 13:47:42 -0300 Subject: [PATCH 19/19] bench: add multi-flow concurrent CRR microbench MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit Companion to `crr_singleproc_bench`: drives M concurrent crr-client processes in the same guest so the SLIRP relay sees N>1 ready flows per `net_poll_thread` cycle. 
The single-flow microbench can't see io_uring batching or multi-queue wins because there's nothing to batch / parallelize with one ready flow at a time; this bench is the workload the architectural experiments on this branch (#83) need. Per-flow `crr-client` writes its summary line to its own `/tmp/crr_results/$i.txt`; the trailing shell loop concatenates all M lines for the host to parse. Aggregation reports median-of-p50s, max p99, mean-of-means, and aggregate qps. Note: busybox-static lacks `seq`, so the flow-id list is materialized on the host and inlined into the shell command. ## Baseline (this branch's tip = #81 + io_uring scaffold) Single net_poll_thread, no architectural changes wired: | M | Median p50 | Max p99 | Aggregate qps | |---|-----------:|--------:|--------------:| | 1 | 275 µs | ~2 ms | ~3636 | | 2 | 473 µs | 12.9 ms | 2173 | | 4 | 732 µs | 13.2 ms | 2370 | | 8 | 2043 µs | 14.5 ms | 2242 | Reading: - Aggregate qps saturates at ~2200-2400 regardless of M — the single net_poll_thread is the bottleneck. - Per-flow p50 grows ~linearly with M (M=8 each flow takes 7.4× the M=1 p50). - p99 jumps to 12-14 ms at M=2 already; tail-latency is dominated by per-flow head-of-line blocking through the single epoll loop. This is exactly the workload io_uring batching, splice, and multi-queue should move. The io_uring wiring lands in the next commit on this branch with measurements against this table. --- examples/crr_concurrent_bench.rs | 273 +++++++++++++++++++++++++++++++ 1 file changed, 273 insertions(+) create mode 100644 examples/crr_concurrent_bench.rs diff --git a/examples/crr_concurrent_bench.rs b/examples/crr_concurrent_bench.rs new file mode 100644 index 00000000..a022178c --- /dev/null +++ b/examples/crr_concurrent_bench.rs @@ -0,0 +1,273 @@ +//! crr_concurrent_bench — voidbox-side multi-flow TCP CRR microbench. +//! +//! Companion to `crr_singleproc_bench`. That one isolates the +//! single-flow NAT-path floor; this one drives **M concurrent +//! 
crr-client processes** in the same guest so the SLIRP relay +//! sees N>1 ready flows per `net_poll_thread` cycle — the +//! workload io_uring batching, splice-zerocopy, and multi-queue +//! all need to actually win. +//! +//! Each guest-side flow runs its own `N`-iteration loop against +//! the same host listener port. The host accepts in a single +//! thread that spawns a tiny per-connection handler (recv 1 B, +//! send 1 B, close), so up to `M` concurrent connections make +//! progress at once. +//! +//! Per-flow p50/p99 are reported alongside an aggregate +//! throughput (`M*N` iterations divided by wall-clock). +//! +//! # Examples +//! +//! ```ignore +//! gcc -O2 -static -o /tmp/crr-client tools/perf-harness/crr-client.c +//! cargo run --release --example crr_concurrent_bench -- \ +//! --concurrency 4 --iterations 100 +//! ``` +//! +//! Requires the same env vars as `voidbox-network-bench`: +//! `VOID_BOX_KERNEL`, `VOID_BOX_INITRAMFS`. + +use std::net::TcpListener; +use std::sync::atomic::{AtomicUsize, Ordering}; +use std::sync::Arc; +use std::thread; +use std::time::{Duration, Instant}; + +use clap::Parser; +use void_box::backend::MountConfig; +use void_box::sandbox::Sandbox; + +const HOST_LOOPBACK_FROM_GUEST: &str = "10.0.2.2"; +const HOST_ACCEPT_DEADLINE: Duration = Duration::from_secs(120); +const HOST_ACCEPT_POLL: Duration = Duration::from_micros(50); + +#[derive(Parser)] +#[command(version, about)] +struct Cli { + /// Number of concurrent guest-side crr-client processes. + #[arg(long, default_value_t = 4)] + concurrency: u32, + /// CRR iterations per concurrent flow (each client runs `iterations` rounds). + #[arg(long, default_value_t = 100)] + iterations: u32, + /// Host path to the static crr-client binary. + #[arg(long, default_value = "/tmp/crr-client")] + bench_binary: String, + /// Memory size for the guest VM (MB). 
+ #[arg(long, default_value_t = 1024)] + memory_mb: usize, +} + +#[derive(Debug, Clone, Copy)] +struct FlowSummary { + flow_id: u32, + iterations: u32, + p50_ns: u64, + p99_ns: u64, + mean_ns: u64, +} + +#[tokio::main(flavor = "multi_thread")] +async fn main() -> Result<(), Box<dyn std::error::Error>> { + let cli = Cli::parse(); + let bench_binary = std::path::PathBuf::from(&cli.bench_binary); + if !bench_binary.exists() { + return Err(format!( + "bench binary not found: {} (compile with `gcc -static -o /tmp/crr-client tools/perf-harness/crr-client.c`)", + cli.bench_binary + ) + .into()); + } + let bench_binary_dir = bench_binary + .parent() + .ok_or("bench-binary has no parent dir")? + .to_string_lossy() + .into_owned(); + let bench_binary_name = bench_binary + .file_name() + .ok_or("bench-binary has no file name")? + .to_string_lossy() + .into_owned(); + + let listener = TcpListener::bind("127.0.0.1:0")?; + let host_port = listener.local_addr()?.port(); + listener.set_nonblocking(true)?; + + let total_expected: usize = (cli.concurrency as usize) * (cli.iterations as usize); + let accepts_done = Arc::new(AtomicUsize::new(0)); + + let server_thread = thread::spawn({ + let accepts_done = Arc::clone(&accepts_done); + move || { + // Host accept-and-handle loop. Spawns a per-connection + // worker thread for each accepted socket so up to + // `concurrency` flows can be in-flight at once — without + // this fan-out the host serializes M clients, which + // defeats the multi-flow signal we're trying to measure.
+ let deadline = Instant::now() + HOST_ACCEPT_DEADLINE; + while accepts_done.load(Ordering::Relaxed) < total_expected && Instant::now() < deadline + { + let (mut conn, _) = match listener.accept() { + Ok(pair) => pair, + Err(error) if error.kind() == std::io::ErrorKind::WouldBlock => { + thread::sleep(HOST_ACCEPT_POLL); + continue; + } + Err(_) => break, + }; + conn.set_nonblocking(false).ok(); + let accepts_done = Arc::clone(&accepts_done); + thread::spawn(move || { + let mut buf = [0u8; 1]; + let _ = std::io::Read::read(&mut conn, &mut buf); + let _ = std::io::Write::write_all(&mut conn, b"x"); + accepts_done.fetch_add(1, Ordering::Relaxed); + }); + } + } + }); + + let sandbox = Sandbox::local() + .from_env()? + .memory_mb(cli.memory_mb) + .network(true) + // Same SLIRP-default lift as `crr_singleproc_bench` — at M=4 + // concurrency × 100 iterations the bench would otherwise trip + // the 50 conn/s rate limiter and surface as a connect-refused + // failure mid-run. + .network_max_connections_per_second(u32::MAX) + .network_max_concurrent_connections(usize::MAX) + .mount(MountConfig { + host_path: bench_binary_dir.clone(), + guest_path: "/tmp/host".into(), + read_only: true, + }) + .build()?; + + eprintln!( + "VM booted; running {} concurrent flows × {} CRRs each ({} total)...", + cli.concurrency, cli.iterations, total_expected + ); + let probe = sandbox.exec("sh", &["-c", ":"]).await?; + if !probe.success() { + return Err("VM probe exec failed".into()); + } + + // Kick off `concurrency` crr-client processes in parallel from + // a single guest shell, each writing its own summary line into + // a per-flow file. `wait` blocks until every backgrounded + // process exits; the trailing loop concatenates the M lines + // for the host to parse. The flow-id list is materialized on + // the host because busybox-static (the guest shell) lacks + // `seq`. 
+ let mut flow_ids = String::new(); + for flow_id in 1..=cli.concurrency { + if !flow_ids.is_empty() { + flow_ids.push(' '); + } + flow_ids.push_str(&flow_id.to_string()); + } + let cmd = format!( + "set -eu; rm -rf /tmp/crr_results; mkdir -p /tmp/crr_results; \ + for i in {flow_ids}; do \ + /tmp/host/{name} {host} {port} {iterations} > /tmp/crr_results/$i.txt & \ + done; \ + wait; \ + for i in {flow_ids}; do echo \"$i $(cat /tmp/crr_results/$i.txt)\"; done", + flow_ids = flow_ids, + name = bench_binary_name, + host = HOST_LOOPBACK_FROM_GUEST, + port = host_port, + iterations = cli.iterations, + ); + let wall_start = Instant::now(); + let output = sandbox.exec("sh", &["-c", &cmd]).await?; + let wall_elapsed = wall_start.elapsed(); + if !output.success() { + eprintln!("guest stderr: {}", output.stderr_str()); + return Err(format!("guest exec failed: {:?}", output.exit_code).into()); + } + + server_thread.join().unwrap_or(()); + let host_accepts = accepts_done.load(Ordering::Relaxed); + eprintln!("host accepts: {host_accepts}/{total_expected}"); + + let stdout = output.stdout_str().to_string(); + let mut flows: Vec = Vec::new(); + for line in stdout.lines() { + let parts: Vec<&str> = line.split_whitespace().collect(); + if parts.len() != 5 { + return Err(format!("unexpected guest stdout line: {line:?}").into()); + } + let flow_id: u32 = parts[0].parse()?; + let iterations: u32 = parts[1].parse()?; + let p50_ns: u64 = parts[2].parse()?; + let p99_ns: u64 = parts[3].parse()?; + let mean_ns: u64 = parts[4].parse()?; + flows.push(FlowSummary { + flow_id, + iterations, + p50_ns, + p99_ns, + mean_ns, + }); + } + + if flows.len() != cli.concurrency as usize { + return Err(format!( + "expected {} flow summaries, got {}", + cli.concurrency, + flows.len() + ) + .into()); + } + + let mut p50s_us: Vec = Vec::with_capacity(flows.len()); + let mut p99s_us: Vec = Vec::with_capacity(flows.len()); + let mut means_us: Vec = Vec::with_capacity(flows.len()); + for flow in &flows { + 
p50s_us.push(flow.p50_ns / 1000); + p99s_us.push(flow.p99_ns / 1000); + means_us.push(flow.mean_ns / 1000); + } + p50s_us.sort_unstable(); + p99s_us.sort_unstable(); + means_us.sort_unstable(); + let mid = p50s_us.len() / 2; + let median_of_p50s = p50s_us[mid]; + let max_p99 = *p99s_us.last().expect("non-empty"); + let mean_of_means: u64 = means_us.iter().sum::() / (means_us.len() as u64); + + let total_iters = total_expected as f64; + let aggregate_qps = total_iters / wall_elapsed.as_secs_f64(); + + println!(); + println!( + "voidbox concurrent CRR: {} flows × {} iterations ({:.3}s wall):", + cli.concurrency, + cli.iterations, + wall_elapsed.as_secs_f64() + ); + for flow in &flows { + let FlowSummary { + flow_id, + iterations, + p50_ns, + p99_ns, + mean_ns, + } = *flow; + println!( + " flow {flow_id} ({iterations} iters): p50={} µs p99={} µs mean={} µs", + p50_ns / 1000, + p99_ns / 1000, + mean_ns / 1000, + ); + } + println!(); + println!(" median-of-p50s: {median_of_p50s} µs"); + println!(" max p99: {max_p99} µs"); + println!(" mean-of-means: {mean_of_means} µs"); + println!(" aggregate qps: {aggregate_qps:.0} CRRs/s"); + + Ok(()) +}