diff --git a/.github/workflows/startup-bench.yml b/.github/workflows/startup-bench.yml index 2b8a5b20..d39926bb 100644 --- a/.github/workflows/startup-bench.yml +++ b/.github/workflows/startup-bench.yml @@ -1,13 +1,19 @@ name: Startup Bench -# Two layers, both run in this workflow: +# Three layers, all run in this workflow: # -# 1. **Divan micro-bench** — `cargo bench --bench startup`. Pure-compute -# hot paths (Message::serialize/deserialize, kernel_cmdline, -# getrandom). No KVM, no nested virt, no L2 boot — same wall-clock -# cost on every Linux runner. Cheap regression gate. +# 1. **Divan micro-bench (startup)** — `cargo bench --bench startup`. +# Pure-compute hot paths (Message::serialize/deserialize, +# kernel_cmdline, getrandom). No KVM, no nested virt, no L2 boot — +# same wall-clock cost on every Linux runner. Cheap regression gate. # -# 2. **Wall-clock harness** — `voidbox-startup-bench --iters 20 +# 2. **Divan micro-bench (network)** — `cargo bench --bench network`. +# SLIRP hot paths (process_syn, poll_idle, process_arp_request, +# poll_with_n_flows, dns_cache_hit, dns_cache_miss). Also pure +# compute, no nested virt — stable regression gate for the network +# stack without requiring KVM or a real VM boot. +# +# 3. **Wall-clock harness** — `voidbox-startup-bench --iters 20 # --breakdown`. Boots a real KVM VM through the slim kernel + test # initramfs and measures cold-boot + warm-restore p50/p95/p99 end # to end. Informational only on this runner: the GitHub-hosted @@ -161,14 +167,37 @@ jobs: echo '```' } >> "$GITHUB_STEP_SUMMARY" - - name: Run wall-clock harness (informational) - # No threshold gate — Azure nested-virt is slower than the - # bare-metal targets the verify-skill thresholds were tuned for. - # `continue-on-error` keeps the workflow green even if the - # harness fails outright (e.g. missing /dev/vhost-vsock on a - # future runner image change). The artifact preserves the log - # either way. 
- continue-on-error: true + - name: Run network divan micro-bench (regression gate) + # Same regression-detection role as the startup divan step, but + # for SLIRP hot paths: process_syn, poll_idle, process_arp_request, + # poll_with_n_flows, dns_cache_hit, dns_cache_miss. Pure compute, + # no nested virt — stable across CI hosts. Output captured for + # artifact + step summary. + run: | + cargo bench --bench network 2>&1 | tee target/tmp/divan-network.log + + { + echo + echo "## Divan network micro-bench (cargo bench --bench network)" + echo + echo '```' + grep -E 'fastest|median|slowest|^[a-z_]+\.' target/tmp/divan-network.log \ + || tail -40 target/tmp/divan-network.log + echo '```' + } >> "$GITHUB_STEP_SUMMARY" + + - name: Run wall-clock harness (strict) + # NO `continue-on-error` — was previously silently masking the + # vhost/userspace vsock backend mismatch on warm restore (root + # cause: `capture_snapshot` was building a Sandbox without + # `.enable_snapshots(true)` so vhost-vsock was selected, but + # `from_snapshot` always restores into userspace vsock; vring + # state lives in the kernel's vhost-vsock module and isn't part + # of our snapshot, so the restored userspace device couldn't + # accept connections and every host connect timed out). + # Threshold gate stays informal — Azure nested-virt is slower + # than the bare-metal Fedora 43 / KVM targets the verify-skill + # thresholds were tuned for, but the harness MUST exit 0. env: ITERS: ${{ inputs.iters || '20' }} VOID_BOX_KERNEL: ${{ github.workspace }}/target/vmlinux-slim-x86_64 @@ -194,10 +223,51 @@ jobs: echo '```' } >> "$GITHUB_STEP_SUMMARY" + - name: Build voidbox-network-bench (release) + # Network wall-clock harness: boots one VM with `network(true)`, + # measures TCP throughput, RR/CRR latency, UDP DNS qps, and ICMP + # RR latency. Mirror the startup harness build step. 
+ run: cargo build --release --bin voidbox-network-bench + + - name: Run voidbox-network-bench (network wall-clock harness) + # NO `continue-on-error` here — unlike the startup-bench warm + # phase, this harness has well-defined failure modes that we + # want to surface in CI. A regression like the setuid-busybox + # bug fixed at 77dfc67 (Phase 1.6 → ECONNRESET on every + # connect for `network(true)` VMs) would otherwise hide behind + # `continue-on-error`. If this step is genuinely flaky on the + # runner image, fix the runner image — don't mask the signal. + env: + VOID_BOX_KERNEL: ${{ github.workspace }}/target/vmlinux-slim-x86_64 + VOID_BOX_INITRAMFS: /tmp/void-box-test-rootfs.cpio.gz + run: | + if [ ! -e /dev/vhost-vsock ]; then + echo "::warning::/dev/vhost-vsock not available; skipping voidbox-network-bench" + exit 0 + fi + ls -la "$VOID_BOX_KERNEL" "$VOID_BOX_INITRAMFS" + ./target/release/voidbox-network-bench --iterations 3 \ + --output target/tmp/network-bench.json 2>&1 \ + | tee target/tmp/network-bench.log + + { + echo + echo "## Network wall-clock harness (voidbox-network-bench --iterations 3)" + echo + echo "Metric names mirror passt's published table (passt.top/passt) so a" + echo "future side-by-side comparison run on the same host is plug-compatible." 
+ echo + echo '```json' + cat target/tmp/network-bench.json + echo '```' + } >> "$GITHUB_STEP_SUMMARY" + - name: Upload bench logs if: always() uses: actions/upload-artifact@v4 with: name: startup-bench-${{ github.run_id }} - path: target/tmp/*.log + path: | + target/tmp/*.log + target/tmp/*.json retention-days: 30 diff --git a/Cargo.toml b/Cargo.toml index f204f9a8..9443b736 100644 --- a/Cargo.toml +++ b/Cargo.toml @@ -120,6 +120,9 @@ divan = "0.1" default = [] # Enable full OpenTelemetry integration (OTLP export, trace context propagation) opentelemetry = ["dep:opentelemetry", "dep:opentelemetry_sdk", "dep:opentelemetry-otlp"] +# Expose internal SlirpBackend helpers (insert_synthetic_synsent_entry, etc.) +# for use in benches/. Never enable in production builds. +bench-helpers = [] [[bin]] name = "voidbox" @@ -170,11 +173,20 @@ path = "tests/oci_integration.rs" name = "observe_codex" path = "tests/observe_codex.rs" +[[test]] +name = "network_baseline" +path = "tests/network_baseline.rs" + [[bench]] name = "startup" path = "benches/startup.rs" harness = false +[[bench]] +name = "network" +path = "benches/network.rs" +harness = false + [[bin]] name = "voidbox-startup-bench" path = "src/bin/voidbox-startup-bench/main.rs" @@ -183,6 +195,10 @@ path = "src/bin/voidbox-startup-bench/main.rs" name = "voidbox-rpc-bench" path = "src/bin/voidbox-rpc-bench/main.rs" +[[bin]] +name = "voidbox-network-bench" +path = "src/bin/voidbox-network-bench/main.rs" + [workspace] members = ["guest-agent", "void-box-protocol", "claudio", "voidbox-oci", "void-message", "void-mcp"] diff --git a/benches/network.rs b/benches/network.rs new file mode 100644 index 00000000..6fd8720a --- /dev/null +++ b/benches/network.rs @@ -0,0 +1,1035 @@ +//! Divan micro-benchmarks for SLIRP hot paths. +//! +//! Mirrors `benches/startup.rs` in shape. Job: regression detection +//! for the per-packet hot path on the vCPU and net-poll threads. +//! +//! 
Run with: `cargo bench --bench network` + +#[cfg(target_os = "linux")] +use divan::{counter::BytesCount, Bencher}; +#[cfg(target_os = "linux")] +use smoltcp::wire::{ + ArpOperation, ArpPacket, ArpRepr, EthernetAddress, EthernetFrame, EthernetProtocol, + EthernetRepr, Icmpv4Packet, Icmpv4Repr, IpAddress, IpProtocol, Ipv4Packet, Ipv4Repr, + TcpControl, TcpPacket, TcpRepr, UdpPacket, UdpRepr, +}; +#[cfg(target_os = "linux")] +use void_box::network::slirp::{ + SlirpBackend, GATEWAY_MAC, GUEST_MAC, SLIRP_DNS_IP, SLIRP_GATEWAY_IP, SLIRP_GUEST_IP, +}; + +fn main() { + // SLIRP-using benches are Linux-only (smoltcp dep is `cfg(target_os = + // "linux")` in Cargo.toml). On other platforms, `divan::main()` runs + // with zero registered benches and exits 0 — that's the right shape + // for cross-platform CI which runs `cargo bench --no-run` to compile- + // check the bench binary. + #[cfg(target_os = "linux")] + divan::main(); + #[cfg(not(target_os = "linux"))] + eprintln!("benches/network.rs: SLIRP benches are Linux-only; nothing to run here"); +} + +// All bench functions and helpers below are Linux-only (depend on smoltcp +// + the SLIRP backend, which are themselves `cfg(target_os = "linux")` +// in the workspace Cargo.toml). Wrapping in a module keeps the cfg gating +// in one place; on macOS the module compiles to nothing and `main()` above +// short-circuits before any of these are referenced. 
+#[cfg(target_os = "linux")] +mod linux_benches { + use super::*; + use std::net::TcpListener; + use std::thread; + use std::time::Duration; + + fn build_syn(src_port: u16, dst_port: u16) -> Vec { + let tcp = TcpRepr { + src_port, + dst_port, + control: TcpControl::Syn, + seq_number: smoltcp::wire::TcpSeqNumber(1000), + ack_number: None, + window_len: 65535, + window_scale: None, + max_seg_size: None, + sack_permitted: false, + sack_ranges: [None, None, None], + payload: &[], + }; + let ip = Ipv4Repr { + src_addr: SLIRP_GUEST_IP, + dst_addr: SLIRP_GATEWAY_IP, + next_header: IpProtocol::Tcp, + payload_len: tcp.buffer_len(), + hop_limit: 64, + }; + let eth = EthernetRepr { + src_addr: EthernetAddress(GUEST_MAC), + dst_addr: EthernetAddress(GATEWAY_MAC), + ethertype: EthernetProtocol::Ipv4, + }; + let total = 14 + ip.buffer_len() + tcp.buffer_len(); + let mut buf = vec![0u8; total]; + let mut e = EthernetFrame::new_unchecked(&mut buf[..]); + eth.emit(&mut e); + let mut ipp = Ipv4Packet::new_unchecked(&mut buf[14..]); + ip.emit(&mut ipp, &Default::default()); + let mut tcpp = TcpPacket::new_unchecked(&mut buf[14 + ip.buffer_len()..]); + tcp.emit( + &mut tcpp, + &IpAddress::Ipv4(SLIRP_GUEST_IP), + &IpAddress::Ipv4(SLIRP_GATEWAY_IP), + &Default::default(), + ); + buf + } + + #[divan::bench] + fn process_syn(bencher: Bencher) { + let frame = build_syn(49152, 1); + bencher.bench_local(|| { + let mut stack = SlirpBackend::new().unwrap(); + let _ = stack.process_guest_frame(divan::black_box(&frame)); + }); + } + + /// Time `SlirpBackend::process_guest_frame` for a single UDP datagram. + /// + /// Mirrors `process_syn` shape: build the frame once outside the timed + /// loop, fresh stack per iteration. Establishes UDP per-frame cost + /// for cross-phase regression detection. 
+ #[divan::bench] + fn process_udp_frame(bencher: Bencher) { + let frame = build_udp_frame_for_bench(49152, 8080, b"x"); + bencher.bench_local(|| { + let mut stack = SlirpBackend::new().unwrap(); + let _ = stack.process_guest_frame(divan::black_box(&frame)); + }); + } + + /// Time `SlirpBackend::process_guest_frame` for a single ICMP echo + /// request. Note: a fresh stack means the unprivileged ICMP socket is + /// opened on every iteration, so this measures the full + /// `open_icmp_socket + insert + send_to` path. If the host's + /// `net.ipv4.ping_group_range` excludes the calling GID, the underlying + /// `socket()` call returns EACCES and `process_guest_frame` returns Ok + /// without touching `flow_table` — divan's measurement still completes + /// but `flow_table` stays empty. That's fine for regression detection. + #[divan::bench] + fn process_icmp_echo_request(bencher: Bencher) { + let frame = build_icmp_echo_for_bench(0xbeef, 1); + bencher.bench_local(|| { + let mut stack = SlirpBackend::new().unwrap(); + let _ = stack.process_guest_frame(divan::black_box(&frame)); + }); + } + + #[divan::bench] + fn poll_idle(bencher: Bencher) { + let mut stack = SlirpBackend::new().unwrap(); + let mut out: Vec> = Vec::with_capacity(8); + bencher.bench_local(|| { + out.clear(); + divan::black_box(&mut stack).drain_to_guest(&mut out); + }); + } + + #[divan::bench] + fn process_arp_request(bencher: Bencher) { + let arp_repr = ArpRepr::EthernetIpv4 { + operation: ArpOperation::Request, + source_hardware_addr: EthernetAddress(GUEST_MAC), + source_protocol_addr: SLIRP_GUEST_IP, + target_hardware_addr: EthernetAddress([0; 6]), + target_protocol_addr: SLIRP_GATEWAY_IP, + }; + let eth = EthernetRepr { + src_addr: EthernetAddress(GUEST_MAC), + dst_addr: EthernetAddress([0xff; 6]), + ethertype: EthernetProtocol::Arp, + }; + let total = 14 + arp_repr.buffer_len(); + let mut buf = vec![0u8; total]; + let mut e = EthernetFrame::new_unchecked(&mut buf[..]); + eth.emit(&mut e); + let 
mut a = ArpPacket::new_unchecked(&mut buf[14..]); + arp_repr.emit(&mut a); + + bencher.bench_local(|| { + let mut stack = SlirpBackend::new().unwrap(); + let _ = stack.process_guest_frame(divan::black_box(&buf)); + }); + } + + /// Open `n` distinct guest→gateway flows, then time `poll()`. + /// + /// Each iteration builds `n` SYN frames with unique source ports and feeds + /// them into a single [`SlirpBackend`], producing up to `n` NAT table entries. + /// `process_guest_frame` errors are ignored — the goal is "many NAT entries", + /// not "all connections succeed" (the default rate-limit may drop some). + /// + /// The timed section is a single `poll()` call on the pre-populated stack, + /// so the measurement reflects the NAT-walk cost at that table size. + /// Today the walk is `O(n)`; the unified flow table keeps the same + /// asymptotic complexity but with smaller per-entry constants. + #[divan::bench(args = [1, 100, 1000])] + fn poll_with_n_flows(bencher: Bencher, n: usize) { + let mut stack = SlirpBackend::new().unwrap(); + for i in 0..n { + let frame = build_syn(49152u16.wrapping_add(i as u16), 1); + let _ = stack.process_guest_frame(&frame); + } + let mut out: Vec> = Vec::with_capacity(8); + bencher.bench_local(|| { + out.clear(); + divan::black_box(&mut stack).drain_to_guest(&mut out); + }); + } + + /// Builds a minimal DNS A-query Ethernet frame from the guest to [`SLIRP_DNS_IP`]. + /// + /// `xid` is placed in the DNS transaction-ID field. The question section + /// queries `example.com` for an A record. The frame is a complete Ethernet → + /// IPv4 → UDP → DNS wire encoding suitable for passing to + /// [`SlirpBackend::process_guest_frame`]. 
+ fn build_dns_query_for_bench(xid: u16) -> Vec { + let mut payload = Vec::new(); + payload.extend_from_slice(&xid.to_be_bytes()); + // flags: RD=1; QDCOUNT=1; ANCOUNT/NSCOUNT/ARCOUNT = 0 + payload.extend_from_slice(&[0x01, 0x00, 0x00, 0x01, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00]); + // QNAME: \x07example\x03com\x00 + payload.extend_from_slice(b"\x07example\x03com\x00"); + // QTYPE=A (1), QCLASS=IN (1) + payload.extend_from_slice(&[0x00, 0x01, 0x00, 0x01]); + + let udp_repr = UdpRepr { + src_port: 49152, + dst_port: 53, + }; + let ip_repr = Ipv4Repr { + src_addr: SLIRP_GUEST_IP, + dst_addr: SLIRP_DNS_IP, + next_header: IpProtocol::Udp, + payload_len: 8 + payload.len(), + hop_limit: 64, + }; + let eth = EthernetRepr { + src_addr: EthernetAddress(GUEST_MAC), + dst_addr: EthernetAddress(GATEWAY_MAC), + ethertype: EthernetProtocol::Ipv4, + }; + let total = 14 + ip_repr.buffer_len() + 8 + payload.len(); + let mut buf = vec![0u8; total]; + let mut e = EthernetFrame::new_unchecked(&mut buf[..]); + eth.emit(&mut e); + let mut ip = Ipv4Packet::new_unchecked(&mut buf[14..]); + ip_repr.emit(&mut ip, &Default::default()); + let mut udp = UdpPacket::new_unchecked(&mut buf[14 + ip_repr.buffer_len()..]); + udp_repr.emit( + &mut udp, + &IpAddress::Ipv4(SLIRP_GUEST_IP), + &IpAddress::Ipv4(SLIRP_DNS_IP), + payload.len(), + |b| b.copy_from_slice(&payload), + &Default::default(), + ); + buf + } + + /// Times the stack's DNS processing path when the cache has no entry for the + /// queried name. + /// + /// Each iteration creates a fresh [`SlirpBackend`] (so the DNS cache is empty) + /// and processes one DNS query frame. The measurement captures stack + /// initialisation plus first-query cache-miss handling, giving a baseline for + /// the cold-cache cost. 
+ #[divan::bench] + fn dns_cache_miss(bencher: Bencher) { + let frame = build_dns_query_for_bench(1); + bencher.bench_local(|| { + let mut stack = SlirpBackend::new().unwrap(); + let _ = stack.process_guest_frame(divan::black_box(&frame)); + }); + } + + /// Times the stack's DNS processing path when a cache entry already exists for + /// the queried name. + /// + /// Before the timed section, one query is injected and the stack is polled + /// for up to one second to allow the upstream DNS response to populate the + /// cache. The timed section then processes a second query (different XID, + /// same name) on the warm stack, isolating the cache-hit fast path. + #[divan::bench] + fn dns_cache_hit(bencher: Bencher) { + let mut stack = SlirpBackend::new().unwrap(); + let warm = build_dns_query_for_bench(1); + let _ = stack.process_guest_frame(&warm); + let mut out: Vec> = Vec::new(); + for _ in 0..20 { + out.clear(); + stack.drain_to_guest(&mut out); + std::thread::sleep(std::time::Duration::from_millis(50)); + } + let hit = build_dns_query_for_bench(2); + bencher.bench_local(|| { + let _ = divan::black_box(&mut stack).process_guest_frame(divan::black_box(&hit)); + }); + } + + /// Pure-compute bench for `nat::translate_outbound`. Baseline for future + /// hasher / data-structure changes (e.g. moving deny_cidrs from + /// `Vec` to a longest-prefix trie). Tens of nanoseconds + /// expected; microseconds would indicate an allocation in the hot path. 
+ #[divan::bench] + fn nat_translate_outbound_hot_path(bencher: Bencher) { + use void_box::network::nat::{translate_outbound, Rules}; + + let rules = Rules { + gateway_loopback: true, + deny_cidrs: vec!["169.254.0.0/16".parse().unwrap()], + port_forwards: vec![], + }; + let dst = SLIRP_GATEWAY_IP; + let gateway = SLIRP_GATEWAY_IP; + + bencher.bench_local(|| { + divan::black_box(translate_outbound( + divan::black_box(&rules), + divan::black_box(dst), + divan::black_box(80), + divan::black_box(gateway), + )); + }); + } + + /// Measures TCP bulk throughput through the SLIRP relay under backpressure. + /// + /// Pushes 1 MiB through the relay in 1 KiB chunks with a constrained host + /// receiver (`SO_RCVBUF=4096`) so the backpressure path is exercised every + /// iteration. Divan reports throughput in MB/s alongside per-iteration + /// latency, giving a numerical regression signal for the passt-style + /// sequence-mirroring + don't-ACK-on-EAGAIN backpressure path. + /// + /// The 95% delivery threshold mirrors `tcp_writes_more_than_256kb_succeed` + /// — the binary contract test for TCP backpressure correctness. 
+ #[divan::bench(sample_count = 10)] + fn tcp_bulk_throughput_1mb(bencher: Bencher) { + use smoltcp::wire::TcpControl; + use std::io::Read; + use std::os::unix::io::AsRawFd; + use std::sync::atomic::{AtomicUsize, Ordering}; + use std::sync::Arc; + + const TOTAL_BYTES: usize = 1024 * 1024; + const CHUNK_BYTES: usize = 1024; + const WINDOW_MAX: u32 = 256 * 1024; + const DEADLINE_SECS: u64 = 5; + const GUEST_SRC_PORT: u16 = 49200; + const INITIAL_GUEST_SEQ: u32 = 1000; + + bencher + .counter(BytesCount::new(TOTAL_BYTES as u64)) + .bench_local(|| { + let listener = std::net::TcpListener::bind("127.0.0.1:0").unwrap(); + let host_port = listener.local_addr().unwrap().port(); + + unsafe { + let rcvbuf: libc::c_int = 4096; + libc::setsockopt( + listener.as_raw_fd(), + libc::SOL_SOCKET, + libc::SO_RCVBUF, + &rcvbuf as *const libc::c_int as *const libc::c_void, + std::mem::size_of::() as libc::socklen_t, + ); + } + + let bytes_received = Arc::new(AtomicUsize::new(0)); + let bytes_received_thr = Arc::clone(&bytes_received); + let server = std::thread::spawn(move || { + let (mut sock, _) = listener.accept().unwrap(); + let mut buf = [0u8; 4096]; + loop { + match sock.read(&mut buf) { + Ok(0) => break, + Ok(bytes_read) => { + bytes_received_thr.fetch_add(bytes_read, Ordering::Relaxed); + } + Err(_) => break, + } + } + }); + + let mut stack = SlirpBackend::new().unwrap(); + + let syn = build_tcp_data_frame( + SLIRP_GATEWAY_IP, + GUEST_SRC_PORT, + host_port, + INITIAL_GUEST_SEQ, + 0, + TcpControl::Syn, + &[], + ); + stack.process_guest_frame(&syn).unwrap(); + + let synack_frames: Vec> = { + let mut frames = Vec::new(); + for _ in 0..4 { + stack.drain_to_guest(&mut frames); + } + frames + }; + let (gateway_seq, _, _, _) = synack_frames + .iter() + .find_map(|frame| parse_tcp_to_guest_frame(frame)) + .expect("synack"); + + let ack_frame = build_tcp_data_frame( + SLIRP_GATEWAY_IP, + GUEST_SRC_PORT, + host_port, + INITIAL_GUEST_SEQ + 1, + gateway_seq + 1, + TcpControl::None, + &[], + 
); + stack.process_guest_frame(&ack_frame).unwrap(); + + let chunk = vec![b'x'; CHUNK_BYTES]; + let mut guest_seq = INITIAL_GUEST_SEQ + 1; + let mut acked_seq = INITIAL_GUEST_SEQ + 1; + let deadline = + std::time::Instant::now() + std::time::Duration::from_secs(DEADLINE_SECS); + + while bytes_received.load(Ordering::Relaxed) < TOTAL_BYTES * 95 / 100 + && std::time::Instant::now() < deadline + { + let data_frame = build_tcp_data_frame( + SLIRP_GATEWAY_IP, + GUEST_SRC_PORT, + host_port, + guest_seq, + gateway_seq + 1, + TcpControl::Psh, + &chunk, + ); + let _ = stack.process_guest_frame(&data_frame); + guest_seq = guest_seq.wrapping_add(CHUNK_BYTES as u32); + + let mut frames = Vec::new(); + for _ in 0..4 { + stack.drain_to_guest(&mut frames); + } + for frame in frames { + if let Some((_, ack, _, _)) = parse_tcp_to_guest_frame(&frame) { + if ack > acked_seq { + acked_seq = ack; + } + } + } + + if guest_seq.wrapping_sub(acked_seq) > WINDOW_MAX { + std::thread::sleep(std::time::Duration::from_millis(10)); + } + } + + let fin_frame = build_tcp_data_frame( + SLIRP_GATEWAY_IP, + GUEST_SRC_PORT, + host_port, + guest_seq, + gateway_seq + 1, + TcpControl::Fin, + &[], + ); + let _ = stack.process_guest_frame(&fin_frame); + let mut fin_drain: Vec> = Vec::new(); + for _ in 0..40 { + fin_drain.clear(); + stack.drain_to_guest(&mut fin_drain); + if server.is_finished() { + break; + } + std::thread::sleep(std::time::Duration::from_millis(50)); + } + let _ = server.join(); + + divan::black_box(bytes_received.load(Ordering::Relaxed)); + }); + } + + /// Builds a minimal IPv4-over-Ethernet TCP segment from guest to gateway. + /// + /// Returns the full Ethernet frame bytes. Mirrors the `build_tcp_frame` + /// helper from `tests/network_baseline.rs` inline so the bench compiles + /// as a standalone binary without a shared helper crate. 
+ fn build_tcp_data_frame( + dst_ip: smoltcp::wire::Ipv4Address, + src_port: u16, + dst_port: u16, + seq: u32, + ack: u32, + control: TcpControl, + payload: &[u8], + ) -> Vec { + use smoltcp::wire::{IpAddress, TcpSeqNumber}; + + let tcp_repr = TcpRepr { + src_port, + dst_port, + control, + seq_number: TcpSeqNumber(seq as i32), + ack_number: if ack == 0 { + None + } else { + Some(TcpSeqNumber(ack as i32)) + }, + window_len: 65535, + window_scale: None, + max_seg_size: None, + sack_permitted: false, + sack_ranges: [None, None, None], + payload, + }; + let ip_repr = Ipv4Repr { + src_addr: SLIRP_GUEST_IP, + dst_addr: dst_ip, + next_header: IpProtocol::Tcp, + payload_len: tcp_repr.buffer_len(), + hop_limit: 64, + }; + let eth_repr = EthernetRepr { + src_addr: EthernetAddress(GUEST_MAC), + dst_addr: EthernetAddress(GATEWAY_MAC), + ethertype: EthernetProtocol::Ipv4, + }; + let eth_hdr_len = 14usize; + let total = eth_hdr_len + ip_repr.buffer_len() + tcp_repr.buffer_len(); + let mut buf = vec![0u8; total]; + let mut eth = EthernetFrame::new_unchecked(&mut buf[..]); + eth_repr.emit(&mut eth); + let mut ip = Ipv4Packet::new_unchecked(&mut buf[eth_hdr_len..]); + ip_repr.emit(&mut ip, &Default::default()); + let mut tcp = TcpPacket::new_unchecked(&mut buf[eth_hdr_len + ip_repr.buffer_len()..]); + tcp_repr.emit( + &mut tcp, + &IpAddress::Ipv4(SLIRP_GUEST_IP), + &IpAddress::Ipv4(dst_ip), + &Default::default(), + ); + buf + } + + /// Parses one frame emitted by the stack as a TCP segment directed to the guest. + /// + /// Returns `(seq, ack, control, payload_len)` on success, `None` otherwise. 
+ fn parse_tcp_to_guest_frame(frame: &[u8]) -> Option<(u32, u32, TcpControl, usize)> { + let eth = EthernetFrame::new_checked(frame).ok()?; + if eth.ethertype() != EthernetProtocol::Ipv4 { + return None; + } + let ip = Ipv4Packet::new_checked(eth.payload()).ok()?; + if ip.next_header() != IpProtocol::Tcp || ip.dst_addr() != SLIRP_GUEST_IP { + return None; + } + let tcp = TcpPacket::new_checked(ip.payload()).ok()?; + let control = match (tcp.syn(), tcp.fin(), tcp.rst(), tcp.psh()) { + (false, false, false, false) => TcpControl::None, + (false, false, false, true) => TcpControl::Psh, + (true, false, false, _) => TcpControl::Syn, + (false, true, false, _) => TcpControl::Fin, + (false, false, true, _) => TcpControl::Rst, + _ => return None, + }; + Some(( + tcp.seq_number().0 as u32, + tcp.ack_number().0 as u32, + control, + tcp.payload().len(), + )) + } + fn build_udp_frame_for_bench(src_port: u16, dst_port: u16, payload: &[u8]) -> Vec { + let udp_repr = UdpRepr { src_port, dst_port }; + let ip_repr = Ipv4Repr { + src_addr: SLIRP_GUEST_IP, + dst_addr: SLIRP_GATEWAY_IP, + next_header: IpProtocol::Udp, + payload_len: 8 + payload.len(), + hop_limit: 64, + }; + let eth = EthernetRepr { + src_addr: EthernetAddress(GUEST_MAC), + dst_addr: EthernetAddress(GATEWAY_MAC), + ethertype: EthernetProtocol::Ipv4, + }; + let total = 14 + ip_repr.buffer_len() + 8 + payload.len(); + let mut buf = vec![0u8; total]; + let mut e = EthernetFrame::new_unchecked(&mut buf[..]); + eth.emit(&mut e); + let mut ip = Ipv4Packet::new_unchecked(&mut buf[14..]); + ip_repr.emit(&mut ip, &Default::default()); + let mut udp = UdpPacket::new_unchecked(&mut buf[14 + ip_repr.buffer_len()..]); + udp_repr.emit( + &mut udp, + &IpAddress::Ipv4(SLIRP_GUEST_IP), + &IpAddress::Ipv4(SLIRP_GATEWAY_IP), + payload.len(), + |b| b.copy_from_slice(payload), + &Default::default(), + ); + buf + } + + fn build_icmp_echo_for_bench(ident: u16, seq_no: u16) -> Vec { + let icmp_repr = Icmpv4Repr::EchoRequest { + ident, + 
seq_no, + data: b"bench", + }; + let ip_repr = Ipv4Repr { + src_addr: SLIRP_GUEST_IP, + dst_addr: smoltcp::wire::Ipv4Address::new(8, 8, 8, 8), + next_header: IpProtocol::Icmp, + payload_len: icmp_repr.buffer_len(), + hop_limit: 64, + }; + let eth = EthernetRepr { + src_addr: EthernetAddress(GUEST_MAC), + dst_addr: EthernetAddress(GATEWAY_MAC), + ethertype: EthernetProtocol::Ipv4, + }; + let total = 14 + ip_repr.buffer_len() + icmp_repr.buffer_len(); + let mut buf = vec![0u8; total]; + let mut e = EthernetFrame::new_unchecked(&mut buf[..]); + eth.emit(&mut e); + let mut ip = Ipv4Packet::new_unchecked(&mut buf[14..]); + ip_repr.emit(&mut ip, &Default::default()); + let mut icmp = Icmpv4Packet::new_unchecked(&mut buf[14 + ip_repr.buffer_len()..]); + icmp_repr.emit(&mut icmp, &Default::default()); + buf + } + + /// Open `n/3` TCP + `n/3` UDP + `n/3` ICMP-echo flows, then time `poll()`. + /// + /// Mirrors `poll_with_n_flows` (TCP-only) but exercises the unified + /// `flow_table` with all three protocols populated. Catches enum-dispatch + /// and filter regressions at scale: each `relay_*_data` loop filters + /// by `FlowKey` variant over the unified table, so per-protocol scan cost + /// is `O(total_flows)` not `O(this_protocol's_flows)`. This bench is the + /// regression gate for that property. + #[divan::bench(args = [3, 99, 999])] + fn poll_with_n_mixed_flows(bencher: Bencher, n: usize) { + let mut stack = SlirpBackend::new().unwrap(); + let third = n / 3; + + // n/3 TCP SYNs. + for i in 0..third { + let frame = build_syn(49152u16.wrapping_add(i as u16), 1); + let _ = stack.process_guest_frame(&frame); + } + // n/3 UDP datagrams (any non-DNS port; one byte payload). + for i in 0..third { + let frame = build_udp_frame_for_bench(50152u16.wrapping_add(i as u16), 8080, b"x"); + let _ = stack.process_guest_frame(&frame); + } + // n/3 ICMP echoes (unique guest_id per flow). 
+ for i in 0..third { + let frame = build_icmp_echo_for_bench(0x1000 + i as u16, 1); + let _ = stack.process_guest_frame(&frame); + } + + let mut out: Vec> = Vec::with_capacity(8); + bencher.bench_local(|| { + out.clear(); + divan::black_box(&mut stack).drain_to_guest(&mut out); + }); + } + + /// Insert + remove `n` flow-table entries using synthetic data. + /// + /// Pure-compute baseline for the unified `HashMap`. + /// Reference number for hasher experiments (foldhash, ahash, SipHash) + /// or container-shape changes (e.g. hashbrown raw API). Uses synthetic + /// `u32` values instead of real + /// `TcpNatEntry` (which requires TcpStream) to isolate HashMap + /// mechanics from socket cloning overhead — the real cost is + /// HashMap insert/remove, not socket ops. + /// + /// Pre-builds N unique keys with different `guest_src_port` values + /// (maintaining the same semantic as real flows), then times one + /// iteration of insert all + remove all. + #[divan::bench(args = [10, 100, 1000])] + fn flow_table_insert_remove(bencher: Bencher, n: usize) { + use std::collections::HashMap; + + // Build keys outside the timed loop. + // Each key has a unique guest_src_port to simulate distinct flows. + let keys: Vec<_> = (0..n) + .map(|i| { + smoltcp::wire::IpAddress::Ipv4(smoltcp::wire::Ipv4Address::new( + 10, + 0, + 2, + 2 + (i % 254) as u8, + )) + }) + .collect(); + + bencher.bench_local(|| { + let mut table: HashMap = HashMap::with_capacity(n); + // Insert phase + for (i, _key) in keys.iter().enumerate() { + table.insert(i, i as u32); + } + // Remove phase + for i in 0..n { + divan::black_box(table.remove(&i)); + } + }); + } + /// Build a SYN-ACK Ethernet frame from the guest toward the gateway. + /// + /// src = GUEST_IP:guest_port, dst = GATEWAY_IP:high_port + /// control = Syn, ack_number = Some(our_seq + 1) → produces SYN+ACK on wire. 
+ #[cfg(feature = "bench-helpers")] + fn build_inbound_syn_ack_frame( + guest_port: u16, + high_port: u16, + our_seq: u32, + guest_seq: u32, + ) -> Vec { + use smoltcp::wire::TcpSeqNumber; + + let tcp_repr = TcpRepr { + src_port: guest_port, + dst_port: high_port, + control: TcpControl::Syn, + seq_number: TcpSeqNumber(guest_seq as i32), + ack_number: Some(TcpSeqNumber(our_seq.wrapping_add(1) as i32)), + window_len: 65535, + window_scale: None, + max_seg_size: None, + sack_permitted: false, + sack_ranges: [None, None, None], + payload: &[], + }; + let ip_repr = Ipv4Repr { + src_addr: SLIRP_GUEST_IP, + dst_addr: SLIRP_GATEWAY_IP, + next_header: IpProtocol::Tcp, + payload_len: tcp_repr.buffer_len(), + hop_limit: 64, + }; + let eth_repr = EthernetRepr { + src_addr: EthernetAddress(GUEST_MAC), + dst_addr: EthernetAddress(GATEWAY_MAC), + ethertype: EthernetProtocol::Ipv4, + }; + let total = 14 + ip_repr.buffer_len() + tcp_repr.buffer_len(); + let mut buf = vec![0u8; total]; + let mut eth = EthernetFrame::new_unchecked(&mut buf[..]); + eth_repr.emit(&mut eth); + let mut ip = Ipv4Packet::new_unchecked(&mut buf[14..]); + ip_repr.emit(&mut ip, &Default::default()); + let mut tcp = TcpPacket::new_unchecked(&mut buf[14 + ip_repr.buffer_len()..]); + tcp_repr.emit( + &mut tcp, + &IpAddress::Ipv4(SLIRP_GUEST_IP), + &IpAddress::Ipv4(SLIRP_GATEWAY_IP), + &Default::default(), + ); + buf + } + + /// Seed a `SynSent` entry into `stack`'s flow table. + /// + /// Replicates `SlirpBackend::insert_synthetic_synsent_entry` inline. + /// Requires the `bench-helpers` feature (compile with + /// `cargo bench --features bench-helpers`). 
+ #[cfg(feature = "bench-helpers")] + fn seed_synsent_entry(stack: &mut SlirpBackend, guest_port: u16, high_port: u16, our_seq: u32) { + use std::net::{TcpListener, TcpStream}; + let listener = TcpListener::bind("127.0.0.1:0").expect("bind loopback"); + let host_stream = + TcpStream::connect(listener.local_addr().unwrap()).expect("connect loopback"); + host_stream.set_nonblocking(true).ok(); + stack.insert_synthetic_synsent_entry(guest_port, high_port, our_seq, host_stream); + } + + /// Microbench for the inbound SYN-ACK state-machine transition added in + /// 5.5b.1 (`TcpNatState::SynSent` → `Established`). Each iteration + /// (re)builds a `SlirpBackend`, seeds one `SynSent` entry, feeds a + /// synthetic guest SYN-ACK frame to `process_guest_frame`, and lets + /// the bench timer capture the `process_guest_frame` cost. + /// + /// Expected magnitude: tens of µs (same order as `process_syn`, which + /// also rebuilds a fresh stack per iteration). + #[cfg(feature = "bench-helpers")] + #[divan::bench] + fn tcp_inbound_syn_ack_transition(bencher: Bencher) { + const GUEST_PORT: u16 = 8080; + const HIGH_PORT: u16 = 49152; + const OUR_SEQ: u32 = 1000; + const GUEST_SEQ: u32 = 42; + + let frame = build_inbound_syn_ack_frame(GUEST_PORT, HIGH_PORT, OUR_SEQ, GUEST_SEQ); + + bencher.bench_local(|| { + let mut stack = SlirpBackend::new().unwrap(); + seed_synsent_entry(&mut stack, GUEST_PORT, HIGH_PORT, OUR_SEQ); + let _ = divan::black_box(&mut stack).process_guest_frame(divan::black_box(&frame)); + }); + } + + /// Pure-compute cost of synthesizing an inbound SYN frame for + /// port-forwarding. No stack allocation or guest frame processing — + /// just the `build_tcp_packet_static` wire encoding. + /// + /// Expected magnitude: sub-microsecond (pure packet construction). + /// + /// Requires the `bench-helpers` feature (compile with + /// `cargo bench --features bench-helpers`). 
+ #[cfg(feature = "bench-helpers")]
+ #[divan::bench]
+ fn synthesize_inbound_syn(bencher: Bencher) {
+ const HIGH_PORT: u16 = 49152;
+ const GUEST_PORT: u16 = 8080;
+ const OUR_SEQ: u32 = 1000;
+
+ bencher.bench_local(|| {
+ divan::black_box(void_box::network::slirp::synthesize_inbound_syn(
+ divan::black_box(HIGH_PORT),
+ divan::black_box(GUEST_PORT),
+ divan::black_box(OUR_SEQ),
+ ));
+ });
+ }
+
+ /// Returns `true` if `frame` is an Ethernet/IPv4/TCP packet with the SYN
+ /// flag set, addressed to `dst_port`.
+ ///
+ /// The synthesized inbound SYN produced by `synthesize_inbound_syn` uses
+ /// `TcpControl::Syn` but smoltcp sets the ACK bit whenever `ack_number`
+ /// is `Some(...)`, even when the value is zero. Checking only `tcp.syn()`
+ /// + `dst_port` is therefore correct here.
+ fn is_tcp_syn_to_port(frame: &[u8], dst_port: u16) -> bool {
+ // Minimum: 14 (Eth) + 20 (IPv4) + 20 (TCP) = 54 bytes.
+ if frame.len() < 54 {
+ return false;
+ }
+ let eth = EthernetFrame::new_unchecked(frame);
+ if eth.ethertype() != EthernetProtocol::Ipv4 {
+ return false;
+ }
+ let ip = Ipv4Packet::new_unchecked(eth.payload());
+ if ip.next_header() != IpProtocol::Tcp {
+ return false;
+ }
+ let ip_header_len = ip.header_len() as usize;
+ let tcp = TcpPacket::new_unchecked(&eth.payload()[ip_header_len..]);
+ tcp.syn() && tcp.dst_port() == dst_port
+ }
+
+ /// Wall-clock latency of the full inbound port-forward path: host
+ /// `TcpStream::connect` → epoll readiness event → `process_listener_readiness`
+ /// accept → mpsc channel push → `process_pending_inbound_accepts` →
+ /// `synthesize_inbound_syn` → first SYN frame visible in `drain_to_guest`
+ /// output.
+ ///
+ /// The listener FD is registered with `EpollDispatch`; accept latency is
+ /// bounded by the epoll_wait cadence (≤ 5 ms active), not a fixed poll
+ /// interval. Sub-millisecond medians are expected. Regressions in the
+ /// inbound state machine will surface numerically against this measurement.
+ #[divan::bench(sample_count = 20, sample_size = 1)]
+ fn port_forward_accept_latency(bencher: Bencher) {
+ const GUEST_PORT: u16 = 8080;
+ const CONNECT_TIMEOUT: Duration = Duration::from_secs(2);
+ const DRAIN_POLL: Duration = Duration::from_micros(100);
+
+ // Probe-bind to grab an ephemeral host port, then release the listener
+ // so SlirpBackend can bind it. There is an inherent TOCTOU race
+ // between the drop and the SlirpBackend bind — acceptable for benches
+ // running on a loopback interface under controlled conditions.
+ let probe = TcpListener::bind("127.0.0.1:0").expect("probe bind for host port");
+ let host_port = probe.local_addr().expect("probe local_addr").port();
+ drop(probe);
+
+ let mut stack = SlirpBackend::with_security(
+ 64,
+ 50,
+ &["169.254.0.0/16".to_string()],
+ &[(host_port, GUEST_PORT)],
+ )
+ .expect("SlirpBackend::with_security");
+
+ let mut out: Vec<Vec<u8>> = Vec::new();
+
+ bencher.bench_local(|| {
+ // Spawn a worker thread that connects to the host listener port.
+ // EpollDispatch fires readiness; process_listener_readiness accepts
+ // and pushes the stream onto the mpsc channel.
+ let connect_addr = format!("127.0.0.1:{host_port}");
+ let worker = thread::spawn(move || {
+ let addr: std::net::SocketAddr = connect_addr.parse().expect("parse connect addr");
+ std::net::TcpStream::connect_timeout(&addr, CONNECT_TIMEOUT)
+ .expect("connect to listener");
+ });
+
+ // Poll drain_to_guest until a SYN frame appears in the output.
+ loop {
+ out.clear();
+ stack.drain_to_guest(&mut out);
+ if out
+ .iter()
+ .any(|frame| is_tcp_syn_to_port(frame, GUEST_PORT))
+ {
+ break;
+ }
+ thread::sleep(DRAIN_POLL);
+ }
+
+ worker.join().expect("worker thread panicked");
+ });
+ }
+
+ /// Cost of one `drain_to_guest` call when one TCP flow is `Established`
+ /// and the host kernel has data ready to relay.
+ /// + /// Captures the per-packet SLIRP dispatch overhead via epoll: epoll_wait + /// (non-blocking, zero-timeout), readiness scan, peek, and Ethernet frame + /// construction. Only the flows with data ready are dispatched — flows + /// with nothing to relay are skipped. + /// + /// This bench cannot exercise the `net_poll_thread` 50 ms epoll cycle + /// (that thread does not run inside divan). The wall-clock latency floor + /// is captured separately by `voidbox-network-bench`'s `tcp_rx_latency_us_p50` + /// field; see that binary's `Report` struct for the measurement shape. + /// + /// Requires the `bench-helpers` feature (compile with + /// `cargo bench --features bench-helpers`). + #[cfg(feature = "bench-helpers")] + #[divan::bench(sample_count = 50, sample_size = 10)] + fn tcp_rx_latency_one_packet(bencher: Bencher) { + use smoltcp::wire::TcpControl; + use std::io::Write; + use std::net::TcpListener; + + const GUEST_SRC_PORT: u16 = 49155; + const INITIAL_GUEST_SEQ: u32 = 5000; + const PAYLOAD: &[u8] = &[0xAB; 64]; + + // Build a fresh stack with one Established TCP flow. Setup happens + // outside the timed loop so divan only measures the relay dispatch. + let listener = TcpListener::bind("127.0.0.1:0").unwrap(); + let host_port = listener.local_addr().unwrap().port(); + let server_thread = thread::spawn(move || listener.accept().unwrap()); + + let mut stack = SlirpBackend::new().unwrap(); + + // 3-way handshake: guest sends SYN → stack produces SYN-ACK → guest + // sends ACK. This mirrors `tcp_bulk_throughput_1mb` setup. + let syn = build_tcp_syn_for_latency_bench(GUEST_SRC_PORT, host_port, INITIAL_GUEST_SEQ); + stack.process_guest_frame(&syn).unwrap(); + + // Drain for up to 200 ms to collect the SYN-ACK. 
+ let mut drain_frames: Vec<Vec<u8>> = Vec::new();
+ let gateway_seq = {
+ let deadline = std::time::Instant::now() + Duration::from_millis(200);
+ loop {
+ drain_frames.clear();
+ stack.drain_to_guest(&mut drain_frames);
+ if let Some((seq, _, _, _)) = drain_frames
+ .iter()
+ .find_map(|f| parse_tcp_to_guest_frame(f))
+ {
+ break seq;
+ }
+ if std::time::Instant::now() > deadline {
+ panic!("no SYN-ACK within deadline");
+ }
+ thread::sleep(Duration::from_millis(5));
+ }
+ };
+
+ // Complete the handshake: guest sends ACK.
+ let ack = build_tcp_data_frame(
+ SLIRP_GATEWAY_IP,
+ GUEST_SRC_PORT,
+ host_port,
+ INITIAL_GUEST_SEQ + 1,
+ gateway_seq + 1,
+ TcpControl::None,
+ &[],
+ );
+ stack.process_guest_frame(&ack).unwrap();
+
+ // The server thread accepted the connection; grab the socket.
+ let (mut server_sock, _) = server_thread.join().unwrap();
+ server_sock
+ .set_nonblocking(true)
+ .expect("server non-blocking");
+
+ // Set up state for the timed loop.
+ let mut out: Vec<Vec<u8>> = Vec::with_capacity(8);
+ let guest_seq = INITIAL_GUEST_SEQ + 1;
+
+ // Prime: put one payload in the kernel buffer before the first
+ // iteration begins so the first measured call sees a ready event.
+ let _ = server_sock.write(PAYLOAD);
+
+ bencher.bench_local(|| {
+ out.clear();
+ // Refill the kernel buffer from the previous iteration's drain.
+ // write() may return EAGAIN if the buffer is full; that is fine —
+ // the previous iteration's peek left data in place.
+ let _ = server_sock.write(divan::black_box(PAYLOAD));
+
+ // The cost we are measuring: one non-blocking epoll_wait + relay.
+ divan::black_box(&mut stack).drain_to_guest(&mut out);
+
+ // Consume the relay output so inject_to_guest doesn't grow
+ // unboundedly across iterations.
+ divan::black_box(&out);
+
+ // Keep the TCP stream happy: send an ACK for any data the relay
+ // fed into inject_to_guest (frame content doesn't matter for the
+ // bench; we just need the host stream not to stall).
+ for frame in &out {
+ if let Some((data_seq, _, _, plen)) = parse_tcp_to_guest_frame(frame) {
+ if plen > 0 {
+ let ack_back = build_tcp_data_frame(
+ SLIRP_GATEWAY_IP,
+ GUEST_SRC_PORT,
+ host_port,
+ guest_seq,
+ data_seq.wrapping_add(plen as u32),
+ TcpControl::None,
+ &[],
+ );
+ let _ = stack.process_guest_frame(&ack_back);
+ }
+ }
+ }
+ });
+ }
+
+ /// Build a SYN frame from the guest toward the host for the latency bench.
+ ///
+ /// Identical to `build_tcp_data_frame` with `TcpControl::Syn` and zero
+ /// `ack`. Kept as a separate function to document intent: this is the
+ /// opening segment of the 3-way handshake used by
+ /// `tcp_rx_latency_one_packet`.
+ #[cfg(feature = "bench-helpers")]
+ fn build_tcp_syn_for_latency_bench(src_port: u16, dst_port: u16, seq: u32) -> Vec<u8> {
+ build_tcp_data_frame(
+ SLIRP_GATEWAY_IP,
+ src_port,
+ dst_port,
+ seq,
+ 0,
+ smoltcp::wire::TcpControl::Syn,
+ &[],
+ )
+ }
+} // mod linux_benches
diff --git a/docs/superpowers/plans/2026-04-27-smoltcp-passt-port-phase0.md b/docs/superpowers/plans/2026-04-27-smoltcp-passt-port-phase0.md
new file mode 100644
index 00000000..a9106870
--- /dev/null
+++ b/docs/superpowers/plans/2026-04-27-smoltcp-passt-port-phase0.md
@@ -0,0 +1,2027 @@
+# Phase 0 Implementation Plan: Baseline + Trait Extraction
+
+> **For agentic workers:** REQUIRED SUB-SKILL: Use superpowers:subagent-driven-development (recommended) or superpowers:executing-plans to implement this plan task-by-task. Steps use checkbox (`- [ ]`) syntax for tracking.
+>
+> **Mandatory skills for every Rust-touching task** (from the spec):
+> `rust-style`, `rustdoc`, `rust-analyzer-ssr`,
+> `superpowers:test-driven-development`,
+> `superpowers:verification-before-completion`. Do not skip them.
+> Use LSP (`goToDefinition`, `findReferences`, `documentSymbol`,
+> `workspaceSymbol`) for Rust navigation; never grep/glob Rust source
+> when LSP can answer.
+ +**Spec:** [`2026-04-27-smoltcp-passt-port.md`](2026-04-27-smoltcp-passt-port.md) + +**Goal:** Land the test/bench baseline, the `NetworkBackend` trait +abstraction, and the `SlirpStack → SlirpBackend` rename, with zero +user-visible behavior change. + +**Naming rationale:** The new name is role-based, not +implementation-based. "Slirp" denotes the user-mode-NAT networking +role (same role libslirp / passt / pasta fill); "smoltcp" is just the +library we use to build it. Future siblings — `TapBackend`, +`VhostNetBackend` — follow the same role-based convention. Renaming +to `SmoltcpBackend` would leak the implementation library into the +public type name and lose this symmetry. + +**Architecture:** Three additive workstreams (correctness pins, divan +microbenches, wall-clock e2e harness) followed by a mechanical +trait-extraction refactor. Three "broken on purpose" assertions are +introduced in 0A and stay green — they flip in Phases 1, 2, 3 +respectively. + +**Tech Stack:** Rust 1.88, `smoltcp` 0.11 (wire types only), `divan` +0.1, `tokio` (existing), `std::net::TcpListener` for the e2e harness +host endpoint, `iperf3`/`netperf` invoked from inside the VM for +throughput numbers. + +--- + +## Task structure + +The phase has five workstreams (A–E) totaling **25 tasks**. A, B, C are +**independent and can be executed in parallel**. D depends on A +(baseline tests must exist before refactor). E is the final gate. + +``` +0A correctness baseline ──┐ +0B divan microbenches ────┼──→ 0D trait extraction ──→ 0E validation + PR +0C wall-clock harness ────┘ +``` + +--- + +## Workstream 0A — Correctness baseline (`tests/network_baseline.rs`) + +All Layer-1 unit-level pins. Linux-only because `SlirpStack` is +`#[cfg(target_os = "linux")]`. 
+ +### Task 0A.1: Test file scaffolding + frame builder helpers + +**Files:** +- Create: `tests/network_baseline.rs` +- Modify: `Cargo.toml` (register `[[test]] name = "network_baseline"`) + +- [ ] **Step 1: Create the test file with helpers.** + +```rust +//! Layer-1 correctness pins for the smoltcp-based SLIRP stack. +//! +//! These tests drive `SlirpStack` directly with synthetic Ethernet +//! frames — no VM, no kernel, no host sockets to outside hosts. The +//! goal is to lock observable behavior (including deliberately broken +//! behavior) so the passt-pattern refactor's diff is legible to +//! reviewers. +//! +//! Three tests assert *broken* behavior on purpose. Each is marked +//! `BROKEN_ON_PURPOSE` and flips in the phase that fixes it: +//! +//! - `tcp_to_host_buffer_drops_at_256kb` — flips in Phase 3 +//! - `udp_non_dns_silently_dropped` — flips in Phase 2 +//! - `icmp_echo_silently_dropped` — flips in Phase 1 +//! +//! Run with: `cargo test --test network_baseline` + +#![cfg(target_os = "linux")] + +use smoltcp::wire::{ + ArpOperation, ArpPacket, ArpRepr, EthernetAddress, EthernetFrame, EthernetProtocol, + EthernetRepr, IpProtocol, Ipv4Address, Ipv4Packet, Ipv4Repr, TcpControl, TcpPacket, TcpRepr, + UdpPacket, UdpRepr, +}; +use std::net::{TcpListener, UdpSocket}; +use void_box::network::slirp::{ + SlirpStack, GATEWAY_MAC, GUEST_MAC, SLIRP_GATEWAY_IP, SLIRP_GUEST_IP, +}; + +const GUEST_EPHEMERAL_PORT: u16 = 49152; +const ETH_HDR_LEN: usize = 14; +const IPV4_MIN_HDR_LEN: usize = 20; +const TCP_MIN_HDR_LEN: usize = 20; +const UDP_HDR_LEN: usize = 8; + +/// Build a minimal IPv4-over-Ethernet TCP segment from guest to a +/// pretend external IP. Returns the full Ethernet frame bytes. 
+fn build_tcp_frame(
+ dst_ip: Ipv4Address,
+ src_port: u16,
+ dst_port: u16,
+ seq: u32,
+ ack: u32,
+ control: TcpControl,
+ payload: &[u8],
+) -> Vec<u8> {
+ let tcp_repr = TcpRepr {
+ src_port,
+ dst_port,
+ control,
+ seq_number: smoltcp::wire::TcpSeqNumber(seq as i32),
+ ack_number: if ack == 0 {
+ None
+ } else {
+ Some(smoltcp::wire::TcpSeqNumber(ack as i32))
+ },
+ window_len: 65535,
+ window_scale: None,
+ max_seg_size: None,
+ sack_permitted: false,
+ sack_ranges: [None, None, None],
+ timestamp: None,
+ payload,
+ };
+ let ip_repr = Ipv4Repr {
+ src_addr: SLIRP_GUEST_IP,
+ dst_addr: dst_ip,
+ next_header: IpProtocol::Tcp,
+ payload_len: tcp_repr.buffer_len(),
+ hop_limit: 64,
+ };
+ let eth_repr = EthernetRepr {
+ src_addr: EthernetAddress(GUEST_MAC),
+ dst_addr: EthernetAddress(GATEWAY_MAC),
+ ethertype: EthernetProtocol::Ipv4,
+ };
+ let total = ETH_HDR_LEN + ip_repr.buffer_len() + tcp_repr.buffer_len();
+ let mut buf = vec![0u8; total];
+ let mut eth = EthernetFrame::new_unchecked(&mut buf[..]);
+ eth_repr.emit(&mut eth);
+ let mut ip = Ipv4Packet::new_unchecked(&mut buf[ETH_HDR_LEN..]);
+ ip_repr.emit(&mut ip, &Default::default());
+ let mut tcp = TcpPacket::new_unchecked(
+ &mut buf[ETH_HDR_LEN + ip_repr.buffer_len()..],
+ );
+ tcp_repr.emit(
+ &mut tcp,
+ &smoltcp::wire::IpAddress::Ipv4(SLIRP_GUEST_IP),
+ &smoltcp::wire::IpAddress::Ipv4(dst_ip),
+ &Default::default(),
+ );
+ buf
+}
+
+/// Build a UDP-over-Ethernet datagram from guest.
+fn build_udp_frame(dst_ip: Ipv4Address, src_port: u16, dst_port: u16, payload: &[u8]) -> Vec<u8> {
+ let udp_repr = UdpRepr { src_port, dst_port };
+ let ip_repr = Ipv4Repr {
+ src_addr: SLIRP_GUEST_IP,
+ dst_addr: dst_ip,
+ next_header: IpProtocol::Udp,
+ payload_len: UDP_HDR_LEN + payload.len(),
+ hop_limit: 64,
+ };
+ let eth_repr = EthernetRepr {
+ src_addr: EthernetAddress(GUEST_MAC),
+ dst_addr: EthernetAddress(GATEWAY_MAC),
+ ethertype: EthernetProtocol::Ipv4,
+ };
+ let total = ETH_HDR_LEN + ip_repr.buffer_len() + UDP_HDR_LEN + payload.len();
+ let mut buf = vec![0u8; total];
+ let mut eth = EthernetFrame::new_unchecked(&mut buf[..]);
+ eth_repr.emit(&mut eth);
+ let mut ip = Ipv4Packet::new_unchecked(&mut buf[ETH_HDR_LEN..]);
+ ip_repr.emit(&mut ip, &Default::default());
+ let mut udp = UdpPacket::new_unchecked(
+ &mut buf[ETH_HDR_LEN + ip_repr.buffer_len()..],
+ );
+ udp_repr.emit(
+ &mut udp,
+ &smoltcp::wire::IpAddress::Ipv4(SLIRP_GUEST_IP),
+ &smoltcp::wire::IpAddress::Ipv4(dst_ip),
+ UDP_HDR_LEN + payload.len(),
+ |b| b.copy_from_slice(payload),
+ &Default::default(),
+ );
+ buf
+}
+
+/// Parse one emitted frame as a TCP segment if it matches; return
+/// `(seq, ack, control, payload_len)` for the matching direction.
+fn parse_tcp_to_guest(frame: &[u8]) -> Option<(u32, u32, TcpControl, usize)> {
+ let eth = EthernetFrame::new_checked(frame).ok()?;
+ if eth.ethertype() != EthernetProtocol::Ipv4 {
+ return None;
+ }
+ let ip = Ipv4Packet::new_checked(eth.payload()).ok()?;
+ if ip.next_header() != IpProtocol::Tcp || ip.dst_addr() != SLIRP_GUEST_IP {
+ return None;
+ }
+ let tcp = TcpPacket::new_checked(ip.payload()).ok()?;
+ Some((
+ tcp.seq_number().0 as u32,
+ tcp.ack_number().0 as u32,
+ tcp.control(),
+ tcp.payload().len(),
+ ))
+}
+
+/// Drain frames the stack wants to send to the guest, calling `poll`
+/// up to `n` times.
+fn drain_n(stack: &mut SlirpStack, n: usize) -> Vec<Vec<u8>> {
+ let mut out = Vec::new();
+ for _ in 0..n {
+ out.extend(stack.poll());
+ }
+ out
+}
+```
+
+- [ ] **Step 2: Register the test in `Cargo.toml`.**
+
+```toml
+[[test]]
+name = "network_baseline"
+path = "tests/network_baseline.rs"
+```
+
+- [ ] **Step 3: Verify it compiles with no tests yet.**
+
+```bash
+cargo test --test network_baseline --no-run
+```
+
+Expected: builds clean, "0 tests" reported.
+
+- [ ] **Step 4: Commit.**
+
+```bash
+git add tests/network_baseline.rs Cargo.toml
+git commit -m "test(network): scaffold network_baseline pins with frame helpers"
+```
+
+---
+
+### Task 0A.2: Pin TCP handshake (SYN → SYN-ACK)
+
+**Files:**
+- Modify: `tests/network_baseline.rs`
+
+- [ ] **Step 1: Write the test using a host listener.**
+
+Append to `tests/network_baseline.rs`:
+
+```rust
+#[test]
+fn tcp_handshake_emits_synack() {
+ // Bind a host listener on 127.0.0.1 so the stack's connect()
+ // succeeds. SLIRP rewrites 10.0.2.2 → 127.0.0.1.
+ let listener = TcpListener::bind("127.0.0.1:0").unwrap();
+ let host_port = listener.local_addr().unwrap().port();
+
+ let mut stack = SlirpStack::new().expect("stack");
+
+ // Guest sends SYN to gateway IP at the listener's port.
+ let syn = build_tcp_frame(
+ SLIRP_GATEWAY_IP,
+ GUEST_EPHEMERAL_PORT,
+ host_port,
+ 1000,
+ 0,
+ TcpControl::Syn,
+ &[],
+ );
+ stack.process_guest_frame(&syn).expect("process syn");
+
+ // Drain — SYN-ACK should be queued.
+ let frames = drain_n(&mut stack, 4);
+ let synack = frames
+ .iter()
+ .find_map(|f| parse_tcp_to_guest(f))
+ .expect("synack emitted");
+
+ let (_seq, ack, ctrl, _len) = synack;
+ assert_eq!(ctrl, TcpControl::Syn, "control flags include SYN+ACK");
+ assert_eq!(ack, 1001, "ack = guest_seq + 1");
+}
+```
+
+- [ ] **Step 2: Run.**
+
+```bash
+cargo test --test network_baseline tcp_handshake_emits_synack
+```
+
+Expected: PASS.
(Note: `TcpControl::Syn` in smoltcp's repr also covers +SYN+ACK when ack number is set; assertion above is loose by +construction — sharpen if smoltcp distinguishes.) + +- [ ] **Step 3: If the assertion is wrong** (e.g. smoltcp reports + `TcpControl::None` with the ACK flag in a separate field), open + `src/network/slirp.rs` `build_tcp_packet_static` (around line 1102) + via LSP `goToDefinition` and read what it actually emits. Update the + assertion to match observed behavior. **Do not modify production + code** — this test pins what we have today. + +- [ ] **Step 4: Commit once green.** + +```bash +git add tests/network_baseline.rs +git commit -m "test(network): pin TCP handshake SYN-ACK emission" +``` + +--- + +### Task 0A.3: Pin TCP data echo (guest send → host receive → host send → guest receive) + +**Files:** +- Modify: `tests/network_baseline.rs` + +- [ ] **Step 1: Write the round-trip test.** + +```rust +#[test] +fn tcp_data_round_trip() { + use std::io::{Read, Write}; + let listener = TcpListener::bind("127.0.0.1:0").unwrap(); + let host_port = listener.local_addr().unwrap().port(); + + // Spawn a thread that accepts and echoes one chunk. + let server = std::thread::spawn(move || { + let (mut sock, _) = listener.accept().unwrap(); + let mut buf = [0u8; 16]; + let n = sock.read(&mut buf).unwrap(); + sock.write_all(&buf[..n]).unwrap(); + }); + + let mut stack = SlirpStack::new().expect("stack"); + + // SYN + stack + .process_guest_frame(&build_tcp_frame( + SLIRP_GATEWAY_IP, + GUEST_EPHEMERAL_PORT, + host_port, + 1000, + 0, + TcpControl::Syn, + &[], + )) + .unwrap(); + + // Drain SYN-ACK; capture our_seq. + let synack_frames = drain_n(&mut stack, 4); + let (our_seq, _ack, _ctrl, _len) = synack_frames + .iter() + .find_map(|f| parse_tcp_to_guest(f)) + .expect("synack"); + + // ACK the SYN-ACK (completes handshake). 
+ stack + .process_guest_frame(&build_tcp_frame( + SLIRP_GATEWAY_IP, + GUEST_EPHEMERAL_PORT, + host_port, + 1001, + our_seq + 1, + TcpControl::None, + &[], + )) + .unwrap(); + + // Send 5 bytes of data. + stack + .process_guest_frame(&build_tcp_frame( + SLIRP_GATEWAY_IP, + GUEST_EPHEMERAL_PORT, + host_port, + 1001, + our_seq + 1, + TcpControl::Psh, + b"hello", + )) + .unwrap(); + + // Wait for server to echo and stack to relay back. + server.join().unwrap(); + let mut total_payload = 0; + for _ in 0..40 { + let frames = drain_n(&mut stack, 1); + for f in frames.iter() { + if let Some((_, _, _, len)) = parse_tcp_to_guest(f) { + total_payload += len; + } + } + if total_payload >= 5 { + break; + } + std::thread::sleep(std::time::Duration::from_millis(10)); + } + assert!( + total_payload >= 5, + "expected at least 5 bytes echoed back to guest, got {total_payload}" + ); +} +``` + +- [ ] **Step 2: Run.** `cargo test --test network_baseline tcp_data_round_trip` + +- [ ] **Step 3: Commit.** + +```bash +git add tests/network_baseline.rs +git commit -m "test(network): pin TCP guest↔host data round-trip" +``` + +--- + +### Task 0A.4: Pin "broken on purpose" — TCP `to_host` 256 KB cliff + +**Files:** +- Modify: `tests/network_baseline.rs` + +- [ ] **Step 1: Write the test that demonstrates the cliff.** + +```rust +/// BROKEN_ON_PURPOSE — flips in Phase 3. +/// +/// Today: when guest writes >256 KB to host before host reads, +/// `to_host` buffer overflows and the connection is closed +/// (`slirp.rs:903–910`). +/// +/// After Phase 3 (MSG_PEEK + sequence mirroring): the host kernel's +/// socket buffer absorbs the write; no userspace cap, no drop. +#[test] +fn tcp_to_host_buffer_drops_at_256kb() { + let listener = TcpListener::bind("127.0.0.1:0").unwrap(); + let host_port = listener.local_addr().unwrap().port(); + + // Server that accepts but never reads — forces guest writes to + // accumulate in our `to_host` buffer. 
+ let _server = std::thread::spawn(move || { + let (sock, _) = listener.accept().unwrap(); + std::thread::sleep(std::time::Duration::from_secs(2)); + drop(sock); + }); + + let mut stack = SlirpStack::new().expect("stack"); + + // Handshake. + stack + .process_guest_frame(&build_tcp_frame( + SLIRP_GATEWAY_IP, + GUEST_EPHEMERAL_PORT, + host_port, + 1000, + 0, + TcpControl::Syn, + &[], + )) + .unwrap(); + let synack = drain_n(&mut stack, 4) + .into_iter() + .find_map(|f| parse_tcp_to_guest(&f)) + .expect("synack"); + let (our_seq, _, _, _) = synack; + stack + .process_guest_frame(&build_tcp_frame( + SLIRP_GATEWAY_IP, + GUEST_EPHEMERAL_PORT, + host_port, + 1001, + our_seq + 1, + TcpControl::None, + &[], + )) + .unwrap(); + + // Push ~300 KB in 1 KB segments. Today, somewhere past 256 KB the + // stack closes the connection (RST or FIN to guest). + let mut seq = 1001u32; + let chunk = vec![b'x'; 1024]; + let mut saw_close = false; + for _ in 0..300 { + let _ = stack.process_guest_frame(&build_tcp_frame( + SLIRP_GATEWAY_IP, + GUEST_EPHEMERAL_PORT, + host_port, + seq, + our_seq + 1, + TcpControl::Psh, + &chunk, + )); + seq = seq.wrapping_add(1024); + for f in drain_n(&mut stack, 1) { + if let Some((_, _, ctrl, _)) = parse_tcp_to_guest(&f) { + if matches!(ctrl, TcpControl::Rst | TcpControl::Fin) { + saw_close = true; + } + } + } + if saw_close { + break; + } + } + assert!( + saw_close, + "BROKEN_ON_PURPOSE: today the 256 KB to_host cliff closes the \ + connection. If this assertion fails, Phase 3 may have already \ + landed — flip the assertion to `assert!(!saw_close)`." + ); +} +``` + +- [ ] **Step 2: Run.** `cargo test --test network_baseline tcp_to_host_buffer_drops_at_256kb` + +- [ ] **Step 3: If it doesn't capture the cliff** (e.g. test passes + 300 chunks without close), instrument with `tracing` at `WARN`, + re-run, and adjust chunk size / count. The cliff is real — the test + must capture it. 
+ +- [ ] **Step 4: Commit.** + +```bash +git add tests/network_baseline.rs +git commit -m "test(network): BROKEN_ON_PURPOSE pin — 256 KB to_host cliff" +``` + +--- + +### Task 0A.5: Pin TCP rate limit, max concurrent, deny list + +**Files:** +- Modify: `tests/network_baseline.rs` + +- [ ] **Step 1: Write three clustered tests.** + +```rust +#[test] +fn tcp_rate_limit_emits_rst() { + // 5 conn/s allowance; 10 attempts. + let mut stack = SlirpStack::with_security(64, 5, vec![]).unwrap(); + let listener = TcpListener::bind("127.0.0.1:0").unwrap(); + let host_port = listener.local_addr().unwrap().port(); + + let mut rsts = 0; + for i in 0..10 { + stack + .process_guest_frame(&build_tcp_frame( + SLIRP_GATEWAY_IP, + GUEST_EPHEMERAL_PORT + i as u16, + host_port, + 1000, + 0, + TcpControl::Syn, + &[], + )) + .unwrap(); + for f in drain_n(&mut stack, 2) { + if let Some((_, _, ctrl, _)) = parse_tcp_to_guest(&f) { + if ctrl == TcpControl::Rst { + rsts += 1; + } + } + } + } + assert!( + rsts >= 4, + "expected ≥4 RSTs from rate limit, saw {rsts}" + ); + drop(listener); +} + +#[test] +fn tcp_max_concurrent_emits_rst() { + let mut stack = SlirpStack::with_security(2, 1000, vec![]).unwrap(); + let listener = TcpListener::bind("127.0.0.1:0").unwrap(); + let host_port = listener.local_addr().unwrap().port(); + + // Open 4 distinct connections; cap is 2. 
+ let mut rsts = 0;
+ for i in 0..4 {
+ stack
+ .process_guest_frame(&build_tcp_frame(
+ SLIRP_GATEWAY_IP,
+ GUEST_EPHEMERAL_PORT + i,
+ host_port,
+ 1000,
+ 0,
+ TcpControl::Syn,
+ &[],
+ ))
+ .unwrap();
+ for f in drain_n(&mut stack, 2) {
+ if let Some((_, _, ctrl, _)) = parse_tcp_to_guest(&f) {
+ if ctrl == TcpControl::Rst {
+ rsts += 1;
+ }
+ }
+ }
+ }
+ assert!(rsts >= 1, "expected RST after concurrent limit, saw {rsts}");
+ drop(listener);
+}
+
+#[test]
+fn tcp_deny_list_emits_rst() {
+ use ipnet::Ipv4Net;
+ let deny: Vec<Ipv4Net> = vec!["169.254.169.254/32".parse().unwrap()];
+ let mut stack = SlirpStack::with_security(64, 1000, deny).unwrap();
+
+ stack
+ .process_guest_frame(&build_tcp_frame(
+ Ipv4Address::new(169, 254, 169, 254),
+ GUEST_EPHEMERAL_PORT,
+ 80,
+ 1000,
+ 0,
+ TcpControl::Syn,
+ &[],
+ ))
+ .unwrap();
+ let rst = drain_n(&mut stack, 2)
+ .into_iter()
+ .find_map(|f| parse_tcp_to_guest(&f))
+ .map(|(_, _, ctrl, _)| ctrl == TcpControl::Rst);
+ assert_eq!(rst, Some(true), "deny-list IP must get RST");
+}
+```
+
+- [ ] **Step 2: Run all three.**
+
+```bash
+cargo test --test network_baseline tcp_rate_limit_emits_rst tcp_max_concurrent_emits_rst tcp_deny_list_emits_rst
+```
+
+- [ ] **Step 3: Commit.**
+
+```bash
+git add tests/network_baseline.rs
+git commit -m "test(network): pin TCP rate limit, concurrent cap, deny list"
+```
+
+---
+
+### Task 0A.6: Pin ARP behavior
+
+**Files:**
+- Modify: `tests/network_baseline.rs`
+
+- [ ] **Step 1: Add ARP frame builder and three tests.**
+
+```rust
+fn build_arp_request(target_ip: Ipv4Address) -> Vec<u8> {
+ let arp_repr = ArpRepr::EthernetIpv4 {
+ operation: ArpOperation::Request,
+ source_hardware_addr: EthernetAddress(GUEST_MAC),
+ source_protocol_addr: SLIRP_GUEST_IP,
+ target_hardware_addr: EthernetAddress([0; 6]),
+ target_protocol_addr: target_ip,
+ };
+ let eth_repr = EthernetRepr {
+ src_addr: EthernetAddress(GUEST_MAC),
+ dst_addr: EthernetAddress([0xff; 6]),
+ ethertype: EthernetProtocol::Arp,
+ };
+ let 
total = ETH_HDR_LEN + arp_repr.buffer_len(); + let mut buf = vec![0u8; total]; + let mut eth = EthernetFrame::new_unchecked(&mut buf[..]); + eth_repr.emit(&mut eth); + let mut arp = ArpPacket::new_unchecked(&mut buf[ETH_HDR_LEN..]); + arp_repr.emit(&mut arp); + buf +} + +fn parse_arp_reply(frame: &[u8]) -> Option<(EthernetAddress, Ipv4Address)> { + let eth = EthernetFrame::new_checked(frame).ok()?; + if eth.ethertype() != EthernetProtocol::Arp { + return None; + } + let arp = ArpPacket::new_checked(eth.payload()).ok()?; + let repr = ArpRepr::parse(&arp).ok()?; + if let ArpRepr::EthernetIpv4 { + operation: ArpOperation::Reply, + source_hardware_addr, + source_protocol_addr, + .. + } = repr + { + Some((source_hardware_addr, source_protocol_addr)) + } else { + None + } +} + +#[test] +fn arp_replies_for_gateway() { + let mut stack = SlirpStack::new().unwrap(); + stack + .process_guest_frame(&build_arp_request(SLIRP_GATEWAY_IP)) + .unwrap(); + let reply = drain_n(&mut stack, 2) + .into_iter() + .find_map(|f| parse_arp_reply(&f)) + .expect("arp reply for gateway"); + assert_eq!(reply.1, SLIRP_GATEWAY_IP); + assert_eq!(reply.0, EthernetAddress(GATEWAY_MAC)); +} + +#[test] +fn arp_replies_for_random_subnet_ip() { + let mut stack = SlirpStack::new().unwrap(); + stack + .process_guest_frame(&build_arp_request(Ipv4Address::new(10, 0, 2, 99))) + .unwrap(); + let reply = drain_n(&mut stack, 2) + .into_iter() + .find_map(|f| parse_arp_reply(&f)) + .expect("arp reply for in-subnet IP"); + assert_eq!(reply.0, EthernetAddress(GATEWAY_MAC)); +} + +#[test] +fn arp_does_not_reply_for_guest_ip() { + let mut stack = SlirpStack::new().unwrap(); + stack + .process_guest_frame(&build_arp_request(SLIRP_GUEST_IP)) + .unwrap(); + let reply = drain_n(&mut stack, 2) + .into_iter() + .find_map(|f| parse_arp_reply(&f)); + assert!(reply.is_none(), "stack must not claim guest's own IP"); +} +``` + +- [ ] **Step 2: Run.** `cargo test --test network_baseline arp_` + +- [ ] **Step 3: Commit.** + 
+```bash
+git add tests/network_baseline.rs
+git commit -m "test(network): pin ARP reply behavior for gateway and subnet"
+```
+
+---
+
+### Task 0A.7: Pin DNS cache and forwarding
+
+**Files:**
+- Modify: `tests/network_baseline.rs`
+
+- [ ] **Step 1: Add four DNS tests.** A real recursive resolver is
+  required; tests skip cleanly if no nameserver is reachable.
+
+```rust
+fn build_dns_query(xid: u16, qname: &[u8]) -> Vec<u8> {
+ use void_box::network::slirp::SLIRP_DNS_IP;
+ // Minimal DNS query: header + QNAME + QTYPE=A + QCLASS=IN
+ let mut payload = Vec::new();
+ payload.extend_from_slice(&xid.to_be_bytes()); // ID
+ payload.extend_from_slice(&[0x01, 0x00]); // standard query, RD=1
+ payload.extend_from_slice(&[0x00, 0x01]); // QDCOUNT=1
+ payload.extend_from_slice(&[0x00, 0x00]); // ANCOUNT
+ payload.extend_from_slice(&[0x00, 0x00]); // NSCOUNT
+ payload.extend_from_slice(&[0x00, 0x00]); // ARCOUNT
+ payload.extend_from_slice(qname);
+ payload.extend_from_slice(&[0x00, 0x01]); // QTYPE=A
+ payload.extend_from_slice(&[0x00, 0x01]); // QCLASS=IN
+ build_udp_frame(SLIRP_DNS_IP, GUEST_EPHEMERAL_PORT, 53, &payload)
+}
+
+fn parse_dns_reply_xid(frame: &[u8]) -> Option<u16> {
+ let eth = EthernetFrame::new_checked(frame).ok()?;
+ if eth.ethertype() != EthernetProtocol::Ipv4 {
+ return None;
+ }
+ let ip = Ipv4Packet::new_checked(eth.payload()).ok()?;
+ if ip.next_header() != IpProtocol::Udp {
+ return None;
+ }
+ let udp = UdpPacket::new_checked(ip.payload()).ok()?;
+ if udp.src_port() != 53 {
+ return None;
+ }
+ let p = udp.payload();
+ if p.len() < 2 {
+ return None;
+ }
+ Some(u16::from_be_bytes([p[0], p[1]]))
+}
+
+// `\x07example\x03com\x00`
+const QNAME_EXAMPLE_COM: &[u8] = b"\x07example\x03com\x00";
+
+#[test]
+fn dns_query_resolves() {
+ let mut stack = match SlirpStack::new() {
+ Ok(s) => s,
+ Err(_) => return, // no /etc/resolv.conf; skip
+ };
+ stack
+ .process_guest_frame(&build_dns_query(0x1234, QNAME_EXAMPLE_COM))
+ .unwrap();
+ // Resolution is async on 
net-poll thread. Drain up to 20× 100ms. + let mut got = None; + for _ in 0..20 { + for f in drain_n(&mut stack, 1) { + if let Some(xid) = parse_dns_reply_xid(&f) { + got = Some(xid); + } + } + if got.is_some() { + break; + } + std::thread::sleep(std::time::Duration::from_millis(100)); + } + if got.is_none() { + eprintln!("skip: no upstream DNS reachable"); + return; + } + assert_eq!(got, Some(0x1234)); +} + +#[test] +fn dns_cache_keys_by_question_not_xid() { + let mut stack = match SlirpStack::new() { + Ok(s) => s, + Err(_) => return, + }; + // Warm cache with xid=1. + stack + .process_guest_frame(&build_dns_query(0x0001, QNAME_EXAMPLE_COM)) + .unwrap(); + for _ in 0..20 { + let _ = drain_n(&mut stack, 1); + std::thread::sleep(std::time::Duration::from_millis(50)); + } + // Query with xid=2 — should hit cache and reply with xid=2. + stack + .process_guest_frame(&build_dns_query(0x0002, QNAME_EXAMPLE_COM)) + .unwrap(); + let frames = drain_n(&mut stack, 4); + let xid = frames.iter().find_map(|f| parse_dns_reply_xid(f)); + if xid.is_none() { + eprintln!("skip: cache warmup did not complete"); + return; + } + assert_eq!(xid, Some(0x0002), "cache must rewrite xid on hit"); +} +``` + +- [ ] **Step 2: Run.** + +```bash +cargo test --test network_baseline dns_ +``` + +- [ ] **Step 3: Commit.** + +```bash +git add tests/network_baseline.rs +git commit -m "test(network): pin DNS resolution and cache xid-rewrite" +``` + +--- + +### Task 0A.8: Pin "broken on purpose" — UDP non-DNS dropped + +**Files:** +- Modify: `tests/network_baseline.rs` + +- [ ] **Step 1: Write the dropped-on-purpose test.** + +```rust +/// BROKEN_ON_PURPOSE — flips in Phase 2. +/// +/// Today: UDP datagrams to any port other than 53 are silently +/// dropped (`slirp.rs:637` "drop silently"). A bound host UDP socket +/// receives nothing. +#[test] +fn udp_non_dns_silently_dropped() { + // Bind a host UDP socket; we'll prove nothing arrives. 
+ let host_sock = UdpSocket::bind("127.0.0.1:0").unwrap(); + let host_port = host_sock.local_addr().unwrap().port(); + host_sock + .set_read_timeout(Some(std::time::Duration::from_millis(200))) + .unwrap(); + + let mut stack = SlirpStack::new().unwrap(); + stack + .process_guest_frame(&build_udp_frame( + SLIRP_GATEWAY_IP, + GUEST_EPHEMERAL_PORT, + host_port, + b"hello", + )) + .unwrap(); + let _ = drain_n(&mut stack, 4); + + let mut buf = [0u8; 32]; + let received = host_sock.recv(&mut buf).is_ok(); + assert!( + !received, + "BROKEN_ON_PURPOSE: today UDP-to-non-53 is dropped. \ + If this fires, Phase 2 likely landed — flip to assert!(received)." + ); +} +``` + +- [ ] **Step 2: Run.** `cargo test --test network_baseline udp_non_dns_silently_dropped` + +- [ ] **Step 3: Commit.** + +```bash +git add tests/network_baseline.rs +git commit -m "test(network): BROKEN_ON_PURPOSE pin — UDP non-DNS dropped" +``` + +--- + +### Task 0A.9: Pin "broken on purpose" — ICMP echo dropped + +**Files:** +- Modify: `tests/network_baseline.rs` + +- [ ] **Step 1: Write the dropped-on-purpose test.** + +```rust +/// BROKEN_ON_PURPOSE — flips in Phase 1. +/// +/// Today: ICMP echo requests are silently dropped at +/// `slirp.rs:637`. Phase 1 adds `IPPROTO_ICMP SOCK_DGRAM` echo +/// translation. +#[test] +fn icmp_echo_silently_dropped() { + // Build a minimal ICMP echo request as an IPv4 packet inside an + // Ethernet frame. We don't have an `IcmpRepr` builder set up; do + // it by hand against smoltcp wire types. 
+ use smoltcp::wire::{Icmpv4Packet, Icmpv4Repr}; + + let icmp_repr = Icmpv4Repr::EchoRequest { + ident: 0xbeef, + seq_no: 1, + data: b"ping", + }; + let ip_repr = Ipv4Repr { + src_addr: SLIRP_GUEST_IP, + dst_addr: Ipv4Address::new(8, 8, 8, 8), + next_header: IpProtocol::Icmp, + payload_len: icmp_repr.buffer_len(), + hop_limit: 64, + }; + let eth_repr = EthernetRepr { + src_addr: EthernetAddress(GUEST_MAC), + dst_addr: EthernetAddress(GATEWAY_MAC), + ethertype: EthernetProtocol::Ipv4, + }; + let total = ETH_HDR_LEN + ip_repr.buffer_len() + icmp_repr.buffer_len(); + let mut buf = vec![0u8; total]; + let mut eth = EthernetFrame::new_unchecked(&mut buf[..]); + eth_repr.emit(&mut eth); + let mut ip = Ipv4Packet::new_unchecked(&mut buf[ETH_HDR_LEN..]); + ip_repr.emit(&mut ip, &Default::default()); + let mut icmp = Icmpv4Packet::new_unchecked( + &mut buf[ETH_HDR_LEN + ip_repr.buffer_len()..], + ); + icmp_repr.emit(&mut icmp, &Default::default()); + + let mut stack = SlirpStack::new().unwrap(); + stack.process_guest_frame(&buf).unwrap(); + let frames = drain_n(&mut stack, 4); + + let saw_icmp_reply = frames.iter().any(|f| { + EthernetFrame::new_checked(f.as_slice()) + .ok() + .and_then(|e| { + if e.ethertype() != EthernetProtocol::Ipv4 { + return None; + } + Ipv4Packet::new_checked(e.payload()).ok().map(|ip| { + ip.next_header() == IpProtocol::Icmp + && ip.dst_addr() == SLIRP_GUEST_IP + }) + }) + .unwrap_or(false) + }); + assert!( + !saw_icmp_reply, + "BROKEN_ON_PURPOSE: today ICMP echo is dropped. \ + Phase 1 should flip this to assert!(saw_icmp_reply)." 
+ ); +} +``` + +- [ ] **Step 2: Run.** `cargo test --test network_baseline icmp_echo_silently_dropped` + +- [ ] **Step 3: Commit.** + +```bash +git add tests/network_baseline.rs +git commit -m "test(network): BROKEN_ON_PURPOSE pin — ICMP echo dropped" +``` + +--- + +## Workstream 0B — divan microbenches (`benches/network.rs`) + +### Task 0B.1: Bench file scaffolding + first three benches + +**Files:** +- Create: `benches/network.rs` +- Modify: `Cargo.toml` (register `[[bench]] name = "network"`) + +- [ ] **Step 1: Create the bench file.** + +```rust +//! Divan micro-benchmarks for SLIRP hot paths. +//! +//! Mirrors `benches/startup.rs` in shape. Job: regression detection +//! for the per-packet hot path on the vCPU and net-poll threads. +//! +//! Run with: `cargo bench --bench network` + +#![cfg(target_os = "linux")] + +use divan::Bencher; +use smoltcp::wire::{ + EthernetAddress, EthernetFrame, EthernetProtocol, EthernetRepr, IpProtocol, Ipv4Address, + Ipv4Packet, Ipv4Repr, TcpControl, TcpPacket, TcpRepr, +}; +use void_box::network::slirp::{ + SlirpStack, GATEWAY_MAC, GUEST_MAC, SLIRP_GATEWAY_IP, SLIRP_GUEST_IP, +}; + +fn main() { + divan::main(); +} + +fn build_syn(src_port: u16, dst_port: u16) -> Vec { + let tcp = TcpRepr { + src_port, + dst_port, + control: TcpControl::Syn, + seq_number: smoltcp::wire::TcpSeqNumber(1000), + ack_number: None, + window_len: 65535, + window_scale: None, + max_seg_size: None, + sack_permitted: false, + sack_ranges: [None, None, None], + timestamp: None, + payload: &[], + }; + let ip = Ipv4Repr { + src_addr: SLIRP_GUEST_IP, + dst_addr: SLIRP_GATEWAY_IP, + next_header: IpProtocol::Tcp, + payload_len: tcp.buffer_len(), + hop_limit: 64, + }; + let eth = EthernetRepr { + src_addr: EthernetAddress(GUEST_MAC), + dst_addr: EthernetAddress(GATEWAY_MAC), + ethertype: EthernetProtocol::Ipv4, + }; + let total = 14 + ip.buffer_len() + tcp.buffer_len(); + let mut buf = vec![0u8; total]; + let mut e = EthernetFrame::new_unchecked(&mut buf[..]); + 
eth.emit(&mut e); + let mut ipp = Ipv4Packet::new_unchecked(&mut buf[14..]); + ip.emit(&mut ipp, &Default::default()); + let mut tcpp = TcpPacket::new_unchecked(&mut buf[14 + ip.buffer_len()..]); + tcp.emit( + &mut tcpp, + &smoltcp::wire::IpAddress::Ipv4(SLIRP_GUEST_IP), + &smoltcp::wire::IpAddress::Ipv4(SLIRP_GATEWAY_IP), + &Default::default(), + ); + buf +} + +#[divan::bench] +fn process_syn(bencher: Bencher) { + let frame = build_syn(49152, 1); + bencher.bench_local(|| { + let mut stack = SlirpStack::new().unwrap(); + let _ = stack.process_guest_frame(divan::black_box(&frame)); + }); +} + +#[divan::bench] +fn poll_idle(bencher: Bencher) { + let mut stack = SlirpStack::new().unwrap(); + bencher.bench_local(|| { + let _ = divan::black_box(&mut stack).poll(); + }); +} + +#[divan::bench] +fn process_arp_request(bencher: Bencher) { + use smoltcp::wire::{ArpOperation, ArpPacket, ArpRepr}; + let arp_repr = ArpRepr::EthernetIpv4 { + operation: ArpOperation::Request, + source_hardware_addr: EthernetAddress(GUEST_MAC), + source_protocol_addr: SLIRP_GUEST_IP, + target_hardware_addr: EthernetAddress([0; 6]), + target_protocol_addr: SLIRP_GATEWAY_IP, + }; + let eth = EthernetRepr { + src_addr: EthernetAddress(GUEST_MAC), + dst_addr: EthernetAddress([0xff; 6]), + ethertype: EthernetProtocol::Arp, + }; + let total = 14 + arp_repr.buffer_len(); + let mut buf = vec![0u8; total]; + let mut e = EthernetFrame::new_unchecked(&mut buf[..]); + eth.emit(&mut e); + let mut a = ArpPacket::new_unchecked(&mut buf[14..]); + arp_repr.emit(&mut a); + + bencher.bench_local(|| { + let mut stack = SlirpStack::new().unwrap(); + let _ = stack.process_guest_frame(divan::black_box(&buf)); + }); +} +``` + +- [ ] **Step 2: Register in `Cargo.toml`.** + +```toml +[[bench]] +name = "network" +path = "benches/network.rs" +harness = false +``` + +- [ ] **Step 3: Build and run.** + +```bash +cargo bench --bench network --no-run +cargo bench --bench network process_syn +``` + +Expected: divan prints timing, 
e.g. `process_syn fastest=…us`. + +- [ ] **Step 4: Commit.** + +```bash +git add benches/network.rs Cargo.toml +git commit -m "bench(network): divan microbenches for SLIRP hot paths" +``` + +--- + +### Task 0B.2: Parametric NAT-walk scaling bench + +**Files:** +- Modify: `benches/network.rs` + +- [ ] **Step 1: Add the parametric bench.** Append: + +```rust +/// Open `n` distinct guest→gateway flows, then time `poll()`. +/// This walks the NAT table — `O(n)` today; the unified flow table +/// in Phase 4 should keep it `O(n)` but with smaller constants. +#[divan::bench(args = [1, 100, 1000])] +fn poll_with_n_flows(bencher: Bencher, n: usize) { + let mut stack = SlirpStack::new().unwrap(); + for i in 0..n { + let frame = build_syn(49152u16.wrapping_add(i as u16), 1); + let _ = stack.process_guest_frame(&frame); + } + bencher.bench_local(|| { + let _ = divan::black_box(&mut stack).poll(); + }); +} +``` + +- [ ] **Step 2: Run.** + +```bash +cargo bench --bench network poll_with_n_flows +``` + +- [ ] **Step 3: Commit.** + +```bash +git add benches/network.rs +git commit -m "bench(network): parametric NAT-walk scaling at 1/100/1000 flows" +``` + +--- + +### Task 0B.3: DNS cache hit/miss benches + +**Files:** +- Modify: `benches/network.rs` + +- [ ] **Step 1: Append DNS benches.** + +```rust +fn build_dns_query_for_bench(xid: u16) -> Vec { + use smoltcp::wire::{UdpPacket, UdpRepr}; + use void_box::network::slirp::SLIRP_DNS_IP; + let mut payload = Vec::new(); + payload.extend_from_slice(&xid.to_be_bytes()); + payload.extend_from_slice(&[0x01, 0x00, 0x00, 0x01, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00]); + payload.extend_from_slice(b"\x07example\x03com\x00"); + payload.extend_from_slice(&[0x00, 0x01, 0x00, 0x01]); + + let udp_repr = UdpRepr { + src_port: 49152, + dst_port: 53, + }; + let ip_repr = Ipv4Repr { + src_addr: SLIRP_GUEST_IP, + dst_addr: SLIRP_DNS_IP, + next_header: IpProtocol::Udp, + payload_len: 8 + payload.len(), + hop_limit: 64, + }; + let eth = EthernetRepr { + 
src_addr: EthernetAddress(GUEST_MAC), + dst_addr: EthernetAddress(GATEWAY_MAC), + ethertype: EthernetProtocol::Ipv4, + }; + let total = 14 + ip_repr.buffer_len() + 8 + payload.len(); + let mut buf = vec![0u8; total]; + let mut e = EthernetFrame::new_unchecked(&mut buf[..]); + eth.emit(&mut e); + let mut ip = Ipv4Packet::new_unchecked(&mut buf[14..]); + ip_repr.emit(&mut ip, &Default::default()); + let mut udp = UdpPacket::new_unchecked(&mut buf[14 + ip_repr.buffer_len()..]); + udp_repr.emit( + &mut udp, + &smoltcp::wire::IpAddress::Ipv4(SLIRP_GUEST_IP), + &smoltcp::wire::IpAddress::Ipv4(SLIRP_DNS_IP), + 8 + payload.len(), + |b| b.copy_from_slice(&payload), + &Default::default(), + ); + buf +} + +#[divan::bench] +fn dns_cache_miss(bencher: Bencher) { + let frame = build_dns_query_for_bench(1); + bencher.bench_local(|| { + let mut stack = SlirpStack::new().unwrap(); + let _ = stack.process_guest_frame(divan::black_box(&frame)); + }); +} + +#[divan::bench] +fn dns_cache_hit(bencher: Bencher) { + // Warm cache by injecting one query and polling resolution. + let mut stack = SlirpStack::new().unwrap(); + let warm = build_dns_query_for_bench(1); + let _ = stack.process_guest_frame(&warm); + for _ in 0..20 { + let _ = stack.poll(); + std::thread::sleep(std::time::Duration::from_millis(50)); + } + let hit = build_dns_query_for_bench(2); + bencher.bench_local(|| { + let _ = divan::black_box(&mut stack).process_guest_frame(divan::black_box(&hit)); + }); +} +``` + +- [ ] **Step 2: Run.** `cargo bench --bench network dns_` + +- [ ] **Step 3: Commit.** + +```bash +git add benches/network.rs +git commit -m "bench(network): DNS cache hit and miss paths" +``` + +--- + +### Task 0B.4: Wire CI extension + +**Files:** +- Modify: `.github/workflows/startup-bench.yml` (add a `network` step) + +- [ ] **Step 1: Read the existing workflow** to learn the regression + threshold mechanism. 
+
+```bash
+cat .github/workflows/startup-bench.yml
+```
+
+- [ ] **Step 2: Add a parallel job/step** that runs
+  `cargo bench --bench network` and compares against `main` baseline
+  using the same mechanism the startup bench uses. Concrete diff
+  depends on what's already there — match the pattern; do not
+  duplicate infrastructure.
+
+- [ ] **Step 3: Push to a feature branch and verify the workflow
+  runs.** If the divan output format the existing workflow expects
+  doesn't match, adjust the workflow rather than divan output (divan
+  has a single canonical JSON format; rely on it).
+
+- [ ] **Step 4: Commit.**
+
+```bash
+git add .github/workflows/startup-bench.yml
+git commit -m "ci(bench): include network microbenches in regression gate"
+```
+
+---
+
+## Workstream 0C — Wall-clock e2e harness (`voidbox-network-bench`)
+
+### Task 0C.1: Binary scaffold
+
+**Files:**
+- Create: `src/bin/voidbox-network-bench/main.rs`
+- Modify: `Cargo.toml` (register `[[bin]] name = "voidbox-network-bench"`)
+
+- [ ] **Step 1: Create the binary scaffold.**
+
+```rust
+//! Wall-clock end-to-end network benchmark harness.
+//!
+//! Boots a real VM and measures TCP throughput, RR/CRR latency, and
+//! UDP DNS qps inside the guest. Output is JSON for diffing against
+//! a baseline.
+//!
+//! Mirrors `voidbox-startup-bench` in CLI shape and lifecycle.
+//!
+//! Linux-only because the smoltcp-based SLIRP stack is Linux-only.
+
+#![cfg(target_os = "linux")]
+
+use clap::Parser;
+use serde::Serialize;
+use std::path::PathBuf;
+use std::time::Duration;
+
+#[derive(Parser, Debug)]
+#[command(version, about = "VoidBox network benchmark harness")]
+struct Cli {
+    /// Number of iterations per metric.
+    #[arg(long, default_value_t = 5)]
+    iterations: u32,
+
+    /// Output JSON file. If omitted, prints to stdout.
+    #[arg(long)]
+    output: Option<PathBuf>,
+
+    /// Skip throughput measurements (useful for fast smoke runs).
+    #[arg(long, default_value_t = false)]
+    no_throughput: bool,
+}
+
+#[derive(Serialize, Debug, Default)]
+struct Report {
+    tcp_throughput_g2h_mbps: Option<f64>,
+    tcp_throughput_h2g_mbps: Option<f64>,
+    tcp_rr_latency_us_p50: Option<f64>,
+    tcp_rr_latency_us_p99: Option<f64>,
+    tcp_crr_latency_us_p50: Option<f64>,
+    udp_dns_qps: Option<f64>,
+    icmp_rr_latency_us_p50: Option<f64>, // None today; populated post-Phase-1
+}
+
+fn main() -> Result<(), Box<dyn std::error::Error>> {
+    let cli = Cli::parse();
+    let mut report = Report::default();
+
+    eprintln!("voidbox-network-bench: scaffold (no measurements yet)");
+    let _ = (cli.iterations, &cli.output, cli.no_throughput, &mut report);
+
+    let json = serde_json::to_string_pretty(&report)?;
+    match cli.output {
+        Some(path) => std::fs::write(path, json)?,
+        None => println!("{json}"),
+    }
+    Ok(())
+}
+
+#[allow(dead_code)]
+fn percentile(samples: &mut [Duration], p: f64) -> Duration {
+    samples.sort();
+    let idx = ((samples.len() as f64) * p).clamp(0.0, samples.len() as f64 - 1.0) as usize;
+    samples[idx]
+}
+```
+
+- [ ] **Step 2: Register in `Cargo.toml`.**
+
+```toml
+[[bin]]
+name = "voidbox-network-bench"
+path = "src/bin/voidbox-network-bench/main.rs"
+```
+
+- [ ] **Step 3: Build.**
+
+```bash
+cargo build --bin voidbox-network-bench
+```
+
+- [ ] **Step 4: Smoke run.**
+
+```bash
+cargo run --bin voidbox-network-bench
+```
+
+Expected: prints JSON with all `null` fields.
+
+- [ ] **Step 5: Commit.**
+
+```bash
+git add src/bin/voidbox-network-bench Cargo.toml
+git commit -m "bench(network): voidbox-network-bench binary scaffold"
+```
+
+---
+
+### Task 0C.2: TCP throughput measurement
+
+**Files:**
+- Modify: `src/bin/voidbox-network-bench/main.rs`
+
+- [ ] **Step 1: Read the existing startup-bench harness** to learn
+  the VM lifecycle pattern.
+
+```bash
+# Use LSP `documentSymbol` on src/bin/voidbox-startup-bench/main.rs
+# to map its functions, then read the run loop.
+```
+
+- [ ] **Step 2: Implement `measure_tcp_throughput`** that:
+  1. Starts a host-side iperf3 server (or a Rust echo loop on a
+     TCP socket).
+  2. Boots a VM whose initramfs includes `iperf3`.
+  3. Execs `iperf3 -c 10.0.2.2 -t 5 -p <port> --json` inside the
+     guest via the existing `ControlChannel::exec`.
+  4. Parses the JSON, extracts bits-per-second, returns Mbps.
+  5. Stops the VM.
+- [ ] **Step 3:** Wire the function into `main` for both directions
+  (g2h, h2g) and populate `report.tcp_throughput_*`.
+- [ ] **Step 4: Smoke run.**
+
+```bash
+cargo run --bin voidbox-network-bench -- --iterations 1
+```
+
+- [ ] **Step 5: Commit.**
+
+```bash
+git add src/bin/voidbox-network-bench/main.rs
+git commit -m "bench(network): TCP throughput via iperf3 inside VM"
+```
+
+> **Note for the implementer:** the test image
+> (`/tmp/void-box-test-rootfs.cpio.gz`) does not include `iperf3` by
+> default. Either extend `scripts/build_test_image.sh` to include it,
+> or write a hand-rolled echo loop in Rust that ships with the
+> harness. The latter is simpler and recommended — see passt's
+> `test/perf/` for the methodology to copy.
+
+---
+
+### Task 0C.3: RR / CRR latency
+
+**Files:**
+- Modify: `src/bin/voidbox-network-bench/main.rs`
+
+- [ ] **Step 1: Implement `measure_rr_latency`** — open a TCP echo
+  socket on the host, run a guest-side loop that does
+  `connect+send+recv+close` (CRR) or `send+recv` on a kept-open
+  connection (RR), record `iterations` samples, return p50/p99 in µs.
+- [ ] **Step 2:** Wire into `main`. Populate
+  `report.tcp_rr_latency_us_*` and `report.tcp_crr_latency_us_p50`.
+- [ ] **Step 3: Run.** + +```bash +cargo run --bin voidbox-network-bench -- --iterations 100 --no-throughput +``` + +- [ ] **Step 4: Commit.** + +```bash +git add src/bin/voidbox-network-bench/main.rs +git commit -m "bench(network): TCP RR/CRR latency p50/p99" +``` + +--- + +### Task 0C.4: UDP DNS qps + JSON baseline + +**Files:** +- Modify: `src/bin/voidbox-network-bench/main.rs` + +- [ ] **Step 1: Implement `measure_dns_qps`** — guest-side loop + resolving `example.com` against the SLIRP DNS at 10.0.2.3, count + successful replies in a fixed window, divide. +- [ ] **Step 2:** Wire into `main`, populate `report.udp_dns_qps`. +- [ ] **Step 3: Run** with `--output baseline.json` and inspect: + +```bash +cargo run --bin voidbox-network-bench -- --output baseline.json +cat baseline.json +``` + +- [ ] **Step 4: Commit and stash a `baseline.json`** as a build + artifact (do **not** commit it — it's machine-specific). Document + in the binary's `--help` output how to use it for diffing. + +```bash +git add src/bin/voidbox-network-bench/main.rs +git commit -m "bench(network): UDP DNS qps and JSON report output" +``` + +--- + +## Workstream 0D — Trait extraction + rename + +### Task 0D.1: Define `NetworkBackend` trait + +**Files:** +- Modify: `src/network/mod.rs` + +- [ ] **Step 1: Use LSP `documentSymbol`** on `src/network/mod.rs` to + confirm where to insert the trait (after `NetworkConfig`, before + `TapDevice`). +- [ ] **Step 2: Add the trait.** + +```rust +use std::io; + +/// A network backend processes raw Ethernet frames between guest and +/// host. +/// +/// Implementations must be `Send` so they can be held behind +/// `Arc>` and accessed from both the vCPU thread (TX path) +/// and the net-poll thread (RX path). +pub trait NetworkBackend: Send { + /// Process a raw Ethernet frame sent by the guest. + /// + /// Called from the vCPU thread on MMIO write to the TX virtqueue. + /// Implementations must not block. 
+    fn process_guest_frame(&mut self, frame: &[u8]) -> io::Result<()>;
+
+    /// Drain Ethernet frames destined for the guest into `out`.
+    ///
+    /// Called every ~5ms from the net-poll thread. Frames are
+    /// complete Ethernet payloads — no virtio-net header (the caller
+    /// prepends that). The buffer is reused across calls to avoid
+    /// per-poll allocation.
+    fn drain_to_guest(&mut self, out: &mut Vec<Vec<u8>>);
+
+    /// Backend health.
+    ///
+    /// `false` means the backend has entered an unrecoverable state
+    /// and should be reconstructed by the caller. The default
+    /// implementation always returns `true`.
+    fn is_healthy(&self) -> bool {
+        true
+    }
+}
+```
+
+> **Apply `rustdoc` skill:** confirm the doc comment style — summary
+> sentence first, no leading "This trait …", `# Errors` /
+> `# Panics` if applicable. The above complies.
+
+- [ ] **Step 3: Build.** `cargo check --target-dir target/check`
+- [ ] **Step 4: Commit.**
+
+```bash
+git add src/network/mod.rs
+git commit -m "feat(network): introduce NetworkBackend trait"
+```
+
+---
+
+### Task 0D.2: Tighten `SlirpStack::poll` to `drain_to_guest` signature
+
+**Files:**
+- Modify: `src/network/slirp.rs`
+
+- [ ] **Step 1: Use LSP `findReferences`** on `SlirpStack::poll` to
+  list every call site — these all need to switch to
+  `drain_to_guest(&mut out)`.
+
+```bash
+# Inside the IDE / via LSP:
+# goToDefinition on `poll` → 392
+# findReferences on `poll` → list all callers
+```
+
+- [ ] **Step 2: Add the new method on `SlirpStack`** (do not yet
+  remove `poll` — keep both during the rename to keep the build
+  green).
+
+```rust
+/// Drain frames destined to the guest into `out`. Reuses the buffer
+/// across calls. See `NetworkBackend::drain_to_guest`.
+pub fn drain_to_guest(&mut self, out: &mut Vec<Vec<u8>>) {
+    out.append(&mut self.poll());
+}
+```
+
+This is a thin wrapper for now — the real allocation drop happens in
+**Task 0D.3** when the `poll` body moves into `drain_to_guest`.
+
+- [ ] **Step 3: Build.** `cargo check`
+- [ ] **Step 4: Commit.**
+
+```bash
+git add src/network/slirp.rs
+git commit -m "refactor(slirp): add drain_to_guest wrapper for trait fit"
+```
+
+---
+
+### Task 0D.3: Move `poll` body into `drain_to_guest`, drop the per-call alloc
+
+**Files:**
+- Modify: `src/network/slirp.rs`
+
+- [ ] **Step 1: Use LSP `goToDefinition`** on
+  `SlirpStack::poll` (around line 392) to land on its body.
+- [ ] **Step 2: Refactor.** Move the body of `poll` into
+  `drain_to_guest`, replacing every `self.inject_to_guest.drain(..)`
+  / `Vec::new()` allocation with appends to `out`.
+
+Before:
+
+```rust
+pub fn poll(&mut self) -> Vec<Vec<u8>> {
+    // ... existing body that builds and returns Vec<Vec<u8>>
+}
+
+pub fn drain_to_guest(&mut self, out: &mut Vec<Vec<u8>>) {
+    out.append(&mut self.poll());
+}
+```
+
+After:
+
+```rust
+pub fn drain_to_guest(&mut self, out: &mut Vec<Vec<u8>>) {
+    // ... body that pushes into `out` directly
+}
+
+#[deprecated(note = "use drain_to_guest")]
+pub fn poll(&mut self) -> Vec<Vec<u8>> {
+    let mut out = Vec::new();
+    self.drain_to_guest(&mut out);
+    out
+}
+```
+
+The deprecated `poll` keeps the existing tests/benches working while
+0D.4 migrates callers.
+
+- [ ] **Step 3: Build and run baseline tests.**
+
+```bash
+cargo check
+cargo test --test network_baseline
+```
+
+Expected: all baseline pins still green. The deprecation warning
+fires from the test file — that's intended; tests migrate in 0D.6.
+
+- [ ] **Step 4: Commit.**
+
+```bash
+git add src/network/slirp.rs
+git commit -m "refactor(slirp): move poll body into drain_to_guest, drop alloc"
+```
+
+---
+
+### Task 0D.4: `impl NetworkBackend for SlirpStack`
+
+**Files:**
+- Modify: `src/network/slirp.rs`
+
+- [ ] **Step 1: Add the impl.** Use the existing methods (return type
+  for `process_guest_frame` is `Result` — the trait wants
+  `io::Result<()>`; bridge in the impl).
+
+```rust
+use crate::network::NetworkBackend;
+use std::io;
+
+impl NetworkBackend for SlirpStack {
+    fn process_guest_frame(&mut self, frame: &[u8]) -> io::Result<()> {
+        SlirpStack::process_guest_frame(self, frame)
+            .map_err(|e| io::Error::new(io::ErrorKind::Other, e.to_string()))
+    }
+
+    fn drain_to_guest(&mut self, out: &mut Vec<Vec<u8>>) {
+        SlirpStack::drain_to_guest(self, out)
+    }
+}
+```
+
+> **Apply `rust-style` skill:** the closure can be a function-pointer
+> reference if `e.to_string()` works without arguments — but
+> `Error::to_string` takes `&self`, so the closure form is correct.
+> The trait method names shadow the inherent names; explicit
+> `SlirpStack::method(self, …)` disambiguates per project convention.
+
+- [ ] **Step 2: Build.** `cargo check`
+- [ ] **Step 3: Sanity test.**
+
+```rust
+// In tests/network_baseline.rs, behind the existing module, append:
+#[test]
+fn smoltcp_backend_implements_network_backend() {
+    fn assert_send<T: Send>() {}
+    fn assert_backend<T: NetworkBackend>() {}
+    assert_send::<SlirpStack>();
+    assert_backend::<SlirpStack>();
+}
+```
+
+```bash
+cargo test --test network_baseline smoltcp_backend_implements_network_backend
+```
+
+- [ ] **Step 4: Commit.**
+
+```bash
+git add src/network/slirp.rs tests/network_baseline.rs
+git commit -m "feat(slirp): impl NetworkBackend for SlirpStack"
+```
+
+---
+
+### Task 0D.5: Switch `VirtioNetDevice` to hold `Arc<Mutex<dyn NetworkBackend>>`
+
+**Files:**
+- Modify: `src/devices/virtio_net.rs`
+
+- [ ] **Step 1: Use LSP `documentSymbol`** on
+  `src/devices/virtio_net.rs` to map its struct + methods.
+- [ ] **Step 2: Use LSP `findReferences`** on the field that today
+  holds `Arc<Mutex<SlirpStack>>` to know all the access sites.
+- [ ] **Step 3: Apply `rust-analyzer-ssr`** to change
+  `Arc<Mutex<SlirpStack>>` → `Arc<Mutex<dyn NetworkBackend>>`
+  workspace-wide. SSR pattern (run from project root):
+
+```bash
+# From the LSP shell or via the `rust-analyzer-ssr` skill:
+# pattern: Arc<Mutex<SlirpStack>>
+# replace: Arc<Mutex<dyn NetworkBackend>>
+```
+
+- [ ] **Step 4: Update method bodies that called `poll()`** to call
+  `drain_to_guest(&mut buf)` against a reused buffer field.
+
+Before:
+
+```rust
+let frames = self.slirp.lock().unwrap().poll();
+for frame in frames { /* ... */ }
+```
+
+After:
+
+```rust
+self.rx_scratch.clear();
+self.slirp.lock().unwrap().drain_to_guest(&mut self.rx_scratch);
+for frame in self.rx_scratch.drain(..) { /* ... */ }
+```
+
+Add `rx_scratch: Vec<Vec<u8>>` to the struct, default-initialized.
+
+- [ ] **Step 5: Build + tests.**
+
+```bash
+cargo check
+cargo test --test network_baseline
+```
+
+- [ ] **Step 6: Commit.**
+
+```bash
+git add src/devices/virtio_net.rs
+git commit -m "refactor(virtio_net): hold dyn NetworkBackend, reuse rx buffer"
+```
+
+---
+
+### Task 0D.6: Update VMM construction sites (cold-boot + snapshot-restore)
+
+**Files:**
+- Modify: `src/vmm/mod.rs`
+
+- [ ] **Step 1: Use LSP `findReferences`** on `SlirpStack::new` and
+  `SlirpStack::with_security` to find every construction site.
+  Expect two: cold boot (around `Vm::new`) and snapshot restore
+  (around `restore`). Confirm via the file's `documentSymbol`.
+
+- [ ] **Step 2: Wrap each construction in `Arc<Mutex<…>>`** and bind
+  the variable type as `Arc<Mutex<dyn NetworkBackend>>`:
+
+```rust
+let backend: Arc<Mutex<dyn NetworkBackend>> = Arc::new(Mutex::new(
+    SlirpStack::with_security(max_conn, max_rate, deny.clone())?,
+));
+```
+
+- [ ] **Step 3: Build + tests.**
+
+```bash
+cargo check
+cargo test --workspace --all-features
+```
+
+- [ ] **Step 4: Run the LSP `workspaceSymbol`** lookup for any
+  remaining `SlirpStack` references that should now be hidden behind
+  the trait. Anything outside `src/network/` and the construction
+  sites is suspect.
+ +- [ ] **Step 5: Commit.** + +```bash +git add src/vmm/mod.rs +git commit -m "refactor(vmm): construct network backend behind dyn trait" +``` + +--- + +### Task 0D.7: Rename `SlirpStack → SlirpBackend` + +**Files:** +- Modify: `src/network/slirp.rs`, `tests/network_baseline.rs`, + `benches/network.rs`, `src/devices/virtio_net.rs`, + `src/vmm/mod.rs`, any other references LSP turns up. + +The module file `src/network/slirp.rs` keeps its name — only the +type is renamed. (The current filename already aligns with the new +type name, and matches the convention used elsewhere in the repo: +`src/devices/virtio_net.rs` holds `VirtioNetDevice`, not a +`virtio_net_device.rs` file.) + +- [ ] **Step 1: Use LSP rename** (`rust-analyzer` rename refactor) on + `SlirpStack` → `SlirpBackend`. **Do not text-substitute** — the + rename also touches `tests/network_baseline.rs` imports, the + `benches/network.rs` imports, and any `pub use` re-exports. + +- [ ] **Step 2: Build + run all tests.** + +```bash +cargo check +cargo test --workspace --all-features +cargo test --test network_baseline +``` + +- [ ] **Step 3: Final build.** `cargo check` + +- [ ] **Step 4: Commit.** + +```bash +git add -A +git commit -m "refactor(network): rename SlirpStack to SlirpBackend" +``` + +--- + +## Workstream 0E — Validation + ship + +### Task 0E.1: Full validation gate + +**Files:** none + +- [ ] **Step 1: Format + clippy.** + +```bash +cargo fmt --all -- --check +cargo clippy --workspace --all-targets --all-features -- -D warnings +``` + +- [ ] **Step 2: Workspace tests.** + +```bash +cargo test --workspace --all-features +cargo test --doc --workspace --all-features +``` + +- [ ] **Step 3: Network baseline.** + +```bash +cargo test --test network_baseline +``` + +Expected: all tests pass, including the three `BROKEN_ON_PURPOSE` +pins (they assert *broken* behavior — green is correct). 
+ +- [ ] **Step 4: Microbenches no-regression.** + +```bash +cargo bench --bench network +``` + +Compare against `main` baseline (CI does this automatically; do it +locally first). + +- [ ] **Step 5: VM suites that touch networking.** + +```bash +export VOID_BOX_KERNEL=/boot/vmlinuz-$(uname -r) +scripts/build_test_image.sh +export VOID_BOX_INITRAMFS=/tmp/void-box-test-rootfs.cpio.gz +cargo test --test conformance -- --ignored --test-threads=1 +cargo test --test snapshot_integration -- --ignored --test-threads=1 +cargo test --test e2e_skill_pipeline -- --ignored --test-threads=1 +cargo test --test e2e_mount -- --ignored --test-threads=1 +``` + +- [ ] **Step 6: Repo `verify` skill.** Run the project's quality + gate (`/verify`) — format, clippy, tests, security audit, startup + bench regression, real-workload smoke. + +- [ ] **Step 7: aarch64 cross-check** (per `AGENTS.md`). + +```bash +CFLAGS_aarch64_unknown_linux_gnu="--sysroot=/usr/aarch64-redhat-linux/sys-root/fc43" \ + RUSTFLAGS="-D warnings" \ + cargo check --target aarch64-unknown-linux-gnu -p void-box --lib --tests +``` + +- [ ] **Step 8: macOS build smoke** (if a macOS box is available, or + via CI). The trait extraction must not break the macOS build — + `NetworkBackend` lives in `src/network/mod.rs` (cross-platform); + the `SmoltcpBackend` impl is gated `#[cfg(target_os = "linux")]`. + +- [ ] **Step 9:** If any gate fails, fix in place and re-run from + Step 1. Do not proceed to PR until all gates green. + +--- + +### Task 0E.2: Open the PR + +**Files:** none + +- [ ] **Step 1: Push the branch.** + +```bash +git push -u origin smoltcp-passt-port-phase0 +``` + +- [ ] **Step 2: Open the PR** with body: + +```markdown +## Phase 0: baseline + NetworkBackend trait + +Implements Phase 0 of `docs/superpowers/plans/2026-04-27-smoltcp-passt-port.md`. 
+ +**Zero user-visible behavior change.** This PR lands: + +- `tests/network_baseline.rs` — 13 unit-level pins for the smoltcp-based + SLIRP stack, including three deliberately-broken assertions that + flip in Phases 1, 2, 3. +- `benches/network.rs` — divan microbenches for SLIRP hot paths + (process_syn, poll_idle, NAT-walk scaling, DNS cache hit/miss). +- `voidbox-network-bench` — wall-clock e2e harness with metric names + matching passt's published table. +- `NetworkBackend` trait in `src/network/mod.rs`. +- `SlirpStack` renamed to `SlirpBackend` (role-based name, + symmetric with future `TapBackend`/`VhostNetBackend`); `poll` + replaced by `drain_to_guest(&mut Vec>)` to drop the + per-poll allocation. + +## Test plan + +- [x] cargo fmt / clippy clean +- [x] cargo test --workspace --all-features +- [x] cargo test --test network_baseline +- [x] cargo bench --bench network — no regression +- [x] conformance, snapshot_integration, e2e_skill_pipeline, + e2e_mount green +- [x] aarch64 cross-check green +- [x] macOS build smoke green +- [x] /verify clean + +## Broken on purpose + +These three baseline pins assert today's broken behavior. They flip +in subsequent phases — do not "fix" them in this PR: + +- `tcp_to_host_buffer_drops_at_256kb` (flips in Phase 3) +- `udp_non_dns_silently_dropped` (flips in Phase 2) +- `icmp_echo_silently_dropped` (flips in Phase 1) +``` + +- [ ] **Step 3: Tag for review.** Phase 0 is mechanical; the trait + shape is the only design decision worth a second pair of eyes. + +--- + +## Self-review checklist (run before handing off) + +- [ ] Every task has explicit file paths, exact commands, expected + output. +- [ ] No `TBD`, no "implement appropriately", no "similar to Task N" + without repeating the code. +- [ ] Three `BROKEN_ON_PURPOSE` pins are present (Tasks 0A.4, 0A.8, + 0A.9) and each names the phase that flips it. +- [ ] Trait surface in 0D.1 matches the spec doc exactly + (`drain_to_guest` out-param, `is_healthy` default-true). 
+- [ ] Rename in 0D.7 uses LSP rename (rust-analyzer-ssr), not text + substitution. Type renames to `SlirpBackend` (role-based, not + `SmoltcpBackend`). +- [ ] Validation gate in 0E.1 covers fmt, clippy, workspace tests, + baseline tests, microbenches, VM suites, aarch64 cross-check, + macOS smoke. +- [ ] All Rust-touching tasks reference `rust-style` / `rustdoc` / + `rust-analyzer-ssr` where they apply. diff --git a/docs/superpowers/plans/2026-04-27-smoltcp-passt-port-phase1.md b/docs/superpowers/plans/2026-04-27-smoltcp-passt-port-phase1.md new file mode 100644 index 00000000..668d06eb --- /dev/null +++ b/docs/superpowers/plans/2026-04-27-smoltcp-passt-port-phase1.md @@ -0,0 +1,663 @@ +# Phase 1 Implementation Plan: ICMP Echo via Unprivileged SOCK_DGRAM IPPROTO_ICMP + +> **For agentic workers:** REQUIRED SUB-SKILL: superpowers:subagent-driven-development. +> Steps use checkbox (`- [ ]`) syntax for tracking. +> +> **Mandatory skills for every Rust-touching task:** +> `rust-style`, `rustdoc`, `rust-analyzer-ssr`, +> `superpowers:test-driven-development`, +> `superpowers:verification-before-completion`. Use LSP for navigation. + +**Spec:** [`2026-04-27-smoltcp-passt-port.md`](2026-04-27-smoltcp-passt-port.md) +**Continues from Phase 0:** [`2026-04-27-smoltcp-passt-port-phase0.md`](2026-04-27-smoltcp-passt-port-phase0.md) + +**Goal:** Make `ping` work inside guest VMs by relaying ICMP echo +through an unprivileged host kernel socket (`SOCK_DGRAM IPPROTO_ICMP`), +in the style of passt's `icmp.c`. Flip the `icmp_echo_silently_dropped` +BROKEN_ON_PURPOSE pin to assert the new behavior. + +**Architecture:** New `IcmpEchoEntry` per `(guest_id, dst_ip)` flow. +Each entry owns one `IPPROTO_ICMP` `SOCK_DGRAM` socket. `handle_icmp_frame` +sends echo requests through the socket; `relay_icmp_echo` polls socket +replies and emits ICMP echo reply frames to the guest. 
The host kernel +rewrites the ICMP id between guest_id and a kernel-assigned id; we +track the mapping per-flow and translate on the way back. + +**Tech Stack:** Rust 1.88, `libc` (existing dep) for `socket(2)` with +`IPPROTO_ICMP`, `smoltcp` 0.11 for `Icmpv4Packet`/`Icmpv4Repr` wire +types (already in use), `std::os::fd::FromRawFd` for the wrap. + +**Branch:** `smoltcp-passt-port-phase0` (same branch as Phase 0 — user +explicitly continues here, do not branch). + +--- + +## Cross-platform precondition + +Linux requires `net.ipv4.ping_group_range` to permit the calling GID +for unprivileged `IPPROTO_ICMP` sockets. The default on Fedora/Ubuntu +since ~2014 is `0 2147483647` (all gids), but it can be tightened by +admins. Approach: + +1. Try to open the socket once at `SlirpBackend::new` (or lazily on + first ICMP frame). If `socket()` returns `EACCES` or `EPERM`, log a + one-shot warning and **drop** ICMP frames as before. +2. macOS allows the same syscall unconditionally; no sysctl gate. + +This is the *exact* compatibility shape passt uses — see `icmp.c` +in `/home/diego/github/passt`. + +--- + +## Task structure + +7 tasks across two workstreams. 
+
+| ID | Workstream | Scope |
+|---|---|---|
+| 1.1 | impl | Add `IcmpEchoEntry` + per-flow socket helper |
+| 1.2 | impl | Wire `handle_icmp_frame` for guest→host echo path |
+| 1.3 | impl | Wire `relay_icmp_echo` for host→guest reply path |
+| 1.4 | impl | Sysctl-fallback to drop on `EACCES` / `EPERM` |
+| 1.5 | test | Flip `icmp_echo_silently_dropped` to assert reply |
+| 1.6 | bench | Populate `icmp_rr_latency_us_p50` in `voidbox-network-bench` |
+| 1.7 | gate | Validation + commit summary |
+
+---
+
+## Workstream 1A — Implementation (`src/network/slirp.rs`)
+
+### Task 1.1: `IcmpEchoEntry` + per-flow socket helper
+
+**Files:**
+- Modify: `src/network/slirp.rs`
+
+- [ ] **Step 1: Define a NatKey-style key for ICMP echo.**
+
+```rust
+#[derive(Clone, Copy, Debug, PartialEq, Eq, Hash)]
+struct IcmpEchoKey {
+    guest_id: u16,
+    dst_ip: Ipv4Address,
+}
+```
+
+- [ ] **Step 2: Define `IcmpEchoEntry`.**
+
+```rust
+struct IcmpEchoEntry {
+    /// Host-side socket, `socket(AF_INET, SOCK_DGRAM, IPPROTO_ICMP)`.
+    /// Set non-blocking; the kernel handles the ICMP framing.
+    sock: std::net::UdpSocket,
+    /// The guest's original ICMP id from the echo request. The kernel
+    /// assigns its own id when we send via the SOCK_DGRAM ICMP socket;
+    /// on reply we translate the kernel id back to `guest_id`.
+    guest_id: u16,
+    last_activity: std::time::Instant,
+}
+```
+
+`std::net::UdpSocket` is the wrapper we use — see Step 3 for why.
+
+- [ ] **Step 3: Add a helper `open_icmp_socket() -> io::Result<std::net::UdpSocket>`** at module scope:
+
+```rust
+fn open_icmp_socket() -> io::Result<std::net::UdpSocket> {
+    use std::os::fd::FromRawFd;
+
+    // SAFETY: socket(2) returns -1 on error; we check before wrapping.
+    // IPPROTO_ICMP + SOCK_DGRAM is the unprivileged ICMP path: kernel
+    // handles ICMP framing, no CAP_NET_RAW required.
+    let raw = unsafe {
+        libc::socket(
+            libc::AF_INET,
+            libc::SOCK_DGRAM | libc::SOCK_NONBLOCK | libc::SOCK_CLOEXEC,
+            libc::IPPROTO_ICMP,
+        )
+    };
+    if raw < 0 {
+        return Err(io::Error::last_os_error());
+    }
+    // SAFETY: `raw` is a valid fd from socket(2); UdpSocket adopts
+    // ownership and closes on drop.
+    Ok(unsafe { std::net::UdpSocket::from_raw_fd(raw) })
+}
+```
+
+Rationale: `std::net::UdpSocket` uses the SOCK_DGRAM I/O surface
+(`recv_from`, `send_to`); it doesn't care that the underlying protocol
+is ICMP rather than UDP. This is the same pattern passt uses (just
+with raw fds).
+
+- [ ] **Step 4: Add `icmp_echo: HashMap<IcmpEchoKey, IcmpEchoEntry>` field to `SlirpBackend`.**
+
+Initialize in `SlirpBackend::with_security(...)` and `SlirpBackend::new()`.
+
+- [ ] **Step 5: `cargo check`** — should compile clean. No behavior wired yet.
+
+- [ ] **Step 6: Commit.**
+
+```bash
+git add src/network/slirp.rs
+git commit -m "feat(slirp): add IcmpEchoEntry + IPPROTO_ICMP socket helper"
+```
+
+---
+
+### Task 1.2: `handle_icmp_frame` (guest → host)
+
+**Files:**
+- Modify: `src/network/slirp.rs`
+
+- [ ] **Step 1: Update `handle_ipv4_frame` to dispatch ICMP.** Around
+  line 654 (the "drop silently" branch), insert before it:
+
+```rust
+if protocol == IpProtocol::Icmp {
+    return self.handle_icmp_frame(&ipv4);
+}
+```
+
+- [ ] **Step 2: Add `handle_icmp_frame`** as a sibling of
+  `handle_dns_frame`.
Body: + +```rust +fn handle_icmp_frame(&mut self, ipv4: &Ipv4Packet<&[u8]>) -> Result<()> { + let icmp = match smoltcp::wire::Icmpv4Packet::new_checked(ipv4.payload()) { + Ok(p) => p, + Err(_) => return Ok(()), + }; + let repr = match smoltcp::wire::Icmpv4Repr::parse(&icmp, &Default::default()) { + Ok(r) => r, + Err(_) => return Ok(()), + }; + let (ident, seq_no, data) = match repr { + smoltcp::wire::Icmpv4Repr::EchoRequest { ident, seq_no, data } => { + (ident, seq_no, data) + } + _ => return Ok(()), // only echo request handled today + }; + + let key = IcmpEchoKey { guest_id: ident, dst_ip: ipv4.dst_addr() }; + let entry = match self.icmp_echo.entry(key) { + std::collections::hash_map::Entry::Occupied(o) => o.into_mut(), + std::collections::hash_map::Entry::Vacant(v) => { + let sock = match open_icmp_socket() { + Ok(s) => s, + Err(e) => { + // Sysctl-driven fallback handled in Task 1.4. + trace!("SLIRP ICMP: open socket failed: {e}"); + return Ok(()); + } + }; + v.insert(IcmpEchoEntry { + sock, + guest_id: ident, + last_activity: Instant::now(), + }) + } + }; + entry.last_activity = Instant::now(); + + // Build a wire ICMP echo packet with seq + data; the kernel will + // rewrite the ident on send_to. + let req = smoltcp::wire::Icmpv4Repr::EchoRequest { + ident: 0, // kernel rewrites + seq_no, + data, + }; + let mut buf = vec![0u8; req.buffer_len()]; + let mut pkt = smoltcp::wire::Icmpv4Packet::new_unchecked(&mut buf); + req.emit(&mut pkt, &Default::default()); + + let dst = std::net::SocketAddr::from(( + std::net::Ipv4Addr::from(ipv4.dst_addr().0), + 0u16, // port ignored for ICMP + )); + if let Err(e) = entry.sock.send_to(&buf, dst) { + trace!("SLIRP ICMP: send_to failed: {e}"); + } + Ok(()) +} +``` + +- [ ] **Step 3: cargo check + cargo test --test network_baseline.** The + ICMP test still passes today (assertion is `assert!(!saw_icmp_reply)` — + no reply yet because reply path is in Task 1.3). 
+
+- [ ] **Step 4: Commit.**
+
+```bash
+git add src/network/slirp.rs
+git commit -m "feat(slirp): forward guest ICMP echo via SOCK_DGRAM IPPROTO_ICMP"
+```
+
+---
+
+### Task 1.3: `relay_icmp_echo` (host → guest reply path)
+
+**Files:**
+- Modify: `src/network/slirp.rs`
+
+- [ ] **Step 1: Add a `relay_icmp_echo` method** alongside
+  `relay_tcp_nat_data`. Body:
+
+```rust
+fn relay_icmp_echo(&mut self) {
+    // Drain replies from each active ICMP socket and emit echo-reply
+    // frames to the guest.
+    let now = Instant::now();
+    const ICMP_IDLE_TIMEOUT: Duration = Duration::from_secs(60);
+
+    let keys: Vec<IcmpEchoKey> = self.icmp_echo.keys().copied().collect();
+    for key in keys {
+        let frame = {
+            let Some(entry) = self.icmp_echo.get_mut(&key) else { continue; };
+            if now.duration_since(entry.last_activity) > ICMP_IDLE_TIMEOUT {
+                None // mark for removal below
+            } else {
+                let mut buf = [0u8; 1500];
+                match entry.sock.recv_from(&mut buf) {
+                    Ok((n, _addr)) => {
+                        entry.last_activity = now;
+                        Some(Self::build_icmp_echo_reply_to_guest(
+                            key.dst_ip,
+                            entry.guest_id,
+                            &buf[..n],
+                        ))
+                    }
+                    Err(ref e) if e.kind() == io::ErrorKind::WouldBlock => continue,
+                    Err(_) => continue,
+                }
+            }
+        };
+        match frame {
+            None => {
+                self.icmp_echo.remove(&key);
+            }
+            Some(Some(f)) => self.inject_to_guest.push(f),
+            Some(None) => {} // build failed; drop silently
+        }
+    }
+}
+
+fn build_icmp_echo_reply_to_guest(
+    src_ip: Ipv4Address,
+    guest_id: u16,
+    raw_icmp: &[u8],
+) -> Option<Vec<u8>> {
+    use smoltcp::wire::*;
+    let icmp = Icmpv4Packet::new_checked(raw_icmp).ok()?;
+    let parsed = Icmpv4Repr::parse(&icmp, &Default::default()).ok()?;
+    let (seq_no, data) = match parsed {
+        Icmpv4Repr::EchoReply { seq_no, data, ..
} => (seq_no, data), + _ => return None, + }; + let reply = Icmpv4Repr::EchoReply { + ident: guest_id, + seq_no, + data, + }; + let ip_repr = Ipv4Repr { + src_addr: src_ip, + dst_addr: SLIRP_GUEST_IP, + next_header: IpProtocol::Icmp, + payload_len: reply.buffer_len(), + hop_limit: 64, + }; + let eth_repr = EthernetRepr { + src_addr: EthernetAddress(GATEWAY_MAC), + dst_addr: EthernetAddress(GUEST_MAC), + ethertype: EthernetProtocol::Ipv4, + }; + let total = 14 + ip_repr.buffer_len() + reply.buffer_len(); + let mut buf = vec![0u8; total]; + let mut eth = EthernetFrame::new_unchecked(&mut buf[..]); + eth_repr.emit(&mut eth); + let mut ip = Ipv4Packet::new_unchecked(&mut buf[14..]); + ip_repr.emit(&mut ip, &Default::default()); + let mut icmp_out = Icmpv4Packet::new_unchecked(&mut buf[14 + ip_repr.buffer_len()..]); + reply.emit(&mut icmp_out, &Default::default()); + Some(buf) +} +``` + +- [ ] **Step 2: Wire `relay_icmp_echo` into `drain_to_guest`.** Around + the existing `self.relay_tcp_nat_data();` call (find via LSP), add + `self.relay_icmp_echo();` immediately after. + +- [ ] **Step 3: cargo check + cargo test --test network_baseline.** All + 13 tests still pass; the broken-on-purpose assertion remains green + because Task 1.5 hasn't flipped it yet (Task 1.5 will demonstrate the + reply path actually works). + +- [ ] **Step 4: Commit.** + +```bash +git add src/network/slirp.rs +git commit -m "feat(slirp): relay ICMP echo replies back to guest" +``` + +--- + +### Task 1.4: Sysctl fallback (graceful degrade) + +**Files:** +- Modify: `src/network/slirp.rs` + +- [ ] **Step 1: Add a once-cell `static`** at module scope to track + whether ICMP support is available: + +```rust +use std::sync::atomic::{AtomicU8, Ordering}; + +/// Tristate: 0 = unknown, 1 = available, 2 = unavailable. 
+static ICMP_PROBE: AtomicU8 = AtomicU8::new(0);
+```
+
+- [ ] **Step 2: Probe in `open_icmp_socket`** — on the first call, try
+  the syscall; if it fails with `EACCES`/`EPERM`, set `ICMP_PROBE = 2`,
+  log a one-shot warning, and return `Err`. Subsequent calls short-circuit
+  on `2`.
+
+```rust
+fn open_icmp_socket() -> io::Result<std::net::UdpSocket> {
+    if ICMP_PROBE.load(Ordering::Relaxed) == 2 {
+        return Err(io::Error::new(
+            io::ErrorKind::PermissionDenied,
+            "ICMP unprivileged probe previously failed",
+        ));
+    }
+    use std::os::fd::FromRawFd;
+    let raw = unsafe {
+        libc::socket(
+            libc::AF_INET,
+            libc::SOCK_DGRAM | libc::SOCK_NONBLOCK | libc::SOCK_CLOEXEC,
+            libc::IPPROTO_ICMP,
+        )
+    };
+    if raw < 0 {
+        let err = io::Error::last_os_error();
+        if matches!(err.raw_os_error(), Some(libc::EACCES) | Some(libc::EPERM)) {
+            if ICMP_PROBE.swap(2, Ordering::Relaxed) != 2 {
+                tracing::warn!(
+                    "SLIRP: unprivileged ICMP unavailable on this host \
+                     (sysctl net.ipv4.ping_group_range likely restricts \
+                     it); ICMP echo from guests will be dropped."
+                );
+            }
+        }
+        return Err(err);
+    }
+    ICMP_PROBE.store(1, Ordering::Relaxed);
+    Ok(unsafe { std::net::UdpSocket::from_raw_fd(raw) })
+}
+```
+
+- [ ] **Step 3: cargo check + tests.** Behavior on Linux/macOS where
+  the syscall is permitted is unchanged. On a host with restrictive
+  sysctl, the warning fires once and ICMP frames are silently dropped
+  (the same behavior as before Phase 1 — the BROKEN_ON_PURPOSE pin
+  becomes the steady state for that environment).
+
+- [ ] **Step 4: Commit.**
+
+```bash
+git add src/network/slirp.rs
+git commit -m "feat(slirp): warn-once + fallback when unprivileged ICMP forbidden"
+```
+
+---
+
+## Workstream 1B — Test + bench
+
+### Task 1.5: Flip `icmp_echo_silently_dropped` BROKEN_ON_PURPOSE pin
+
+**Files:**
+- Modify: `tests/network_baseline.rs`
+
+- [ ] **Step 1: Find the test** (introduced in Phase 0 task 0A.9).
+ Rename it to `icmp_echo_returns_reply` and rewrite the body to + assert a reply IS observed: + +```rust +/// Phase 1 flipped the BROKEN_ON_PURPOSE assertion: the guest now +/// receives an ICMP echo reply via the host's unprivileged +/// `IPPROTO_ICMP SOCK_DGRAM` socket. +#[test] +fn icmp_echo_returns_reply() { + use smoltcp::wire::{Icmpv4Packet, Icmpv4Repr}; + + let icmp_repr = Icmpv4Repr::EchoRequest { + ident: 0xbeef, + seq_no: 1, + data: b"ping", + }; + let ip_repr = Ipv4Repr { + src_addr: SLIRP_GUEST_IP, + // 127.0.0.1 — guaranteed to respond on most hosts via the host + // kernel's loopback; macOS and Linux both reply to ICMP echo. + dst_addr: Ipv4Address::new(127, 0, 0, 1), + next_header: IpProtocol::Icmp, + payload_len: icmp_repr.buffer_len(), + hop_limit: 64, + }; + let eth_repr = EthernetRepr { + src_addr: EthernetAddress(GUEST_MAC), + dst_addr: EthernetAddress(GATEWAY_MAC), + ethertype: EthernetProtocol::Ipv4, + }; + let total = ETH_HDR_LEN + ip_repr.buffer_len() + icmp_repr.buffer_len(); + let mut buf = vec![0u8; total]; + let mut eth = EthernetFrame::new_unchecked(&mut buf[..]); + eth_repr.emit(&mut eth); + let mut ip = Ipv4Packet::new_unchecked(&mut buf[ETH_HDR_LEN..]); + ip_repr.emit(&mut ip, &Default::default()); + let mut icmp = Icmpv4Packet::new_unchecked( + &mut buf[ETH_HDR_LEN + ip_repr.buffer_len()..], + ); + icmp_repr.emit(&mut icmp, &Default::default()); + + let mut stack = match SlirpBackend::new() { + Ok(s) => s, + Err(_) => { + eprintln!("skip: SlirpBackend::new failed"); + return; + } + }; + if stack.process_guest_frame(&buf).is_err() { + eprintln!("skip: process_guest_frame failed (likely no ICMP support)"); + return; + } + + // Poll up to 20 × 50ms for the reply. 
+    let mut saw_reply = false;
+    for _ in 0..20 {
+        for f in drain_n(&mut stack, 1) {
+            let Some(eth) = EthernetFrame::new_checked(f.as_slice()).ok() else { continue; };
+            if eth.ethertype() != EthernetProtocol::Ipv4 { continue; }
+            let Some(ip) = Ipv4Packet::new_checked(eth.payload()).ok() else { continue; };
+            if ip.next_header() == IpProtocol::Icmp && ip.dst_addr() == SLIRP_GUEST_IP {
+                saw_reply = true;
+                break;
+            }
+        }
+        if saw_reply { break; }
+        std::thread::sleep(std::time::Duration::from_millis(50));
+    }
+
+    if !saw_reply {
+        // Sysctl may forbid unprivileged ICMP on some hosts. Skip
+        // rather than fail — the warn-once log explains why.
+        eprintln!(
+            "skip: no ICMP reply received within 1s; \
+             sysctl net.ipv4.ping_group_range may forbid unprivileged ICMP"
+        );
+    }
+}
+```
+
+- [ ] **Step 2: Run.**
+
+```bash
+cargo test --test network_baseline icmp_echo_returns_reply
+```
+
+Expected: PASS (or SKIP with the sysctl message on a restrictive host).
+
+- [ ] **Step 3: Run the full suite** to confirm no regression:
+
+```bash
+cargo test --test network_baseline
+```
+
+Expected: 14 tests pass (the renamed test is one of them).
+
+- [ ] **Step 4: Commit.**
+
+```bash
+git add tests/network_baseline.rs
+git commit -m "test(network): flip ICMP pin — assert echo reply (was BROKEN_ON_PURPOSE)"
+```
+
+---
+
+### Task 1.6: Populate `icmp_rr_latency_us_p50` in `voidbox-network-bench`
+
+**Files:**
+- Modify: `src/bin/voidbox-network-bench/main.rs`
+
+- [ ] **Step 1: Add `measure_icmp_rr_latency`** alongside the existing
+  measurement functions. Use busybox `ping` (which is in the test
+  initramfs) inside the guest:
+
+```bash
+ping -c <count> -W 1 -i 0.05 8.8.8.8 \
+  | awk '/time=/ { sub(/^.*time=/, ""); sub(/ ms.*/, ""); print }'
+```
+
+Each line of output is one RTT in milliseconds; multiply by 1000 for
+microseconds, collect, percentile.
+
+The guest exec returns the joined output via the existing
+`ControlChannel::exec` API.
Parse the lines, build a `Vec<f64>`,
+call `percentile(&mut samples, 0.5)`.
+
+If the guest's ICMP echo fails (sysctl, host kernel, etc.), `ping`
+returns a non-zero exit. Treat that as "leave the metric `None`" with
+a `WARN` log, same fallback shape as the other measurements.
+
+- [ ] **Step 2: Wire into `main`** — call after the existing TCP/UDP
+  measurements; populate `report.icmp_rr_latency_us_p50`.
+
+- [ ] **Step 3: Smoke run.**
+
+```bash
+VOID_BOX_KERNEL=$PWD/target/vmlinux-slim-x86_64 \
+VOID_BOX_INITRAMFS=/tmp/void-box-test-rootfs.cpio.gz \
+  cargo run --release --bin voidbox-network-bench -- --iterations 1 \
+  | python3 -m json.tool
+```
+
+`icmp_rr_latency_us_p50` should be a non-null number now.
+
+- [ ] **Step 4: Commit.**
+
+```bash
+git add src/bin/voidbox-network-bench/main.rs
+git commit -m "bench(network): populate ICMP RR latency p50"
+```
+
+---
+
+## Workstream 1C — Validation
+
+### Task 1.7: Validation gate + summary commit
+
+**Files:** none (gate only)
+
+- [ ] **Step 1: Format + clippy.**
+
+```bash
+cargo fmt --all -- --check
+cargo clippy --workspace --all-targets --all-features -- -D warnings
+```
+
+- [ ] **Step 2: Workspace tests.**
+
+```bash
+cargo test --workspace --all-features
+cargo test --doc --workspace --all-features
+```
+
+- [ ] **Step 3: Network baseline.**
+
+```bash
+cargo test --test network_baseline
+```
+
+Expected: 14 tests pass (previously-broken `icmp_echo_silently_dropped`
+is now `icmp_echo_returns_reply` and asserts a reply).
+
+- [ ] **Step 4: Microbenches no-regression.**
+
+```bash
+cargo bench --bench network
+```
+
+Compared to the Phase 0 baseline.
+ +- [ ] **Step 5: VM suites that touch networking** (Linux/KVM): + +```bash +export VOID_BOX_KERNEL=$PWD/target/vmlinux-slim-x86_64 +export VOID_BOX_INITRAMFS=/tmp/void-box-test-rootfs.cpio.gz +cargo test --test conformance -- --ignored --test-threads=1 +cargo test --test snapshot_integration -- --ignored --test-threads=1 +cargo test --test e2e_skill_pipeline -- --ignored --test-threads=1 +cargo test --test e2e_mount -- --ignored --test-threads=1 +``` + +- [ ] **Step 6: New ICMP RR metric** captured: + +```bash +cargo run --release --bin voidbox-network-bench -- --iterations 3 \ + --output /tmp/baseline-network-phase1.json +cat /tmp/baseline-network-phase1.json +``` + +`icmp_rr_latency_us_p50` should be a non-null number; the other +metrics should be statistically equivalent to Phase 0's baseline. + +- [ ] **Step 7: aarch64 cross-check** if available. + +- [ ] **Step 8:** No commit needed for validation alone. PR opens + later when the user is ready (across multiple phases on the same + branch). + +--- + +## Risks + +- **Sysctl-restricted hosts.** If `net.ipv4.ping_group_range` is `1 0` + (default on some hardened environments), `socket()` returns `EACCES` + and we silently degrade. The warn-once log + the test's skip path + handle this. Document in the PR description. +- **macOS portability.** macOS's `IPPROTO_ICMP SOCK_DGRAM` works + unconditionally, but the rest of `slirp.rs` is already + `#[cfg(target_os = "linux")]`-gated, so this isn't a practical + concern in Phase 1 — macOS uses VZ NAT, not SLIRP. +- **ICMP id collision.** Two guest processes pinging different hosts + with the same id won't collide because the key is + `(guest_id, dst_ip)`. Two guest processes pinging the *same* host + with the same id will share an entry — which is correct: replies + belong to whichever guest sent the matching seq. 
+ +## File impact + +| File | Change | Approximate LOC | +|---|---|---| +| `src/network/slirp.rs` | `IcmpEchoEntry`, `handle_icmp_frame`, `relay_icmp_echo`, sysctl fallback | +180 | +| `tests/network_baseline.rs` | flip `icmp_echo_silently_dropped` → `icmp_echo_returns_reply` | ~+15/-15 | +| `src/bin/voidbox-network-bench/main.rs` | `measure_icmp_rr_latency` | +50 | +| **Total** | | **~+230** (within the spec's ~150-LOC estimate plus test/bench wiring) | diff --git a/docs/superpowers/plans/2026-04-27-smoltcp-passt-port-phase2.md b/docs/superpowers/plans/2026-04-27-smoltcp-passt-port-phase2.md new file mode 100644 index 00000000..bb0512a3 --- /dev/null +++ b/docs/superpowers/plans/2026-04-27-smoltcp-passt-port-phase2.md @@ -0,0 +1,495 @@ +# Phase 2 Implementation Plan: Generalize UDP (per-flow connected sockets) + +> **For agentic workers:** REQUIRED SUB-SKILL: superpowers:subagent-driven-development. +> Steps use checkbox (`- [ ]`) syntax for tracking. +> +> **Mandatory skills for every Rust-touching task:** +> `rust-style`, `rustdoc`, `rust-analyzer-ssr`, +> `superpowers:test-driven-development`, +> `superpowers:verification-before-completion`. Use LSP for navigation. + +**Spec:** [`2026-04-27-smoltcp-passt-port.md`](2026-04-27-smoltcp-passt-port.md) +**Continues from Phase 1:** [`2026-04-27-smoltcp-passt-port-phase1.md`](2026-04-27-smoltcp-passt-port-phase1.md) + +**Goal:** Replace the port-53-only `handle_dns_frame` fast-path with a +general per-flow UDP NAT, mirroring passt's `udp.c::udp_flow_from_tap` +design. Keep the existing DNS cache as a fast-path within the +generalized handler (the cache is actually better than what passt has, +per the spec). Flip the `udp_non_dns_silently_dropped` BROKEN_ON_PURPOSE +pin to verify arbitrary UDP works. + +**Architecture:** New `UdpFlowEntry` per `(guest_src_port, dst_ip, dst_port)`. +Each entry owns one connected `UdpSocket`. 
`handle_udp_frame` routes: +DNS (`SLIRP_DNS_IP:53`) keeps the existing cached/forward path; +everything else creates/reuses a flow and `send_to`s. `relay_udp_flows` +polls each socket for replies and emits UDP frames back to the guest. +Idle timeout reaps inactive flows. + +**Tech Stack:** Rust 1.88, `std::net::UdpSocket` (already used for DNS), +`smoltcp::wire::UdpRepr`/`UdpPacket` (already imported), no new deps. + +**Branch:** `smoltcp-passt-port-phase0` (continuing on the same branch +through Phase 0 + 1 + 2 — user instruction). + +--- + +## Task structure + +7 tasks across two workstreams. + +| ID | Workstream | Scope | +|---|---|---| +| 2.1 | impl | Add `UdpFlowEntry` + key + `icmp_echo`-style HashMap field | +| 2.2 | impl | Generalize dispatch: route non-53 UDP to `handle_udp_frame` | +| 2.3 | impl | Implement `relay_udp_flows` host→guest reply path | +| 2.4 | impl | Idle timeout + flow reaping (60s) | +| 2.5 | test | Flip `udp_non_dns_silently_dropped` BROKEN_ON_PURPOSE pin | +| 2.6 | bench | Replace `measure_dns_qps`'s `nc -w1`-bottlenecked impl with a real UDP socket | +| 2.7 | gate | Phase 2 validation gate | + +--- + +## Workstream 2A — Implementation (`src/network/slirp.rs`) + +### Task 2.1: `UdpFlowEntry` + per-flow socket helper + +**Files:** +- Modify: `src/network/slirp.rs` + +- [ ] **Step 1: Define key + entry types** (mirror `IcmpEchoKey`/`IcmpEchoEntry` from Phase 1): + +```rust +#[derive(Clone, Copy, Debug, PartialEq, Eq, Hash)] +struct UdpFlowKey { + guest_src_port: u16, + dst_ip: Ipv4Address, + dst_port: u16, +} + +struct UdpFlowEntry { + /// Connected `UdpSocket`. The host kernel handles source-port + /// preservation and reply demux; we just `send_to` and + /// `recv_from`. Set non-blocking. 
+    sock: std::net::UdpSocket,
+    last_activity: Instant,
+}
+```
+
+- [ ] **Step 2: Add helper `open_udp_flow_socket(dst: SocketAddr) -> io::Result<std::net::UdpSocket>`**
+
+```rust
+fn open_udp_flow_socket(dst: std::net::SocketAddr) -> io::Result<std::net::UdpSocket> {
+    let sock = std::net::UdpSocket::bind("0.0.0.0:0")?;
+    sock.set_nonblocking(true)?;
+    sock.connect(dst)?;
+    Ok(sock)
+}
+```
+
+`connect()` on a `UdpSocket` doesn't open a TCP-style connection — it
+sets the default destination and filters incoming datagrams to that
+peer only. This is what passt's per-flow design relies on.
+
+- [ ] **Step 3: Add `udp_flows: HashMap<UdpFlowKey, UdpFlowEntry>` field on `SlirpBackend`.**
+
+Initialize in `with_security` (the canonical constructor) — `new()` and `Default::default()` delegate to it.
+
+- [ ] **Step 4: cargo check** — should compile clean. No behavior wired yet.
+
+- [ ] **Step 5: Commit.**
+
+```bash
+git add src/network/slirp.rs
+git commit -m "feat(slirp): add UdpFlowEntry + per-flow connected socket helper"
+```
+
+---
+
+### Task 2.2: Dispatch non-DNS UDP to `handle_udp_frame`
+
+**Files:**
+- Modify: `src/network/slirp.rs`
+
+- [ ] **Step 1: Update `handle_ipv4_frame` to route UDP.** Currently
+  (around line 642):
+
+```rust
+if dst_ip == SLIRP_DNS_IP && protocol == IpProtocol::Udp {
+    return self.handle_dns_frame(&ipv4);
+}
+```
+
+Change to:
+
+```rust
+if protocol == IpProtocol::Udp {
+    if dst_ip == SLIRP_DNS_IP {
+        return self.handle_dns_frame(&ipv4);
+    }
+    return self.handle_udp_frame(&ipv4);
+}
+```
+
+DNS keeps its dedicated handler (cache + upstream forward). Everything else flows through the new path.
+ +- [ ] **Step 2: Add `handle_udp_frame`** as a sibling of `handle_dns_frame`: + +```rust +fn handle_udp_frame(&mut self, ipv4: &Ipv4Packet<&[u8]>) -> Result<()> { + let udp = match UdpPacket::new_checked(ipv4.payload()) { + Ok(u) => u, + Err(_) => return Ok(()), + }; + let payload = udp.payload().to_vec(); // own; mutable borrow of self below + let key = UdpFlowKey { + guest_src_port: udp.src_port(), + dst_ip: ipv4.dst_addr(), + dst_port: udp.dst_port(), + }; + + // SLIRP gateway translation: 10.0.2.2 → 127.0.0.1 (same trick as TCP). + let dst_ip_for_socket = if key.dst_ip == SLIRP_GATEWAY_IP { + std::net::Ipv4Addr::LOCALHOST + } else { + std::net::Ipv4Addr::from(key.dst_ip.0) + }; + let dst = std::net::SocketAddr::from((dst_ip_for_socket, key.dst_port)); + + let entry = match self.udp_flows.entry(key) { + std::collections::hash_map::Entry::Occupied(o) => o.into_mut(), + std::collections::hash_map::Entry::Vacant(v) => { + let sock = match open_udp_flow_socket(dst) { + Ok(s) => s, + Err(e) => { + trace!("SLIRP UDP: open flow socket failed: {e}"); + return Ok(()); + } + }; + v.insert(UdpFlowEntry { sock, last_activity: Instant::now() }) + } + }; + entry.last_activity = Instant::now(); + + if let Err(e) = entry.sock.send(&payload) { + trace!("SLIRP UDP: send failed: {e}"); + } + Ok(()) +} +``` + +- [ ] **Step 3: cargo check + tests.** All 14 baseline tests still pass. + `udp_non_dns_silently_dropped` continues to pass (no reply path yet). 
+
+- [ ] **Step 4: Commit.**
+
+```bash
+git add src/network/slirp.rs
+git commit -m "feat(slirp): forward non-DNS UDP via per-flow connected sockets"
+```
+
+---
+
+### Task 2.3: `relay_udp_flows` host→guest reply path
+
+**Files:**
+- Modify: `src/network/slirp.rs`
+
+- [ ] **Step 1: Add `relay_udp_flows`** alongside `relay_icmp_echo`:
+
+```rust
+fn relay_udp_flows(&mut self) {
+    let now = Instant::now();
+    let keys: Vec<UdpFlowKey> = self.udp_flows.keys().copied().collect();
+    for key in keys {
+        let frame = {
+            let Some(entry) = self.udp_flows.get_mut(&key) else { continue; };
+            let mut buf = [0u8; 1500];
+            match entry.sock.recv(&mut buf) {
+                Ok(n) => {
+                    entry.last_activity = now;
+                    Self::build_udp_reply_to_guest(
+                        key.dst_ip, key.dst_port, key.guest_src_port, &buf[..n],
+                    )
+                }
+                Err(ref e) if e.kind() == io::ErrorKind::WouldBlock => continue,
+                Err(_) => continue,
+            }
+        };
+        if let Some(f) = frame {
+            self.inject_to_guest.push(f);
+        }
+    }
+}
+
+fn build_udp_reply_to_guest(
+    src_ip: Ipv4Address,
+    src_port: u16,
+    dst_port: u16,
+    payload: &[u8],
+) -> Option<Vec<u8>> {
+    let udp_repr = UdpRepr { src_port, dst_port };
+    let ip_repr = Ipv4Repr {
+        src_addr: src_ip,
+        dst_addr: SLIRP_GUEST_IP,
+        next_header: IpProtocol::Udp,
+        payload_len: 8 + payload.len(),
+        hop_limit: 64,
+    };
+    let eth_repr = EthernetRepr {
+        src_addr: EthernetAddress(GATEWAY_MAC),
+        dst_addr: EthernetAddress(GUEST_MAC),
+        ethertype: EthernetProtocol::Ipv4,
+    };
+    let total = 14 + ip_repr.buffer_len() + 8 + payload.len();
+    let mut buf = vec![0u8; total];
+    let mut eth = EthernetFrame::new_unchecked(&mut buf[..]);
+    eth_repr.emit(&mut eth);
+    let mut ip = Ipv4Packet::new_unchecked(&mut buf[14..]);
+    ip_repr.emit(&mut ip, &Default::default());
+    let mut udp = UdpPacket::new_unchecked(&mut buf[14 + ip_repr.buffer_len()..]);
+    udp_repr.emit(
+        &mut udp,
+        &IpAddress::Ipv4(src_ip),
+        &IpAddress::Ipv4(SLIRP_GUEST_IP),
+        payload.len(),
+        |b| b.copy_from_slice(payload),
+        &Default::default(),
+    );
+    Some(buf)
+}
+```
+
+Note `payload.len()` (NOT `8 + payload.len()`) for `udp_repr.emit`'s
+4th arg — matches the bug we fixed in 0A.7.
+
+- [ ] **Step 2: Wire into `drain_to_guest`.** Find the existing chain:
+  `self.relay_tcp_nat_data();` → `self.relay_icmp_echo();` and append
+  `self.relay_udp_flows();` after the ICMP relay.
+
+- [ ] **Step 3: cargo check + tests.** Note: `udp_non_dns_silently_dropped`
+  is now expected to FAIL — UDP replies actually flow. Don't flip the
+  test in this task (Task 2.5 owns that). Run with `--no-fail-fast` to
+  confirm only that one test fails.
+
+- [ ] **Step 4: Commit.**
+
+```bash
+git add src/network/slirp.rs
+git commit -m "feat(slirp): relay UDP flow replies back to guest"
+```
+
+---
+
+### Task 2.4: UDP idle timeout + flow reaping
+
+**Files:**
+- Modify: `src/network/slirp.rs`
+
+- [ ] **Step 1: Add idle reap to `relay_udp_flows`.** At the start (or
+  end) of the function, walk entries and remove those past
+  `UDP_IDLE_TIMEOUT`:
+
+```rust
+const UDP_IDLE_TIMEOUT: Duration = Duration::from_secs(60);
+
+// At top of relay_udp_flows:
+let stale: Vec<UdpFlowKey> = self
+    .udp_flows
+    .iter()
+    .filter(|(_, e)| now.duration_since(e.last_activity) > UDP_IDLE_TIMEOUT)
+    .map(|(k, _)| *k)
+    .collect();
+for k in stale {
+    self.udp_flows.remove(&k);
+}
+```
+
+passt uses `/proc/sys/net/netfilter/nf_conntrack_udp_timeout` for this; we hardcode 60s (the kernel default). Don't read from /proc.
+
+- [ ] **Step 2: cargo check + tests.** No new test for the timeout
+  (the test would need to wait 60s; integration cost not worth it).
+
+- [ ] **Step 3: Commit.**
+
+```bash
+git add src/network/slirp.rs
+git commit -m "feat(slirp): UDP flow idle reap (60s)"
+```
+
+---
+
+## Workstream 2B — Test + bench
+
+### Task 2.5: Flip `udp_non_dns_silently_dropped` BROKEN_ON_PURPOSE pin
+
+**Files:**
+- Modify: `tests/network_baseline.rs`
+
+- [ ] **Step 1: Find the test** (introduced in 0A.8).
Rename to + `udp_non_dns_round_trips` and rewrite to assert the host receives + the datagram, then sends a reply that the guest receives. + +```rust +/// Phase 2 flipped the BROKEN_ON_PURPOSE assertion: arbitrary UDP +/// (any destination port, not just 53) now round-trips through the +/// per-flow connected-socket NAT. +#[test] +fn udp_non_dns_round_trips() { + let host_sock = UdpSocket::bind("127.0.0.1:0").unwrap(); + let host_port = host_sock.local_addr().unwrap().port(); + host_sock + .set_read_timeout(Some(std::time::Duration::from_millis(500))) + .unwrap(); + + let mut stack = SlirpBackend::new().unwrap(); + + // Guest sends "hello" to gateway:host_port (which SLIRP rewrites + // to 127.0.0.1:host_port). + stack + .process_guest_frame(&build_udp_frame( + SLIRP_GATEWAY_IP, + GUEST_EPHEMERAL_PORT, + host_port, + b"hello", + )) + .unwrap(); + let _ = drain_n(&mut stack, 4); + + // Host receives the datagram. + let mut buf = [0u8; 32]; + let (n, peer) = host_sock.recv_from(&mut buf).expect("host receives guest UDP"); + assert_eq!(&buf[..n], b"hello"); + + // Host echoes back. + host_sock.send_to(&buf[..n], peer).unwrap(); + + // Drain — guest should see the reply on its source port. 
+ let mut saw_reply = false; + for _ in 0..20 { + for f in drain_n(&mut stack, 1) { + let Some(eth) = EthernetFrame::new_checked(f.as_slice()).ok() else { continue; }; + if eth.ethertype() != EthernetProtocol::Ipv4 { continue; } + let Some(ip) = Ipv4Packet::new_checked(eth.payload()).ok() else { continue; }; + if ip.next_header() != IpProtocol::Udp { continue; } + let Some(udp_pkt) = UdpPacket::new_checked(ip.payload()).ok() else { continue; }; + if udp_pkt.dst_port() == GUEST_EPHEMERAL_PORT && udp_pkt.payload() == b"hello" { + saw_reply = true; + break; + } + } + if saw_reply { break; } + std::thread::sleep(std::time::Duration::from_millis(50)); + } + assert!(saw_reply, "guest must receive UDP reply via per-flow NAT"); +} +``` + +- [ ] **Step 2: Run.** + +```bash +cargo test --test network_baseline udp_ +cargo test --test network_baseline # confirm 14 pass total +``` + +- [ ] **Step 3: Commit.** + +```bash +git add tests/network_baseline.rs +git commit -m "test(network): flip UDP pin — assert non-DNS round-trips (was BROKEN_ON_PURPOSE)" +``` + +--- + +### Task 2.6: Replace `measure_dns_qps` busybox-`nc`-bottlenecked impl + +**Files:** +- Modify: `src/bin/voidbox-network-bench/main.rs` + +- [ ] **Step 1: Read the current `measure_dns_qps`** to understand the + existing flow. It currently runs busybox `nc -u -w1` per query in the + guest, which caps qps at ~1/s (0.5 qps observed) regardless of SLIRP + speed. With Phase 2's general UDP, we can do something faster. + +- [ ] **Step 2: Replace the inner shell loop with a tighter pattern** + using busybox `dd`-style raw UDP via `/dev/udp/`. busybox `nc` opens + one connection per invocation and sleeps for the timeout. 
A loop in
+shell using `date +%s` to bound iterations:
+
+```sh
+end=$(($(date +%s) + 5))
+count=0
+while [ "$(date +%s)" -lt "$end" ]; do
+  printf '\x12\x34\x01\x00\x00\x01\x00\x00\x00\x00\x00\x00\x07example\x03com\x00\x00\x01\x00\x01' \
+    | nc -u -w0 -q0 10.0.2.3 53 >/dev/null 2>&1 && count=$((count + 1))
+done
+echo "qps=$((count / 5))"
+```
+
+`-w0` (no idle wait) and `-q0` (close immediately on EOF) prevent the
+1s-per-query stall. busybox `nc` may not honor both; if so, accept
+that DNS qps stays approximate and remove `measure_dns_qps` entirely
+(replacing it with a host-driven measurement that sends UDP through
+SLIRP from outside the guest — a smaller, cleaner change).
+
+If neither works reliably: leave the metric `null` with a `WARN`.
+The Phase 2 win is correctness (DNS isn't blocked anymore), not
+this specific number.
+
+- [ ] **Step 3: Smoke run** with `--iterations 1` and confirm the qps
+  metric is non-null and >> 0.5.
+
+- [ ] **Step 4: Commit.**
+
+```bash
+git add src/bin/voidbox-network-bench/main.rs
+git commit -m "bench(network): use tighter busybox-nc loop for DNS qps"
+```
+
+If Step 2 doesn't yield a reliable improvement, commit a smaller
+change documenting the limit and move on.
+
+---
+
+## Workstream 2C — Validation
+
+### Task 2.7: Validation gate
+
+**Files:** none (gate only)
+
+- [ ] fmt + clippy clean
+- [ ] `cargo test --workspace` clean (modulo the pre-existing
+  guest-agent flake we tracked earlier)
+- [ ] `cargo test --test network_baseline` 14 pass (the renamed test
+  is one of them)
+- [ ] `cargo bench --bench network` no regression
+- [ ] `cargo test --test snapshot_integration -- --ignored` 8/8 pass
+- [ ] Wall-clock smoke run produces non-null `udp_dns_qps` >= Phase 0
+  baseline (or stays `null` with documented WARN if Step 2.6 didn't
+  improve it)
+
+No PR opened — paused per user instruction. Branch will keep
+accumulating phases.
+ +--- + +## File impact + +| File | Approximate LOC | +|---|---| +| `src/network/slirp.rs` | +200 | +| `tests/network_baseline.rs` | +30 / -25 (renamed test) | +| `src/bin/voidbox-network-bench/main.rs` | +30 / -10 | +| **Total** | **~+225** | + +## Risks + +- **Per-flow socket creation can leak fds** if the idle timeout is + too long under burst traffic. 60s is generous; consider tightening + to 30s if memory pressure becomes an issue. Out of scope for this + phase; default 60s matches kernel conntrack. +- **No port-forwarding configurability yet.** Phase 2 only handles + outbound UDP from guest. Inbound UDP forwarding (host → guest port + X) is part of Phase 5 (stateless NAT translation refactor). +- **DNS cache stays.** Some users may expect Phase 2 to invalidate + it; we don't. Cache only fires on `dst == 10.0.2.3:53`; everything + else takes the per-flow path. diff --git a/docs/superpowers/plans/2026-04-27-smoltcp-passt-port-phase3.md b/docs/superpowers/plans/2026-04-27-smoltcp-passt-port-phase3.md new file mode 100644 index 00000000..04c6a62e --- /dev/null +++ b/docs/superpowers/plans/2026-04-27-smoltcp-passt-port-phase3.md @@ -0,0 +1,544 @@ +# Phase 3 Implementation Plan: TCP Relay Rewrite (MSG_PEEK + sequence mirroring) + +> **For agentic workers:** REQUIRED SUB-SKILL: superpowers:subagent-driven-development. +> Steps use checkbox (`- [ ]`) syntax for tracking. +> +> **Mandatory skills for every Rust-touching task:** +> `rust-style`, `rustdoc`, `rust-analyzer-ssr`, +> `superpowers:test-driven-development`, +> `superpowers:verification-before-completion`. Use LSP for navigation. +> +> **THIS IS THE HIGH-RISK PHASE.** The TCP relay (~625 LOC at +> `src/network/slirp.rs:82–1048`) is the most fragile path in the +> project. The `tcp_to_host_buffer_drops_at_256kb` test pin is the +> headline assertion to flip. `snapshot_integration` and the +> conformance suite are the safety net — every task ends with both +> green or it doesn't land. 
+
+**Spec:** [`2026-04-27-smoltcp-passt-port.md`](2026-04-27-smoltcp-passt-port.md)
+**Continues from Phase 2:** [`2026-04-27-smoltcp-passt-port-phase2.md`](2026-04-27-smoltcp-passt-port-phase2.md)
+
+**Goal:** Replace the hand-rolled TCP relay's `to_guest: Vec<u8>` and
+`to_host: Vec<u8>` user-space buffers with passt-style sequence
+mirroring (host kernel's TCP socket buffer IS the buffer). Eliminate
+the 256 KB `to_host` cliff and drop 100s of LOC of fragile state.
+
+**Architecture:** For each direction:
+
+- **host → guest** (host writes, we relay to guest): instead of
+  `read()` into `to_guest: Vec<u8>` then drain, use
+  `recv(MSG_PEEK)` to inspect what's in the kernel socket without
+  consuming it. Send the un-acknowledged portion as TCP segments to
+  the guest. Track `bytes_in_flight = our_seq - last_acked_seq`.
+  When the guest ACKs, `recv()` (no MSG_PEEK) the ACK'd bytes to
+  advance the kernel's read pointer. The kernel's socket buffer
+  absorbs backpressure naturally.
+
+- **guest → host** (guest writes, we relay to host): on guest
+  segment, attempt non-blocking `send()` on the host socket. If it
+  succeeds: ACK the guest. If `WouldBlock` (kernel send buffer full):
+  **don't** ACK; let the guest retransmit (TCP's natural backpressure).
+  Drop the 256 KB `to_host: Vec<u8>` user-space buffer entirely.
+
+**Tech Stack:** Rust 1.88, `std::net::TcpStream` (already in use).
+`libc::recv` with `MSG_PEEK` flag for the host→guest direction
+(std doesn't expose MSG_PEEK on `TcpStream`).
+
+**Branch:** `smoltcp-passt-port-phase0` (continuing on the same branch
+through all phases — user instruction).
+
+## Non-negotiable invariants
+
+These are MUSTs across every task in this phase. A task that violates
+any of them is rejected at code review, regardless of test status.
+
+1. **Full observability is preserved.** The whole reason we lift
+   passt's *patterns* instead of running passt as a process is to
+   keep our debugging surface.
Every task MUST: + - Keep all existing `tracing::trace!`/`debug!`/`warn!`/`error!` + calls in the TCP relay path. If a removed code path's trace + lines no longer fire because the path is gone, that's fine. + But a NEW path missing equivalent tracing is a bug. + - Add new `tracing` events for the new state — at minimum: + - `trace!` on each peek that yields N bytes, + - `trace!` on each ACK-driven consume, + - `debug!` on connection close with `bytes_in_flight` snapshot + (helps post-mortem the unusual-close case), + - `warn!` on unexpected protocol errors (RST during ESTABLISHED, + seq number going backwards, etc.). + - Stay all-Rust, no FFI boundary, no opaque process. `libc::recv` + for MSG_PEEK is fine — that's a syscall, not an opaque process; + it doesn't cross a debugger boundary. +2. **`cargo test`-driveable.** Every behavior change is exercised by + a test in `tests/network_baseline.rs` that drives `SlirpBackend` + directly (no VM). The pin tests are the contract. +3. **`tracing-subscriber` pipeline integrity.** Don't introduce + anything that bypasses the existing `tracing` filter chain + (`VOIDBOX_LOG_LEVEL` / `RUST_LOG` env vars, `LogConfig` + structured logger). If a new diagnostic needs a backchannel, + route it through `tracing` events with structured fields. +4. **Profiler keeps working.** No syscalls in tight loops without an + observable wrapper (e.g. don't call `libc::recv` from a hot path + without a `tracing::trace!` annotation that flame-graph-able + tools can attribute the time to). + +--- + +## Task structure + +8 tasks across three workstreams. 
+ +| ID | Workstream | Scope | +|---|---|---| +| 3.1 | impl | Add sequence-mirroring fields to `TcpNatEntry`; default-init alongside existing buffers | +| 3.2 | impl | Add `recv_peek` helper using `libc::recv(MSG_PEEK)` | +| 3.3 | impl | Replace host→guest path: drain via peek, send `bytes_available - bytes_in_flight` | +| 3.4 | impl | Replace guest-ACK handling: consume ACK'd bytes from kernel, send next chunk | +| 3.5 | impl | Drop guest→host `to_host` buffer; rely on kernel send buffer + don't-ACK-on-EAGAIN backpressure | +| 3.6 | impl | Drop `to_guest`, `MAX_TO_HOST_BUFFER`, dead helpers; cleanup | +| 3.7 | test | Flip `tcp_to_host_buffer_drops_at_256kb` BROKEN_ON_PURPOSE pin | +| 3.8 | gate | Phase 3 validation gate (full conformance + snapshot suites + bench) | + +--- + +## Workstream 3A — Add scaffolding (no behavior change) + +### Task 3.1: Sequence-mirroring fields on `TcpNatEntry` + +**Files:** +- Modify: `src/network/slirp.rs` + +- [ ] **Step 1: Add fields** to `TcpNatEntry` (around line 107 — LSP `documentSymbol` will surface). Add at the end of the struct: + +```rust +/// passt-style sequence mirroring: bytes the kernel has buffered +/// past our last consumed point but not yet sent to guest. With +/// MSG_PEEK, we can inspect the kernel's recv queue without +/// consuming, then `recv` (no peek) the ACK'd portion later. +/// +/// `bytes_in_flight = our_seq - last_acked_seq` — bytes sent to +/// guest but not yet ACK'd. +#[allow(dead_code)] // consumed in 3.3 +bytes_in_flight: u32, +``` + +`our_seq` and `guest_ack` already exist on the struct. Reuse them; don't introduce new aliases. + +- [ ] **Step 2: Initialize** in every construction site of `TcpNatEntry` (LSP `findReferences` on the struct will list them — likely 1–2 sites in `handle_tcp_frame`'s SYN branch). Add `bytes_in_flight: 0,` to each. 
+
+- [ ] **Step 3: Verify.**
+
+```bash
+cargo check
+cargo test --test network_baseline # 14 tests still pass
+cargo fmt --all -- --check
+cargo clippy --workspace --all-targets --all-features -- -D warnings
+git add src/network/slirp.rs
+git commit -m "refactor(slirp): add bytes_in_flight to TcpNatEntry (no behavior change)"
+```
+
+---
+
+### Task 3.2: `recv_peek` helper
+
+**Files:**
+- Modify: `src/network/slirp.rs`
+
+- [ ] **Step 1: Add a module-scope helper.**
+
+```rust
+/// Non-blocking `recv(MSG_PEEK)` on a `TcpStream`, returning bytes
+/// read without consuming them from the kernel socket buffer.
+///
+/// `std::net::TcpStream` does not expose `MSG_PEEK`; we go through
+/// `libc::recv` directly.
+fn recv_peek(stream: &TcpStream, buf: &mut [u8]) -> io::Result<usize> {
+    use std::os::fd::AsRawFd;
+    // SAFETY: `stream` outlives the syscall; `buf` is uniquely
+    // borrowed and `len` matches.
+    let n = unsafe {
+        libc::recv(
+            stream.as_raw_fd(),
+            buf.as_mut_ptr() as *mut libc::c_void,
+            buf.len(),
+            libc::MSG_PEEK | libc::MSG_DONTWAIT,
+        )
+    };
+    if n < 0 {
+        return Err(io::Error::last_os_error());
+    }
+    Ok(n as usize)
+}
+```
+
+`std::os::fd::AsRawFd` is already in the module-scope use block (added in Phase 1.1). `MSG_DONTWAIT` ensures non-blocking even if the stream's `set_nonblocking` flag is dropped somehow.
+
+- [ ] **Step 2: Verify** the helper compiles. No callers yet:
+
+```bash
+cargo check
+cargo clippy --workspace --all-targets --all-features -- -D warnings
+git add src/network/slirp.rs
+git commit -m "refactor(slirp): add recv_peek helper using libc::recv MSG_PEEK"
+```
+
+---
+
+## Workstream 3B — The actual relay rewrite
+
+### Task 3.3: Replace host→guest path with peek-based send
+
+**Files:**
+- Modify: `src/network/slirp.rs`
+
+- [ ] **Step 1: Locate** the host→guest section in `relay_tcp_nat_data`
+  via LSP `documentSymbol`.
It's the `read` block around lines + 991–1025: read up to 16 KB into `entry.to_guest`, drain `to_guest` + in MTU-sized chunks, build TCP packets, increment `our_seq`. + +- [ ] **Step 2: Replace** that block with a peek-based version. The + new logic: + +```rust +// Host → guest, peek-based sequence-mirroring. +// We don't `read()` into a userspace buffer — the kernel's socket +// buffer holds outstanding data until the guest ACKs, at which point +// Task 3.4 consumes the ACK'd portion via plain `recv()`. +let mut peek_buf = [0u8; 65536]; +match recv_peek(&entry.host_stream, &mut peek_buf) { + Ok(0) => { + // EOF from host. Send FIN to guest if we haven't already. + // (FIN handling continues to use the existing block below.) + entry.state = TcpNatState::Closed; + } + Ok(n) => { + // Send only the un-ACK'd portion: skip what's already in flight. + let bytes_in_flight = entry.bytes_in_flight as usize; + if n > bytes_in_flight { + let new_payload = &peek_buf[bytes_in_flight..n]; + for chunk in new_payload.chunks(MTU - 54) { + let frame = build_tcp_packet_static( + /* ... existing args, payload=chunk, seq=entry.our_seq ... */ + ); + self.inject_to_guest.push(frame); + entry.our_seq = entry.our_seq.wrapping_add(chunk.len() as u32); + entry.bytes_in_flight = + entry.bytes_in_flight.wrapping_add(chunk.len() as u32); + } + } + // else: everything in the kernel buffer is already in flight; + // wait for guest to ACK before sending more. + } + Err(ref e) if e.kind() == io::ErrorKind::WouldBlock => { + // Nothing in the kernel buffer yet; nothing to do. + } + Err(_) => { + entry.state = TcpNatState::Closed; + } +} +``` + +The exact builder call must match the existing `build_tcp_packet_static` signature — read the current call site and copy verbatim. 
+ +- [ ] **Step 3: Run.** + +```bash +cargo check +cargo test --test network_baseline # tcp_data_round_trip MUST pass; the 256KB cliff test still passes (cliff still in place via to_host path which 3.5 will remove) +cargo fmt --all -- --check +cargo clippy --workspace --all-targets --all-features -- -D warnings +``` + +The `tcp_to_host_buffer_drops_at_256kb` BROKEN_ON_PURPOSE pin tests the **guest→host** direction — it should still pass after this task because we haven't touched that path yet (3.5 owns it). + +- [ ] **Step 4: Commit.** + +```bash +git add src/network/slirp.rs +git commit -m "refactor(slirp): peek-based host→guest TCP relay (drops to_guest buffer dependency)" +``` + +> Note: the `to_guest: Vec` field is now unused but still on the +> struct. Task 3.6 removes it; until then it stays so the diff per +> task is reviewable. + +--- + +### Task 3.4: ACK handling — consume ACK'd bytes from kernel + +**Files:** +- Modify: `src/network/slirp.rs` + +- [ ] **Step 1: Locate** guest-ACK handling. In `handle_tcp_frame`, + the ACK branch (around line 855–870) currently advances + `entry.guest_ack` and may transition state. With peek-based send, + on each ACK we must also `recv()` (no peek) the ACK'd bytes from + the kernel socket so the kernel can free them. + +- [ ] **Step 2: Compute ACK'd bytes** from the incoming TCP segment's + ACK number minus the entry's last-known `guest_ack`. Use wrapping + arithmetic — TCP sequence numbers wrap at 2³². + +```rust +let segment_ack = /* ... extract from TcpRepr ... */; +let acked_bytes = segment_ack.wrapping_sub(entry.guest_ack); +// Advance the recorded ack point. 
+if acked_bytes > 0 && acked_bytes <= entry.bytes_in_flight { + let mut sink = [0u8; 65536]; + let mut remaining = acked_bytes as usize; + while remaining > 0 { + let want = remaining.min(sink.len()); + match entry.host_stream.read(&mut sink[..want]) { + Ok(0) | Err(_) => break, // EOF or error; let next iteration handle it + Ok(n) => remaining -= n, + } + } + entry.bytes_in_flight = + entry.bytes_in_flight.wrapping_sub(acked_bytes - remaining as u32); + entry.guest_ack = segment_ack; +} +``` + +The `read()` call (not `recv` directly) consumes from the kernel buffer — equivalent on a non-blocking `TcpStream`. The `entry.host_stream` is already non-blocking, so this won't stall. + +- [ ] **Step 3: Test the round trip.** `tcp_data_round_trip` should + still pass — guest sends 5 bytes, host echoes, guest receives. The + echo path now uses peek + ACK-driven consume. + +```bash +cargo test --test network_baseline tcp_data_round_trip +cargo fmt --all -- --check +cargo clippy --workspace --all-targets --all-features -- -D warnings +git add src/network/slirp.rs +git commit -m "refactor(slirp): consume ACK'd bytes from kernel on guest ACK" +``` + +--- + +### Task 3.5: Drop guest→host `to_host` buffer (kill the 256 KB cliff) + +**Files:** +- Modify: `src/network/slirp.rs` + +- [ ] **Step 1: Locate** the `to_host` write path. In `handle_tcp_frame` + (around lines 867–911) and `relay_tcp_nat_data` (around lines + 960–989), the current code: + - Writes guest payload to `entry.host_stream` directly when + `to_host` is empty. + - Buffers in `entry.to_host` on `WouldBlock`. + - Drops the connection when `to_host` exceeds `MAX_TO_HOST_BUFFER` + (256 KB). + - Sends ACK on successful write OR sets `to_host_pending_ack` when + the write was buffered. + +- [ ] **Step 2: Replace** with a strict don't-ACK-on-EAGAIN approach: + - Attempt non-blocking `write` on the host socket. + - On full success: ACK the guest immediately. 
+  - On partial success (some bytes written): ACK only those bytes;
+    let the guest retransmit the rest.
+  - On `WouldBlock` with zero bytes written: **don't ACK**; let the
+    guest retransmit per TCP's natural backpressure. The kernel's
+    send buffer fills up; when it drains, the next guest retransmit
+    succeeds.
+
+```rust
+// In handle_tcp_frame's data branch:
+let payload = /* ... existing extract ... */;
+let n_written = match entry.host_stream.write(payload) {
+    Ok(n) => n,
+    Err(ref e) if e.kind() == io::ErrorKind::WouldBlock => 0,
+    Err(_) => {
+        entry.state = TcpNatState::Closed;
+        return Ok(());
+    }
+};
+if n_written > 0 {
+    let ack_seq = segment_seq.wrapping_add(n_written as u32);
+    self.send_ack(entry, ack_seq);
+    entry.guest_seq = ack_seq;
+}
+// else: silently drop the segment; guest retransmits.
+```
+
+- [ ] **Step 3: Remove the `MAX_TO_HOST_BUFFER` constant** and the
+  256 KB-cliff branch. The cliff is gone — TCP backpressure handles
+  it naturally.
+
+- [ ] **Step 4: Verify.**
+
+```bash
+cargo check
+cargo test --test network_baseline # tcp_data_round_trip still passes
+# tcp_to_host_buffer_drops_at_256kb is EXPECTED TO FAIL now —
+# Task 3.7 will flip it. For this task, run with --no-fail-fast and
+# confirm only that test fails.
+cargo fmt --all -- --check
+cargo clippy --workspace --all-targets --all-features -- -D warnings
+git add src/network/slirp.rs
+git commit -m "refactor(slirp): drop to_host buffer + 256KB cliff, use TCP backpressure"
+```
+
+---
+
+### Task 3.6: Cleanup — drop unused fields + dead helpers
+
+**Files:**
+- Modify: `src/network/slirp.rs`
+
+- [ ] **Step 1: Remove unused fields** from `TcpNatEntry`:
+  - `to_guest: Vec<u8>` — replaced by peek-based send.
+  - `to_host: Vec<u8>` — replaced by kernel send buffer + retransmit.
+  - `to_host_pending_ack: Option<u32>` — replaced by direct ACK on
+    successful write.
+
+- [ ] **Step 2: Remove dead helpers** that referenced them.
Use LSP + `findReferences` on each removed field to find call sites; remove + the helpers if they're now orphaned. + +- [ ] **Step 3: Update doc comments** — the file-level doc and the + `TcpNatEntry` doc should reflect the new design. + +- [ ] **Step 4: Verify.** + +```bash +cargo check +cargo test --test network_baseline +cargo fmt --all -- --check +cargo clippy --workspace --all-targets --all-features -- -D warnings +git add src/network/slirp.rs +git commit -m "refactor(slirp): drop to_guest/to_host/pending_ack fields and dead helpers" +``` + +--- + +## Workstream 3C — Test + validation + +### Task 3.7: Flip `tcp_to_host_buffer_drops_at_256kb` BROKEN_ON_PURPOSE pin + +**Files:** +- Modify: `tests/network_baseline.rs` + +- [ ] **Step 1: Locate** the test. It currently asserts that pushing + ~300 KB closes the connection. + +- [ ] **Step 2: Rewrite** to assert the OPPOSITE — pushing >256 KB + succeeds with no connection close. Rename to + `tcp_writes_more_than_256kb_succeed`. The test: + - Bind a host TCP server that accepts and reads ~1 MB. + - Drive the handshake. + - Push 1 MB in chunks. + - Assert no `Rst` / `Fin` arrives at the guest mid-stream. + - Assert the host server receives all 1 MB. 
+ +- [ ] **Step 3: Run.** + +```bash +cargo test --test network_baseline tcp_writes_more_than_256kb_succeed +cargo test --test network_baseline # 14 tests pass +cargo fmt --all -- --check +cargo clippy --workspace --all-targets --all-features -- -D warnings +git add tests/network_baseline.rs +git commit -m "test(network): flip 256KB cliff pin — assert >1MB succeeds" +``` + +--- + +### Task 3.8: Phase 3 validation gate + +**Files:** none (gate only) + +- [ ] **Static checks** + +```bash +cargo fmt --all -- --check +cargo clippy --workspace --all-targets --all-features -- -D warnings +``` + +- [ ] **Unit + baseline tests** + +```bash +cargo test --workspace --all-features +cargo test --test network_baseline +``` + +- [ ] **Conformance + snapshot integration suites — the safety net** + +```bash +export VOID_BOX_KERNEL=$PWD/target/vmlinux-slim-x86_64 +export VOID_BOX_INITRAMFS=/tmp/void-box-test-rootfs.cpio.gz +cargo test --test conformance -- --ignored --test-threads=1 +cargo test --test snapshot_integration -- --ignored --test-threads=1 +cargo test --test e2e_skill_pipeline -- --ignored --test-threads=1 +cargo test --test e2e_mount -- --ignored --test-threads=1 +``` + +These exercise real TCP traffic through the SLIRP path. **Any +regression here is a Phase 3 blocker.** + +- [ ] **Microbench regression check** + +```bash +cargo bench --bench network +``` + +Compare `process_syn`, `poll_idle`, `poll_with_n_flows` against the +Phase 2 baseline. No regression > 10%. + +- [ ] **Wall-clock harness** + +```bash +./target/release/voidbox-network-bench --iterations 3 \ + --output /tmp/baseline-network-phase3.json +cat /tmp/baseline-network-phase3.json +``` + +Expected: +- `tcp_throughput_g2h_mbps`: comparable to Phase 2 (~1900 Mbps). +- `tcp_rr_latency_us_p50`: comparable (~2 µs). +- `tcp_crr_latency_us_p50`: **expected to drop** — the new TCP relay + has fewer per-segment ACK round-trips. From Phase 2's ~10,160 µs + toward something closer to passt's 135 µs. 
Anywhere meaningfully + below 5,000 µs is a clear win. + +- [ ] **Startup bench warm-restore** (the bench fixed in 0d0ab20) + must continue to pass: + +```bash +./target/release/voidbox-startup-bench --iters 3 --breakdown +# warm phase exits 0 +``` + +No PR opened — paused per user instruction. + +--- + +## Risks + +- **Highest-risk phase by far.** The TCP relay rewrite is ~400 LOC + replaced. Any subtle bug in the sequence math (off-by-one, + unsigned wrap, ACK-vs-segment-seq confusion) silently breaks + long-running connections. The conformance + snapshot suites are + the safety net. +- **Sequence wrap arithmetic.** TCP seq numbers are 32-bit and wrap + at 2³². Use `wrapping_add` / `wrapping_sub` everywhere. A naive + comparison at boundaries is silently wrong. +- **MSG_PEEK + non-blocking + multi-thread.** `recv_peek` is called + from the net-poll thread. The host socket is non-blocking. Confirm + no other code path closes the socket concurrently. +- **Window-scaling not implemented.** Today's `TCP_WINDOW = 65535` + hardcoded. We don't claim window scaling in SYN-ACK options. + Acceptable for Phase 3 — passt-grade window negotiation is deferred. +- **TCP_INFO not used.** passt queries `TCP_INFO` on the host socket + to mirror RTT/window. We don't. Connections work without it; window + semantics are slightly different. Out of scope here. + +## File impact + +| File | Approximate LOC | +|---|---| +| `src/network/slirp.rs` | **~+250 / −350** (net reduction) | +| `tests/network_baseline.rs` | ~+50 / −60 (rewrite the cliff test) | +| **Total** | **~+300 / −410** | + +Net reduction in `slirp.rs` is the headline win. Less code, fewer +fragile invariants, kernel does the buffering. 
diff --git a/docs/superpowers/plans/2026-04-27-smoltcp-passt-port-phase4.md b/docs/superpowers/plans/2026-04-27-smoltcp-passt-port-phase4.md
new file mode 100644
index 00000000..fa3b29db
--- /dev/null
+++ b/docs/superpowers/plans/2026-04-27-smoltcp-passt-port-phase4.md
@@ -0,0 +1,431 @@
+# Phase 4 Implementation Plan: Unified Flow Table
+
+> **For agentic workers:** REQUIRED SUB-SKILL: superpowers:subagent-driven-development.
+> Steps use checkbox (`- [ ]`) syntax for tracking.
+>
+> **Mandatory skills for every Rust-touching task:**
+> `rust-style`, `rustdoc`, `rust-analyzer-ssr`,
+> `superpowers:test-driven-development`,
+> `superpowers:verification-before-completion`. Use LSP for navigation.
+>
+> **Phase 4 is a NO-BEHAVIOR-CHANGE refactor.** Every task ends with
+> all 14 baseline pins, all VM suites, and `voidbox-startup-bench`
+> warm phase still green. The point is structural cleanup, not new
+> capability — temptation to bolt on "while I'm here" features
+> should be redirected to Phase 5.
+
+**Spec:** [`2026-04-27-smoltcp-passt-port.md`](2026-04-27-smoltcp-passt-port.md)
+**Continues from Phase 3:** [`2026-04-27-smoltcp-passt-port-phase3.md`](2026-04-27-smoltcp-passt-port-phase3.md)
+
+**Goal:** Replace the three per-protocol HashMaps on `SlirpBackend`
+(`tcp_nat`, `udp_flows`, `icmp_echo`) with a single `flow_table`
+keyed by a `FlowKey` enum, with values held in a `FlowEntry` enum.
+Sets up Phase 5 (stateless NAT + port-forwarding) where shared
+flow-table operations matter more.
+
+**Architecture:**
+
+```rust
+// New types (unified):
+enum FlowKey {
+    Tcp(TcpNatKey),
+    Udp(UdpFlowKey),
+    IcmpEcho(IcmpEchoKey),
+}
+
+enum FlowEntry {
+    Tcp(TcpNatEntry),
+    Udp(UdpFlowEntry),
+    IcmpEcho(IcmpEchoEntry),
+}
+
+// On SlirpBackend:
+flow_table: HashMap<FlowKey, FlowEntry>,
+```
+
+The per-protocol code paths still match on the variant — this is
+"three HashMaps in one wrapper" structurally, not a deep redesign.
+The user-visible benefits land later: Phase 5 will reuse +`flow_table` for stateless NAT translation + port-forwarding without +caring which protocol owns each entry. + +**Tech Stack:** Rust 1.88, `std::collections::HashMap` (already in +use). No new deps. + +**Branch:** `smoltcp-passt-port-phase0` (continuing on the same +branch — user instruction). + +## Non-negotiable invariants (carried from Phase 3) + +1. **All-Rust** — no opaque process boundary. +2. **Full observability via `tracing`** — every relay continues + to emit `trace!`/`debug!`/`warn!` at the same observable points. + The unification must NOT silently drop log lines. +3. **`cargo test`-driveable** — all 14 baseline pins, plus + `tcp_writes_more_than_256kb_succeed`, must continue passing. +4. **Standard Rust tooling** — LSP, clippy, profiler keep working. + +## What this phase explicitly does NOT do + +- **No SipHash hasher.** The default `RandomState` already + randomizes per-process, which is sufficient DoS protection given + guests can't observe other VMs' hash seeds. SipHash is a Phase 5+ + consideration if and only if profiling shows hash contention, + which it currently doesn't. +- **No side-indexed entries.** passt's flow table tracks INISIDE + vs TGTSIDE for each entry; SLIRP is asymmetric (guest is always + the initiator) so this distinction is moot in our model. +- **No new behavior.** Same RFC compliance, same idle timeouts, + same packet handling. The pin tests are the contract. + +## Task structure + +10 tasks across three workstreams. The bench tasks (4.6a–4.6c) land +**after** the migration so they exercise the unified `flow_table`, +not the old per-protocol maps. The validation gate (4.7) compares +the new bench numbers against Phase 3 numbers to verify no +regression from enum dispatch. 
+ +| ID | Workstream | Scope | +|---|---|---| +| 4.1 | impl | Define `FlowKey` + `FlowEntry` enums; no callers yet | +| 4.2 | impl | Add `flow_table` field to `SlirpBackend`; populate in parallel with existing maps (no migration yet) | +| 4.3 | impl | Migrate ICMP path to `flow_table`; drop `icmp_echo` HashMap | +| 4.4 | impl | Migrate UDP path to `flow_table`; drop `udp_flows` HashMap | +| 4.5 | impl | Migrate TCP path to `flow_table`; drop `tcp_nat` HashMap | +| 4.6 | impl | Cleanup: remove dead helpers, update doc comments | +| **4.6a** | **bench** | **`poll_with_n_mixed_flows` — n/3 TCP + n/3 UDP + n/3 ICMP entries, time `poll()`. Catches enum-dispatch regression at scale.** | +| **4.6b** | **bench** | **`process_udp_frame` + `process_icmp_echo_request` — per-protocol hot-path parity vs the existing `process_syn`.** | +| **4.6c** | **bench** | **`flow_table_insert_remove` — pure-compute HashMap op throughput on the unified table; Phase 4 reference for future Phase 5+ work.** | +| 4.7 | gate | Phase 4 validation gate (incl. new benches no-regression) | + +--- + +## Task 4.1: Define `FlowKey` + `FlowEntry` enums + +**Files:** +- Modify: `src/network/slirp.rs` + +- [ ] **Step 1: Add the two enums** near the existing `NatKey`, + `TcpNatEntry`, `UdpFlowKey`, `UdpFlowEntry`, `IcmpEchoKey`, + `IcmpEchoEntry` definitions (LSP `documentSymbol` to confirm + placement): + +```rust +/// Unified flow-table key. Each variant wraps the protocol-specific +/// key already defined elsewhere in this module — no field changes, +/// just a single type that the unified `flow_table` HashMap can +/// store. +#[derive(Clone, Copy, Debug, PartialEq, Eq, Hash)] +#[allow(dead_code)] // consumed in 4.2 +enum FlowKey { + Tcp(NatKey), + Udp(UdpFlowKey), + IcmpEcho(IcmpEchoKey), +} + +/// Unified flow-table value. Each variant wraps the protocol's +/// existing entry struct. 
+#[allow(dead_code)] // consumed in 4.2
+enum FlowEntry {
+    Tcp(TcpNatEntry),
+    Udp(UdpFlowEntry),
+    IcmpEcho(IcmpEchoEntry),
+}
+```
+
+`NatKey` already derives `Hash`+`Eq`+`Clone` (the existing TCP key). `UdpFlowKey` and `IcmpEchoKey` already derive the needed traits. The `Copy` constraint is enforced by the variant types — verify they're all `Copy` (they should be — all primitive fields).
+
+- [ ] **Step 2: Verify.**
+
+```bash
+cargo check
+cargo test --test network_baseline
+cargo fmt --all -- --check
+cargo clippy --workspace --all-targets --all-features -- -D warnings
+git add src/network/slirp.rs
+git commit -m "refactor(slirp): define FlowKey + FlowEntry enums (no callers yet)"
+```
+
+---
+
+## Task 4.2: Add `flow_table` field
+
+**Files:**
+- Modify: `src/network/slirp.rs`
+
+- [ ] **Step 1: Add the field on `SlirpBackend`.** Place it
+  alongside (not replacing) the existing per-protocol HashMaps:
+
+```rust
+/// Unified flow table. During Phase 4, populated in parallel with
+/// the per-protocol maps (`tcp_nat`, `udp_flows`, `icmp_echo`).
+/// Phase 4.3–4.5 migrate each protocol; Phase 4.6 deletes the
+/// per-protocol maps.
+#[allow(dead_code)] // consumed in 4.3+
+flow_table: HashMap<FlowKey, FlowEntry>,
+```
+
+Initialize `flow_table: HashMap::new()` in every `SlirpBackend`
+construction site (canonical: `with_security`, which `new()` and
+`Default::default()` delegate to).
+
+- [ ] **Step 2: Verify.**
+
+```bash
+cargo check
+cargo test --test network_baseline
+cargo fmt --all -- --check
+cargo clippy --workspace --all-targets --all-features -- -D warnings
+git add src/network/slirp.rs
+git commit -m "refactor(slirp): add flow_table field on SlirpBackend (parallel to existing maps)"
+```
+
+---
+
+## Task 4.3: Migrate ICMP path to `flow_table`
+
+**Files:**
+- Modify: `src/network/slirp.rs`
+
+ICMP first because it's the smallest path (added in Phase 1, ~150
+LOC) and the migration pattern is cleanest there. Once it's right,
+4.4 and 4.5 follow the same shape.
+
+- [ ] **Step 1: Replace `self.icmp_echo` accesses with
+  `self.flow_table` accesses where the value is `FlowEntry::IcmpEcho`.**
+
+Two access sites:
+- `handle_icmp_frame` (insert/lookup by `IcmpEchoKey`)
+- `relay_icmp_echo` (iterate entries, drain socket, build reply)
+
+Pattern for insert:
+
+```rust
+// OLD:
+match self.icmp_echo.entry(key) {
+    std::collections::hash_map::Entry::Occupied(o) => o.into_mut(),
+    std::collections::hash_map::Entry::Vacant(v) => v.insert(IcmpEchoEntry { ... }),
+}
+
+// NEW:
+let flow_key = FlowKey::IcmpEcho(key);
+match self.flow_table.entry(flow_key) {
+    std::collections::hash_map::Entry::Occupied(o) => match o.into_mut() {
+        FlowEntry::IcmpEcho(entry) => entry,
+        _ => unreachable!("FlowKey::IcmpEcho must map to FlowEntry::IcmpEcho"),
+    },
+    std::collections::hash_map::Entry::Vacant(v) => match v.insert(FlowEntry::IcmpEcho(IcmpEchoEntry { ... })) {
+        FlowEntry::IcmpEcho(entry) => entry,
+        _ => unreachable!(),
+    },
+}
+```
+
+Pattern for iterate:
+
+```rust
+// OLD:
+let keys: Vec<IcmpEchoKey> = self.icmp_echo.keys().copied().collect();
+for key in keys {
+    let entry = self.icmp_echo.get_mut(&key).unwrap();
+    ...
+}
+
+// NEW:
+let flow_keys: Vec<FlowKey> = self
+    .flow_table
+    .keys()
+    .copied()
+    .filter(|k| matches!(k, FlowKey::IcmpEcho(_)))
+    .collect();
+for flow_key in flow_keys {
+    let FlowKey::IcmpEcho(key) = flow_key else { continue; };
+    let Some(FlowEntry::IcmpEcho(entry)) = self.flow_table.get_mut(&flow_key) else { continue; };
+    ...
+}
+```
+
+- [ ] **Step 2: Remove the `icmp_echo` field** from `SlirpBackend`
+  and its initializer.
+
+- [ ] **Step 3: Verify.** All 14 baseline tests pass, including
+  `icmp_echo_returns_reply`.
+
+```bash
+cargo check
+cargo test --test network_baseline
+cargo fmt --all -- --check
+cargo clippy --workspace --all-targets --all-features -- -D warnings
+git add src/network/slirp.rs
+git commit -m "refactor(slirp): migrate ICMP to flow_table"
+```
+
+---
+
+## Task 4.4: Migrate UDP path to `flow_table`
+
+**Files:**
+- Modify: `src/network/slirp.rs`
+
+Same shape as 4.3. Access sites:
+- `handle_udp_frame` (insert/lookup)
+- `relay_udp_flows` (iterate + reap stale)
+
+The reap iteration (`stale: Vec<FlowKey>`) needs the same
+`filter(|k| matches!(k, FlowKey::Udp(_)))` pattern as 4.3 used for
+ICMP iteration.
+
+- [ ] **Step 1: Migrate accesses to `FlowKey::Udp(...)` /
+  `FlowEntry::Udp(...)`.**
+- [ ] **Step 2: Remove the `udp_flows` field.**
+- [ ] **Step 3: Verify** — `udp_non_dns_round_trips` passes, all
+  14 tests green.
+
+```bash
+cargo check && cargo test --test network_baseline
+cargo fmt --all -- --check
+cargo clippy --workspace --all-targets --all-features -- -D warnings
+git add src/network/slirp.rs
+git commit -m "refactor(slirp): migrate UDP to flow_table"
+```
+
+---
+
+## Task 4.5: Migrate TCP path to `flow_table` (the big one)
+
+**Files:**
+- Modify: `src/network/slirp.rs`
+
+TCP is the largest path — `tcp_nat` is touched by `handle_tcp_frame`
+(SYN/data/ACK/FIN/RST branches), `relay_tcp_nat_data` (peek + ACK
+consume + idle reap + FIN-on-EOF), and a few helpers.
+
+- [ ] **Step 1: Catalog every `self.tcp_nat` access** via LSP
+  `findReferences`. Likely 8–12 sites.
+- [ ] **Step 2: Migrate each site** to the
+  `FlowKey::Tcp(...)` / `FlowEntry::Tcp(...)` pattern from 4.3. The
+  ACK-consume and peek-send blocks have nested borrows; the
+  `let Some(FlowEntry::Tcp(entry)) = self.flow_table.get_mut(&fk) else { continue; };`
+  pattern handles them cleanly.
+- [ ] **Step 3: Remove the `tcp_nat` field.**
+- [ ] **Step 4: Verify — full baseline + the headline pin
+  `tcp_writes_more_than_256kb_succeed`.**
+
+```bash
+cargo check
+cargo test --test network_baseline
+cargo bench --bench network tcp_bulk_throughput_1mb
+cargo fmt --all -- --check
+cargo clippy --workspace --all-targets --all-features -- -D warnings
+git add src/network/slirp.rs
+git commit -m "refactor(slirp): migrate TCP to flow_table"
+```
+
+---
+
+## Task 4.6: Cleanup — drop `#[allow(dead_code)]`, update docs
+
+**Files:**
+- Modify: `src/network/slirp.rs`
+
+- [ ] **Step 1: Remove all `#[allow(dead_code)]`** added in 4.1
+  and 4.2 — the items are now consumed.
+- [ ] **Step 2: Update file-level doc** at the top of `slirp.rs`
+  to reflect the unified flow table:
+
+```
+//! Architecture:
+//! - ARP: custom handler for 10.0.2.x
+//! - All TCP/UDP/ICMP flows live in a unified flow_table:
+//!   HashMap<FlowKey, FlowEntry>. Per-protocol relay logic dispatches
+//!   on the FlowEntry variant.
+//! - DNS to 10.0.2.3:53 takes a cached fast-path
+//! - Other: silently dropped
+```
+
+- [ ] **Step 3: Verify.**
+
+```bash
+cargo check
+cargo test --test network_baseline
+cargo fmt --all -- --check
+cargo clippy --workspace --all-targets --all-features -- -D warnings
+git add src/network/slirp.rs
+git commit -m "refactor(slirp): drop allow(dead_code) + update Phase 4 docs"
+```
+
+---
+
+## Task 4.7: Phase 4 validation gate
+
+**Files:** none.
+
+- [ ] **Static checks**
+
+```bash
+cargo fmt --all -- --check
+cargo clippy --workspace --all-targets --all-features -- -D warnings
+```
+
+- [ ] **Unit + baseline + bench**
+
+```bash
+cargo test --workspace --all-features
+cargo test --test network_baseline # 14/14
+cargo bench --bench network # no regression
+```
+
+- [ ] **VM suites — the safety net**
+
+```bash
+export VOID_BOX_KERNEL=$PWD/target/vmlinux-slim-x86_64
+export VOID_BOX_INITRAMFS=/tmp/void-box-test-rootfs.cpio.gz
+cargo test --test snapshot_integration -- --ignored --test-threads=1
+cargo test --test e2e_skill_pipeline -- --ignored --test-threads=1
+cargo test --test e2e_mount -- --ignored --test-threads=1
+cargo test --test conformance -- --ignored --test-threads=1
+# (3 pre-existing conformance test failures; same as before — verify the same set fails)
+```
+
+- [ ] **Wall-clock — no regression**
+
+```bash
+./target/release/voidbox-network-bench --iterations 3 --bulk-mb 10
+./target/release/voidbox-startup-bench --iters 3 --breakdown # warm phase exits 0
+```
+
+Numbers should be statistically equivalent to Phase 3:
+- `tcp_throughput_g2h_mbps` ≈ 1885 Mbps
+- `tcp_bulk_throughput_g2h_mbps` ≈ 1565 Mbps
+- `tcp_rr_latency_us_p50` = 2 µs
+- `tcp_crr_latency_us_p50` ≈ 10 ms
+
+Any movement >10% on these is a regression.
+
+## Risks
+
+- **Borrow checker friction.** Nested `match` on enum variants
+  with `&mut self` borrows can be awkward — the `let Some(...) else
+  { continue; }` pattern keeps each access scoped. If you hit a
+  multi-variant borrow conflict, revisit by keeping the lookup and
+  the mutation in separate scopes (one to find the variant, one to
+  mutate).
+- **Hashing.** `FlowKey` derives `Hash` from variant + inner key.
+  Collision probability is fine; the default `RandomState` is
+  per-process, so guests can't observe seeds.
+- **No behavior change is the contract.** If any task changes a
+  `tracing` event's level or a field's shape, that violates the
+  observability invariant.
Preserve message text and structured + fields. + +## File impact + +| File | Approximate LOC | +|---|---| +| `src/network/slirp.rs` | **~+50 / −30** (net positive — enum dispatch adds boilerplate) | +| **Total** | **~+20** | + +Net LOC goes UP slightly. The win is that Phase 5 can reuse +`flow_table` instead of cloning each per-protocol map's +boilerplate. diff --git a/docs/superpowers/plans/2026-04-27-smoltcp-passt-port-phase5.md b/docs/superpowers/plans/2026-04-27-smoltcp-passt-port-phase5.md new file mode 100644 index 00000000..a70eb780 --- /dev/null +++ b/docs/superpowers/plans/2026-04-27-smoltcp-passt-port-phase5.md @@ -0,0 +1,493 @@ +# Phase 5 Implementation Plan: Stateless NAT + Port Forwarding + +> **For agentic workers:** REQUIRED SUB-SKILL: superpowers:subagent-driven-development. +> Steps use checkbox (`- [ ]`) syntax for tracking. +> +> **Mandatory skills for every Rust-touching task:** +> `rust-style`, `rustdoc`, `rust-analyzer-ssr`, +> `superpowers:test-driven-development`, +> `superpowers:verification-before-completion`. Use LSP for navigation. + +**Spec:** [`2026-04-27-smoltcp-passt-port.md`](2026-04-27-smoltcp-passt-port.md) +**Continues from Phase 4:** [`2026-04-27-smoltcp-passt-port-phase4.md`](2026-04-27-smoltcp-passt-port-phase4.md) + +**Goal:** Two related changes: + +1. **Refactor address translation** into a pure + `nat::translate_inbound(addr) -> SocketAddr` function. + Today the `SLIRP_GATEWAY_IP (10.0.2.2)` → `127.0.0.1` rewrite + is inlined in `handle_tcp_frame` and `handle_udp_frame`. Pulling + it out of the relay code makes the translation logic reviewable + on its own, sets the shape for IPv6 dual-stack later, and + prepares the hook point for #2. + +2. **Port forwarding** — first user-visible feature in this refactor + chain. Today the only translation is `10.0.2.2 → loopback`. After + Phase 5, an operator can say `host:8080 → guest:80` and a TCP/UDP + connection from a host process to `127.0.0.1:8080` reaches the + guest's port 80. 
Config flows: spec → `NetworkConfig::port_forwards`
+→ `nat::Rules` → consulted by `translate_inbound`.
+
+**Architecture:**
+
+```rust
+// src/network/nat.rs (new file)
+pub struct Rules {
+    /// Outbound: when guest connects to gateway, where on the host
+    /// kernel does that map to? (`SLIRP_GATEWAY_IP → 127.0.0.1`).
+    pub gateway_loopback: bool,
+    /// Outbound: drop / redirect rules that the deny-list /
+    /// metadata-IP filter currently inlines.
+    pub deny_cidrs: Vec<Ipv4Net>,
+    /// Inbound: host-port → guest-port forwarding (the new feature).
+    pub port_forwards: Vec<PortForward>,
+}
+
+pub struct PortForward {
+    pub proto: ForwardProto, // Tcp | Udp
+    pub host_port: u16,
+    pub guest_port: u16,
+}
+
+/// Stateless: pure function of (incoming dst address, rules) → host
+/// SocketAddr to connect/bind to.
+pub fn translate_outbound(rules: &Rules, dst: Ipv4Address, dst_port: u16)
+    -> Option<SocketAddr> { ... }
+```
+
+`SlirpBackend` holds `nat: Rules` instead of inlining the gateway
+rewrite. The relay code calls `translate_outbound` per packet
+(it's pure, fast, no state).
+
+**Tech Stack:** Rust 1.88, `ipnet::Ipv4Net` (already in use). No new
+deps.
+
+**Branch:** `smoltcp-passt-port-phase0` (continuing on the same
+branch — user instruction).
+
+## Non-negotiable invariants (carried from prior phases)
+
+1. **All-Rust** — no opaque process boundary.
+2. **Full observability via `tracing`** — every translation decision
+   that diverts a connection (loopback rewrite, deny, port-forward)
+   emits a `trace!` event with the (rule, src, dst) context.
+3. **`cargo test`-driveable** — every behavior change exercised by
+   `tests/network_baseline.rs` (no VM needed).
+4. **No regression** — all 14 baseline pins, snapshot suite, e2e
+   suites, microbenches, wall-clock baselines stay within 5% of the
+   Phase 4 numbers.
+
+## Task structure
+
+8 tasks across three workstreams.
+ +| ID | Workstream | Scope | +|---|---|---| +| 5.1 | impl | New module `src/network/nat.rs` with `Rules`, `PortForward`, `ForwardProto`, `translate_outbound` (no callers yet) | +| 5.2 | impl | `SlirpBackend` holds `nat: Rules`; existing `SLIRP_GATEWAY_IP → 127.0.0.1` rewrite + `deny_list` move into `Rules` | +| 5.3 | impl | TCP path consumes `nat::translate_outbound` (replaces the inline rewrite in `handle_tcp_frame`) | +| 5.4 | impl | UDP path consumes `nat::translate_outbound` | +| 5.5 | impl | Wire `port_forwards` from `NetworkConfig` → `Rules`. Inbound forwarding requires a host listener + per-rule accept loop spawned by `SlirpBackend::new` | +| 5.6 | test | New baseline pins: `nat_translate_outbound_loopback_rewrite`, `nat_translate_outbound_deny_list`, `nat_translate_outbound_unmodified`, `tcp_port_forward_inbound` | +| 5.7 | bench | New divan bench `nat_translate_outbound_hot_path` (pure-compute, ns-scale) | +| 5.8 | gate | Phase 5 validation gate | + +--- + +## Workstream 5A — Stateless translation module + +### Task 5.1: New `src/network/nat.rs` module + +**Files:** +- Create: `src/network/nat.rs` +- Modify: `src/network/mod.rs` (`pub mod nat;`) + +- [ ] **Step 1: Create `src/network/nat.rs`** + +```rust +//! Stateless address translation for SLIRP. +//! +//! Pure functions that map (guest-visible address, rules) → +//! (host-side SocketAddr to connect/bind to). No per-flow state +//! lives here — the flow table in `slirp.rs` owns that. Translation +//! itself is a function call. + +use std::net::{Ipv4Addr, SocketAddr}; + +use ipnet::Ipv4Net; +use smoltcp::wire::Ipv4Address; + +/// Inbound port-forwarding rule — host listener → guest port. +#[derive(Clone, Copy, Debug, PartialEq, Eq)] +pub enum ForwardProto { + Tcp, + Udp, +} + +#[derive(Clone, Copy, Debug, PartialEq, Eq)] +pub struct PortForward { + pub proto: ForwardProto, + pub host_port: u16, + pub guest_port: u16, +} + +/// Outbound translation rules, derived once at SlirpBackend construction. 
+#[derive(Clone, Debug, Default)]
+pub struct Rules {
+    /// If `true`, guest connections to the SLIRP gateway IP map to
+    /// `127.0.0.1` on the host. Today this is always `true`; left
+    /// configurable so a future TAP backend can flip it off.
+    pub gateway_loopback: bool,
+    /// CIDRs the guest is not allowed to connect to. Outbound packets
+    /// targeting these get `None` from `translate_outbound`.
+    pub deny_cidrs: Vec<Ipv4Net>,
+    /// Inbound port forwards. Consulted by `SlirpBackend::new` to spawn
+    /// listeners; not used by `translate_outbound`.
+    pub port_forwards: Vec<PortForward>,
+}
+
+/// Translate an outbound packet's destination address.
+///
+/// Returns `Some(host_addr)` if the packet should be forwarded —
+/// loopback for the gateway IP, otherwise the original IP.
+/// Returns `None` if the destination is in the deny list.
+pub fn translate_outbound(
+    rules: &Rules,
+    dst: Ipv4Address,
+    dst_port: u16,
+    gateway_ip: Ipv4Address,
+) -> Option<SocketAddr> {
+    let dst_ipv4 = Ipv4Addr::from(dst.0);
+
+    // Deny-list check first — explicit block beats any other rule.
+    for cidr in &rules.deny_cidrs {
+        if cidr.contains(&dst_ipv4) {
+            return None;
+        }
+    }
+
+    let host_ip = if rules.gateway_loopback && dst == gateway_ip {
+        Ipv4Addr::LOCALHOST
+    } else {
+        dst_ipv4
+    };
+
+    Some(SocketAddr::from((host_ip, dst_port)))
+}
+```
+
+- [ ] **Step 2: Register the module** in `src/network/mod.rs`:
+
+```rust
+pub mod nat;
+```
+
+- [ ] **Step 3: Verify.**
+
+```bash
+cargo check
+cargo test --test network_baseline
+cargo fmt --all -- --check
+cargo clippy --workspace --all-targets --all-features -- -D warnings
+git add src/network/nat.rs src/network/mod.rs
+git commit -m "feat(network): add nat.rs with stateless translate_outbound (no callers yet)"
+```
+
+---
+
+### Task 5.2: `SlirpBackend` holds `nat: Rules`
+
+**Files:**
+- Modify: `src/network/slirp.rs`
+
+- [ ] **Step 1: Add field** on `SlirpBackend`:
+
+```rust
+nat: nat::Rules,
+```
+
+- [ ] **Step 2: Build it in `with_security`** from the existing
+  `deny_list` parameter. Today the deny list lives in two places
+  (a `Vec<Ipv4Net>` field on `SlirpBackend` and a CLI arg). The
+  refactor: `Rules.deny_cidrs` is the new home. The existing
+  `deny_list` field becomes redundant once 5.3 + 5.4 land — remove
+  it then.
+
+```rust
+let nat = nat::Rules {
+    gateway_loopback: true,
+    deny_cidrs: deny_list.clone(),
+    port_forwards: Vec::new(), // wired in 5.5
+};
+```
+
+- [ ] **Step 3: Don't migrate any call sites yet.** The existing
+  inline rewrites in `handle_tcp_frame` / `handle_udp_frame` keep
+  working. 5.3 + 5.4 own the cutover.
+- [ ] **Step 4: Verify** — all 14 baseline tests still pass.
+- [ ] **Step 5: Commit.** + +```bash +git add src/network/slirp.rs +git commit -m "refactor(slirp): add nat::Rules field on SlirpBackend (parallel to existing deny_list)" +``` + +--- + +### Task 5.3: TCP path consumes `translate_outbound` + +**Files:** +- Modify: `src/network/slirp.rs` + +- [ ] **Step 1: Find the existing translation in `handle_tcp_frame`** + (LSP `documentSymbol` — the SYN branch around the `TcpStream::connect` + call). It currently does: + +```rust +// Inline today: +let dst_ip_for_socket = if key.dst_ip == SLIRP_GATEWAY_IP { + Ipv4Addr::LOCALHOST +} else { + Ipv4Addr::from(key.dst_ip.0) +}; +let dst_addr = SocketAddr::from((dst_ip_for_socket, key.dst_port)); + +// Plus a separate deny-list check: +for cidr in &self.deny_list { + if cidr.contains(&dst_ip_for_socket) { + // send RST, return + } +} +``` + +- [ ] **Step 2: Replace with a single `translate_outbound` call:** + +```rust +let dst_addr = match nat::translate_outbound( + &self.nat, + key.dst_ip, + key.dst_port, + SLIRP_GATEWAY_IP, +) { + Some(addr) => addr, + None => { + // Denied. Send RST and return. + trace!( + "SLIRP TCP: deny-list reject dst={}:{} from guest_port={}", + key.dst_ip, key.dst_port, key.guest_src_port + ); + let rst = build_tcp_rst_to_guest(/* existing args */); + self.inject_to_guest.push(rst); + return Ok(()); + } +}; +let host_stream = match TcpStream::connect_timeout(&dst_addr, Duration::from_secs(3)) { + /* existing match */ +}; +``` + +- [ ] **Step 3: Preserve every existing tracing event.** +- [ ] **Step 4: Verify** — `tcp_data_round_trip`, + `tcp_writes_more_than_256kb_succeed`, `tcp_deny_list_emits_rst`, + `tcp_handshake_emits_synack` all pass. 
+- [ ] **Step 5: Commit.** + +```bash +git add src/network/slirp.rs +git commit -m "refactor(slirp): TCP path uses nat::translate_outbound" +``` + +--- + +### Task 5.4: UDP path consumes `translate_outbound` + +**Files:** +- Modify: `src/network/slirp.rs` + +- [ ] **Step 1: Find** the inline UDP translation in `handle_udp_frame` + (Phase 2's `dst_ip_for_socket = if key.dst_ip == SLIRP_GATEWAY_IP { LOCALHOST } else { ... };`). +- [ ] **Step 2: Replace** with `nat::translate_outbound(&self.nat, key.dst_ip, key.dst_port, SLIRP_GATEWAY_IP)`. + On `None` (deny), drop the datagram silently with a `trace!`. +- [ ] **Step 3: Drop the now-unused `deny_list` field** on `SlirpBackend` — both TCP and UDP go through `Rules.deny_cidrs` now. LSP `findReferences` to confirm zero callers. +- [ ] **Step 4: Verify.** + +```bash +cargo check +cargo test --test network_baseline udp_non_dns_round_trips +cargo test --test network_baseline # 14/14 +cargo fmt --all -- --check +cargo clippy --workspace --all-targets --all-features -- -D warnings +git add src/network/slirp.rs +git commit -m "refactor(slirp): UDP path uses nat::translate_outbound, drop deny_list field" +``` + +--- + +## Workstream 5B — Port forwarding (the user-visible feature) + +### Task 5.5: Wire `port_forwards` from spec → host listeners + +**Files:** +- Modify: `src/network/mod.rs` (`NetworkConfig::port_forwards: Vec<(u16, u16)>` is already there from earlier work — confirm via LSP and use as the source) +- Modify: `src/network/slirp.rs` (`SlirpBackend::with_security` accepts `port_forwards`, populates `nat.port_forwards`, spawns listeners) + +This is the only task that ADDS user-visible behavior. The translation +refactor in 5.1–5.4 was no-behavior-change. 
+
+- [ ] **Step 1: Define the listener thread shape.** For each
+  `PortForward { proto, host_port, guest_port }`:
+  - **TCP:** `TcpListener::bind(("127.0.0.1", host_port))` →
+    accept thread → on each accept, **inject a synthetic SYN frame**
+    into the guest from `SLIRP_GATEWAY_IP:host_port` → `SLIRP_GUEST_IP:guest_port`,
+    then proxy bytes between the host TcpStream and the guest's
+    response stream (mirrors the existing outbound path but reversed).
+  - **UDP:** `UdpSocket::bind(("127.0.0.1", host_port))` →
+    similar pattern with synthetic UDP datagrams.
+
+  This is more involved than the outbound path because we have to
+  *initiate* a connection from the host side to the guest. The
+  guest's listener at `guest_port` must already be accepting; if
+  it's not, the host TCP connect will look like ECONNREFUSED to the
+  caller.
+
+- [ ] **Step 2: Smallest viable first commit — just plumb the config**:
+  - Pass `port_forwards: Vec<PortForward>` through `with_security`.
+  - Populate `nat.port_forwards`.
+  - Don't actually spawn listeners yet — just store the rules. A
+    next commit can add the listener implementation.
+
+- [ ] **Step 3: Smallest viable second commit — TCP forwarding only**:
+  - For each TCP `PortForward`, spawn a thread that binds the host
+    listener and on each accept, drives the synthetic SYN injection.
+  - Keep UDP forwarding as a TODO comment for a follow-up; the TCP
+    path is the high-value case.
+
+- [ ] **Step 4: Verify** — test plan in 5.6 covers this.
+
+This task is the single most user-visible piece of the entire SLIRP
+refactor chain. Worth landing carefully; consider splitting into
+sub-PRs if the diff balloons.
+ +--- + +## Workstream 5C — Test + bench + +### Task 5.6: Baseline pins for translation + port-forward + +**Files:** +- Modify: `tests/network_baseline.rs` + +- [ ] **Step 1: Pure-translation pins** — exercise `nat::translate_outbound` + directly without driving `SlirpBackend`: + +```rust +#[test] +fn nat_translate_outbound_loopback_rewrite() { /* ... */ } + +#[test] +fn nat_translate_outbound_deny_list() { /* ... */ } + +#[test] +fn nat_translate_outbound_unmodified_external_ip() { /* ... */ } +``` + +- [ ] **Step 2: Port-forward end-to-end pin**: + +```rust +#[test] +fn tcp_port_forward_inbound() { + // Bind a guest-side server (synthesized — drives SlirpBackend + // directly with a SYN/SYN-ACK/FIN sequence to simulate a guest + // accepting on guest_port). + // Build SlirpBackend with port_forwards = [{Tcp, host_port, guest_port}]. + // Connect from host to 127.0.0.1:host_port. + // Assert the connection succeeds and bytes flow through. +} +``` + +- [ ] **Step 3: Run.** + +```bash +cargo test --test network_baseline nat_ tcp_port_forward +cargo test --test network_baseline # full suite +git add tests/network_baseline.rs +git commit -m "test(network): pin nat::translate_outbound + tcp_port_forward_inbound" +``` + +--- + +### Task 5.7: divan bench for `translate_outbound` + +**Files:** +- Modify: `benches/network.rs` + +- [ ] **Step 1: Add** a pure-compute bench inside `linux_benches`: + +```rust +#[divan::bench] +fn nat_translate_outbound_hot_path(bencher: Bencher) { + use void_box::network::nat::{self, Rules}; + let rules = Rules { + gateway_loopback: true, + deny_cidrs: vec!["169.254.0.0/16".parse().unwrap()], + port_forwards: Vec::new(), + }; + let dst = SLIRP_GATEWAY_IP; + bencher.bench_local(|| { + divan::black_box(nat::translate_outbound(&rules, dst, 80, SLIRP_GATEWAY_IP)); + }); +} +``` + +Expected order of magnitude: tens of nanoseconds per call. If it's +microseconds, something's wrong (allocation in the hot path, etc.) — +investigate. 
+ +- [ ] **Step 2: Commit.** + +```bash +cargo bench --bench network nat_translate_outbound_hot_path +git add benches/network.rs +git commit -m "bench(network): nat_translate_outbound_hot_path — Phase 5 baseline" +``` + +--- + +### Task 5.8: Phase 5 validation gate + +**Files:** none. + +- [ ] fmt + clippy clean. +- [ ] `cargo test --test network_baseline` — all baseline pins pass + (count grew by 4 in 5.6). +- [ ] `cargo bench --bench network` — no regression on existing benches; + new `nat_translate_outbound_hot_path` reports tens of ns. +- [ ] `cargo test --test snapshot_integration -- --ignored` — 8/8. +- [ ] `cargo test --test e2e_mount -- --ignored` — 11/11. +- [ ] `voidbox-network-bench --iterations 3 --bulk-mb 10` — within 5% of Phase 4 numbers. +- [ ] `voidbox-startup-bench --iters 3 --breakdown` — warm phase exits 0; numbers within noise of Phase 4. + +## Risks + +- **Port-forwarding is new behavior, not refactor.** 5.5 is the most + failure-prone task because it injects synthetic frames into the + flow_table from a different code path than the existing relay. If + the synthetic SYN doesn't match the existing TCP state-machine's + expectations, connections break in subtle ways. Strong test + coverage in 5.6 mitigates. +- **Visibility of `nat` types.** Test files and benches need access + to `Rules`, `PortForward`, `translate_outbound`. The plan above + uses `pub` everywhere in `nat.rs` — that's the right surface for + Phase 6+ users (port-forwarding via spec/CLI). Don't `pub(crate)` + it. 
+ +## File impact + +| File | Approximate LOC | +|---|---| +| `src/network/nat.rs` | **+90** (new) | +| `src/network/mod.rs` | +1 (`pub mod nat;`) | +| `src/network/slirp.rs` | **−40 / +25** (deny-list field gone, inline rewrites replaced with `translate_outbound` calls; the +25 is for the port-forwarding spawn) | +| `tests/network_baseline.rs` | +120 (4 new tests) | +| `benches/network.rs` | +20 (one bench) | +| **Total** | **~+220** | diff --git a/docs/superpowers/plans/2026-04-27-smoltcp-passt-port.md b/docs/superpowers/plans/2026-04-27-smoltcp-passt-port.md new file mode 100644 index 00000000..a12a10d7 --- /dev/null +++ b/docs/superpowers/plans/2026-04-27-smoltcp-passt-port.md @@ -0,0 +1,430 @@ +# SLIRP Refactor: Lift passt Patterns Into Our Stack + +**Status:** Spec +**Date:** 2026-04-27 +**Supersedes:** [`2026-04-12-network-backend-abstraction.md`](2026-04-12-network-backend-abstraction.md) (design changes — see "Relationship to prior plan" below) + +## Required skills during execution + +> **Mandatory for every task in every phase.** Each phase plan and +> every individual task assumes the implementer has these loaded. +> Failures here are blocking review comments. + +| Skill | When it fires | Why mandatory here | +|---|---|---| +| **`rust-style`** | Any task that writes or modifies Rust code | Project-wide style: for-loops over iterators, `let-else` for early returns, variable shadowing, newtypes, explicit matching, minimal comments. The refactor is high-volume Rust; without this, style drift accumulates. | +| **`rustdoc`** | Any task that adds or changes doc comments on public items (`NetworkBackend` trait, new public methods, new public types) | Public surface gets documented per RFC 1574 — summary sentence, sections, type references. The trait is a long-lived public API; bad rustdoc ages badly. | +| **`rust-analyzer-ssr`** | Any task that does a structural rename or signature change across the workspace (e.g. 
`SlirpStack → SlirpBackend`, `poll → drain_to_guest`, swapping concrete types for trait objects) | LSP-aware rename understands type resolution and path equivalence. Grep-based renames break on shadowed paths and miss trait-method call sites. The plan's renames span `src/network/`, `src/devices/virtio_net.rs`, `src/vmm/mod.rs`, snapshot code, and tests — too wide for safe text-substitution. |
+| **`superpowers:test-driven-development`** | Every test/bench task in Phase 0 and every behavior change in Phases 1–5 | The "broken on purpose" pins are TDD by construction: assertion locks current behavior, refactor flips assertion. Skipping the failing-test step destroys that property. |
+| **`superpowers:verification-before-completion`** | Before claiming any task complete | The validation gate (`cargo fmt`, `cargo clippy -D warnings`, `cargo test`, `cargo bench`, VM suites where applicable) must produce real green output, not narration. |
+| **`verify`** *(repo skill)* | At the end of every phase, before opening the PR | Runs the full project quality gate: format, clippy, tests, security audit, startup bench regression, real-workload smoke. Catches cross-cutting regressions that the network-only gate misses. |
+| **`profile`** *(repo skill)* | When a divan or wall-clock bench regresses by >5% | Don't guess at perf regressions — capture eBPF profiles and read them. |
+
+In addition, the project-wide rules from `CLAUDE.md` and `AGENTS.md`
+remain in force:
+
+- **Prefer LSP operations** (`goToDefinition`, `findReferences`,
+  `hover`, `documentSymbol`, `workspaceSymbol`) over Grep/Glob for
+  Rust code navigation. Grep/Glob only for comments, config files,
+  non-Rust files.
+- **Platform parity:** every change validated on Linux (KVM) and, where
+  applicable, macOS (VZ). Phase 0's wall-clock harness is Linux-only
+  by design (smoltcp is `cfg(target_os = "linux")`); Phases 1–5
+  surface-level changes must not break the macOS build.
+- **Imports and constants at module scope.** Never inline `use` / + `const` inside function bodies. + +## Summary + +Refactor `src/network/slirp.rs` to fix correctness and coverage gaps (no +ICMP, UDP-only-on-port-53, fragile hand-rolled TCP relay) by lifting +proven design patterns from [passt](https://passt.top/passt) into our +own all-Rust smoltcp-based stack — instead of adopting passt as an +external backend. + +The work is gated behind a benchmark and correctness baseline: every +phase ships with assertions that pin existing behavior (including the +"broken on purpose" parts) so regressions and improvements are both +visible in the diff. + +## Motivation + +The prior plan (2026-04-12) proposed adding `passt` as an opt-in +Linux-only backend behind a new `NetworkBackend` trait. After deeper +analysis of both codebases, that approach has worse cost/benefit than +keeping the work in-tree: + +**Why not passt as a backend:** + +- **Observability regression.** passt is an opaque C process behind a + 4-byte-prefixed unix socket. Every bug becomes "did passt do the + right thing?" instead of "what did our stack do?" with full + structured logs, tracing spans, and a debugger that works. +- **Cross-platform divergence.** passt is Linux-only. Adding it makes + guest behavior diverge across host platforms (`ping` works on Linux, + fails silently on macOS). +- **Operational friction.** passt is not installed by default on + Fedora, Ubuntu, Arch, or Alpine. Every user wanting the upgrade + needs a separate install step. +- **Process-lifecycle complexity.** Crash policy, stderr routing, + `PR_SET_PDEATHSIG`, and snapshot/restore semantics all become real + problems we don't have today. +- **New attack surface in the data path.** C code in our sandbox + boundary, even battle-tested C code, is qualitatively new exposure. + +**Why lift the design patterns instead:** + +- The capability gaps (ICMP, full UDP, IPv6) are tractable in + Rust+smoltcp. 
ICMP via `SOCK_DGRAM IPPROTO_ICMP` is ~150 LOC. + Generalizing UDP off the port-53 fast-path is ~200 LOC. +- The fragile parts of our TCP relay (256 KB `to_host` buffer cliff, + hand-rolled FIN state machine, `EAGAIN` deferral) can be **deleted**, + not patched, by adopting passt's "no per-connection packet buffer, + mirror sequence numbers via `MSG_PEEK`" pattern. +- The all-Rust path keeps structured tracing, sanitizers, and + profiler-readable call stacks intact. +- The `NetworkBackend` trait abstraction still earns its keep: it + decouples virtio-net from the stack so a future TAP/vhost-net + backend (the path that actually moves throughput numbers, per the + prior plan's appendix) can land cleanly. + +## Hard invariant — observability + +**Full observability is a non-negotiable differentiator** of this +codebase vs. running passt as a process. Every phase MUST preserve: + +- All-Rust, no opaque process boundary in the data path. Syscalls + via `libc` are fine; spawning passt is not. +- The existing `tracing` integration end-to-end — every state + transition (connection accept/establish/RST/FIN, peek, ACK-driven + consume) emits a structured event. The `tracing-subscriber` + pipeline at `src/observe/logs.rs` continues to receive everything. +- `cargo test`-driveable behavior — every change exercised by tests + that drive `SlirpBackend` directly without a VM + (`tests/network_baseline.rs`). +- Standard Rust tooling — LSP, `cargo clippy`, sanitizers, profiler. + +Per-phase plans MUST encode this as task-level acceptance criteria +(see Phase 3's "Non-negotiable invariants" section for the +canonical wording). A task that lifts a passt pattern but +silently bypasses our observability stack — even one that "works" +end-to-end — is rejected. + +## Non-goals + +- **Adopting passt as a binary backend.** Explicitly rejected per the + motivation above. 
+- **Throughput improvements.** Per the 2026-04-12 plan's appendix, the
+  bottleneck is the MMIO exit path, not the network stack. This work
+  improves correctness and coverage; throughput wins require
+  ioeventfd/irqfd or vhost-net (separately scoped, separately reviewed).
+- **IPv6 in the initial phases.** Real lift (~800–1000 LOC). Deferred
+  to a later phase with its own plan.
+- **macOS feature parity in Phase 0.** The wall-clock e2e harness will
+  initially be Linux-only since `smoltcp` is already Linux-gated in
+  `Cargo.toml`. macOS (VZ NAT) continues unchanged.
+
+## Relationship to prior plan
+
+The 2026-04-12 plan proposed:
+
+1. Extract `NetworkBackend` trait. **Kept.**
+2. Add `PasstBackend` (Linux-only, opt-in). **Replaced** with in-tree
+   improvements to the smoltcp-based backend.
+3. Cleanup rename `SlirpStack → SlirpBackend`. **Kept**, moved into
+   Phase 0 alongside the trait extraction. Role-based name (matches
+   future `TapBackend`/`VhostNetBackend`); does not leak the smoltcp
+   library dependency.
+
+The trait surface from the prior plan is tightened (`poll` becomes an
+out-param to drop the per-call `Vec<Vec<u8>>` allocation; explicit
+error type; health/dead signal).
+
+## Design
+
+### Core insight
+
+passt's superpower is a single architectural decision: **don't buffer
+per connection — mirror sequence numbers**.
+
+Our current TCP relay (`src/network/slirp.rs:82–1048`, ~625 LOC) does
+the opposite: `read()`s from the host socket into a `to_guest: Vec<u8>`,
+drains on the next poll, and **closes the connection if `to_host`
+exceeds 256 KB** (`slirp.rs:903–910`). passt never has that problem
+because it never copies — it `recv(MSG_PEEK)`s, and the host kernel's
+socket buffer *is* the buffer. Sequence math
+(`seq_to_tap = seq_ack_from_tap + bytes_peeked`) reproduces what we
+hand-roll.
+
+That single trick eliminates roughly half of the fragility in our
+current code: no `EAGAIN` buffer-overflow path, no manual
+`to_host_pending_ack` deferral, no 256 KB cliff.
+
+### Five patterns ported, ranked by ROI
+
+| # | Pattern | passt source | Our target | Approx. LoC | Phase |
+|---|---|---|---|---|---|
+| 1 | `MSG_PEEK` + sequence mirroring (TCP) | `tcp.c` `tcp_data_from_sock`, `tcp_data_from_tap` | `slirp.rs::relay_tcp_nat_data`, `handle_tcp_frame` | ~400 replaced | 3 |
+| 2 | Per-flow connected UDP socket | `udp.c` `udp_flow_from_tap`, `udp_listen_sock_handler` | `slirp.rs::handle_dns_frame` (generalize) | ~200 new | 2 |
+| 3 | Unprivileged ICMP echo via `SOCK_DGRAM IPPROTO_ICMP` | `icmp.c` `icmp_ping_handler`, `icmp_sock_handler` | new `slirp.rs::handle_icmp_frame` | ~150 new | 1 |
+| 4 | Unified flow table with side indexing | `flow.c`, `flow.h` `union flow` + SipHash table | new `slirp.rs::FlowTable` | ~200 refactor | 4 |
+| 5 | Stateless address translation | `fwd.c::nat_inbound` | refactor existing 10.0.2.2→127.0.0.1 rewrite | ~150 refactor | 5 |
+
+### What we keep as-is
+
+- **DNS caching with question-section keying** (`slirp.rs:433–456`) is
+  better than passt — passt has no DNS cache. Keep it.
+- **Net-poll thread on a 5ms timer** (`vmm/mod.rs:1594–1630`) is
+  simpler than passt's epoll/timerfd dance and fits our virtio-mmio
+  model. The 5ms floor matters less once we stop dropping connections
+  at 256 KB.
+- **smoltcp for wire types + ARP via `Interface`** is the right
+  division of labor. passt has to hand-roll its packet abstraction
+  (`packet.h`); we get checksum and parsing for free.
+- **Threading model** (`process_guest_frame` on vCPU, `poll` on
+  net-poll, `Arc<Mutex<SlirpBackend>>`) is sound. Don't touch it.
+
+### What we throw away from passt
+
+| passt feature | Why skip |
+|---|---|
+| `TCP_REPAIR` migration | Out of scope; VM snapshots already break TCP |
+| `splice()` / vhost-user / pasta zero-copy | Throughput-focused, gated by MMIO exit cost |
+| Full IPv6 (DHCPv6, NDP, RA) | Deferred to a later phase |
+| AVX2 checksum | smoltcp's checksum is fine; premature optimization |
+| Daemon harness, conf parsing, qrap | We're an embedded library, not a daemon |
+| C weak-symbol dispatch | Use Rust enum dispatch / trait objects |
+
+### `NetworkBackend` trait
+
+```rust
+// src/network/mod.rs
+
+use std::io;
+
+/// A network backend processes raw Ethernet frames between guest and host.
+///
+/// Implementations must be `Send` so they can be held behind
+/// `Arc<Mutex<dyn NetworkBackend>>` and accessed from both the vCPU
+/// thread (TX path) and the net-poll thread (RX path).
+pub trait NetworkBackend: Send {
+    /// Process a raw Ethernet frame sent by the guest (TX path).
+    ///
+    /// Called from the vCPU thread on MMIO write to the TX virtqueue.
+    /// Implementations should not block.
+    fn process_guest_frame(&mut self, frame: &[u8]) -> io::Result<()>;
+
+    /// Drain Ethernet frames destined for the guest into `out` (RX path).
+    ///
+    /// Called every ~5ms from the net-poll thread. Frames are
+    /// complete Ethernet payloads — no virtio-net header (the caller
+    /// prepends that). The buffer is reused across calls to avoid
+    /// per-poll allocation.
+    fn drain_to_guest(&mut self, out: &mut Vec<Vec<u8>>);
+
+    /// Backend health. `false` means the backend has entered an
+    /// unrecoverable state and should be reconstructed.
+    fn is_healthy(&self) -> bool {
+        true
+    }
+}
+```
+
+Differences from the prior plan:
+
+- `poll() -> Vec<Vec<u8>>` → `drain_to_guest(&mut self, out: &mut Vec<Vec<u8>>)`.
+  Drops the per-poll allocation that would otherwise fire every 5ms.
+- Explicit `io::Result<()>` instead of project-wide `Result`.
+- `is_healthy()` default-true hook for future backends that have a + process or socket lifecycle (TAP, vhost-net). Unused by + `SmoltcpBackend`. + +## Phase breakdown + +Each phase is **independent** and **landable on its own**. Each phase +will get its own bite-sized plan document under `docs/superpowers/plans/` +when execution starts. Phases 1–5 plan documents are deliberately not +written yet — what we learn from earlier phases will sharpen the +detailed task lists for later ones. + +| Phase | Scope | Risk | Plan doc | +|---|---|---|---| +| **0** | Baseline tests + benches + `NetworkBackend` trait extraction + `SlirpStack → SlirpBackend` rename. **Zero user-visible behavior change.** | Low | [`2026-04-27-smoltcp-passt-port-phase0.md`](2026-04-27-smoltcp-passt-port-phase0.md) | +| **1** | ICMP echo via unprivileged `SOCK_DGRAM IPPROTO_ICMP`, with sysctl-fallback to drop. | Low | [`2026-04-27-smoltcp-passt-port-phase1.md`](2026-04-27-smoltcp-passt-port-phase1.md) | +| **2** | Generalize UDP: per-flow connected sockets, drop port-53 limit, keep DNS fast-path/cache. | Low–medium | [`2026-04-27-smoltcp-passt-port-phase2.md`](2026-04-27-smoltcp-passt-port-phase2.md) | +| **3** | TCP relay rewrite using `MSG_PEEK` + sequence mirroring. Drop `to_guest: Vec` and 256 KB cap. | **High** — gnarliest of the lot. Snapshot integration tests are the gate. | [`2026-04-27-smoltcp-passt-port-phase3.md`](2026-04-27-smoltcp-passt-port-phase3.md) | +| **4** | Unified flow table refactor (no behavior change). Single `flow_table: HashMap` replacing the three per-protocol maps. | Medium | [`2026-04-27-smoltcp-passt-port-phase4.md`](2026-04-27-smoltcp-passt-port-phase4.md) | +| **5** | Stateless NAT translation refactor + port-forwarding configurability. | Low | [`2026-04-27-smoltcp-passt-port-phase5.md`](2026-04-27-smoltcp-passt-port-phase5.md) | +| **6** *(optional)* | IPv6 dual-stack (DHCPv6, NDP, RA, NAT). 
| High | TBD; may be split further | + +## Baseline strategy + +Every phase ships with assertions that pin observable behavior. Three +of these assertions deliberately encode **broken** behavior — they are +green lights that flip when the corresponding phase lands. + +### Two test layers + +**Layer 1 — unit-level (fast, deterministic, no VM):** drive +`SmoltcpBackend` directly. Feed synthetic Ethernet frames via +`process_guest_frame`, drive `drain_to_guest`, inspect emissions. +Sub-millisecond per test, runs on every `cargo test`. Lives in +`tests/network_baseline.rs`. + +**Layer 2 — wall-clock e2e (slow, real numbers, comparable to passt):** +boot a VM, run iperf3/netperf-style measurements inside, output JSON. +Mirrors the existing `voidbox-startup-bench` pattern. New binary +`voidbox-network-bench`. Linux-only initially. + +### Two benchmark layers + +**Layer 1 — divan microbenches:** `benches/network.rs` mirrors +`benches/startup.rs`. `divan::main()`, `#[divan::bench]`, parametric +`args` for NAT-walk scaling. Run with `cargo bench --bench network`. + +**Layer 2 — wall-clock harness above** outputs metrics named to match +passt's published table (`tcp_throughput_*`, `tcp_rr_latency`, +`tcp_crr_latency`, `udp_throughput_*`). + +### "Broken on purpose" pins + +These three tests assert broken behavior today. They are intended to +flip when the corresponding phase lands: + +| Test | Today's assertion | Flips in phase | +|---|---|---| +| `tcp_to_host_buffer_drops_at_256kb` | Connection closes when guest writes >256 KB before host reads | 3 | +| `udp_non_dns_silently_dropped` | UDP datagram to port 80 produces no host-side connection | 2 | +| `icmp_echo_silently_dropped` | ICMP echo request produces no echo reply | 1 | + +The PR that fixes each behavior is the PR that flips the assertion, +which makes the diff legible to reviewers. 
+ +### passt head-to-head methodology + +Direct numerical comparison is structurally limited (passt runs in +qemu with its socket back-end; we run our own VMM with virtio-mmio). +The honest plan: + +1. **Same hardware, same workload, same metric names.** Run our + `voidbox-network-bench` and a passt+qemu reference on the same + host. Two columns in the report. +2. **Track the gap, don't claim parity.** Throughput will lag because + of MMIO exit overhead; that's known and out-of-scope. +3. **Connect rate (CRR latency) is the most apples-to-apples + metric** — dominated by NAT-table operations, not MMIO. If passt + does CRR in 135 µs and we do 600 µs, that's a meaningful "we have + 4× more overhead per connect" signal that this refactor should + narrow. + +Report shape (illustrative, real numbers come from the harness): + +``` + before after-phase-3 passt +tcp throughput g2h 1500B 4.1 G 5.2 G 5.2 G +tcp RR latency 72 µs 58 µs 58 µs +tcp CRR latency 640 µs 180 µs 135 µs +udp DNS qps 12k 12k n/a +icmp echo dropped ~110 µs ~50 µs +allocations per packet 3 0 0 +``` + +## File impact + +### Phase 0 (baseline + trait + rename) + +| File | Change | +|---|---| +| `src/network/mod.rs` | Add `NetworkBackend` trait | +| `src/network/slirp.rs` | `impl NetworkBackend for SlirpStack`, rename type to `SlirpBackend`, tighten `poll` to `drain_to_guest` | +| `src/devices/virtio_net.rs` | Hold `Arc>` instead of concrete `SlirpStack` | +| `src/vmm/mod.rs` | Update construction at cold-boot + snapshot-restore sites | +| `tests/network_baseline.rs` | **New file**: ~14 unit-level pins | +| `benches/network.rs` | **New file**: divan microbenches | +| `src/bin/voidbox-network-bench/main.rs` | **New file**: wall-clock harness | +| `Cargo.toml` | Register new bench, new binary, new test | +| `.github/workflows/startup-bench.yml` | Add `network` bench step (or add a new workflow file) | + +### Phases 1–5 + +Documented in their own plan files when scoped. 
+ +## Risks + +- **TCP rewrite is the high-risk part.** Phase 3 replaces the most + battle-tested path in our networking code. The snapshot integration + suite is the safety gate; if any of `snapshot_integration`, + `e2e_telemetry`, `e2e_skill_pipeline`, `e2e_mount`, or `e2e_sidecar` + regress, Phase 3 stays in draft. +- **passt protocol/idiom drift.** We're lifting design patterns, not + code. The risk is that we hit edge cases passt has already solved + that we'll re-discover as bugs (e.g. PAWS, fast retransmit + thresholds). Mitigation: explicit test-case lift from passt's test + suite (`/home/diego/github/passt/test/`) where applicable. +- **Cross-platform parity for ICMP.** Linux requires the + `net.ipv4.ping_group_range` sysctl to permit the calling GID. + macOS allows unprivileged `SOCK_DGRAM IPPROTO_ICMP` unconditionally. + When sysctl forbids it on Linux, fall back to current behavior + (drop), with a warn-once log. +- **Engineering time vs. throughput wins.** This work does not move + throughput numbers. The ioeventfd/vhost-net path that *does* will + reuse the trait abstraction we land in Phase 0, but won't reuse the + TCP relay rewrite from Phase 3. If priorities shift toward + throughput, Phases 0, 1, and 2 still pay off; Phase 3 may be + deferred. 
+ +## Validation gate (per phase) + +Every phase ends with: + +```bash +# Static +cargo fmt --all -- --check +cargo clippy --workspace --all-targets --all-features -- -D warnings + +# Tests +cargo test --workspace --all-features +cargo test --doc --workspace --all-features + +# Network-specific +cargo test --test network_baseline +cargo bench --bench network # no >5% regression vs main + +# VM suites that exercise networking (Linux/KVM) +export VOID_BOX_KERNEL=/boot/vmlinuz-$(uname -r) +export VOID_BOX_INITRAMFS=/tmp/void-box-test-rootfs.cpio.gz +cargo test --test conformance -- --ignored --test-threads=1 +cargo test --test snapshot_integration -- --ignored --test-threads=1 +cargo test --test e2e_skill_pipeline -- --ignored --test-threads=1 +``` + +A phase is not "done" until all gates pass and the wall-clock +`voidbox-network-bench` shows no regression on previously-working +metrics. New metrics (ICMP latency, non-DNS UDP throughput) are +expected to flip from "n/a / dropped" to a number when their +corresponding phase lands. 
+ +## References + +- **Prior plan** (this supersedes the design, keeps the trait): + `docs/superpowers/plans/2026-04-12-network-backend-abstraction.md` +- **passt source** (cloned locally): + `/home/diego/github/passt` + - `tcp.c` — TCP translation, sequence mirroring (Phase 3 reference) + - `udp.c` — per-flow UDP NAT (Phase 2 reference) + - `icmp.c` — `IPPROTO_ICMP SOCK_DGRAM` echo (Phase 1 reference) + - `flow.c` — unified flow table (Phase 4 reference) + - `fwd.c::nat_inbound` — stateless address translation (Phase 5 ref) +- **Our networking code:** + - `src/network/slirp.rs` (1275 LOC) — the file most of this work + lands in + - `src/network/mod.rs` (202 LOC) — where `NetworkBackend` trait goes + - `src/devices/virtio_net.rs` (831 LOC) — virtio-net wiring + - `src/vmm/mod.rs:1594–1630` — net-poll thread +- **Existing bench/test infrastructure to mirror:** + - `benches/startup.rs` — divan pattern + - `src/bin/voidbox-startup-bench/main.rs` — wall-clock harness + pattern + - `.github/workflows/startup-bench.yml` — CI regression gate +- **passt project page:** https://passt.top/passt — performance + table format, metric names diff --git a/docs/superpowers/plans/2026-04-30-smoltcp-passt-port-phase6.4.md b/docs/superpowers/plans/2026-04-30-smoltcp-passt-port-phase6.4.md new file mode 100644 index 00000000..64050246 --- /dev/null +++ b/docs/superpowers/plans/2026-04-30-smoltcp-passt-port-phase6.4.md @@ -0,0 +1,1427 @@ +# Phase 6.4: Event-Driven RX Polling Implementation Plan + +> **For agentic workers:** REQUIRED SUB-SKILL: Use superpowers:subagent-driven-development (recommended) or superpowers:executing-plans to implement this plan task-by-task. Steps use checkbox (`- [ ]`) syntax for tracking. + +**Goal:** Replace the 5 ms timer-driven `net_poll_thread` with `epoll_wait`-driven readiness dispatch, so host→guest RX latency is bounded by the actual data-arrival delay (sub-millisecond) rather than the 5 ms polling cycle. 
+ +**Architecture:** A new `mod epoll_dispatch` inside `src/network/` owns a single `epoll_fd` plus a self-pipe. `SlirpBackend` registers/unregisters socket FDs on flow-table mutations. The `net_poll_thread` calls `epoll_wait` (50 ms timeout for housekeeping) and routes each ready FD to the correct relay handler via `epoll_data` carrying a `FlowKey`. The self-pipe lets the vCPU-thread side wake the poll thread when it adds a new flow without polling-cycle delay. + +**Tech stack:** smoltcp 0.11 wire types (unchanged), `libc::epoll_*` syscalls, `pipe2(O_NONBLOCK | O_CLOEXEC)`, no new crates. + +**Hard performance gate (the "more performant than master" requirement):** + +``` +scripts/bench-compare.sh --baseline origin/main --skip-vm +``` + +…must show, for every comparable bench, **HEAD ≤ baseline + 5 %** *and* at least the following must improve by ≥ 30 %: + +- `port_forward_accept_latency` (currently bounded by 50 ms listener poll; epoll should drop median by an order of magnitude once the listener also moves onto epoll — *or* document why it stays). +- a new `tcp_rx_latency_us_p50` wall-clock metric in `voidbox-network-bench` (Phase 6.4 must be sub-5 ms; pre-6.4 was bounded below by the 5 ms net-poll cycle). + +Phase 6.4 is **not allowed to merge** until both gates above pass. + +--- + +## Background + +Reviewer finding **A4** (Medium-Low) on PR #68: + +- `src/vmm/mod.rs:1599-1610`: `net_poll_thread` wakes every 5 ms (`std::thread::sleep(Duration::from_millis(5))`). +- `src/network/slirp.rs:1549`: `relay_tcp_nat_data` re-peeks 64 KiB on **every** connected TCP socket every tick, regardless of readiness. +- Listener threads spawned by `spawn_port_forward_listeners` (`src/network/slirp.rs:2097`) sleep 50 ms between accept attempts — this is the cap on `port_forward_accept_latency` (~50 ms median observed in `benches/network.rs::port_forward_accept_latency`). + +passt's reference: epoll-driven readiness ([passt/tcp.c:463](https://passt.top/passt/tree/tcp.c#n463)). 
Phase 6.4 ports the *idea* (event-driven), not the literal `SO_PEEK_OFF` mechanism (which is Linux-specific and would not survive a future cross-platform backend split — though SLIRP itself is already `cfg(target_os = "linux")`). + +## Invariants (carried from Phase 6 overview — non-negotiable) + +1. **Full observability via `tracing`.** Every epoll event emits a `trace!` line with the `FlowKey` and event type. No silent dispatch. +2. **All-Rust path.** `libc::epoll_*` is the syscall surface; no new crates. +3. **Cross-platform discipline.** Phase 6.4 stays inside the existing `#[cfg(target_os = "linux")]` gate. macOS VZ is unaffected. +4. **No regression in Phase 0–5 baselines.** `bench-compare.sh --baseline origin/main` enforced — see "Hard performance gate" above. +5. **Snapshot/restore correctness.** `snapshot_integration` continues to pass. The `epoll_fd` does not survive snapshot; restore rebuilds the epoll set from `flow_table` contents. Snapshot does not serialize the epoll FD itself. + +## File structure + +| Path | Responsibility | Action | +|---|---|---| +| `src/network/epoll_dispatch.rs` | Owns `epoll_fd`, self-pipe, register/unregister, `wait()` returning `Vec`. Linux-only. | **Create** | +| `src/network/mod.rs` | Add `pub(crate) mod epoll_dispatch;` | Modify | +| `src/network/slirp.rs` | Hold `epoll: EpollDispatch` field on `SlirpBackend`; register on every flow_table insert; unregister on remove; rewrite `relay_tcp_nat_data`/`relay_udp_flows`/`relay_icmp_echo` to dispatch only on ready flows. | Modify | +| `src/vmm/mod.rs` | `net_poll_thread` rewrite: `epoll_wait(timeout=50ms)` instead of `sleep(5ms)`. | Modify | +| `tests/network_baseline.rs` | New pin `tcp_rx_latency_sub_5ms`; fix-up `tcp_writes_more_than_256kb_succeed`'s comment-vs-code mismatch; rename/migrate `drain_n` from `.poll()` to `drain_to_guest`. | Modify | +| `benches/network.rs` | Add divan bench `tcp_rx_latency_one_packet`. 
| Modify | +| `src/bin/voidbox-network-bench/main.rs` | Add `tcp_rx_latency_us_p50` measurement (host writes to a flow, time until guest sees the bytes via the relay). | Modify | +| `docs/superpowers/plans/2026-04-30-smoltcp-passt-port-phase6.4.md` | This file. | Already created | + +`drain_n` migration in `tests/network_baseline.rs` is a quiet cleanup that lands in Task 1 — every test in the file uses it, so dropping `.poll()` here also drops the last in-tree `.poll()` caller and lets us delete the deprecated method entirely later. + +## Architecture notes + +### Why one `epoll_fd` (not one per protocol)? + +- Single point of dispatch — the poll thread does *one* `epoll_wait` syscall regardless of how many flows are open. +- `epoll_data.u64` is 8 bytes — we encode `FlowKey` as a 64-bit token there. UDP and ICMP keys are smaller; TCP keys (`(guest_port, dst_ip, dst_port)`) fit in 64 bits with a tag byte for the protocol discriminator. +- Self-pipe is registered alongside socket FDs; reading it drains a queue of "I just added flow X" wake events posted by `process_guest_frame` running on the vCPU thread. + +### Why a self-pipe? + +`process_guest_frame` runs on the **vCPU thread** under the device lock. When it inserts a new flow into `flow_table`, the new socket FD is registered with epoll on that thread (cheap — just `epoll_ctl(EPOLL_CTL_ADD, ...)`). But the **poll thread** is asleep inside `epoll_wait(timeout=50ms)`. Without a wakeup, the new flow has up to 50 ms of latency before the first poll cycle picks it up. + +The self-pipe (`pipe2(O_NONBLOCK | O_CLOEXEC)` registered with `EPOLLIN`) lets `process_guest_frame` write a single byte after `epoll_ctl`. The poll thread's `epoll_wait` returns immediately, drains the pipe (a no-op handler), and starts dispatching — including the new flow. + +### Snapshot interaction + +`epoll_fd` is a kernel handle on real FDs — not serializable. Snapshot path: + +- `snapshot_internal`: tear down epoll. Drop `EpollDispatch`. 
Serialize `flow_table` as today. +- `from_snapshot`: deserialize `flow_table` → for every entry, recreate the host socket (already happening today via `host_stream` round-trip) → register the new FD with a fresh `EpollDispatch`. + +No serde changes to `flow_table` itself. + +### Why 50 ms `epoll_wait` timeout? + +Housekeeping the poll thread does *outside* the dispatch loop: + +- Reap stale UDP flows (`UDP_IDLE_TIMEOUT = 60 s`) — coarse, 50 ms is fine. +- Reap stale ICMP flows (similar). +- Phase 6.1 will add `LAST_ACK_TIMEOUT` reaping here. + +If we set the timeout shorter we re-introduce the "wake every X ms regardless" cost we're trying to remove. If we set it longer, housekeeping latency grows. 50 ms balances both at a 10 % wakeup duty cycle versus the previous 100 % (one wakeup every 5 ms). + +--- + +## Tasks + +### Task 1: Pre-baseline + retransmit-test fix-up + +**Files:** +- Modify: `tests/network_baseline.rs:170-179` (the `drain_n` helper) +- Modify: `tests/network_baseline.rs:374-422` (retransmit comment-vs-code in `tcp_writes_more_than_256kb_succeed`) + +- [ ] **Step 1: Capture baseline numbers from `origin/main`** + +```bash +# from a clean repo checkout +scripts/bench-compare.sh --baseline origin/main --skip-vm > /tmp/baseline-vs-main.md +cat /tmp/baseline-vs-main.md +``` + +Expected: every comparable bench has a real number in both columns. Save `/tmp/baseline-vs-main.md` as the pre-Phase-6.4 reference. + +- [ ] **Step 2: Migrate `drain_n` from `.poll()` to `drain_to_guest`** + +Replace `tests/network_baseline.rs:170-179`: + +```rust +/// Drains frames the stack wants to send to the guest, calling +/// `drain_to_guest` up to `n` times. Returns all frames produced +/// across the calls (caller may not care about per-call boundaries). 
+fn drain_n(stack: &mut SlirpBackend, n: usize) -> Vec<Vec<u8>> {
+    let mut out: Vec<Vec<u8>> = Vec::new();
+    for _ in 0..n {
+        stack.drain_to_guest(&mut out);
+    }
+    out
+}
+```
+
+- [ ] **Step 3: Run the existing pins to confirm `drain_n` migration is non-breaking**
+
+```bash
+cargo test --test network_baseline
+```
+
+Expected: PASS for every existing pin (no semantic change —
+`drain_to_guest` appends to the buffer, same as `.poll()` extension).
+
+- [ ] **Step 4: Fix the retransmit comment-vs-code mismatch in `tcp_writes_more_than_256kb_succeed`**
+
+The Copilot review's C1.1 finding is correct: the loop unconditionally
+advances `seq` after every send, never retransmits unACK'd chunks. The
+95 % threshold tolerates the resulting loss but the test's intent ("we
+re-send those") doesn't match its implementation.
+
+Two valid fixes — pick the simpler one. Replace the loop body in
+`tests/network_baseline.rs:387-422`:
+
+```rust
+while bytes_received.load(Ordering::Relaxed) < TOTAL && std::time::Instant::now() < deadline {
+    // Retransmit semantics: only advance the send cursor once the
+    // previous chunk has been ACK'd. If the stack stops ACKing
+    // (Phase 3 backpressure), we re-send the same seq/payload until
+    // it's acknowledged. This matches the comment above and the
+    // production guest-TCP behavior we're emulating.
+    let _ = stack.process_guest_frame(&build_tcp_frame(
+        SLIRP_GATEWAY_IP,
+        GUEST_EPHEMERAL_PORT,
+        host_port,
+        seq,
+        our_seq + 1,
+        TcpControl::Psh,
+        &chunk,
+    ));
+
+    // Drain frames; track the highest ACK we've seen and watch
+    // for RST/FIN that would indicate a Phase-2 era close.
+    for f in drain_n(&mut stack, 4) {
+        if let Some((_, ack, ctrl, _)) = parse_tcp_to_guest(&f) {
+            if matches!(ctrl, TcpControl::Rst | TcpControl::Fin) {
+                saw_close = true;
+            }
+            if ack > acked_seq {
+                acked_seq = ack;
+            }
+        }
+    }
+
+    if saw_close {
+        break;
+    }
+
+    // Advance our send cursor only past ACK'd data.
If the stack + // didn't ACK this chunk, the next loop iteration re-sends the + // same seq/payload (true TCP retransmit semantics). + if acked_seq >= seq.wrapping_add(CHUNK as u32) { + seq = seq.wrapping_add(CHUNK as u32); + } else if seq.wrapping_sub(acked_seq) > 256 * 1024 { + // Out-paced kernel recv buffer; sleep briefly so the host + // server thread can drain. + std::thread::sleep(std::time::Duration::from_millis(10)); + } +} +``` + +The single substantive change: move `seq = seq.wrapping_add(...)` from line 398 (unconditional, immediately after send) to after the drain loop, gated on `acked_seq >= seq + CHUNK`. If the stack ACK'd, advance; otherwise the next iteration re-sends the same chunk. + +- [ ] **Step 5: Run the fixed test to confirm it still passes (now with real retransmit)** + +```bash +cargo test --test network_baseline tcp_writes_more_than_256kb_succeed +``` + +Expected: PASS. The 95 % threshold will likely be 100 % now since real retransmits don't drop bytes. + +- [ ] **Step 6: Commit** + +```bash +git add tests/network_baseline.rs +git commit -m "test(network): drain_n via drain_to_guest + real retransmit in 256kb test + +Two test-harness improvements landing together since both block the +Phase 6.4 RX-latency work: + +- drain_n migrated from deprecated SlirpBackend::poll() to + drain_to_guest. This was the last in-tree poll() caller. +- tcp_writes_more_than_256kb_succeed now matches its 'we re-send + those' comment: seq only advances when acked_seq catches up, + giving real TCP-retransmit semantics in the synthetic guest + rather than the previous 'lossy with 95% tolerance' shape. + Phase 6.4 must not regress this contract; making the test + faithful first means epoll regressions surface as failures + instead of borderline 95% misses." +``` + +--- + +### Task 2: ~~Failing pin — `tcp_rx_latency_sub_5ms`~~ **DROPPED** + +**Status:** Dropped during execution. 
Original intent was a unit-level BROKEN_ON_PURPOSE pin asserting host→guest delivery in < 5 ms. **The 5 ms floor lives in `net_poll_thread` (`src/vmm/mod.rs:1609`), not in `SlirpBackend::drain_to_guest`** — the relay is synchronous when called from a test harness, so a unit-level latency assertion can't measure what we actually care about. + +**Where the contract moved:** Task 13's wall-clock `tcp_rx_latency_us_p50` metric in `voidbox-network-bench`. That harness boots a real VM, drives the actual `net_poll_thread`, and observes the latency floor end-to-end. The hard-perf-gate requirement at the top of this plan (`tcp_rx_latency_us_p50 < 5 ms`) is the BROKEN_ON_PURPOSE replacement. + +**No code lands for Task 2.** Skip directly to Task 3. + +
+Original Task 2 body (kept for context) + +The original plan attempted a unit-level pin that called `drain_to_guest` synchronously and timed the host-write → guest-receive interval. Implementation revealed: + +- `drain_to_guest` is synchronous; the 5 ms `sleep` in `net_poll_thread` is what bounds VMM-level RX latency, not anything inside `SlirpBackend`. +- The test would have measured "spawn-thread + accept + write" minus "drain-loop find time", which underflowed in debug mode and was meaningless in release mode. + +The contract — Phase 6.4 must deliver host→guest data in < 5 ms when data is available — is preserved as a VM-level requirement in Task 13. + +
+
+- [ ] **Step 1: ~~Write the failing test~~ Skipped — see "DROPPED" note above. Original body kept below for context only.**
+
+```rust
+/// Phase 6.4 pin: host→guest RX latency must be sub-5 ms when data
+/// is available. Pre-Phase-6.4 the floor was 5 ms (the
+/// `net_poll_thread` `sleep(5ms)` cycle); post-Phase-6.4 the
+/// epoll dispatch should deliver in < 1 ms on a quiet system.
+///
+/// Test harness: open a TCP flow guest→host, wait for ESTABLISHED,
+/// have the host write 64 bytes, measure the time from `write()`
+/// returning to the guest seeing the bytes in `drain_to_guest`'s
+/// output. Pre-Phase-6.4 this measures ≈ 5 ms ± jitter; post-
+/// Phase-6.4 it should be sub-millisecond on the same host.
+#[test]
+fn tcp_rx_latency_sub_5ms() {
+    use std::io::Write;
+    use std::net::{TcpListener, TcpStream};
+    use std::time::Instant;
+
+    // Bind a host listener; the SLIRP rewrite of 10.0.2.2 → 127.0.0.1
+    // routes our SYN to it.
+    let listener = TcpListener::bind("127.0.0.1:0").expect("bind");
+    let host_port = listener.local_addr().unwrap().port();
+    let server = std::thread::spawn(move || -> Option<std::time::Duration> {
+        let (mut sock, _) = listener.accept().ok()?;
+        // Wait for the guest to send something so we know the relay
+        // is established and bidirectional.
+        let mut probe = [0u8; 1];
+        let _ = std::io::Read::read(&mut sock, &mut probe);
+
+        // Stamp T0 just before write returns.
+        let t0 = Instant::now();
+        sock.write_all(&[0x42; 64]).ok()?;
+        Some(t0.elapsed())
+    });
+
+    let mut stack = SlirpBackend::new().unwrap();
+
+    // Drive the 3-way handshake.
+ let our_seq = 1000u32; + stack.process_guest_frame(&build_tcp_frame( + SLIRP_GATEWAY_IP, GUEST_EPHEMERAL_PORT, host_port, our_seq, 0, + TcpControl::Syn, &[], + )).unwrap(); + + let mut gateway_seq = 0u32; + for f in drain_n(&mut stack, 4) { + if let Some((s, _ack, ctrl, _)) = parse_tcp_to_guest(&f) { + if matches!(ctrl, TcpControl::Syn) { + gateway_seq = s; + break; + } + } + } + + stack.process_guest_frame(&build_tcp_frame( + SLIRP_GATEWAY_IP, GUEST_EPHEMERAL_PORT, host_port, our_seq + 1, gateway_seq + 1, + TcpControl::None, &[], + )).unwrap(); + + // Send a probe byte so the host server thread proceeds to write. + stack.process_guest_frame(&build_tcp_frame( + SLIRP_GATEWAY_IP, GUEST_EPHEMERAL_PORT, host_port, our_seq + 1, gateway_seq + 1, + TcpControl::Psh, &[0xAA], + )).unwrap(); + + // Now the host writes and stamps T0. We measure from "host write + // completes" to "guest sees data in drain output." + let host_t0 = server.join().expect("server").expect("write succeeded"); + let drain_start = Instant::now(); + let mut saw_payload = false; + while drain_start.elapsed() < std::time::Duration::from_secs(1) { + let frames: Vec> = drain_n(&mut stack, 1); + for f in &frames { + if let Some((_, _, _, payload_len)) = parse_tcp_to_guest(f) { + if payload_len >= 64 { + saw_payload = true; + break; + } + } + } + if saw_payload { break; } + std::thread::sleep(std::time::Duration::from_micros(50)); + } + let host_to_guest_us = drain_start.elapsed().as_micros() as u64 + - host_t0.as_micros() as u64; + + assert!(saw_payload, "host payload never reached the guest"); + + // The contract: epoll dispatch delivers in < 5 ms. 
+ assert!( + host_to_guest_us < 5_000, + "Phase 6.4 contract: host→guest RX latency must be sub-5 ms \ + (was bounded below by 5 ms net_poll_thread cycle); got {host_to_guest_us} µs" + ); +} +``` + +- [ ] **Step 2: Run the test, expect it to fail** + +```bash +cargo test --test network_baseline tcp_rx_latency_sub_5ms +``` + +Expected: **FAIL** with `host→guest RX latency must be sub-5 ms; got <5000-9999> µs` — the current `net_poll_thread` is ineligible to deliver in <5 ms because of its `sleep(5ms)`. + +This is the Phase 6.4 BROKEN_ON_PURPOSE pin. It will flip in Task 11. + +- [ ] **Step 3: Commit the failing pin** + +```bash +git add tests/network_baseline.rs +git commit -m "test(network): pin tcp_rx_latency_sub_5ms (BROKEN_ON_PURPOSE) + +Phase 6.4 contract: host→guest RX latency must be sub-5 ms when +data is available. Pre-6.4 the floor is the 5 ms net_poll_thread +sleep cycle; this assertion fails on master and on the current +PR #68 tip. Phase 6.4's epoll dispatch will flip it to passing. + +Mark with #[ignore] is deliberately NOT used: this is a positive +contract and CI must surface the failure on master so the gate +is unmissable." +``` + +--- + +### Task 3: `EpollDispatch` skeleton + unit test + +**Files:** +- Create: `src/network/epoll_dispatch.rs` +- Modify: `src/network/mod.rs` — add `pub(crate) mod epoll_dispatch;` + +- [ ] **Step 1: Write the failing test (in the new module)** + +In `src/network/epoll_dispatch.rs`: + +```rust +//! Linux epoll-driven readiness dispatch for SLIRP host sockets. +//! +//! Owns one `epoll_fd` plus a self-pipe. Callers register socket FDs +//! with a `FlowToken` (a 64-bit identifier the dispatcher returns on +//! readiness). The poll thread calls `wait_with_timeout` to block +//! until any registered FD is ready or the timeout fires, then drains +//! the events into a caller-owned buffer. +//! +//! Why no crate? The standard `mio`/`tokio` story would pull in a +//! reactor + a runtime — Phase 6.4 needs neither. 
`libc::epoll_*`
+//! is two syscalls, fully observable, and the surface fits in ~150
+//! lines. See plan 2026-04-30-smoltcp-passt-port-phase6.4.md
+//! "Architecture notes" for the rationale.
+
+use std::io;
+use std::os::fd::{AsRawFd, OwnedFd, RawFd};
+use std::time::Duration;
+
+/// Opaque per-FD identifier the caller uses to look up which flow a
+/// readiness event belongs to. Encoded into `epoll_data.u64`.
+pub type FlowToken = u64;
+
+/// One readiness event, mapped from `libc::epoll_event`.
+#[derive(Debug, Clone, Copy)]
+pub struct EpollEvent {
+    pub token: FlowToken,
+    pub readable: bool,
+    pub writable: bool,
+}
+
+#[derive(Debug)]
+pub struct EpollDispatch {
+    // implementation in next step
+}
+
+#[cfg(test)]
+mod tests {
+    use super::*;
+    use std::os::fd::AsRawFd;
+
+    #[test]
+    fn dispatch_new_creates_epoll_fd() {
+        let dispatch = EpollDispatch::new().expect("EpollDispatch::new");
+        assert!(dispatch.epoll_fd_for_test() >= 0);
+    }
+}
+```
+
+- [ ] **Step 2: Run, expect compile error**
+
+```bash
+cargo test --lib network::epoll_dispatch
+```
+
+Expected: COMPILE FAIL — `new` and `epoll_fd_for_test` not defined.
+
+- [ ] **Step 3: Implement minimal `EpollDispatch`**
+
+Replace the empty struct in `src/network/epoll_dispatch.rs`:
+
+```rust
+#[derive(Debug)]
+pub struct EpollDispatch {
+    epoll_fd: OwnedFd,
+}
+
+impl EpollDispatch {
+    /// Create a new epoll instance with `EPOLL_CLOEXEC`.
+    pub fn new() -> io::Result<Self> {
+        // SAFETY: `epoll_create1` returns -1 on error and a valid fd
+        // otherwise. We wrap into OwnedFd so Drop closes it.
+ let raw = unsafe { libc::epoll_create1(libc::EPOLL_CLOEXEC) }; + if raw < 0 { + return Err(io::Error::last_os_error()); + } + let epoll_fd = unsafe { OwnedFd::from_raw_fd(raw) }; + Ok(Self { epoll_fd }) + } + + #[cfg(test)] + fn epoll_fd_for_test(&self) -> RawFd { + self.epoll_fd.as_raw_fd() + } +} +``` + +Add the missing `use std::os::fd::FromRawFd;` to the file's existing `use` block (module-scope per project convention). + +- [ ] **Step 4: Run, expect pass** + +```bash +cargo test --lib network::epoll_dispatch::tests::dispatch_new_creates_epoll_fd +``` + +Expected: PASS. + +- [ ] **Step 5: Commit** + +```bash +git add src/network/epoll_dispatch.rs src/network/mod.rs +git commit -m "feat(network): EpollDispatch skeleton with epoll_create1 + +Phase 6.4 foundation. One epoll_fd owned via OwnedFd + EPOLL_CLOEXEC. +No registration logic yet — Task 4 will add register/unregister and +Task 6 will add the self-pipe + wait loop." +``` + +--- + +### Task 4: `register` / `unregister` + tests + +**Files:** +- Modify: `src/network/epoll_dispatch.rs` + +- [ ] **Step 1: Write the failing tests** + +In the `mod tests` block: + +```rust +#[test] +fn register_then_unregister_round_trip() { + use std::net::TcpListener; + let listener = TcpListener::bind("127.0.0.1:0").expect("bind"); + let mut dispatch = EpollDispatch::new().expect("EpollDispatch::new"); + let token: FlowToken = 0xDEAD_BEEF; + dispatch + .register(listener.as_raw_fd(), token, true, false) + .expect("register"); + dispatch.unregister(listener.as_raw_fd()).expect("unregister"); +} + +#[test] +fn register_invalid_fd_returns_error() { + let mut dispatch = EpollDispatch::new().expect("EpollDispatch::new"); + let result = dispatch.register(-1, 0, true, false); + assert!(result.is_err()); +} +``` + +- [ ] **Step 2: Run, expect compile fail** + +```bash +cargo test --lib network::epoll_dispatch +``` + +Expected: COMPILE FAIL — `register`/`unregister` not defined. 
+ +- [ ] **Step 3: Implement** + +Add to `EpollDispatch`: + +```rust +impl EpollDispatch { + /// Register `fd` with the dispatcher. `readable`/`writable` + /// select EPOLLIN / EPOLLOUT. `token` is opaque to the + /// dispatcher — returned verbatim on readiness events. + pub fn register( + &mut self, + fd: RawFd, + token: FlowToken, + readable: bool, + writable: bool, + ) -> io::Result<()> { + let mut events: u32 = 0; + if readable { + events |= libc::EPOLLIN as u32; + } + if writable { + events |= libc::EPOLLOUT as u32; + } + let mut ev = libc::epoll_event { + events, + u64: token, + }; + // SAFETY: epoll_ctl reads `ev` for ADD; we own `fd` for the + // lifetime of the registration (caller's contract). + let rc = unsafe { + libc::epoll_ctl( + self.epoll_fd.as_raw_fd(), + libc::EPOLL_CTL_ADD, + fd, + &mut ev as *mut _, + ) + }; + if rc < 0 { + return Err(io::Error::last_os_error()); + } + Ok(()) + } + + pub fn unregister(&mut self, fd: RawFd) -> io::Result<()> { + // SAFETY: epoll_ctl ignores the event pointer for DEL but + // still requires it to be non-null on older kernels. + let mut ev = libc::epoll_event { events: 0, u64: 0 }; + let rc = unsafe { + libc::epoll_ctl( + self.epoll_fd.as_raw_fd(), + libc::EPOLL_CTL_DEL, + fd, + &mut ev as *mut _, + ) + }; + if rc < 0 { + return Err(io::Error::last_os_error()); + } + Ok(()) + } +} +``` + +- [ ] **Step 4: Run, expect pass** + +```bash +cargo test --lib network::epoll_dispatch +``` + +Expected: PASS for both new tests. 
+
+- [ ] **Step 5: Commit**
+
+```bash
+git add src/network/epoll_dispatch.rs
+git commit -m "feat(network): EpollDispatch register/unregister"
+```
+
+---
+
+### Task 5: `wait_with_timeout` + integration test
+
+**Files:**
+- Modify: `src/network/epoll_dispatch.rs`
+
+- [ ] **Step 1: Write the failing test**
+
+```rust
+#[test]
+fn wait_returns_event_when_socket_becomes_readable() {
+    use std::io::Write;
+    use std::net::{TcpListener, TcpStream};
+    let listener = TcpListener::bind("127.0.0.1:0").expect("bind");
+    let addr = listener.local_addr().unwrap();
+    let server = std::thread::spawn(move || {
+        let (mut sock, _) = listener.accept().unwrap();
+        sock.write_all(b"hi").unwrap();
+    });
+    let stream = TcpStream::connect(addr).expect("connect");
+    server.join().unwrap();
+
+    let mut dispatch = EpollDispatch::new().expect("new");
+    dispatch
+        .register(stream.as_raw_fd(), 0xCAFE, true, false)
+        .expect("register");
+
+    let mut events: Vec<EpollEvent> = Vec::new();
+    let n = dispatch
+        .wait_with_timeout(&mut events, Duration::from_secs(1))
+        .expect("wait");
+    assert_eq!(n, 1);
+    assert_eq!(events[0].token, 0xCAFE);
+    assert!(events[0].readable);
+}
+```
+
+- [ ] **Step 2: Run, expect compile fail**
+
+Expected: `wait_with_timeout` not found.
+
+- [ ] **Step 3: Implement**
+
+```rust
+impl EpollDispatch {
+    /// Block up to `timeout` for any registered FD to become ready.
+    /// Drains ready events into `out` (cleared first). Returns the
+    /// number of events drained.
+    ///
+    /// `timeout = Duration::ZERO` is non-blocking poll;
+    /// `timeout = Duration::from_secs(...)` waits up to that long.
+    pub fn wait_with_timeout(
+        &self,
+        out: &mut Vec<EpollEvent>,
+        timeout: Duration,
+    ) -> io::Result<usize> {
+        out.clear();
+
+        // Pre-allocate a fixed-size event buffer. 64 ready FDs per
+        // wait is more than enough for our flow counts; events not
+        // returned this round will surface on the next wait. 
+        let mut raw_events: [libc::epoll_event; 64] =
+            [libc::epoll_event { events: 0, u64: 0 }; 64];
+
+        let timeout_ms: i32 = timeout
+            .as_millis()
+            .min(i32::MAX as u128) as i32;
+
+        // SAFETY: epoll_wait writes up to raw_events.len() entries;
+        // returns -1 on error, 0 on timeout, n>0 on events.
+        let n = unsafe {
+            libc::epoll_wait(
+                self.epoll_fd.as_raw_fd(),
+                raw_events.as_mut_ptr(),
+                raw_events.len() as i32,
+                timeout_ms,
+            )
+        };
+        if n < 0 {
+            // EINTR is non-fatal — caller can retry on next tick.
+            let err = io::Error::last_os_error();
+            if err.raw_os_error() == Some(libc::EINTR) {
+                return Ok(0);
+            }
+            return Err(err);
+        }
+        for raw in &raw_events[..n as usize] {
+            out.push(EpollEvent {
+                token: raw.u64,
+                readable: (raw.events & libc::EPOLLIN as u32) != 0,
+                writable: (raw.events & libc::EPOLLOUT as u32) != 0,
+            });
+        }
+        Ok(n as usize)
+    }
+}
+```
+
+- [ ] **Step 4: Run, expect pass**
+
+```bash
+cargo test --lib network::epoll_dispatch
+```
+
+- [ ] **Step 5: Commit**
+
+```bash
+git add src/network/epoll_dispatch.rs
+git commit -m "feat(network): EpollDispatch::wait_with_timeout"
+```
+
+---
+
+### Task 6: Self-pipe + wakeup test
+
+**Files:**
+- Modify: `src/network/epoll_dispatch.rs`
+
+- [ ] **Step 1: Write the failing test**
+
+```rust
+#[test]
+fn wakeup_unblocks_wait_immediately() {
+    use std::time::Instant;
+    let mut dispatch = EpollDispatch::new().expect("new");
+    let waker = dispatch.waker();
+
+    // Start the wait in another thread with a long timeout.
+    let wait_thread = std::thread::spawn(move || -> std::time::Duration {
+        let mut events: Vec<EpollEvent> = Vec::new();
+        let start = Instant::now();
+        let _ = dispatch.wait_with_timeout(&mut events, Duration::from_secs(5));
+        start.elapsed()
+    });
+
+    // Wake immediately.
+    std::thread::sleep(Duration::from_millis(10));
+    waker.wake();
+
+    let elapsed = wait_thread.join().expect("wait thread");
+    // Wait thread should return well under the 5 s timeout. 
+    assert!(
+        elapsed < Duration::from_secs(1),
+        "wait did not return on wakeup: {elapsed:?}"
+    );
+}
+```
+
+- [ ] **Step 2: Run, expect compile fail**
+
+Expected: `waker()` and `Waker` not defined.
+
+- [ ] **Step 3: Implement**
+
+Add to `epoll_dispatch.rs`:
+
+```rust
+/// Cloneable wakeup handle for `EpollDispatch`. Writing one byte to
+/// the underlying pipe wakes a thread blocked in `wait_with_timeout`.
+#[derive(Debug, Clone)]
+pub struct Waker {
+    write_end: std::sync::Arc<OwnedFd>,
+}
+
+impl Waker {
+    pub fn wake(&self) {
+        let buf = [0u8; 1];
+        // SAFETY: write to a non-blocking pipe never blocks. We
+        // ignore EAGAIN — the pipe already has bytes pending, which
+        // means a wakeup is already queued.
+        let _ = unsafe {
+            libc::write(self.write_end.as_raw_fd(), buf.as_ptr() as *const _, 1)
+        };
+    }
+}
+
+const SELF_PIPE_TOKEN: FlowToken = u64::MAX;
+
+impl EpollDispatch {
+    /// Returns a `Waker` that, when called, unblocks any thread
+    /// currently inside `wait_with_timeout`.
+    pub fn waker(&mut self) -> Waker {
+        if self.waker_handle.is_none() {
+            let (read_fd, write_fd) = create_pipe2_nonblock_cloexec();
+            self.register(read_fd.as_raw_fd(), SELF_PIPE_TOKEN, true, false)
+                .expect("register self-pipe");
+            self.read_end = Some(read_fd);
+            self.waker_handle = Some(std::sync::Arc::new(write_fd));
+        }
+        Waker {
+            write_end: self.waker_handle.as_ref().unwrap().clone(),
+        }
+    }
+}
+
+fn create_pipe2_nonblock_cloexec() -> (OwnedFd, OwnedFd) {
+    let mut fds = [0 as RawFd; 2];
+    // SAFETY: pipe2 with O_NONBLOCK | O_CLOEXEC writes two fds into fds. 
+    let rc = unsafe {
+        libc::pipe2(fds.as_mut_ptr(), libc::O_NONBLOCK | libc::O_CLOEXEC)
+    };
+    assert!(rc == 0, "pipe2 failed: {}", io::Error::last_os_error());
+    let read_end = unsafe { OwnedFd::from_raw_fd(fds[0]) };
+    let write_end = unsafe { OwnedFd::from_raw_fd(fds[1]) };
+    (read_end, write_end)
+}
+```
+
+Add fields to `EpollDispatch`:
+
+```rust
+#[derive(Debug)]
+pub struct EpollDispatch {
+    epoll_fd: OwnedFd,
+    read_end: Option<OwnedFd>,
+    waker_handle: Option<std::sync::Arc<OwnedFd>>,
+}
+```
+
+…and update `EpollDispatch::new` to initialize the new fields to `None`.
+
+In `wait_with_timeout`, after collecting events, drop the self-pipe wake-token from the returned set (the caller doesn't care about it) and drain any pending bytes from the read end:
+
+```rust
+// Drain self-pipe events from the returned set + the pipe itself.
+let mut filtered: Vec<EpollEvent> = Vec::with_capacity(out.len());
+for ev in out.drain(..) {
+    if ev.token == SELF_PIPE_TOKEN {
+        if let Some(read_end) = &self.read_end {
+            let mut scratch = [0u8; 64];
+            // SAFETY: non-blocking read; ignored result.
+            unsafe {
+                libc::read(
+                    read_end.as_raw_fd(),
+                    scratch.as_mut_ptr() as *mut _,
+                    scratch.len(),
+                );
+            }
+        }
+        continue;
+    }
+    filtered.push(ev);
+}
+*out = filtered;
+let observable_n = out.len();
+Ok(observable_n)
+```
+
+- [ ] **Step 4: Run all dispatch tests**
+
+```bash
+cargo test --lib network::epoll_dispatch
+```
+
+Expected: PASS for all four tests.
+
+- [ ] **Step 5: Commit**
+
+```bash
+git add src/network/epoll_dispatch.rs
+git commit -m "feat(network): EpollDispatch self-pipe wakeup
+
+Cloneable Waker writes one byte to a non-blocking pipe registered
+with EPOLLIN. wait_with_timeout filters self-pipe events out of
+the returned set and drains the pipe so subsequent waits don't
+spurious-wake."
+```
+
+---
+
+### Task 7: Wire `EpollDispatch` into `SlirpBackend`
+
+**Files:**
+- Modify: `src/network/slirp.rs` — `SlirpBackend` struct + `new` + `with_security`. 
+ +- [ ] **Step 1: Add the field** + +In the `SlirpBackend` struct definition (~line 450): + +```rust +pub struct SlirpBackend { + // ... existing fields ... + epoll: crate::network::epoll_dispatch::EpollDispatch, + epoll_waker: crate::network::epoll_dispatch::Waker, +} +``` + +In `SlirpBackend::with_security` (~line 503), after `flow_table` is initialized but before any flow is inserted: + +```rust +let mut epoll = crate::network::epoll_dispatch::EpollDispatch::new() + .map_err(|e| anyhow::anyhow!("EpollDispatch::new: {e}"))?; +let epoll_waker = epoll.waker(); +``` + +…then include `epoll`, `epoll_waker` in the struct literal. + +- [ ] **Step 2: Run unit tests; expect them to still pass (no behavior change yet)** + +```bash +cargo test --lib network::slirp +cargo test --test network_baseline +``` + +Expected: ALL PASS — `SlirpBackend` now owns an unused epoll_fd. + +- [ ] **Step 3: Commit** + +```bash +git add src/network/slirp.rs +git commit -m "refactor(slirp): SlirpBackend holds EpollDispatch + Waker + +Plumbed but not yet consumed. Subsequent tasks wire flow_table +mutations into epoll register/unregister and rewrite the relay +loops to dispatch on readiness." +``` + +--- + +### Task 8: TCP register/unregister on flow_table mutation + smoke test + +**Files:** +- Modify: `src/network/slirp.rs` — `handle_tcp_frame` (after `flow_table.insert`) and `relay_tcp_nat_data` (where `to_remove` entries are reaped). + +- [ ] **Step 1: Add a `flow_token_for_tcp` helper at module scope** + +Encoding: 8 bits of protocol tag (0x01 = TCP), 8 bits unused (zero), 16 bits guest_port, 32 bits packed (dst_port << 16) | (truncated dst_ip). For 100 % uniqueness across tag/port collisions, see follow-up — for now this 64-bit token is unique within the flow table because `NatKey` itself is unique. 
+ +```rust +const PROTO_TAG_TCP: u64 = 0x0100_0000_0000_0000; +const PROTO_TAG_UDP: u64 = 0x0200_0000_0000_0000; +const PROTO_TAG_ICMP: u64 = 0x0300_0000_0000_0000; + +fn flow_token_for_tcp(key: &NatKey) -> u64 { + let dst_ip_bytes = key.dst_ip.0; + let dst_ip_low: u64 = u64::from(u32::from_be_bytes(dst_ip_bytes)) & 0xFFFF_FFFF; + PROTO_TAG_TCP + | (u64::from(key.guest_src_port) << 32) + | (u64::from(key.dst_port) << 16) + | (dst_ip_low & 0xFFFF) +} +``` + +Symmetric helpers for UDP / ICMP land in Tasks 9 / 10. + +- [ ] **Step 2: After every `flow_table.insert(FlowKey::Tcp(...), FlowEntry::Tcp(entry))`, register the host_stream FD** + +For example in `handle_tcp_frame` (~line 1290 after insert): + +```rust +let token = flow_token_for_tcp(&key); +self.epoll + .register(entry.host_stream.as_raw_fd(), token, true, false) + .ok(); +self.epoll_waker.wake(); +``` + +…and in `process_pending_inbound_accepts` (line 648 area): + +```rust +self.flow_table.insert(FlowKey::Tcp(key), FlowEntry::Tcp(entry)); +let host_fd = match self.flow_table.get(&FlowKey::Tcp(key)) { + Some(FlowEntry::Tcp(e)) => e.host_stream.as_raw_fd(), + _ => unreachable!(), +}; +self.epoll.register(host_fd, flow_token_for_tcp(&key), true, false).ok(); +self.epoll_waker.wake(); +``` + +…and on every `flow_table.remove(&FlowKey::Tcp(...))` site, unregister first: + +```rust +if let Some(FlowEntry::Tcp(e)) = self.flow_table.get(&flow_key) { + self.epoll.unregister(e.host_stream.as_raw_fd()).ok(); +} +self.flow_table.remove(&flow_key); +``` + +(grep for every `flow_table.remove` and `flow_table.insert` site touching TCP — there are ~6.) + +- [ ] **Step 3: Run all baseline pins** + +```bash +cargo test --test network_baseline +``` + +Expected: PASS — no behavioral change yet (relay still re-peeks every flow on every tick). 
+
+- [ ] **Step 4: Commit**
+
+```bash
+git add src/network/slirp.rs
+git commit -m "feat(slirp): register TCP flows with EpollDispatch
+
+flow_table mutations now keep the epoll set in sync. No relay-loop
+change yet — Task 11 will switch the loop to dispatch by readiness
+instead of iterating the full table."
+```
+
+---
+
+### Task 9: UDP register/unregister + ICMP register/unregister
+
+Mirror Task 8 for `FlowKey::Udp` and `FlowKey::IcmpEcho` flow_table sites. Same shape: register on insert, unregister on remove. Use `PROTO_TAG_UDP` / `PROTO_TAG_ICMP` in the helpers.
+
+- [ ] **Step 1: Implement helpers and call sites**
+- [ ] **Step 2: Run baseline pins (PASS)**
+- [ ] **Step 3: Commit** with message `feat(slirp): register UDP + ICMP flows with EpollDispatch`
+
+---
+
+### Task 10: Flip `relay_tcp_nat_data` to event-driven
+
+**Files:**
+- Modify: `src/network/slirp.rs` — `relay_tcp_nat_data` body (~line 1512+).
+
+The current loop iterates *every* TCP entry in `flow_table` every tick. New shape: take the readiness set from a caller-passed `&[EpollEvent]`, look up the flow by `FlowKey`, only peek-relay readable flows.
+
+- [ ] **Step 1: Change signature**
+
+```rust
+fn relay_tcp_nat_data(&mut self, ready: &[EpollEvent]) {
+    let mut to_remove: Vec<FlowKey> = Vec::new();
+    let mut frames_to_inject: Vec<Vec<u8>> = Vec::new();
+
+    for event in ready {
+        if event.token & PROTO_TAG_MASK != PROTO_TAG_TCP {
+            continue;
+        }
+        // Decode token back to NatKey by linear scan — flow_table is
+        // small and the token-to-key direction is rare (only on
+        // readiness). Future optimization: keep a side index. 
+        let flow_key = match self.flow_table.iter().find_map(|(k, _)| {
+            if let FlowKey::Tcp(nat_key) = k {
+                if flow_token_for_tcp(nat_key) == event.token {
+                    return Some(*k);
+                }
+            }
+            None
+        }) {
+            Some(k) => k,
+            None => continue,
+        };
+
+        let Some(FlowEntry::Tcp(entry)) = self.flow_table.get_mut(&flow_key) else {
+            continue;
+        };
+        if entry.state != TcpNatState::Established {
+            continue;
+        }
+
+        // ... existing peek/relay body, unchanged from line 1549+ ...
+    }
+
+    self.inject_to_guest.append(&mut frames_to_inject);
+    for flow_key in to_remove {
+        if let Some(FlowEntry::Tcp(e)) = self.flow_table.get(&flow_key) {
+            self.epoll.unregister(e.host_stream.as_raw_fd()).ok();
+        }
+        self.flow_table.remove(&flow_key);
+    }
+}
+```
+
+Define `PROTO_TAG_MASK` next to the other tag constants:
+
+```rust
+const PROTO_TAG_MASK: u64 = 0xFF00_0000_0000_0000;
+```
+
+…and check `event.token & PROTO_TAG_MASK == PROTO_TAG_TCP`.
+
+- [ ] **Step 2: Update the caller in `drain_to_guest`**
+
+```rust
+pub fn drain_to_guest(&mut self, out: &mut Vec<Vec<u8>>) {
+    self.process_pending_inbound_accepts();
+    // ... ARP handling ...
+
+    // Phase 6.4: gather readiness events once per tick. The poll
+    // thread will already have driven a recent epoll_wait; here we do
+    // a non-blocking poll to pick up anything that arrived between
+    // the last wait and now.
+    let mut ready: Vec<EpollEvent> = Vec::new();
+    let _ = self.epoll.wait_with_timeout(&mut ready, Duration::ZERO);
+
+    self.resolve_pending_dns();
+    self.relay_tcp_nat_data(&ready);
+    self.relay_icmp_echo(&ready);
+    self.relay_udp_flows(&ready);
+
+    // ... unchanged collection of frames ...
+}
+```
+
+- [ ] **Step 3: Update `relay_icmp_echo` and `relay_udp_flows` signatures to `(&mut self, ready: &[EpollEvent])`** with parallel filtering by `PROTO_TAG_ICMP` / `PROTO_TAG_UDP`. 
+
+- [ ] **Step 4: Run baseline pins**
+
+```bash
+cargo test --test network_baseline
+```
+
+Expected: PASS — the `wait_with_timeout(Duration::ZERO)` non-blocking poll captures any ready FD between vCPU calls; the relay still works.
+
+- [ ] **Step 5: Commit**
+
+```bash
+git add src/network/slirp.rs
+git commit -m "feat(slirp): relay loops dispatch by epoll readiness
+
+drain_to_guest non-blocking-polls the epoll set once per tick and
+passes the ready event list to relay_tcp_nat_data /
+relay_udp_flows / relay_icmp_echo, which now skip non-ready flows
+instead of re-peeking the whole table. Behavior unchanged on
+hot-path; per-tick CPU should drop on idle systems with many
+flows."
+```
+
+---
+
+### Task 11: Rewrite `net_poll_thread` to use `epoll_wait`
+
+**Files:**
+- Modify: `src/vmm/mod.rs:1599-1640`.
+
+- [ ] **Step 1: Replace the `sleep(5ms)` loop**
+
+The current loop:
+
+```rust
+while running.load(Ordering::Relaxed) {
+    std::thread::sleep(std::time::Duration::from_millis(5));
+    // ... try_inject_rx + irq ...
+}
+```
+
+Becomes (pseudocode — exact integration with the device-lock pattern needs care):
+
+```rust
+while running.load(Ordering::Relaxed) {
+    // Acquire the SlirpBackend's waker once at startup; use it as
+    // the shutdown signaling channel too.
+    let mut events: Vec<EpollEvent> = Vec::new();
+    {
+        let guard = match net_dev.lock() {
+            Ok(g) => g,
+            Err(_) => continue,
+        };
+        // Borrow epoll for the wait; see Step 2 for the API on
+        // VirtioNetDevice that exposes it without holding the
+        // device lock during epoll_wait.
+        let _ = guard.poll_epoll(&mut events, Duration::from_millis(50));
+    }
+    // ... try_inject_rx + irq, unchanged ...
+}
+```
+
+The challenge: `epoll_wait` blocks for up to 50 ms; we cannot hold the device lock that whole time (vCPU would stall on next TX). Solution: `VirtioNetDevice::poll_epoll` clones the `epoll` into an `Arc<Mutex<EpollDispatch>>` (or similar) and the wait happens *outside* the device lock. 
+
+- [ ] **Step 2: Refactor the lock granularity**
+
+In `src/network/slirp.rs`, change:
+
+```rust
+epoll: EpollDispatch,
+```
+
+to:
+
+```rust
+epoll: std::sync::Arc<std::sync::Mutex<EpollDispatch>>,
+```
+
+…and update all `self.epoll.register(...)` to `self.epoll.lock().unwrap().register(...)`. Provide a clone-of-Arc accessor:
+
+```rust
+pub fn epoll_arc(&self) -> std::sync::Arc<std::sync::Mutex<EpollDispatch>> {
+    Arc::clone(&self.epoll)
+}
+```
+
+The poll thread holds an `Arc<Mutex<EpollDispatch>>`, calls `wait_with_timeout` while holding that lock, and *not* the device lock.
+
+- [ ] **Step 3: Run baseline + integration tests**
+
+```bash
+cargo test --workspace --all-features
+cargo test --test network_baseline
+```
+
+Expected: all PASS.
+
+- [ ] **Step 4: Run the BROKEN_ON_PURPOSE pin from Task 2 — it should now flip to PASS**
+
+```bash
+cargo test --test network_baseline tcp_rx_latency_sub_5ms
+```
+
+Expected: **PASS** with measured latency < 5 ms (likely sub-millisecond).
+
+- [ ] **Step 5: Commit**
+
+```bash
+git add src/network/slirp.rs src/vmm/mod.rs
+git commit -m "feat(vmm): net_poll_thread driven by epoll_wait
+
+Replaces the 5 ms sleep cycle with epoll_wait(timeout=50ms). When
+host data arrives, the poll thread wakes within microseconds and
+drives drain_to_guest immediately. When idle, the thread wakes
+once every 50 ms for housekeeping (UDP/ICMP idle reaping) — a
+10x reduction in wakeup duty cycle vs the previous 5 ms timer.
+
+Phase 6.4 BROKEN_ON_PURPOSE pin tcp_rx_latency_sub_5ms flips to
+passing here."
+```
+
+---
+
+### Task 12: Snapshot rebuild test + implementation
+
+**Files:**
+- Modify: `src/vmm/mod.rs` (snapshot/restore paths) and `src/network/slirp.rs` (`from_snapshot`-shaped constructor).
+
+- [ ] **Step 1: Run the existing snapshot integration suite to confirm baseline**
+
+```bash
+export VOID_BOX_KERNEL=/boot/vmlinuz-$(uname -r)
+export VOID_BOX_INITRAMFS=/tmp/void-box-test-rootfs.cpio.gz
+cargo test --test snapshot_integration -- --ignored --test-threads=1
+```
+
+Expected: PASS (Phase 0–5 baseline). 
If it doesn't pass on this branch's tip pre-6.4, fix before continuing — this gate is non-negotiable. + +- [ ] **Step 2: Write the new test pin** + +In `tests/network_baseline.rs`: + +```rust +/// Phase 6.4 contract: snapshot/restore must rebuild the epoll +/// dispatch from flow_table contents. After a round-trip, the +/// backend has zero registered flows in epoll if flow_table was +/// non-empty pre-snapshot — that's the bug we want to catch. +#[test] +fn epoll_set_rebuilt_on_restore_smoke() { + // Construct backend, open one TCP flow (handshake), serialize + // the flow_table, drop the backend, build a fresh backend and + // inject the serialized flow_table. Verify the new backend's + // epoll set has the flow's host_fd registered. + // ... (full test code) ... +} +``` + +The detailed body is omitted here — write it referencing the snapshot helpers in `src/vmm/snapshot.rs` and the existing `from_snapshot` shape. Verify by checking the count of registered FDs (add a `#[cfg(test)] pub fn registered_fd_count(&self) -> usize` to `EpollDispatch`). + +- [ ] **Step 3: Run, expect FAIL** + +The current snapshot path has no rebuild step; the count is 0. + +- [ ] **Step 4: Implement rebuild in the snapshot deserialization path** + +Wherever `from_snapshot` reconstructs the `SlirpBackend` (likely in `src/vmm/mod.rs` around line 690 area where snapshots are restored), after the flow_table is rebuilt from the snapshot bytes, iterate it and call `epoll.register` for each entry's host FD. + +- [ ] **Step 5: Run new test + integration suite** + +```bash +cargo test --test network_baseline epoll_set_rebuilt +cargo test --test snapshot_integration -- --ignored --test-threads=1 +``` + +Expected: PASS. + +- [ ] **Step 6: Commit** + +```bash +git add tests/network_baseline.rs src/network/slirp.rs src/vmm/mod.rs +git commit -m "feat(slirp): rebuild epoll set on snapshot restore + +epoll_fd is a kernel handle and cannot serialize. 
After +flow_table is reconstructed from snapshot bytes, register every +host FD with a fresh EpollDispatch." +``` + +--- + +### Task 13: Bench the win + perf gate + +**Files:** +- Modify: `benches/network.rs` — add `tcp_rx_latency_one_packet`. +- Modify: `src/bin/voidbox-network-bench/main.rs` — add `tcp_rx_latency_us_p50` measurement. + +- [ ] **Step 1: Add divan microbench** + +In `benches/network.rs`, add: + +```rust +/// Phase 6.4 baseline: time from "host write returns" to "guest +/// sees data in drain_to_guest output". Pre-6.4 this was bounded +/// below by the 5 ms net_poll_thread cycle; post-6.4 epoll +/// dispatch should deliver in microseconds. +#[divan::bench] +fn tcp_rx_latency_one_packet(bencher: Bencher) { + // ... handshake setup outside the timed loop ... + bencher.bench_local(|| { + // Host writes; measure how fast the bytes appear in the + // SlirpBackend's drain output. + }); +} +``` + +Full implementation: harness similar to `tcp_inbound_syn_ack_transition` shape — use `bench-helpers` feature for synthetic flow seeding, drive the data path inside the timed closure. + +- [ ] **Step 2: Add wall-clock measurement to `voidbox-network-bench`** + +In `src/bin/voidbox-network-bench/main.rs`, add a `tcp_rx_latency_us_p50` field to `Report` and a `measure_rx_latency` function that boots a VM, opens a guest→host flow, has the host write small packets, and measures host-T0-to-guest-arrival via the SLIRP relay. + +- [ ] **Step 3: Run the perf gate against `origin/main`** + +```bash +scripts/bench-compare.sh --baseline origin/main --skip-vm > /tmp/phase6.4-vs-main.md +cat /tmp/phase6.4-vs-main.md +``` + +Validate per the hard performance gate at the top of this plan: + +- Every comparable bench: HEAD ≤ baseline + 5 %. +- `tcp_rx_latency_one_packet` (HEAD-only) shows a sub-millisecond median. 
+- `port_forward_accept_latency` improves by ≥ 30 %, *or* document why it stays (likely the listener accept thread is still on the 50 ms cycle — fixing it is a small follow-up step in Phase 6.4 itself or its own task; decide before committing). + +- [ ] **Step 4: If `port_forward_accept_latency` doesn't improve, add a fix-up sub-task** to also move the listener accept onto epoll. The plan permits this — see Architecture notes. + +- [ ] **Step 5: Commit benches + the perf-gate output** + +```bash +git add benches/network.rs src/bin/voidbox-network-bench/main.rs +git commit -m "bench(network): tcp_rx_latency_one_packet + voidbox-network-bench p50 + +Captures the Phase 6.4 win numerically. Pre-6.4 RX latency was +bounded below by the 5 ms net_poll_thread cycle; post-6.4 epoll +dispatch lands in microseconds. + +scripts/bench-compare.sh --baseline origin/main --skip-vm output +attached as /tmp/phase6.4-vs-main.md (not committed; consult the +PR description for the table)." +``` + +--- + +### Task 14: Phase 6.4 validation gate + +- [ ] **Step 1: Standard validation contract** (per `AGENTS.md`) + +```bash +cargo fmt --all -- --check +cargo clippy --workspace --all-targets --all-features -- -D warnings +cargo test --workspace --all-features +cargo test --doc --workspace --all-features +``` + +All must pass. 
+ +- [ ] **Step 2: VM suites** + +```bash +export VOID_BOX_KERNEL=/boot/vmlinuz-$(uname -r) +export VOID_BOX_INITRAMFS=/tmp/void-box-test-rootfs.cpio.gz +cargo test --test conformance -- --ignored --test-threads=1 +cargo test --test oci_integration -- --ignored --test-threads=1 +cargo test --test snapshot_integration -- --ignored --nocapture --test-threads=1 +cargo test --test e2e_telemetry -- --ignored --test-threads=1 +cargo test --test e2e_skill_pipeline -- --ignored --test-threads=1 +cargo test --test e2e_mount -- --ignored --test-threads=1 +cargo test --test e2e_service_mode -- --ignored --test-threads=1 +cargo test --test e2e_sidecar -- --ignored --test-threads=1 +``` + +All must pass. + +- [ ] **Step 3: aarch64 cross-check** + +```bash +CFLAGS_aarch64_unknown_linux_gnu="--sysroot=/usr/aarch64-redhat-linux/sys-root/fc43" \ + RUSTFLAGS="-D warnings" \ + cargo check --target aarch64-unknown-linux-gnu -p void-box --lib --tests +``` + +- [ ] **Step 4: Hard perf gate** + +```bash +scripts/bench-compare.sh --baseline origin/main --skip-vm +``` + +Validate against the contract at the top of this plan. **The PR is not allowed to merge** until this passes. + +- [ ] **Step 5: Commit gate evidence in the PR description (no commit needed)** + +Capture the bench-compare output in the PR body. Phase 6.4 PR is then ready for review. + +--- + +## Rollback plan + +Each task lands as one commit. If Task N introduces a regression caught at Task M (where M > N), `git revert` Task N's commit and redispatch its implementer with the failure context. No task irreversibly changes wire format or snapshot layout — every change is additive (new fields, new module) or behavior-preserving refactor. + +The only exception is the snapshot rebuild path (Task 12). If that's wrong on disk, restored backends will have a fresh-but-empty epoll set and connections will appear hung. Test the snapshot path *before* claiming Task 12 done. 
+ +## Out of scope (deferred to Phase 6.1 / 6.2 / 6.3) + +- TCP half-close — Phase 6.1. +- Async outbound `connect` — Phase 6.2 (will *consume* the epoll dispatch primitive added here for `EPOLLOUT` writability detection). +- Window management — Phase 6.3. + +## Reviewer pointers + +- **Lock granularity:** verify `epoll_wait` does not happen under the device lock (Task 11 Step 2). +- **FD lifecycle:** every `flow_table.insert` has a matching `epoll.register`; every `flow_table.remove` has a matching `epoll.unregister`. grep for both pairs and pair-count. +- **Self-pipe correctness:** `Waker::wake` is no-block, no-allocate, signal-safe-adjacent. +- **Snapshot rebuild:** Task 12's test is the contract; verify the count helper is `#[cfg(test)]` only. +- **Token uniqueness:** `flow_token_for_tcp` is unique within the flow table because `NatKey` is unique. The 16-bit dst_ip truncation is intentional for v4-only addresses on a /16 SLIRP subnet — collisions with foreign IPs are not possible because all flows route through the gateway. + +## Document history + +- 2026-04-30: initial plan written, hard performance gate locked. diff --git a/docs/superpowers/plans/2026-04-30-smoltcp-passt-port-phase6.md b/docs/superpowers/plans/2026-04-30-smoltcp-passt-port-phase6.md new file mode 100644 index 00000000..913e1e96 --- /dev/null +++ b/docs/superpowers/plans/2026-04-30-smoltcp-passt-port-phase6.md @@ -0,0 +1,286 @@ +# Phase 6: TCP Lifecycle + Async Connect + Window Mgmt + Event-Driven Polling + +> **Status:** Overview (scope + design). Per-subsystem TDD task lists are deferred to dedicated plans (`-phase6.1.md`, `-phase6.2.md`, `-phase6.3.md`, `-phase6.4.md`) written before each is implemented. This document scopes the work, locks invariants, and lists validation gates so each sub-plan can be reviewed against a stable target. + +> **For agentic workers:** This is an **overview**, not an executable plan. Do not run subagent-driven-development against this file. 
When picking up a sub-area, write its own plan first. + +**Goal:** Close the four architectural gaps surfaced in the `smoltcp-passt-port-phase0` PR review without regressing any Phase 0–5 baseline. + +**Architecture:** Each sub-area imports a specific passt design pattern adapted to our `cfg(target_os = "linux")` SLIRP backend; none requires a backend split. The relay loop in `SlirpBackend::drain_to_guest` stays the single net-poll dispatch point; the changes layer onto its existing flow_table / inject_to_guest pipeline. + +**Tech stack:** smoltcp 0.11 wire types, `std::net::TcpStream` (non-blocking), Linux `epoll` (Phase 6.4), no new crates. + +--- + +## Background + +Reviewer findings on the smoltcp-passt-port PR (April 2026) — three "Medium" or higher and one "Medium-Low" architectural gap. All four were verified VALID against current code. Quick-fix correctness items (Copilot review) are addressed on the same PR; this Phase 6 plan covers the architecture-shaped follow-ups. + +Reference: `docs/superpowers/plans/2026-04-27-smoltcp-passt-port.md` (top-level spec, observability invariant), Phase 0–5 plans (architectural decisions established by prior phases). + +## Invariants (carried from earlier phases — non-negotiable) + +These are locked from the top-level spec. Phase 6 changes must preserve all of them. + +1. **Full observability.** Every TCP/UDP/ICMP frame and every state transition remains traceable through tracing logs. No opaque C-process or kernel-side magic. If a new subsystem hides state inside the kernel (e.g. epoll), tracing must still expose what the host saw and when. +2. **All-Rust path.** No new C dependencies, no FFI beyond what `libc` already provides. `epoll`-via-`libc` is acceptable; a new crate that opaques it is not, unless the crate is already in the workspace. +3. **Cross-platform discipline.** SLIRP itself is Linux-only (`#[cfg(target_os = "linux")]` in `Cargo.toml`). Phase 6 stays inside that gate. 
macOS uses VZ's built-in NAT; Phase 6 does not affect it.
+4. **No regression in Phase 0–5 baselines.** `bench-compare.sh --baseline <ref>` must show every existing bench at ±5% or better. New benches added in Phase 6 may legitimately move the baseline, but the existing comparable set holds.
+5. **Snapshot/restore correctness.** `snapshot_integration` must continue to pass. Any new state (e.g. half-close timers, async connect futures) added to `TcpNatEntry` must round-trip through serde or be rebuilt from `TcpStream` state on restore — not silently dropped.
+6. **No bench-mode-only fixes.** Behavior changes go in production code paths, not behind `#[cfg(test)]` or feature flags. Tests/benches consume the same paths the guest does.
+
+## Sub-areas
+
+Four independent sub-areas, four sub-plans. Order is by reviewer-assigned severity, not by required ordering — they can land in any sequence as long as their individual validation gates hold.
+
+---
+
+### 6.1 — TCP half-close (A1, High)
+
+**Severity:** High (correctness gap, not just performance).
+
+**Current state:**
+
+- `TcpNatState` at `src/network/slirp.rs:131-144` declares `FinWait1`, `FinWait2`, `CloseWait`, `LastAck` variants but they are unused. The enum carries `#[allow(dead_code)]` on line 130 to mute the resulting warnings.
+- Guest FIN handler at `src/network/slirp.rs:1483-1500`: on receiving guest FIN, the stack immediately sends a FIN+ACK back to the guest and marks the entry `Closed` in the same call. There is no transition through `FinWait*` or `CloseWait`. The host-side `TcpStream` is dropped at the next `relay_tcp_nat_data` sweep when the entry is reaped.
+
+**The bug this enables:**
+
+When the guest's application closes the write side of a socket but expects to keep reading the host's response (the half-close pattern used by HTTP request bodies, SMTP DATA, anything with `shutdown(SHUT_WR)`), VoidBox slams the connection shut both directions. 
The host side never gets to flush its remaining response; the guest's read returns EOF prematurely. This is silent data loss for any protocol that uses orderly half-close. + +**Reference:** passt's `tcp.c` ([passt/tcp.c:238](https://passt.top/passt/tree/tcp.c#n238), [tcp.c:401](https://passt.top/passt/tree/tcp.c#n401)) tracks the four half-close states explicitly with timer-bounded transitions. + +**Target state:** + +- Guest FIN sets `state = FinWait1` (we still owe the host a half-close), shuts down the host socket's write side via `TcpStream::shutdown(Shutdown::Write)`, and ACKs the guest's FIN — but **does not** send our own FIN yet. +- When the host returns EOF (zero-byte read on the established connection) and the relay queue is drained, send our FIN to the guest, transition to `LastAck`. +- On guest's final ACK, transition to `Closed` and reap. +- The mirror pattern handles the host-initiated close: host EOF first → state goes to `CloseWait` (we owe the guest a FIN), continue forwarding any guest writes to the host, eventually send FIN to guest → `LastAck` → reap on ACK. +- Add a `LAST_ACK_TIMEOUT` (suggest 60 s, mirroring TCP MSL × 2) so a missing final ACK doesn't leak entries. + +**Test requirements:** + +- New `tests/network_baseline.rs` pin `tcp_half_close_guest_writes_first`: guest sends data, FIN; host reads data, replies with more data, then FIN. Assert: guest sees the host's post-FIN data **and** its FIN, in that order. Pre-Phase-6.1 this would fail (host data dropped). +- New pin `tcp_half_close_host_writes_first`: symmetric — host sends data, FIN; guest replies, FIN. Assert ordering. +- New pin `tcp_last_ack_timeout_reaps_stale_entry`: synthesize a `LastAck` entry with `last_activity` deep in the past; one `drain_to_guest` cycle later assert the entry is gone. +- `snapshot_integration`: round-trip a connection in `CloseWait` state. 
Assert post-restore the state is preserved (or, if we choose not to serde the half-close states, that the connection cleanly closes within `LAST_ACK_TIMEOUT`). + +**Validation gates (in addition to the global ones below):** + +- `cargo test --test network_baseline tcp_half_close_*` +- `cargo test --test snapshot_integration -- --ignored --test-threads=1` + +**File impact:** + +- `src/network/slirp.rs` — `handle_tcp_frame` FIN/RST arms (~lines 1483–1506), `relay_tcp_nat_data` (~line 1512+), `TcpNatEntry` (add half-close timer field if needed). +- `tests/network_baseline.rs` — three new pins. +- No changes to public API. + +--- + +### 6.2 — Async outbound connect (A2, Medium-High) + +**Severity:** Medium-High (correctness + UX gap). + +**Current state:** + +- `src/network/slirp.rs:1271`: on guest SYN, `handle_tcp_frame` calls `TcpStream::connect_timeout(&dst_addr, Duration::from_secs(3))` **synchronously**. +- `handle_tcp_frame` is called from `process_guest_frame` (~line 664), which is called from the virtio-net TX path (`src/devices/virtio_net.rs:~656`). +- The TX path runs on the vCPU thread under the device lock. A 3 s blocking connect to an unreachable destination stalls **all** guest networking — including unrelated connections — for the duration of the timeout. + +**The bug this enables:** + +A guest that opens connections to multiple destinations, one of which is slow or unreachable, sees the entire host networking pipeline freeze for 3 s every time it tries that destination. Long-running guests with sporadic dead destinations (DNS misconfigurations, transient NAT failures) suffer noticeable hitches. + +**Reference:** passt is fully event-driven — connect dispatches to a worker, completion arrives via epoll on the connecting socket's writability ([passt/tcp.c:2785](https://passt.top/passt/tree/tcp.c#n2785)). 
+ +**Target state:** + +- On guest SYN: create a non-blocking socket (`TcpStream::connect` with `O_NONBLOCK`, or `socket2::Socket::new` + `connect_with_timeout` driven by us), insert a new state `Connecting` into `TcpNatState`, queue an entry in `flow_table` with the connecting socket. Return immediately to the vCPU thread. +- The net-poll thread polls the connecting socket on each tick (writability-check via `poll`/`select`/`epoll` — coordinate with 6.4). On readiness: + - Check `getsockopt(SOL_SOCKET, SO_ERROR)` — zero means connected, non-zero means failed. + - On success: transition `Connecting → SynReceived`, send SYN-ACK to the guest. + - On failure: send RST to the guest, reap the entry. + - On still-pending after `CONNECT_TIMEOUT` (3 s, matching today's behavior): treat as failure. +- vCPU thread is now never blocked on `connect`. + +**Test requirements:** + +- New pin `tcp_connect_to_unreachable_does_not_block_other_flows`: open one flow to a known-good destination, one to a deliberately-unreachable destination, both in quick succession. Measure time from guest SYN to host accepting the good-destination flow. Pre-6.2 this would be ~3 s (waiting for the bad one); post-6.2 it should be sub-millisecond. +- New pin `tcp_connect_async_eventual_rst_on_failure`: synthesize a connect to an unreachable address; drive `drain_to_guest` for >3 s; assert the guest receives RST. +- Bench: `bench/network.rs` add `process_syn_during_pending_connects` parametric on N pending connecting flows. Validates O(1) cost on guest TX path regardless of pending-connect backlog. + +**Validation gates:** + +- `cargo test --test network_baseline tcp_connect_*` +- `cargo bench --bench network process_syn_during_pending_connects` + +**File impact:** + +- `src/network/slirp.rs` — `TcpNatState` (add `Connecting`), `handle_tcp_frame` SYN arm (lines ~1267–1290), new `relay_pending_connects` method called from `drain_to_guest` (parallel to `relay_tcp_nat_data`). 
+- `tests/network_baseline.rs` — two new pins. +- `benches/network.rs` — one new bench. +- Snapshot interaction: `Connecting` state must serde correctly; restore should drop `Connecting` flows (reconnect from scratch is acceptable, deferred to Phase 6.1's MSL-bounded timer). + +--- + +### 6.3 — TCP window management (A3, Medium) + +**Severity:** Medium (perf gap, throughput left on the table). + +**Current state:** + +- `src/network/slirp.rs:1927`: `build_tcp_packet_static` always emits `window_len: TCP_WINDOW (65535)`, `window_scale: None`. +- No code reads `tcp.window_len()` from incoming guest frames. The guest's advertised window is ignored entirely. + +**Why this matters:** + +- The guest's TCP stack negotiates a window with us. We send "always 65535" regardless of what the guest can actually buffer. This is wrong both directions: + - Inbound (host→guest): we relay host data into our `inject_to_guest` queue without ever asking whether the guest still has receive buffer. If the guest is slow, our queue grows unbounded — Phase 3 partially mitigated this with peek-based reads, but window-aware backpressure would be cleaner. + - Outbound (guest→host): the guest sends respecting our advertised window (always 65535). On modern guests with `tcp_window_scaling=1` (the default), this caps effective throughput at 64 KB / RTT regardless of available bandwidth. +- The `window_scale: None` means we never negotiate scaling on SYN. Even if we tracked windows, we'd be capped at 64 KB. + +**Reference:** passt's `tcp_conn` ([passt/tcp_conn.h:21](https://passt.top/passt/tree/tcp_conn.h#n21)) tracks `wnd_from_tap`, `wnd_to_tap`, scale factors, and updates ACK/window per [tcp.c:1021](https://passt.top/passt/tree/tcp.c#n1021), [tcp.c:1426](https://passt.top/passt/tree/tcp.c#n1426). + +**Target state:** + +- On SYN/SYN-ACK exchange, negotiate `window_scale: Some(7)` (128× scale factor — passt's default). `TcpNatEntry` records the negotiated scale. 
+- On every guest packet, read `tcp.window_len()` and update `entry.guest_window` (after applying scale). Use this to bound the host→guest send rate: never push more bytes through `inject_to_guest` than the guest's effective receive window allows. +- On every host-side relay, set our outgoing `window_len` based on host kernel state — `getsockopt(TCP_INFO).tcpi_rcv_space` gives kernel-side receive buffer headroom; advertise that, scaled. +- Drop the hardcoded `TCP_WINDOW = 65535` constant. + +**Test requirements:** + +- New pin `tcp_advertised_window_tracks_guest_buffer`: synthesize a guest with a small advertised window (say 4096); push 64 KB of data from host; assert that `inject_to_guest` never holds more than ~`window` unacknowledged bytes. +- New pin `tcp_window_scale_negotiated_in_syn`: parse the SYN-ACK we send to the guest; assert it includes `window_scale: Some(7)`. +- Bench: extend `tcp_bulk_throughput_1mb` to also run with a constrained-window receiver (`SO_RCVBUF=16384`); pre-6.3 throughput will be 64 KB / RTT bound; post-6.3 should be substantially higher because we'll let the guest send larger bursts when host kernel space allows. + +**Validation gates:** + +- `cargo test --test network_baseline tcp_advertised_window_*` +- `cargo bench --bench network tcp_bulk_throughput_*` — assert no regression, and ideally improvement at small `SO_RCVBUF`. + +**File impact:** + +- `src/network/slirp.rs` — `TcpNatEntry` (add `guest_window`, `guest_window_scale`), `build_tcp_packet_static` signature (take advertised window from caller), `handle_tcp_frame` (read incoming window), `relay_tcp_nat_data` (gate sends on guest window). +- `tests/network_baseline.rs` — two new pins. +- `benches/network.rs` — one new bench arm. + +--- + +### 6.4 — Event-driven RX polling (A4, Medium-Low) + +**Severity:** Medium-Low (efficiency, not correctness). 
+ +**Current state:** + +- `src/vmm/mod.rs:1599` — `net_poll_thread` wakes every 5 ms (`std::thread::sleep(Duration::from_millis(5))` at line 1609). +- `src/network/slirp.rs:1549` — `relay_tcp_nat_data` re-peeks a 64 KiB buffer on every connected TCP socket every tick, regardless of whether new data has arrived. + +**Why this matters:** + +- 200 polls/second on every connected flow, even when idle. With many flows this is wasted CPU. +- 5 ms granularity means tail latency for any RX event is bounded below by ~5 ms even if data arrived microseconds after the last poll. For latency-sensitive workloads this is the floor. + +**Reference:** passt uses epoll-driven socket readiness ([passt/tcp.c:463](https://passt.top/passt/tree/tcp.c#n463)) with optional `SO_PEEK_OFF` — the syscall returns the readable list, no polling needed. + +**Target state:** + +- Replace the 5 ms timer with `epoll_wait` on a Linux `epoll_fd` that owns all of: + - the connected `TcpStream`s in `flow_table` (registered with `EPOLLIN`) + - the connecting sockets from Phase 6.2 (registered with `EPOLLOUT`) + - the UDP flow sockets (Phase 2) + - the ICMP echo socket (Phase 1) + - a `pipe(2)` self-pipe for inter-thread wakeup (so `process_guest_frame` can request an out-of-band poll cycle when it adds a new flow). +- `epoll_wait` timeout: short (say 50 ms) just as a safety net for periodic housekeeping (LAST_ACK_TIMEOUT sweeps, idle UDP flow reaping). The hot path is event-driven. +- Each socket's `epoll_data` carries its `FlowKey` so the readiness handler can dispatch directly without iterating the full table. + +**Caveats:** + +- This sub-area is **Linux-specific** (`epoll`). The SLIRP backend itself is already Linux-only, so this fits, but the implementation should isolate epoll inside a `mod epoll_dispatch` so a future portable backend (e.g. BSD `kqueue`) can plug in a different reactor. +- Snapshot/restore: an `epoll_fd` does not survive snapshot (it's a kernel-side handle on real fds). 
Restore must rebuild the epoll set from scratch from `flow_table` contents — no serde required for the `epoll_fd` itself. + +**Test requirements:** + +- New pin `tcp_rx_latency_sub_5ms_when_data_available`: send data from host to a connected guest flow; measure host→guest delivery latency. Pre-6.4 this is bounded below by 5 ms (the timer cycle); post-6.4 it should be sub-millisecond on a quiet system. +- Bench: existing `port_forward_accept_latency` should *improve* — it's currently bounded by a 50 ms listener-poll cycle, but if 6.4 also moves the listener accept onto epoll, the median should drop substantially. +- `snapshot_integration`: verify rebuild-on-restore works (no FD leak, all flows still relay). + +**Validation gates:** + +- `cargo test --test network_baseline tcp_rx_latency_*` +- `cargo bench --bench network port_forward_accept_latency` — should regress *favorably* (faster). +- `cargo test --test snapshot_integration -- --ignored` + +**File impact:** + +- `src/vmm/mod.rs` — `net_poll_thread` rewrite to use `epoll_wait` (~lines 1599–1640). +- `src/network/slirp.rs` — new `mod epoll_dispatch`, `SlirpBackend` holds the `epoll_fd`, `flow_table` insertions/removals add/remove from epoll. +- New constants for the epoll wakeup pipe. + +--- + +## Cross-cutting concerns + +### Bench discipline + +Every sub-area must add at least one bench (microbench in `benches/network.rs` and/or wall-clock metric in `voidbox-network-bench`) that captures the win or proves no regression. `bench-compare.sh --baseline <ref>` must run cleanly before each sub-area's PR is merged. Shared protocol: each sub-area's PR description includes the bench-compare table. + +### Observability + +Every state transition added (Connecting, FinWait*, CloseWait, LastAck, window updates, epoll readiness) emits a `tracing::trace!` or `tracing::debug!` line keyed on the relevant `FlowKey`. No silent state changes. This matches the observability invariant.
 + +### Test image + +No new test-image requirements expected. All new e2e pins should be expressible against the existing initramfs (BusyBox + claudio). + +### Phase ordering + +Logically sensible order is **6.4 → 6.2 → 6.1 → 6.3** (epoll first to give 6.2 its readiness primitive, async connect next to remove vCPU stalls, half-close once we have proper per-flow event handling, window mgmt last as the polish layer). However, the validation gates per sub-area are independent; any order that passes all gates is acceptable. + +## Validation gates (global, every sub-area) + +The standard validation contract from `AGENTS.md` applies. In addition: + +``` +# 1. Phase 0–5 baselines hold. +scripts/bench-compare.sh --baseline <ref> --skip-vm + +# 2. All Phase 6.X test pins pass. +cargo test --test network_baseline -- --ignored --test-threads=1 + +# 3. Snapshot integration intact. +cargo test --test snapshot_integration -- --ignored --test-threads=1 + +# 4. Cross-platform compile. +cargo check --workspace --exclude guest-agent --all-targets --all-features # macOS shape + +# 5. aarch64 cross-check (per AGENTS.md "aarch64 cross-check" section). +``` + +## Out of scope + +- IPv6 (deferred from earlier phases; would be its own Phase 7). +- TCP options beyond MSS and window-scale (SACK, timestamps, ECN). Possible future work but not Phase 6. +- vsock-over-SLIRP (orthogonal subsystem). +- A passt head-to-head benchmark suite (deferred separate task — needs passt+qemu reference env). + +## Reviewer pointers + +When a sub-area's plan and PR land, the review focus per area: + +- **6.1**: half-close transitions and `LAST_ACK_TIMEOUT` reaping. Verify no FD leaks under repeated open-close-open patterns. Verify snapshot interaction. +- **6.2**: vCPU thread is never blocked on connect under any input. Verify timing of the "unreachable destination doesn't stall good destination" pin. +- **6.3**: window scale negotiation in SYN/SYN-ACK frames. 
Verify advertised window tracks guest buffer state on tracing logs. +- **6.4**: epoll FD lifecycle (register/unregister on flow_table mutation), wakeup-pipe correctness, snapshot rebuild path. + +## Open questions + +- **6.3:** what window-scale factor to advertise? passt uses 7 (128×). We could be more conservative (say 5 = 32×) initially. Decide in 6.3's plan. +- **6.4:** should the epoll wakeup pipe also carry the new-flow `FlowKey` so the poll thread can `epoll_ctl(EPOLL_CTL_ADD, ...)` itself, vs. doing it under the SlirpBackend lock from the vCPU thread? Tradeoff is lock granularity vs. message-passing complexity. Decide in 6.4's plan. + +--- + +## Document history + +- 2026-04-30: initial overview written, scope locked from PR review on `smoltcp-passt-port-phase0` branch. diff --git a/guest-agent/src/main.rs b/guest-agent/src/main.rs index b42bd092..8fc36c59 100644 --- a/guest-agent/src/main.rs +++ b/guest-agent/src/main.rs @@ -411,6 +411,11 @@ fn main() { if std::process::id() == 1 { if network_enabled_from_cmdline() { setup_network(); + // Allow unprivileged ICMP sockets for all GIDs so non-root + // processes (uid=1000 sandbox user) can call ping without + // CAP_NET_RAW. Mirrors the default on most desktop Linux + // distributions (ping_group_range = 0 2147483647). + let _ = std::fs::write("/proc/sys/net/ipv4/ping_group_range", "0\t2147483647\n"); // Install the host-provided network deny list *once* at boot, // before any guest command can run. This closes the window // between network bring-up and the first exec call, and avoids diff --git a/scripts/bench-compare.sh b/scripts/bench-compare.sh new file mode 100755 index 00000000..217480a0 --- /dev/null +++ b/scripts/bench-compare.sh @@ -0,0 +1,469 @@ +#!/usr/bin/env bash +# bench-compare.sh — compare HEAD bench results against an arbitrary baseline ref. +# +# Harnesses: +# 1. divan microbenches: cargo bench --bench network --features bench-helpers +# 2. 
VM wall-clock harness: cargo run --release --bin voidbox-network-bench + +# Output: markdown report to stdout (or --output FILE). +# See AGENTS.md for harness descriptions and JSON field definitions. + +set -euo pipefail + +# --------------------------------------------------------------------------- +# Helpers +# --------------------------------------------------------------------------- + +info() { printf '%s\n' "$*" >&2; } + +usage() { + cat >&2 <<'EOF' +Usage: scripts/bench-compare.sh [OPTIONS] + +Compare HEAD bench results against an arbitrary baseline git ref. + +Options: + --baseline <ref> Git ref (commit SHA, branch, tag) to compare against. + Default: merge-base with origin/main. + --output <FILE> Write markdown report to FILE instead of stdout. + --skip-vm Skip the voidbox-network-bench VM harness. + --skip-divan Skip the cargo bench --bench network divan harness. + -h, --help Show this help and exit. +EOF +} + +die() { info "ERROR: $*"; exit 1; } + +# --------------------------------------------------------------------------- +# Argument parsing +# --------------------------------------------------------------------------- + +BASELINE_REF="" +OUTPUT_FILE="" +SKIP_VM=0 +SKIP_DIVAN=0 + +while [[ $# -gt 0 ]]; do + case "$1" in + --baseline) + [[ $# -ge 2 ]] || die "--baseline requires an argument" + BASELINE_REF="$2"; shift 2 ;; + --output) + [[ $# -ge 2 ]] || die "--output requires an argument" + OUTPUT_FILE="$2"; shift 2 ;; + --skip-vm) + SKIP_VM=1; shift ;; + --skip-divan) + SKIP_DIVAN=1; shift ;; + -h|--help) + usage; exit 0 ;; + *) + die "Unknown option: $1 (run with --help for usage)" ;; + esac +done + +# --------------------------------------------------------------------------- +# Resolve paths +# --------------------------------------------------------------------------- + +SCRIPT_DIR="$(cd "$(dirname "$0")" && pwd)" +REPO_ROOT="$(cd "$SCRIPT_DIR/.." 
&& pwd)" + +# --------------------------------------------------------------------------- +# Resolve SHAs +# --------------------------------------------------------------------------- + +HEAD_SHA="$(git -C "$REPO_ROOT" rev-parse HEAD)" +HEAD_SHORT="${HEAD_SHA:0:9}" +HEAD_BRANCH="$(git -C "$REPO_ROOT" rev-parse --abbrev-ref HEAD 2>/dev/null || echo "detached")" + +if [[ -z "$BASELINE_REF" ]]; then + info "No --baseline given; resolving merge-base with origin/main ..." + # Fetch is not done automatically — the caller must ensure origin/main is current. + BASELINE_REF="$(git -C "$REPO_ROOT" merge-base HEAD origin/main)" \ + || die "Could not resolve merge-base with origin/main. Pass --baseline explicitly." +fi + +BASELINE_SHA="$(git -C "$REPO_ROOT" rev-parse "${BASELINE_REF}^{commit}")" \ + || die "Cannot resolve baseline ref '${BASELINE_REF}' to a commit SHA" +BASELINE_SHORT="${BASELINE_SHA:0:9}" + +info "HEAD: ${HEAD_SHORT} (${HEAD_BRANCH})" +info "Baseline: ${BASELINE_SHORT} (${BASELINE_REF})" + +# --------------------------------------------------------------------------- +# Worktree setup +# --------------------------------------------------------------------------- + +WORKTREE_DIR="$(mktemp -d)" +cleanup() { + git -C "$REPO_ROOT" worktree remove --force "$WORKTREE_DIR" 2>/dev/null || true + rm -rf "$WORKTREE_DIR" +} +trap cleanup EXIT + +info "Setting up worktree at ${WORKTREE_DIR} for ${BASELINE_SHORT} ..." 
+git -C "$REPO_ROOT" worktree add --detach "$WORKTREE_DIR" "$BASELINE_SHA" \ + || die "Failed to create git worktree at ${WORKTREE_DIR}" + +# --------------------------------------------------------------------------- +# Output buffer (built up as a string, flushed at the end) +# --------------------------------------------------------------------------- + +REPORT="" + +append() { REPORT="${REPORT}${*}"$'\n'; } + +append "# Bench comparison" +append "" +append "- HEAD: \`${HEAD_SHORT}\` (\`${HEAD_BRANCH}\`)" +append "- Baseline: \`${BASELINE_SHORT}\` (\`${BASELINE_REF}\`)" +append "" + +# --------------------------------------------------------------------------- +# Parse divan output into TSV: namemedian_ns +# +# divan table layout (columns separated by the │ U+2502 box-drawing char): +# top-level leaf: field1=" ", field2=slowest, +# field3=median, field4=mean, ... +# parametric parent: field1="", all other fields empty +# parametric child: field1="", field2=" ", +# field3=slowest, field4=median, ... +# MB/s secondary: field1="", field2=MB/s-fastest, ... (no name — skip) +# +# Strategy: split on │. The first non-empty field contains the name prefix +# plus the fastest time. The median is two fields after that. +# --------------------------------------------------------------------------- + +parse_divan() { + local file="$1" + LC_ALL=en_US.UTF-8 awk -F'│' ' + function unit_ns(val, unit) { + if (unit == "ns") return val + 0 + if (unit == "µs") return val * 1000 + if (unit == "us") return val * 1000 + if (unit == "ms") return val * 1000000 + if (unit == "s") return val * 1000000000 + # Unrecognised unit — treat as µs (safe fallback for future divan changes) + return val * 1000 + } + + function strip(s, r) { + r = s + gsub(/^[[:space:]╰─├│ ]+/, "", r) + gsub(/[[:space:]]+$/, "", r) + return r + } + + # Extract and from a string like "330.2 ns" or "50.12 ms". + # Sets out_val and out_unit. Returns 1 on success, 0 if no match. 
+ function extract_time(s, out_val, out_unit, t, n) { + t = s + gsub(/^[[:space:]]+/, "", t) + # Check for a number followed by a unit + if (t !~ /^[0-9]/) return 0 + n = split(t, parts, /[[:space:]]+/) + if (n < 2) return 0 + out_val[1] = parts[1] + 0 + out_unit[1] = parts[2] + return 1 + } + + BEGIN { parent = "" } + + # Skip the header line and empty lines + /^network/ || /^$/ || /^Timer precision/ { next } + + # Skip the MB/s secondary throughput line (no bench name in field 1). + # Detect: field 1 is empty AND any field contains "MB/s". + /MB\/s/ && $1 !~ /[[:alpha:]]/ { next } + + { + # Find the first non-empty field (contains name + fastest time). + name_field_idx = 0 + name_raw = "" + for (i = 1; i <= NF; i++) { + f = $i + gsub(/^[[:space:]╰─├│ ]+/, "", f) + gsub(/[[:space:]]+$/, "", f) + if (f != "") { + name_field_idx = i + name_raw = f + break + } + } + if (name_field_idx == 0) next # completely empty line + + # The median column is two fields after the name+fastest field. + median_raw = "" + if (name_field_idx + 2 <= NF) { + median_raw = $(name_field_idx + 2) + gsub(/^[[:space:]│]+/, "", median_raw) + gsub(/[[:space:]]+$/, "", median_raw) + } + + # Extract the bench name from the name_raw field. + # name_raw looks like "dns_cache_hit 220.2 ns" (name + fastest time). + # Strip the trailing fastest-time portion: everything from the last + # contiguous digit sequence followed by a unit. + bench_label = name_raw + sub(/[[:space:]]+[0-9]+(\.[0-9]+)?[[:space:]]*(ns|us|ms|s|µs)[[:space:]]*$/, "", bench_label) + # Also strip any residual trailing box-drawing or tree chars + gsub(/[[:space:]]+$/, "", bench_label) + + # Check whether this row has a median measurement. + val_arr[1] = ""; unit_arr[1] = "" + has_median = extract_time(median_raw, val_arr, unit_arr) + + if (!has_median) { + # This is a parametric parent header row — record as parent. + parent = bench_label + next + } + + # This is a leaf measurement row. 
+ if (parent != "" && name_field_idx > 1) { + # Child row: qualify with parent name. + full_name = parent "/" bench_label + } else { + full_name = bench_label + # Top-level leaf — clear parent so the next top-level bench starts fresh. + parent = "" + } + + median_ns = unit_ns(val_arr[1], unit_arr[1]) + print full_name "\t" median_ns + } + ' "$file" +} + +# --------------------------------------------------------------------------- +# Divan harness +# --------------------------------------------------------------------------- + +if [[ "$SKIP_DIVAN" -eq 0 ]]; then + info "--- divan harness ---" + + # Run divan bench in $1 (cwd), writing TSV-parseable stdout to $2. + # $3 is a human-readable label used in log lines. + # Tries --features bench-helpers first; falls back to no features if the + # feature isn't recognized at that ref. + run_divan_at() { + local cwd="$1" + local out="$2" + local label="$3" + local err + err="$(mktemp)" + if (cd "$cwd" && cargo bench --bench network --features bench-helpers >"$out" 2>"$err"); then + rm -f "$err" + return 0 + fi + if grep -qiE 'does not have feature|does not contain this feature|unknown feature' "$err"; then + info " ${label} lacks bench-helpers feature, retrying without" + rm -f "$err" + if (cd "$cwd" && cargo bench --bench network >"$out" 2>/dev/null); then + return 0 + fi + fi + rm -f "$err" + return 1 + } + + DIVAN_TMP_BASELINE="$(mktemp)" + DIVAN_TMP_HEAD="$(mktemp)" + + info "Running divan benches on baseline (${BASELINE_SHORT}) ..." + # cargo's build progress goes to stderr; bench table goes to stdout. + run_divan_at "$WORKTREE_DIR" "$DIVAN_TMP_BASELINE" "baseline" \ + || info "WARN: divan baseline bench failed; divan section will be incomplete" + + info "Running divan benches on HEAD (${HEAD_SHORT}) ..." 
+ run_divan_at "$REPO_ROOT" "$DIVAN_TMP_HEAD" "HEAD" \ + || info "WARN: divan HEAD bench failed; divan section will be incomplete" + + DIVAN_BASELINE_TSV="$(parse_divan "$DIVAN_TMP_BASELINE")" + DIVAN_HEAD_TSV="$(parse_divan "$DIVAN_TMP_HEAD")" + rm -f "$DIVAN_TMP_BASELINE" "$DIVAN_TMP_HEAD" + + # Build the markdown table via awk: join on bench name, emit rows. + DIVAN_TABLE="$( + awk -F'\t' ' + # Load baseline + NR == FNR { + if ($1 != "") { + baseline_ns[$1] = $2 + if (!seen[$1]++) order[++n] = $1 + } + next + } + # Load head + { + if ($1 != "") { + head_ns[$1] = $2 + if (!seen[$1]++) order[++n] = $1 + } + } + END { + for (i = 1; i <= n; i++) { + name = order[i] + b = baseline_ns[name] + h = head_ns[name] + + # Format a nanosecond value into a human-readable string + # using the shortest unit whose display value is >= 1. + if (b == "") { + b_str = "—" + } else { + bv = b + 0 + if (bv >= 1000000000) { b_str = sprintf("%.3g s", bv/1000000000) } + else if (bv >= 1000000) { b_str = sprintf("%.3g ms", bv/1000000) } + else if (bv >= 1000) { b_str = sprintf("%.3g µs", bv/1000) } + else { b_str = sprintf("%.3g ns", bv) } + } + + if (h == "") { + h_str = "—" + } else { + hv = h + 0 + if (hv >= 1000000000) { h_str = sprintf("%.3g s", hv/1000000000) } + else if (hv >= 1000000) { h_str = sprintf("%.3g ms", hv/1000000) } + else if (hv >= 1000) { h_str = sprintf("%.3g µs", hv/1000) } + else { h_str = sprintf("%.3g ns", hv) } + } + + # Delta + if (b == "" || h == "") { + delta_str = "—" + pct_str = "—" + } else { + bv = b + 0; hv = h + 0 + diff = hv - bv + abs_diff = (diff < 0) ? -diff : diff + if (abs_diff >= 1000000000) { unit = "s"; factor = 1000000000 } + else if (abs_diff >= 1000000) { unit = "ms"; factor = 1000000 } + else if (abs_diff >= 1000) { unit = "µs"; factor = 1000 } + else { unit = "ns"; factor = 1 } + sign = (diff >= 0) ? "+" : "" + delta_str = sprintf("%s%.3g %s", sign, diff/factor, unit) + + if (bv != 0) { + pct = (hv - bv) / bv * 100 + psign = (pct >= 0) ? 
"+" : "" + pct_str = sprintf("%s%.1f%%", psign, pct) + } else { + pct_str = "—" + } + } + + print name "\t" b_str "\t" h_str "\t" delta_str "\t" pct_str + } + } + ' \ + <(printf '%s\n' "$DIVAN_BASELINE_TSV") \ + <(printf '%s\n' "$DIVAN_HEAD_TSV") + )" + + append "## divan microbenches (\`cargo bench --bench network\`)" + append "" + append "| Bench | Baseline | HEAD | Δ | Δ% |" + append "|-------|---------:|-----:|--:|---:|" + + if [[ -n "$DIVAN_TABLE" ]]; then + while IFS=$'\t' read -r name b_str h_str delta_str pct_str; do + append "| ${name} | ${b_str} | ${h_str} | ${delta_str} | ${pct_str} |" + done <<< "$DIVAN_TABLE" + else + append "| *(no data)* | | | | |" + fi + append "" +else + info "Skipping divan harness (--skip-divan)." +fi + +# --------------------------------------------------------------------------- +# VM harness +# --------------------------------------------------------------------------- + +if [[ "$SKIP_VM" -eq 1 ]]; then + info "Skipping VM harness (--skip-vm)." +elif [[ -z "${VOID_BOX_KERNEL:-}" ]]; then + info "Skipping VM harness because VOID_BOX_KERNEL is not set." +elif [[ -z "${VOID_BOX_INITRAMFS:-}" ]]; then + info "Skipping VM harness because VOID_BOX_INITRAMFS is not set." +else + info "--- VM harness ---" + + VM_TMP_BASELINE="$(mktemp --suffix=.json)" + VM_TMP_HEAD="$(mktemp --suffix=.json)" + + info "Running voidbox-network-bench on baseline (${BASELINE_SHORT}) ..." + (cd "$WORKTREE_DIR" && \ + cargo run --release --bin voidbox-network-bench -- --output "$VM_TMP_BASELINE") \ + || info "WARN: VM baseline bench failed; VM section will be incomplete" + + info "Running voidbox-network-bench on HEAD (${HEAD_SHORT}) ..." + (cd "$REPO_ROOT" && \ + cargo run --release --bin voidbox-network-bench -- --output "$VM_TMP_HEAD") \ + || info "WARN: VM HEAD bench failed; VM section will be incomplete" + + # JSON field names in display order. + # These match the Report struct fields in src/bin/voidbox-network-bench/main.rs. 
+ VM_FIELDS=( + tcp_bulk_throughput_g2h_mbps + tcp_throughput_g2h_mbps + tcp_throughput_h2g_mbps + tcp_rr_latency_us_p50 + tcp_rr_latency_us_p99 + tcp_crr_latency_us_p50 + udp_dns_qps + icmp_rr_latency_us_p50 + ) + + append "## VM harness (\`voidbox-network-bench\`)" + append "" + append "| Metric | Baseline | HEAD | Δ | Δ% |" + append "|--------|---------:|-----:|--:|---:|" + + for field in "${VM_FIELDS[@]}"; do + b_val="$(jq -r --arg f "$field" 'if has($f) then .[$f] else null end | if . == null then "null" else tostring end' \ + "$VM_TMP_BASELINE" 2>/dev/null || echo "null")" + h_val="$(jq -r --arg f "$field" 'if has($f) then .[$f] else null end | if . == null then "null" else tostring end' \ + "$VM_TMP_HEAD" 2>/dev/null || echo "null")" + + if [[ "$b_val" == "null" ]]; then b_str="n/a"; else b_str="$b_val"; fi + if [[ "$h_val" == "null" ]]; then h_str="n/a"; else h_str="$h_val"; fi + + if [[ "$b_val" == "null" || "$h_val" == "null" ]]; then + delta_str="—" + pct_str="—" + else + delta_str="$(awk -v b="$b_val" -v h="$h_val" 'BEGIN { + diff = h - b + sign = (diff >= 0) ? "+" : "" + printf "%s%.4g\n", sign, diff + }')" + pct_str="$(awk -v b="$b_val" -v h="$h_val" 'BEGIN { + if (b == 0) { print "—"; exit } + pct = (h - b) / b * 100 + psign = (pct >= 0) ? 
"+" : "" + printf "%s%.1f%%\n", psign, pct + }')" + fi + + append "| ${field} | ${b_str} | ${h_str} | ${delta_str} | ${pct_str} |" + done + append "" + + rm -f "$VM_TMP_BASELINE" "$VM_TMP_HEAD" +fi + +# --------------------------------------------------------------------------- +# Emit report +# --------------------------------------------------------------------------- + +if [[ -n "$OUTPUT_FILE" ]]; then + printf '%s\n' "$REPORT" > "$OUTPUT_FILE" + info "Report written to ${OUTPUT_FILE}" +else + printf '%s\n' "$REPORT" +fi diff --git a/scripts/lib/guest_common.sh b/scripts/lib/guest_common.sh index 9e60d025..29d652d2 100755 --- a/scripts/lib/guest_common.sh +++ b/scripts/lib/guest_common.sh @@ -124,6 +124,21 @@ install_busybox() { readlink realpath sleep; do ln -sf busybox "$OUT_DIR/bin/$cmd" 2>/dev/null || true done + # NOTE: do NOT `chmod u+s busybox`. The cpio is packed as the build user + # (uid 1000), so a setuid bit makes the kernel drop euid to 1000 on + # every execve from PID 1 (uid=0) → setup_network()'s `ip link up`, + # `ip addr replace`, and `udhcpc` all silently fail with EPERM + # (no CAP_NET_ADMIN), the static-fallback loop wastes 10s of boot + # time, and the host's 30s control-channel handshake deadline + # expires before the vsock listener is bound. Symptom: ECONNRESET + # on every connect in `voidbox-network-bench` and any test that + # uses `network(true)`. See guest-agent::setup_network and + # control_channel::connect_with_handshake_sync. + # + # `ping` is intentionally omitted from the symlink list above — busybox + # `ping` uses SOCK_RAW which needs root, and busybox-static on Fedora + # is not built with CONFIG_FEATURE_PING_TYPE_DGRAM. Tools that want + # ICMP-from-guest should drive it through SLIRP from the host instead. else echo "[void-box] No BUSYBOX set; guest will have no /bin/sh (set BUSYBOX=/path/to/busybox for full shell support)." 
fi diff --git a/src/bin/voidbox-network-bench/main.rs b/src/bin/voidbox-network-bench/main.rs new file mode 100644 index 00000000..a18ac09e --- /dev/null +++ b/src/bin/voidbox-network-bench/main.rs @@ -0,0 +1,799 @@ +//! Wall-clock end-to-end network benchmark harness. +//! +//! Boots a real VM and measures TCP throughput, RR/CRR latency, and +//! UDP DNS qps inside the guest. Output is JSON for diffing against +//! a baseline. +//! +//! Mirrors `voidbox-startup-bench` in CLI shape and lifecycle. +//! +//! Linux-only because the smoltcp-based SLIRP stack is Linux-only. On +//! other platforms `main()` prints a skip notice and exits 0 so +//! cross-platform CI (`cargo build`, `cargo check`) compiles cleanly. + +#[cfg(not(target_os = "linux"))] +fn main() { + eprintln!( + "voidbox-network-bench: SLIRP-backed wall-clock harness is Linux-only \ + (smoltcp dep is `cfg(target_os = \"linux\")` in Cargo.toml). \ + Nothing to run on this platform." + ); +} + +#[cfg(target_os = "linux")] +use std::io::{Read, Write}; +#[cfg(target_os = "linux")] +use std::net::{TcpListener, TcpStream}; +#[cfg(target_os = "linux")] +use std::os::fd::AsRawFd; +#[cfg(target_os = "linux")] +use std::path::PathBuf; +#[cfg(target_os = "linux")] +use std::sync::mpsc; +#[cfg(target_os = "linux")] +use std::time::{Duration, Instant}; + +#[cfg(target_os = "linux")] +use clap::Parser; +#[cfg(target_os = "linux")] +use serde::Serialize; +#[cfg(target_os = "linux")] +use void_box::sandbox::Sandbox; + +// Linux-only block. Wrapped in a `mod linux_main` so cross-platform +// CI (macOS, etc.) compiles `voidbox-network-bench` cleanly — only +// `main()` (above, the non-Linux stub) is needed there. +#[cfg(target_os = "linux")] +mod linux_main { + use super::*; + + /// Transfer size per measurement run: 50 MiB. + const TRANSFER_MB: u32 = 50; + + /// Bytes per megabit. + const BYTES_PER_MEGABIT: f64 = 1_000_000.0 / 8.0; + + /// VM memory for the benchmark sandbox (MiB). 
+ const BENCH_MEMORY_MB: usize = 1024; + + /// SLIRP host-gateway address reachable from inside the guest. + const SLIRP_HOST_ADDR: &str = "10.0.2.2"; + + /// Number of RR samples collected per iteration. + const RR_SAMPLES_PER_ITER: u32 = 100; + + /// Number of CRR samples collected per iteration. + const CRR_SAMPLES_PER_ITER: u32 = 30; + + /// Timeout for the host-side channel receive on RR/CRR measurements. + const LATENCY_RECV_TIMEOUT: Duration = Duration::from_secs(120); + + /// Accept-side deadline for spawned echo/drain threads. Set slightly longer + /// than `LATENCY_RECV_TIMEOUT` (the channel-side wait) so the channel times + /// out first when the iteration is genuinely stuck — the accept thread then + /// exits on its own deadline shortly after, releasing the listener FD before + /// the next iteration. + const ACCEPT_DEADLINE_SLACK: Duration = Duration::from_secs(5); + + #[derive(Parser, Debug)] + #[command( + version, + about = "VoidBox network benchmark harness", + long_about = "VoidBox network benchmark harness\n\ +\n\ +Boots one VM, exercises TCP throughput, TCP RR/CRR latency, and UDP DNS qps,\n\ +then emits a JSON report suitable for automated diffing.\n\ +\n\ +REQUIRED ENVIRONMENT VARIABLES\n\ + VOID_BOX_KERNEL Path to the guest kernel image (vmlinuz / vmlinux).\n\ + VOID_BOX_INITRAMFS Path to the guest initramfs (cpio.gz).\n\ +\n\ +RECOMMENDED WORKFLOW — CAPTURING AND DIFFING A BASELINE\n\ + # 1. Before a refactor or networking-stack change, capture a baseline:\n\ + cargo run --bin voidbox-network-bench -- --output baseline.json\n\ +\n\ + # 2. Make your change, then capture a post-change report:\n\ + cargo run --bin voidbox-network-bench -- --output after.json\n\ +\n\ + # 3. 
Compare with diff or a JSON-diff tool:\n\ + diff baseline.json after.json\n\ + # Or with jq for a side-by-side view of individual metrics:\n\ + jq -s '.[0] as $b | .[1] as $a | {metric: keys} | .metric[] |\n\ + {metric: ., before: $b[.], after: $a[.]}' baseline.json after.json\n\ +\n\ +METRIC NAMES\n\ + tcp_throughput_g2h_mbps Guest→host TCP throughput (Mbps)\n\ + tcp_rr_latency_us_p50 Persistent-connection round-trip latency p50 (µs)\n\ + tcp_rr_latency_us_p99 Persistent-connection round-trip latency p99 (µs)\n\ + tcp_crr_latency_us_p50 Connect-request-response latency p50 (µs)\n\ + udp_dns_qps UDP DNS queries per second against SLIRP resolver\n\ +\n\ +The metric names mirror the columns in passt's published performance table so\n\ +results can be compared directly.\n\ +\n\ +FAST SMOKE RUN\n\ + cargo run --bin voidbox-network-bench -- --iterations 1 --no-throughput" + )] + struct Cli { + /// Number of iterations per metric. + #[arg(long, default_value_t = 5)] + iterations: u32, + + /// Output JSON file. If omitted, prints to stdout. + #[arg(long)] + output: Option, + + /// Skip throughput measurements (useful for fast smoke runs). + #[arg(long, default_value_t = false)] + no_throughput: bool, + + /// Push N MB through the SLIRP relay against a slow-receiving host + /// (`SO_RCVBUF = 4096`). Forces the backpressure path to actually + /// engage — the small-payload throughput numbers don't exercise it + /// because the host drains too fast. + /// + /// 0 (default) skips the measurement. 10 MiB is a reasonable smoke + /// value; larger N produces more stable numbers but takes longer. + #[arg(long, default_value_t = 0)] + bulk_mb: u32, + } + + #[derive(Serialize, Debug, Default)] + struct Report { + /// Sustained guest→host throughput against a slow-receiving host + /// (`SO_RCVBUF = 4096`). 
Probes the TCP backpressure path — rather + /// than hitting a fixed userspace cliff and resetting the connection, + /// throughput is bounded by the kernel recv buffer's drain rate. + /// Populated only when `--bulk-mb > 0`. + tcp_bulk_throughput_g2h_mbps: Option, + tcp_throughput_g2h_mbps: Option, + // TODO(h2g): host→guest requires either a guest-side `nc -l` listener + // or an inverse data-push loop. The current harness only supports + // guest-initiated connections (the guest calls `nc HOST PORT`). A + // host-push direction would need the guest to accept connections, which + // means either (a) a guest-side daemon started before exec returns, or + // (b) an additional RPC for "open a listening socket and tell us the + // guest port" — out of scope for the minimal harness. + tcp_throughput_h2g_mbps: Option, + tcp_rr_latency_us_p50: Option, + tcp_rr_latency_us_p99: Option, + tcp_crr_latency_us_p50: Option, + udp_dns_qps: Option, + icmp_rr_latency_us_p50: Option, + /// p50 host→guest RX latency: "host write completes" → "SLIRP relay + /// delivers frame to drain_to_guest output". Measured at the VMM + /// layer against a live guest TCP flow via `nc -l`. + /// + /// Not yet populated: wiring a guest-side listener and synchronizing + /// on first-byte arrival requires either a guest daemon or an additional + /// RPC. The divan microbench `tcp_rx_latency_one_packet` captures the + /// SLIRP-layer dispatch cost directly (epoll_wait + peek + frame build); + /// this wall-clock field will complement it once the guest-listener + /// infrastructure is in place. 
+ tcp_rx_latency_us_p50: Option, + } + + #[tokio::main(flavor = "multi_thread")] + pub(super) async fn main_impl() -> Result<(), Box> { + tracing_subscriber::fmt() + .with_env_filter( + tracing_subscriber::EnvFilter::try_from_default_env() + .unwrap_or_else(|_| tracing_subscriber::EnvFilter::new("warn")), + ) + .with_writer(std::io::stderr) + .init(); + + let cli = Cli::parse(); + let mut report = Report::default(); + + // Boot one shared VM for all measurements that require a live guest. + // Throughput and latency measurements reuse this single sandbox to avoid + // paying the boot cost multiple times. + let sandbox = Sandbox::local() + .from_env()? + .memory_mb(BENCH_MEMORY_MB) + .network(true) + .build()?; + + // Prime the VM (triggers boot + vsock handshake) before any timed work. + let probe = sandbox.exec("sh", &["-c", ":"]).await?; + if !probe.success() { + return Err(format!( + "VM probe exec failed: exit={:?} stderr={}", + probe.exit_code, + probe.stderr_str() + ) + .into()); + } + + if !cli.no_throughput { + report.tcp_throughput_g2h_mbps = + measure_tcp_throughput_g2h(&sandbox, cli.iterations).await?; + } + + if cli.bulk_mb > 0 { + report.tcp_bulk_throughput_g2h_mbps = + measure_bulk_throughput_g2h(&sandbox, cli.iterations, cli.bulk_mb).await?; + } + + // Latency measurements always run (--no-throughput only skips throughput). + let (rr_p50, rr_p99) = measure_rr_latency(&sandbox, cli.iterations).await?; + report.tcp_rr_latency_us_p50 = rr_p50; + report.tcp_rr_latency_us_p99 = rr_p99; + report.tcp_crr_latency_us_p50 = measure_crr_latency(&sandbox, cli.iterations).await?; + report.udp_dns_qps = measure_dns_qps(&sandbox).await?; + report.icmp_rr_latency_us_p50 = measure_icmp_rr_latency(&sandbox, cli.iterations).await?; + + sandbox.stop().await?; + + let json = serde_json::to_string_pretty(&report)?; + match cli.output { + Some(path) => std::fs::write(path, json)?, + None => println!("{json}"), + } + Ok(()) + } + + /// Measure guest-to-host TCP throughput. 
+ /// + /// Binds a host-side TCP listener on `127.0.0.1:0` and execs a BusyBox shell + /// snippet inside `sandbox` that pipes `dd` output to `nc`. The host drain + /// thread records bytes received and wall-clock elapsed time; Mbps is computed + /// from those two numbers. Runs `iterations` times and returns the mean. + /// + /// Returns `None` if every iteration fails to parse or times out. + async fn measure_tcp_throughput_g2h( + sandbox: &Sandbox, + iterations: u32, + ) -> Result, Box> { + let mut mbps_samples: Vec = Vec::new(); + + for iteration_index in 0..iterations { + let listener = TcpListener::bind("127.0.0.1:0")?; + let host_port = listener.local_addr()?.port(); + + let (drain_tx, drain_rx) = mpsc::channel::<(u64, Duration)>(); + + let drain_deadline = Instant::now() + LATENCY_RECV_TIMEOUT + ACCEPT_DEADLINE_SLACK; + std::thread::spawn(move || { + let drain_result = drain_one_connection(&listener, drain_deadline); + let _ = drain_tx.send(drain_result); + }); + + let guest_cmd = format!( + "dd if=/dev/zero bs=1M count={TRANSFER_MB} 2>/dev/null | nc {SLIRP_HOST_ADDR} {host_port}", + ); + + let exec_result = sandbox.exec("sh", &["-c", &guest_cmd]).await; + + match exec_result { + Err(exec_err) => { + tracing::warn!( + iteration = iteration_index, + error = %exec_err, + "g2h iteration exec error; skipping" + ); + continue; + } + Ok(output) => { + if !output.success() { + tracing::warn!( + iteration = iteration_index, + exit_code = ?output.exit_code, + stderr = output.stderr_str(), + "g2h iteration non-zero exit; skipping" + ); + continue; + } + } + } + + match drain_rx.recv_timeout(Duration::from_secs(120)) { + Err(recv_err) => { + tracing::warn!( + iteration = iteration_index, + error = %recv_err, + "g2h drain channel receive error; skipping" + ); + } + Ok((bytes_received, elapsed)) => { + let elapsed_secs = elapsed.as_secs_f64(); + if elapsed_secs < 0.01 { + tracing::warn!( + iteration = iteration_index, + elapsed_secs, + "g2h elapsed too small to measure 
reliably; skipping" + ); + continue; + } + let mbps = (bytes_received as f64 * 8.0) / elapsed_secs / BYTES_PER_MEGABIT; + tracing::info!( + iteration = iteration_index, + bytes_received, + elapsed_secs, + mbps, + "g2h iteration complete" + ); + eprintln!( + "g2h[{iteration_index:>2}]: {bytes_received} B in {elapsed_secs:.3}s = {mbps:.1} Mbps" + ); + mbps_samples.push(mbps); + } + } + } + + if mbps_samples.is_empty() { + return Ok(None); + } + + let mut total_mbps = 0.0_f64; + for sample in &mbps_samples { + total_mbps += sample; + } + let mean_mbps = total_mbps / mbps_samples.len() as f64; + Ok(Some(mean_mbps)) + } + + /// Sustained guest→host throughput against a constrained receiver. + /// + /// Same shape as [`measure_tcp_throughput_g2h`] but with `SO_RCVBUF = 4096` + /// pinned on the listener socket. The small recv buffer forces TCP-level + /// backpressure: the kernel send buffer fills, our `host_stream.write` + /// returns `WouldBlock`, the SLIRP relay declines to ACK the guest's + /// segment, and the guest retransmits. The relay holds the line and the + /// bytes go through rather than resetting the connection at a fixed + /// userspace buffer limit. + /// + /// Returned value is the mean Mbps across `iterations` iterations of pushing + /// `bulk_mb` MiB. Effective throughput is much lower than + /// [`measure_tcp_throughput_g2h`]'s number because the constrained receiver + /// is the bottleneck — that's the point. + async fn measure_bulk_throughput_g2h( + sandbox: &Sandbox, + iterations: u32, + bulk_mb: u32, + ) -> Result, Box> { + let mut mbps_samples: Vec = Vec::new(); + + for iteration_index in 0..iterations { + let listener = TcpListener::bind("127.0.0.1:0")?; + // Constrain the receiver: 4 KiB request, kernel rounds up to the + // configured minimum (~8 KiB on Linux) — still small enough that + // the SLIRP send buffer fills quickly and backpressure engages. 
+ let val: libc::c_int = 4096; + // SAFETY: listener.as_raw_fd() outlives the syscall; the int is + // stack-local and pointer-sized. + let rc = unsafe { + libc::setsockopt( + listener.as_raw_fd(), + libc::SOL_SOCKET, + libc::SO_RCVBUF, + &val as *const libc::c_int as *const libc::c_void, + std::mem::size_of::() as libc::socklen_t, + ) + }; + if rc != 0 { + tracing::warn!( + iteration = iteration_index, + "bulk-g2h: SO_RCVBUF setsockopt failed; skipping" + ); + continue; + } + let host_port = listener.local_addr()?.port(); + + let (drain_tx, drain_rx) = mpsc::channel::<(u64, Duration)>(); + let drain_deadline = Instant::now() + Duration::from_secs(300) + ACCEPT_DEADLINE_SLACK; + std::thread::spawn(move || { + let drain_result = drain_one_connection(&listener, drain_deadline); + let _ = drain_tx.send(drain_result); + }); + + let guest_cmd = format!( + "dd if=/dev/zero bs=1M count={bulk_mb} 2>/dev/null | nc {SLIRP_HOST_ADDR} {host_port}", + ); + let exec_result = sandbox.exec("sh", &["-c", &guest_cmd]).await; + match exec_result { + Err(exec_err) => { + tracing::warn!( + iteration = iteration_index, + error = %exec_err, + "bulk-g2h iteration exec error; skipping" + ); + continue; + } + Ok(output) => { + if !output.success() { + tracing::warn!( + iteration = iteration_index, + exit_code = ?output.exit_code, + stderr = output.stderr_str(), + "bulk-g2h iteration non-zero exit; the connection may have \ + been reset (backpressure cliff regression?). 
skipping" + ); + continue; + } + } + } + + match drain_rx.recv_timeout(Duration::from_secs(300)) { + Err(recv_err) => { + tracing::warn!( + iteration = iteration_index, + error = %recv_err, + "bulk-g2h drain channel receive error; skipping" + ); + } + Ok((bytes_received, elapsed)) => { + let elapsed_secs = elapsed.as_secs_f64(); + if elapsed_secs < 0.01 { + tracing::warn!( + iteration = iteration_index, + elapsed_secs, + "bulk-g2h elapsed too small to measure reliably; skipping" + ); + continue; + } + let mbps = (bytes_received as f64 * 8.0) / elapsed_secs / BYTES_PER_MEGABIT; + tracing::info!( + iteration = iteration_index, + bytes_received, + elapsed_secs, + mbps, + "bulk-g2h iteration complete" + ); + eprintln!( + "bulk-g2h[{iteration_index:>2}]: {bytes_received} B in {elapsed_secs:.3}s = {mbps:.1} Mbps (constrained receiver)" + ); + mbps_samples.push(mbps); + } + } + } + + if mbps_samples.is_empty() { + return Ok(None); + } + let mean_mbps: f64 = mbps_samples.iter().sum::() / mbps_samples.len() as f64; + Ok(Some(mean_mbps)) + } + + /// Accept one connection on `listener` with a deadline. Returns `None` if the + /// deadline lapses before any connection arrives (the spawning iteration has + /// likely failed and the thread should exit cleanly so the listener FD is + /// released for the next iteration). + fn accept_with_deadline( + listener: &TcpListener, + deadline: Instant, + ) -> Option<(TcpStream, std::net::SocketAddr)> { + listener.set_nonblocking(true).ok()?; + loop { + match listener.accept() { + Ok(pair) => { + let _ = pair.0.set_nonblocking(false); + return Some(pair); + } + Err(err) if err.kind() == std::io::ErrorKind::WouldBlock => { + if Instant::now() >= deadline { + return None; + } + std::thread::sleep(Duration::from_millis(10)); + } + Err(_) => return None, + } + } + } + + /// Accept exactly one TCP connection on `listener`, drain it to EOF, and + /// return `(bytes_received, elapsed)`. Intended to run in a background thread. 
+ /// + /// Returns `(0, Duration::ZERO)` if no connection arrives before `deadline`. + fn drain_one_connection(listener: &TcpListener, deadline: Instant) -> (u64, Duration) { + let Some((mut stream, _peer_addr)) = accept_with_deadline(listener, deadline) else { + return (0, Duration::ZERO); + }; + + let start = Instant::now(); + let bytes_received = drain_stream(&mut stream); + let elapsed = start.elapsed(); + (bytes_received, elapsed) + } + + /// Read `stream` to EOF and return the total byte count. + fn drain_stream(stream: &mut TcpStream) -> u64 { + let mut buf = vec![0u8; 64 * 1024]; + let mut total_bytes: u64 = 0; + loop { + match stream.read(&mut buf) { + Ok(0) => break, + Ok(bytes_read) => total_bytes += bytes_read as u64, + Err(_) => break, + } + } + total_bytes + } + + fn percentile(samples: &mut [Duration], p: f64) -> Duration { + samples.sort(); + let idx = ((samples.len() as f64) * p).clamp(0.0, samples.len() as f64 - 1.0) as usize; + samples[idx] + } + + /// Measure TCP RR (Request-Response) latency on a kept-open connection. + /// + /// The guest pipes `RR_SAMPLES_PER_ITER` null bytes over a single `nc` + /// connection (`dd if=/dev/zero bs=1 count=N | nc host port`). The host + /// accepts one connection and services each byte as an independent echo + /// round-trip, timing each host-side `read + write` pair. + /// + /// Using dd+nc avoids BusyBox shell limitations around interactive TCP + /// sockets while still measuring per-message in-flight latency on a + /// persistent connection. The first sample from each iteration is discarded + /// because the first byte arrival absorbs TCP connect and Nagle jitter from + /// the guest side. Remaining samples are accumulated across all iterations; + /// p50 and p99 are computed over the union. + /// + /// Returns `(p50_us, p99_us)`, both `None` if no samples were collected. 
+ async fn measure_rr_latency( + sandbox: &Sandbox, + iterations: u32, + ) -> Result<(Option, Option), Box> { + let mut all_samples: Vec = Vec::new(); + + for iteration_index in 0..iterations { + let listener = TcpListener::bind("127.0.0.1:0")?; + let host_port = listener.local_addr()?.port(); + + let (echo_tx, echo_rx) = mpsc::channel::>(); + + let echo_deadline = Instant::now() + LATENCY_RECV_TIMEOUT + ACCEPT_DEADLINE_SLACK; + std::thread::spawn(move || { + let samples = rr_echo_server(&listener, RR_SAMPLES_PER_ITER, echo_deadline); + let _ = echo_tx.send(samples); + }); + + // Guest: pipe RR_SAMPLES_PER_ITER zero bytes over one nc connection. + // dd generates the bytes; nc forwards them to the host echo server. + // The guest does not need to read the echoed bytes — the host drives + // the timing loop and closes when done. BusyBox dd + nc suffice. + let guest_cmd = format!( + "dd if=/dev/zero bs=1 count={n} 2>/dev/null | nc {host} {port}", + n = RR_SAMPLES_PER_ITER, + host = SLIRP_HOST_ADDR, + port = host_port, + ); + + let exec_result = sandbox.exec("sh", &["-c", &guest_cmd]).await; + if let Err(exec_err) = exec_result { + tracing::warn!( + iteration = iteration_index, + error = %exec_err, + "rr iteration exec error; skipping" + ); + } + + match echo_rx.recv_timeout(LATENCY_RECV_TIMEOUT) { + Err(recv_err) => { + tracing::warn!( + iteration = iteration_index, + error = %recv_err, + "rr echo channel receive error; skipping" + ); + } + Ok(mut samples) => { + // Discard first sample (absorbs TCP connect jitter). 
+ if samples.len() > 1 { + samples.remove(0); + } + let count = samples.len(); + let p50_us = if count > 0 { + percentile(&mut samples.clone(), 0.50).as_micros() + } else { + 0 + }; + eprintln!("rr[{iteration_index:>2}]: {count} samples, p50={p50_us} µs"); + all_samples.extend(samples); + } + } + } + + if all_samples.is_empty() { + return Ok((None, None)); + } + + let p50 = percentile(&mut all_samples, 0.50).as_micros() as f64; + let p99 = percentile(&mut all_samples, 0.99).as_micros() as f64; + Ok((Some(p50), Some(p99))) + } + + /// Host-side echo server for RR latency. + /// + /// Accepts one connection, then for each of the `count` iterations: reads + /// one byte, times that read, writes the byte back, and records the elapsed + /// duration. Returns the list of per-round-trip host-side durations. + /// + /// The timer starts just before the blocking `read` call and stops after the + /// `write` returns. This measures the host-observed round-trip time: the + /// interval from "host waiting for a byte" to "host has written the echo", + /// which is approximately the guest-side send→receive latency plus the + /// network stack overhead on both sides. + fn rr_echo_server(listener: &TcpListener, count: u32, deadline: Instant) -> Vec { + let Some((mut stream, _)) = accept_with_deadline(listener, deadline) else { + return Vec::new(); + }; + + let mut samples = Vec::with_capacity(count as usize); + let mut buf = [0u8; 1]; + + for _ in 0..count { + let start = Instant::now(); + match stream.read_exact(&mut buf) { + Ok(()) => {} + Err(_) => break, + } + match stream.write_all(&buf) { + Ok(()) => {} + Err(_) => break, + } + samples.push(start.elapsed()); + } + + samples + } + + /// Measure TCP CRR (Connect-Request-Response) latency. + /// + /// Each sample is one full `accept + read + write + close` cycle on the host, + /// timed from `accept` returning to the connection dropping. 
The guest runs + /// a shell loop that performs `CRR_SAMPLES_PER_ITER` independent `nc` invocations + /// per iteration (each is a full connect → send → recv → close). + /// + /// Host-side timing is the ground truth: the host observes when the + /// connection arrives and when it closes, so each sample faithfully captures + /// the TCP setup + data round-trip + teardown cost end-to-end. + /// + /// Returns `p50_us` across all collected samples, or `None` if none arrived. + async fn measure_crr_latency( + sandbox: &Sandbox, + iterations: u32, + ) -> Result, Box> { + let mut all_samples: Vec = Vec::new(); + + for iteration_index in 0..iterations { + let listener = TcpListener::bind("127.0.0.1:0")?; + let host_port = listener.local_addr()?.port(); + + // The host accepts CRR_SAMPLES_PER_ITER connections, times each cycle, + // and sends results back over a channel. + let (crr_tx, crr_rx) = mpsc::channel::>(); + let sample_count = CRR_SAMPLES_PER_ITER; + + let crr_deadline = Instant::now() + LATENCY_RECV_TIMEOUT + ACCEPT_DEADLINE_SLACK; + std::thread::spawn(move || { + let samples = crr_echo_server(&listener, sample_count, crr_deadline); + let _ = crr_tx.send(samples); + }); + + // Guest: loop CRR_SAMPLES_PER_ITER times; each iteration is a full + // nc invocation (connect → send one byte → read echo → disconnect). 
+ let n = CRR_SAMPLES_PER_ITER; + let guest_cmd = format!( + "i=0; while [ $i -lt {n} ]; do printf 'A' | nc {host} {port}; i=$((i+1)); done", + host = SLIRP_HOST_ADDR, + port = host_port, + n = n, + ); + + let exec_result = sandbox.exec("sh", &["-c", &guest_cmd]).await; + if let Err(exec_err) = exec_result { + tracing::warn!( + iteration = iteration_index, + error = %exec_err, + "crr iteration exec error; skipping" + ); + } + + match crr_rx.recv_timeout(LATENCY_RECV_TIMEOUT) { + Err(recv_err) => { + tracing::warn!( + iteration = iteration_index, + error = %recv_err, + "crr echo channel receive error; skipping" + ); + } + Ok(samples) => { + let count = samples.len(); + let p50_us = if count > 0 { + percentile(&mut samples.clone(), 0.50).as_micros() + } else { + 0 + }; + eprintln!("crr[{iteration_index:>2}]: {count} samples, p50={p50_us} µs"); + all_samples.extend(samples); + } + } + } + + if all_samples.is_empty() { + return Ok(None); + } + + let p50 = percentile(&mut all_samples, 0.50).as_micros() as f64; + Ok(Some(p50)) + } + + /// Measure UDP DNS query throughput against the SLIRP resolver. + /// + /// Returns `None` — the busybox-`nc` tool available in the minimal test + /// initramfs cannot produce a meaningful number here. Each `nc -u -w1` + /// invocation blocks for the full 1-second `-w1` timeout after stdin EOF + /// even when the cached SLIRP reply arrives in microseconds, capping + /// throughput at roughly 1 qps regardless of stack latency. Tighter + /// alternatives tried: + /// + /// - `-q0`: nc exits before the UDP reply arrives, yielding 0 successes. + /// - `/dev/udp/HOST/PORT`: bash-specific; busybox ash does not support it. + /// - `timeout 0.1 nc ...`: `timeout` is not present in the test initramfs. + /// + /// A meaningful qps measurement requires a host-side UDP socket that sends + /// queries through SLIRP directly, bypassing the per-query nc process + /// spawn. 
Until that is implemented, `udp_dns_qps` is reported as `null` + /// in the JSON output. + async fn measure_dns_qps( + _sandbox: &Sandbox, + ) -> Result, Box> { + tracing::warn!( + "dns_qps: busybox-nc bottleneck (~1 qps due to -w1 per-query); \ + reporting null — replace with host-side UDP socket for real numbers" + ); + Ok(None) + } + + /// Measure ICMP echo round-trip latency. + /// + /// Currently a stub that returns `None`: the guest images intentionally + /// omit `/bin/ping` (busybox-static on Fedora lacks + /// `CONFIG_FEATURE_PING_TYPE_DGRAM`, and SOCK_RAW would require root in + /// the guest). A proper measurement path needs either a guest-agent RPC + /// or a custom static ICMP binary in the test image — tracked as a + /// follow-up. + async fn measure_icmp_rr_latency( + _sandbox: &Sandbox, + _iterations: u32, + ) -> Result, Box> { + tracing::warn!( + "icmp_rr_latency: guest-side ping unavailable (no /bin/ping symlink, \ + busybox-static lacks CONFIG_FEATURE_PING_TYPE_DGRAM); reporting null. \ + A host-driven ICMP measurement path is tracked as a follow-up." + ); + Ok(None) + } + + /// Host-side echo server for CRR latency. + /// + /// Accepts `count` independent connections in sequence. For each: starts the + /// timer on `accept`, reads one byte, writes it back, closes the connection, + /// and stops the timer. Returns all per-connection durations. + fn crr_echo_server(listener: &TcpListener, count: u32, deadline: Instant) -> Vec { + let mut samples = Vec::with_capacity(count as usize); + let mut buf = [0u8; 1]; + + for _ in 0..count { + let start = Instant::now(); + let Some((mut stream, _)) = accept_with_deadline(listener, deadline) else { + break; + }; + // Read the request byte and echo it back. + if stream.read_exact(&mut buf).is_ok() { + let _ = stream.write_all(&buf); + } + // Explicit drop closes the connection. 
+ drop(stream); + samples.push(start.elapsed()); + } + + samples + } +} // mod linux_main + +#[cfg(target_os = "linux")] +fn main() -> Result<(), Box> { + linux_main::main_impl() +} diff --git a/src/bin/voidbox-startup-bench/main.rs b/src/bin/voidbox-startup-bench/main.rs index 72cd02e6..4380bf10 100644 --- a/src/bin/voidbox-startup-bench/main.rs +++ b/src/bin/voidbox-startup-bench/main.rs @@ -83,7 +83,7 @@ async fn main() -> Result<(), Box> { ); if !warm_only { - eprintln!("\n-- Phase 1: cold boot --"); + eprintln!("\n-- cold boot --"); let mut cold: Vec = Vec::with_capacity(iters); for i in 0..iters { // Route console to a file only on the very first iteration so we @@ -109,7 +109,7 @@ async fn main() -> Result<(), Box> { } if !cold_only { - eprintln!("\n-- Phase 2: warm (snapshot-restore) --"); + eprintln!("\n-- warm (snapshot-restore) --"); let tmp = tempfile::tempdir()?; let snap_path = capture_snapshot(memory_mb, tmp.path()).await?; eprintln!("captured snapshot at: {}", snap_path.display()); @@ -138,10 +138,19 @@ async fn capture_snapshot( memory_mb: usize, dir: &std::path::Path, ) -> Result> { + // `enable_snapshots(true)` flips the backend selector at + // `backend/kvm.rs:212` to `VsockBackendType::Userspace`. Without + // this, the cold boot uses vhost-vsock and the snapshot file + // captures vhost-shaped state — but `from_snapshot` always + // restores into the userspace backend, producing a mismatch that + // surfaces as `control_channel: deadline reached` on the warm + // phase (vhost's vring state lives in the host kernel's + // vhost-vsock module and isn't part of our snapshot at all). let sandbox = Sandbox::local() .from_env()? .memory_mb(memory_mb) .network(false) + .enable_snapshots(true) .build()?; // Trigger cold boot. 
let _ = sandbox.exec("sh", &["-c", ":"]).await?; diff --git a/src/daemon.rs b/src/daemon.rs index ffa42d5d..20f7a2be 100644 --- a/src/daemon.rs +++ b/src/daemon.rs @@ -1373,7 +1373,7 @@ async fn spawn_service_run( let mut published = false; let mut terminalized = false; - // Phase 1: Wait for output publication OR exit OR watchdog. + // Wait for output publication OR exit OR watchdog. tokio::select! { output_result = &mut output_rx => { if let Ok(publication) = output_result { diff --git a/src/devices/virtio_net.rs b/src/devices/virtio_net.rs index 8cd48d0b..71214d47 100644 --- a/src/devices/virtio_net.rs +++ b/src/devices/virtio_net.rs @@ -13,7 +13,8 @@ use std::sync::{Arc, Mutex}; use tracing::{debug, trace, warn}; use vm_memory::{Address, Bytes, GuestAddress, GuestMemory}; -use crate::network::slirp::{SlirpStack, GUEST_MAC}; +use crate::network::slirp::GUEST_MAC; +use crate::network::NetworkBackend; use crate::Result; /// Virtio descriptor flags @@ -142,8 +143,8 @@ struct QueueState { /// Virtio-net device state pub struct VirtioNetDevice { - /// SLIRP stack for networking - slirp: Arc>, + /// Network backend (SLIRP or any [`NetworkBackend`] impl) + slirp: Arc>, /// Guest MAC address mac: [u8; 6], /// Device features @@ -166,6 +167,8 @@ pub struct VirtioNetDevice { tx_queue: QueueState, /// Packets waiting to be received by guest rx_buffer: Vec>, + /// Scratch buffer reused across `drain_to_guest` calls to avoid per-poll allocation + rx_scratch: Vec>, /// MMIO base address mmio_base: u64, /// MMIO size @@ -181,8 +184,8 @@ pub struct VirtioNetDevice { } impl VirtioNetDevice { - /// Create a new virtio-net device with SLIRP backend - pub fn new(slirp: Arc>) -> Result { + /// Create a new virtio-net device with the given network backend + pub fn new(slirp: Arc>) -> Result { debug!("Creating virtio-net device with SLIRP backend"); let device_features = features::VIRTIO_NET_F_MAC @@ -208,6 +211,7 @@ impl VirtioNetDevice { ..Default::default() }, rx_buffer: 
Vec::new(), + rx_scratch: Vec::new(), mmio_base: 0, mmio_size: 0x200, tx_avail_idx: 0, @@ -656,11 +660,13 @@ impl VirtioNetDevice { /// Get frames waiting to be received by guest (RX path) pub fn get_rx_frames(&mut self) -> Vec> { - // Poll SLIRP for new packets - let frames = { - let mut slirp = self.slirp.lock().unwrap(); - slirp.poll() - }; + // Drain backend frames into the reused scratch buffer. + self.rx_scratch.clear(); + { + let mut backend = self.slirp.lock().unwrap(); + backend.drain_to_guest(&mut self.rx_scratch); + } + let frames = std::mem::take(&mut self.rx_scratch); // Prepend virtio-net header to each frame let mut result = Vec::new(); @@ -779,11 +785,35 @@ impl VirtioNetDevice { pub fn mac(&self) -> &[u8; 6] { &self.mac } + + /// Return the epoll dispatch instance from the underlying network backend, + /// if the backend is a `SlirpBackend` (Linux only). + /// + /// `net_poll_thread` uses this to block on `epoll_wait` instead of + /// sleeping, waking immediately when host sockets become readable. + #[cfg(target_os = "linux")] + pub fn epoll_arc( + &self, + ) -> Option> { + let backend = self.slirp.lock().unwrap(); + backend.epoll_arc() + } + + /// Forward ready epoll events into the network backend's per-tick queue. + /// + /// Called by net_poll_thread after each epoll_wait returns so that + /// drain_to_guest can process events without re-locking EpollDispatch. 
+ #[cfg(target_os = "linux")] + pub fn push_events_to_backend(&self, events: &[crate::network::epoll_dispatch::EpollEvent]) { + let backend = self.slirp.lock().unwrap(); + backend.push_ready_events(events); + } } #[cfg(test)] mod tests { use super::*; + use crate::network::slirp::SlirpBackend; #[test] fn test_virtio_net_header() { @@ -798,7 +828,8 @@ mod tests { #[test] fn test_mmio_magic() { - let slirp = Arc::new(Mutex::new(SlirpStack::new().unwrap())); + let slirp: Arc> = + Arc::new(Mutex::new(SlirpBackend::new().unwrap())); let device = VirtioNetDevice::new(slirp).unwrap(); let mut data = [0u8; 4]; @@ -809,7 +840,8 @@ mod tests { #[test] fn test_mmio_version() { - let slirp = Arc::new(Mutex::new(SlirpStack::new().unwrap())); + let slirp: Arc> = + Arc::new(Mutex::new(SlirpBackend::new().unwrap())); let device = VirtioNetDevice::new(slirp).unwrap(); let mut data = [0u8; 4]; @@ -820,7 +852,8 @@ mod tests { #[test] fn test_device_type() { - let slirp = Arc::new(Mutex::new(SlirpStack::new().unwrap())); + let slirp: Arc> = + Arc::new(Mutex::new(SlirpBackend::new().unwrap())); let device = VirtioNetDevice::new(slirp).unwrap(); let mut data = [0u8; 4]; diff --git a/src/network/epoll_dispatch.rs b/src/network/epoll_dispatch.rs new file mode 100644 index 00000000..046f9510 --- /dev/null +++ b/src/network/epoll_dispatch.rs @@ -0,0 +1,387 @@ +//! Linux epoll-driven readiness dispatch for SLIRP host sockets. +//! +//! Owns one `epoll_fd` plus an eagerly-initialized self-pipe. Callers +//! register socket FDs with a `FlowToken` (a 64-bit identifier the +//! dispatcher returns on readiness). The poll thread calls +//! `wait_with_timeout` to block until any registered FD is ready or the +//! timeout fires, then drains the events into a caller-owned buffer. +//! +//! `EpollDispatch` is `Sync`: the Linux kernel serializes concurrent +//! `epoll_ctl` and `epoll_wait` calls on the same epoll fd internally. +//! Callers can therefore share one `Arc` across threads +//! 
and call `register`/`unregister` without an outer `Mutex`, eliminating +//! the lock-contention between `wait_with_timeout` (net-poll thread) and +//! `register` (vCPU thread handling new TCP SYNs). +//! +//! Why no crate? The standard `mio`/`tokio` story would pull in a +//! reactor + a runtime that the SLIRP poll loop does not need. +//! `libc::epoll_*` is two syscalls, fully observable, and the surface +//! fits in ~200 lines. + +use std::io; +use std::os::fd::{AsRawFd, FromRawFd, OwnedFd, RawFd}; +use std::sync::atomic::{AtomicUsize, Ordering}; +use std::sync::Arc; +use std::time::Duration; + +/// Opaque per-FD identifier the caller uses to look up which flow a +/// readiness event belongs to. Encoded into `epoll_data.u64`. +pub type FlowToken = u64; + +/// One readiness event, mapped from `libc::epoll_event`. +#[allow(dead_code)] +#[derive(Debug, Clone, Copy)] +pub struct EpollEvent { + pub token: FlowToken, + pub readable: bool, + pub writable: bool, +} + +/// Direction of interest for an `EpollDispatch::register` call. +/// +/// Closed enum lets the type system reject impossible combinations (e.g. +/// "neither read nor write") at compile time and gives a clear name to +/// each mode rather than two opaque booleans. +#[derive(Debug, Clone, Copy, PartialEq, Eq)] +pub enum RegisterMode { + /// Wake on EPOLLIN only. + Read, + /// Wake on EPOLLOUT only. + Write, + /// Wake on either EPOLLIN or EPOLLOUT. + ReadWrite, +} + +/// Sentinel token reserved for the self-pipe wakeup mechanism. +/// Never returned to callers. +const SELF_PIPE_TOKEN: FlowToken = u64::MAX; + +/// `EpollDispatch` is `Sync`: concurrent `epoll_ctl` and `epoll_wait` +/// on the same epoll fd are kernel-serialized and safe from multiple +/// threads. The only shared state beyond the fd is `registered_count` +/// (an `AtomicUsize`) and the self-pipe (immutable after construction). 
+pub struct EpollDispatch { + epoll_fd: OwnedFd, + /// Read end of the self-pipe; registered with EPOLLIN at construction. + read_end: OwnedFd, + /// Cloneable waker backed by the write end of the self-pipe. + waker_handle: Arc, + /// Number of user-registered FDs (excludes the self-pipe). + registered_count: AtomicUsize, +} + +// SAFETY: All mutable state is either atomic or only accessed from one +// thread at a time (epoll_ctl/epoll_wait are kernel-serialized on the fd). +unsafe impl Sync for EpollDispatch {} + +impl EpollDispatch { + /// Create a new epoll instance with `EPOLL_CLOEXEC` and eagerly + /// initialize the self-pipe so `waker()` is lock-free. + pub fn new() -> io::Result { + // SAFETY: `epoll_create1` returns -1 on error and a valid fd + // otherwise. We wrap into OwnedFd so Drop closes it. + let raw = unsafe { libc::epoll_create1(libc::EPOLL_CLOEXEC) }; + if raw < 0 { + return Err(io::Error::last_os_error()); + } + let epoll_fd = unsafe { OwnedFd::from_raw_fd(raw) }; + + // Eagerly create the self-pipe and register its read end. + // This avoids the lazy-init branch in the hot path and lets + // `waker()` take `&self` instead of `&mut self`. + let (read_fd, write_fd) = create_pipe2_nonblock_cloexec(); + let mut ev = libc::epoll_event { + events: libc::EPOLLIN as u32, + u64: SELF_PIPE_TOKEN, + }; + // SAFETY: epoll_ctl ADD with a valid fd and event struct. + let epoll_ctl_result = unsafe { + libc::epoll_ctl( + epoll_fd.as_raw_fd(), + libc::EPOLL_CTL_ADD, + read_fd.as_raw_fd(), + &mut ev as *mut _, + ) + }; + if epoll_ctl_result < 0 { + return Err(io::Error::last_os_error()); + } + + Ok(Self { + epoll_fd, + read_end: read_fd, + waker_handle: Arc::new(write_fd), + registered_count: AtomicUsize::new(0), + }) + } + + /// Register `fd` with the dispatcher under `token` for the requested + /// readiness `mode`. `token` is opaque to the dispatcher — returned + /// verbatim on readiness events. 
+ /// + /// Thread-safe: concurrent calls with `unregister` and + /// `wait_with_timeout` are serialized by the kernel's per-epoll-fd lock. + pub fn register(&self, fd: RawFd, token: FlowToken, mode: RegisterMode) -> io::Result<()> { + let events: u32 = match mode { + RegisterMode::Read => libc::EPOLLIN as u32, + RegisterMode::Write => libc::EPOLLOUT as u32, + RegisterMode::ReadWrite => (libc::EPOLLIN | libc::EPOLLOUT) as u32, + }; + let mut ev = libc::epoll_event { events, u64: token }; + // SAFETY: epoll_ctl reads `ev` for ADD; we own `fd` for the + // lifetime of the registration (caller's contract). + let epoll_ctl_result = unsafe { + libc::epoll_ctl( + self.epoll_fd.as_raw_fd(), + libc::EPOLL_CTL_ADD, + fd, + &mut ev as *mut _, + ) + }; + if epoll_ctl_result < 0 { + return Err(io::Error::last_os_error()); + } + if token != SELF_PIPE_TOKEN { + self.registered_count.fetch_add(1, Ordering::Relaxed); + } + Ok(()) + } + + /// Thread-safe: concurrent calls with `register` and `wait_with_timeout` + /// are serialized by the kernel's per-epoll-fd lock. + pub fn unregister(&self, fd: RawFd) -> io::Result<()> { + // SAFETY: epoll_ctl ignores the event pointer for DEL but + // still requires it to be non-null on older kernels. + let mut ev = libc::epoll_event { events: 0, u64: 0 }; + let epoll_ctl_result = unsafe { + libc::epoll_ctl( + self.epoll_fd.as_raw_fd(), + libc::EPOLL_CTL_DEL, + fd, + &mut ev as *mut _, + ) + }; + if epoll_ctl_result < 0 { + return Err(io::Error::last_os_error()); + } + self.registered_count.fetch_sub(1, Ordering::Relaxed); + Ok(()) + } + + /// Returns the number of user-registered FDs (excludes the self-pipe). + #[cfg(any(test, feature = "bench-helpers"))] + pub(crate) fn registered_fd_count(&self) -> usize { + self.registered_count.load(Ordering::Relaxed) + } + + /// Block up to `timeout` for any registered FD to become ready. + /// Drains ready events into `out` (cleared first). 
Returns the + /// number of raw kernel events (including self-pipe wakes) so callers + /// can use it for adaptive-timeout decisions. + /// + /// `timeout = Duration::ZERO` is a non-blocking poll. + /// + /// Self-pipe events are drained to EAGAIN in-place: no extra allocation. + pub fn wait_with_timeout( + &self, + out: &mut Vec, + timeout: Duration, + ) -> io::Result { + out.clear(); + + // Pre-allocate a fixed-size event buffer. 64 ready FDs per + // wait is more than enough for our flow counts; events not + // returned this round will surface on the next wait. + let mut raw_events: [libc::epoll_event; 64] = [libc::epoll_event { events: 0, u64: 0 }; 64]; + + let timeout_ms: i32 = timeout.as_millis().min(i32::MAX as u128) as i32; + + // SAFETY: epoll_wait writes up to raw_events.len() entries; + // returns -1 on error, 0 on timeout, n>0 on events. + let n = unsafe { + libc::epoll_wait( + self.epoll_fd.as_raw_fd(), + raw_events.as_mut_ptr(), + raw_events.len() as i32, + timeout_ms, + ) + }; + if n < 0 { + // EINTR is non-fatal — caller can retry on next tick. + let err = io::Error::last_os_error(); + if err.raw_os_error() == Some(libc::EINTR) { + return Ok(0); + } + return Err(err); + } + + let raw_count = n as usize; + let mut drained_pipe = false; + + // Single pass: filter self-pipe events (draining the pipe to EAGAIN + // on first occurrence), push real events into `out`. + // No extra allocation: `out` was cleared at the top of this function. + for &raw in &raw_events[..raw_count] { + if raw.u64 == SELF_PIPE_TOKEN { + if !drained_pipe { + // Drain the self-pipe to EAGAIN so EPOLLIN is not + // re-asserted on the next wait. A single read is + // insufficient when wakes arrive faster than we drain + // (burst connection setup), so loop until read returns + // ≤ 0 or a partial fill (pipe empty). + let mut scratch = [0u8; 64]; + loop { + // SAFETY: read from O_NONBLOCK pipe; + // EAGAIN / EOF terminates the loop. 
+ let r = unsafe { + libc::read( + self.read_end.as_raw_fd(), + scratch.as_mut_ptr() as *mut _, + scratch.len(), + ) + }; + if r <= 0 || (r as usize) < scratch.len() { + break; + } + } + drained_pipe = true; + } + continue; + } + out.push(EpollEvent { + token: raw.u64, + readable: (raw.events & libc::EPOLLIN as u32) != 0, + writable: (raw.events & libc::EPOLLOUT as u32) != 0, + }); + } + + Ok(raw_count) + } + + /// Returns a `Waker` that, when called, unblocks any thread + /// currently inside `wait_with_timeout`. The waker is cheap to + /// clone and may be stored across threads. + pub fn waker(&self) -> Waker { + Waker { + write_end: self.waker_handle.clone(), + } + } + + #[cfg(test)] + fn epoll_fd_for_test(&self) -> RawFd { + self.epoll_fd.as_raw_fd() + } +} + +/// Cloneable wakeup handle for `EpollDispatch`. Writing one byte to +/// the underlying pipe wakes a thread blocked in `wait_with_timeout`. +#[derive(Debug, Clone)] +pub struct Waker { + write_end: Arc, +} + +impl Waker { + pub fn wake(&self) { + let buf = [0u8; 1]; + // SAFETY: write to a non-blocking pipe never blocks. We + // ignore EAGAIN — the pipe already has bytes pending, which + // means a wakeup is already queued. + let _ = unsafe { libc::write(self.write_end.as_raw_fd(), buf.as_ptr() as *const _, 1) }; + } +} + +fn create_pipe2_nonblock_cloexec() -> (OwnedFd, OwnedFd) { + let mut fds = [0 as RawFd; 2]; + // SAFETY: pipe2 with O_NONBLOCK | O_CLOEXEC writes two fds into fds. 
+    let rc = unsafe { libc::pipe2(fds.as_mut_ptr(), libc::O_NONBLOCK | libc::O_CLOEXEC) };
+    assert!(rc == 0, "pipe2 failed: {}", io::Error::last_os_error());
+    let read_end = unsafe { OwnedFd::from_raw_fd(fds[0]) };
+    let write_end = unsafe { OwnedFd::from_raw_fd(fds[1]) };
+    (read_end, write_end)
+}
+
+#[cfg(test)]
+mod tests {
+    use super::*;
+    use std::os::fd::AsRawFd;
+
+    #[test]
+    fn dispatch_new_creates_epoll_fd() {
+        let dispatch = EpollDispatch::new().expect("EpollDispatch::new");
+        assert!(dispatch.epoll_fd_for_test() >= 0);
+    }
+
+    #[test]
+    fn register_then_unregister_round_trip() {
+        use std::net::TcpListener;
+        let listener = TcpListener::bind("127.0.0.1:0").expect("bind");
+        let dispatch = EpollDispatch::new().expect("EpollDispatch::new");
+        let token: FlowToken = 0xDEAD_BEEF;
+        dispatch
+            .register(listener.as_raw_fd(), token, RegisterMode::Read)
+            .expect("register");
+        dispatch
+            .unregister(listener.as_raw_fd())
+            .expect("unregister");
+    }
+
+    #[test]
+    fn register_invalid_fd_returns_error() {
+        let dispatch = EpollDispatch::new().expect("EpollDispatch::new");
+        let result = dispatch.register(-1, 0, RegisterMode::Read);
+        assert!(result.is_err());
+    }
+
+    #[test]
+    fn wait_returns_event_when_socket_becomes_readable() {
+        use std::io::Write;
+        use std::net::{TcpListener, TcpStream};
+        let listener = TcpListener::bind("127.0.0.1:0").expect("bind");
+        let addr = listener.local_addr().unwrap();
+        let server = std::thread::spawn(move || {
+            let (mut sock, _) = listener.accept().unwrap();
+            sock.write_all(b"hi").unwrap();
+        });
+        let stream = TcpStream::connect(addr).expect("connect");
+        server.join().unwrap();
+
+        let dispatch = EpollDispatch::new().expect("new");
+        dispatch
+            .register(stream.as_raw_fd(), 0xCAFE, RegisterMode::Read)
+            .expect("register");
+
+        let mut events: Vec<EpollEvent> = Vec::new();
+        let n = dispatch
+            .wait_with_timeout(&mut events, Duration::from_secs(1))
+            .expect("wait");
+        assert_eq!(n, 1);
+        assert_eq!(events[0].token, 0xCAFE);
assert!(events[0].readable); + } + + #[test] + fn wakeup_unblocks_wait_immediately() { + use std::time::Instant; + let dispatch = EpollDispatch::new().expect("new"); + let waker = dispatch.waker(); + + // Start the wait in another thread with a long timeout. + let wait_thread = std::thread::spawn(move || -> std::time::Duration { + let mut events: Vec = Vec::new(); + let start = Instant::now(); + let _ = dispatch.wait_with_timeout(&mut events, Duration::from_secs(5)); + start.elapsed() + }); + + // Wake immediately. + std::thread::sleep(Duration::from_millis(10)); + waker.wake(); + + let elapsed = wait_thread.join().expect("wait thread"); + // Wait thread should return well under the 5 s timeout. + assert!( + elapsed < Duration::from_secs(1), + "wait did not return on wakeup: {elapsed:?}" + ); + } +} diff --git a/src/network/mod.rs b/src/network/mod.rs index d884ec6b..fa498280 100644 --- a/src/network/mod.rs +++ b/src/network/mod.rs @@ -6,9 +6,12 @@ //! - virtio-net configuration //! - Network isolation and NAT +pub(crate) mod epoll_dispatch; +pub mod nat; pub mod slirp; use std::ffi::CString; +use std::io; use crate::{Error, Result}; @@ -63,6 +66,55 @@ impl NetworkConfig { } } +/// A network backend processes raw Ethernet frames between guest and host. +/// +/// Implementations must be `Send` so they can be held behind +/// `Arc>` and accessed from both the vCPU thread (TX path) and +/// the net-poll thread (RX path). +pub trait NetworkBackend: Send { + /// Process a raw Ethernet frame sent by the guest. + /// + /// Called from the vCPU thread on MMIO write to the TX virtqueue. + /// Implementations must not block. + fn process_guest_frame(&mut self, frame: &[u8]) -> io::Result<()>; + + /// Drain Ethernet frames destined for the guest into `out`. + /// + /// Called every ~5ms from the net-poll thread. Frames are + /// complete Ethernet payloads — no virtio-net header (the caller + /// prepends that). 
The buffer is reused across calls to avoid + /// per-poll allocation. + fn drain_to_guest(&mut self, out: &mut Vec>); + + /// Return the backend health status. + /// + /// `false` means the backend has entered an unrecoverable state + /// and should be reconstructed by the caller. The default + /// implementation always returns `true`. + fn is_healthy(&self) -> bool { + true + } + + /// Return the epoll dispatch instance shared by this backend, if any. + /// + /// Only `SlirpBackend` returns `Some`; other backends (mock, future + /// alternatives) return `None`. `net_poll_thread` uses this to block on + /// `epoll_wait` instead of sleeping, reducing host CPU burn between + /// network events. + #[cfg(target_os = "linux")] + fn epoll_arc(&self) -> Option> { + None + } + + /// Push ready epoll events into the backend's per-tick queue. + /// + /// Called by net_poll_thread after each epoll_wait returns, so + /// drain_to_guest can consume them without re-locking EpollDispatch. + /// The default is a no-op; `SlirpBackend` overrides this. + #[cfg(target_os = "linux")] + fn push_ready_events(&self, _events: &[epoll_dispatch::EpollEvent]) {} +} + /// TAP device handle pub struct TapDevice { name: String, diff --git a/src/network/nat.rs b/src/network/nat.rs new file mode 100644 index 00000000..23932d10 --- /dev/null +++ b/src/network/nat.rs @@ -0,0 +1,176 @@ +//! Stateless address translation for SLIRP. +//! +//! Pure functions that map (guest-visible address, rules) → (host-side +//! `SocketAddr` to connect/bind to). No per-flow state lives here — +//! the flow table in `slirp.rs` owns that. Translation itself is a +//! function call. +//! +//! Mirrors passt's `fwd.c::nat_inbound` design: address rewrites are +//! pure functions of (address, rules), not per-flow state. The same +//! pure-function shape extends cleanly to IPv6 dual-stack and +//! port-forwarding without introducing per-flow mutable state. 
+
+use std::net::{Ipv4Addr, SocketAddr};
+
+use ipnet::Ipv4Net;
+use smoltcp::wire::Ipv4Address;
+
+/// Transport protocol discriminant for a port-forwarding rule.
+#[derive(Clone, Copy, Debug, PartialEq, Eq)]
+pub enum ForwardProto {
+    /// Transmission Control Protocol.
+    Tcp,
+    /// User Datagram Protocol.
+    Udp,
+}
+
+/// One inbound port-forwarding entry.
+#[derive(Clone, Copy, Debug, PartialEq, Eq)]
+pub struct PortForward {
+    /// Transport protocol; TCP or UDP.
+    pub proto: ForwardProto,
+    /// Host port to bind. Connections to `127.0.0.1:host_port` are
+    /// proxied into the guest at `guest_port`.
+    pub host_port: u16,
+    /// Guest port the forwarded connection terminates at.
+    pub guest_port: u16,
+}
+
+/// Outbound translation rules, derived once at `SlirpBackend`
+/// construction.
+#[derive(Clone, Debug, Default)]
+pub struct Rules {
+    /// If `true`, guest connections to the SLIRP gateway IP map to
+    /// `127.0.0.1` on the host. Today this is always `true`; left
+    /// configurable so a future TAP backend can flip it off.
+    pub gateway_loopback: bool,
+    /// CIDRs the guest is not allowed to connect to. Outbound packets
+    /// targeting these get `None` from [`translate_outbound`].
+    pub deny_cidrs: Vec<Ipv4Net>,
+    /// Inbound port forwards. Consulted by `SlirpBackend::new` to
+    /// spawn host listeners; not used by [`translate_outbound`].
+    pub port_forwards: Vec<PortForward>,
+}
+
+/// Translate an outbound packet's destination address.
+///
+/// Returns `Some(host_addr)` if the packet should be forwarded —
+/// loopback for the gateway IP, otherwise the original IP. Returns
+/// `None` if the destination is in the deny list.
+///
+/// # Examples
+///
+/// ```
+/// use ipnet::Ipv4Net;
+/// use smoltcp::wire::Ipv4Address;
+/// use void_box::network::nat::{Rules, translate_outbound};
+///
+/// let rules = Rules {
+///     gateway_loopback: true,
+///     deny_cidrs: vec!["169.254.0.0/16".parse().unwrap()],
+///     ..Default::default()
+/// };
+/// let gateway = Ipv4Address::new(10, 0, 2, 2);
+///
+/// // Gateway IP is rewritten to loopback.
+/// let addr = translate_outbound(&rules, gateway, 80, gateway).unwrap();
+/// assert_eq!(addr.ip().to_string(), "127.0.0.1");
+///
+/// // External IPs pass through unchanged.
+/// let ext = Ipv4Address::new(8, 8, 8, 8);
+/// let addr = translate_outbound(&rules, ext, 53, gateway).unwrap();
+/// assert_eq!(addr.ip().to_string(), "8.8.8.8");
+///
+/// // Deny-listed IPs return None.
+/// let metadata = Ipv4Address::new(169, 254, 169, 254);
+/// assert!(translate_outbound(&rules, metadata, 80, gateway).is_none());
+/// ```
+pub fn translate_outbound(
+    rules: &Rules,
+    dst: Ipv4Address,
+    dst_port: u16,
+    gateway_ip: Ipv4Address,
+) -> Option<SocketAddr> {
+    let dst_ipv4 = Ipv4Addr::from(dst.0);
+
+    // Deny-list check first — explicit block beats any other rule.
+ for cidr in &rules.deny_cidrs { + if cidr.contains(&dst_ipv4) { + return None; + } + } + + let host_ip = if rules.gateway_loopback && dst == gateway_ip { + Ipv4Addr::LOCALHOST + } else { + dst_ipv4 + }; + + Some(SocketAddr::from((host_ip, dst_port))) +} + +#[cfg(test)] +mod tests { + use super::*; + + fn gateway() -> Ipv4Address { + Ipv4Address::new(10, 0, 2, 2) + } + + fn rules_basic() -> Rules { + Rules { + gateway_loopback: true, + deny_cidrs: vec!["169.254.0.0/16".parse().unwrap()], + ..Default::default() + } + } + + #[test] + fn gateway_ip_maps_to_loopback() { + let gw = gateway(); + let addr = translate_outbound(&rules_basic(), gw, 80, gw).unwrap(); + assert_eq!(addr.ip().to_string(), "127.0.0.1"); + assert_eq!(addr.port(), 80); + } + + #[test] + fn external_ip_passes_through_unchanged() { + let gw = gateway(); + let ext = Ipv4Address::new(8, 8, 8, 8); + let addr = translate_outbound(&rules_basic(), ext, 53, gw).unwrap(); + assert_eq!(addr.ip().to_string(), "8.8.8.8"); + assert_eq!(addr.port(), 53); + } + + #[test] + fn deny_listed_ip_returns_none() { + let gw = gateway(); + let metadata = Ipv4Address::new(169, 254, 169, 254); + assert!(translate_outbound(&rules_basic(), metadata, 80, gw).is_none()); + } + + #[test] + fn gateway_loopback_false_passes_gateway_through() { + let gw = gateway(); + let rules = Rules { + gateway_loopback: false, + ..Default::default() + }; + let addr = translate_outbound(&rules, gw, 443, gw).unwrap(); + assert_eq!(addr.ip().to_string(), "10.0.2.2"); + assert_eq!(addr.port(), 443); + } + + #[test] + fn empty_deny_list_allows_all() { + let gw = gateway(); + let rules = Rules { + gateway_loopback: false, + deny_cidrs: vec![], + ..Default::default() + }; + let private = Ipv4Address::new(192, 168, 1, 1); + let addr = translate_outbound(&rules, private, 22, gw).unwrap(); + assert_eq!(addr.ip().to_string(), "192.168.1.1"); + } +} diff --git a/src/network/slirp.rs b/src/network/slirp.rs index c81974e2..4b0134e6 100644 --- 
a/src/network/slirp.rs +++ b/src/network/slirp.rs @@ -9,9 +9,18 @@ //! - DNS: 10.0.2.3 //! //! Architecture: +//! - Unified flow table: All TCP/UDP/ICMP echo flows live in a single +//! `flow_table: HashMap`. Per-protocol relay logic +//! dispatches on the FlowEntry variant. //! - ARP: custom handler responds as gateway for all 10.0.2.x IPs -//! - TCP: NAT proxy (raw packet parsing + host TCP sockets) -//! - UDP port 53 (DNS): forwarded to host resolver +//! - TCP: passt-style sequence-mirroring NAT (host→guest via +//! `recv(MSG_PEEK)` + ACK-driven consume; guest→host via direct +//! write + don't-ACK-on-WouldBlock TCP backpressure). No userspace +//! per-connection buffers — the host kernel's socket buffer holds +//! outstanding data. +//! - ICMP echo: relayed via unprivileged `SOCK_DGRAM IPPROTO_ICMP` +//! - UDP: per-flow connected sockets; DNS to 10.0.2.3:53 takes a +//! cached fast-path //! - Other: silently dropped //! //! The smoltcp library is used for its Ethernet/IPv4/TCP/UDP wire types @@ -19,11 +28,16 @@ use std::collections::HashMap; use std::collections::VecDeque; -use std::io::{Read, Write}; -use std::net::{SocketAddr, TcpStream, UdpSocket}; -use std::sync::{Arc, Mutex}; +use std::io::{self, Read, Write}; +use std::net::{Ipv4Addr, SocketAddr, TcpListener, TcpStream, UdpSocket}; +use std::os::fd::{AsRawFd, FromRawFd}; +use std::sync::atomic::{AtomicBool, AtomicU64, AtomicU8, Ordering}; +use std::sync::{mpsc, Arc, Mutex}; use std::time::{Duration, Instant}; +use crate::network::epoll_dispatch::{EpollDispatch, EpollEvent, RegisterMode, Waker}; +use crate::network::{nat, NetworkBackend}; + /// Cached DNS response with expiry. 
struct DnsCacheEntry { response: Vec, @@ -47,9 +61,9 @@ use smoltcp::iface::{Config, Interface, SocketSet}; use smoltcp::phy::{ChecksumCapabilities, Device, DeviceCapabilities, Medium, RxToken, TxToken}; use smoltcp::time::Instant as SmolInstant; use smoltcp::wire::{ - EthernetAddress, EthernetFrame, EthernetProtocol, EthernetRepr, HardwareAddress, IpAddress, - IpCidr, IpProtocol, Ipv4Address, Ipv4Packet, Ipv4Repr, TcpControl, TcpPacket, TcpRepr, - TcpSeqNumber, UdpPacket, + EthernetAddress, EthernetFrame, EthernetProtocol, EthernetRepr, HardwareAddress, Icmpv4Packet, + Icmpv4Repr, IpAddress, IpCidr, IpProtocol, Ipv4Address, Ipv4Packet, Ipv4Repr, TcpControl, + TcpPacket, TcpRepr, TcpSeqNumber, UdpPacket, UdpRepr, }; use tracing::{debug, trace, warn}; @@ -75,7 +89,73 @@ pub const GATEWAY_MAC: [u8; 6] = [0x52, 0x54, 0x00, 0x12, 0x34, 0x01]; const MTU: usize = 1500; const MAX_QUEUE_SIZE: usize = 64; const TCP_WINDOW: u16 = 65535; -const MAX_TO_HOST_BUFFER: usize = 256 * 1024; +const UDP_IDLE_TIMEOUT: Duration = Duration::from_secs(60); + +/// ICMP unprivileged probe state. +/// +/// `0` = unknown (not yet probed), `1` = available, `2` = unavailable +/// (kernel returned `EACCES` or `EPERM` — typically `net.ipv4.ping_group_range` +/// excludes the calling GID). Once set to `2`, `open_icmp_socket` short-circuits. +static ICMP_PROBE: AtomicU8 = AtomicU8::new(0); + +// ────────────────────────────────────────────────────────────────────── +// EpollDispatch flow tokens +// ────────────────────────────────────────────────────────────────────── + +/// High-byte protocol tag embedded in the upper 8 bits of a `FlowToken`. +/// The lower 56 bits are a monotonic per-flow counter (see `FLOW_TOKEN_COUNTER`). +/// The tag lets the relay loop distinguish protocol families with a bitmask +/// instead of a separate lookup; the counter guarantees global uniqueness +/// even when two flows share the same port tuple. 
+const PROTO_TAG_MASK: u64 = 0xFF00_0000_0000_0000; +const PROTO_TAG_TCP: u64 = 0x0100_0000_0000_0000; +const PROTO_TAG_UDP: u64 = 0x0200_0000_0000_0000; +const PROTO_TAG_ICMP: u64 = 0x0300_0000_0000_0000; +const PROTO_TAG_LISTEN: u64 = 0x0400_0000_0000_0000; + +/// Monotonic counter for flow token allocation. The lower 56 bits of each +/// `FlowToken` are drawn from here; the upper 8 bits carry `PROTO_TAG_*`. +/// 2^56 unique tokens are available before wrap — effectively infinite for +/// any realistic process lifetime. +static FLOW_TOKEN_COUNTER: AtomicU64 = AtomicU64::new(0); + +/// Allocate a fresh, globally unique `FlowToken` tagged for the given protocol. +/// +/// The lower 56 bits are drawn from a relaxed monotonic counter shared across +/// all `SlirpBackend` instances. The upper 8 bits carry `proto_tag` so relay +/// loops can demux by protocol without an additional map lookup. +fn next_flow_token(proto_tag: u64) -> u64 { + let counter = FLOW_TOKEN_COUNTER.fetch_add(1, Ordering::Relaxed) & 0x00FF_FFFF_FFFF_FFFF; + proto_tag | counter +} + +/// Build an epoll token for a port-forward listener FD. +/// +/// The high byte carries `PROTO_TAG_LISTEN`; the low 16 bits encode the +/// host port. Each port-forward rule has a distinct host port, so tokens +/// are unique across all registered listeners. +fn flow_token_for_listener(host_port: u16) -> u64 { + PROTO_TAG_LISTEN | u64::from(host_port) +} + +// ────────────────────────────────────────────────────────────────────── +// Inbound port-forward accept channel +// ────────────────────────────────────────────────────────────────────── + +/// One accepted host-side TCP connection waiting to be forwarded into the guest. +/// +/// Produced by [`SlirpBackend::process_listener_readiness`] (epoll-driven +/// accept) and consumed by [`SlirpBackend::process_pending_inbound_accepts`] +/// on the net-poll thread. +pub(crate) struct InboundAccept { + /// The accepted host-side TCP stream (non-blocking after accept). 
+ host_stream: TcpStream, + /// Ephemeral port used as the synthesized SYN source port on the gateway side. + /// Derived from the peer's remote port so it is unique per connection. + high_port: u16, + /// Guest-side destination port (the service the guest is listening on). + guest_port: u16, +} // ────────────────────────────────────────────────────────────────────── // TCP NAT connection tracking @@ -83,8 +163,13 @@ const MAX_TO_HOST_BUFFER: usize = 256 * 1024; #[derive(Debug, Clone, Copy, PartialEq)] #[allow(dead_code)] -enum TcpNatState { +pub(crate) enum TcpNatState { + /// Guest sent SYN; we responded with SYN-ACK; waiting for guest's + /// final ACK to complete the outbound 3-way handshake. SynReceived, + /// We synthesized a SYN to the guest (port-forwarding); waiting + /// for the guest's SYN-ACK to advance to Established. + SynSent, Established, FinWait1, FinWait2, @@ -94,7 +179,7 @@ enum TcpNatState { } /// Key for NAT table: (guest_src_port, dst_ip, dst_port) -#[derive(Debug, Clone, Hash, PartialEq, Eq)] +#[derive(Debug, Clone, Copy, Hash, PartialEq, Eq)] struct NatKey { guest_src_port: u16, dst_ip: Ipv4Address, @@ -108,13 +193,186 @@ struct TcpNatEntry { our_seq: u32, /// Last acknowledged guest sequence number guest_ack: u32, - /// Data received from host, pending delivery to guest - to_guest: Vec, - /// Data received from guest, pending write to host (buffered on EAGAIN) - to_host: Vec, - /// Guest sequence number to ACK once `to_host` is flushed - to_host_pending_ack: Option, last_activity: Instant, + /// Bytes sent to the guest but not yet ACK'd by the guest. + /// Equivalent to `our_seq - last_acked_seq`, stored explicitly so + /// the relay can decide how much new payload to peek+send each poll. + /// The ACK-driven consume path decrements this as the guest ACKs data. + bytes_in_flight: u32, + /// Globally unique epoll token for this flow. 
Allocated once on insert + /// via `next_flow_token(PROTO_TAG_TCP)` and stored here so unregister + /// sites never need to recompute it. + flow_token: u64, +} + +/// Key for the ICMP echo NAT table: (guest ICMP id, destination IP). +/// +/// The host kernel rewrites the ICMP id when sending through a +/// `SOCK_DGRAM IPPROTO_ICMP` socket; we keep the guest's original id here so +/// the reply frame can be translated back before injection. +#[derive(Clone, Copy, Debug, PartialEq, Eq, Hash)] +struct IcmpEchoKey { + guest_id: u16, + dst_ip: Ipv4Address, +} + +/// State for one in-flight ICMP echo request from the guest. +struct IcmpEchoEntry { + /// Host-side socket: `socket(AF_INET, SOCK_DGRAM, IPPROTO_ICMP)`. + /// Set non-blocking; the kernel handles ICMP framing — no + /// `CAP_NET_RAW` needed. + sock: std::net::UdpSocket, + /// The guest's original ICMP id from the echo request. The host kernel + /// rewrites the id to a kernel-assigned value when the `SOCK_DGRAM` + /// ICMP socket sends; we translate back to `guest_id` when emitting the + /// reply frame. + // Read in `relay_icmp_echo` when translating the reply frame. + guest_id: u16, + last_activity: Instant, + /// Globally unique epoll token for this flow. Allocated once on insert + /// via `next_flow_token(PROTO_TAG_ICMP)` and stored here so unregister + /// sites never need to recompute it. + flow_token: u64, +} + +/// Key for the UDP flow NAT table: (guest source port, destination IP, destination port). +/// +/// Each unique 3-tuple maps to its own connected `UdpSocket` on the host, +/// mirroring passt's `udp_flow_from_tap` per-flow design. +#[derive(Clone, Copy, Debug, PartialEq, Eq, Hash)] +struct UdpFlowKey { + guest_src_port: u16, + dst_ip: Ipv4Address, + dst_port: u16, +} + +/// State for one active UDP flow from the guest. +struct UdpFlowEntry { + /// Connected `UdpSocket`. The host kernel handles source-port + /// preservation and reply demux; we just `send` and `recv`. + /// Set non-blocking. 
+ sock: std::net::UdpSocket, + /// Last frame timestamp; read by Task 2.4 idle-timeout reaper. + last_activity: Instant, + /// Globally unique epoll token for this flow. Allocated once on insert + /// via `next_flow_token(PROTO_TAG_UDP)` and stored here so unregister + /// sites never need to recompute it. + flow_token: u64, +} + +/// Unified flow-table key. Each variant wraps the protocol-specific +/// key already defined elsewhere in this module — no field changes, +/// just one type the unified `flow_table` `HashMap` (added in Task 4.2) +/// can store. +#[derive(Clone, Copy, Debug, PartialEq, Eq, Hash)] +enum FlowKey { + Tcp(NatKey), + Udp(UdpFlowKey), + IcmpEcho(IcmpEchoKey), +} + +/// Unified flow-table value. Each variant wraps the protocol's existing +/// entry struct. +enum FlowEntry { + Tcp(TcpNatEntry), + Udp(UdpFlowEntry), + IcmpEcho(IcmpEchoEntry), +} + +/// Open an unprivileged ICMP socket (`SOCK_DGRAM IPPROTO_ICMP`). +/// +/// The kernel handles ICMP framing; `CAP_NET_RAW` is **not** required. +/// The socket is set `SOCK_NONBLOCK | SOCK_CLOEXEC` at creation time. +/// +/// Returns `Err` if the kernel rejects the call (e.g. the +/// `net.ipv4.ping_group_range` sysctl excludes the current GID). +/// After the first rejection, subsequent calls short-circuit and return +/// `PermissionDenied` without retrying the syscall. +fn open_icmp_socket() -> io::Result { + if ICMP_PROBE.load(Ordering::Relaxed) == 2 { + return Err(io::Error::new( + io::ErrorKind::PermissionDenied, + "ICMP unprivileged probe previously failed", + )); + } + // SAFETY: socket(2) returns -1 on error; we check before wrapping. + // IPPROTO_ICMP + SOCK_DGRAM is the unprivileged ICMP path: the kernel + // handles ICMP framing, no CAP_NET_RAW required. 
+ let raw = unsafe { + libc::socket( + libc::AF_INET, + libc::SOCK_DGRAM | libc::SOCK_NONBLOCK | libc::SOCK_CLOEXEC, + libc::IPPROTO_ICMP, + ) + }; + if raw < 0 { + let err = io::Error::last_os_error(); + let errno = err.raw_os_error(); + let unprivileged_icmp_forbidden = errno == Some(libc::EACCES) || errno == Some(libc::EPERM); + if unprivileged_icmp_forbidden { + // First failure transitions 0 → 2 and emits the warn-once log. + // swap returns the previous value; only log if we were the first + // to set it. + if ICMP_PROBE.swap(2, Ordering::Relaxed) != 2 { + warn!( + "SLIRP: unprivileged ICMP unavailable on this host \ + (sysctl net.ipv4.ping_group_range likely restricts \ + it); ICMP echo from guests will be dropped." + ); + } + } + return Err(err); + } + ICMP_PROBE.store(1, Ordering::Relaxed); + // SAFETY: `raw` is a valid fd from socket(2); UdpSocket adopts + // ownership and closes on drop. + Ok(unsafe { std::net::UdpSocket::from_raw_fd(raw) }) +} + +/// Open a connected UDP socket for one guest→host flow. +/// +/// Binds to an ephemeral port on `0.0.0.0`, sets non-blocking mode, +/// then calls `connect(dst)` so that: +/// - `send` delivers datagrams to `dst` without specifying the address each time. +/// - Incoming datagrams are filtered to replies from `dst` only, enabling +/// per-flow demux without an additional dispatch table. +/// +/// No `CAP_NET_RAW` required — `SOCK_DGRAM` UDP is fully unprivileged. +fn open_udp_flow_socket(dst: std::net::SocketAddr) -> io::Result { + let sock = std::net::UdpSocket::bind("0.0.0.0:0")?; + sock.set_nonblocking(true)?; + sock.connect(dst)?; + Ok(sock) +} + +/// Non-blocking `recv(MSG_PEEK)` on a `TcpStream`, returning the +/// number of bytes available without consuming them from the +/// kernel's recv queue. +/// +/// `std::net::TcpStream` does not expose `MSG_PEEK`; we go through +/// `libc::recv` directly. 
`MSG_DONTWAIT` keeps the call non-blocking +/// even if the underlying stream's `set_nonblocking` flag was +/// dropped at some intermediate point. +/// +/// Used by the passt-style host→guest TCP relay (Task 3.3): peek +/// what's in the kernel buffer, send the un-ACK'd portion to the +/// guest. Bytes stay in the kernel until the guest ACKs and Task +/// 3.4's ACK-driven `read()` consumes them. +fn recv_peek(stream: &TcpStream, buf: &mut [u8]) -> io::Result { + // SAFETY: `stream` outlives the syscall; `buf` is uniquely + // borrowed and `len` matches the slice length. + let n = unsafe { + libc::recv( + stream.as_raw_fd(), + buf.as_mut_ptr() as *mut libc::c_void, + buf.len(), + libc::MSG_PEEK | libc::MSG_DONTWAIT, + ) + }; + if n < 0 { + return Err(io::Error::last_os_error()); + } + Ok(n as usize) } // ────────────────────────────────────────────────────────────────────── @@ -237,13 +495,11 @@ fn parse_resolv_conf() -> Vec { // SLIRP Stack // ────────────────────────────────────────────────────────────────────── -pub struct SlirpStack { +pub struct SlirpBackend { queue: Arc>, iface: Interface, sockets: SocketSet<'static>, _device: VirtualDevice, - /// TCP NAT table - tcp_nat: HashMap, /// Frames to inject into guest (built by our NAT, not by smoltcp) inject_to_guest: Vec>, /// Maximum concurrent TCP connections allowed @@ -252,26 +508,83 @@ pub struct SlirpStack { max_connections_per_second: u32, /// Sliding window of recent connection timestamps for rate limiting connection_timestamps: VecDeque, - /// Network deny list (CIDR ranges that the guest cannot reach) - deny_list: Vec, + /// Stateless outbound translation rules (deny-list, gateway loopback, port forwards). + nat: nat::Rules, /// Host DNS servers (parsed from /etc/resolv.conf, fallback to public) dns_servers: Vec, /// DNS response cache keyed by the raw query bytes (question section) dns_cache: HashMap, DnsCacheEntry>, /// DNS queries waiting to be resolved on the net-poll thread. 
pending_dns: Vec, + /// Unified flow table keyed by protocol + port tuple. + /// + /// All three protocols (TCP, UDP, ICMP echo) share this table so a single + /// dispatch loop handles all active flows. + flow_table: HashMap, + /// Reverse map from `FlowToken` → `FlowKey` for O(1) readiness-event + /// dispatch. Maintained in sync with `flow_table`: every insert adds an + /// entry; every remove clears it. + token_to_key: HashMap, + /// Live `TcpListener`s for each TCP port-forward rule, keyed by host port. + /// The tuple value is `(listener, guest_port)`. Each listener's FD is + /// registered with `EpollDispatch` under `PROTO_TAG_LISTEN`; readiness + /// events drive the accept loop on the net-poll thread. No dedicated + /// polling thread per rule. + port_forward_listeners: HashMap, + /// Receiver end of the accept channel fed by + /// [`bind_port_forward_listeners`] via [`SlirpBackend::process_listener_readiness`]. + /// Processed on the net-poll thread in + /// [`SlirpBackend::process_pending_inbound_accepts`]. + pending_inbound_accepts: mpsc::Receiver, + /// Sender end of `pending_inbound_accepts`. Kept alive so the channel + /// stays open when no listener threads are running (e.g. in tests) and + /// so test helpers can inject [`InboundAccept`] values directly. + #[allow(dead_code)] + accept_sender: mpsc::Sender, + /// Epoll dispatcher for host socket readiness. `EpollDispatch` is + /// `Sync`: `register`/`unregister` and `wait_with_timeout` are + /// kernel-serialized on the same epoll fd, so no `Mutex` wrapper is + /// needed. The `Arc` lets the net-poll thread share the dispatcher + /// without holding the device lock. + epoll: Arc, + /// Cloneable waker that interrupts `EpollDispatch::wait_with_timeout`. + /// Used after flow-table mutations to unblock the poll thread immediately. + epoll_waker: Waker, + /// Ready events fed by the net-poll thread after each blocking + /// epoll_wait. 
drain_to_guest drains this on every call without + /// any EpollDispatch lock contention. + pending_events: Mutex>, + /// Flow keys queued for removal because their state advanced to + /// Closed in a non-relay code path (e.g. guest FIN/RST in + /// handle_tcp_frame). Drained at the bottom of relay_tcp_nat_data + /// without scanning the full flow_table. + pending_close: Vec, + /// Set to `true` the first time `push_ready_events` is called — + /// signals "an external poller (net_poll_thread) is feeding us + /// readiness events." When true, `drain_to_guest` skips its + /// non-blocking-poll fallback (one mutex op + one epoll_wait + /// syscall per call, ~310 ns overhead) and only consumes + /// `pending_events`. Tests/benches without a net_poll_thread + /// keep the fallback so synthetic harnesses still observe + /// readiness. + has_external_poller: AtomicBool, } -impl SlirpStack { +impl SlirpBackend { pub fn new() -> Result { - Self::with_security(64, 50, &["169.254.0.0/16".to_string()]) + Self::with_security(64, 50, &["169.254.0.0/16".to_string()], &[]) } /// Create a SLIRP stack with security parameters. + /// + /// `port_forwards` maps host ports to guest ports as `(host_port, guest_port)` pairs. + /// Each entry is stored in [`nat::Rules`] as a TCP forward rule; host listeners are + /// spawned in sub-task B (5.5b) and not yet active. 
pub fn with_security( max_concurrent_connections: usize, max_connections_per_second: u32, deny_list_cidrs: &[String], + port_forwards: &[(u16, u16)], ) -> Result { debug!("Creating SLIRP stack"); let queue = Arc::new(Mutex::new(PacketQueue::new())); @@ -296,8 +609,7 @@ impl SlirpStack { let sockets = SocketSet::new(vec![]); - // Parse deny list CIDRs - let deny_list: Vec = deny_list_cidrs + let deny_cidrs: Vec = deny_list_cidrs .iter() .filter_map(|cidr| { cidr.parse::() @@ -309,35 +621,63 @@ impl SlirpStack { }) .collect(); + let nat_port_forwards: Vec = port_forwards + .iter() + .map(|&(host_port, guest_port)| nat::PortForward { + proto: nat::ForwardProto::Tcp, + host_port, + guest_port, + }) + .collect(); + + let nat = nat::Rules { + gateway_loopback: true, + deny_cidrs, + port_forwards: nat_port_forwards, + }; + let dns_servers = parse_resolv_conf(); debug!( - "SLIRP stack created - Gateway: {}, DNS: {}, max_conn: {}, rate: {}/s, deny_list: {} CIDRs, dns_servers: {:?}", - SLIRP_GATEWAY_IP, SLIRP_DNS_IP, max_concurrent_connections, max_connections_per_second, deny_list.len(), dns_servers + "SLIRP stack created - Gateway: {}, DNS: {}, max_conn: {}, rate: {}/s, deny_list: {} CIDRs, port_forwards: {}, dns_servers: {:?}", + SLIRP_GATEWAY_IP, SLIRP_DNS_IP, max_concurrent_connections, max_connections_per_second, + nat.deny_cidrs.len(), nat.port_forwards.len(), dns_servers ); + let (accept_tx, accept_rx) = mpsc::channel::(); + + let epoll_inner = EpollDispatch::new()?; + let epoll_waker = epoll_inner.waker(); + let epoll = Arc::new(epoll_inner); + + // Bind listeners for port-forwards and register their FDs with epoll. 
+ let port_forward_listeners = bind_port_forward_listeners(&nat, &epoll); + Ok(Self { queue, iface, sockets, _device: device, - tcp_nat: HashMap::new(), inject_to_guest: Vec::new(), max_concurrent_connections, max_connections_per_second, connection_timestamps: VecDeque::new(), - deny_list, + nat, dns_servers, dns_cache: HashMap::new(), pending_dns: Vec::new(), + flow_table: HashMap::new(), + token_to_key: HashMap::new(), + port_forward_listeners, + pending_inbound_accepts: accept_rx, + accept_sender: accept_tx, + epoll, + epoll_waker, + pending_events: Mutex::new(Vec::new()), + pending_close: Vec::new(), + has_external_poller: AtomicBool::new(false), }) } - /// Check if a destination IP is blocked by the deny list. - fn is_denied(&self, ip: &Ipv4Address) -> bool { - let addr = std::net::Ipv4Addr::new(ip.0[0], ip.0[1], ip.0[2], ip.0[3]); - self.deny_list.iter().any(|net| net.contains(&addr)) - } - /// Check if a new connection is allowed by the rate limiter. /// Returns true if the connection is allowed. fn check_rate_limit(&mut self) -> bool { @@ -361,6 +701,133 @@ impl SlirpStack { true } + /// Drain the inbound-accept channel and seed a `SynSent` flow-table entry + /// plus a synthesized SYN frame for each accepted connection. + /// + /// Accept connections from any port-forward listeners whose FDs are ready + /// in `ready` and push them onto the inbound-accept channel for + /// [`process_pending_inbound_accepts`] to consume. + /// + /// Drains until `WouldBlock` so that a burst of connections arriving + /// between two epoll wakeups is not spread across multiple ticks. + fn process_listener_readiness(&mut self, ready: &[EpollEvent]) { + // Accepted connections are collected here first so that the borrow on + // `port_forward_listeners` ends before we call `accept_sender.send`. 
+ let mut accepted_batch: Vec = Vec::new(); + let mut sender_failed = false; + + for event in ready { + if !event.readable || event.token & PROTO_TAG_MASK != PROTO_TAG_LISTEN { + continue; + } + let host_port = (event.token & 0xFFFF) as u16; + let Some((listener, guest_port)) = self.port_forward_listeners.get(&host_port) else { + continue; + }; + let guest_port = *guest_port; + // Drain the listener — multiple connections may have arrived in one + // EPOLLIN edge. + loop { + match listener.accept() { + Ok((stream, peer_addr)) => { + let high_port = peer_addr.port(); + let _ = stream.set_nonblocking(true); + trace!( + host_port, + guest_port, + high_port, + peer = %peer_addr, + "SLIRP port-forward: accepted connection" + ); + accepted_batch.push(InboundAccept { + host_stream: stream, + high_port, + guest_port, + }); + } + Err(ref would_block) if would_block.kind() == io::ErrorKind::WouldBlock => { + break; + } + Err(accept_error) => { + warn!( + host_port, + error = %accept_error, + "SLIRP port-forward: accept error" + ); + break; + } + } + } + } + + // Borrow of `port_forward_listeners` has ended; send the batch. + for accepted in accepted_batch { + if self.accept_sender.send(accepted).is_err() { + sender_failed = true; + break; + } + } + let _ = sender_failed; // receiver drop handled gracefully on next tick + } + + /// Called at the top of [`drain_to_guest`] so all `SlirpBackend` mutation + /// stays on the net-poll thread — same single-writer lock model as the rest + /// of the relay pipeline. `process_listener_readiness` enqueues accepted + /// connections via the mpsc channel; this method drains that channel and + /// seeds the flow table. 
+ fn process_pending_inbound_accepts(&mut self) { + loop { + let accepted = match self.pending_inbound_accepts.try_recv() { + Ok(accepted) => accepted, + Err(mpsc::TryRecvError::Empty) => break, + Err(mpsc::TryRecvError::Disconnected) => break, + }; + let InboundAccept { + host_stream, + high_port, + guest_port, + } = accepted; + let our_isn = rand_seq(); + let key = NatKey { + guest_src_port: guest_port, + dst_ip: SLIRP_GATEWAY_IP, + dst_port: high_port, + }; + let token = next_flow_token(PROTO_TAG_TCP); + let entry = TcpNatEntry { + host_stream, + state: TcpNatState::SynSent, + our_seq: our_isn, + guest_ack: 0, + last_activity: Instant::now(), + bytes_in_flight: 0, + flow_token: token, + }; + let host_fd = entry.host_stream.as_raw_fd(); + let flow_key = FlowKey::Tcp(key); + self.flow_table.insert(flow_key, FlowEntry::Tcp(entry)); + self.token_to_key.insert(token, flow_key); + if let Err(e) = self.epoll.register(host_fd, token, RegisterMode::Read) { + warn!( + host_port = high_port, + guest_port, + fd = host_fd, + error = %e, + "SLIRP port-forward: epoll register failed; flow present but readiness-driven relay disabled" + ); + } + self.epoll_waker.wake(); + let syn_frame = synthesize_inbound_syn(high_port, guest_port, our_isn); + self.inject_to_guest.push(syn_frame); + trace!( + host_port = high_port, + guest_port, + our_isn, + "SLIRP port-forward: seeded SynSent entry" + ); + } + } + // ── Public API ────────────────────────────────────────────────── /// Process an ethernet frame from the guest @@ -374,6 +841,15 @@ impl SlirpStack { Err(_) => return Ok(()), }; + // Track inject_to_guest growth so we can wake the net-poll + // thread if this call queued any frames. The poll thread blocks + // in epoll_wait waiting on FD readiness; an ACK queued during + // guest TX has no FD-side signal (the guest is the writer, not + // the reader on the SLIRP-side socket). 
Without an explicit + // wake the ACK sits up to epoll_wait's timeout before being + // flushed — TCP send window stalls, throughput drops 10×. + let inject_len_before = self.inject_to_guest.len(); + match eth.ethertype() { EthernetProtocol::Arp => { self.handle_arp_frame(frame)?; @@ -385,30 +861,84 @@ impl SlirpStack { trace!("SLIRP: ignoring ethertype {:?}", eth.ethertype()); } } + + if self.inject_to_guest.len() > inject_len_before { + self.epoll_waker.wake(); + } Ok(()) } - /// Poll the stack. Returns ethernet frames to send to the guest. - pub fn poll(&mut self) -> Vec> { - // Check rx_queue size before polling + /// Drain frames destined to the guest into `out`, reusing the caller's + /// buffer across calls and avoiding a fresh allocation on every tick. + /// + /// See [`crate::network::NetworkBackend::drain_to_guest`]. + pub fn drain_to_guest(&mut self, out: &mut Vec>) { + // Check rx_queue size before polling. let rx_count = { let q = self.queue.lock().unwrap(); q.rx_queue.len() }; - // 1. Let smoltcp handle ARP + // 1. Let smoltcp handle ARP. let ts = smol_instant_now(); let mut dev = VirtualDevice::new(self.queue.clone()); let changed = self.iface.poll(ts, &mut dev, &mut self.sockets); - // 2. Resolve pending DNS queries (off vCPU thread) + // 2. Resolve pending DNS queries (off vCPU thread). self.resolve_pending_dns(); - // 3. Process TCP NAT data relay - self.relay_tcp_nat_data(); + // 3. Collect ready events. + // + // Always drain `pending_events` first — that's the queue + // `net_poll_thread` fills via `push_ready_events` after every + // successful `epoll_wait`. If we skipped this and only polled + // epoll directly, we would lose every event the net-poll thread + // already drained: level-triggered EPOLLIN doesn't re-fire for + // data the kernel already reported, so the next non-blocking + // poll returns 0 events even when there's work to do. 
CRR + // connections then wait one full 50 ms epoll cycle for the NEXT + // data event before their first data is relayed. + // + // Then, only if no net-poll thread has populated the queue + // (unit tests / benches), fall back to a non-blocking poll on + // the epoll FD ourselves. `try_lock` keeps that fallback safe + // under contention. + let ready: Vec = { + let mut events: Vec = { + let mut queue = self.pending_events.lock().unwrap(); + std::mem::take(&mut *queue) + }; + // Fallback non-blocking poll only when no external poller + // (net_poll_thread) is feeding us events — otherwise we'd + // pay one mutex op + one epoll_wait syscall per call + // (~310 ns) for nothing. The flag is one-way: set by the + // first push_ready_events and stays set for the backend's + // lifetime. + if events.is_empty() && !self.has_external_poller.load(Ordering::Relaxed) { + let _ = self + .epoll + .wait_with_timeout(&mut events, std::time::Duration::ZERO); + } + events + }; + + // 0a. Accept any newly-ready listener connections (may push into + // accept_sender for the next step). + self.process_listener_readiness(&ready); + + // 0b. Drain the accept channel (epoll-driven listeners + test helpers). + self.process_pending_inbound_accepts(); + + // 4. Process TCP NAT data relay. + self.relay_tcp_nat_data(&ready); + + // 5. Relay ICMP echo replies from host sockets back to the guest. + self.relay_icmp_echo(&ready); - // 4. Collect frames: smoltcp ARP responses + our NAT-built frames - let mut frames = Vec::new(); + // 6. Relay UDP flow replies from host sockets back to the guest. + self.relay_udp_flows(&ready); + + // 7. Collect frames: smoltcp ARP responses + our NAT-built frames. 
{ let mut q = self.queue.lock().unwrap(); if !q.tx_queue.is_empty() || rx_count > 0 { @@ -420,11 +950,24 @@ impl SlirpStack { self.inject_to_guest.len() ); } - frames.append(&mut q.tx_queue); + out.append(&mut q.tx_queue); } - frames.append(&mut self.inject_to_guest); + out.append(&mut self.inject_to_guest); + } - frames + /// Poll the stack and return ethernet frames to send to the guest. + /// + /// # Deprecated + /// + /// Allocates a fresh [`Vec`] on every call. Prefer [`drain_to_guest`], + /// which writes into a caller-supplied buffer and avoids the allocation. + /// + /// [`drain_to_guest`]: SlirpBackend::drain_to_guest + #[deprecated(note = "use drain_to_guest")] + pub fn poll(&mut self) -> Vec> { + let mut out = Vec::new(); + self.drain_to_guest(&mut out); + out } /// Extract the DNS question section (bytes after the 12-byte header up to @@ -621,9 +1164,13 @@ impl SlirpStack { let dst_ip = ipv4.dst_addr(); let protocol = ipv4.next_header(); - // DNS (UDP to 10.0.2.3:53) – handle specially - if dst_ip == SLIRP_DNS_IP && protocol == IpProtocol::Udp { - return self.handle_dns_frame(&ipv4); + // UDP — DNS keeps its dedicated cache+forward handler; everything + // else goes through the per-flow connected-socket NAT. + if protocol == IpProtocol::Udp { + if dst_ip == SLIRP_DNS_IP { + return self.handle_dns_frame(&ipv4); + } + return self.handle_udp_frame(&ipv4); } // TCP to any external IP (not gateway) – NAT proxy @@ -634,7 +1181,12 @@ impl SlirpStack { } } - // Everything else (ICMP, etc.) 
– drop silently + // ICMP echo requests — forward via unprivileged SOCK_DGRAM IPPROTO_ICMP socket + if protocol == IpProtocol::Icmp { + return self.handle_icmp_frame(&ipv4); + } + + // Everything else – drop silently trace!("SLIRP: dropping {:?} packet to {}", protocol, dst_ip); Ok(()) } @@ -684,6 +1236,200 @@ impl SlirpStack { Ok(()) } + // ── Non-DNS UDP forwarding ──────────────────────────────────────── + + /// Forward a non-DNS guest UDP datagram to the host via a per-flow connected socket. + /// + /// Each unique (guest source port, destination IP, destination port) 3-tuple maps to + /// one connected `UdpSocket`. On the first frame for a flow the socket is created via + /// [`open_udp_flow_socket`] and stored in `flow_table` under `FlowKey::Udp`. Subsequent + /// frames reuse the existing socket, updating `last_activity` for idle-timeout reaping (Task 2.4). + /// + /// The SLIRP gateway address (`10.0.2.2`) is translated to `127.0.0.1` before + /// connecting, mirroring the same translation used on the TCP NAT path. + /// + /// Reply delivery back to the guest is handled by Task 2.3 (`relay_udp_flows`). + fn handle_udp_frame(&mut self, ipv4: &Ipv4Packet<&[u8]>) -> Result<()> { + let udp = match UdpPacket::new_checked(ipv4.payload()) { + Ok(u) => u, + Err(_) => return Ok(()), + }; + let payload = udp.payload().to_vec(); + let key = UdpFlowKey { + guest_src_port: udp.src_port(), + dst_ip: ipv4.dst_addr(), + dst_port: udp.dst_port(), + }; + + let dst = + match nat::translate_outbound(&self.nat, key.dst_ip, key.dst_port, SLIRP_GATEWAY_IP) { + Some(addr) => addr, + None => { + trace!( + "SLIRP UDP: deny-list reject dst={}:{} from guest_port={}", + key.dst_ip, + key.dst_port, + key.guest_src_port + ); + return Ok(()); + } + }; + + let flow_key = FlowKey::Udp(key); + // Track whether this is a new entry so we can register it with epoll. 
+ let mut new_host_fd: Option = None; + let mut new_token: u64 = 0; + let entry: &mut UdpFlowEntry = match self.flow_table.entry(flow_key) { + std::collections::hash_map::Entry::Occupied(o) => match o.into_mut() { + FlowEntry::Udp(e) => e, + _ => unreachable!("FlowKey::Udp must map to FlowEntry::Udp"), + }, + std::collections::hash_map::Entry::Vacant(v) => { + let sock = match open_udp_flow_socket(dst) { + Ok(s) => s, + Err(e) => { + trace!("SLIRP UDP: open flow socket failed: {e}"); + return Ok(()); + } + }; + let token = next_flow_token(PROTO_TAG_UDP); + new_host_fd = Some(sock.as_raw_fd()); + new_token = token; + match v.insert(FlowEntry::Udp(UdpFlowEntry { + sock, + last_activity: Instant::now(), + flow_token: token, + })) { + FlowEntry::Udp(e) => e, + _ => unreachable!(), + } + } + }; + entry.last_activity = Instant::now(); + + if let Some(host_fd) = new_host_fd { + self.token_to_key.insert(new_token, flow_key); + if let Err(e) = self.epoll.register(host_fd, new_token, RegisterMode::Read) { + warn!( + guest_src_port = key.guest_src_port, + dst_ip = %key.dst_ip, + dst_port = key.dst_port, + fd = host_fd, + error = %e, + "SLIRP UDP: epoll register failed; flow present but readiness-driven relay disabled" + ); + } + self.epoll_waker.wake(); + } + + if let Err(e) = entry.sock.send(&payload) { + trace!("SLIRP UDP: send failed: {e}"); + } + Ok(()) + } + + // ── ICMP echo forwarding ───────────────────────────────────────── + + /// Forward a guest ICMP echo request to the host kernel via an unprivileged + /// `SOCK_DGRAM IPPROTO_ICMP` socket. + /// + /// The kernel rewrites the ICMP identifier on `send_to`; the entry stores + /// the guest's original `ident` so the reply path (Task 1.3) can translate + /// it back before injecting the frame into the guest. 
+ fn handle_icmp_frame(&mut self, ipv4: &Ipv4Packet<&[u8]>) -> Result<()> { + let icmp = match Icmpv4Packet::new_checked(ipv4.payload()) { + Ok(p) => p, + Err(_) => return Ok(()), + }; + let repr = match Icmpv4Repr::parse(&icmp, &Default::default()) { + Ok(r) => r, + Err(_) => return Ok(()), + }; + let (ident, seq_no, data) = match repr { + Icmpv4Repr::EchoRequest { + ident, + seq_no, + data, + } => (ident, seq_no, data), + _ => return Ok(()), // only echo request handled today + }; + + // Copy data before the mutable borrow of self.flow_table below. + let data_owned: Vec = data.to_vec(); + + let key = IcmpEchoKey { + guest_id: ident, + dst_ip: ipv4.dst_addr(), + }; + let flow_key = FlowKey::IcmpEcho(key); + // Track whether this is a new entry so we can register it with epoll. + let mut new_icmp_fd: Option = None; + let mut new_token: u64 = 0; + let entry: &mut IcmpEchoEntry = match self.flow_table.entry(flow_key) { + std::collections::hash_map::Entry::Occupied(occupied) => match occupied.into_mut() { + FlowEntry::IcmpEcho(e) => e, + _ => unreachable!("FlowKey::IcmpEcho must map to FlowEntry::IcmpEcho"), + }, + std::collections::hash_map::Entry::Vacant(vacant) => { + let sock = match open_icmp_socket() { + Ok(s) => s, + Err(e) => { + // Sysctl-driven fallback handled in Task 1.4. 
+ trace!("SLIRP ICMP: open socket failed: {e}"); + return Ok(()); + } + }; + let token = next_flow_token(PROTO_TAG_ICMP); + new_icmp_fd = Some(sock.as_raw_fd()); + new_token = token; + match vacant.insert(FlowEntry::IcmpEcho(IcmpEchoEntry { + sock, + guest_id: ident, + last_activity: Instant::now(), + flow_token: token, + })) { + FlowEntry::IcmpEcho(e) => e, + _ => unreachable!(), + } + } + }; + entry.last_activity = Instant::now(); + + if let Some(host_fd) = new_icmp_fd { + self.token_to_key.insert(new_token, flow_key); + if let Err(e) = self.epoll.register(host_fd, new_token, RegisterMode::Read) { + warn!( + guest_id = key.guest_id, + dst_ip = %key.dst_ip, + fd = host_fd, + error = %e, + "SLIRP ICMP: epoll register failed; flow present but readiness-driven relay disabled" + ); + } + self.epoll_waker.wake(); + } + + // Build a wire ICMP echo packet with seq + data; the kernel will + // rewrite the ident on send_to. + let req = Icmpv4Repr::EchoRequest { + ident: 0, // kernel rewrites + seq_no, + data: &data_owned, + }; + let mut buf = vec![0u8; req.buffer_len()]; + let mut pkt = Icmpv4Packet::new_unchecked(&mut buf); + req.emit(&mut pkt, &Default::default()); + + let dst = SocketAddr::from(( + Ipv4Addr::from(ipv4.dst_addr().0), + 0u16, // port ignored for ICMP + )); + if let Err(e) = entry.sock.send_to(&buf, dst) { + trace!("SLIRP ICMP: send_to failed: {e}"); + } + Ok(()) + } + // ── TCP NAT ───────────────────────────────────────────────────── fn handle_tcp_frame(&mut self, ipv4: &Ipv4Packet<&[u8]>) -> Result<()> { @@ -711,28 +1457,40 @@ impl SlirpStack { src_ip, src_port, dst_ip, dst_port ); - // Check deny list before connecting - if self.is_denied(&dst_ip) { - warn!( - "SLIRP TCP: connection to {}:{} denied by network deny list", - dst_ip, dst_port - ); - let rst = build_tcp_packet_static( - dst_ip, - SLIRP_GUEST_IP, - dst_port, - src_port, - 0, - seq + 1, - TcpControl::Rst, - &[], - ); - self.inject_to_guest.push(rst); - return Ok(()); - } + // Unified 
outbound translation: combines the gateway-loopback + // rewrite + deny-list check in one pure-function call. Returns None if + // the dst is denied; on Some, the SocketAddr already has the right + // host IP (loopback for the gateway, original for everything else). + let dst_addr = + match nat::translate_outbound(&self.nat, dst_ip, dst_port, SLIRP_GATEWAY_IP) { + Some(addr) => addr, + None => { + warn!( + "SLIRP TCP: connection to {}:{} denied by network deny list", + dst_ip, dst_port + ); + let rst = build_tcp_packet_static( + dst_ip, + SLIRP_GUEST_IP, + dst_port, + src_port, + 0, + seq + 1, + TcpControl::Rst, + &[], + ); + self.inject_to_guest.push(rst); + return Ok(()); + } + }; - // Check max concurrent connections - if self.tcp_nat.len() >= self.max_concurrent_connections { + let mut tcp_flow_count = 0; + for flow_key in self.flow_table.keys() { + if let FlowKey::Tcp(_) = flow_key { + tcp_flow_count += 1; + } + } + if tcp_flow_count >= self.max_concurrent_connections { warn!( "SLIRP TCP: max concurrent connections ({}) reached, rejecting SYN to {}:{}", self.max_concurrent_connections, dst_ip, dst_port @@ -771,34 +1529,44 @@ impl SlirpStack { return Ok(()); } - // Remove any stale entry with the same key - self.tcp_nat.remove(&key); - - // Create host TCP connection. - // Map the SLIRP gateway IP (10.0.2.2) to localhost so the guest - // can reach host services (e.g. Ollama at localhost:11434). - let host_ip = if dst_ip == SLIRP_GATEWAY_IP { - std::net::Ipv4Addr::new(127, 0, 0, 1) - } else { - std::net::Ipv4Addr::new(dst_ip.0[0], dst_ip.0[1], dst_ip.0[2], dst_ip.0[3]) - }; - let addr = SocketAddr::new(std::net::IpAddr::V4(host_ip), dst_port); + // Remove any stale entry with the same key, unregistering its FD + // from the epoll set to avoid a dangling registration. 
+ if let Some(FlowEntry::Tcp(stale)) = self.flow_table.get(&FlowKey::Tcp(key)) { + self.token_to_key.remove(&stale.flow_token); + self.epoll.unregister(stale.host_stream.as_raw_fd()).ok(); + } + self.flow_table.remove(&FlowKey::Tcp(key)); - match TcpStream::connect_timeout(&addr, Duration::from_secs(3)) { + // Connect to the host address resolved by translate_outbound above. + match TcpStream::connect_timeout(&dst_addr, Duration::from_secs(3)) { Ok(stream) => { stream.set_nonblocking(true).ok(); + let host_fd = stream.as_raw_fd(); let our_seq: u32 = rand_seq(); + let token = next_flow_token(PROTO_TAG_TCP); + let flow_key = FlowKey::Tcp(key); let entry = TcpNatEntry { host_stream: stream, state: TcpNatState::SynReceived, our_seq, guest_ack: seq + 1, - to_guest: Vec::new(), - to_host: Vec::new(), - to_host_pending_ack: None, last_activity: Instant::now(), + bytes_in_flight: 0, + flow_token: token, }; - self.tcp_nat.insert(key.clone(), entry); + self.flow_table.insert(flow_key, FlowEntry::Tcp(entry)); + self.token_to_key.insert(token, flow_key); + if let Err(e) = self.epoll.register(host_fd, token, RegisterMode::Read) { + warn!( + guest_src_port = key.guest_src_port, + dst_ip = %key.dst_ip, + dst_port = key.dst_port, + fd = host_fd, + error = %e, + "SLIRP TCP: epoll register failed; flow present but readiness-driven relay disabled" + ); + } + self.epoll_waker.wake(); // Send SYN-ACK back to guest let syn_ack = build_tcp_packet_static( @@ -837,22 +1605,59 @@ impl SlirpStack { } // Look up existing connection - let entry = match self.tcp_nat.get_mut(&key) { - Some(e) => e, - None => { - trace!( - "SLIRP TCP: no NAT entry for {}:{} -> {}:{}", - src_ip, - src_port, - dst_ip, - dst_port - ); - return Ok(()); - } + let flow_key = FlowKey::Tcp(key); + let Some(FlowEntry::Tcp(entry)) = self.flow_table.get_mut(&flow_key) else { + trace!( + "SLIRP TCP: no NAT entry for {}:{} -> {}:{}", + src_ip, + src_port, + dst_ip, + dst_port + ); + return Ok(()); }; + // Track whether this 
processing path sets state=Closed so we can + // enqueue the key in pending_close once the entry borrow ends. + // FIN/RST paths push to pending_close and return early; mid-function + // error paths (ACK-driven read failure, write failure) set this flag. + let mut closed_by_error = false; + entry.last_activity = Instant::now(); + // Inbound port-forward: guest's SYN-ACK completing the host-initiated + // 3-way handshake. We synthesized a SYN to the guest (5.5b.2/5.5b.3); + // the guest's kernel accepted it and replied with SYN+ACK. Send an ACK + // back so the guest's TCP stack transitions to Established on its side, + // then record our state as Established too. + // + // NatKey for the inbound flow: guest_src_port = guest service port, + // dst_ip = SLIRP_GATEWAY_IP, dst_port = the ephemeral high port we + // used as the SYN's source port. The ACK frame therefore flows + // src=SLIRP_GATEWAY_IP:dst_port → dst=SLIRP_GUEST_IP:guest_src_port. + if entry.state == TcpNatState::SynSent && tcp.syn() && tcp.ack() { + let ack_frame = build_tcp_packet_static( + SLIRP_GATEWAY_IP, // src_ip — the "host" side of the forward + SLIRP_GUEST_IP, // dst_ip — the guest + key.dst_port, // src_port — high ephemeral port we sent the SYN from + key.guest_src_port, // dst_port — the guest's service port + entry.our_seq.wrapping_add(1), // seq — our ISN + 1 (SYN consumed one) + tcp.seq_number().0.wrapping_add(1) as u32, // ack — guest ISN + 1 + TcpControl::None, + &[], + ); + self.inject_to_guest.push(ack_frame); + entry.our_seq = entry.our_seq.wrapping_add(1); + entry.guest_ack = tcp.seq_number().0.wrapping_add(1) as u32; + entry.state = TcpNatState::Established; + trace!( + "SLIRP TCP: inbound 3WH complete for guest_port={} high_port={}, → Established", + key.guest_src_port, + key.dst_port + ); + return Ok(()); + } + // ACK (completing handshake or acknowledging data) if tcp.ack() && entry.state == TcpNatState::SynReceived { entry.state = TcpNatState::Established; @@ -864,50 +1669,109 
@@ impl SlirpStack { ); } + // ACK-driven consume: when the guest acknowledges data we sent via + // peek-based relay (Task 3.3), read those bytes from the kernel recv + // buffer to advance the kernel's read pointer. Without this step the + // kernel buffer fills up and recv_peek keeps returning the same bytes. + // + // Only runs in Established state — the SynReceived ACK above does not + // carry data acknowledgements from us yet (bytes_in_flight == 0 then). + if tcp.ack() && entry.state == TcpNatState::Established && entry.bytes_in_flight > 0 { + // segment_ack: what the guest is now confirming it has received + // from us (our send-side sequence space). + let segment_ack: u32 = tcp.ack_number().0 as u32; + + // last_sent_acked: the highest our-seq the guest had already + // confirmed before this segment. `our_seq` is the *next* byte we + // would send, so subtracting bytes_in_flight gives the start of the + // in-flight window. + // All arithmetic is wrapping — TCP sequence numbers wrap at 2^32. + let last_sent_acked: u32 = entry.our_seq.wrapping_sub(entry.bytes_in_flight); + + // acked_bytes: how many new bytes the guest acknowledged in this + // segment. 
Guards: + // > 0 — ACK actually advances (not a duplicate or stale ACK) + // <= bytes_in_flight — guest cannot ack more than we've sent + // (defends against malformed / spoofed ACKs from a guest) + let acked_bytes: u32 = segment_ack.wrapping_sub(last_sent_acked); + + if acked_bytes > 0 && acked_bytes <= entry.bytes_in_flight { + let mut sink = [0u8; 65536]; + let mut to_drain = acked_bytes as usize; + let mut drained: u32 = 0; + while to_drain > 0 { + let want = to_drain.min(sink.len()); + match entry.host_stream.read(&mut sink[..want]) { + Ok(0) => break, // EOF — nothing more to drain + Ok(n) => { + to_drain -= n; + drained = drained.wrapping_add(n as u32); + } + Err(ref e) if e.kind() == io::ErrorKind::WouldBlock => break, + Err(e) => { + warn!( + "SLIRP TCP: ACK-driven read failed on flow guest_port={}, marking Closed: {}", + key.guest_src_port, e + ); + entry.state = TcpNatState::Closed; + closed_by_error = true; + break; + } + } + } + entry.bytes_in_flight = entry.bytes_in_flight.wrapping_sub(drained); + trace!( + "SLIRP TCP: ACK consumed {} bytes from kernel (in_flight now={}, segment_ack={})", + drained, entry.bytes_in_flight, segment_ack + ); + } + } + let payload = tcp.payload(); if !payload.is_empty() && entry.state == TcpNatState::Established { - let new_ack = seq.wrapping_add(payload.len() as u32); - - if entry.to_host.is_empty() { - match entry.host_stream.write(payload) { - Ok(n) if n == payload.len() => { - entry.guest_ack = new_ack; - let ack_frame = build_tcp_packet_static( - dst_ip, - SLIRP_GUEST_IP, - dst_port, - src_port, - entry.our_seq, - entry.guest_ack, - TcpControl::None, - &[], - ); - self.inject_to_guest.push(ack_frame); - } - Ok(n) => { - entry.to_host.extend_from_slice(&payload[n..]); - entry.to_host_pending_ack = Some(new_ack); - entry.guest_ack = seq.wrapping_add(n as u32); - entry.last_activity = Instant::now(); - } - Err(ref e) if e.kind() == std::io::ErrorKind::WouldBlock => { - entry.to_host.extend_from_slice(payload); - 
entry.to_host_pending_ack = Some(new_ack); - entry.last_activity = Instant::now(); - } - Err(e) => { - warn!("SLIRP TCP: write to host failed: {}", e); - entry.state = TcpNatState::Closed; - } + // Guest→host backpressure: rely on the kernel's send buffer + TCP + // retransmit. ACK only the bytes the kernel accepted right now; + // on WouldBlock, don't ACK at all and let the guest retransmit. + // No userspace buffering, no fixed byte-cap on in-flight data. + let payload_seq = seq; + let n_written = match entry.host_stream.write(payload) { + Ok(n) => n, + Err(ref e) if e.kind() == io::ErrorKind::WouldBlock => 0, + Err(e) => { + warn!( + "SLIRP TCP: write to host failed on flow guest_port={}, marking Closed: {}", + key.guest_src_port, e + ); + entry.state = TcpNatState::Closed; + // entry last used above; borrow ends here before pending_close push. + self.pending_close.push(flow_key); + return Ok(()); } - } else if entry.to_host.len() + payload.len() <= MAX_TO_HOST_BUFFER { - entry.to_host.extend_from_slice(payload); - entry.to_host_pending_ack = Some(new_ack); - entry.last_activity = Instant::now(); - } else { - warn!("SLIRP TCP: to_host buffer full, dropping connection"); - entry.state = TcpNatState::Closed; + }; + + if n_written > 0 { + let ack_seq = payload_seq.wrapping_add(n_written as u32); + entry.guest_ack = ack_seq; + let ack_frame = build_tcp_packet_static( + dst_ip, + SLIRP_GUEST_IP, + dst_port, + src_port, + entry.our_seq, + entry.guest_ack, + TcpControl::None, + &[], + ); + self.inject_to_guest.push(ack_frame); + trace!( + "SLIRP TCP guest→host: wrote {}/{} bytes, ACK={}", + n_written, + payload.len(), + ack_seq + ); } + // else: kernel send buffer full (WouldBlock) — don't ACK. + // Guest TCP will retransmit; kernel buffer drains over time. 
} // FIN from guest @@ -927,45 +1791,115 @@ impl SlirpStack { self.inject_to_guest.push(fin_ack_frame); entry.our_seq = entry.our_seq.wrapping_add(1); entry.state = TcpNatState::Closed; + // entry last used above; borrow ends before pending_close push. + self.pending_close.push(flow_key); + return Ok(()); } // RST from guest if tcp.rst() { debug!("SLIRP TCP: RST from guest for {}:{}", dst_ip, dst_port); entry.state = TcpNatState::Closed; + // entry last used above; borrow ends before pending_close push. + self.pending_close.push(flow_key); + return Ok(()); + } + + // ACK-driven read failure marked the entry Closed but execution + // continues here (no early return). Push to pending_close so + // relay_tcp_nat_data removes the flow without an O(n) sweep. + if closed_by_error { + self.pending_close.push(flow_key); } Ok(()) } - /// Relay data from host TCP connections to guest - fn relay_tcp_nat_data(&mut self) { - let mut to_remove = Vec::new(); + /// Relay data from host TCP connections to guest, driven by epoll readiness. + /// + /// Closed flows enqueued by handle_tcp_frame (FIN/RST) are drained from + /// `pending_close` and removed promptly. Idle-timeout detection iterates + /// only the flow table entries directly, avoiding a separate Vec allocation. + /// Data relay is restricted to flows with an EPOLLIN event in `ready`. + fn relay_tcp_nat_data(&mut self, ready: &[EpollEvent]) { // Collect frames to inject (built separately to avoid borrow issues) let mut frames_to_inject: Vec> = Vec::new(); - for (key, entry) in self.tcp_nat.iter_mut() { - if entry.state == TcpNatState::Closed { - to_remove.push(key.clone()); - continue; + // Seed removal set from flows already marked Closed by handle_tcp_frame + // (FIN/RST path) via the pending_close queue. HashSet gives O(1) + // membership checks in the idle-timeout sweep and readiness filter below, + // avoiding the O(n*k) cost of Vec::contains under connection churn. 
+ let mut to_remove_set: std::collections::HashSet = + std::mem::take(&mut self.pending_close) + .into_iter() + .collect(); + + // Idle-timeout sweep: scan flow_table once without collecting a + // separate key Vec. 300-second inactivity applies regardless of epoll + // readiness; this is O(n) in the number of TCP flows. + const TCP_IDLE_TIMEOUT: Duration = Duration::from_secs(300); + for (flow_key, entry) in &self.flow_table { + if let FlowEntry::Tcp(tcp_entry) = entry { + if tcp_entry.last_activity.elapsed() > TCP_IDLE_TIMEOUT { + to_remove_set.insert(*flow_key); + } } - if entry.last_activity.elapsed() > Duration::from_secs(300) { - to_remove.push(key.clone()); + } + + let mut tcp_flow_keys: Vec = Vec::new(); + for event in ready { + if !event.readable || event.token & PROTO_TAG_MASK != PROTO_TAG_TCP { continue; } - if entry.state != TcpNatState::Established { + let Some(flow_key) = self.token_to_key.get(&event.token).copied() else { + continue; + }; + if to_remove_set.contains(&flow_key) { continue; } + tcp_flow_keys.push(flow_key); + } - if !entry.to_host.is_empty() { - match entry.host_stream.write(&entry.to_host) { - Ok(n) => { - entry.to_host.drain(..n); - entry.last_activity = Instant::now(); - if entry.to_host.is_empty() { - if let Some(ack) = entry.to_host_pending_ack.take() { - entry.guest_ack = ack; - let ack_frame = build_tcp_packet_static( + for flow_key in tcp_flow_keys { + let FlowKey::Tcp(key) = flow_key else { + continue; + }; + + let mut became_closed = false; + let mut fin_frame: Option> = None; + + { + let Some(FlowEntry::Tcp(entry)) = self.flow_table.get_mut(&flow_key) else { + continue; + }; + + if entry.state != TcpNatState::Established { + continue; + } + + // Host→guest path: peek what's in the kernel recv buffer + // without consuming. Send only the un-ACK'd portion (bytes past + // what we've already sent). The kernel's socket buffer holds the + // outstanding data; ACK-driven `read()` consumes it once the + // guest ACKs. 
+ let mut peek_buf = [0u8; 65536]; + match recv_peek(&entry.host_stream, &mut peek_buf) { + Ok(0) => { + // Host closed the connection. Send FIN to guest below. + debug!( + "SLIRP TCP: host EOF on flow guest_port={}, marking Closed", + key.guest_src_port + ); + entry.state = TcpNatState::Closed; + became_closed = true; + } + Ok(peek_n) => { + let in_flight = entry.bytes_in_flight as usize; + if peek_n > in_flight { + let new_bytes = &peek_buf[in_flight..peek_n]; + let mut sent_total: usize = 0; + for chunk in new_bytes.chunks(MTU - 54) { + let frame = build_tcp_packet_static( key.dst_ip, SLIRP_GUEST_IP, key.dst_port, @@ -973,78 +1907,318 @@ impl SlirpStack { entry.our_seq, entry.guest_ack, TcpControl::None, - &[], + chunk, ); - frames_to_inject.push(ack_frame); + frames_to_inject.push(frame); + entry.our_seq = entry.our_seq.wrapping_add(chunk.len() as u32); + entry.bytes_in_flight = + entry.bytes_in_flight.wrapping_add(chunk.len() as u32); + sent_total += chunk.len(); } + entry.last_activity = Instant::now(); + trace!( + "SLIRP TCP relay: peeked {} bytes (in_flight before={}, sent now={})", + peek_n, + in_flight, + sent_total + ); } + // else: kernel buffer holds only already-in-flight bytes. + // Wait for guest ACK before sending more (Task 3.4). + } + Err(ref e) if e.kind() == io::ErrorKind::WouldBlock => { + // Kernel recv buffer empty; nothing to do this poll. 
} - Err(ref e) if e.kind() == std::io::ErrorKind::WouldBlock => {} Err(e) => { - warn!("SLIRP TCP: buffered write to host failed: {}", e); + warn!( + "SLIRP TCP: recv_peek failed on flow guest_port={}, marking Closed: {}", + key.guest_src_port, e + ); entry.state = TcpNatState::Closed; - continue; + became_closed = true; } } - } - // Read from host - let mut buf = [0u8; 16384]; - match entry.host_stream.read(&mut buf) { - Ok(0) => { - debug!("SLIRP TCP: host closed for {}:{}", key.dst_ip, key.dst_port); - entry.state = TcpNatState::Closed; - } - Ok(n) => { - entry.to_guest.extend_from_slice(&buf[..n]); - entry.last_activity = Instant::now(); - } - Err(ref e) if e.kind() == std::io::ErrorKind::WouldBlock => {} - Err(e) => { - trace!("SLIRP TCP: host read error: {}", e); - entry.state = TcpNatState::Closed; + // FIN if host closed + if entry.state == TcpNatState::Closed { + fin_frame = Some(build_tcp_packet_static( + key.dst_ip, + SLIRP_GUEST_IP, + key.dst_port, + key.guest_src_port, + entry.our_seq, + entry.guest_ack, + TcpControl::Fin, + &[], + )); } + } // entry borrow ends here + + if let Some(fin) = fin_frame { + frames_to_inject.push(fin); + } + // Queue for removal so the cleanup loop below can unregister + drop. 
+ if became_closed { + to_remove_set.insert(flow_key); } + } - // Build data frames for guest - while !entry.to_guest.is_empty() && entry.state == TcpNatState::Established { - let chunk_size = entry.to_guest.len().min(MTU - 54); - let chunk: Vec = entry.to_guest.drain(..chunk_size).collect(); - let frame = build_tcp_packet_static( - key.dst_ip, - SLIRP_GUEST_IP, - key.dst_port, - key.guest_src_port, - entry.our_seq, - entry.guest_ack, - TcpControl::None, - &chunk, - ); - entry.our_seq = entry.our_seq.wrapping_add(chunk.len() as u32); - frames_to_inject.push(frame); + self.inject_to_guest.append(&mut frames_to_inject); + + for flow_key in to_remove_set { + if let Some(FlowEntry::Tcp(entry)) = self.flow_table.get(&flow_key) { + self.token_to_key.remove(&entry.flow_token); + self.epoll.unregister(entry.host_stream.as_raw_fd()).ok(); } + self.flow_table.remove(&flow_key); + } + } - // FIN if host closed - if entry.state == TcpNatState::Closed { - let fin = build_tcp_packet_static( - key.dst_ip, - SLIRP_GUEST_IP, - key.dst_port, - key.guest_src_port, - entry.our_seq, - entry.guest_ack, - TcpControl::Fin, - &[], - ); - frames_to_inject.push(fin); + /// Drain replies from each active ICMP echo socket and emit echo-reply + /// frames to the guest, driven by epoll readiness. + /// + /// Only flows whose token appears in `ready` with EPOLLIN set are visited. + /// Entries idle longer than `ICMP_IDLE_TIMEOUT` are still evicted on any + /// readiness event for that flow. 
+ fn relay_icmp_echo(&mut self, ready: &[EpollEvent]) { + const ICMP_IDLE_TIMEOUT: Duration = Duration::from_secs(60); + let now = Instant::now(); + + let mut ready_flow_keys: Vec = Vec::new(); + for event in ready { + if !event.readable || event.token & PROTO_TAG_MASK != PROTO_TAG_ICMP { + continue; } + let Some(flow_key) = self.token_to_key.get(&event.token).copied() else { + continue; + }; + ready_flow_keys.push(flow_key); } - self.inject_to_guest.append(&mut frames_to_inject); + // Mirrors the TCP idle-timeout sweep so ICMP sockets do not accumulate + // indefinitely when the ping target goes silent. + let mut icmp_to_remove: std::collections::HashSet = + std::collections::HashSet::new(); + for (flow_key, entry) in &self.flow_table { + let FlowKey::IcmpEcho(_) = flow_key else { + continue; + }; + let FlowEntry::IcmpEcho(icmp_entry) = entry else { + continue; + }; + if now.duration_since(icmp_entry.last_activity) > ICMP_IDLE_TIMEOUT { + icmp_to_remove.insert(*flow_key); + } + } + + for flow_key in &ready_flow_keys { + // Skip if already in remove set (idle-timeout caught it first). + // O(1) via HashSet, not O(k) Vec::contains. 
+ if icmp_to_remove.contains(flow_key) { + continue; + } + let FlowKey::IcmpEcho(key) = *flow_key else { + continue; + }; + let frame = { + let Some(FlowEntry::IcmpEcho(entry)) = self.flow_table.get_mut(flow_key) else { + continue; + }; + let mut buf = [0u8; 1500]; + match entry.sock.recv_from(&mut buf) { + Ok((n, _addr)) => { + entry.last_activity = now; + Self::build_icmp_echo_reply_to_guest(key.dst_ip, entry.guest_id, &buf[..n]) + } + Err(ref e) if e.kind() == io::ErrorKind::WouldBlock => continue, + Err(_) => continue, + } + }; + if let Some(frame_bytes) = frame { + self.inject_to_guest.push(frame_bytes); + } + } + + for flow_key in icmp_to_remove { + if let Some(FlowEntry::IcmpEcho(e)) = self.flow_table.get(&flow_key) { + self.token_to_key.remove(&e.flow_token); + self.epoll.unregister(e.sock.as_raw_fd()).ok(); + } + self.flow_table.remove(&flow_key); + } + } + + /// Build an Ethernet/IPv4/ICMP echo-reply frame addressed to the guest. + /// + /// `src_ip` is the original ping destination (becomes the reply source). + /// `guest_id` is the ICMP identifier to write into the reply so the guest + /// can match it against its outstanding echo request. + /// `raw_icmp` is the raw ICMP packet received from the host kernel via + /// the `SOCK_DGRAM IPPROTO_ICMP` socket (no IP header; ICMP type + code + + /// checksum + payload). + /// + /// Returns `Some(frame)` on success, `None` if the packet cannot be parsed + /// or is not an `EchoReply`. + fn build_icmp_echo_reply_to_guest( + src_ip: Ipv4Address, + guest_id: u16, + raw_icmp: &[u8], + ) -> Option> { + let icmp = Icmpv4Packet::new_checked(raw_icmp).ok()?; + let parsed = Icmpv4Repr::parse(&icmp, &Default::default()).ok()?; + // Copy the payload before `icmp` / `parsed` go out of scope so we can + // build the outgoing `EchoReply` with a fresh borrow. Mirrors the + // same pattern used in `handle_icmp_frame` (Task 1.2). + let (seq_no, data_owned) = match parsed { + Icmpv4Repr::EchoReply { seq_no, data, .. 
} => (seq_no, data.to_vec()), + _ => return None, + }; + let reply = Icmpv4Repr::EchoReply { + ident: guest_id, + seq_no, + data: &data_owned, + }; + let ip_repr = Ipv4Repr { + src_addr: src_ip, + dst_addr: SLIRP_GUEST_IP, + next_header: IpProtocol::Icmp, + payload_len: reply.buffer_len(), + hop_limit: 64, + }; + let eth_repr = EthernetRepr { + src_addr: EthernetAddress(GATEWAY_MAC), + dst_addr: EthernetAddress(GUEST_MAC), + ethertype: EthernetProtocol::Ipv4, + }; + let total = 14 + ip_repr.buffer_len() + reply.buffer_len(); + let mut buf = vec![0u8; total]; + let mut eth = EthernetFrame::new_unchecked(&mut buf[..]); + eth_repr.emit(&mut eth); + let mut ip = Ipv4Packet::new_unchecked(&mut buf[14..]); + ip_repr.emit(&mut ip, &Default::default()); + let mut icmp_out = Icmpv4Packet::new_unchecked(&mut buf[14 + ip_repr.buffer_len()..]); + reply.emit(&mut icmp_out, &Default::default()); + Some(buf) + } + + /// Drain replies from each active UDP flow socket and emit UDP frames to + /// the guest, driven by epoll readiness. + /// + /// Only flows whose token appears in `ready` with EPOLLIN set are visited. + /// Idle-timeout reaping still runs every call: the reap scan is cheap + /// (skips flows not in `ready`) and ensures stale entries are eventually + /// evicted even when no new data arrives. + /// + /// Reply addressing mirrors the original guest datagram in reverse: the + /// frame's IP source is the original destination (`key.dst_ip`) and UDP + /// source port is `key.dst_port`; the destination is the guest IP and + /// `key.guest_src_port`. + fn relay_udp_flows(&mut self, ready: &[EpollEvent]) { + let now = Instant::now(); + // Per-flow connected sockets are closed by Drop when the entry leaves + // flow_table. 
+ let mut stale: Vec = Vec::new(); + for (flow_key, entry) in &self.flow_table { + let FlowKey::Udp(_) = flow_key else { continue }; + let FlowEntry::Udp(udp_entry) = entry else { + continue; + }; + if now.duration_since(udp_entry.last_activity) > UDP_IDLE_TIMEOUT { + stale.push(*flow_key); + } + } + for flow_key in stale { + if let Some(FlowEntry::Udp(entry)) = self.flow_table.get(&flow_key) { + self.token_to_key.remove(&entry.flow_token); + self.epoll.unregister(entry.sock.as_raw_fd()).ok(); + } + self.flow_table.remove(&flow_key); + } - for key in to_remove { - self.tcp_nat.remove(&key); + let mut flow_keys: Vec = Vec::new(); + for event in ready { + if !event.readable || event.token & PROTO_TAG_MASK != PROTO_TAG_UDP { + continue; + } + let Some(flow_key) = self.token_to_key.get(&event.token).copied() else { + continue; + }; + flow_keys.push(flow_key); } + for flow_key in flow_keys { + let FlowKey::Udp(key) = flow_key else { + continue; + }; + let frame = { + let Some(FlowEntry::Udp(entry)) = self.flow_table.get_mut(&flow_key) else { + continue; + }; + let mut buf = [0u8; 1500]; + match entry.sock.recv(&mut buf) { + Ok(n) => { + entry.last_activity = now; + Self::build_udp_reply_to_guest( + key.dst_ip, + key.dst_port, + key.guest_src_port, + &buf[..n], + ) + } + Err(ref e) if e.kind() == io::ErrorKind::WouldBlock => continue, + Err(_) => continue, + } + }; + if let Some(frame_bytes) = frame { + self.inject_to_guest.push(frame_bytes); + } + } + } + + /// Build an Ethernet/IPv4/UDP frame addressed to the guest, carrying a + /// reply from a host-side UDP flow socket. + /// + /// - `src_ip` — original destination IP (becomes the reply source address). + /// - `src_port` — original destination port (becomes the reply source port). + /// - `dst_port` — guest's ephemeral source port (becomes the reply destination). + /// - `payload` — raw UDP payload received from the host socket. + /// + /// Returns `Some(frame)` on success. 
Currently infallible, but wrapped in + /// `Option` for symmetry with [`build_icmp_echo_reply_to_guest`]. + fn build_udp_reply_to_guest( + src_ip: Ipv4Address, + src_port: u16, + dst_port: u16, + payload: &[u8], + ) -> Option> { + let udp_repr = UdpRepr { src_port, dst_port }; + let ip_repr = Ipv4Repr { + src_addr: src_ip, + dst_addr: SLIRP_GUEST_IP, + next_header: IpProtocol::Udp, + payload_len: 8 + payload.len(), + hop_limit: 64, + }; + let eth_repr = EthernetRepr { + src_addr: EthernetAddress(GATEWAY_MAC), + dst_addr: EthernetAddress(GUEST_MAC), + ethertype: EthernetProtocol::Ipv4, + }; + let total = 14 + ip_repr.buffer_len() + 8 + payload.len(); + let mut buf = vec![0u8; total]; + let mut eth = EthernetFrame::new_unchecked(&mut buf[..]); + eth_repr.emit(&mut eth); + let mut ip = Ipv4Packet::new_unchecked(&mut buf[14..]); + ip_repr.emit(&mut ip, &Default::default()); + let mut udp = UdpPacket::new_unchecked(&mut buf[14 + ip_repr.buffer_len()..]); + udp_repr.emit( + &mut udp, + &IpAddress::Ipv4(src_ip), + &IpAddress::Ipv4(SLIRP_GUEST_IP), + payload.len(), + |b| b.copy_from_slice(payload), + &Default::default(), + ); + Some(buf) } // ── Packet building helpers ────────────────────────────────────── @@ -1097,6 +2271,45 @@ impl SlirpStack { buf } + + /// Push events from the net-poll thread into this backend's per-tick + /// event queue. Called from net_poll_thread after each successful + /// epoll_wait, while holding no other lock. + /// + /// drain_to_guest drains this queue with a brief uncontended lock + /// instead of re-entering EpollDispatch (which the net-poll thread + /// holds for the full 50 ms of the blocking wait). + pub fn push_ready_events(&self, events: &[EpollEvent]) { + // First push from net_poll_thread flips the flag so drain_to_guest + // skips its non-blocking-poll fallback. Stays set for the + // backend's lifetime — net_poll_thread doesn't disappear mid-run. 
+ self.has_external_poller.store(true, Ordering::Relaxed); + if events.is_empty() { + return; + } + let mut queue = self.pending_events.lock().unwrap(); + queue.extend_from_slice(events); + } +} + +impl NetworkBackend for SlirpBackend { + fn process_guest_frame(&mut self, frame: &[u8]) -> io::Result<()> { + SlirpBackend::process_guest_frame(self, frame).map_err(|e| io::Error::other(e.to_string())) + } + + fn drain_to_guest(&mut self, out: &mut Vec>) { + SlirpBackend::drain_to_guest(self, out) + } + + #[cfg(target_os = "linux")] + fn epoll_arc(&self) -> Option> { + Some(std::sync::Arc::clone(&self.epoll)) + } + + #[cfg(target_os = "linux")] + fn push_ready_events(&self, events: &[crate::network::epoll_dispatch::EpollEvent]) { + SlirpBackend::push_ready_events(self, events) + } } /// Build a TCP packet (free function to avoid borrow issues with &self methods) @@ -1163,6 +2376,49 @@ fn build_tcp_packet_static( buf } +/// Build a synthetic TCP SYN frame from the SLIRP gateway to the guest, +/// used for inbound port-forwarding. +/// +/// The frame mirrors what the guest would see from a real TCP client: +/// - src: `SLIRP_GATEWAY_IP:high_port` +/// - dst: `SLIRP_GUEST_IP:guest_port` +/// - control: `TcpControl::Syn` +/// - seq: caller-supplied `our_seq` (the host's chosen ISN for this flow) +/// - ack: 0 (no piggybacked ACK on the initial SYN) +/// +/// Caller pushes the returned bytes into `inject_to_guest`. The guest's +/// kernel sees an inbound TCP SYN, routes it to whatever's bound at +/// `guest_port`, and emits a SYN-ACK that `handle_tcp_frame` matches +/// to the seeded `SynSent` flow_table entry (5.5b.1). 
+#[cfg(any(test, feature = "bench-helpers"))] +pub fn synthesize_inbound_syn(high_port: u16, guest_port: u16, our_seq: u32) -> Vec { + build_tcp_packet_static( + SLIRP_GATEWAY_IP, + SLIRP_GUEST_IP, + high_port, + guest_port, + our_seq, + 0, + TcpControl::Syn, + &[], + ) +} + +#[cfg(not(any(test, feature = "bench-helpers")))] +#[allow(dead_code)] // consumed in 5.5b.3 +fn synthesize_inbound_syn(high_port: u16, guest_port: u16, our_seq: u32) -> Vec { + build_tcp_packet_static( + SLIRP_GATEWAY_IP, + SLIRP_GUEST_IP, + high_port, + guest_port, + our_seq, + 0, + TcpControl::Syn, + &[], + ) +} + // ── Utility functions ──────────────────────────────────────────────── fn rand_seq() -> u32 { @@ -1195,9 +2451,247 @@ fn ipv4_checksum(header: &[u8]) -> u16 { !sum as u16 } -impl Default for SlirpStack { +/// Bind one `TcpListener` per TCP port-forward rule, register each with +/// `epoll`, and return a map from host port to `(listener, guest_port)`. +/// +/// Rules whose bind or `set_nonblocking` calls fail are skipped with a +/// `WARN` log; the returned map contains only the rules that succeeded. +/// When `nat.port_forwards` contains no TCP rules the map is empty. 
+pub(crate) fn bind_port_forward_listeners( + nat: &nat::Rules, + epoll: &Arc, +) -> HashMap { + let mut listeners = HashMap::new(); + for port_forward in &nat.port_forwards { + if port_forward.proto != nat::ForwardProto::Tcp { + continue; + } + let host_port = port_forward.host_port; + let guest_port = port_forward.guest_port; + let listener = match TcpListener::bind(("127.0.0.1", host_port)) { + Ok(l) => l, + Err(bind_error) => { + warn!( + host_port, + error = %bind_error, + "SLIRP port-forward: bind failed, rule disabled" + ); + continue; + } + }; + if let Err(nb_error) = listener.set_nonblocking(true) { + warn!( + host_port, + error = %nb_error, + "SLIRP port-forward: set_nonblocking failed, rule disabled" + ); + continue; + } + let token = flow_token_for_listener(host_port); + if let Err(reg_error) = epoll.register(listener.as_raw_fd(), token, RegisterMode::Read) { + warn!( + host_port, + error = %reg_error, + "SLIRP port-forward: epoll register failed, rule disabled" + ); + continue; + } + debug!( + host_port, + guest_port, "SLIRP port-forward: listening on 127.0.0.1 (epoll-driven)" + ); + listeners.insert(host_port, (listener, guest_port)); + } + listeners +} + +impl Default for SlirpBackend { fn default() -> Self { - Self::new().expect("Failed to create default SlirpStack") + Self::new().expect("Failed to create default SlirpBackend") + } +} + +impl SlirpBackend { + /// Re-register every live host FD in `flow_table` with the current epoll + /// dispatcher. Called from snapshot restore: `epoll_fd` is a kernel + /// handle that does not survive snapshot, so a fresh dispatcher starts + /// empty even though `flow_table` deserialized correctly with new FDs. + /// + /// The current snapshot path does not reconstruct `flow_table` — the + /// backend always starts empty after restore and new flows form naturally. 
+ /// This method is therefore a no-op today but is wired in advance so + /// future work that persists restored flows across snapshot/restore has a + /// ready call site. + /// Re-register every live host FD in `flow_table` with the current epoll + /// dispatcher and rebuild `token_to_key`. Called from snapshot restore: + /// the `epoll_fd` is a kernel handle that does not survive snapshot, so a + /// fresh dispatcher starts empty even though `flow_table` deserialized + /// correctly with new FDs. + /// + /// Each existing flow keeps its stored `flow_token` so that any + /// already-queued readiness events (unlikely post-restore, but safe) still + /// resolve correctly. The `token_to_key` map is rebuilt from scratch + /// because it is in-memory-only state; it does not need to be persisted. + pub fn rebuild_epoll_from_flow_table(&mut self) { + use std::os::fd::AsRawFd; + self.token_to_key.clear(); + for (flow_key, entry) in &self.flow_table { + match (flow_key, entry) { + (FlowKey::Tcp(_), FlowEntry::Tcp(e)) => { + self.token_to_key.insert(e.flow_token, *flow_key); + let _ = self.epoll.register( + e.host_stream.as_raw_fd(), + e.flow_token, + RegisterMode::Read, + ); + } + (FlowKey::Udp(_), FlowEntry::Udp(e)) => { + self.token_to_key.insert(e.flow_token, *flow_key); + let _ = + self.epoll + .register(e.sock.as_raw_fd(), e.flow_token, RegisterMode::Read); + } + (FlowKey::IcmpEcho(_), FlowEntry::IcmpEcho(e)) => { + self.token_to_key.insert(e.flow_token, *flow_key); + let _ = + self.epoll + .register(e.sock.as_raw_fd(), e.flow_token, RegisterMode::Read); + } + _ => {} + } + } + } +} + +/// Test-only helpers — not compiled into production builds. +/// +/// These are `#[cfg(test)]`/`#[cfg(feature = "bench-helpers")]` methods on +/// `SlirpBackend` that allow unit tests and divan benches to insert synthetic +/// flow entries without widening the visibility of private types. 
+/// The full behavioral contract for the SynSent → Established transition is +/// pinned in the E2E test `tcp_inbound_syn_ack_completes_handshake` below and +/// will be further exercised end-to-end in task 5.5b.5 +/// (`tcp_port_forward_inbound` in `tests/network_baseline.rs`). +#[cfg(any(test, feature = "bench-helpers"))] +impl SlirpBackend { + /// Insert a synthetic `SynSent` entry into the flow table. + /// + /// Used by `tcp_inbound_syn_ack_completes_handshake` to pre-seed the state + /// that would normally be created by `synthesize_inbound_syn` (5.5b.2). + /// + /// `guest_port`: the guest's listening service port (e.g. 8080). + /// `high_port`: the ephemeral source port we used for the synthesized SYN. + /// `our_isn`: the ISN we put in the synthesized SYN. + /// `host_stream`: a `TcpStream` representing the accepted host-side connection. + pub fn insert_synthetic_synsent_entry( + &mut self, + guest_port: u16, + high_port: u16, + our_isn: u32, + host_stream: TcpStream, + ) { + let key = NatKey { + guest_src_port: guest_port, + dst_ip: SLIRP_GATEWAY_IP, + dst_port: high_port, + }; + let host_fd = host_stream.as_raw_fd(); + let token = next_flow_token(PROTO_TAG_TCP); + let flow_key = FlowKey::Tcp(key); + let entry = TcpNatEntry { + host_stream, + state: TcpNatState::SynSent, + our_seq: our_isn, + guest_ack: 0, + last_activity: Instant::now(), + bytes_in_flight: 0, + flow_token: token, + }; + self.flow_table.insert(flow_key, FlowEntry::Tcp(entry)); + self.token_to_key.insert(token, flow_key); + // Skip epoll registration in test/bench contexts: the synthetic + // stream is already non-blocking but test harnesses check specific + // state transitions, not readiness events. 
+ #[cfg(not(any(test, feature = "bench-helpers")))] + { + if let Err(e) = self.epoll.register(host_fd, token, RegisterMode::Read) { + warn!( + guest_port, + high_port, + fd = host_fd, + error = %e, + "SLIRP: epoll register for synthetic SynSent failed" + ); + } + self.epoll_waker.wake(); + } + #[cfg(any(test, feature = "bench-helpers"))] + let _ = host_fd; + } + + /// Return the `TcpNatState` for the flow identified by `(guest_port, GATEWAY_IP, high_port)`, + /// or `None` if no such entry exists in the flow table. + #[allow(dead_code)] + pub(crate) fn tcp_flow_state(&self, guest_port: u16, high_port: u16) -> Option { + let key = NatKey { + guest_src_port: guest_port, + dst_ip: SLIRP_GATEWAY_IP, + dst_port: high_port, + }; + match self.flow_table.get(&FlowKey::Tcp(key))? { + FlowEntry::Tcp(entry) => Some(entry.state), + _ => None, + } + } + + /// Count how many frames queued for injection carry the given TCP flags. + /// + /// Checks `inject_to_guest` for Ethernet/IPv4/TCP frames where the TCP + /// `ack` flag is set and the `syn` flag is clear (i.e. a plain ACK). + #[allow(dead_code)] + pub(crate) fn injected_plain_ack_count(&self) -> usize { + let mut count = 0; + for frame in &self.inject_to_guest { + if frame.len() < 54 { + continue; + } + let tcp_offset = 14 + 20; + let flags_byte = frame[tcp_offset + 13]; + let ack = flags_byte & 0x10 != 0; + let syn = flags_byte & 0x02 != 0; + if ack && !syn { + count += 1; + } + } + count + } + + /// Inject an [`InboundAccept`] directly into the accept channel, bypassing + /// the listener thread. Used by unit tests to drive + /// `process_pending_inbound_accepts` without a real listener. + #[allow(dead_code)] + pub(crate) fn push_inbound_accept(&self, accepted: InboundAccept) { + self.accept_sender + .send(accepted) + .expect("accept channel must be open"); + } + + /// Returns the number of user-registered FDs in the epoll set + /// (excludes the self-pipe). 
+ pub fn registered_fd_count(&self) -> usize { + self.epoll.registered_fd_count() + } + + /// Replace the epoll dispatcher with a fresh empty one, discarding all + /// existing registrations. Simulates the post-snapshot state where the + /// kernel-side `epoll_fd` handle does not survive and a new one is + /// created. Used by `epoll_set_rebuilt_from_flow_table_smoke` to set up + /// the precondition that `rebuild_epoll_from_flow_table` must fix. + pub fn reset_epoll_for_snapshot_test(&mut self) { + let new_epoll_inner = EpollDispatch::new().expect("EpollDispatch::new"); + let new_waker = new_epoll_inner.waker(); + self.epoll = Arc::new(new_epoll_inner); + self.epoll_waker = new_waker; } } @@ -1220,7 +2714,7 @@ mod tests { #[test] fn test_slirp_stack_creation() { - let stack = SlirpStack::new(); + let stack = SlirpBackend::new(); assert!(stack.is_ok()); } @@ -1232,44 +2726,217 @@ mod tests { assert_ne!(cksum, 0); } - #[test] - fn test_to_host_buffer_limit() { - assert_eq!(MAX_TO_HOST_BUFFER, 256 * 1024); + /// Build a TCP frame from the guest (SLIRP_GUEST_IP) to a given destination. + /// + /// Used by `tcp_inbound_syn_ack_completes_handshake` to synthesize the + /// guest's SYN-ACK reply to our port-forward SYN. 
+ fn build_guest_tcp_frame( + dst_ip: Ipv4Address, + src_port: u16, + dst_port: u16, + seq: u32, + ack_number: u32, + control: TcpControl, + set_ack_flag: bool, + ) -> Vec { + use smoltcp::wire::{ + EthernetAddress, EthernetFrame, EthernetRepr, IpAddress, Ipv4Packet, Ipv4Repr, + TcpPacket, TcpRepr, TcpSeqNumber, + }; + let tcp_repr = TcpRepr { + src_port, + dst_port, + control, + seq_number: TcpSeqNumber(seq as i32), + ack_number: if set_ack_flag { + Some(TcpSeqNumber(ack_number as i32)) + } else { + None + }, + window_len: 65535, + window_scale: None, + max_seg_size: None, + sack_permitted: false, + sack_ranges: [None; 3], + payload: &[], + }; + let ip_repr = Ipv4Repr { + src_addr: SLIRP_GUEST_IP, + dst_addr: dst_ip, + next_header: smoltcp::wire::IpProtocol::Tcp, + payload_len: tcp_repr.buffer_len(), + hop_limit: 64, + }; + let eth_repr = EthernetRepr { + src_addr: EthernetAddress(GUEST_MAC), + dst_addr: EthernetAddress(GATEWAY_MAC), + ethertype: smoltcp::wire::EthernetProtocol::Ipv4, + }; + let checksums = smoltcp::phy::ChecksumCapabilities::default(); + let total = eth_repr.buffer_len() + ip_repr.buffer_len() + tcp_repr.buffer_len(); + let mut buf = vec![0u8; total]; + let mut eth = EthernetFrame::new_unchecked(&mut buf); + eth_repr.emit(&mut eth); + let mut ip = Ipv4Packet::new_unchecked(eth.payload_mut()); + ip_repr.emit(&mut ip, &checksums); + let mut tcp = TcpPacket::new_unchecked(ip.payload_mut()); + tcp_repr.emit( + &mut tcp, + &IpAddress::Ipv4(SLIRP_GUEST_IP), + &IpAddress::Ipv4(dst_ip), + &checksums, + ); + buf } + /// Verify that a guest SYN-ACK frame on a SynSent entry: + /// (a) transitions the flow state to Established, and + /// (b) queues exactly one plain ACK frame towards the guest. + /// + /// The full E2E behavioral contract (including host-listener wiring) will be + /// pinned in `tests/network_baseline.rs::tcp_port_forward_inbound` (task 5.5b.5). 
#[test] - fn test_tcp_nat_entry_has_write_buffer() { - let listener = std::net::TcpListener::bind("127.0.0.1:0").unwrap(); - let addr = listener.local_addr().unwrap(); - let stream = TcpStream::connect_timeout(&addr, Duration::from_secs(1)).unwrap(); - stream.set_nonblocking(true).ok(); + fn tcp_inbound_syn_ack_completes_handshake() { + use std::net::TcpListener; + + let guest_port: u16 = 8080; + let high_port: u16 = 44000; + let our_isn: u32 = 0x0000_1000; + let guest_isn: u32 = 0xDEAD_BEEF; + + // Create a loopback TcpStream pair for the host_stream field. + // The stream is never read/written in this unit test — we only + // exercise the TCP state machine. + let listener = TcpListener::bind("127.0.0.1:0").expect("bind loopback"); + let host_stream = + TcpStream::connect(listener.local_addr().unwrap()).expect("connect loopback"); + host_stream.set_nonblocking(true).ok(); + + let mut backend = SlirpBackend::new().expect("SlirpBackend::new"); + backend.insert_synthetic_synsent_entry(guest_port, high_port, our_isn, host_stream); + + // Confirm state is SynSent before feeding the SYN-ACK. + assert_eq!( + backend.tcp_flow_state(guest_port, high_port), + Some(TcpNatState::SynSent), + "entry must start as SynSent" + ); - let entry = TcpNatEntry { - host_stream: stream, - state: TcpNatState::Established, - our_seq: 1000, - guest_ack: 2000, - to_guest: Vec::new(), - to_host: Vec::new(), - to_host_pending_ack: None, - last_activity: Instant::now(), - }; + // Build the guest's SYN-ACK: src=GUEST:guest_port, dst=GATEWAY:high_port, + // SYN+ACK, seq=guest_isn, ack=our_isn+1. + let syn_ack = build_guest_tcp_frame( + SLIRP_GATEWAY_IP, + guest_port, + high_port, + guest_isn, + our_isn.wrapping_add(1), + TcpControl::Syn, // SYN flag — combined with ACK flag via ack_number=Some(...) + true, // set ACK flag + ); + + backend + .process_guest_frame(&syn_ack) + .expect("process SYN-ACK"); + + // (a) state must be Established now. 
+ assert_eq!( + backend.tcp_flow_state(guest_port, high_port), + Some(TcpNatState::Established), + "state must be Established after SYN-ACK" + ); - assert!(entry.to_host.is_empty()); - assert!(entry.to_host_pending_ack.is_none()); + // (b) exactly one plain ACK must have been queued for injection to the guest. + assert_eq!( + backend.injected_plain_ack_count(), + 1, + "exactly one plain ACK must be queued for the guest" + ); } + /// Verify that `process_pending_inbound_accepts` drains one `InboundAccept` + /// from the channel, inserts a `SynSent` flow-table entry, and queues a + /// synthesized SYN frame for injection to the guest. + /// + /// This pins the contract for task 5.5b.3. The test is white-box: it uses + /// `push_inbound_accept` (a `#[cfg(test)]` helper that injects into the + /// internal channel) so we don't need a real listener thread. #[test] - fn test_to_host_buffer_rejects_over_limit() { - let existing = vec![0u8; MAX_TO_HOST_BUFFER]; - let new_payload = [0u8; 1]; - assert!(existing.len() + new_payload.len() > MAX_TO_HOST_BUFFER); + fn process_pending_inbound_accepts_seeds_synsent_and_queues_syn() { + use std::net::TcpListener; + + let guest_port: u16 = 9000; + + let listener = TcpListener::bind("127.0.0.1:0").expect("bind loopback"); + let local_addr = listener.local_addr().unwrap(); + let host_stream = TcpStream::connect(local_addr).expect("connect loopback"); + let high_port = host_stream.local_addr().unwrap().port(); + host_stream.set_nonblocking(true).ok(); - let small_existing = vec![0u8; MAX_TO_HOST_BUFFER - 10]; - let fits = [0u8; 10]; - assert!(small_existing.len() + fits.len() <= MAX_TO_HOST_BUFFER); + let mut backend = SlirpBackend::new().expect("SlirpBackend::new"); - let overflows = [0u8; 11]; - assert!(small_existing.len() + overflows.len() > MAX_TO_HOST_BUFFER); + // Inject an InboundAccept without a real listener thread. 
+ backend.push_inbound_accept(InboundAccept { + host_stream, + high_port, + guest_port, + }); + + // Before processing, no flow entry should exist. + assert_eq!( + backend.tcp_flow_state(guest_port, high_port), + None, + "no flow entry before processing" + ); + + // Drive process_pending_inbound_accepts. + backend.process_pending_inbound_accepts(); + + // After processing, a SynSent entry must exist. + assert_eq!( + backend.tcp_flow_state(guest_port, high_port), + Some(TcpNatState::SynSent), + "SynSent entry must be present after processing" + ); + + // Exactly one SYN frame must have been queued for injection. + // Note: build_tcp_packet_static sets ack_number=Some(0) which also + // sets the ACK flag bit; we detect the SYN by checking just the SYN bit. + let syn_count = backend + .inject_to_guest + .iter() + .filter(|frame| { + if frame.len() < 54 { + return false; + } + let tcp_offset = 14 + 20; + let flags_byte = frame[tcp_offset + 13]; + flags_byte & 0x02 != 0 + }) + .count(); + assert_eq!(syn_count, 1, "exactly one SYN must be queued for the guest"); + } + + /// Verify that `with_security` binds exactly one epoll-driven listener when + /// given one TCP port-forward rule, and zero listeners when given none. + #[test] + fn with_security_binds_listener_per_tcp_port_forward() { + // Empty port-forwards: no listeners. + let empty = SlirpBackend::with_security(64, 50, &["169.254.0.0/16".to_string()], &[]) + .expect("SlirpBackend::with_security (empty)"); + assert_eq!( + empty.port_forward_listeners.len(), + 0, + "zero listeners for empty port_forwards" + ); + + // One TCP port-forward: exactly one listener. 
+ let one = + SlirpBackend::with_security(64, 50, &["169.254.0.0/16".to_string()], &[(18080, 80)]) + .expect("SlirpBackend::with_security (one forward)"); + assert_eq!( + one.port_forward_listeners.len(), + 1, + "one listener for one TCP port-forward rule" + ); } } diff --git a/src/vmm/mod.rs b/src/vmm/mod.rs index 354ea5ef..97fe2d0f 100644 --- a/src/vmm/mod.rs +++ b/src/vmm/mod.rs @@ -36,7 +36,7 @@ use crate::guest::protocol::{ ExecOutputChunk, ExecRequest, ExecResponse, MkdirPRequest, MkdirPResponse, TelemetrySubscribeRequest, WriteFileRequest, WriteFileResponse, }; -use crate::network::slirp::SlirpStack; +use crate::network::slirp::SlirpBackend; use crate::observe::telemetry::TelemetryAggregator; use crate::observe::Observer; use crate::vmm::cpu::MmioDevices; @@ -315,11 +315,15 @@ impl MicroVm { // Virtio-net with SLIRP backend if networking is enabled let virtio_net = if config.network { debug!("Setting up SLIRP networking"); - let slirp = Arc::new(Mutex::new(SlirpStack::with_security( - config.security.max_concurrent_connections, - config.security.max_connections_per_second, - &config.security.network_deny_list, - )?)); + let slirp: Arc> = + Arc::new(Mutex::new(SlirpBackend::with_security( + config.security.max_concurrent_connections, + config.security.max_connections_per_second, + &config.security.network_deny_list, + // TODO(5.5b): wire port_forwards from NetworkConfig once VoidBoxConfig + // carries the field; for now no host listeners are spawned. + &[], + )?)); let mut net_device = VirtioNetDevice::new(slirp)?; net_device.set_mmio_base(0xd000_0000); debug!( @@ -685,7 +689,8 @@ impl MicroVm { // 7b. 
Restore virtio-net if snapshot had networking enabled let virtio_net: Option>> = if snap.config.network { if let Some(ref net_state) = snap.net_state { - let slirp = Arc::new(Mutex::new(SlirpStack::new()?)); + let slirp: Arc> = + Arc::new(Mutex::new(SlirpBackend::new()?)); let mut net_dev = VirtioNetDevice::new(slirp)?; net_dev.restore_state(net_state); net_dev.set_mmio_base(0xd000_0000); @@ -1589,8 +1594,19 @@ fn vsock_irq_thread( /// from host TCP sockets accumulates unread, causing TLS handshakes and /// API calls to time out. /// -/// This thread wakes every 5 ms, reads any pending host data via -/// `try_inject_rx`, and fires IRQ 10 to notify the guest. +/// This thread uses an adaptive `EpollDispatch::wait_with_timeout`: +/// - **Active** (5 ms): any kernel readiness event in the last cycle keeps +/// the thread in the 5 ms cadence so the guest's TCP delayed-ACK timer +/// fires on schedule. Both real socket readiness events and self-pipe +/// wakes (from `epoll_waker.wake()` after a new SYN or injected ACK) +/// count as activity. +/// - **Idle** (50 ms): a cycle with no kernel events backs off to 50 ms. +/// New flows or incoming data wake the wait immediately via the epoll set +/// or the waker, so the 50 ms cap only fires when the network is truly +/// quiet. +/// +/// When the network backend does not provide an epoll instance +/// (non-SlirpBackend), the thread falls back to a fixed 5 ms sleep. fn net_poll_thread(net_dev: Arc>, vm: Arc, running: Arc) { #[repr(C)] struct KvmIrqLevel { @@ -1598,10 +1614,83 @@ fn net_poll_thread(net_dev: Arc>, vm: Arc, running: A level: u32, } const KVM_IRQ_LINE: libc::c_ulong = 0x4008_AE61; + // Adaptive epoll_wait timeout. Active periods need a 5 ms cadence so + // the guest's TCP delayed-ACK timer fires on schedule (the guest spends + // most idle time in HLT and relies on our IRQ pulses to advance vCPU + // schedule slots; a 50 ms gap causes +40 ms CRR latency, exactly + // Linux's delayed-ACK period). 
Idle periods can use the long timeout + // safely: any new flow's SYN goes through process_guest_frame which + // calls epoll_waker.wake(), and host data arrival fires EPOLLIN — both + // wake the wait immediately, so the 50 ms ceiling never bites a real + // packet. We pick the next timeout based on whether the last wait + // returned events: had-events ⇒ stay in the active 5 ms cadence, + // timed-out ⇒ back off to 50 ms. Maintains correctness; recovers the + // 10x idle wakeup reduction that motivated Phase 6.4 in the first + // place. + const ACTIVE_TIMEOUT: std::time::Duration = std::time::Duration::from_millis(5); + const IDLE_TIMEOUT: std::time::Duration = std::time::Duration::from_millis(50); + const FALLBACK_SLEEP: std::time::Duration = std::time::Duration::from_millis(5); + + // Start in the idle regime — first SYN flips us into active. + let mut epoll_wait_timeout: std::time::Duration = IDLE_TIMEOUT; + let vm_fd = vm.vm_fd().as_raw_fd(); let guest_memory = vm.guest_memory(); + + // Obtain the epoll Arc from the backend without holding the device lock + // across the blocking wait. Falls back to None if the backend is not + // a SlirpBackend (e.g. in unit tests or future alternative backends). + let epoll_arc = { + match net_dev.lock() { + Ok(guard) => guard.epoll_arc(), + Err(_) => None, + } + }; + + let mut epoll_events: Vec = Vec::new(); + while running.load(Ordering::Relaxed) { - std::thread::sleep(std::time::Duration::from_millis(5)); + // Block outside the device lock: either on epoll readiness or a short + // sleep. This lets the vCPU thread acquire the device lock without + // contention during the wait phase. + epoll_events.clear(); + // Raw kernel count from epoll_wait, including self-pipe wakes + // that the filter strips from `epoll_events`. A self-pipe wake + // is the signal that handle_tcp_frame queued a frame and called + // epoll_waker.wake() — i.e. 
real activity that should keep the + // adaptive timeout in the active 5 ms cadence even though + // `epoll_events.is_empty()`. + let mut raw_kernel_events: usize = 0; + if let Some(ref ep_arc) = epoll_arc { + raw_kernel_events = ep_arc + .wait_with_timeout(&mut epoll_events, epoll_wait_timeout) + .unwrap_or(0); + } else { + std::thread::sleep(FALLBACK_SLEEP); + } + + // Adapt the next-cycle timeout based on this cycle's outcome. + // Any kernel event (real readiness OR self-pipe wake from the + // vCPU thread) signals activity and keeps us in the 5 ms + // cadence so the guest's TCP delayed-ACK timer fires on time. + // A pure timeout drops us to the 50 ms idle cadence. One quiet + // cycle to switch to idle, one event to switch back to active. + epoll_wait_timeout = if raw_kernel_events > 0 { + ACTIVE_TIMEOUT + } else { + IDLE_TIMEOUT + }; + + // Push ready events into the backend's queue before acquiring the + // device lock for inject/IRQ work. drain_to_guest will consume them + // without re-locking EpollDispatch, eliminating mutex contention + // between the net-poll thread's 50 ms blocking wait and the vCPU + // thread's process_guest_frame → drain_to_guest path. + if !epoll_events.is_empty() { + if let Ok(guard) = net_dev.lock() { + guard.push_events_to_backend(&epoll_events); + } + } let has_interrupt = { let mut guard = match net_dev.lock() { @@ -1616,6 +1705,9 @@ fn net_poll_thread(net_dev: Arc>, vm: Arc, running: A // an earlier edge was missed by the guest. if has_interrupt { let assert_irq = KvmIrqLevel { irq: 10, level: 1 }; + // SAFETY: KVM_IRQ_LINE ioctl writes the KvmIrqLevel struct into + // the in-kernel APIC; the struct is #[repr(C)] and the fd is valid + // for the lifetime of `vm`. unsafe { libc::ioctl(vm_fd, KVM_IRQ_LINE as _, &assert_irq); } diff --git a/tests/network_baseline.rs b/tests/network_baseline.rs new file mode 100644 index 00000000..d5115426 --- /dev/null +++ b/tests/network_baseline.rs @@ -0,0 +1,1293 @@ +//! 
Layer-1 correctness pins for the smoltcp-based SLIRP stack. +//! +//! These tests drive `SlirpBackend` directly with synthetic Ethernet +//! frames — no VM, no kernel, no host sockets to outside hosts. The +//! goal is to lock observable behavior (including deliberately broken +//! behavior) so the passt-pattern refactor's diff is legible to +//! reviewers. +//! +//! TODO(0D.4): migrate poll() → drain_to_guest() and remove #[allow(deprecated)]. +#![allow(deprecated)] +//! +//! Three tests assert *broken* behavior on purpose. Each is marked +//! `BROKEN_ON_PURPOSE` and flips when the corresponding fix lands: +//! +//! - `tcp_writes_more_than_256kb_succeed` (was `tcp_to_host_buffer_drops_at_256kb`) +//! - `udp_non_dns_round_trips` (was `udp_non_dns_silently_dropped`) +//! - `icmp_echo_returns_reply` (was `icmp_echo_silently_dropped`) +//! +//! Run with: `cargo test --test network_baseline` + +#![cfg(target_os = "linux")] +// Imports and helpers used by test cases added in tasks 0A.2–0A.9. +#![allow(unused_imports, dead_code)] + +use smoltcp::wire::{ + ArpOperation, ArpPacket, ArpRepr, EthernetAddress, EthernetFrame, EthernetProtocol, + EthernetRepr, Icmpv4Packet, Icmpv4Repr, IpAddress, IpProtocol, Ipv4Address, Ipv4Packet, + Ipv4Repr, TcpControl, TcpPacket, TcpRepr, UdpPacket, UdpRepr, +}; +use std::io::{Read, Write}; +use std::net::{Ipv4Addr, SocketAddr, TcpListener, UdpSocket}; +use std::os::unix::io::AsRawFd; +use void_box::network::nat::{translate_outbound, Rules}; +use void_box::network::slirp::{ + SlirpBackend, GATEWAY_MAC, GUEST_MAC, SLIRP_DNS_IP, SLIRP_GATEWAY_IP, SLIRP_GUEST_IP, +}; +use void_box::network::NetworkBackend; +// Used by tcp_deny_list_emits_rst to express the deny CIDR as a typed network. +// `with_security` takes `&[String]`, so we convert via `.to_string()` at the +// call site; this import is kept here (module scope) per project convention. 
+use ipnet::Ipv4Net; + +const GUEST_EPHEMERAL_PORT: u16 = 49152; +const ETH_HDR_LEN: usize = 14; +const IPV4_MIN_HDR_LEN: usize = 20; +const TCP_MIN_HDR_LEN: usize = 20; +const UDP_HDR_LEN: usize = 8; + +/// Builds a minimal IPv4-over-Ethernet TCP segment from guest to a +/// pretend external IP. Returns the full Ethernet frame bytes. +fn build_tcp_frame( + dst_ip: Ipv4Address, + src_port: u16, + dst_port: u16, + seq: u32, + ack: u32, + control: TcpControl, + payload: &[u8], +) -> Vec { + let tcp_repr = TcpRepr { + src_port, + dst_port, + control, + seq_number: smoltcp::wire::TcpSeqNumber(seq as i32), + ack_number: if ack == 0 { + None + } else { + Some(smoltcp::wire::TcpSeqNumber(ack as i32)) + }, + window_len: 65535, + window_scale: None, + max_seg_size: None, + sack_permitted: false, + sack_ranges: [None, None, None], + payload, + }; + let ip_repr = Ipv4Repr { + src_addr: SLIRP_GUEST_IP, + dst_addr: dst_ip, + next_header: IpProtocol::Tcp, + payload_len: tcp_repr.buffer_len(), + hop_limit: 64, + }; + let eth_repr = EthernetRepr { + src_addr: EthernetAddress(GUEST_MAC), + dst_addr: EthernetAddress(GATEWAY_MAC), + ethertype: EthernetProtocol::Ipv4, + }; + let total = ETH_HDR_LEN + ip_repr.buffer_len() + tcp_repr.buffer_len(); + let mut buf = vec![0u8; total]; + let mut eth = EthernetFrame::new_unchecked(&mut buf[..]); + eth_repr.emit(&mut eth); + let mut ip = Ipv4Packet::new_unchecked(&mut buf[ETH_HDR_LEN..]); + ip_repr.emit(&mut ip, &Default::default()); + let mut tcp = TcpPacket::new_unchecked(&mut buf[ETH_HDR_LEN + ip_repr.buffer_len()..]); + tcp_repr.emit( + &mut tcp, + &IpAddress::Ipv4(SLIRP_GUEST_IP), + &IpAddress::Ipv4(dst_ip), + &Default::default(), + ); + buf +} + +/// Builds a UDP-over-Ethernet datagram from guest. 
+fn build_udp_frame(dst_ip: Ipv4Address, src_port: u16, dst_port: u16, payload: &[u8]) -> Vec { + let udp_repr = UdpRepr { src_port, dst_port }; + let ip_repr = Ipv4Repr { + src_addr: SLIRP_GUEST_IP, + dst_addr: dst_ip, + next_header: IpProtocol::Udp, + payload_len: UDP_HDR_LEN + payload.len(), + hop_limit: 64, + }; + let eth_repr = EthernetRepr { + src_addr: EthernetAddress(GUEST_MAC), + dst_addr: EthernetAddress(GATEWAY_MAC), + ethertype: EthernetProtocol::Ipv4, + }; + let total = ETH_HDR_LEN + ip_repr.buffer_len() + UDP_HDR_LEN + payload.len(); + let mut buf = vec![0u8; total]; + let mut eth = EthernetFrame::new_unchecked(&mut buf[..]); + eth_repr.emit(&mut eth); + let mut ip = Ipv4Packet::new_unchecked(&mut buf[ETH_HDR_LEN..]); + ip_repr.emit(&mut ip, &Default::default()); + let mut udp = UdpPacket::new_unchecked(&mut buf[ETH_HDR_LEN + ip_repr.buffer_len()..]); + udp_repr.emit( + &mut udp, + &IpAddress::Ipv4(SLIRP_GUEST_IP), + &IpAddress::Ipv4(dst_ip), + payload.len(), + |b| b.copy_from_slice(payload), + &Default::default(), + ); + buf +} + +/// Parses one emitted frame as a TCP segment directed to the guest. +/// +/// Returns `(seq, ack, control, payload_len)` on success, or `None` +/// if the frame is not IPv4-TCP destined for the guest or has an +/// unrecognized flag combination. +fn parse_tcp_to_guest(frame: &[u8]) -> Option<(u32, u32, TcpControl, usize)> { + let eth = EthernetFrame::new_checked(frame).ok()?; + if eth.ethertype() != EthernetProtocol::Ipv4 { + return None; + } + let ip = Ipv4Packet::new_checked(eth.payload()).ok()?; + if ip.next_header() != IpProtocol::Tcp || ip.dst_addr() != SLIRP_GUEST_IP { + return None; + } + let tcp = TcpPacket::new_checked(ip.payload()).ok()?; + // Reconstruct TcpControl from individual flag accessors (smoltcp 0.11 + // exposes no combined .control() method on TcpPacket). 
+ let control = match (tcp.syn(), tcp.fin(), tcp.rst(), tcp.psh()) { + (false, false, false, false) => TcpControl::None, + (false, false, false, true) => TcpControl::Psh, + (true, false, false, _) => TcpControl::Syn, + (false, true, false, _) => TcpControl::Fin, + (false, false, true, _) => TcpControl::Rst, + _ => return None, + }; + Some(( + tcp.seq_number().0 as u32, + tcp.ack_number().0 as u32, + control, + tcp.payload().len(), + )) +} + +/// Drains frames the stack wants to send to the guest, calling +/// `drain_to_guest` up to `n` times. Returns all frames produced +/// across the calls (caller may not care about per-call boundaries). +fn drain_n(stack: &mut SlirpBackend, n: usize) -> Vec> { + let mut out: Vec> = Vec::new(); + for _ in 0..n { + stack.drain_to_guest(&mut out); + } + out +} + +#[test] +fn tcp_handshake_emits_synack() { + // Bind a host listener on 127.0.0.1 so the stack's connect() + // succeeds. SLIRP rewrites 10.0.2.2 → 127.0.0.1. + let listener = TcpListener::bind("127.0.0.1:0").unwrap(); + let host_port = listener.local_addr().unwrap().port(); + + let mut stack = SlirpBackend::new().expect("stack"); + + // Guest sends SYN to gateway IP at the listener's port. + let syn = build_tcp_frame( + SLIRP_GATEWAY_IP, + GUEST_EPHEMERAL_PORT, + host_port, + 1000, + 0, + TcpControl::Syn, + &[], + ); + stack.process_guest_frame(&syn).expect("process syn"); + + // Drain — SYN-ACK should be queued. + let frames = drain_n(&mut stack, 4); + let synack = frames + .iter() + .find_map(|f| parse_tcp_to_guest(f)) + .expect("synack emitted"); + + let (_seq, ack, ctrl, _len) = synack; + assert_eq!(ctrl, TcpControl::Syn, "control flags include SYN+ACK"); + assert_eq!(ack, 1001, "ack = guest_seq + 1"); +} + +#[test] +fn tcp_data_round_trip() { + let listener = TcpListener::bind("127.0.0.1:0").unwrap(); + let host_port = listener.local_addr().unwrap().port(); + + // Spawn a thread that accepts and echoes one chunk. 
+ let server = std::thread::spawn(move || { + let (mut sock, _) = listener.accept().unwrap(); + let mut buf = [0u8; 16]; + let n = sock.read(&mut buf).unwrap(); + sock.write_all(&buf[..n]).unwrap(); + }); + + let mut stack = SlirpBackend::new().expect("stack"); + + // SYN + stack + .process_guest_frame(&build_tcp_frame( + SLIRP_GATEWAY_IP, + GUEST_EPHEMERAL_PORT, + host_port, + 1000, + 0, + TcpControl::Syn, + &[], + )) + .unwrap(); + + // Drain SYN-ACK; capture our_seq. + let synack_frames = drain_n(&mut stack, 4); + let (our_seq, _ack, _ctrl, _len) = synack_frames + .iter() + .find_map(|f| parse_tcp_to_guest(f)) + .expect("synack"); + + // ACK the SYN-ACK (completes handshake). + stack + .process_guest_frame(&build_tcp_frame( + SLIRP_GATEWAY_IP, + GUEST_EPHEMERAL_PORT, + host_port, + 1001, + our_seq + 1, + TcpControl::None, + &[], + )) + .unwrap(); + + // Send 5 bytes of data. + stack + .process_guest_frame(&build_tcp_frame( + SLIRP_GATEWAY_IP, + GUEST_EPHEMERAL_PORT, + host_port, + 1001, + our_seq + 1, + TcpControl::Psh, + b"hello", + )) + .unwrap(); + + // Wait for server to echo and stack to relay back. + server.join().unwrap(); + let mut total_payload = 0; + for _ in 0..40 { + let frames = drain_n(&mut stack, 1); + for f in frames.iter() { + if let Some((_, _, _, len)) = parse_tcp_to_guest(f) { + total_payload += len; + } + } + if total_payload >= 5 { + break; + } + std::thread::sleep(std::time::Duration::from_millis(10)); + } + assert!( + total_payload >= 5, + "expected at least 5 bytes echoed back to guest, got {total_payload}" + ); +} + +/// BROKEN_ON_PURPOSE pin (now passing): passt-style sequence mirroring and +/// don't-ACK-on-WouldBlock backpressure replace the 256 KB userspace cliff. +/// Pushing >1 MB through the relay succeeds — the kernel's socket buffer +/// holds outstanding bytes, the guest retransmits unacked segments, and the +/// connection stays alive instead of being reset. 
+#[test]
+fn tcp_writes_more_than_256kb_succeed() {
+    use std::sync::atomic::{AtomicUsize, Ordering};
+    use std::sync::Arc;
+
+    let listener = TcpListener::bind("127.0.0.1:0").unwrap();
+    let host_port = listener.local_addr().unwrap().port();
+
+    // Constrain the listener's recv buffer (small but reasonable —
+    // ensures TCP backpressure kicks in at a point we can observe
+    // without a multi-megabyte memory footprint).
+    {
+        let val: libc::c_int = 4096;
+        unsafe {
+            libc::setsockopt(
+                listener.as_raw_fd(),
+                libc::SOL_SOCKET,
+                libc::SO_RCVBUF,
+                &val as *const libc::c_int as *const libc::c_void,
+                std::mem::size_of::<libc::c_int>() as libc::socklen_t,
+            );
+        }
+    }
+
+    // Server: accept and drain everything we get.
+    let bytes_received = Arc::new(AtomicUsize::new(0));
+    let bytes_received_thr = Arc::clone(&bytes_received);
+    let server = std::thread::spawn(move || {
+        let (mut sock, _) = listener.accept().unwrap();
+        let mut buf = [0u8; 4096];
+        loop {
+            match sock.read(&mut buf) {
+                Ok(0) => break, // EOF from guest side
+                Ok(n) => {
+                    bytes_received_thr.fetch_add(n, Ordering::Relaxed);
+                }
+                Err(_) => break,
+            }
+        }
+    });
+
+    let mut stack = SlirpBackend::new().expect("stack");
+
+    // Handshake.
+    stack
+        .process_guest_frame(&build_tcp_frame(
+            SLIRP_GATEWAY_IP,
+            GUEST_EPHEMERAL_PORT,
+            host_port,
+            1000,
+            0,
+            TcpControl::Syn,
+            &[],
+        ))
+        .unwrap();
+    let synack = drain_n(&mut stack, 4)
+        .into_iter()
+        .find_map(|f| parse_tcp_to_guest(&f))
+        .expect("synack");
+    let (our_seq, _, _, _) = synack;
+    stack
+        .process_guest_frame(&build_tcp_frame(
+            SLIRP_GATEWAY_IP,
+            GUEST_EPHEMERAL_PORT,
+            host_port,
+            1001,
+            our_seq + 1,
+            TcpControl::None,
+            &[],
+        ))
+        .unwrap();
+
+    // Push 1 MB in 1 KB chunks. Drain after every batch so the
+    // host's read thread can drain the kernel buffer and ACKs flow
+    // back to the guest. The new TCP-backpressure path means some
+    // chunks won't be ACK'd immediately; we re-send those (TCP-style
+    // retransmit) until they go through.
+ const TOTAL: usize = 1024 * 1024; + const CHUNK: usize = 1024; + let chunk = vec![b'x'; CHUNK]; + let mut seq = 1001u32; + let mut acked_seq = 1001u32; + let mut saw_close = false; + let deadline = std::time::Instant::now() + std::time::Duration::from_secs(10); + + while bytes_received.load(Ordering::Relaxed) < TOTAL && std::time::Instant::now() < deadline { + // Retransmit semantics: only advance the send cursor once the + // previous chunk has been ACK'd. If the stack stops ACKing + // (backpressure engaged), we re-send the same seq/payload until + // it's acknowledged. This matches production guest-TCP retransmit + // behavior. + let _ = stack.process_guest_frame(&build_tcp_frame( + SLIRP_GATEWAY_IP, + GUEST_EPHEMERAL_PORT, + host_port, + seq, + our_seq + 1, + TcpControl::Psh, + &chunk, + )); + + // Drain frames; track the highest ACK we've seen and watch + // for RST/FIN that would indicate a premature close. + for f in drain_n(&mut stack, 4) { + if let Some((_, ack, ctrl, _)) = parse_tcp_to_guest(&f) { + if matches!(ctrl, TcpControl::Rst | TcpControl::Fin) { + saw_close = true; + } + if ack > acked_seq { + acked_seq = ack; + } + } + } + + if saw_close { + break; + } + + // Advance our send cursor only past ACK'd data. If the stack + // didn't ACK this chunk, the next loop iteration re-sends the + // same seq/payload (true TCP retransmit semantics). + if acked_seq >= seq.wrapping_add(CHUNK as u32) { + seq = seq.wrapping_add(CHUNK as u32); + } else if seq.wrapping_sub(acked_seq) > 256 * 1024 { + // Out-paced kernel recv buffer; sleep briefly so the host + // server thread can drain. + std::thread::sleep(std::time::Duration::from_millis(10)); + } + } + + // Close the connection cleanly so the server's read loop exits. 
+ let _ = stack.process_guest_frame(&build_tcp_frame( + SLIRP_GATEWAY_IP, + GUEST_EPHEMERAL_PORT, + host_port, + seq, + our_seq + 1, + TcpControl::Fin, + &[], + )); + for _ in 0..40 { + let _ = drain_n(&mut stack, 1); + if server.is_finished() { + break; + } + std::thread::sleep(std::time::Duration::from_millis(50)); + } + let _ = server.join(); + + let received = bytes_received.load(Ordering::Relaxed); + assert!( + !saw_close, + "TCP backpressure must not RST/FIN mid-stream — the relay must hold \ + the line while the kernel drains. Saw RST or FIN." + ); + assert!( + received >= TOTAL * 95 / 100, + "server must receive ~all bytes pushed (got {received}/{TOTAL}); \ + backpressure must retransmit until success, not silently drop." + ); +} + +#[test] +fn tcp_rate_limit_emits_rst() { + // 5 conn/s allowance; 10 attempts. + let mut stack = SlirpBackend::with_security(64, 5, &[], &[]).unwrap(); + let listener = TcpListener::bind("127.0.0.1:0").unwrap(); + let host_port = listener.local_addr().unwrap().port(); + + let mut rsts = 0; + for i in 0..10 { + stack + .process_guest_frame(&build_tcp_frame( + SLIRP_GATEWAY_IP, + GUEST_EPHEMERAL_PORT + i as u16, + host_port, + 1000, + 0, + TcpControl::Syn, + &[], + )) + .unwrap(); + for f in drain_n(&mut stack, 2) { + if let Some((_, _, ctrl, _)) = parse_tcp_to_guest(&f) { + if ctrl == TcpControl::Rst { + rsts += 1; + } + } + } + } + assert!(rsts >= 4, "expected ≥4 RSTs from rate limit, saw {rsts}"); + drop(listener); +} + +#[test] +fn tcp_max_concurrent_emits_rst() { + let mut stack = SlirpBackend::with_security(2, 1000, &[], &[]).unwrap(); + let listener = TcpListener::bind("127.0.0.1:0").unwrap(); + let host_port = listener.local_addr().unwrap().port(); + + // Open 4 distinct connections; cap is 2. 
+ let mut rsts = 0; + for i in 0..4 { + stack + .process_guest_frame(&build_tcp_frame( + SLIRP_GATEWAY_IP, + GUEST_EPHEMERAL_PORT + i, + host_port, + 1000, + 0, + TcpControl::Syn, + &[], + )) + .unwrap(); + for f in drain_n(&mut stack, 2) { + if let Some((_, _, ctrl, _)) = parse_tcp_to_guest(&f) { + if ctrl == TcpControl::Rst { + rsts += 1; + } + } + } + } + assert!(rsts >= 1, "expected RST after concurrent limit, saw {rsts}"); + drop(listener); +} + +#[test] +fn tcp_deny_list_emits_rst() { + // `with_security` takes `&[String]`; parse via `Ipv4Net` to validate the + // CIDR at compile-check time, then convert to the expected string form. + let deny_cidr: Ipv4Net = "169.254.169.254/32".parse().unwrap(); + let deny_strings = [deny_cidr.to_string()]; + let mut stack = SlirpBackend::with_security(64, 1000, &deny_strings, &[]).unwrap(); + + stack + .process_guest_frame(&build_tcp_frame( + Ipv4Address::new(169, 254, 169, 254), + GUEST_EPHEMERAL_PORT, + 80, + 1000, + 0, + TcpControl::Syn, + &[], + )) + .unwrap(); + let rst = drain_n(&mut stack, 2) + .into_iter() + .find_map(|f| parse_tcp_to_guest(&f)) + .map(|(_, _, ctrl, _)| ctrl == TcpControl::Rst); + assert_eq!(rst, Some(true), "deny-list IP must get RST"); +} + +/// Builds an ARP request Ethernet frame from the guest asking "who has +/// `target_ip`?". The sender is the guest MAC/IP; target hardware address +/// is zeroed as per ARP request convention. 
+fn build_arp_request(target_ip: Ipv4Address) -> Vec<u8> {
+    let arp_repr = ArpRepr::EthernetIpv4 {
+        operation: ArpOperation::Request,
+        source_hardware_addr: EthernetAddress(GUEST_MAC),
+        source_protocol_addr: SLIRP_GUEST_IP,
+        target_hardware_addr: EthernetAddress([0; 6]),
+        target_protocol_addr: target_ip,
+    };
+    let eth_repr = EthernetRepr {
+        src_addr: EthernetAddress(GUEST_MAC),
+        dst_addr: EthernetAddress([0xff; 6]),
+        ethertype: EthernetProtocol::Arp,
+    };
+    let total = ETH_HDR_LEN + arp_repr.buffer_len();
+    let mut buf = vec![0u8; total];
+    let mut eth = EthernetFrame::new_unchecked(&mut buf[..]);
+    eth_repr.emit(&mut eth);
+    let mut arp = ArpPacket::new_unchecked(&mut buf[ETH_HDR_LEN..]);
+    arp_repr.emit(&mut arp);
+    buf
+}
+
+/// Parses an Ethernet frame as an ARP reply.
+///
+/// Returns `Some((source_hardware_addr, source_protocol_addr))` when the
+/// frame carries an ARP reply opcode, `None` otherwise.
+fn parse_arp_reply(frame: &[u8]) -> Option<(EthernetAddress, Ipv4Address)> {
+    let eth = EthernetFrame::new_checked(frame).ok()?;
+    if eth.ethertype() != EthernetProtocol::Arp {
+        return None;
+    }
+    let arp = ArpPacket::new_checked(eth.payload()).ok()?;
+    let repr = ArpRepr::parse(&arp).ok()?;
+    if let ArpRepr::EthernetIpv4 {
+        operation: ArpOperation::Reply,
+        source_hardware_addr,
+        source_protocol_addr,
+        ..
+ } = repr + { + Some((source_hardware_addr, source_protocol_addr)) + } else { + None + } +} + +#[test] +fn arp_replies_for_gateway() { + let mut stack = SlirpBackend::new().unwrap(); + stack + .process_guest_frame(&build_arp_request(SLIRP_GATEWAY_IP)) + .unwrap(); + let reply = drain_n(&mut stack, 2) + .into_iter() + .find_map(|f| parse_arp_reply(&f)) + .expect("arp reply for gateway"); + assert_eq!(reply.1, SLIRP_GATEWAY_IP); + assert_eq!(reply.0, EthernetAddress(GATEWAY_MAC)); +} + +#[test] +fn arp_replies_for_random_subnet_ip() { + let mut stack = SlirpBackend::new().unwrap(); + stack + .process_guest_frame(&build_arp_request(Ipv4Address::new(10, 0, 2, 99))) + .unwrap(); + let reply = drain_n(&mut stack, 2) + .into_iter() + .find_map(|f| parse_arp_reply(&f)) + .expect("arp reply for in-subnet IP"); + assert_eq!(reply.0, EthernetAddress(GATEWAY_MAC)); +} + +#[test] +fn arp_does_not_reply_for_guest_ip() { + let mut stack = SlirpBackend::new().unwrap(); + stack + .process_guest_frame(&build_arp_request(SLIRP_GUEST_IP)) + .unwrap(); + let reply = drain_n(&mut stack, 2) + .into_iter() + .find_map(|f| parse_arp_reply(&f)); + assert!(reply.is_none(), "stack must not claim guest's own IP"); +} + +/// Wire-format label for `example.com`, used in DNS query frames. +/// +/// Encoded as a DNS QNAME: each label is prefixed by its byte length, +/// terminated by a zero-length label. This is the representation that +/// goes directly into the DNS question section. +const QNAME_EXAMPLE_COM: &[u8] = b"\x07example\x03com\x00"; + +/// Builds a minimal DNS query UDP Ethernet frame from the guest to `SLIRP_DNS_IP`. +/// +/// `xid` is placed in the transaction-ID field. `qname` must be a +/// fully-encoded DNS name (length-prefixed labels, zero terminator). +/// The question section requests an A record (`QTYPE=1`, `QCLASS=1`). 
+///
+/// Unlike `build_udp_frame` (which carries a pre-existing off-by-one in
+/// the `payload_len` argument passed to `udp_repr.emit`), this helper
+/// passes only the DNS payload length so the UDP `len` field is correct
+/// and the stack's smoltcp parser accepts the frame.
+fn build_dns_query(xid: u16, qname: &[u8]) -> Vec<u8> {
+    // DNS message layout:
+    //   2B transaction ID
+    //   2B flags (standard query, RD=1)
+    //   2B QDCOUNT = 1
+    //   2B ANCOUNT = 0
+    //   2B NSCOUNT = 0
+    //   2B ARCOUNT = 0
+    //   ..B QNAME (length-label encoded, zero terminated)
+    //   2B QTYPE = 1 (A)
+    //   2B QCLASS = 1 (IN)
+    let mut dns_payload = Vec::new();
+    dns_payload.extend_from_slice(&xid.to_be_bytes());
+    dns_payload.extend_from_slice(&0x0100u16.to_be_bytes()); // flags: RD=1
+    dns_payload.extend_from_slice(&1u16.to_be_bytes()); // QDCOUNT
+    dns_payload.extend_from_slice(&0u16.to_be_bytes()); // ANCOUNT
+    dns_payload.extend_from_slice(&0u16.to_be_bytes()); // NSCOUNT
+    dns_payload.extend_from_slice(&0u16.to_be_bytes()); // ARCOUNT
+    dns_payload.extend_from_slice(qname);
+    dns_payload.extend_from_slice(&1u16.to_be_bytes()); // QTYPE A
+    dns_payload.extend_from_slice(&1u16.to_be_bytes()); // QCLASS IN
+
+    // Build the Ethernet frame manually so we can pass the correct
+    // `payload_len` (DNS payload only) to `udp_repr.emit`.
+ let udp_repr = UdpRepr { + src_port: GUEST_EPHEMERAL_PORT, + dst_port: 53, + }; + let ip_repr = Ipv4Repr { + src_addr: SLIRP_GUEST_IP, + dst_addr: SLIRP_DNS_IP, + next_header: IpProtocol::Udp, + payload_len: UDP_HDR_LEN + dns_payload.len(), + hop_limit: 64, + }; + let eth_repr = EthernetRepr { + src_addr: EthernetAddress(GUEST_MAC), + dst_addr: EthernetAddress(GATEWAY_MAC), + ethertype: EthernetProtocol::Ipv4, + }; + let total = ETH_HDR_LEN + ip_repr.buffer_len() + UDP_HDR_LEN + dns_payload.len(); + let mut buf = vec![0u8; total]; + let mut eth = EthernetFrame::new_unchecked(&mut buf[..]); + eth_repr.emit(&mut eth); + let mut ip = Ipv4Packet::new_unchecked(&mut buf[ETH_HDR_LEN..]); + ip_repr.emit(&mut ip, &Default::default()); + let mut udp = UdpPacket::new_unchecked(&mut buf[ETH_HDR_LEN + ip_repr.buffer_len()..]); + udp_repr.emit( + &mut udp, + &IpAddress::Ipv4(SLIRP_GUEST_IP), + &IpAddress::Ipv4(SLIRP_DNS_IP), + dns_payload.len(), // payload length only, not header+payload + |b| b.copy_from_slice(&dns_payload), + &Default::default(), + ); + buf +} + +/// Parses an Ethernet frame emitted by the stack and returns the DNS +/// transaction ID (XID) if the frame is a UDP datagram addressed to +/// the guest on port `GUEST_EPHEMERAL_PORT` with a plausible DNS +/// header (≥ 12 bytes of DNS payload). +/// +/// Returns `None` for any frame that does not match those criteria. 
+fn parse_dns_reply_xid(frame: &[u8]) -> Option<u16> {
+    let eth = EthernetFrame::new_checked(frame).ok()?;
+    if eth.ethertype() != EthernetProtocol::Ipv4 {
+        return None;
+    }
+    let ip = Ipv4Packet::new_checked(eth.payload()).ok()?;
+    if ip.next_header() != IpProtocol::Udp || ip.dst_addr() != SLIRP_GUEST_IP {
+        return None;
+    }
+    let udp = UdpPacket::new_checked(ip.payload()).ok()?;
+    if udp.dst_port() != GUEST_EPHEMERAL_PORT {
+        return None;
+    }
+    let dns_payload = udp.payload();
+    if dns_payload.len() < 12 {
+        return None;
+    }
+    Some(u16::from_be_bytes([dns_payload[0], dns_payload[1]]))
+}
+
+#[test]
+fn dns_query_resolves() {
+    let mut stack = match SlirpBackend::new() {
+        Ok(s) => s,
+        Err(e) => {
+            eprintln!("skip: SlirpBackend::new() failed ({e}), no DNS available");
+            return;
+        }
+    };
+
+    let query = build_dns_query(0x1234, QNAME_EXAMPLE_COM);
+    if let Err(e) = stack.process_guest_frame(&query) {
+        eprintln!("skip: process_guest_frame failed ({e})");
+        return;
+    }
+
+    let mut reply_xid: Option<u16> = None;
+    for _ in 0..20 {
+        for frame in stack.poll() {
+            if let Some(xid) = parse_dns_reply_xid(&frame) {
+                reply_xid = Some(xid);
+            }
+        }
+        if reply_xid.is_some() {
+            break;
+        }
+        std::thread::sleep(std::time::Duration::from_millis(100));
+    }
+
+    match reply_xid {
+        Some(xid) => assert_eq!(xid, 0x1234, "reply XID must match query XID"),
+        None => {
+            eprintln!("skip: no DNS reply in 20×100 ms, upstream resolver unreachable");
+        }
+    }
+}
+
+#[test]
+fn dns_cache_keys_by_question_not_xid() {
+    let mut stack = match SlirpBackend::new() {
+        Ok(s) => s,
+        Err(e) => {
+            eprintln!("skip: SlirpBackend::new() failed ({e}), no DNS available");
+            return;
+        }
+    };
+
+    // Warm the cache with xid=1.
+    let warm_query = build_dns_query(0x0001, QNAME_EXAMPLE_COM);
+    if let Err(e) = stack.process_guest_frame(&warm_query) {
+        eprintln!("skip: warm query process_guest_frame failed ({e})");
+        return;
+    }
+    let mut warmed = false;
+    for _ in 0..20 {
+        for frame in stack.poll() {
+            if let Some(xid) = parse_dns_reply_xid(&frame) {
+                if xid == 0x0001 {
+                    warmed = true;
+                }
+            }
+        }
+        if warmed {
+            break;
+        }
+        std::thread::sleep(std::time::Duration::from_millis(100));
+    }
+    if !warmed {
+        eprintln!("skip: cache warm-up timed out, upstream resolver unreachable");
+        return;
+    }
+
+    // Now query with xid=2; the cache must rewrite the reply XID to 2.
+    let second_query = build_dns_query(0x0002, QNAME_EXAMPLE_COM);
+    if let Err(e) = stack.process_guest_frame(&second_query) {
+        eprintln!("skip: second query process_guest_frame failed ({e})");
+        return;
+    }
+    let mut reply_xid: Option<u16> = None;
+    for _ in 0..20 {
+        for frame in stack.poll() {
+            if let Some(xid) = parse_dns_reply_xid(&frame) {
+                reply_xid = Some(xid);
+            }
+        }
+        if reply_xid.is_some() {
+            break;
+        }
+        std::thread::sleep(std::time::Duration::from_millis(100));
+    }
+
+    match reply_xid {
+        Some(xid) => assert_eq!(xid, 0x0002, "cache must rewrite XID to match the new query"),
+        None => {
+            eprintln!("skip: no reply for second query in 20×100 ms");
+        }
+    }
+}
+
+/// BROKEN_ON_PURPOSE pin (now passing): arbitrary UDP (any destination
+/// port, not just 53) round-trips through the per-flow connected-socket
+/// NAT.
+#[test]
+fn udp_non_dns_round_trips() {
+    let host_sock = UdpSocket::bind("127.0.0.1:0").unwrap();
+    let host_port = host_sock.local_addr().unwrap().port();
+    host_sock
+        .set_read_timeout(Some(std::time::Duration::from_millis(500)))
+        .unwrap();
+
+    let mut stack = SlirpBackend::new().unwrap();
+
+    // Guest → gateway:host_port (translated to 127.0.0.1:host_port).
+ stack + .process_guest_frame(&build_udp_frame( + SLIRP_GATEWAY_IP, + GUEST_EPHEMERAL_PORT, + host_port, + b"hello", + )) + .unwrap(); + let _ = drain_n(&mut stack, 4); + + // Host receives the datagram. + let mut buf = [0u8; 32]; + let (n, peer) = host_sock + .recv_from(&mut buf) + .expect("host receives guest UDP"); + assert_eq!(&buf[..n], b"hello"); + + // Host echoes back. + host_sock.send_to(&buf[..n], peer).unwrap(); + + // Drain — guest should see the reply on its source port. + let mut saw_reply = false; + for _ in 0..20 { + for f in drain_n(&mut stack, 1) { + let Some(eth) = EthernetFrame::new_checked(f.as_slice()).ok() else { + continue; + }; + if eth.ethertype() != EthernetProtocol::Ipv4 { + continue; + } + let Some(ip) = Ipv4Packet::new_checked(eth.payload()).ok() else { + continue; + }; + if ip.next_header() != IpProtocol::Udp { + continue; + } + let Some(udp_pkt) = UdpPacket::new_checked(ip.payload()).ok() else { + continue; + }; + if udp_pkt.dst_port() == GUEST_EPHEMERAL_PORT && udp_pkt.payload() == b"hello" { + saw_reply = true; + break; + } + } + if saw_reply { + break; + } + std::thread::sleep(std::time::Duration::from_millis(50)); + } + assert!(saw_reply, "guest must receive UDP reply via per-flow NAT"); +} + +/// BROKEN_ON_PURPOSE pin (now passing): the guest receives an ICMP echo +/// reply via the host's unprivileged `IPPROTO_ICMP SOCK_DGRAM` socket. +/// +/// Skips gracefully if `net.ipv4.ping_group_range` forbids unprivileged +/// ICMP for the calling GID — in that environment the warn-once log +/// fires and the SLIRP stack drops ICMP, which is the documented +/// fallback (see `slirp.rs::ICMP_PROBE`). +#[test] +fn icmp_echo_returns_reply() { + use smoltcp::wire::{Icmpv4Packet, Icmpv4Repr}; + + // Probe whether unprivileged ICMP is permitted on this host. If not, + // skip gracefully — the SLIRP stack falls back to silently dropping + // ICMP in that environment (see slirp.rs::ICMP_PROBE). 
+ let probe_fd = unsafe { libc::socket(libc::AF_INET, libc::SOCK_DGRAM, libc::IPPROTO_ICMP) }; + if probe_fd < 0 { + let err = std::io::Error::last_os_error(); + let raw = err.raw_os_error().unwrap_or(0); + if raw == libc::EPERM || raw == libc::EACCES { + eprintln!("skip: unprivileged ICMP forbidden ({err}); see net.ipv4.ping_group_range"); + return; + } + panic!("unexpected ICMP probe error: {err}"); + } + unsafe { libc::close(probe_fd) }; + + let icmp_repr = Icmpv4Repr::EchoRequest { + ident: 0xbeef, + seq_no: 1, + data: b"ping", + }; + let ip_repr = Ipv4Repr { + src_addr: SLIRP_GUEST_IP, + // 127.0.0.1 — the host kernel always replies on loopback. + dst_addr: Ipv4Address::new(127, 0, 0, 1), + next_header: IpProtocol::Icmp, + payload_len: icmp_repr.buffer_len(), + hop_limit: 64, + }; + let eth_repr = EthernetRepr { + src_addr: EthernetAddress(GUEST_MAC), + dst_addr: EthernetAddress(GATEWAY_MAC), + ethertype: EthernetProtocol::Ipv4, + }; + let total = ETH_HDR_LEN + ip_repr.buffer_len() + icmp_repr.buffer_len(); + let mut buf = vec![0u8; total]; + let mut eth = EthernetFrame::new_unchecked(&mut buf[..]); + eth_repr.emit(&mut eth); + let mut ip = Ipv4Packet::new_unchecked(&mut buf[ETH_HDR_LEN..]); + ip_repr.emit(&mut ip, &Default::default()); + let mut icmp = Icmpv4Packet::new_unchecked(&mut buf[ETH_HDR_LEN + ip_repr.buffer_len()..]); + icmp_repr.emit(&mut icmp, &Default::default()); + + let mut stack = match SlirpBackend::new() { + Ok(s) => s, + Err(_) => { + eprintln!("skip: SlirpBackend::new failed"); + return; + } + }; + if stack.process_guest_frame(&buf).is_err() { + eprintln!("skip: process_guest_frame failed (likely no ICMP support)"); + return; + } + + // Poll up to 20 × 50ms for the reply. 
+    let mut saw_reply = false;
+    for _ in 0..20 {
+        for f in drain_n(&mut stack, 1) {
+            let Some(eth) = EthernetFrame::new_checked(f.as_slice()).ok() else {
+                continue;
+            };
+            if eth.ethertype() != EthernetProtocol::Ipv4 {
+                continue;
+            }
+            let Some(ip) = Ipv4Packet::new_checked(eth.payload()).ok() else {
+                continue;
+            };
+            if ip.next_header() == IpProtocol::Icmp && ip.dst_addr() == SLIRP_GUEST_IP {
+                saw_reply = true;
+                break;
+            }
+        }
+        if saw_reply {
+            break;
+        }
+        std::thread::sleep(std::time::Duration::from_millis(50));
+    }
+
+    assert!(
+        saw_reply,
+        "guest must receive ICMP echo reply via host IPPROTO_ICMP socket"
+    );
+}
+
+#[test]
+fn slirp_backend_implements_network_backend() {
+    fn assert_send<T: Send>() {}
+    fn assert_backend<T: NetworkBackend>() {}
+    assert_send::<SlirpBackend>();
+    assert_backend::<SlirpBackend>();
+}
+
+#[test]
+fn nat_translate_outbound_loopback_rewrite() {
+    let rules = Rules {
+        gateway_loopback: true,
+        deny_cidrs: vec![],
+        port_forwards: vec![],
+    };
+    let result = translate_outbound(&rules, SLIRP_GATEWAY_IP, 80, SLIRP_GATEWAY_IP).unwrap();
+    assert_eq!(
+        result,
+        SocketAddr::from((Ipv4Addr::LOCALHOST, 80)),
+        "gateway IP must be rewritten to 127.0.0.1 when gateway_loopback=true"
+    );
+}
+
+#[test]
+fn nat_translate_outbound_unmodified_external_ip() {
+    let rules = Rules {
+        gateway_loopback: true,
+        deny_cidrs: vec![],
+        port_forwards: vec![],
+    };
+    let external = Ipv4Address::new(8, 8, 8, 8);
+    let result = translate_outbound(&rules, external, 53, SLIRP_GATEWAY_IP).unwrap();
+    assert_eq!(
+        result,
+        SocketAddr::from((Ipv4Addr::new(8, 8, 8, 8), 53)),
+        "non-gateway IPs must pass through unchanged"
+    );
+}
+
+/// E2E contract for inbound port-forwarding.
+///
+/// Builds a `SlirpBackend` with one TCP port-forward rule
+/// (`HOST_PORT` → `GUEST_PORT`), has a host thread connect to
+/// `127.0.0.1:HOST_PORT`, then drives `drain_to_guest` and
+/// synthesizes a guest TCP listener by responding with SYN-ACK to
+/// the synthesized SYN the stack emits.
+///
+/// The test asserts **three** contract points, each covering a distinct
+/// 5.5b sub-task:
+///
+/// 1. `host TcpStream::connect` **succeeds** — the listener thread
+///    (5.5b.3) is bound and accepts incoming connections.
+/// 2. `drain_to_guest` **emits a synthesized SYN** to `GUEST_PORT` —
+///    `process_pending_inbound_accepts` (5.5b.3) dequeues the
+///    `InboundAccept` and `synthesize_inbound_syn` (5.5b.2) emits the
+///    SYN frame; `with_security` (5.5b.4) wired the channel.
+/// 3. After the synthetic guest replies with SYN-ACK, `drain_to_guest`
+///    **emits an ACK frame** — the `SynSent → Established` arm (5.5b.1)
+///    fired and the handshake completed end-to-end.
+///
+/// Byte-level round-trip is deferred — connect + full 3WH completion
+/// is the minimum contract for the listener implementation.
+#[test]
+fn tcp_port_forward_inbound_connect_succeeds() {
+    use std::sync::mpsc;
+    use std::time::{Duration, Instant};
+
+    const HOST_PORT: u16 = 18080;
+    const GUEST_PORT: u16 = 8080;
+    const GUEST_ISN: u32 = 5000;
+
+    let mut stack = SlirpBackend::with_security(64, 1000, &[], &[(HOST_PORT, GUEST_PORT)])
+        .expect("build stack with port-forward rule");
+
+    // ── Contract 1: listener thread is bound and accepts connections ─────
+    // Spawn the host connector in a background thread so it doesn't block
+    // the test thread. The OS-level SYN/SYN-ACK/ACK between host connector
+    // and the listener socket is handled by the kernel; the SLIRP stack
+    // is not involved in that handshake.
+    let (tx, rx) = mpsc::channel::<std::io::Result<std::net::TcpStream>>();
+    std::thread::spawn(move || {
+        let result = std::net::TcpStream::connect_timeout(
+            &format!("127.0.0.1:{HOST_PORT}").parse().unwrap(),
+            Duration::from_secs(5),
+        );
+        let _ = tx.send(result);
+    });
+
+    // ── Contract 2 + 3: drain until we see the synthesized SYN (2) and ──
+    // then the ACK that completes the inbound 3WH (3).
+    let deadline = Instant::now() + Duration::from_secs(5);
+    let mut saw_synthesized_syn = false;
+    let mut saw_ack_after_synack = false;
+    let mut connect_result: Option<std::io::Result<std::net::TcpStream>> = None;
+
+    while Instant::now() < deadline
+        && (!saw_synthesized_syn || !saw_ack_after_synack || connect_result.is_none())
+    {
+        let mut out = Vec::new();
+        stack.drain_to_guest(&mut out);
+
+        let mut high_port_for_ack: Option<u16> = None;
+
+        for frame in &out {
+            let Some((syn_seq, _ack, src_port, dst_port, ctrl)) = parse_tcp_to_guest_full(frame)
+            else {
+                continue;
+            };
+
+            // Contract 2: synthesized SYN arriving at the guest.
+            if ctrl == TcpControl::Syn && dst_port == GUEST_PORT && !saw_synthesized_syn {
+                saw_synthesized_syn = true;
+                high_port_for_ack = Some(src_port);
+
+                // Synthetic guest listener replies with SYN-ACK.
+                // build_tcp_frame: src=SLIRP_GUEST_IP, dst=SLIRP_GATEWAY_IP
+                let syn_ack = build_tcp_frame(
+                    SLIRP_GATEWAY_IP, // dst from guest's perspective
+                    GUEST_PORT,       // guest service port (src_port in frame)
+                    src_port,         // high_port (dst_port in frame)
+                    GUEST_ISN,        // guest's own ISN
+                    syn_seq + 1,      // ack = their SYN seq + 1
+                    TcpControl::Syn,  // SYN+ACK: ack_number is non-zero
+                    &[],
+                );
+                stack
+                    .process_guest_frame(&syn_ack)
+                    .expect("process synthetic SYN-ACK");
+            }
+
+            // Contract 3: ACK back to the guest completing the inbound 3WH.
+            // After processing our SYN-ACK, the stack emits a plain ACK
+            // (ctrl=None, ack set) directed at GUEST_PORT.
+            if ctrl == TcpControl::None
+                && dst_port == GUEST_PORT
+                && high_port_for_ack == Some(src_port)
+            {
+                saw_ack_after_synack = true;
+            }
+        }
+
+        // A second drain pass so the stack processes the SYN-ACK we just
+        // injected and emits its ACK in the same iteration.
+ let mut ack_out = Vec::new(); + stack.drain_to_guest(&mut ack_out); + for frame in &ack_out { + let Some((_seq, _ack, src_port, dst_port, ctrl)) = parse_tcp_to_guest_full(frame) + else { + continue; + }; + if ctrl == TcpControl::None + && dst_port == GUEST_PORT + && high_port_for_ack == Some(src_port) + { + saw_ack_after_synack = true; + } + } + + if let Ok(r) = rx.try_recv() { + connect_result = Some(r); + } + + std::thread::sleep(Duration::from_millis(10)); + } + + // Contract 1. + let connect_result = + connect_result.expect("host TcpStream::connect did not complete within 5 s"); + let _stream = connect_result.expect("host TcpStream::connect failed"); + + // Contract 2. + assert!( + saw_synthesized_syn, + "drain_to_guest must emit a synthesized SYN to GUEST_PORT \ + after drain_to_guest processes the InboundAccept (5.5b.2/5.5b.3)" + ); + + // Contract 3. + assert!( + saw_ack_after_synack, + "drain_to_guest must emit an ACK completing the inbound 3-way handshake \ + after the synthetic guest SYN-ACK is processed (5.5b.1)" + ); +} + +/// Richer TCP-to-guest frame parser that also returns src/dst ports. +/// +/// Returns `(seq, ack, src_port, dst_port, control)` for any IPv4/TCP +/// frame whose destination is `SLIRP_GUEST_IP`, or `None` for anything +/// else. Used by `tcp_port_forward_inbound_connect_succeeds` to identify +/// the synthesized SYN and extract the ephemeral `high_port`. 
+fn parse_tcp_to_guest_full(frame: &[u8]) -> Option<(u32, u32, u16, u16, TcpControl)> {
+    let eth = EthernetFrame::new_checked(frame).ok()?;
+    if eth.ethertype() != EthernetProtocol::Ipv4 {
+        return None;
+    }
+    let ip = Ipv4Packet::new_checked(eth.payload()).ok()?;
+    if ip.next_header() != IpProtocol::Tcp || ip.dst_addr() != SLIRP_GUEST_IP {
+        return None;
+    }
+    let tcp = TcpPacket::new_checked(ip.payload()).ok()?;
+    let control = match (tcp.syn(), tcp.fin(), tcp.rst(), tcp.psh()) {
+        (false, false, false, false) => TcpControl::None,
+        (false, false, false, true) => TcpControl::Psh,
+        (true, false, false, _) => TcpControl::Syn,
+        (false, true, false, _) => TcpControl::Fin,
+        (false, false, true, _) => TcpControl::Rst,
+        _ => return None,
+    };
+    Some((
+        tcp.seq_number().0 as u32,
+        tcp.ack_number().0 as u32,
+        tcp.src_port(),
+        tcp.dst_port(),
+        control,
+    ))
+}
+
+#[test]
+fn nat_translate_outbound_deny_list() {
+    let rules = Rules {
+        gateway_loopback: true,
+        deny_cidrs: vec!["169.254.0.0/16".parse::<Ipv4Net>().unwrap()],
+        port_forwards: vec![],
+    };
+    let metadata = Ipv4Address::new(169, 254, 169, 254);
+    assert!(
+        translate_outbound(&rules, metadata, 80, SLIRP_GATEWAY_IP).is_none(),
+        "deny-listed IP must return None"
+    );
+
+    // Adjacent (non-denied) IP still passes.
+    let public = Ipv4Address::new(169, 253, 0, 1);
+    assert!(
+        translate_outbound(&rules, public, 80, SLIRP_GATEWAY_IP).is_some(),
+        "IPs outside deny CIDR must pass"
+    );
+}
+
+/// Snapshot/restore must rebuild the epoll dispatch from `flow_table`
+/// contents. The `epoll_fd` is a kernel handle that does not survive
+/// snapshot; a fresh dispatcher starts with zero registered FDs even
+/// though `flow_table` may contain entries with live host sockets.
+///
+/// This smoke test verifies the rebuild path end-to-end:
+/// 1. 
Reset the epoll dispatcher to a fresh empty one (simulating what +/// snapshot restore does: the kernel handle is gone, a new one is created). +/// 3. Confirm the pre-rebuild count is zero. +/// 4. Call `rebuild_epoll_from_flow_table`. +/// 5. Confirm the post-rebuild count is one. +/// +/// Gated on `bench-helpers` because it consumes synthetic-injection helpers +/// (`insert_synthetic_synsent_entry`, `reset_epoll_for_snapshot_test`, +/// `registered_fd_count`) that are only visible to external test/bench +/// consumers when that feature is enabled. Default `cargo test` skips this +/// pin; CI runs it via `cargo test --features bench-helpers`. +#[cfg(feature = "bench-helpers")] +#[test] +fn epoll_set_rebuilt_from_flow_table_smoke() { + use std::net::TcpListener; + + let mut backend = SlirpBackend::new().expect("backend"); + + let listener = TcpListener::bind("127.0.0.1:0").expect("bind"); + let host_stream = + std::net::TcpStream::connect(listener.local_addr().unwrap()).expect("connect"); + host_stream.set_nonblocking(true).ok(); + + // Insert a synthetic flow (may or may not register with epoll depending on + // cfg context). Then reset the epoll dispatcher to a fresh empty one — + // this is the key step that simulates what happens after snapshot restore: + // the kernel-side `epoll_fd` does not survive, so a new one is created + // with zero registrations even though `flow_table` has live entries. + backend.insert_synthetic_synsent_entry(8080, 49152, 1000, host_stream); + backend.reset_epoll_for_snapshot_test(); + + let before = backend.registered_fd_count(); + assert_eq!( + before, 0, + "after reset, epoll must have zero registered FDs (simulates post-snapshot state)" + ); + + backend.rebuild_epoll_from_flow_table(); + + let after = backend.registered_fd_count(); + assert_eq!( + after, 1, + "rebuild_epoll_from_flow_table must register all live flow FDs" + ); +}