From 07917396633d47d30bf7491646ee6bfdb52254cb Mon Sep 17 00:00:00 2001 From: diego Date: Tue, 5 May 2026 08:49:15 -0300 Subject: [PATCH 01/11] =?UTF-8?q?docs:=20Phase=206.3=20detailed=20TDD=20pl?= =?UTF-8?q?an=20=E2=80=94=20TCP=20window=20management?= MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit 10 bite-sized tasks covering proper TCP windowing: - TcpNatEntry tracks guest_window (u32) + guest_window_scale (u8) - handle_tcp_frame parses tcp.window_scale() on guest SYN, stores per-flow; updates guest_window on every incoming frame - build_tcp_packet_static signature changes to take (window_len, window_scale) — caller decides - SYN-ACK negotiates OUR_WINDOW_SCALE = 7 (passt's default; 128x) - New host_recv_window helper queries TCP_INFO.tcpi_rcv_space and scales it for the advertised window on outgoing frames - relay_tcp_nat_data gates host→guest sends on entry.guest_window to honor real backpressure - Three new pins: tcp_advertised_window_tracks_guest_buffer (BROKEN_ON_PURPOSE → flips at Task 7), tcp_window_scale_negotiated_in_synack, plus tcp_bulk_throughput_constrained_window parametric bench Severity: MEDIUM — perf gap. Hardcoded window_len: 65535 caps throughput at 64 KB / RTT regardless of bandwidth, and inject_to_guest can grow unbounded if the guest is slow. --- .../2026-04-30-smoltcp-passt-port-phase6.3.md | 429 ++++++++++++++++++ 1 file changed, 429 insertions(+) create mode 100644 docs/superpowers/plans/2026-04-30-smoltcp-passt-port-phase6.3.md diff --git a/docs/superpowers/plans/2026-04-30-smoltcp-passt-port-phase6.3.md b/docs/superpowers/plans/2026-04-30-smoltcp-passt-port-phase6.3.md new file mode 100644 index 00000000..57394503 --- /dev/null +++ b/docs/superpowers/plans/2026-04-30-smoltcp-passt-port-phase6.3.md @@ -0,0 +1,429 @@ +# Phase 6.3: TCP Window Management Implementation Plan + +> **For agentic workers:** REQUIRED SUB-SKILL: Use superpowers:subagent-driven-development. 
+ +**Goal:** Stop ignoring the guest's advertised window and stop hardcoding our advertised window. Track per-flow guest window (with scaling), advertise our own window from the host kernel's actual recv-buffer headroom, and negotiate `window_scale` on the SYN/SYN-ACK exchange. + +**Severity:** Medium — perf gap. The current code emits `window_len: 65535, window_scale: None` on every outgoing frame, never reads `tcp.window_len()` from incoming guest frames, and never honors guest backpressure on host→guest send. Effect: throughput is capped at 64 KB / RTT regardless of available bandwidth, and `inject_to_guest` can grow unbounded if the guest is slow. + +**Architecture:** Add `guest_window: u32` and `guest_window_scale: u8` fields to `TcpNatEntry`. Read window updates from incoming guest packets; respect them when deciding how much to send via `frames_to_inject`. Negotiate `window_scale: Some(7)` (128× scale, passt's default) on the SYN-ACK we emit. On every outgoing frame, advertise a window derived from `getsockopt(TCP_INFO).tcpi_rcv_space` so the guest sees real backpressure. + +**Tech stack:** smoltcp 0.11 wire types (already in use). `libc::getsockopt(..., TCP_INFO, ...)` for kernel rcv-space. No new crates. + +--- + +## Background + +Three things are wrong today: + +1. `src/network/slirp.rs:93` — `const TCP_WINDOW: u16 = 65535`. Hardcoded. +2. `build_tcp_packet_static` at `src/network/slirp.rs:2811-2827` — emits `window_len: TCP_WINDOW` and `window_scale: None` on every frame. Never reads anything from the host kernel. +3. The guest-frame parser in `handle_tcp_frame` never reads `tcp.window_len()` from the incoming `TcpRepr`. The guest's advertised window is silently discarded; we treat the guest as having infinite buffer. + +The 256 KB host→guest cap that Phase 3 fixed (`tcp_writes_more_than_256kb_succeed`) was a userspace-side band-aid for the symptom. 
With proper window honoring, host→guest is bounded by the guest's *actual* receive buffer, which is normally far larger than 256 KB on a modern Linux kernel guest with `tcp_window_scaling=1`. + +passt's `tcp_conn` ([passt/tcp_conn.h:21](https://passt.top/passt/tree/tcp_conn.h#n21)) tracks `wnd_from_tap`, `wnd_to_tap`, scale factors, and updates ACK/window per [tcp.c:1021](https://passt.top/passt/tree/tcp.c#n1021), [tcp.c:1426](https://passt.top/passt/tree/tcp.c#n1426). + +## Invariants (carried) + +1. All-Rust path. `libc::getsockopt` is fine. +2. Full observability — log scale negotiation in trace; log window updates at debug. +3. Cross-platform discipline. +4. No regression in Phase 0–5 + 5.5b + 6.4 + listener-on-epoll + 6.1 + 6.2 baselines. +5. Snapshot/restore correctness — new fields need `#[serde(default)]` for backward compatibility with pre-6.3 snapshots; default to scale=0 / window=65535 (current behavior). + +--- + +## File impact + +| File | Action | +|---|---| +| `src/network/slirp.rs` | `TcpNatEntry` adds `guest_window: u32`, `guest_window_scale: u8`. `build_tcp_packet_static` signature changes to take a `(window_len, window_scale)` pair. `handle_tcp_frame` reads window updates. SYN/SYN-ACK paths negotiate scale. `relay_tcp_nat_data` gates host→guest sends on `entry.guest_window`. | +| `tests/network_baseline.rs` | Two new pins. | +| `benches/network.rs` | One new bench: `tcp_bulk_throughput_constrained_window` (parametric on guest window size). | + +--- + +## Tasks + +### Task 1: Add `guest_window` + `guest_window_scale` fields, default-init + +In `src/network/slirp.rs`, extend `TcpNatEntry`: + +```rust +struct TcpNatEntry { + // ... existing fields ... + /// Guest's advertised receive window in bytes, scaled per + /// `guest_window_scale`. Updated on every incoming TCP frame's + /// `window_len`. Initial value 65535 matches an unscaled SYN. + guest_window: u32, + /// Window-scale shift the guest negotiated in its SYN. 
Zero + /// means "guest does not support window scaling" (or we did not + /// see a window-scale option in the SYN). + guest_window_scale: u8, +} +``` + +Initialize at every `TcpNatEntry { ... }` literal site: +- `handle_tcp_frame` SYN handler (the existing `Connecting`/`SynReceived` paths) +- `process_pending_inbound_accepts` +- `insert_synthetic_synsent_entry` (bench-helpers) +- `insert_synthetic_lastack_entry` (bench-helpers) +- `insert_synthetic_connecting_entry` (bench-helpers, just added by 6.2) + +Initial values: `guest_window: 65535, guest_window_scale: 0` (no-op default). + +Snapshot serde: `#[serde(default)]` on each new field. Run `cargo check`. + +**Commit:** `feat(slirp): TcpNatEntry tracks guest_window + guest_window_scale` + +--- + +### Task 2: Read window scale from guest SYN; track per-flow + +When the guest sends SYN (in `handle_tcp_frame`'s SYN flow), parse the TCP options for `WindowScale`. smoltcp's `TcpRepr` exposes `window_scale: Option<u8>`. Stash it in the entry: + +```rust +let window_scale = tcp.window_scale().unwrap_or(0); +// ... in the entry literal ... +guest_window_scale: window_scale, +guest_window: u32::from(tcp.window_len()) << window_scale, +``` + +For the SYN-ACK we emit back to the guest, advertise our own scale (suggest 7 = 128×, matching passt). This requires changing `build_tcp_packet_static`'s signature — see Task 4. + +Run `cargo check` (no semantic change yet — we're just stashing values). + +**Commit:** `feat(slirp): parse guest's window_scale on SYN, store on flow` + +--- + +### Task 3: Update `guest_window` on every incoming guest packet + +In `handle_tcp_frame`, after locating the `entry` and after the existing `entry.last_activity = Instant::now()` line, update window: + +```rust +entry.guest_window = u32::from(tcp.window_len()) << entry.guest_window_scale; +``` + +This runs for every frame the guest sends — data, pure ACK, FIN, RST. Always reflects the most recent advertised window. + +`cargo check`. No tests yet. 
+**Commit:** `feat(slirp): track guest's advertised window on every incoming frame` + +--- + +### Task 4: Change `build_tcp_packet_static` to take `(window_len, window_scale)` + +Currently: + +```rust +fn build_tcp_packet_static( + src_ip, dst_ip, src_port, dst_port, seq, ack, control, payload, +) -> Vec<u8> { + // ... uses TCP_WINDOW + None internally +} +``` + +Change signature to add explicit window parameters: + +```rust +fn build_tcp_packet_static( + src_ip: Ipv4Address, + dst_ip: Ipv4Address, + src_port: u16, + dst_port: u16, + seq: u32, + ack: u32, + control: TcpControl, + payload: &[u8], + window_len: u16, + window_scale: Option<u8>, +) -> Vec<u8> { + let tcp_repr = TcpRepr { + // ... + window_len, + window_scale, + // ... + }; + // ... unchanged below ... +} +``` + +Search-replace every call site to thread the new arguments. The call sites split into three cases: + +- **SYN-ACK** (in the SYN handler): pass `(65535, Some(OUR_WINDOW_SCALE))` where `OUR_WINDOW_SCALE = 7`. +- **All other frames** (data ACKs, plain ACKs, FIN-ACK, RST): pass `(advertised_window, None)`. `advertised_window` is computed from the host kernel via `host_recv_window(entry.host_stream.as_raw_fd())` — a new helper added in Task 5. + +For Task 4 specifically: get the signature change landed and pass `(65535, None)` everywhere except the SYN-ACK site, which passes `(65535, Some(7))`. Subsequent tasks adapt the value. + +Add a module-level constant near `TCP_WINDOW`: + +```rust +/// Window-scale shift we advertise on SYN-ACK frames. Matches passt's +/// default. 7 means each unit in `window_len` represents 128 bytes, +/// extending the effective window from 64 KiB to 8 MiB. +const OUR_WINDOW_SCALE: u8 = 7; +``` + +Run `cargo check && cargo test --test network_baseline`. Expected: 22/22 still pass — no behavior change beyond scale negotiation. 
+**Commit:** `refactor(slirp): build_tcp_packet_static takes (window_len, window_scale)` + +--- + +### Task 5: `host_recv_window` helper + use it on outgoing frames + +New helper that reads `TCP_INFO.tcpi_rcv_space` from the host kernel: + +```rust +/// Host kernel's current receive-buffer headroom, scaled down by +/// `OUR_WINDOW_SCALE`, for advertising as our `window_len` on +/// outgoing frames. Returns `32768` (a conservative middle +/// value) on getsockopt failure rather than 0 (which would stall +/// the connection) or `u16::MAX` (which would overcommit). +fn host_recv_window(fd: RawFd) -> u16 { + use std::mem::MaybeUninit; + let mut info: MaybeUninit<libc::tcp_info> = MaybeUninit::zeroed(); + let mut len = std::mem::size_of::<libc::tcp_info>() as libc::socklen_t; + // SAFETY: getsockopt fills `info` if it returns 0. + let rc = unsafe { + libc::getsockopt( + fd, + libc::IPPROTO_TCP, + libc::TCP_INFO, + info.as_mut_ptr() as *mut libc::c_void, + &mut len, + ) + }; + if rc != 0 { + return 32768; + } + let info = unsafe { info.assume_init() }; + let scaled = info.tcpi_rcv_space >> OUR_WINDOW_SCALE; + scaled.min(u32::from(u16::MAX)) as u16 +} +``` + +`libc::tcp_info` and `libc::TCP_INFO` are stable in the libc crate. + +Update every `build_tcp_packet_static` call site that passes `(65535, None)` to instead pass `(host_recv_window(entry.host_stream.as_raw_fd()), None)`. The SYN-ACK site keeps `(65535, Some(OUR_WINDOW_SCALE))`. + +Doc-comment the trade: a fresh socket has `tcpi_rcv_space` pre-filled to ~32 KiB; under load it grows to 4 MiB+ on Linux. Scaled by `>> 7`, that gives 256 KiB advertised. Both extremes are within `u16::MAX`. + +Run baseline + bulk-throughput bench. Expected: no regression on `tcp_bulk_throughput_1mb`; possibly slight improvement. 
+ +**Commit:** `feat(slirp): advertise host-kernel-derived window on outgoing frames` + +--- + +### Task 6: Failing pin — `tcp_advertised_window_tracks_guest_buffer` + +Synthesize a guest with a small advertised window (4096 bytes, no scale), push 64 KB of data from host, assert `inject_to_guest` never holds more than ~4 KB of un-acknowledged bytes. Today the test fails because we ignore the guest's window. + +```rust +#[test] +fn tcp_advertised_window_tracks_guest_buffer() { + use std::io::Write; + use std::net::TcpListener; + + let listener = TcpListener::bind("127.0.0.1:0").unwrap(); + let host_port = listener.local_addr().unwrap().port(); + let server = std::thread::spawn(move || -> std::net::TcpStream { + let (sock, _) = listener.accept().unwrap(); + sock + }); + + let mut stack = SlirpBackend::new().unwrap(); + + let our_seq = 1000u32; + // Guest SYN with explicit small window (no scale). + let syn = build_tcp_frame_with_window( + SLIRP_GATEWAY_IP, GUEST_EPHEMERAL_PORT, host_port, + our_seq, 0, TcpControl::Syn, &[], + 4096, None, + ); + stack.process_guest_frame(&syn).unwrap(); + + let mut gateway_seq = 0u32; + for f in drain_n(&mut stack, 4) { + if let Some((s, _, ctrl, _)) = parse_tcp_to_guest(f.as_slice()) { + if matches!(ctrl, TcpControl::Syn) { gateway_seq = s; break; } + } + } + + // Complete handshake with the same small window. + stack.process_guest_frame(&build_tcp_frame_with_window( + SLIRP_GATEWAY_IP, GUEST_EPHEMERAL_PORT, host_port, + our_seq + 1, gateway_seq + 1, TcpControl::None, &[], + 4096, None, + )).unwrap(); + + let mut host_stream = server.join().unwrap(); + let payload = vec![0xAB; 64 * 1024]; + host_stream.write_all(&payload).unwrap(); + + // Drive drain_to_guest a few times. With proper window tracking, + // total bytes injected before any ACK should be <= guest_window + // (4096 plus a small slop for one MTU-sized segment). 
+ let mut total_payload_injected: usize = 0; + for _ in 0..20 { + for f in drain_n(&mut stack, 1) { + if let Some((_, _, _, payload_len)) = parse_tcp_to_guest(f.as_slice()) { + total_payload_injected += payload_len; + } + } + std::thread::sleep(std::time::Duration::from_millis(10)); + } + + assert!( + total_payload_injected <= 4096 + 1500, + "injected {total_payload_injected} bytes; must respect guest_window=4096 (one MTU slop allowed)" + ); +} + +fn build_tcp_frame_with_window( + dst_ip: Ipv4Address, src_port: u16, dst_port: u16, + seq: u32, ack: u32, control: TcpControl, payload: &[u8], + window_len: u16, window_scale: Option<u8>, +) -> Vec<u8> { + // Same shape as build_tcp_frame but plumbs window_len/scale. + // ... +} +``` + +Run: should FAIL pre-Task-7 — `relay_tcp_nat_data` doesn't gate on `entry.guest_window` yet. + +**Commit:** `test(network): pin tcp_advertised_window_tracks_guest_buffer (BROKEN_ON_PURPOSE)` + +--- + +### Task 7: Gate host→guest sends on `entry.guest_window` + +In `relay_tcp_nat_data`, where we currently push frames into `frames_to_inject` for the relay's data path, compute the un-ACKed-bytes-in-flight and STOP sending when it would exceed `entry.guest_window`: + +```rust +// Before pushing a new payload chunk: +let in_flight = entry.bytes_in_flight as usize; +let window_remaining = (entry.guest_window as usize).saturating_sub(in_flight); +if window_remaining == 0 { + // Guest window is full; stop sending until guest ACKs. + trace!( + "SLIRP TCP: guest window exhausted on flow guest_port={} (in_flight={}, window={})", + key.guest_src_port, in_flight, entry.guest_window + ); + break; +} +let chunk_size = chunk.len().min(window_remaining); +let chunk = &chunk[..chunk_size]; +``` + +The `bytes_in_flight` field already exists from Phase 3 — use it. + +Run the pin from Task 6: should now PASS. 
+ +**Commit:** `feat(slirp): gate host→guest send on guest's advertised window` + +--- + +### Task 8: Failing pin — `tcp_window_scale_negotiated_in_synack` + +```rust +#[test] +fn tcp_window_scale_negotiated_in_synack() { + let listener = std::net::TcpListener::bind("127.0.0.1:0").unwrap(); + let host_port = listener.local_addr().unwrap().port(); + let mut stack = SlirpBackend::new().unwrap(); + + stack.process_guest_frame(&build_tcp_frame( + SLIRP_GATEWAY_IP, GUEST_EPHEMERAL_PORT, host_port, + 1000, 0, TcpControl::Syn, &[], + )).unwrap(); + + let mut saw_synack_with_scale = false; + for f in drain_n(&mut stack, 4) { + let eth = EthernetFrame::new_unchecked(f.as_slice()); + if eth.ethertype() != EthernetProtocol::Ipv4 { continue; } + let ip = Ipv4Packet::new_checked(eth.payload()).unwrap(); + if ip.next_header() != IpProtocol::Tcp || ip.dst_addr() != SLIRP_GUEST_IP { + continue; + } + let tcp = TcpPacket::new_checked(ip.payload()).unwrap(); + if tcp.syn() && tcp.ack() { + // smoltcp's TcpPacket exposes options via .options() — look for WS. + for option in TcpOption::parse_from(tcp.options()) { + if let TcpOption::WindowScale(scale) = option { + assert_eq!(scale, 7, "advertised scale must be OUR_WINDOW_SCALE"); + saw_synack_with_scale = true; + } + } + } + } + assert!(saw_synack_with_scale, "SYN-ACK must include WindowScale option"); +} +``` + +Run: should PASS post-Task-4 (we already advertise scale in SYN-ACK). + +**Commit:** `test(network): pin tcp_window_scale_negotiated_in_synack` + +--- + +### Task 9: Bench `tcp_bulk_throughput_constrained_window` (parametric) + +Mirrors `tcp_bulk_throughput_1mb` but parametrizes the guest's advertised window. Pre-Phase-6.3 throughput should be ~bandwidth-delay-product limited at small windows; post-6.3 it should scale. + +```rust +#[divan::bench(args = [4096, 16384, 65536])] +fn tcp_bulk_throughput_constrained_window(bencher: Bencher, guest_window: u32) { + // ... 
same harness shape as tcp_bulk_throughput_1mb but uses + // build_tcp_frame_with_window to negotiate `guest_window`. +} +``` + +Documents the win numerically. + +**Commit:** `bench(network): tcp_bulk_throughput_constrained_window parametric` + +--- + +### Task 10: Phase 6.3 validation gate + +```bash +cargo fmt --all -- --check +cargo clippy --workspace --all-targets --all-features -- -D warnings +cargo test --test network_baseline # 24/24 +cargo test --test network_baseline --features bench-helpers -- --test-threads=1 # 26/26 +cargo test --lib network # 23/23+ +cargo bench --bench network --features bench-helpers --no-run +cargo build --release +``` + +Wall-clock sanity: +```bash +voidbox-network-bench --iterations 3 --bulk-mb 10 +# g2h ≥ 5.5 Gbps (likely improves with scale negotiation), CRR ~10 ms, RR ~2 µs +``` + +`bench-compare.sh --baseline 47868f0 --skip-vm`: +- `tcp_bulk_throughput_constrained_window/4096` should NOT regress vs the older "ignore window" path (we now respect it; throughput at 4 KB window is bandwidth-limited but correctness is right). +- `tcp_bulk_throughput_1mb` should be ≤ baseline. +- All other benches no regression. + +--- + +## Out of scope + +- `TCP_FASTOPEN`, `SACK`, `TIMESTAMPS` — separate phases or never. +- Dynamic scale renegotiation post-handshake (impossible in TCP) — handled correctly today by sticking with the scale we set in SYN-ACK. + +## Reviewer pointers + +- Verify `host_recv_window` returns sane numbers under load. Add a bench-helpers method `synthetic_advertised_window(fd)` if it's hard to inspect. +- Verify `bytes_in_flight` tracking is still consistent — the `in_flight` calculation in Task 7 reuses the Phase 3 field; if anyone broke it, the pin from Task 6 would be flaky. +- Snapshot interaction: pre-6.3 entries default to `guest_window: 65535, guest_window_scale: 0` via `#[serde(default)]`. That's the same behavior as before this phase. Verify with snapshot_integration if env is available. 
+ +## Document history + +- 2026-05-05: initial plan written. From a6992c88d61a6661ee7286ec95c872a4257ebed1 Mon Sep 17 00:00:00 2001 From: diego Date: Tue, 5 May 2026 08:53:26 -0300 Subject: [PATCH 02/11] feat(slirp): TcpNatEntry tracks guest_window + guest_window_scale --- src/network/slirp.rs | 24 ++++++++++++++++++++++++ 1 file changed, 24 insertions(+) diff --git a/src/network/slirp.rs b/src/network/slirp.rs index 111c0916..9d1e0311 100644 --- a/src/network/slirp.rs +++ b/src/network/slirp.rs @@ -268,6 +268,18 @@ struct TcpNatEntry { #[allow(dead_code)] // Read by EPOLLOUT-driven completion in relay_pending_connects (Task 5). guest_isn: u32, + /// Guest's advertised receive window in bytes, scaled per + /// `guest_window_scale`. Updated on every incoming TCP frame's + /// `window_len`. Initial value 65535 matches an unscaled SYN. + // Written by Tasks 2–3; read by Task 7. + #[allow(dead_code)] + guest_window: u32, + /// Window-scale shift the guest negotiated in its SYN. Zero + /// means "guest does not support window scaling" (or we did not + /// see a window-scale option in the SYN). + // Written by Task 2; read by Task 3. + #[allow(dead_code)] + guest_window_scale: u8, } /// Key for the ICMP echo NAT table: (guest ICMP id, destination IP). @@ -873,6 +885,8 @@ impl SlirpBackend { // EPOLLOUT-driven completion path only reads guest_isn for // outbound (guest-initiated) SYNs. 
guest_isn: 0, + guest_window: 65535, + guest_window_scale: 0, }; let host_fd = entry.host_stream.as_raw_fd(); let flow_key = FlowKey::Tcp(key); @@ -1668,6 +1682,8 @@ impl SlirpBackend { last_state_change: Instant::now(), our_fin_sent: false, guest_isn: seq, + guest_window: 65535, + guest_window_scale: 0, }; self.flow_table.insert(flow_key, FlowEntry::Tcp(entry)); self.token_to_key.insert(token, flow_key); @@ -1720,6 +1736,8 @@ impl SlirpBackend { last_state_change: Instant::now(), our_fin_sent: false, guest_isn: seq, + guest_window: 65535, + guest_window_scale: 0, }; self.flow_table.insert(flow_key, FlowEntry::Tcp(entry)); self.token_to_key.insert(token, flow_key); @@ -3106,6 +3124,8 @@ impl SlirpBackend { last_state_change: Instant::now(), our_fin_sent: false, guest_isn: 0, + guest_window: 65535, + guest_window_scale: 0, }; self.flow_table.insert(flow_key, FlowEntry::Tcp(entry)); self.token_to_key.insert(token, flow_key); @@ -3225,6 +3245,8 @@ impl SlirpBackend { last_state_change: Instant::now(), our_fin_sent: true, guest_isn: 0, + guest_window: 65535, + guest_window_scale: 0, }; self.flow_table .insert(FlowKey::Tcp(key), FlowEntry::Tcp(entry)); @@ -3294,6 +3316,8 @@ impl SlirpBackend { last_state_change: Instant::now(), our_fin_sent: false, guest_isn: 1000, + guest_window: 65535, + guest_window_scale: 0, }; self.flow_table.insert(flow_key, FlowEntry::Tcp(entry)); self.token_to_key.insert(token, flow_key); From 974582406c7fe4a8e11bcabedd654cecaf6d52ee Mon Sep 17 00:00:00 2001 From: diego Date: Tue, 5 May 2026 08:57:11 -0300 Subject: [PATCH 03/11] feat(slirp): parse guest's window_scale on SYN, store on flow --- src/network/slirp.rs | 36 +++++++++++++++++++++++++++++++----- 1 file changed, 31 insertions(+), 5 deletions(-) diff --git a/src/network/slirp.rs b/src/network/slirp.rs index 9d1e0311..27f5e0b7 100644 --- a/src/network/slirp.rs +++ b/src/network/slirp.rs @@ -65,7 +65,7 @@ use smoltcp::time::Instant as SmolInstant; use smoltcp::wire::{ EthernetAddress, 
EthernetFrame, EthernetProtocol, EthernetRepr, HardwareAddress, Icmpv4Packet, Icmpv4Repr, IpAddress, IpCidr, IpProtocol, Ipv4Address, Ipv4Packet, Ipv4Repr, TcpControl, - TcpPacket, TcpRepr, TcpSeqNumber, UdpPacket, UdpRepr, + TcpOption, TcpPacket, TcpRepr, TcpSeqNumber, UdpPacket, UdpRepr, }; use tracing::{debug, trace, warn}; @@ -132,6 +132,22 @@ const PROTO_TAG_LISTEN: u64 = 0x0400_0000_0000_0000; /// any realistic process lifetime. static FLOW_TOKEN_COUNTER: AtomicU64 = AtomicU64::new(0); +/// Parse the `WindowScale` option from a raw TCP options buffer. +/// +/// Returns 0 when no `WindowScale` option is present (the guest is not +/// advertising window scaling, so shift = 0 means no scaling applied). +fn parse_tcp_window_scale(options: &[u8]) -> u8 { + let mut remaining = options; + loop { + match TcpOption::parse(remaining) { + Ok((_, TcpOption::EndOfList)) | Err(_) => break, + Ok((_, TcpOption::WindowScale(scale))) => return scale, + Ok((rest, _)) => remaining = rest, + } + } + 0 +} + /// Allocate a fresh, globally unique `FlowToken` tagged for the given protocol. /// /// The lower 56 bits are drawn from a relaxed monotonic counter shared across @@ -1548,6 +1564,16 @@ impl SlirpBackend { src_ip, src_port, dst_ip, dst_port ); + // Parse window scaling from the SYN's TCP options so it can be + // stored on the flow entry. Zero when the guest omits the option. + let syn_window_scale = parse_tcp_window_scale(tcp.options()); + let syn_window: u32 = u32::from(tcp.window_len()) << syn_window_scale; + trace!( + "SLIRP TCP SYN: guest window_scale={} initial_window={}", + syn_window_scale, + syn_window + ); + // Unified outbound translation: combines the gateway-loopback // rewrite + deny-list check in one pure-function call. 
Returns None if // the dst is denied; on Some, the SocketAddr already has the right @@ -1682,8 +1708,8 @@ impl SlirpBackend { last_state_change: Instant::now(), our_fin_sent: false, guest_isn: seq, - guest_window: 65535, - guest_window_scale: 0, + guest_window: syn_window, + guest_window_scale: syn_window_scale, }; self.flow_table.insert(flow_key, FlowEntry::Tcp(entry)); self.token_to_key.insert(token, flow_key); @@ -1736,8 +1762,8 @@ impl SlirpBackend { last_state_change: Instant::now(), our_fin_sent: false, guest_isn: seq, - guest_window: 65535, - guest_window_scale: 0, + guest_window: syn_window, + guest_window_scale: syn_window_scale, }; self.flow_table.insert(flow_key, FlowEntry::Tcp(entry)); self.token_to_key.insert(token, flow_key); From 27896734f8fce4a1af73f5a72219e57f0a4b9679 Mon Sep 17 00:00:00 2001 From: diego Date: Tue, 5 May 2026 08:57:59 -0300 Subject: [PATCH 04/11] feat(slirp): track guest's advertised window on every incoming frame --- src/network/slirp.rs | 9 +++++---- 1 file changed, 5 insertions(+), 4 deletions(-) diff --git a/src/network/slirp.rs b/src/network/slirp.rs index 27f5e0b7..7e01a072 100644 --- a/src/network/slirp.rs +++ b/src/network/slirp.rs @@ -287,14 +287,10 @@ struct TcpNatEntry { /// Guest's advertised receive window in bytes, scaled per /// `guest_window_scale`. Updated on every incoming TCP frame's /// `window_len`. Initial value 65535 matches an unscaled SYN. - // Written by Tasks 2–3; read by Task 7. - #[allow(dead_code)] guest_window: u32, /// Window-scale shift the guest negotiated in its SYN. Zero /// means "guest does not support window scaling" (or we did not /// see a window-scale option in the SYN). - // Written by Task 2; read by Task 3. - #[allow(dead_code)] guest_window_scale: u8, } @@ -1827,6 +1823,11 @@ impl SlirpBackend { entry.last_activity = Instant::now(); + // Track the most recent window advertisement from the guest. 
Runs for + // every frame (data, ACK, FIN, RST) so `guest_window` always reflects + // the current receive-buffer headroom on the guest side. + entry.guest_window = u32::from(tcp.window_len()) << entry.guest_window_scale; + // Inbound port-forward: guest's SYN-ACK completing the host-initiated // 3-way handshake. We synthesized a SYN to the guest (5.5b.2/5.5b.3); // the guest's kernel accepted it and replied with SYN+ACK. Send an ACK From 78d1554c9da5899f1a3c2a77ff622d4b6c776d90 Mon Sep 17 00:00:00 2001 From: diego Date: Tue, 5 May 2026 09:00:51 -0300 Subject: [PATCH 05/11] refactor(slirp): build_tcp_packet_static takes (window_len, window_scale) --- src/network/slirp.rs | 54 ++++++++++++++++++++++++++++++++++++++++---- 1 file changed, 50 insertions(+), 4 deletions(-) diff --git a/src/network/slirp.rs b/src/network/slirp.rs index 7e01a072..38c50741 100644 --- a/src/network/slirp.rs +++ b/src/network/slirp.rs @@ -90,7 +90,10 @@ pub const GATEWAY_MAC: [u8; 6] = [0x52, 0x54, 0x00, 0x12, 0x34, 0x01]; const MTU: usize = 1500; const MAX_QUEUE_SIZE: usize = 64; -const TCP_WINDOW: u16 = 65535; +/// Window-scale shift we advertise on SYN-ACK frames. Matches passt's +/// default. 7 means each unit in `window_len` represents 128 bytes, +/// extending the effective window from 64 KiB to 8 MiB. +const OUR_WINDOW_SCALE: u8 = 7; const UDP_IDLE_TIMEOUT: Duration = Duration::from_secs(60); /// Timeout for TCP entries stuck in the LastAck state (i.e. we sent a FIN /// but the guest's final ACK never arrived). 
Two TCP MSLs (2 × 30 s = 60 s) @@ -1591,6 +1594,8 @@ impl SlirpBackend { seq + 1, TcpControl::Rst, &[], + 65535, + None, ); self.inject_to_guest.push(rst); return Ok(()); @@ -1617,6 +1622,8 @@ impl SlirpBackend { seq + 1, TcpControl::Rst, &[], + 65535, + None, ); self.inject_to_guest.push(rst); return Ok(()); @@ -1637,6 +1644,8 @@ impl SlirpBackend { seq + 1, TcpControl::Rst, &[], + 65535, + None, ); self.inject_to_guest.push(rst); return Ok(()); @@ -1677,6 +1686,8 @@ impl SlirpBackend { seq + 1, TcpControl::Rst, &[], + 65535, + None, ); self.inject_to_guest.push(rst); return Ok(()); @@ -1729,6 +1740,8 @@ impl SlirpBackend { seq + 1, TcpControl::Syn, &[], + 65535, + Some(OUR_WINDOW_SCALE), ); self.inject_to_guest.push(syn_ack); debug!( @@ -1795,6 +1808,8 @@ impl SlirpBackend { seq + 1, TcpControl::Rst, &[], + 65535, + None, ); self.inject_to_guest.push(rst); } @@ -1848,6 +1863,8 @@ impl SlirpBackend { tcp.seq_number().0.wrapping_add(1) as u32, // ack — guest ISN + 1 TcpControl::None, &[], + 65535, + None, ); self.inject_to_guest.push(ack_frame); entry.our_seq = entry.our_seq.wrapping_add(1); @@ -1991,6 +2008,8 @@ impl SlirpBackend { entry.guest_ack, TcpControl::None, &[], + 65535, + None, ); self.inject_to_guest.push(ack_frame); trace!( @@ -2024,6 +2043,8 @@ impl SlirpBackend { entry.guest_ack, TcpControl::None, &[], + 65535, + None, ); self.inject_to_guest.push(ack_frame); @@ -2064,6 +2085,8 @@ impl SlirpBackend { entry.guest_ack, TcpControl::None, &[], + 65535, + None, ); self.inject_to_guest.push(ack_frame); if let Err(e) = entry.host_stream.shutdown(std::net::Shutdown::Write) { @@ -2190,6 +2213,8 @@ impl SlirpBackend { guest_isn.wrapping_add(1), TcpControl::Rst, &[], + 65535, + None, ); self.inject_to_guest.push(rst); if let Some(FlowEntry::Tcp(entry)) = self.flow_table.get_mut(&flow_key) { @@ -2226,6 +2251,8 @@ impl SlirpBackend { guest_isn.wrapping_add(1), TcpControl::Syn, &[], + 65535, + Some(OUR_WINDOW_SCALE), ); self.inject_to_guest.push(syn_ack); debug!( 
@@ -2400,6 +2427,8 @@ impl SlirpBackend { entry.guest_ack, TcpControl::None, chunk, + 65535, + None, ); frames_to_inject.push(frame); entry.our_seq = entry.our_seq.wrapping_add(chunk.len() as u32); @@ -2444,6 +2473,8 @@ impl SlirpBackend { entry.guest_ack, TcpControl::Fin, &[], + 65535, + None, )); entry.our_seq = entry.our_seq.wrapping_add(1); entry.our_fin_sent = true; @@ -2461,6 +2492,8 @@ impl SlirpBackend { entry.guest_ack, TcpControl::Fin, &[], + 65535, + None, )); } } // entry borrow ends here @@ -2493,6 +2526,8 @@ impl SlirpBackend { entry.guest_isn.wrapping_add(1), TcpControl::Rst, &[], + 65535, + None, ); frames_to_inject.push(rst); } @@ -2841,7 +2876,12 @@ impl NetworkBackend for SlirpBackend { } } -/// Build a TCP packet (free function to avoid borrow issues with &self methods) +/// Build a TCP packet (free function to avoid borrow issues with &self methods). +/// +/// `window_len` is the raw 16-bit window field; `window_scale` is included as a +/// TCP option only when `Some(_)` — callers pass `Some(OUR_WINDOW_SCALE)` on +/// SYN-ACK frames and `None` on all other frames (the scale was already +/// negotiated at handshake time and does not re-appear in later headers). 
#[allow(clippy::too_many_arguments)] fn build_tcp_packet_static( src_ip: Ipv4Address, @@ -2852,14 +2892,16 @@ fn build_tcp_packet_static( ack: u32, control: TcpControl, payload: &[u8], + window_len: u16, + window_scale: Option, ) -> Vec { let tcp_repr = TcpRepr { src_port, dst_port, seq_number: TcpSeqNumber(seq as i32), ack_number: Some(TcpSeqNumber(ack as i32)), - window_len: TCP_WINDOW, - window_scale: None, + window_len, + window_scale, control, max_seg_size: if control == TcpControl::Syn { Some(MTU as u16 - 40) @@ -2930,6 +2972,8 @@ pub fn synthesize_inbound_syn(high_port: u16, guest_port: u16, our_seq: u32) -> 0, TcpControl::Syn, &[], + 65535, + None, ) } @@ -2945,6 +2989,8 @@ fn synthesize_inbound_syn(high_port: u16, guest_port: u16, our_seq: u32) -> Vec< 0, TcpControl::Syn, &[], + 65535, + None, ) } From 4e6eb87485546188f22d8f5593e1e12c433deefb Mon Sep 17 00:00:00 2001 From: diego Date: Tue, 5 May 2026 09:04:18 -0300 Subject: [PATCH 06/11] feat(slirp): advertise host-kernel-derived window on outgoing frames --- src/network/slirp.rs | 58 ++++++++++++++++++++++++++++++++++++++------ 1 file changed, 51 insertions(+), 7 deletions(-) diff --git a/src/network/slirp.rs b/src/network/slirp.rs index 38c50741..92f9e5c6 100644 --- a/src/network/slirp.rs +++ b/src/network/slirp.rs @@ -1863,7 +1863,7 @@ impl SlirpBackend { tcp.seq_number().0.wrapping_add(1) as u32, // ack — guest ISN + 1 TcpControl::None, &[], - 65535, + host_recv_window(entry.host_stream.as_raw_fd()), None, ); self.inject_to_guest.push(ack_frame); @@ -2008,7 +2008,7 @@ impl SlirpBackend { entry.guest_ack, TcpControl::None, &[], - 65535, + host_recv_window(entry.host_stream.as_raw_fd()), None, ); self.inject_to_guest.push(ack_frame); @@ -2043,7 +2043,7 @@ impl SlirpBackend { entry.guest_ack, TcpControl::None, &[], - 65535, + host_recv_window(entry.host_stream.as_raw_fd()), None, ); self.inject_to_guest.push(ack_frame); @@ -2085,7 +2085,7 @@ impl SlirpBackend { entry.guest_ack, TcpControl::None, &[], - 
65535, + host_recv_window(entry.host_stream.as_raw_fd()), None, ); self.inject_to_guest.push(ack_frame); @@ -2417,6 +2417,7 @@ impl SlirpBackend { if peek_n > in_flight { let new_bytes = &peek_buf[in_flight..peek_n]; let mut sent_total: usize = 0; + let our_window = host_recv_window(entry.host_stream.as_raw_fd()); for chunk in new_bytes.chunks(MTU - 54) { let frame = build_tcp_packet_static( key.dst_ip, @@ -2427,7 +2428,7 @@ impl SlirpBackend { entry.guest_ack, TcpControl::None, chunk, - 65535, + our_window, None, ); frames_to_inject.push(frame); @@ -2473,7 +2474,7 @@ impl SlirpBackend { entry.guest_ack, TcpControl::Fin, &[], - 65535, + host_recv_window(entry.host_stream.as_raw_fd()), None, )); entry.our_seq = entry.our_seq.wrapping_add(1); @@ -2492,7 +2493,7 @@ impl SlirpBackend { entry.guest_ack, TcpControl::Fin, &[], - 65535, + host_recv_window(entry.host_stream.as_raw_fd()), None, )); } @@ -2876,6 +2877,49 @@ impl NetworkBackend for SlirpBackend { } } +/// Host kernel's current receive-buffer headroom for a TCP socket, scaled down +/// by `OUR_WINDOW_SCALE`, for advertising as our `window_len` on outgoing frames. +/// +/// A fresh TCP socket has `tcpi_rcv_space` pre-filled to ~32 KiB; under load it +/// grows to 4 MiB+ on Linux with auto-tuning enabled. Dividing by 128 (shift 7) +/// keeps the value within `u16::MAX` and matches the scale we advertised in the +/// SYN-ACK. +/// +/// Returns `32768` on `getsockopt` failure rather than `0` (which stalls the +/// connection) or `u16::MAX` (which over-commits buffer space). +#[cfg(target_os = "linux")] +fn host_recv_window(fd: std::os::fd::RawFd) -> u16 { + use std::mem::MaybeUninit; + let mut info: MaybeUninit = MaybeUninit::zeroed(); + let mut len = std::mem::size_of::() as libc::socklen_t; + // SAFETY: `getsockopt` writes into `info` when it returns 0; the pointer + // is valid and the length is exact. 
+ let rc = unsafe { + libc::getsockopt( + fd, + libc::IPPROTO_TCP, + libc::TCP_INFO, + info.as_mut_ptr().cast::(), + &mut len, + ) + }; + if rc != 0 { + return 32768; + } + // SAFETY: getsockopt returned 0, so `info` is fully initialised. + let info = unsafe { info.assume_init() }; + let scaled = info.tcpi_rcv_space >> OUR_WINDOW_SCALE; + scaled.min(u32::from(u16::MAX)) as u16 +} + +/// Non-Linux stub: always return a conservative fixed window. +/// The SLIRP relay only runs on Linux; this stub keeps cross-platform builds +/// compiling without `#[cfg]` gating at every call site. +#[cfg(not(target_os = "linux"))] +fn host_recv_window(_fd: std::os::fd::RawFd) -> u16 { + 32768 +} + /// Build a TCP packet (free function to avoid borrow issues with &self methods). /// /// `window_len` is the raw 16-bit window field; `window_scale` is included as a From 540c96bb6c5c469bab3007d61d12c7ba50323616 Mon Sep 17 00:00:00 2001 From: diego Date: Tue, 5 May 2026 09:05:52 -0300 Subject: [PATCH 07/11] test(network): pin tcp_advertised_window_tracks_guest_buffer (BROKEN_ON_PURPOSE) --- tests/network_baseline.rs | 152 ++++++++++++++++++++++++++++++++++++++ 1 file changed, 152 insertions(+) diff --git a/tests/network_baseline.rs b/tests/network_baseline.rs index 543cc13b..f5b71b73 100644 --- a/tests/network_baseline.rs +++ b/tests/network_baseline.rs @@ -102,6 +102,65 @@ fn build_tcp_frame( buf } +/// Like `build_tcp_frame` but exposes explicit `window_len` and `window_scale` +/// parameters so tests can exercise window-management behaviour. 
+#[allow(clippy::too_many_arguments)] +fn build_tcp_frame_with_window( + dst_ip: Ipv4Address, + src_port: u16, + dst_port: u16, + seq: u32, + ack: u32, + control: TcpControl, + payload: &[u8], + window_len: u16, + window_scale: Option, +) -> Vec { + let tcp_repr = TcpRepr { + src_port, + dst_port, + control, + seq_number: smoltcp::wire::TcpSeqNumber(seq as i32), + ack_number: if ack == 0 { + None + } else { + Some(smoltcp::wire::TcpSeqNumber(ack as i32)) + }, + window_len, + window_scale, + max_seg_size: None, + sack_permitted: false, + sack_ranges: [None, None, None], + payload, + }; + let ip_repr = Ipv4Repr { + src_addr: SLIRP_GUEST_IP, + dst_addr: dst_ip, + next_header: IpProtocol::Tcp, + payload_len: tcp_repr.buffer_len(), + hop_limit: 64, + }; + let eth_repr = EthernetRepr { + src_addr: EthernetAddress(GUEST_MAC), + dst_addr: EthernetAddress(GATEWAY_MAC), + ethertype: EthernetProtocol::Ipv4, + }; + let total = ETH_HDR_LEN + ip_repr.buffer_len() + tcp_repr.buffer_len(); + let mut buf = vec![0u8; total]; + let mut eth = EthernetFrame::new_unchecked(&mut buf[..]); + eth_repr.emit(&mut eth); + let mut ip = Ipv4Packet::new_unchecked(&mut buf[ETH_HDR_LEN..]); + ip_repr.emit(&mut ip, &Default::default()); + let mut tcp = TcpPacket::new_unchecked(&mut buf[ETH_HDR_LEN + ip_repr.buffer_len()..]); + tcp_repr.emit( + &mut tcp, + &IpAddress::Ipv4(SLIRP_GUEST_IP), + &IpAddress::Ipv4(dst_ip), + &Default::default(), + ); + buf +} + /// Builds a UDP-over-Ethernet datagram from guest. fn build_udp_frame(dst_ip: Ipv4Address, src_port: u16, dst_port: u16, payload: &[u8]) -> Vec { let udp_repr = UdpRepr { src_port, dst_port }; @@ -1798,3 +1857,96 @@ fn tcp_connect_async_eventual_rst_on_failure() { "guest must eventually receive RST when async connect to dropped-listener port fails" ); } + +/// BROKEN_ON_PURPOSE: `relay_tcp_nat_data` does not yet gate host→guest sends +/// on `entry.guest_window`. 
This test will FAIL until Task 7 gates the relay +/// on `guest_window`, at which point it flips to PASSING. +/// +/// The test establishes a flow with a small guest window (4096 bytes, no scale), +/// feeds 64 KiB from the host side, and asserts that injected payload before any +/// ACK does not exceed the guest's advertised window plus one MTU slop. +#[test] +fn tcp_advertised_window_tracks_guest_buffer() { + use std::io::Write; + use std::net::TcpListener; + use std::time::Instant; + + let listener = TcpListener::bind("127.0.0.1:0").unwrap(); + let host_port = listener.local_addr().unwrap().port(); + let server = std::thread::spawn(move || -> std::net::TcpStream { + let (sock, _) = listener.accept().unwrap(); + sock + }); + + let mut stack = SlirpBackend::new().unwrap(); + + let our_seq = 1000u32; + // Guest SYN with explicit small window (4096 bytes, no scale). + let syn = build_tcp_frame_with_window( + SLIRP_GATEWAY_IP, + GUEST_EPHEMERAL_PORT, + host_port, + our_seq, + 0, + TcpControl::Syn, + &[], + 4096, + None, + ); + stack.process_guest_frame(&syn).unwrap(); + + // Collect SYN-ACK from the stack. + let mut gateway_seq = 0u32; + let deadline = Instant::now() + std::time::Duration::from_secs(2); + 'outer: while Instant::now() < deadline { + for f in drain_n(&mut stack, 4) { + if let Some((s, _, ctrl, _)) = parse_tcp_to_guest(f.as_slice()) { + if matches!(ctrl, TcpControl::Syn) { + gateway_seq = s; + break 'outer; + } + } + } + std::thread::sleep(std::time::Duration::from_millis(10)); + } + + // Complete handshake with the same small window. + stack + .process_guest_frame(&build_tcp_frame_with_window( + SLIRP_GATEWAY_IP, + GUEST_EPHEMERAL_PORT, + host_port, + our_seq + 1, + gateway_seq + 1, + TcpControl::None, + &[], + 4096, + None, + )) + .unwrap(); + + // Wait for the server thread to accept and obtain the host stream. + let mut host_stream = server.join().unwrap(); + + // Push 64 KiB from the host side. 
+ let payload = vec![0xABu8; 64 * 1024]; + host_stream.write_all(&payload).unwrap(); + + // Drive drain_to_guest a few times. With proper window tracking, + // total bytes injected before any ACK should be <= guest_window + // (4096 plus one MTU-sized slop for partial segment). + let mut total_payload_injected: usize = 0; + for _ in 0..20 { + for f in drain_n(&mut stack, 1) { + if let Some((_, _, _, payload_len)) = parse_tcp_to_guest(f.as_slice()) { + total_payload_injected += payload_len; + } + } + std::thread::sleep(std::time::Duration::from_millis(10)); + } + + assert!( + total_payload_injected <= 4096 + 1500, + "injected {total_payload_injected} bytes; must respect guest_window=4096 (one MTU slop allowed)" + ); +} From 44055691bba27125d7a6455b1b647e3ff05de14a Mon Sep 17 00:00:00 2001 From: diego Date: Tue, 5 May 2026 09:06:55 -0300 Subject: [PATCH 08/11] =?UTF-8?q?feat(slirp):=20gate=20host=E2=86=92guest?= =?UTF-8?q?=20send=20on=20guest's=20advertised=20window?= MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit --- src/network/slirp.rs | 18 ++++++++++++++++++ 1 file changed, 18 insertions(+) diff --git a/src/network/slirp.rs b/src/network/slirp.rs index 92f9e5c6..2fee7fbf 100644 --- a/src/network/slirp.rs +++ b/src/network/slirp.rs @@ -2419,6 +2419,24 @@ impl SlirpBackend { let mut sent_total: usize = 0; let our_window = host_recv_window(entry.host_stream.as_raw_fd()); for chunk in new_bytes.chunks(MTU - 54) { + // Honour the guest's advertised receive window. + // `bytes_in_flight` tracks how many bytes the + // guest has not yet ACK'd; stop sending once we + // would exceed its buffer. 
+ let window_remaining = (entry.guest_window as usize) + .saturating_sub(entry.bytes_in_flight as usize); + if window_remaining == 0 { + trace!( + "SLIRP TCP: guest window exhausted on flow \ + guest_port={} (in_flight={}, window={})", + key.guest_src_port, + entry.bytes_in_flight, + entry.guest_window + ); + break; + } + let send_len = chunk.len().min(window_remaining); + let chunk = &chunk[..send_len]; let frame = build_tcp_packet_static( key.dst_ip, SLIRP_GUEST_IP, From 3da83d0cd238f02cd7e660497d3fe5df36d0a7cf Mon Sep 17 00:00:00 2001 From: diego Date: Tue, 5 May 2026 09:07:46 -0300 Subject: [PATCH 09/11] test(network): pin tcp_window_scale_negotiated_in_synack --- tests/network_baseline.rs | 75 ++++++++++++++++++++++++++++++++++++++- 1 file changed, 74 insertions(+), 1 deletion(-) diff --git a/tests/network_baseline.rs b/tests/network_baseline.rs index f5b71b73..7bfdfa57 100644 --- a/tests/network_baseline.rs +++ b/tests/network_baseline.rs @@ -25,7 +25,7 @@ use smoltcp::wire::{ ArpOperation, ArpPacket, ArpRepr, EthernetAddress, EthernetFrame, EthernetProtocol, EthernetRepr, Icmpv4Packet, Icmpv4Repr, IpAddress, IpProtocol, Ipv4Address, Ipv4Packet, - Ipv4Repr, TcpControl, TcpPacket, TcpRepr, UdpPacket, UdpRepr, + Ipv4Repr, TcpControl, TcpOption, TcpPacket, TcpRepr, UdpPacket, UdpRepr, }; use std::io::{Read, Write}; use std::net::{Ipv4Addr, SocketAddr, TcpListener, UdpSocket}; @@ -1858,6 +1858,79 @@ fn tcp_connect_async_eventual_rst_on_failure() { ); } +/// Asserts that the SYN-ACK the stack emits in response to a guest SYN +/// includes a `WindowScale` option set to `OUR_WINDOW_SCALE` (7). +/// +/// This pin validates Task 4's SYN-ACK advertisement and is expected to +/// PASS post-Task-4: `build_tcp_packet_static` now passes +/// `Some(OUR_WINDOW_SCALE)` on the SYN-ACK call site. 
+#[test] +fn tcp_window_scale_negotiated_in_synack() { + use std::time::Instant; + + let listener = std::net::TcpListener::bind("127.0.0.1:0").unwrap(); + let host_port = listener.local_addr().unwrap().port(); + let mut stack = SlirpBackend::new().unwrap(); + + stack + .process_guest_frame(&build_tcp_frame( + SLIRP_GATEWAY_IP, + GUEST_EPHEMERAL_PORT, + host_port, + 1000, + 0, + TcpControl::Syn, + &[], + )) + .unwrap(); + + let mut saw_synack_with_scale = false; + let deadline = Instant::now() + std::time::Duration::from_secs(2); + 'drain: while Instant::now() < deadline { + for f in drain_n(&mut stack, 4) { + let eth = match EthernetFrame::new_checked(f.as_slice()) { + Ok(e) => e, + Err(_) => continue, + }; + if eth.ethertype() != EthernetProtocol::Ipv4 { + continue; + } + let ip = match Ipv4Packet::new_checked(eth.payload()) { + Ok(p) => p, + Err(_) => continue, + }; + if ip.next_header() != IpProtocol::Tcp || ip.dst_addr() != SLIRP_GUEST_IP { + continue; + } + let tcp = match TcpPacket::new_checked(ip.payload()) { + Ok(p) => p, + Err(_) => continue, + }; + if !tcp.syn() || !tcp.ack() { + continue; + } + // Parse options to find WindowScale. + let mut remaining = tcp.options(); + loop { + match TcpOption::parse(remaining) { + Ok((_, TcpOption::EndOfList)) | Err(_) => break, + Ok((_, TcpOption::WindowScale(scale))) => { + assert_eq!(scale, 7, "advertised scale must be OUR_WINDOW_SCALE (7)"); + saw_synack_with_scale = true; + break 'drain; + } + Ok((rest, _)) => remaining = rest, + } + } + } + std::thread::sleep(std::time::Duration::from_millis(10)); + } + assert!( + saw_synack_with_scale, + "SYN-ACK must include WindowScale option with value 7" + ); +} + /// BROKEN_ON_PURPOSE: `relay_tcp_nat_data` does not yet gate host→guest sends /// on `entry.guest_window`. This test will FAIL until Task 7 gates the relay /// on `guest_window`, at which point it flips to PASSING. 
From 5c30b1c58686709d37fd5195624867532d792658 Mon Sep 17 00:00:00 2001 From: diego Date: Tue, 5 May 2026 09:12:24 -0300 Subject: [PATCH 10/11] bench(network): tcp_bulk_throughput_constrained_window parametric Adds tcp_bulk_throughput_constrained_window bench that exercises the Task 7 window-gating path under three guest-window sizes (4096, 16384, 65536 bytes). Mirrors tcp_bulk_throughput_1mb with a parametric window so regressions in window-constrained relay show up numerically. --- benches/network.rs | 196 +++++++++++++++++++++++++++++++++++++++++++++ 1 file changed, 196 insertions(+) diff --git a/benches/network.rs b/benches/network.rs index df2bfdef..b1c95f06 100644 --- a/benches/network.rs +++ b/benches/network.rs @@ -459,6 +459,138 @@ mod linux_benches { }); } + /// Parametric bulk-throughput bench: pushes host→guest data with the guest + /// advertising a fixed receive window of `guest_window` bytes. + /// + /// Documents the effect of Task 7's window gating. At small windows (4096 B) + /// the relay is constrained to one window's worth of unACK'd data before the + /// simulated guest ACKs back; at 65536 B the relay can pipeline far more. + /// Divan reports per-arm throughput so regressions are visible numerically. 
+ #[cfg(feature = "bench-helpers")] + #[divan::bench(args = [4096u32, 16384, 65536], sample_count = 10)] + fn tcp_bulk_throughput_constrained_window(bencher: Bencher, guest_window: u32) { + use smoltcp::wire::TcpControl; + use std::io::Write; + use std::sync::atomic::{AtomicUsize, Ordering}; + use std::sync::Arc; + + const TOTAL_BYTES: usize = 512 * 1024; + const DEADLINE_SECS: u64 = 5; + const GUEST_SRC_PORT: u16 = 49210; + const INITIAL_GUEST_SEQ: u32 = 2000; + + bencher + .counter(BytesCount::new(TOTAL_BYTES as u64)) + .bench_local(|| { + let listener = std::net::TcpListener::bind("127.0.0.1:0").unwrap(); + let host_port = listener.local_addr().unwrap().port(); + + let bytes_injected = Arc::new(AtomicUsize::new(0)); + let bytes_injected_thr = Arc::clone(&bytes_injected); + + // Server thread: accept connection and write TOTAL_BYTES. + let server = std::thread::spawn(move || { + let (mut sock, _) = listener.accept().unwrap(); + let payload = vec![0xABu8; TOTAL_BYTES]; + let _ = sock.write_all(&payload); + }); + + let mut stack = SlirpBackend::new().unwrap(); + + // Guest SYN with the parametric window (no scale option so + // the relay sees a raw 16-bit window exactly as supplied). + let win16 = guest_window.min(65535) as u16; + let syn = build_tcp_data_frame_with_window( + SLIRP_GATEWAY_IP, + GUEST_SRC_PORT, + host_port, + INITIAL_GUEST_SEQ, + 0, + TcpControl::Syn, + &[], + win16, + None, + ); + stack.process_guest_frame(&syn).unwrap(); + + // Collect SYN-ACK. 
+ let mut drain_frames = Vec::new(); + let gateway_seq = { + let deadline = + std::time::Instant::now() + std::time::Duration::from_secs(DEADLINE_SECS); + loop { + drain_frames.clear(); + stack.drain_to_guest(&mut drain_frames); + if let Some((s, _, _, _)) = drain_frames + .iter() + .find_map(|f| parse_tcp_to_guest_frame(f)) + { + break s; + } + if std::time::Instant::now() > deadline { + panic!("no SYN-ACK"); + } + std::thread::sleep(std::time::Duration::from_millis(5)); + } + }; + + // Complete handshake. + let ack = build_tcp_data_frame_with_window( + SLIRP_GATEWAY_IP, + GUEST_SRC_PORT, + host_port, + INITIAL_GUEST_SEQ + 1, + gateway_seq + 1, + TcpControl::None, + &[], + win16, + None, + ); + stack.process_guest_frame(&ack).unwrap(); + + // Drive drain_to_guest until TOTAL_BYTES worth of payload has + // been injected. Simulate guest ACKs after each drain so the + // relay can keep sending (mimics the real guest draining and + // re-advertising its window). + let guest_seq = INITIAL_GUEST_SEQ + 1; + let deadline = + std::time::Instant::now() + std::time::Duration::from_secs(DEADLINE_SECS); + + while bytes_injected.load(Ordering::Relaxed) < TOTAL_BYTES * 95 / 100 + && std::time::Instant::now() < deadline + { + drain_frames.clear(); + stack.drain_to_guest(&mut drain_frames); + for frame in &drain_frames { + if let Some((data_seq, _, _, plen)) = parse_tcp_to_guest_frame(frame) { + if plen > 0 { + let new_ack = data_seq.wrapping_add(plen as u32); + bytes_injected_thr.fetch_add(plen, Ordering::Relaxed); + // Simulate guest ACK with re-advertised window. 
+ let guest_ack_frame = build_tcp_data_frame_with_window( + SLIRP_GATEWAY_IP, + GUEST_SRC_PORT, + host_port, + guest_seq, + new_ack, + TcpControl::None, + &[], + win16, + None, + ); + let _ = stack.process_guest_frame(&guest_ack_frame); + } + } + } + if drain_frames.is_empty() { + std::thread::sleep(std::time::Duration::from_millis(1)); + } + } + let _ = server.join(); + divan::black_box(bytes_injected.load(Ordering::Relaxed)); + }); + } + /// Builds a minimal IPv4-over-Ethernet TCP segment from guest to gateway. /// /// Returns the full Ethernet frame bytes. Mirrors the `build_tcp_frame` @@ -521,6 +653,70 @@ mod linux_benches { buf } + /// Like `build_tcp_data_frame` but accepts explicit `window_len` and + /// `window_scale` parameters. Used by the constrained-window bench to + /// simulate a guest with a small receive buffer. + #[cfg(feature = "bench-helpers")] + #[allow(clippy::too_many_arguments)] + fn build_tcp_data_frame_with_window( + dst_ip: smoltcp::wire::Ipv4Address, + src_port: u16, + dst_port: u16, + seq: u32, + ack: u32, + control: TcpControl, + payload: &[u8], + window_len: u16, + window_scale: Option, + ) -> Vec { + use smoltcp::wire::{IpAddress, TcpSeqNumber}; + + let tcp_repr = TcpRepr { + src_port, + dst_port, + control, + seq_number: TcpSeqNumber(seq as i32), + ack_number: if ack == 0 { + None + } else { + Some(TcpSeqNumber(ack as i32)) + }, + window_len, + window_scale, + max_seg_size: None, + sack_permitted: false, + sack_ranges: [None, None, None], + payload, + }; + let ip_repr = Ipv4Repr { + src_addr: SLIRP_GUEST_IP, + dst_addr: dst_ip, + next_header: IpProtocol::Tcp, + payload_len: tcp_repr.buffer_len(), + hop_limit: 64, + }; + let eth_repr = EthernetRepr { + src_addr: EthernetAddress(GUEST_MAC), + dst_addr: EthernetAddress(GATEWAY_MAC), + ethertype: EthernetProtocol::Ipv4, + }; + let eth_hdr_len = 14usize; + let total = eth_hdr_len + ip_repr.buffer_len() + tcp_repr.buffer_len(); + let mut buf = vec![0u8; total]; + let mut eth = 
EthernetFrame::new_unchecked(&mut buf[..]); + eth_repr.emit(&mut eth); + let mut ip = Ipv4Packet::new_unchecked(&mut buf[eth_hdr_len..]); + ip_repr.emit(&mut ip, &Default::default()); + let mut tcp = TcpPacket::new_unchecked(&mut buf[eth_hdr_len + ip_repr.buffer_len()..]); + tcp_repr.emit( + &mut tcp, + &IpAddress::Ipv4(SLIRP_GUEST_IP), + &IpAddress::Ipv4(dst_ip), + &Default::default(), + ); + buf + } + /// Parses one frame emitted by the stack as a TCP segment directed to the guest. /// /// Returns `(seq, ack, control, payload_len)` on success, `None` otherwise. From 1b9ba720113a05cc13810ebbadbdcaa56e949963 Mon Sep 17 00:00:00 2001 From: diego Date: Wed, 6 May 2026 10:15:39 -0300 Subject: [PATCH 11/11] perf(slirp): cache host_recv_window per-flow with 5ms TTL MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit Profiling tcp_bulk_throughput_1mb showed __getsockopt at 5.7% flat CPU — Phase 6.3's host_recv_window was issuing one getsockopt(TCP_INFO) per outgoing TCP frame, costing ~10k syscalls/s at line rate. Cache the result on TcpNatEntry and refresh only every RECV_WINDOW_TTL (5 ms). At line rate this collapses to ~200 syscalls/s — a ~50x reduction — while the advertised window stays within 5 ms of reality, which is well below any realistic RTT. cached_recv_window is initialized at flow construction with one host_recv_window call so the first emitted frame doesn't pay the syscall cost on the data path either. --- src/network/slirp.rs | 79 ++++++++++++++++++++++++++++++++++++++++---- 1 file changed, 72 insertions(+), 7 deletions(-) diff --git a/src/network/slirp.rs b/src/network/slirp.rs index 2fee7fbf..7a28078e 100644 --- a/src/network/slirp.rs +++ b/src/network/slirp.rs @@ -295,6 +295,17 @@ struct TcpNatEntry { /// means "guest does not support window scaling" (or we did not /// see a window-scale option in the SYN). guest_window_scale: u8, + /// Cached value of `host_recv_window(host_stream)`. 
Refreshed + /// every `RECV_WINDOW_TTL` instead of on every outgoing frame + /// so the bulk-throughput data path doesn't issue a + /// `getsockopt(TCP_INFO)` per packet. The window we advertise + /// stays within `RECV_WINDOW_TTL` of reality, which is well below + /// any realistic RTT. + cached_recv_window: u16, + /// Monotonic instant of the last `cached_recv_window` refresh. + /// Set to `Instant::now()` at flow construction, alongside an eager + /// `host_recv_window` call, so the first frame reads a warm cache. + cached_recv_window_at: Instant, } /// Key for the ICMP echo NAT table: (guest ICMP id, destination IP). @@ -886,6 +897,7 @@ impl SlirpBackend { dst_port: high_port, }; let token = next_flow_token(PROTO_TAG_TCP); + let cached_recv_window = host_recv_window(host_stream.as_raw_fd()); let entry = TcpNatEntry { host_stream, state: TcpNatState::SynSent, @@ -902,6 +914,8 @@ impl SlirpBackend { guest_isn: 0, guest_window: 65535, guest_window_scale: 0, + cached_recv_window, + cached_recv_window_at: Instant::now(), }; let host_fd = entry.host_stream.as_raw_fd(); let flow_key = FlowKey::Tcp(key); @@ -1704,6 +1718,7 @@ impl SlirpBackend { let our_seq: u32 = rand_seq(); let token = next_flow_token(PROTO_TAG_TCP); let flow_key = FlowKey::Tcp(key); + let cached_recv_window = host_recv_window(host_fd); let entry = TcpNatEntry { host_stream: stream, state: TcpNatState::SynReceived, @@ -1717,6 +1732,8 @@ impl SlirpBackend { guest_isn: seq, guest_window: syn_window, guest_window_scale: syn_window_scale, + cached_recv_window, + cached_recv_window_at: Instant::now(), }; self.flow_table.insert(flow_key, FlowEntry::Tcp(entry)); self.token_to_key.insert(token, flow_key); @@ -1760,6 +1777,7 @@ impl SlirpBackend { let our_seq: u32 = rand_seq(); let token = next_flow_token(PROTO_TAG_TCP); let flow_key = FlowKey::Tcp(key); + let cached_recv_window = host_recv_window(host_fd); let entry = TcpNatEntry { host_stream: stream, state: TcpNatState::Connecting, @@ -1773,6 +1791,8 @@ impl SlirpBackend {
guest_isn: seq, guest_window: syn_window, guest_window_scale: syn_window_scale, + cached_recv_window, + cached_recv_window_at: Instant::now(), }; self.flow_table.insert(flow_key, FlowEntry::Tcp(entry)); self.token_to_key.insert(token, flow_key); @@ -1863,7 +1883,7 @@ impl SlirpBackend { tcp.seq_number().0.wrapping_add(1) as u32, // ack — guest ISN + 1 TcpControl::None, &[], - host_recv_window(entry.host_stream.as_raw_fd()), + cached_host_recv_window(entry), None, ); self.inject_to_guest.push(ack_frame); @@ -2008,7 +2028,7 @@ impl SlirpBackend { entry.guest_ack, TcpControl::None, &[], - host_recv_window(entry.host_stream.as_raw_fd()), + cached_host_recv_window(entry), None, ); self.inject_to_guest.push(ack_frame); @@ -2043,7 +2063,7 @@ impl SlirpBackend { entry.guest_ack, TcpControl::None, &[], - host_recv_window(entry.host_stream.as_raw_fd()), + cached_host_recv_window(entry), None, ); self.inject_to_guest.push(ack_frame); @@ -2085,7 +2105,7 @@ impl SlirpBackend { entry.guest_ack, TcpControl::None, &[], - host_recv_window(entry.host_stream.as_raw_fd()), + cached_host_recv_window(entry), None, ); self.inject_to_guest.push(ack_frame); @@ -2417,7 +2437,7 @@ impl SlirpBackend { if peek_n > in_flight { let new_bytes = &peek_buf[in_flight..peek_n]; let mut sent_total: usize = 0; - let our_window = host_recv_window(entry.host_stream.as_raw_fd()); + let our_window = cached_host_recv_window(entry); for chunk in new_bytes.chunks(MTU - 54) { // Honour the guest's advertised receive window. 
// `bytes_in_flight` tracks how many bytes the @@ -2492,7 +2512,7 @@ impl SlirpBackend { entry.guest_ack, TcpControl::Fin, &[], - host_recv_window(entry.host_stream.as_raw_fd()), + cached_host_recv_window(entry), None, )); entry.our_seq = entry.our_seq.wrapping_add(1); @@ -2511,7 +2531,7 @@ impl SlirpBackend { entry.guest_ack, TcpControl::Fin, &[], - host_recv_window(entry.host_stream.as_raw_fd()), + cached_host_recv_window(entry), None, )); } @@ -2895,6 +2915,39 @@ impl NetworkBackend for SlirpBackend { } } +/// Refresh interval for the per-flow `cached_recv_window`. Bounding the +/// freshness of the advertised window to a few milliseconds keeps it well +/// below any realistic RTT, while collapsing what would otherwise be one +/// `getsockopt(TCP_INFO)` per outgoing frame into one per `RECV_WINDOW_TTL`. +const RECV_WINDOW_TTL: Duration = Duration::from_millis(5); + +/// Per-flow cache wrapper around [`host_recv_window`]. +/// +/// Reads the cached value from `entry` and refreshes it via a real +/// `getsockopt(TCP_INFO)` only when it is older than [`RECV_WINDOW_TTL`]. +/// At line-rate this drops the syscall from "every outgoing frame" to +/// "every few milliseconds", which profiling identified as the dominant +/// per-frame cost in Phase 6.3. +#[cfg(target_os = "linux")] +fn cached_host_recv_window(entry: &mut TcpNatEntry) -> u16 { + if entry.cached_recv_window_at.elapsed() >= RECV_WINDOW_TTL { + entry.cached_recv_window = host_recv_window(entry.host_stream.as_raw_fd()); + entry.cached_recv_window_at = Instant::now(); + } + entry.cached_recv_window +} + +/// Non-Linux stub: same shape as the Linux version, but `host_recv_window` +/// is itself a constant on non-Linux so caching is moot. 
+#[cfg(not(target_os = "linux"))] +fn cached_host_recv_window(entry: &mut TcpNatEntry) -> u16 { + if entry.cached_recv_window_at.elapsed() >= RECV_WINDOW_TTL { + entry.cached_recv_window = host_recv_window(entry.host_stream.as_raw_fd()); + entry.cached_recv_window_at = Instant::now(); + } + entry.cached_recv_window +} + /// Host kernel's current receive-buffer headroom for a TCP socket, scaled down /// by `OUR_WINDOW_SCALE`, for advertising as our `window_len` on outgoing frames. /// @@ -2905,6 +2958,9 @@ impl NetworkBackend for SlirpBackend { /// /// Returns `32768` on `getsockopt` failure rather than `0` (which stalls the /// connection) or `u16::MAX` (which over-commits buffer space). +/// +/// Hot-path callers should use [`cached_host_recv_window`] instead — this +/// function is the uncached primitive used by the cache itself. #[cfg(target_os = "linux")] fn host_recv_window(fd: std::os::fd::RawFd) -> u16 { use std::mem::MaybeUninit; @@ -3248,6 +3304,7 @@ impl SlirpBackend { let host_fd = host_stream.as_raw_fd(); let token = next_flow_token(PROTO_TAG_TCP); let flow_key = FlowKey::Tcp(key); + let cached_recv_window = host_recv_window(host_fd); let entry = TcpNatEntry { host_stream, state: TcpNatState::SynSent, @@ -3261,6 +3318,8 @@ impl SlirpBackend { guest_isn: 0, guest_window: 65535, guest_window_scale: 0, + cached_recv_window, + cached_recv_window_at: Instant::now(), }; self.flow_table.insert(flow_key, FlowEntry::Tcp(entry)); self.token_to_key.insert(token, flow_key); @@ -3369,6 +3428,7 @@ impl SlirpBackend { dst_port: high_port, }; let token = next_flow_token(PROTO_TAG_TCP); + let cached_recv_window = host_recv_window(host_stream.as_raw_fd()); let entry = TcpNatEntry { host_stream, state: TcpNatState::LastAck, @@ -3382,6 +3442,8 @@ impl SlirpBackend { guest_isn: 0, guest_window: 65535, guest_window_scale: 0, + cached_recv_window, + cached_recv_window_at: Instant::now(), }; self.flow_table .insert(FlowKey::Tcp(key), FlowEntry::Tcp(entry)); @@ -3440,6 +3502,7 
@@ impl SlirpBackend { let host_fd = stream.as_raw_fd(); let token = next_flow_token(PROTO_TAG_TCP); let flow_key = FlowKey::Tcp(key); + let cached_recv_window = host_recv_window(host_fd); let entry = TcpNatEntry { host_stream: stream, state: TcpNatState::Connecting, @@ -3453,6 +3516,8 @@ impl SlirpBackend { guest_isn: 1000, guest_window: 65535, guest_window_scale: 0, + cached_recv_window, + cached_recv_window_at: Instant::now(), }; self.flow_table.insert(flow_key, FlowEntry::Tcp(entry)); self.token_to_key.insert(token, flow_key);