From 9e80773edb6efb80a29bfd9f1eb2752d0a5d4690 Mon Sep 17 00:00:00 2001 From: diego Date: Mon, 4 May 2026 12:49:38 -0300 Subject: [PATCH 1/9] =?UTF-8?q?docs:=20Phase=206.1=20detailed=20TDD=20plan?= =?UTF-8?q?=20=E2=80=94=20TCP=20half-close?= MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit 9 bite-sized tasks covering the half-close state machine: - Established → FinWait1 (guest closed first) → LastAck on host EOF - Established → CloseWait (host closed first) → LastAck on guest FIN - LastAck → Closed on guest's final ACK or LAST_ACK_TIMEOUT (60s) Three new pins: tcp_half_close_guest_writes_first (BROKEN_ON_PURPOSE flips at Task 3), tcp_half_close_host_writes_first (passes after Task 3), tcp_last_ack_timeout_reaps_stale_entry (bench-helpers gated, Task 7). Severity: HIGH — current code drops host's pending response on guest shutdown(SHUT_WR), causing silent data loss for HTTP, SMTP, and any protocol with orderly half-close. --- .../2026-04-30-smoltcp-passt-port-phase6.1.md | 525 ++++++++++++++++++ 1 file changed, 525 insertions(+) create mode 100644 docs/superpowers/plans/2026-04-30-smoltcp-passt-port-phase6.1.md diff --git a/docs/superpowers/plans/2026-04-30-smoltcp-passt-port-phase6.1.md b/docs/superpowers/plans/2026-04-30-smoltcp-passt-port-phase6.1.md new file mode 100644 index 00000000..ea74d7fc --- /dev/null +++ b/docs/superpowers/plans/2026-04-30-smoltcp-passt-port-phase6.1.md @@ -0,0 +1,525 @@ +# Phase 6.1: TCP Half-Close Implementation Plan + +> **For agentic workers:** REQUIRED SUB-SKILL: Use superpowers:subagent-driven-development (recommended) or superpowers:executing-plans to implement this plan task-by-task. + +**Goal:** Wire `TcpNatState`'s currently-unused `FinWait1`, `CloseWait`, `LastAck` variants into the SLIRP relay so guest `shutdown(SHUT_WR)` mid-write doesn't lose tail data. 
+ +**Severity:** High — silent data loss on every protocol that uses orderly half-close (HTTP request bodies, SMTP DATA, anything with `shutdown(SHUT_WR)`). + +**Architecture:** Replace the current "guest FIN → immediate close" short-circuit with a five-state machine driven by guest FIN events and host-side EOF on `host_stream`. Add a `LAST_ACK_TIMEOUT` of 60 s (TCP MSL×2) so missing final ACKs don't leak entries. + +**Tech stack:** Existing `TcpNatEntry` + `TcpNatState` types in `src/network/slirp.rs`. No new dependencies. + +--- + +## Background + +`TcpNatState` (`src/network/slirp.rs:131-144`) declares `FinWait1`, `FinWait2`, `CloseWait`, `LastAck` but the implementation only ever uses `Syn*`, `Established`, `Closed`. The enum carries `#[allow(dead_code)]` (line 130) to mute warnings. + +Guest FIN handler (`src/network/slirp.rs:1657-1676`): + +```rust +if tcp.fin() { + entry.guest_ack = seq.wrapping_add(1); + let fin_ack_frame = build_tcp_packet_static(..., TcpControl::Fin, &[]); + self.inject_to_guest.push(fin_ack_frame); + entry.our_seq = entry.our_seq.wrapping_add(1); + entry.state = TcpNatState::Closed; // ← LOSES IN-FLIGHT HOST DATA + self.pending_close.push(flow_key); // ← FORCES IMMEDIATE REAP + return Ok(()); +} +``` + +This pushes the host-side `TcpStream` into `pending_close`, which `relay_tcp_nat_data` reaps the same tick. The host's pending write data never makes it back to the guest. 
+ +## Target state machine + +``` + guest FIN (we ACK + shutdown(Write)) + Established ──────────────────────────────────────────► FinWait1 + │ │ + │ host EOF (we send FIN) │ host EOF (we send FIN) + ▼ ▼ + CloseWait LastAck + │ guest FIN (we ACK) │ guest's final ACK + ▼ │ — or — + LastAck ◄──────────────────────────────────────────────────── │ LAST_ACK_TIMEOUT (60 s) + │ │ + │ guest's final ACK / LAST_ACK_TIMEOUT ▼ + └─────────────────────────────────► Closed ◄──────────────┘ + │ + ▼ unregister from epoll + flow_table + .remove() +``` + +We collapse `FinWait2` (in real TCP, "we sent FIN, peer ACKed our FIN, peer hasn't sent its own FIN yet"). Since we don't observe per-segment ACKs from the kernel — only data + EOF — we treat `FinWait1` as continuing until host EOF arrives, then jump straight to `LastAck`. + +## Invariants (carried) + +1. All-Rust path. No new crate deps. +2. Full observability — every state transition logs at `trace!` or `debug!`. +3. Cross-platform discipline (Linux-only SLIRP unchanged). +4. No regression in Phase 0–5 + 5.5b + 6.4 baselines. `bench-compare.sh --baseline 47868f0 --skip-vm` enforced. +5. Snapshot/restore correctness — `snapshot_integration` continues to pass. New states must round-trip OR cleanly degrade to `Closed` within `LAST_ACK_TIMEOUT`. + +--- + +## File impact + +| File | Action | +|---|---| +| `src/network/slirp.rs` | Modify `TcpNatEntry` (add `last_state_change: Instant`), rewrite FIN handler, add CloseWait/LastAck transitions in `relay_tcp_nat_data`, add `LAST_ACK_TIMEOUT` reaping. Drop `#[allow(dead_code)]` on `TcpNatState`. | +| `tests/network_baseline.rs` | Three new pins. | +| Snapshot serde | Verify the new state transitions round-trip; if not, document the cleanup-on-restore behavior. | + +No public API change. + +--- + +## Tasks + +### Task 1: Failing pin — `tcp_half_close_guest_writes_first` + +Pure synthetic harness. Drives `SlirpBackend` directly without a VM. 
+ +```rust +#[test] +fn tcp_half_close_guest_writes_first() { + use std::io::{Read, Write}; + use std::net::TcpListener; + + let listener = TcpListener::bind("127.0.0.1:0").unwrap(); + let host_port = listener.local_addr().unwrap().port(); + + let server = std::thread::spawn(move || -> Vec { + let (mut sock, _) = listener.accept().unwrap(); + // Read until EOF (guest will send data + FIN). + let mut request = Vec::new(); + let _ = sock.read_to_end(&mut request); + // After guest's shutdown(WRITE), we still want to send response. + sock.write_all(b"HTTP/1.1 200 OK\r\n\r\nBODY").unwrap(); + // Then close. + drop(sock); + request + }); + + let mut stack = SlirpBackend::new().unwrap(); + + // Guest 3-way handshake. + let our_seq = 1000u32; + stack.process_guest_frame(&build_tcp_frame( + SLIRP_GATEWAY_IP, GUEST_EPHEMERAL_PORT, host_port, our_seq, 0, + TcpControl::Syn, &[], + )).unwrap(); + let mut gateway_seq = 0u32; + for f in drain_n(&mut stack, 4) { + if let Some((s, _, ctrl, _)) = parse_tcp_to_guest(&f) { + if matches!(ctrl, TcpControl::Syn) { gateway_seq = s; break; } + } + } + stack.process_guest_frame(&build_tcp_frame( + SLIRP_GATEWAY_IP, GUEST_EPHEMERAL_PORT, host_port, + our_seq + 1, gateway_seq + 1, TcpControl::None, &[], + )).unwrap(); + + // Guest sends "HELLO" data + FIN. + let request = b"HELLO"; + stack.process_guest_frame(&build_tcp_frame( + SLIRP_GATEWAY_IP, GUEST_EPHEMERAL_PORT, host_port, + our_seq + 1, gateway_seq + 1, TcpControl::Psh, request, + )).unwrap(); + stack.process_guest_frame(&build_tcp_frame( + SLIRP_GATEWAY_IP, GUEST_EPHEMERAL_PORT, host_port, + our_seq + 1 + request.len() as u32, gateway_seq + 1, + TcpControl::Fin, &[], + )).unwrap(); + + // Drive drain_to_guest until we see host's response data AND its FIN. 
+ let deadline = std::time::Instant::now() + std::time::Duration::from_secs(5); + let mut response_bytes: Vec = Vec::new(); + let mut saw_host_fin = false; + while std::time::Instant::now() < deadline { + for f in drain_n(&mut stack, 1) { + if let Some((_, _, ctrl, payload_len)) = parse_tcp_to_guest(&f) { + // Extract payload bytes. + let eth = EthernetFrame::new_unchecked(f.as_slice()); + let ip = Ipv4Packet::new_unchecked(eth.payload()); + let tcp = TcpPacket::new_unchecked(ip.payload()); + if payload_len > 0 { + response_bytes.extend_from_slice(tcp.payload()); + } + if matches!(ctrl, TcpControl::Fin) { + saw_host_fin = true; + } + } + } + if !response_bytes.is_empty() && saw_host_fin { + break; + } + std::thread::sleep(std::time::Duration::from_millis(10)); + } + + let _ = server.join(); + + assert_eq!(&response_bytes[..], b"HTTP/1.1 200 OK\r\n\r\nBODY", + "guest must receive ALL host response data after sending FIN"); + assert!(saw_host_fin, "guest must receive host's FIN"); +} +``` + +Run: `cargo test --test network_baseline tcp_half_close_guest_writes_first`. Expected: **FAIL** — current code marks state=Closed on guest FIN; host's response data is dropped. + +**Commit:** `test(network): pin tcp_half_close_guest_writes_first (BROKEN_ON_PURPOSE)` + +--- + +### Task 2: Wire `FinWait1` transition + `shutdown(Write)` on guest FIN + +In `src/network/slirp.rs::handle_tcp_frame`, replace the FIN handler: + +```rust +// FIN from guest +if tcp.fin() { + debug!("SLIRP TCP: FIN from guest for {}:{}", dst_ip, dst_port); + entry.guest_ack = seq.wrapping_add(1); + + // ACK the guest's FIN — but don't send our own FIN yet. Host + // application may have data still to send. We transition to + // FinWait1 and shut down the host socket's write side so the + // host knows no more data is coming from the guest. 
+ let ack_frame = build_tcp_packet_static( + dst_ip, SLIRP_GUEST_IP, dst_port, src_port, + entry.our_seq, entry.guest_ack, + TcpControl::None, &[], + ); + self.inject_to_guest.push(ack_frame); + + if let Err(e) = entry.host_stream.shutdown(std::net::Shutdown::Write) { + warn!("SLIRP TCP: shutdown(Write) failed on guest FIN, falling back \ + to immediate close: {}", e); + entry.state = TcpNatState::Closed; + self.pending_close.push(flow_key); + return Ok(()); + } + + entry.state = TcpNatState::FinWait1; + entry.last_state_change = Instant::now(); + trace!("SLIRP TCP: state Established → FinWait1 for {}:{}", dst_ip, dst_port); + return Ok(()); +} +``` + +Add `last_state_change: Instant` field to `TcpNatEntry` (used for `LAST_ACK_TIMEOUT` reaping): + +```rust +struct TcpNatEntry { + // ... existing fields ... + /// Wall clock when the entry's state last changed. Used by + /// LAST_ACK_TIMEOUT reaping in relay_tcp_nat_data so a missing + /// final ACK doesn't leak the entry forever. + last_state_change: Instant, +} +``` + +Initialize at every existing `TcpNatEntry { ... }` literal site (search for `TcpNatEntry {` in slirp.rs). Set to `Instant::now()` everywhere. + +Run: `cargo test --test network_baseline tcp_half_close_guest_writes_first`. Expected: **still FAIL** — host's FIN doesn't reach the guest yet (relay loop doesn't know about FinWait1). + +**Commit:** `feat(slirp): FinWait1 transition on guest FIN, host write-side shutdown` + +--- + +### Task 3: Wire `FinWait1 → LastAck` on host EOF + send our FIN to guest + +In `src/network/slirp.rs::relay_tcp_nat_data`, find the recv_peek `Ok(0)` arm (host EOF, ~line 1773 area): + +```rust +Ok(0) => { + // Host closed the connection. + debug!("SLIRP TCP: host EOF on flow guest_port={}", key.guest_src_port); + match entry.state { + TcpNatState::Established => { + // Host closed first → CloseWait. 
We send FIN to guest; + // guest may still send data which we'll forward to host + // (but host's write side may be closed — that's a guest + // write failure, not our concern). + entry.state = TcpNatState::CloseWait; + entry.last_state_change = Instant::now(); + became_closed = false; // no longer immediately reap + // Build FIN to guest below via the existing fin_frame branch. + } + TcpNatState::FinWait1 => { + // Guest closed first; now host has finished writing. + // Send FIN to guest, transition to LastAck. + entry.state = TcpNatState::LastAck; + entry.last_state_change = Instant::now(); + became_closed = false; + } + _ => { + // Already in a closing state or Closed. + became_closed = false; + } + } +} +``` + +Then update the FIN-emit logic just below (~line 1824): + +```rust +// Send FIN if we just transitioned to a state that demands one. +let needs_fin = matches!( + entry.state, + TcpNatState::CloseWait | TcpNatState::LastAck +); +if needs_fin && !entry.our_fin_sent { + fin_frame = Some(build_tcp_packet_static( + key.dst_ip, SLIRP_GUEST_IP, key.dst_port, key.guest_src_port, + entry.our_seq, entry.guest_ack, + TcpControl::Fin, &[], + )); + entry.our_seq = entry.our_seq.wrapping_add(1); + entry.our_fin_sent = true; + trace!("SLIRP TCP: sent FIN to guest, state={:?}", entry.state); +} +``` + +Add `our_fin_sent: bool` field to `TcpNatEntry` so we don't re-send FIN on repeat polls (every Established-side recv_peek would otherwise queue another FIN). + +Run: `cargo test --test network_baseline tcp_half_close_guest_writes_first`. Expected: **PASS** — host's response data + FIN now flow back through. + +**Commit:** `feat(slirp): FinWait1 → LastAck and Established → CloseWait on host EOF` + +--- + +### Task 4: Wire `CloseWait → LastAck` on guest FIN; `LastAck → Closed` on guest's final ACK + +Update `handle_tcp_frame` to handle FIN from a CloseWait entry: + +```rust +if tcp.fin() { + match entry.state { + TcpNatState::Established => { + // ... 
Task 2 path: → FinWait1 ... + } + TcpNatState::CloseWait => { + // Host already closed; guest just closed too. ACK the + // guest's FIN and transition to LastAck. + entry.guest_ack = seq.wrapping_add(1); + let ack_frame = build_tcp_packet_static( + dst_ip, SLIRP_GUEST_IP, dst_port, src_port, + entry.our_seq, entry.guest_ack, + TcpControl::None, &[], + ); + self.inject_to_guest.push(ack_frame); + entry.state = TcpNatState::LastAck; + entry.last_state_change = Instant::now(); + return Ok(()); + } + _ => { + // Repeat FIN or unexpected — ACK and stay where we are. + } + } +} +``` + +Update the ACK handler to handle the guest's final ACK in LastAck: + +```rust +// ACK while in LastAck — guest acknowledged our FIN. Reap. +if tcp.ack() && entry.state == TcpNatState::LastAck { + debug!("SLIRP TCP: LastAck → Closed for {}:{}", dst_ip, dst_port); + entry.state = TcpNatState::Closed; + self.pending_close.push(flow_key); + return Ok(()); +} +``` + +Place this branch BEFORE the existing "ACK transitions SynReceived → Established" block (the states are mutually exclusive, so the SynReceived branch can never catch a LastAck ACK — the explicit ordering just makes the intent clear without relying on that invariant). + +Run all baseline pins: `cargo test --test network_baseline`. Expected: 19/19 pass (the 18 pre-existing pins plus the Task 1 pin, which flips to PASS once Task 3 lands). + +**Commit:** `feat(slirp): CloseWait → LastAck on guest FIN; LastAck → Closed on final ACK` + +--- + +### Task 5: Passing pin — `tcp_half_close_host_writes_first` + +Symmetric mirror of Task 1's pin. Host writes first, then closes; guest replies with data + FIN.
+ +```rust +#[test] +fn tcp_half_close_host_writes_first() { + use std::io::{Read, Write}; + use std::net::TcpListener; + + let listener = TcpListener::bind("127.0.0.1:0").unwrap(); + let host_port = listener.local_addr().unwrap().port(); + + let server = std::thread::spawn(move || { + let (mut sock, _) = listener.accept().unwrap(); + sock.write_all(b"GREETING").unwrap(); + sock.shutdown(std::net::Shutdown::Write).unwrap(); + let mut buf = Vec::new(); + let _ = sock.read_to_end(&mut buf); + // Mirror back what the guest sent. + buf + }); + + let mut stack = SlirpBackend::new().unwrap(); + // ... handshake (same shape as Task 1) ... + + // Drive drain_to_guest until we see GREETING + host's FIN. + let deadline = std::time::Instant::now() + std::time::Duration::from_secs(5); + let mut received: Vec = Vec::new(); + let mut saw_host_fin = false; + while std::time::Instant::now() < deadline { + for f in drain_n(&mut stack, 1) { + // Same parsing as Task 1. + } + if &received[..] == b"GREETING" && saw_host_fin { + break; + } + std::thread::sleep(std::time::Duration::from_millis(10)); + } + + assert_eq!(&received[..], b"GREETING"); + assert!(saw_host_fin); + + // Guest sends reply data + FIN. Host must receive both. + let reply = b"REPLY"; + // ACK host's FIN first to advance gateway_seq, then send reply data + // ... (synthesized guest frames) ... + + let host_received = server.join().unwrap(); + assert_eq!(&host_received[..], b"REPLY", "host must receive guest's reply post-FIN"); +} +``` + +The full test body should mirror Task 1's structure. Run: should PASS post-Task-3 (host EOF triggers CloseWait + FIN to guest correctly). 
+ +**Commit:** `test(network): pin tcp_half_close_host_writes_first` + +--- + +### Task 6: `LAST_ACK_TIMEOUT` reaping + +In `src/network/slirp.rs`, near the existing `TCP_IDLE_TIMEOUT`: + +```rust +const LAST_ACK_TIMEOUT: Duration = Duration::from_secs(60); // TCP MSL × 2 +``` + +In `relay_tcp_nat_data`'s idle-timeout sweep (after the existing `TCP_IDLE_TIMEOUT` check): + +```rust +for (flow_key, entry) in &self.flow_table { + if let FlowEntry::Tcp(tcp_entry) = entry { + if tcp_entry.state == TcpNatState::LastAck + && tcp_entry.last_state_change.elapsed() > LAST_ACK_TIMEOUT + && !to_remove.contains(flow_key) + { + warn!("SLIRP TCP: LastAck timeout for guest_port={}, reaping", + // ... key.guest_src_port from match ...); + to_remove.push(*flow_key); + } + } +} +``` + +(merge with the existing `TCP_IDLE_TIMEOUT` sweep into one pass for cache-friendliness.) + +Run baseline pins: 18/18 + 2 new = 20 pass. + +**Commit:** `feat(slirp): LAST_ACK_TIMEOUT reaping prevents LastAck entry leak` + +--- + +### Task 7: Failing pin — `tcp_last_ack_timeout_reaps_stale_entry` + +Uses the existing `#[cfg(any(test, feature = "bench-helpers"))]` synthetic-injection pattern: + +```rust +#[cfg(feature = "bench-helpers")] +#[test] +fn tcp_last_ack_timeout_reaps_stale_entry() { + let mut stack = SlirpBackend::new().unwrap(); + // ... open a flow, drive it into LastAck via a helper ... + // Synthesize last_state_change = Instant::now() - 70 seconds (>LAST_ACK_TIMEOUT). + stack.set_synthetic_last_state_change(guest_port, high_port, /* 70s ago */); + // One drain_to_guest cycle. + let mut out = Vec::new(); + stack.drain_to_guest(&mut out); + // Entry should be gone. + assert!(stack.tcp_flow_state(guest_port, high_port).is_none(), + "LastAck entry past LAST_ACK_TIMEOUT must be reaped"); +} +``` + +May need a new bench-helpers helper `set_synthetic_last_state_change` on `SlirpBackend`. 
+ +Gate the test with `#[cfg(feature = "bench-helpers")]` per the existing snapshot-rebuild smoke pin pattern (`tests/network_baseline.rs::epoll_set_rebuilt_from_flow_table_smoke`). Default `cargo test` skips it; `cargo test --features bench-helpers -- --test-threads=1` runs it. + +**Commit:** `test(network): pin tcp_last_ack_timeout_reaps_stale_entry (bench-helpers)` + +--- + +### Task 8: Drop `#[allow(dead_code)]` on `TcpNatState` + +Now that all variants are wired. Verify clippy doesn't complain about any remaining unused variant. + +Verify `FinWait2` is actually unused — it should be safe to remove the variant entirely (or document why we keep it for future symmetry). Suggest: remove `FinWait2` variant; if anyone needs the distinction later they can re-add it. + +Update the doc comment on `TcpNatState` to reflect the new state machine. + +**Commit:** `refactor(slirp): drop allow(dead_code) on TcpNatState; remove unused FinWait2` + +--- + +### Task 9: Phase 6.1 validation gate + +Standard contract per `AGENTS.md` plus the new pins. + +```bash +cargo fmt --all -- --check +cargo clippy --workspace --all-targets --all-features -- -D warnings +cargo test --workspace --all-features -- --test-threads=1 +cargo test --test network_baseline # 20 pass +cargo test --test network_baseline --features bench-helpers -- --test-threads=1 # 21 pass +cargo build --release +scripts/bench-compare.sh --baseline 47868f0 --skip-vm # no regressions +``` + +If `VOID_BOX_KERNEL` and `VOID_BOX_INITRAMFS` are set: + +```bash +cargo test --test snapshot_integration -- --ignored --test-threads=1 # passes +cargo test --test conformance -- --ignored --test-threads=1 +``` + +Wall-clock sanity: + +```bash +voidbox-network-bench --iterations 3 --bulk-mb 10 # g2h ≥ 6 Gbps, CRR ~10 ms +``` + +--- + +## Out of scope + +- TCP window management (Phase 6.3). +- Async outbound connect (Phase 6.2). +- IPv6 (Phase 7). 
+- Real `FinWait2` distinction — would require kernel-side ACK observation we don't currently have. + +## Reviewer pointers + +- Verify FIN is sent EXACTLY ONCE per state transition (search for `our_fin_sent` checks). +- Verify the ACK path correctly transitions LastAck → Closed without consuming a SYN-state ACK by mistake. +- Verify `last_state_change` is updated on every state transition, not just initial. +- Verify guest writes during CloseWait are still relayed to host (host has full read side; only its write side is closed, which it learns from our shutdown — but writes from us to it are valid). +- Snapshot interaction: `TcpNatState` serde already exists; new fields (`last_state_change`, `our_fin_sent`) need `#[serde(default)]` for backward compatibility with pre-6.1 snapshots. + +## Document history + +- 2026-05-04: initial plan written. From 68d4f8a7a4a4805328dc7f56b3e2eff7deac36e6 Mon Sep 17 00:00:00 2001 From: diego Date: Mon, 4 May 2026 12:52:06 -0300 Subject: [PATCH 2/9] test(network): pin tcp_half_close_guest_writes_first (BROKEN_ON_PURPOSE) MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit Guest sends FIN after data; current code marks state=Closed immediately on guest FIN, so the host's response data is dropped. This pin locks the broken behavior so the Phase 6.1 fix is legible to reviewers. Flips to PASS when Tasks 2–3 implementation lands. 
--- tests/network_baseline.rs | 120 ++++++++++++++++++++++++++++++++++++++ 1 file changed, 120 insertions(+) diff --git a/tests/network_baseline.rs b/tests/network_baseline.rs index d5115426..b1e0bf19 100644 --- a/tests/network_baseline.rs +++ b/tests/network_baseline.rs @@ -1291,3 +1291,123 @@ fn epoll_set_rebuilt_from_flow_table_smoke() { "rebuild_epoll_from_flow_table must register all live flow FDs" ); } + +// ── Phase 6.1: TCP half-close pins ────────────────────────────────────── + +/// BROKEN_ON_PURPOSE: guest sends FIN after data; current code marks state=Closed +/// immediately on guest FIN, so host's response data is dropped. +/// Flips to PASS when Task 2–3 implementation lands. +#[test] +fn tcp_half_close_guest_writes_first() { + use std::io::{Read, Write}; + use std::net::TcpListener; + + let listener = TcpListener::bind("127.0.0.1:0").unwrap(); + let host_port = listener.local_addr().unwrap().port(); + + let server = std::thread::spawn(move || -> Vec { + let (mut sock, _) = listener.accept().unwrap(); + // Read until EOF (guest will send data + FIN). + let mut request = Vec::new(); + let _ = sock.read_to_end(&mut request); + // After guest's shutdown(WRITE), we still want to send response. + sock.write_all(b"HTTP/1.1 200 OK\r\n\r\nBODY").unwrap(); + // Then close. + drop(sock); + request + }); + + let mut stack = SlirpBackend::new().unwrap(); + + // Guest 3-way handshake. 
+ let our_seq = 1000u32; + stack + .process_guest_frame(&build_tcp_frame( + SLIRP_GATEWAY_IP, + GUEST_EPHEMERAL_PORT, + host_port, + our_seq, + 0, + TcpControl::Syn, + &[], + )) + .unwrap(); + let mut gateway_seq = 0u32; + for f in drain_n(&mut stack, 4) { + if let Some((s, _, ctrl, _)) = parse_tcp_to_guest(&f) { + if matches!(ctrl, TcpControl::Syn) { + gateway_seq = s; + break; + } + } + } + stack + .process_guest_frame(&build_tcp_frame( + SLIRP_GATEWAY_IP, + GUEST_EPHEMERAL_PORT, + host_port, + our_seq + 1, + gateway_seq + 1, + TcpControl::None, + &[], + )) + .unwrap(); + + // Guest sends "HELLO" data + FIN. + let request = b"HELLO"; + stack + .process_guest_frame(&build_tcp_frame( + SLIRP_GATEWAY_IP, + GUEST_EPHEMERAL_PORT, + host_port, + our_seq + 1, + gateway_seq + 1, + TcpControl::Psh, + request, + )) + .unwrap(); + stack + .process_guest_frame(&build_tcp_frame( + SLIRP_GATEWAY_IP, + GUEST_EPHEMERAL_PORT, + host_port, + our_seq + 1 + request.len() as u32, + gateway_seq + 1, + TcpControl::Fin, + &[], + )) + .unwrap(); + + // Drive drain_to_guest until we see host's response data AND its FIN. 
+ let deadline = std::time::Instant::now() + std::time::Duration::from_secs(5); + let mut response_bytes: Vec = Vec::new(); + let mut saw_host_fin = false; + while std::time::Instant::now() < deadline { + for f in drain_n(&mut stack, 1) { + if let Some((_, _, ctrl, payload_len)) = parse_tcp_to_guest(&f) { + if payload_len > 0 { + let eth = EthernetFrame::new_unchecked(f.as_slice()); + let ip = Ipv4Packet::new_unchecked(eth.payload()); + let tcp = TcpPacket::new_unchecked(ip.payload()); + response_bytes.extend_from_slice(tcp.payload()); + } + if matches!(ctrl, TcpControl::Fin) { + saw_host_fin = true; + } + } + } + if !response_bytes.is_empty() && saw_host_fin { + break; + } + std::thread::sleep(std::time::Duration::from_millis(10)); + } + + let _ = server.join(); + + assert_eq!( + &response_bytes[..], + b"HTTP/1.1 200 OK\r\n\r\nBODY", + "guest must receive ALL host response data after sending FIN" + ); + assert!(saw_host_fin, "guest must receive host's FIN"); +} From fda0e0867e6616dd14ab4a11e78f6329e1755c08 Mon Sep 17 00:00:00 2001 From: diego Date: Mon, 4 May 2026 13:00:41 -0300 Subject: [PATCH 3/9] =?UTF-8?q?feat(slirp):=20FinWait1=20=E2=86=92=20LastA?= =?UTF-8?q?ck=20and=20Established=20=E2=86=92=20CloseWait=20on=20host=20EO?= =?UTF-8?q?F?= MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit Tasks 2+3+4 implemented together: clippy's -D warnings requires our_fin_sent to be used before committing the field addition, so the relay loop changes from Task 3 and the ACK handler from Task 4 are bundled with Task 2's struct changes. Structural changes (TcpNatEntry): - Add last_state_change: Instant — tracks when state last changed, used for LAST_ACK_TIMEOUT reaping (Task 6). - Add our_fin_sent: bool — prevents re-sending FIN on repeated epoll readiness events for the same half-close transition. - Initialize both fields at all three TcpNatEntry literal sites. 
FIN handler (handle_tcp_frame): - Established → FinWait1: ACK guest FIN, shutdown(Write) host socket so host sees EOF; stay alive for host's pending response data. - CloseWait → LastAck: host already closed; guest just closed; ACK and wait for guest's final ACK of our FIN. ACK handler (handle_tcp_frame): - LastAck → Closed: guest's final ACK reaps the entry. - Extend ACK-driven consume to FinWait1 state so bytes_in_flight drains and recv_peek eventually sees Ok(0) (host EOF). Relay loop (relay_tcp_nat_data): - Extend data relay to FinWait1 in addition to Established. - Ok(0) arm now dispatches on state: Established → CloseWait, FinWait1 → LastAck (instead of immediate Closed). - FIN-emit logic uses our_fin_sent guard and needs_fin predicate (CloseWait | LastAck) to send FIN exactly once per transition. Test: update tcp_half_close_guest_writes_first to ACK received data segments so the ACK-driven consume path drains the kernel buffer and recv_peek sees EOF. Without ACKs the FinWait1 relay sees the same bytes on every peek and never reaches Ok(0). --- src/network/slirp.rs | 194 ++++++++++++++++++++++++++++++++------ tests/network_baseline.rs | 23 ++++- 2 files changed, 187 insertions(+), 30 deletions(-) diff --git a/src/network/slirp.rs b/src/network/slirp.rs index 827134ae..3b947969 100644 --- a/src/network/slirp.rs +++ b/src/network/slirp.rs @@ -199,6 +199,14 @@ struct TcpNatEntry { /// via `next_flow_token(PROTO_TAG_TCP)` and stored here so unregister /// sites never need to recompute it. flow_token: u64, + /// Wall clock when the entry's state last changed. Used by + /// LAST_ACK_TIMEOUT reaping in relay_tcp_nat_data so a missing + /// final ACK doesn't leak the entry forever. + last_state_change: Instant, + /// True once we have sent our FIN to the guest. Prevents re-sending + /// FIN on repeated epoll readiness events for the same transition. + /// Read in relay_tcp_nat_data's FIN-emit logic (Task 3). 
+ our_fin_sent: bool, } /// Key for the ICMP echo NAT table: (guest ICMP id, destination IP). @@ -732,6 +740,8 @@ impl SlirpBackend { last_activity: Instant::now(), bytes_in_flight: 0, flow_token: token, + last_state_change: Instant::now(), + our_fin_sent: false, }; let host_fd = entry.host_stream.as_raw_fd(); let flow_key = FlowKey::Tcp(key); @@ -1479,6 +1489,8 @@ impl SlirpBackend { last_activity: Instant::now(), bytes_in_flight: 0, flow_token: token, + last_state_change: Instant::now(), + our_fin_sent: false, }; self.flow_table.insert(flow_key, FlowEntry::Tcp(entry)); self.token_to_key.insert(token, flow_key); @@ -1584,6 +1596,16 @@ impl SlirpBackend { return Ok(()); } + // ACK while in LastAck — guest acknowledged our FIN. Reap. + // Placed before the SynReceived ACK branch to be explicit (the + // states are mutually exclusive, but explicit ordering is clearer). + if tcp.ack() && entry.state == TcpNatState::LastAck { + debug!("SLIRP TCP: LastAck → Closed for {}:{}", dst_ip, dst_port); + entry.state = TcpNatState::Closed; + self.pending_close.push(flow_key); + return Ok(()); + } + // ACK (completing handshake or acknowledging data) if tcp.ack() && entry.state == TcpNatState::SynReceived { entry.state = TcpNatState::Established; @@ -1600,9 +1622,18 @@ impl SlirpBackend { // buffer to advance the kernel's read pointer. Without this step the // kernel buffer fills up and recv_peek keeps returning the same bytes. // - // Only runs in Established state — the SynReceived ACK above does not - // carry data acknowledgements from us yet (bytes_in_flight == 0 then). - if tcp.ack() && entry.state == TcpNatState::Established && entry.bytes_in_flight > 0 { + // Runs in Established and FinWait1: even after the guest half-closes + // (FinWait1), the relay may have already sent host response data that + // the guest must ACK. 
Draining the kernel buffer on ACK lets + // recv_peek see Ok(0) (EOF) once all data is consumed, which then + // triggers the FinWait1 → LastAck transition and the final FIN to guest. + // + // Does not run in SynReceived — that ACK doesn't carry data acks yet. + let ack_consume_state = matches!( + entry.state, + TcpNatState::Established | TcpNatState::FinWait1 + ); + if tcp.ack() && ack_consume_state && entry.bytes_in_flight > 0 { // segment_ack: what the guest is now confirming it has received // from us (our send-side sequence space). let segment_ack: u32 = tcp.ack_number().0 as u32; @@ -1703,23 +1734,74 @@ impl SlirpBackend { // FIN from guest if tcp.fin() { debug!("SLIRP TCP: FIN from guest for {}:{}", dst_ip, dst_port); - entry.guest_ack = seq.wrapping_add(1); - let fin_ack_frame = build_tcp_packet_static( - dst_ip, - SLIRP_GUEST_IP, - dst_port, - src_port, - entry.our_seq, - entry.guest_ack, - TcpControl::Fin, - &[], - ); - self.inject_to_guest.push(fin_ack_frame); - entry.our_seq = entry.our_seq.wrapping_add(1); - entry.state = TcpNatState::Closed; - // entry last used above; borrow ends before pending_close push. - self.pending_close.push(flow_key); - return Ok(()); + match entry.state { + TcpNatState::Established => { + entry.guest_ack = seq.wrapping_add(1); + + // ACK the guest's FIN — but don't send our own FIN yet. Host + // application may have data still to send. We transition to + // FinWait1 and shut down the host socket's write side so the + // host knows no more data is coming from the guest. 
+ let ack_frame = build_tcp_packet_static( + dst_ip, + SLIRP_GUEST_IP, + dst_port, + src_port, + entry.our_seq, + entry.guest_ack, + TcpControl::None, + &[], + ); + self.inject_to_guest.push(ack_frame); + + if let Err(e) = entry.host_stream.shutdown(std::net::Shutdown::Write) { + warn!( + "SLIRP TCP: shutdown(Write) failed on guest FIN, falling back \ + to immediate close: {}", + e + ); + entry.state = TcpNatState::Closed; + self.pending_close.push(flow_key); + return Ok(()); + } + + entry.state = TcpNatState::FinWait1; + entry.last_state_change = Instant::now(); + trace!( + "SLIRP TCP: state Established → FinWait1 for {}:{}", + dst_ip, + dst_port + ); + return Ok(()); + } + TcpNatState::CloseWait => { + // Host already closed; guest just closed too. ACK the + // guest's FIN and transition to LastAck. + entry.guest_ack = seq.wrapping_add(1); + let ack_frame = build_tcp_packet_static( + dst_ip, + SLIRP_GUEST_IP, + dst_port, + src_port, + entry.our_seq, + entry.guest_ack, + TcpControl::None, + &[], + ); + self.inject_to_guest.push(ack_frame); + entry.state = TcpNatState::LastAck; + entry.last_state_change = Instant::now(); + trace!( + "SLIRP TCP: state CloseWait → LastAck for {}:{}", + dst_ip, + dst_port + ); + return Ok(()); + } + _ => { + // Repeat FIN or unexpected — ACK and stay where we are. + } + } } // RST from guest @@ -1799,7 +1881,13 @@ impl SlirpBackend { continue; }; - if entry.state != TcpNatState::Established { + // Relay data for Established and FinWait1 (guest half-closed; + // host may still have data to send). Skip all other states. + let relay_data = matches!( + entry.state, + TcpNatState::Established | TcpNatState::FinWait1 + ); + if !relay_data { continue; } @@ -1811,13 +1899,42 @@ impl SlirpBackend { let mut peek_buf = [0u8; 65536]; match recv_peek(&entry.host_stream, &mut peek_buf) { Ok(0) => { - // Host closed the connection. Send FIN to guest below. + // Host closed the connection. 
debug!( - "SLIRP TCP: host EOF on flow guest_port={}, marking Closed", + "SLIRP TCP: host EOF on flow guest_port={}", key.guest_src_port ); - entry.state = TcpNatState::Closed; - became_closed = true; + match entry.state { + TcpNatState::Established => { + // Host closed first → CloseWait. We send FIN to + // guest; guest may still have data to send which + // we'll forward (host's write side may be closed + // already — that's a guest write failure, not our + // concern). + entry.state = TcpNatState::CloseWait; + entry.last_state_change = Instant::now(); + trace!( + "SLIRP TCP: Established → CloseWait for guest_port={}", + key.guest_src_port + ); + became_closed = false; + } + TcpNatState::FinWait1 => { + // Guest closed first; now host has finished writing. + // Send FIN to guest, transition to LastAck. + entry.state = TcpNatState::LastAck; + entry.last_state_change = Instant::now(); + trace!( + "SLIRP TCP: FinWait1 → LastAck for guest_port={}", + key.guest_src_port + ); + became_closed = false; + } + _ => { + // Already in a closing state or Closed — no action. + became_closed = false; + } + } } Ok(peek_n) => { let in_flight = entry.bytes_in_flight as usize; @@ -1850,7 +1967,7 @@ impl SlirpBackend { ); } // else: kernel buffer holds only already-in-flight bytes. - // Wait for guest ACK before sending more (Task 3.4). + // Wait for guest ACK before sending more. } Err(ref e) if e.kind() == io::ErrorKind::WouldBlock => { // Kernel recv buffer empty; nothing to do this poll. @@ -1865,8 +1982,27 @@ impl SlirpBackend { } } - // FIN if host closed - if entry.state == TcpNatState::Closed { + // Send FIN if we just transitioned to a state that demands one. 
+ let needs_fin = + matches!(entry.state, TcpNatState::CloseWait | TcpNatState::LastAck); + if needs_fin && !entry.our_fin_sent { + fin_frame = Some(build_tcp_packet_static( + key.dst_ip, + SLIRP_GUEST_IP, + key.dst_port, + key.guest_src_port, + entry.our_seq, + entry.guest_ack, + TcpControl::Fin, + &[], + )); + entry.our_seq = entry.our_seq.wrapping_add(1); + entry.our_fin_sent = true; + trace!("SLIRP TCP: sent FIN to guest, state={:?}", entry.state); + } + + // Legacy: FIN for the immediate Closed path (error or RST). + if entry.state == TcpNatState::Closed && became_closed && fin_frame.is_none() { fin_frame = Some(build_tcp_packet_static( key.dst_ip, SLIRP_GUEST_IP, @@ -2605,6 +2741,8 @@ impl SlirpBackend { last_activity: Instant::now(), bytes_in_flight: 0, flow_token: token, + last_state_change: Instant::now(), + our_fin_sent: false, }; self.flow_table.insert(flow_key, FlowEntry::Tcp(entry)); self.token_to_key.insert(token, flow_key); diff --git a/tests/network_baseline.rs b/tests/network_baseline.rs index b1e0bf19..7c77b348 100644 --- a/tests/network_baseline.rs +++ b/tests/network_baseline.rs @@ -1341,6 +1341,7 @@ fn tcp_half_close_guest_writes_first() { } } } + // Complete 3-way handshake: ACK the SYN-ACK. stack .process_guest_frame(&build_tcp_frame( SLIRP_GATEWAY_IP, @@ -1379,17 +1380,35 @@ fn tcp_half_close_guest_writes_first() { .unwrap(); // Drive drain_to_guest until we see host's response data AND its FIN. + // We must ACK each data segment as it arrives so the kernel recv buffer + // drains — recv_peek returns Ok(0) (EOF) only once all buffered bytes + // are consumed via the ACK-driven read path. 
let deadline = std::time::Instant::now() + std::time::Duration::from_secs(5); let mut response_bytes: Vec = Vec::new(); let mut saw_host_fin = false; while std::time::Instant::now() < deadline { - for f in drain_n(&mut stack, 1) { - if let Some((_, _, ctrl, payload_len)) = parse_tcp_to_guest(&f) { + let frames = drain_n(&mut stack, 1); + for f in &frames { + if let Some((seq, _ack, ctrl, payload_len)) = parse_tcp_to_guest(f) { if payload_len > 0 { let eth = EthernetFrame::new_unchecked(f.as_slice()); let ip = Ipv4Packet::new_unchecked(eth.payload()); let tcp = TcpPacket::new_unchecked(ip.payload()); response_bytes.extend_from_slice(tcp.payload()); + // ACK the data so the relay's ACK-driven consume path can + // drain the kernel recv buffer and eventually see EOF. + let ack_num = seq.wrapping_add(payload_len as u32); + stack + .process_guest_frame(&build_tcp_frame( + SLIRP_GATEWAY_IP, + GUEST_EPHEMERAL_PORT, + host_port, + our_seq + 1 + request.len() as u32 + 1, // seq after FIN + ack_num, + TcpControl::None, + &[], + )) + .unwrap(); } if matches!(ctrl, TcpControl::Fin) { saw_host_fin = true; From a776c9d464ac5fbde6f9d0eb90bb7d9288e8dccf Mon Sep 17 00:00:00 2001 From: diego Date: Mon, 4 May 2026 13:08:42 -0300 Subject: [PATCH 4/9] test(network): pin tcp_half_close_host_writes_first MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit Symmetric mirror of tcp_half_close_guest_writes_first: host writes GREETING, shuts down its write side (Established → CloseWait), and waits for guest to reply; guest sends REPLY data + FIN. Also fix two implementation gaps found while writing this pin: 1. Guest→host data forwarding in CloseWait: the payload forward guard was Established-only. CloseWait must also forward guest data (host closed its write side but can still read). Extend the predicate to include CloseWait. 2. shutdown(Write) on CloseWait → LastAck: when the guest sends FIN in CloseWait the host application is blocked on read_to_end. 
We must call shutdown(Write) on host_stream so the kernel delivers EOF to the host. Without this the host blocks forever. --- src/network/slirp.rs | 23 +++++- tests/network_baseline.rs | 157 ++++++++++++++++++++++++++++++++++++++ 2 files changed, 177 insertions(+), 3 deletions(-) diff --git a/src/network/slirp.rs b/src/network/slirp.rs index 3b947969..70c33b45 100644 --- a/src/network/slirp.rs +++ b/src/network/slirp.rs @@ -1685,7 +1685,15 @@ impl SlirpBackend { } let payload = tcp.payload(); - if !payload.is_empty() && entry.state == TcpNatState::Established { + // Forward guest data in Established and CloseWait: in CloseWait the + // host closed its write side but can still read data the guest sends. + // FinWait1 is not included — in that state the guest already closed + // its write side (we called shutdown(Write) on the host socket). + let forward_data = matches!( + entry.state, + TcpNatState::Established | TcpNatState::CloseWait + ); + if !payload.is_empty() && forward_data { // Guest→host backpressure: rely on the kernel's send buffer + TCP // retransmit. ACK only the bytes the kernel accepted right now; // on WouldBlock, don't ACK at all and let the guest retransmit. @@ -1775,8 +1783,12 @@ impl SlirpBackend { return Ok(()); } TcpNatState::CloseWait => { - // Host already closed; guest just closed too. ACK the - // guest's FIN and transition to LastAck. + // Host already closed its write side; guest just closed + // too. Shut down our write side of host_stream so the + // host application's read sees EOF, ACK the guest's FIN, + // and transition to LastAck waiting for guest's final ACK + // of our FIN (which was already sent when we entered + // CloseWait). 
entry.guest_ack = seq.wrapping_add(1); let ack_frame = build_tcp_packet_static( dst_ip, @@ -1789,6 +1801,11 @@ impl SlirpBackend { &[], ); self.inject_to_guest.push(ack_frame); + if let Err(e) = entry.host_stream.shutdown(std::net::Shutdown::Write) { + // Non-fatal: host already closed; ENOTCONN or similar + // is expected in some cases. + trace!("SLIRP TCP: shutdown(Write) in CloseWait (non-fatal): {}", e); + } entry.state = TcpNatState::LastAck; entry.last_state_change = Instant::now(); trace!( diff --git a/tests/network_baseline.rs b/tests/network_baseline.rs index 7c77b348..e0a0b298 100644 --- a/tests/network_baseline.rs +++ b/tests/network_baseline.rs @@ -1430,3 +1430,160 @@ fn tcp_half_close_guest_writes_first() { ); assert!(saw_host_fin, "guest must receive host's FIN"); } + +/// Pin for the symmetric half-close path: host writes first, then closes its +/// write side (Established → CloseWait); guest replies with data + FIN. +/// The host must receive the guest's reply data even after its own shutdown. +#[test] +fn tcp_half_close_host_writes_first() { + use std::io::{Read, Write}; + use std::net::TcpListener; + + let listener = TcpListener::bind("127.0.0.1:0").unwrap(); + let host_port = listener.local_addr().unwrap().port(); + + // Server: write greeting, shutdown write side, then read what guest sends back. + let server = std::thread::spawn(move || -> Vec { + let (mut sock, _) = listener.accept().unwrap(); + sock.write_all(b"GREETING").unwrap(); + sock.shutdown(std::net::Shutdown::Write).unwrap(); + let mut buf = Vec::new(); + let _ = sock.read_to_end(&mut buf); + buf + }); + + let mut stack = SlirpBackend::new().unwrap(); + + // Guest 3-way handshake. 
+ let our_seq = 2000u32; + stack + .process_guest_frame(&build_tcp_frame( + SLIRP_GATEWAY_IP, + GUEST_EPHEMERAL_PORT, + host_port, + our_seq, + 0, + TcpControl::Syn, + &[], + )) + .unwrap(); + let mut gateway_seq = 0u32; + for f in drain_n(&mut stack, 4) { + if let Some((s, _, ctrl, _)) = parse_tcp_to_guest(&f) { + if matches!(ctrl, TcpControl::Syn) { + gateway_seq = s; + break; + } + } + } + // Complete handshake: ACK the SYN-ACK. + stack + .process_guest_frame(&build_tcp_frame( + SLIRP_GATEWAY_IP, + GUEST_EPHEMERAL_PORT, + host_port, + our_seq + 1, + gateway_seq + 1, + TcpControl::None, + &[], + )) + .unwrap(); + + // Drive drain_to_guest until we see "GREETING" data AND host's FIN. + // ACK each data segment so the relay's ACK-driven consume path works. + let deadline = std::time::Instant::now() + std::time::Duration::from_secs(5); + let mut received: Vec = Vec::new(); + let mut saw_host_fin = false; + // gateway_seq_next: the relay's current sequence number (advances with data + FIN). + let mut gateway_seq_next = gateway_seq + 1; // after SYN-ACK + while std::time::Instant::now() < deadline { + let frames = drain_n(&mut stack, 1); + for f in &frames { + if let Some((seq, _ack, ctrl, payload_len)) = parse_tcp_to_guest(f) { + if payload_len > 0 { + let eth = EthernetFrame::new_unchecked(f.as_slice()); + let ip = Ipv4Packet::new_unchecked(eth.payload()); + let tcp = TcpPacket::new_unchecked(ip.payload()); + received.extend_from_slice(tcp.payload()); + gateway_seq_next = seq.wrapping_add(payload_len as u32); + // ACK the data. + stack + .process_guest_frame(&build_tcp_frame( + SLIRP_GATEWAY_IP, + GUEST_EPHEMERAL_PORT, + host_port, + our_seq + 1, + gateway_seq_next, + TcpControl::None, + &[], + )) + .unwrap(); + } + if matches!(ctrl, TcpControl::Fin) { + saw_host_fin = true; + // ACK the host FIN so the CloseWait entry receives it. 
+ stack + .process_guest_frame(&build_tcp_frame( + SLIRP_GATEWAY_IP, + GUEST_EPHEMERAL_PORT, + host_port, + our_seq + 1, + gateway_seq_next.wrapping_add(1), + TcpControl::None, + &[], + )) + .unwrap(); + } + } + } + if received == b"GREETING" && saw_host_fin { + break; + } + std::thread::sleep(std::time::Duration::from_millis(10)); + } + + assert_eq!( + &received[..], + b"GREETING", + "guest must receive host's greeting" + ); + assert!( + saw_host_fin, + "guest must receive host's FIN (CloseWait transition)" + ); + + // Guest sends reply data + FIN after receiving host's greeting and FIN. + // The relay is now in CloseWait; guest data should still be forwarded to host. + let reply = b"REPLY"; + stack + .process_guest_frame(&build_tcp_frame( + SLIRP_GATEWAY_IP, + GUEST_EPHEMERAL_PORT, + host_port, + our_seq + 1, + gateway_seq_next.wrapping_add(1), + TcpControl::Psh, + reply, + )) + .unwrap(); + stack + .process_guest_frame(&build_tcp_frame( + SLIRP_GATEWAY_IP, + GUEST_EPHEMERAL_PORT, + host_port, + our_seq + 1 + reply.len() as u32, + gateway_seq_next.wrapping_add(1), + TcpControl::Fin, + &[], + )) + .unwrap(); + // Drain to process the guest FIN (CloseWait → LastAck). + drain_n(&mut stack, 4); + + let host_received = server.join().unwrap(); + assert_eq!( + &host_received[..], + b"REPLY", + "host must receive guest's reply data after CloseWait" + ); +} From e432f623232f4040f11f33f0e9e54c4369f4f967 Mon Sep 17 00:00:00 2001 From: diego Date: Mon, 4 May 2026 13:09:59 -0300 Subject: [PATCH 5/9] feat(slirp): LAST_ACK_TIMEOUT reaping prevents LastAck entry leak MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit Add LAST_ACK_TIMEOUT = 60 s (TCP MSL × 2) as a module-scope constant. Merge the LastAck-timeout check into the existing idle-timeout sweep in relay_tcp_nat_data, making a single O(n) pass handle both conditions: - Standard idle-timeout (300 s of inactivity, any state): unchanged. 
- LastAck-timeout (60 s since last_state_change, LastAck state): reaps entries where the guest's final ACK never arrived, e.g. after a guest crash. Logs at WARN so operators can connect the timeout to a prior half-close sequence without a separate debugging session. The combined sweep avoids a second loop and an extra to_remove Vec allocation. Using last_state_change (set on every state transition) rather than last_activity (set on data relay) gives an accurate 60 s window from the moment we sent our FIN. --- src/network/slirp.rs | 31 ++++++++++++++++++++++++++++--- 1 file changed, 28 insertions(+), 3 deletions(-) diff --git a/src/network/slirp.rs b/src/network/slirp.rs index 70c33b45..c6f294fb 100644 --- a/src/network/slirp.rs +++ b/src/network/slirp.rs @@ -91,6 +91,11 @@ const MTU: usize = 1500; const MAX_QUEUE_SIZE: usize = 64; const TCP_WINDOW: u16 = 65535; const UDP_IDLE_TIMEOUT: Duration = Duration::from_secs(60); +/// Timeout for TCP entries stuck in the LastAck state (i.e. we sent a FIN +/// but the guest's final ACK never arrived). Two TCP MSLs (2 × 30 s = 60 s) +/// matches the POSIX TIME_WAIT recommendation and prevents LastAck entries +/// from leaking indefinitely when a guest drops the final ACK. +const LAST_ACK_TIMEOUT: Duration = Duration::from_secs(60); /// Sleep interval for the port-forward listener thread between non-blocking /// accept polls. Short enough to keep accept latency low; long enough to @@ -1859,14 +1864,34 @@ impl SlirpBackend { .into_iter() .collect(); - // Idle-timeout sweep: scan flow_table once without collecting a - // separate key Vec. 300-second inactivity applies regardless of epoll - // readiness; this is O(n) in the number of TCP flows. + // Timeout sweep: one pass over the flow table handles two independent + // timeout conditions without a separate Vec or extra allocation. + // Uses to_remove_set (HashSet) for O(1) membership checks. 
const TCP_IDLE_TIMEOUT: Duration = Duration::from_secs(300); for (flow_key, entry) in &self.flow_table { if let FlowEntry::Tcp(tcp_entry) = entry { + if to_remove_set.contains(flow_key) { + continue; + } + // Standard idle-timeout: 300 s of inactivity in any state. if tcp_entry.last_activity.elapsed() > TCP_IDLE_TIMEOUT { to_remove_set.insert(*flow_key); + continue; + } + // LastAck-timeout: final guest ACK never arrived. Reap so a + // misbehaving or crashed guest doesn't leak entries forever. + if tcp_entry.state == TcpNatState::LastAck + && tcp_entry.last_state_change.elapsed() > LAST_ACK_TIMEOUT + { + warn!( + "SLIRP TCP: LastAck timeout for guest_port={}, reaping", + if let FlowKey::Tcp(k) = flow_key { + k.guest_src_port + } else { + 0 + } + ); + to_remove_set.insert(*flow_key); } } } From a63a5289b8e4555a328b094f6696ca52e0b49e4c Mon Sep 17 00:00:00 2001 From: diego Date: Mon, 4 May 2026 13:13:55 -0300 Subject: [PATCH 6/9] test(network): pin tcp_last_ack_timeout_reaps_stale_entry (bench-helpers) MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit Add a bench-helpers–gated pin that verifies the LAST_ACK_TIMEOUT reaper: 1. Insert a synthetic LastAck entry via the new bench-helpers helper insert_synthetic_lastack_entry (TcpNatState::LastAck, our_fin_sent=true). 2. Back-date last_state_change by 70 s (> LAST_ACK_TIMEOUT = 60 s) via set_synthetic_last_state_change. 3. One drain_to_guest cycle runs the timeout sweep. 4. Assert the entry is gone (tcp_flow_state returns None). Two new bench-helpers methods on SlirpBackend: - insert_synthetic_lastack_entry: seeds a LastAck flow without a full half-close exchange, for timeout reaping tests. - set_synthetic_last_state_change: back-dates last_state_change on an existing entry to simulate an expired LAST_ACK_TIMEOUT. Also widen TcpNatState and tcp_flow_state to pub so the bench-helpers– gated external test can reference them by path. 
Both were previously pub(crate); they are now observable as part of the bench-helpers surface. --- src/network/slirp.rs | 60 +++++++++++++++++++++++++++++++++++++-- tests/network_baseline.rs | 54 +++++++++++++++++++++++++++++++++++ 2 files changed, 112 insertions(+), 2 deletions(-) diff --git a/src/network/slirp.rs b/src/network/slirp.rs index c6f294fb..b61dc5f0 100644 --- a/src/network/slirp.rs +++ b/src/network/slirp.rs @@ -164,7 +164,7 @@ pub(crate) struct InboundAccept { #[derive(Debug, Clone, Copy, PartialEq)] #[allow(dead_code)] -pub(crate) enum TcpNatState { +pub enum TcpNatState { /// Guest sent SYN; we responded with SYN-ACK; waiting for guest's /// final ACK to complete the outbound 3-way handshake. SynReceived, @@ -2811,7 +2811,7 @@ impl SlirpBackend { /// Return the `TcpNatState` for the flow identified by `(guest_port, GATEWAY_IP, high_port)`, /// or `None` if no such entry exists in the flow table. #[allow(dead_code)] - pub(crate) fn tcp_flow_state(&self, guest_port: u16, high_port: u16) -> Option { + pub fn tcp_flow_state(&self, guest_port: u16, high_port: u16) -> Option { let key = NatKey { guest_src_port: guest_port, dst_ip: SLIRP_GATEWAY_IP, @@ -2872,6 +2872,62 @@ impl SlirpBackend { self.epoll = Arc::new(new_epoll_inner); self.epoll_waker = new_waker; } + + /// Insert a synthetic `LastAck` entry into the flow table. + /// + /// Used by `tcp_last_ack_timeout_reaps_stale_entry` to pre-seed a flow + /// in the LastAck state without going through a full half-close exchange. + /// + /// The entry's `last_state_change` is set to `Instant::now()` and can be + /// back-dated with [`set_synthetic_last_state_change`] to simulate an + /// expired timeout. 
+ pub fn insert_synthetic_lastack_entry( + &mut self, + guest_port: u16, + high_port: u16, + host_stream: TcpStream, + ) { + let key = NatKey { + guest_src_port: guest_port, + dst_ip: SLIRP_GATEWAY_IP, + dst_port: high_port, + }; + let entry = TcpNatEntry { + host_stream, + state: TcpNatState::LastAck, + our_seq: 1, + guest_ack: 1, + last_activity: Instant::now(), + bytes_in_flight: 0, + last_state_change: Instant::now(), + our_fin_sent: true, + }; + self.flow_table + .insert(FlowKey::Tcp(key), FlowEntry::Tcp(entry)); + } + + /// Back-date the `last_state_change` of the flow identified by + /// `(guest_port, GATEWAY_IP, high_port)` by `age`. Used by + /// `tcp_last_ack_timeout_reaps_stale_entry` to simulate a LastAck + /// entry that has been sitting past `LAST_ACK_TIMEOUT` without + /// receiving the guest's final ACK. + pub fn set_synthetic_last_state_change( + &mut self, + guest_port: u16, + high_port: u16, + age: Duration, + ) { + let key = FlowKey::Tcp(NatKey { + guest_src_port: guest_port, + dst_ip: SLIRP_GATEWAY_IP, + dst_port: high_port, + }); + if let Some(FlowEntry::Tcp(entry)) = self.flow_table.get_mut(&key) { + // Instant::now() - age: subtract by creating an instant that + // appears to have occurred `age` ago. + entry.last_state_change = Instant::now().checked_sub(age).unwrap_or_else(Instant::now); + } + } } #[cfg(test)] diff --git a/tests/network_baseline.rs b/tests/network_baseline.rs index e0a0b298..f7c33393 100644 --- a/tests/network_baseline.rs +++ b/tests/network_baseline.rs @@ -1587,3 +1587,57 @@ fn tcp_half_close_host_writes_first() { "host must receive guest's reply data after CloseWait" ); } + +/// Verify that a LastAck entry whose `last_state_change` is older than +/// `LAST_ACK_TIMEOUT` (60 s) is reaped by the next `drain_to_guest` sweep, +/// preventing a leaked flow table entry when the guest drops the final ACK. 
+/// +/// Gated on `bench-helpers` because it uses synthetic-injection helpers +/// (`insert_synthetic_lastack_entry`, `set_synthetic_last_state_change`, +/// `tcp_flow_state`) that widen internal visibility for external test/bench +/// consumers. Default `cargo test` skips this pin; CI runs it via +/// `cargo test --features bench-helpers -- --test-threads=1`. +#[cfg(feature = "bench-helpers")] +#[test] +fn tcp_last_ack_timeout_reaps_stale_entry() { + use std::net::TcpListener; + + const GUEST_PORT: u16 = 8080; + const HIGH_PORT: u16 = 60000; + + let mut backend = SlirpBackend::new().expect("backend"); + + // Open a real TCP connection so host_stream is a valid socket. + let listener = TcpListener::bind("127.0.0.1:0").expect("bind"); + let host_stream = + std::net::TcpStream::connect(listener.local_addr().unwrap()).expect("connect"); + host_stream.set_nonblocking(true).ok(); + + // Insert a synthetic LastAck entry (already past our FIN, waiting for + // guest's final ACK which will never arrive). + backend.insert_synthetic_lastack_entry(GUEST_PORT, HIGH_PORT, host_stream); + + // Verify the entry is present. + assert_eq!( + backend.tcp_flow_state(GUEST_PORT, HIGH_PORT), + Some(void_box::network::slirp::TcpNatState::LastAck), + "entry must start in LastAck" + ); + + // Back-date last_state_change by 70 s (> LAST_ACK_TIMEOUT = 60 s). + backend.set_synthetic_last_state_change( + GUEST_PORT, + HIGH_PORT, + std::time::Duration::from_secs(70), + ); + + // One drain_to_guest cycle triggers the timeout sweep. + let mut out = Vec::new(); + backend.drain_to_guest(&mut out); + + // The entry should now be gone. 
+ assert!( + backend.tcp_flow_state(GUEST_PORT, HIGH_PORT).is_none(), + "LastAck entry past LAST_ACK_TIMEOUT must be reaped by drain_to_guest" + ); +} From a0c914361669916a33d0230ade75ec4bd0d32e09 Mon Sep 17 00:00:00 2001 From: diego Date: Mon, 4 May 2026 14:28:06 -0300 Subject: [PATCH 7/9] refactor(slirp): drop allow(dead_code) on TcpNatState; remove unused FinWait2 MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit All TcpNatState variants are now wired into the state machine: - SynReceived, SynSent: handshake paths (unchanged) - Established: normal data transfer (unchanged) - FinWait1: guest half-closed; relay continues forwarding host response - CloseWait: host half-closed; relay continues forwarding guest data - LastAck: both sides closed; waiting for guest's final ACK or timeout - Closed: entry pending removal Remove FinWait2: distinguishing it from FinWait1 would require observing per-segment ACKs from the kernel — the relay does not track those. The relay stays in FinWait1 until host EOF, then jumps to LastAck. If the distinction is needed later, the variant can be re-added. Update the TcpNatState doc comment to document the state machine diagram and the FinWait2 omission rationale, replacing the old per-variant stubs. --- src/network/slirp.rs | 32 ++++++++++++++++++++++++++++++-- 1 file changed, 30 insertions(+), 2 deletions(-) diff --git a/src/network/slirp.rs b/src/network/slirp.rs index b61dc5f0..426517ea 100644 --- a/src/network/slirp.rs +++ b/src/network/slirp.rs @@ -162,8 +162,28 @@ pub(crate) struct InboundAccept { // TCP NAT connection tracking // ────────────────────────────────────────────────────────────────────── +/// TCP connection state for the SLIRP NAT relay. +/// +/// The state machine models both guest-initiated and host-initiated half-close +/// sequences. 
`FinWait2` is omitted: distinguishing it from `FinWait1` requires +/// observing per-segment ACKs from the kernel, which the relay does not track. +/// Instead, the relay stays in `FinWait1` until host EOF arrives, then jumps +/// directly to `LastAck`. +/// +/// State transitions: +/// +/// ```text +/// SynReceived ──ACK──► Established ──guest FIN──► FinWait1 +/// SynSent ──SYN+ACK──► Established │ │ +/// │ │ └─ host EOF ──► LastAck +/// │ host EOF │ │ +/// ▼ │ guest ACK ──┘ +/// CloseWait ◄──────────────┘ └──► Closed +/// │ guest FIN +/// ▼ +/// LastAck ──── LAST_ACK_TIMEOUT ────► Closed +/// ``` #[derive(Debug, Clone, Copy, PartialEq)] -#[allow(dead_code)] pub enum TcpNatState { /// Guest sent SYN; we responded with SYN-ACK; waiting for guest's /// final ACK to complete the outbound 3-way handshake. @@ -171,11 +191,19 @@ pub enum TcpNatState { /// We synthesized a SYN to the guest (port-forwarding); waiting /// for the guest's SYN-ACK to advance to Established. SynSent, + /// Both sides exchanged handshake; data flows in both directions. Established, + /// Guest closed its write side (sent FIN); we ACKed and called + /// shutdown(Write) on the host socket. Host response data may still + /// be in-flight — relay continues until host EOF. FinWait1, - FinWait2, + /// Host closed its write side first (we saw EOF); we sent a FIN to + /// the guest. Guest may still send data, which the relay forwards. CloseWait, + /// We have sent our FIN to the guest; waiting for the guest's final + /// ACK. Reaped on ACK or after LAST_ACK_TIMEOUT. LastAck, + /// Connection fully closed; entry pending removal from the flow table. 
Closed, } From 8658e291d21f18cb1d301d1c5d3c94a3929c7d73 Mon Sep 17 00:00:00 2001 From: diego Date: Mon, 4 May 2026 16:20:47 -0300 Subject: [PATCH 8/9] fix(slirp): drop duplicate bind + init flow_token in lastack helper MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit Two artifacts from the rebase onto the new #69 tip: 1. bind_port_forward_listeners was called twice in with_security after the rebase — the second call shadowed the first, which was harmless but emitted an unused-variable warning under -D warnings. Drop the duplicate. 2. The bench-helpers helper insert_synthetic_lastack_entry was missing flow_token in its TcpNatEntry initializer (the field was added by the Copilot fix #1 monotonic-token rewrite that landed on #69 after the original phase 6.1 commit chain was written). Allocate via next_flow_token(PROTO_TAG_TCP). --- src/network/slirp.rs | 2 ++ 1 file changed, 2 insertions(+) diff --git a/src/network/slirp.rs b/src/network/slirp.rs index 426517ea..0cf6d9ad 100644 --- a/src/network/slirp.rs +++ b/src/network/slirp.rs @@ -2920,6 +2920,7 @@ impl SlirpBackend { dst_ip: SLIRP_GATEWAY_IP, dst_port: high_port, }; + let token = next_flow_token(PROTO_TAG_TCP); let entry = TcpNatEntry { host_stream, state: TcpNatState::LastAck, @@ -2927,6 +2928,7 @@ impl SlirpBackend { guest_ack: 1, last_activity: Instant::now(), bytes_in_flight: 0, + flow_token: token, last_state_change: Instant::now(), our_fin_sent: true, }; From 8da934221598541912042a245d7c2b3ec4becfc0 Mon Sep 17 00:00:00 2001 From: diego Date: Tue, 5 May 2026 21:49:39 -0300 Subject: [PATCH 9/9] fix(slirp): doc-link to bench-helpers method via Self:: MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit A bare intra-doc-link (`[`set_synthetic_last_state_change`]`) doesn't resolve to a struct method even when both items live in the same `impl` block — rustdoc looks up bare names in the module scope, not the enclosing impl. 
CI's `-D warnings` then elevates the `broken_intra_doc_links` warning to an error and fails the Documentation job. Switch to `Self::set_synthetic_last_state_change` so rustdoc resolves through the impl. --- src/network/slirp.rs | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) diff --git a/src/network/slirp.rs b/src/network/slirp.rs index 0cf6d9ad..93c62a4d 100644 --- a/src/network/slirp.rs +++ b/src/network/slirp.rs @@ -2907,8 +2907,8 @@ impl SlirpBackend { /// in the LastAck state without going through a full half-close exchange. /// /// The entry's `last_state_change` is set to `Instant::now()` and can be - /// back-dated with [`set_synthetic_last_state_change`] to simulate an - /// expired timeout. + /// back-dated with [`Self::set_synthetic_last_state_change`] to simulate + /// an expired timeout. pub fn insert_synthetic_lastack_entry( &mut self, guest_port: u16,