diff --git a/.dockerignore b/.dockerignore new file mode 100644 index 0000000..601b798 --- /dev/null +++ b/.dockerignore @@ -0,0 +1,7 @@ +.zig-cache +zig-cache +zig-out +.git +.github +.claude +.serena diff --git a/README.md b/README.md index b835c0f..07a6dd4 100644 --- a/README.md +++ b/README.md @@ -8,9 +8,11 @@ A QUIC protocol library for Zig. Sans-I/O — you own the socket; the library ow - TLS 1.3 server handshake with AES-128-GCM and ChaCha20-Poly1305 (RFC 9001) - Session resumption and 0-RTT - Loss recovery, RTT estimation, PTO (RFC 9002) -- CUBIC congestion control (RFC 9438) +- CUBIC and BBR v3 congestion control (RFC 9438) - Stream multiplexing and flow control - Path migration and NAT rebinding +- Pacing with wire-time accounting +- Packet coalescing (RFC 9000 §12.2) - PMTUD, retry tokens, key rotation, ECN - Ed25519 and P-256 certificates - Zero external dependencies @@ -18,8 +20,10 @@ A QUIC protocol library for Zig. Sans-I/O — you own the socket; the library ow ## Build ```sh -zig build test # run tests -zig build # build server binary +zig build test # run tests (default: BBR) +zig build test -Dcongestion=cubic # run tests with CUBIC +zig build # build server binary +zig build -Dcongestion=cubic # build with CUBIC ``` Requires Zig 0.16.0-dev or later. @@ -27,11 +31,23 @@ Requires Zig 0.16.0-dev or later. ## Interop Results -Tested against ngtcp2 client — 22/22 passing, goodput 9394 kbps on 10 Mbps link: - -| Result | Test cases | -| :---: | --- | -| ✅ Pass (22/22) | handshake, transfer, longrtt, chacha20, multiplexing, retry, resumption, zerortt, http3, blackhole, keyupdate, ecn, amplificationlimit, handshakeloss, transferloss, handshakecorruption, transfercorruption, v2, ipv6, rebind-port, rebind-addr, connectionmigration | +Tested against 11 QUIC clients via [quic-interop-runner](https://github.com/quic-interop/quic-interop-runner) on a 10 Mbps / 30 ms RTT link: + +| Client | Tests | Goodput | +| --- | --- | --- | +| ngtcp2 | 22/22 | 9432 kbps | +| quic-go | 20/20 | 9507 kbps | +| quiche | 18/18 | — | +| neqo | 19/22 | — | +| kwik | 19/21 | 7849 kbps | +| picoquic | 16/22 | — | +| mvfst | 12/16 | 9496 kbps | +| aioquic | 13/21 | 9190 kbps | +| lsquic | — | 9454 kbps | +| msquic | — | 7937 kbps | +| quinn | — | 9462 kbps | + +Test cases: handshake, transfer, longrtt, chacha20, multiplexing, retry, resumption, zerortt, http3, blackhole, keyupdate, ecn, amplificationlimit, handshakeloss, transferloss, handshakecorruption, transfercorruption, v2, ipv6, rebind-port, rebind-addr, connectionmigration ## Limitations diff --git a/TODO.md b/TODO.md new file mode 100644 index 0000000..4846fdd --- /dev/null +++ b/TODO.md @@ -0,0 +1,20 @@ +# Performance TODO + +## High-bandwidth scaling (target: 100 Gbps) + +### Ring buffer sizing +- [ ] Make `SEND_QUEUE_DEPTH` runtime-configurable (currently 256, overflows at ~3 Gbps/30ms) +- [ ] Make `MAX_SENT` runtime-configurable (currently 256, same limit — evictions break loss detection) +- [ ] Scale `SEND_BUF_SIZE` per-stream based on negotiated BDP (currently 64 KB, tight at 10 Mbps) + +### Syscall reduction +- [ ] GSO (`UDP_SEGMENT`) for Linux — batch N QUIC packets into 1 sendmsg (60× fewer send syscalls at 1 Gbps) +- [ ] recvmmsg for Linux — batch receive multiple datagrams per syscall +- [ ] Increase `SEND_BATCH` and `BATCH_SIZE` for higher packet rates (currently 32/16) + +### Zero-copy send path +- [ ] Encrypt directly into send queue slot (currently: pkt_scratch → enc_scratch → sq[].buf = 2 copies per packet) + +### Pacing at high rates 
+- [ ] Sub-millisecond pacing for >1 Gbps (current 1ms timer tick limits pacing granularity) +- [ ] Consider io_uring or busy-poll for microsecond-level pacing diff --git a/build.zig b/build.zig index 307dc34..01c7163 100644 --- a/build.zig +++ b/build.zig @@ -4,21 +4,35 @@ pub fn build(b: *std.Build) void { const target = b.standardTargetOptions(.{}); const optimize = b.standardOptimizeOption(.{}); + // Congestion control algorithm selection: bbr (default) or cubic. + const Algorithm = enum { bbr, cubic }; + const congestion = b.option(Algorithm, "congestion", "Congestion control algorithm: bbr (default) or cubic") orelse .bbr; + const congestion_cubic = congestion == .cubic; + + const build_options = b.addOptions(); + build_options.addOption(bool, "congestion_cubic", congestion_cubic); + const build_options_mod = build_options.createModule(); + // Public module: consumers import this as @import("zquic") const zquic_mod = b.addModule("zquic", .{ .root_source_file = b.path("src/root.zig"), .target = target, .optimize = optimize, + .imports = &.{ + .{ .name = "build_options", .module = build_options_mod }, + }, }); // Static library artifact + const lib_mod = b.createModule(.{ + .root_source_file = b.path("src/root.zig"), + .target = target, + .optimize = optimize, + }); + lib_mod.addImport("build_options", build_options_mod); const lib = b.addLibrary(.{ .name = "zquic", - .root_module = b.createModule(.{ - .root_source_file = b.path("src/root.zig"), - .target = target, - .optimize = optimize, - }), + .root_module = lib_mod, }); b.installArtifact(lib); @@ -83,6 +97,8 @@ pub fn build(b: *std.Build) void { "src/quic/stream.zig", "src/quic/flow_control.zig", "src/quic/congestion/cubic.zig", + "src/quic/congestion/bbr.zig", + "src/quic/congestion/common.zig", "src/quic/transport_params.zig", "src/quic/loss_recovery.zig", "src/quic/tls.zig", @@ -104,7 +120,11 @@ pub fn build(b: *std.Build) void { .target = target, .optimize = optimize, }); + mod.addImport("build_options", build_options_mod); const t = b.addTest(.{ .root_module = mod }); + // Connection(16) is ~2.2 MB; Debug mode disables copy elision, creating + // ~16 MB of stack frames in accept() + test. 64 MB gives enough headroom. + t.stack_size = 64 * 1024 * 1024; const run = b.addRunArtifact(t); test_step.dependOn(&run.step); } @@ -119,5 +139,6 @@ pub fn build(b: *std.Build) void { server_test_mod.addImport("http3", http3_mod); server_test_mod.addImport("qpack", qpack_mod); const server_test = b.addTest(.{ .root_module = server_test_mod }); + server_test.stack_size = 64 * 1024 * 1024; test_step.dependOn(&b.addRunArtifact(server_test).step); } diff --git a/interop-test.sh b/interop-test.sh index 84ef833..8f3b13e 100755 --- a/interop-test.sh +++ b/interop-test.sh @@ -362,13 +362,14 @@ phase_verify_setup() { echo -e "${GREEN}✓${NC} zquic Docker image ready" # Verify implementations.json includes zquic - if grep -q '"zquic"' "$INTEROP_DIR/implementations.json"; then - echo -e "${GREEN}✓${NC} zquic registered in implementations.json" + local impl_file="$INTEROP_DIR/implementations_quic.json" + if grep -q '"zquic"' "$impl_file" 2>/dev/null; then + echo -e "${GREEN}✓${NC} zquic registered in implementations_quic.json" else - echo -e "${YELLOW}⚠${NC} zquic not in implementations.json, adding it..." - python3 << 'PYTHON_SCRIPT' -import json -config_file = '$INTEROP_DIR/implementations.json' + echo -e "${YELLOW}⚠${NC} zquic not in implementations_quic.json, adding it..." 
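+            # Pass the config path as argv[1]: the 'PYTHON_SCRIPT' heredoc is quoted,
+            # so shell variables are not expanded inside the Python body.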
+ python3 - "$impl_file" << 'PYTHON_SCRIPT' +import json, sys +config_file = sys.argv[1] with open(config_file, 'r') as f: config = json.load(f) if 'zquic' not in config: @@ -380,7 +381,7 @@ if 'zquic' not in config: with open(config_file, 'w') as f: json.dump(config, f, indent=2) PYTHON_SCRIPT - echo -e "${GREEN}✓${NC} zquic added to implementations.json" + echo -e "${GREEN}✓${NC} zquic added to implementations_quic.json" fi echo "" diff --git a/interop/Dockerfile b/interop/Dockerfile index b9584de..92e4a67 100644 --- a/interop/Dockerfile +++ b/interop/Dockerfile @@ -39,7 +39,7 @@ COPY . . RUN set -e; \ . /build_env.sh; \ - zig build -Doptimize=ReleaseSafe -Dtarget="${TARGET}" + zig build -Doptimize=ReleaseSafe -Dtarget="${TARGET}" -Dcongestion=bbr # Stage 2: Runtime image with network simulator support. FROM martenseemann/quic-network-simulator-endpoint:latest diff --git a/src/quic/congestion/bbr.zig b/src/quic/congestion/bbr.zig new file mode 100644 index 0000000..d196be4 --- /dev/null +++ b/src/quic/congestion/bbr.zig @@ -0,0 +1,1216 @@ +//! BBR v3 congestion control. +//! +//! Model-based congestion control that explicitly estimates bandwidth and RTT +//! to operate at the optimal BDP point. Implements the BBR v3 state machine +//! with loss-based inflight bounding. +//! +//! References: +//! - IETF draft-cardwell-iccrg-bbr-congestion-control +//! - Linux kernel net/ipv4/tcp_bbr.c v3 branch + +const std = @import("std"); +const common = @import("common.zig"); +const DeliveryRateSample = common.DeliveryRateSample; +const MSS = common.MSS; +const INITIAL_CWND = common.INITIAL_CWND; + +// --------------------------------------------------------------------------- +// BBR-specific constants +// --------------------------------------------------------------------------- + +/// Minimum cwnd: 4 packets (allows recovery even in ProbeRTT). +const BBR_MIN_CWND: u64 = 4 * MSS; +/// Startup pacing gain. The canonical BBR value is 2/ln(2) ≈ 2.89, which +/// is designed for deep-buffered paths. On shallow queues (1–2 BDP buffers, +/// typical of interop test networks and many real-world links), the 2.89× +/// gain causes immediate queue overflow, massive packet loss, and a delivery +/// rate death spiral from which BBR cannot recover. Using 1.25× probes 25% +/// above the current estimate — enough to discover bandwidth in 5–8 rounds +/// while keeping the queue contribution well within a 1-BDP buffer. +const BBR_STARTUP_PACING_GAIN: f64 = 1.25; +/// Drain pacing gain: 1/startup_gain. +const BBR_DRAIN_PACING_GAIN: f64 = 1.0 / BBR_STARTUP_PACING_GAIN; +/// ProbeBW UP phase pacing gain. +const BBR_PROBE_BW_UP_PACING_GAIN: f64 = 1.25; +/// ProbeBW DOWN phase pacing gain. +const BBR_PROBE_BW_DOWN_PACING_GAIN: f64 = 0.9; +/// cwnd gain during Startup and Drain. +const BBR_CWND_GAIN: f64 = 2.0; +/// ProbeRTT interval: re-probe RTT every 60 seconds. The standard BBR +/// value is 10s, but in our application-level architecture, the cwnd +/// reduction during ProbeRTT starves the delivery rate estimator, +/// causing a death spiral that prevents rate recovery. 60s gives +/// transfers time to complete before ProbeRTT triggers. +const BBR_PROBE_RTT_INTERVAL_NS: i64 = 60_000_000_000; +/// ProbeRTT hold duration: 200ms. +const BBR_PROBE_RTT_DURATION_NS: i64 = 200_000_000; +/// Bandwidth growth threshold: 25% growth required per round. +const BBR_FULL_BW_THRESHOLD: f64 = 1.25; +/// Rounds without growth before declaring pipe filled. 
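+/// Combined with BBR_FULL_BW_THRESHOLD, Startup is considered done once max_bw
+/// has grown by less than 25% for this many consecutive rounds.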
+const BBR_FULL_BW_COUNT: u8 = 3; + +// --------------------------------------------------------------------------- +// Windowed Filter +// --------------------------------------------------------------------------- + +/// Fixed-size windowed max filter. Tracks the maximum value over a sliding +/// window of `window` rounds. No allocator needed. +fn WindowedFilter(comptime T: type, comptime window: u64) type { + return struct { + const Self = @This(); + + val: [3]T, + round: [3]u64, + + pub fn init(initial: T) Self { + return .{ + .val = .{ initial, initial, initial }, + .round = .{ 0, 0, 0 }, + }; + } + + pub fn get(self: *const Self) T { + return self.val[0]; + } + + pub fn update(self: *Self, val: T, round: u64) void { + // If new value >= current best, it becomes the new best. + // Demote old entries rather than resetting all three to the + // same round — otherwise all three expire simultaneously + // and the filter collapses to whatever the current sample is. + if (val >= self.val[0]) { + self.val[2] = self.val[1]; + self.round[2] = self.round[1]; + self.val[1] = self.val[0]; + self.round[1] = self.round[0]; + self.val[0] = val; + self.round[0] = round; + return; + } + + // If current best has expired, promote. + if (round -| self.round[0] >= window) { + self.val[0] = val; + self.round[0] = round; + if (round -| self.round[1] >= window) { + self.val[1] = val; + self.round[1] = round; + } + if (round -| self.round[2] >= window) { + self.val[2] = val; + self.round[2] = round; + } + if (self.val[1] > self.val[0]) { + self.val[0] = self.val[1]; + self.round[0] = self.round[1]; + } + if (self.val[2] > self.val[0]) { + self.val[0] = self.val[2]; + self.round[0] = self.round[2]; + } + return; + } + + // New value fits as second-best or third-best. + if (val >= self.val[1]) { + self.val[1] = val; + self.round[1] = round; + self.val[2] = val; + self.round[2] = round; + } else if (val >= self.val[2]) { + self.val[2] = val; + self.round[2] = round; + } + } + + pub fn reset(self: *Self, val: T, round: u64) void { + self.val = .{ val, val, val }; + self.round = .{ round, round, round }; + } + }; +} + +// --------------------------------------------------------------------------- +// BBR v3 State Machine +// --------------------------------------------------------------------------- + +pub const State = enum { startup, drain, probe_bw, probe_rtt }; +pub const ProbeBwPhase = enum { down, cruise, refill, up }; + +pub const Bbr = struct { + // --- Public API fields --- + cwnd: u64, + pacing: common.Pacing, + + // --- State machine --- + state: State, + probe_bw_phase: ProbeBwPhase, + + // --- Bandwidth estimation --- + max_bw: u64, // bytes/sec (windowed max, cached from filter) + max_bw_filter: WindowedFilter(u64, 100), // large window to prevent max_bw collapse during + // loss recovery in our send-queue architecture (standard BBR uses 2) + bw_hi: u64, // upper bound from loss + + // --- RTT estimation --- + min_rtt_ns: u64, // nanoseconds (windowed min, ~10s) + min_rtt_stamp_ns: i64, // when min_rtt was last updated + probe_rtt_done_ns: ?i64, // when ProbeRTT 200ms hold ends + probe_rtt_round_done: bool, + + // --- Round tracking --- + round_count: u64, + + // --- Inflight bounds (BBR v3 loss-based) --- + inflight_hi: u64, // upper inflight bound + + // --- Loss tracking --- + loss_in_round: u64, + bytes_in_round: u64, + + // --- Startup state --- + full_bw: u64, // BW at last plateau check + full_bw_count: u8, // rounds without 25% growth + filled_pipe: bool, + + // --- Gains (current 
multipliers) --- + pacing_gain: f64, + cwnd_gain: f64, + + // --- Extra ACKed tracking (for cwnd headroom) --- + extra_acked: u64, // cached from filter + extra_acked_filter: WindowedFilter(u64, 2), + extra_acked_in_interval: u64, + + // --- ProbeBW cruise timing --- + probe_bw_rounds: u64, // rounds spent in current ProbeBW phase + probe_up_rounds: u64, // rounds in UP phase + + pub fn init() Bbr { + // Bootstrap pacing rate: initial_cwnd / initial_rtt (no startup gain). + // Using startup_gain (2.885×) here causes the initial burst to overflow + // shallow queues (25 packets fill in 12ms at 4.2 MB/s). Without the + // gain, rate ≈ 1.45 MB/s which stays close to typical link rates. + // BBR still discovers capacity through cwnd doubling each round. + const initial_rate: u64 = @intFromFloat( + @as(f64, @floatFromInt(INITIAL_CWND)) * 1_000_000_000.0 / + @as(f64, @floatFromInt(10_000_000)), // K_INITIAL_RTT_NS = 10ms + ); + return .{ + .cwnd = INITIAL_CWND, + .pacing = .{ .rate = initial_rate, .tokens = INITIAL_CWND, .last_refill_ns = 0 }, + .state = .startup, + .probe_bw_phase = .down, + .max_bw = 0, + .max_bw_filter = WindowedFilter(u64, 100).init(0), + .bw_hi = std.math.maxInt(u64), + .min_rtt_ns = std.math.maxInt(u64), + .min_rtt_stamp_ns = 0, + .probe_rtt_done_ns = null, + .probe_rtt_round_done = false, + .round_count = 0, + .inflight_hi = std.math.maxInt(u64), + .loss_in_round = 0, + .bytes_in_round = 0, + .full_bw = 0, + .full_bw_count = 0, + .filled_pipe = false, + .pacing_gain = BBR_STARTUP_PACING_GAIN, + .cwnd_gain = BBR_CWND_GAIN, + .extra_acked = 0, + .extra_acked_filter = WindowedFilter(u64, 2).init(0), + .extra_acked_in_interval = 0, + .probe_bw_rounds = 0, + .probe_up_rounds = 0, + }; + } + + /// True when the congestion window allows sending. + pub fn canSend(self: *const Bbr) bool { + return self.cwnd > 0; + } + + /// Disable pacing during Startup: the application-level token bucket + /// can't match the bursty send pattern needed for bandwidth discovery. + /// After the initial burst depletes tokens, the pacing gate locks the + /// server to 1 packet per ACK (token drip-feed), preventing cwnd from + /// filling. Bypassing pacing lets Startup send at cwnd speed — like + /// TCP slow start — so bandwidth is discovered in 4–6 RTTs. Once + /// filled_pipe is set (Startup complete), pacing is enforced. + pub fn shouldPace(self: *const Bbr) bool { + return self.filled_pipe; + } + + /// Called when an ACK is received with a delivery rate sample. + pub fn onAckReceived(self: *Bbr, sample: DeliveryRateSample, now_ns: i64) void { + // Increment round count (needed for filter windows), but DON'T reset + // per-round loss counters yet — the state machine evaluates them first. + if (sample.round_start) { + self.round_count += 1; + } + + // Update bandwidth estimate (ignore app-limited samples unless they exceed max). + if (!sample.is_app_limited or sample.delivery_rate > self.max_bw) { + self.max_bw_filter.update(sample.delivery_rate, self.round_count); + self.max_bw = self.max_bw_filter.get(); + } + + // Update min RTT. Reject the RTT estimator's bootstrap value + // (K_INITIAL_RTT = 10ms) — the first ACK carries this placeholder + // before a real measurement exists, and accepting it poisons min_rtt + // making BDP far too small (Drain never exits, throughput collapses). 
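+        // Example: on the interop runner's 10 Mbps / 30 ms path, accepting the
+        // 10 ms placeholder would cut the BDP estimate to a third of its true value.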
+ const K_INITIAL_RTT_NS: u64 = 10_000_000; + if (sample.rtt_ns > 0 and sample.rtt_ns != K_INITIAL_RTT_NS and sample.rtt_ns < self.min_rtt_ns) { + self.min_rtt_ns = sample.rtt_ns; + self.min_rtt_stamp_ns = now_ns; + } + + // Update extra ACKed for cwnd headroom. + self.updateExtraAcked(sample); + + // State machine transitions (evaluates accumulated round loss data). + switch (self.state) { + .startup => self.updateStartup(sample), + .drain => self.updateDrain(sample), + .probe_bw => self.updateProbeBw(sample), + .probe_rtt => self.updateProbeRtt(sample, now_ns), + } + + // NOW reset per-round counters and start accumulating for the new round. + if (sample.round_start) { + self.loss_in_round = 0; + self.bytes_in_round = 0; + } + self.loss_in_round += sample.bytes_lost; + self.bytes_in_round += sample.bytes_acked + sample.bytes_lost; + + // Update pacing rate and cwnd. + self.updatePacingRate(); + self.updateCwnd(sample.bytes_acked); + + // Check if we should enter ProbeRTT (only from ProbeBW). + if (self.state == .probe_bw) { + self.checkProbeRtt(now_ns); + } + } + + /// Called on packet loss. BBR v3 uses loss for inflight bounding. + pub fn onPacketLost(_: *Bbr, _: u64, _: i64) void { + // Loss-based bounding is handled in onAckReceived via sample.bytes_lost. + // BBR v3 does not do multiplicative decrease on loss events. + } + + /// Called on persistent congestion: reset to Startup, clear estimates. + pub fn onPersistentCongestion(self: *Bbr) void { + self.state = .startup; + self.filled_pipe = false; + self.full_bw = 0; + self.full_bw_count = 0; + self.cwnd = BBR_MIN_CWND; + self.pacing_gain = BBR_STARTUP_PACING_GAIN; + self.cwnd_gain = BBR_CWND_GAIN; + // Preserve max_bw and its filter so the pacing rate stays at a + // reasonable level during recovery. Resetting to 0 with the + // shallow-queue startup gain (1.25×) causes an extremely slow + // ramp — dozens of rounds to rediscover 10 Mbps from near-zero. + // The pacing floor (INITIAL_CWND / min_rtt) provides a lower bound, + // but the old max_bw gives a much better starting point. + self.bw_hi = std.math.maxInt(u64); + self.inflight_hi = BBR_MIN_CWND; + self.extra_acked_filter.reset(0, 0); + self.extra_acked = 0; + self.extra_acked_in_interval = 0; + // Reset per-round and phase counters to prevent stale data. + self.loss_in_round = 0; + self.bytes_in_round = 0; + self.probe_bw_rounds = 0; + self.probe_up_rounds = 0; + // Clear stale RTT — path may have changed fundamentally. + self.min_rtt_ns = std.math.maxInt(u64); + self.min_rtt_stamp_ns = 0; + // Reset pacing to allow initial burst on the new path. + self.pacing = .{}; + } + + /// Called on ECN CE marks. BBR reduces inflight bounding, NOT multiplicative cwnd decrease. + pub fn onEcnCe(self: *Bbr, _: u64, _: i64) void { + // Treat ECN as a bounding signal: reduce inflight_hi. + self.inflight_hi = @max(applyBeta(self.inflight_hi), @max(self.bdp(), BBR_MIN_CWND)); + } + + /// Refill pacing tokens. Delegates to shared Pacing. + pub fn pacingRefill(self: *Bbr, now_ns: i64) u64 { + return self.pacing.refill(self.cwnd, now_ns); + } + + /// Consume pacing tokens after sending a packet. 
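+    /// Tokens saturate at zero (`-|=`), so consuming more than available is safe.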
+ pub fn pacingConsume(self: *Bbr, bytes: u64) void { + self.pacing.consume(bytes); + } + + // ----------------------------------------------------------------------- + // Internal: BDP computation + // ----------------------------------------------------------------------- + + fn bdp(self: *const Bbr) u64 { + if (self.min_rtt_ns == std.math.maxInt(u64) or self.max_bw == 0) { + return INITIAL_CWND; + } + // BDP = max_bw × min_rtt (convert ns to seconds). + const result: u64 = @intCast(@min( + @as(u128, self.max_bw) *| @as(u128, self.min_rtt_ns) / 1_000_000_000, + std.math.maxInt(u64), + )); + return @max(result, BBR_MIN_CWND); + } + + // ----------------------------------------------------------------------- + // Internal: Pacing rate + // ----------------------------------------------------------------------- + + fn updatePacingRate(self: *Bbr) void { + if (self.max_bw == 0) return; + // Apply bw_hi bound (from loss bounding). + const bw = @min(self.max_bw, self.bw_hi); + const rate_f = @as(f64, @floatFromInt(bw)) * self.pacing_gain; + const rate: u64 = if (rate_f >= @as(f64, @floatFromInt(std.math.maxInt(u64)))) + std.math.maxInt(u64) + else + @intFromFloat(rate_f); + // Floor: never pace slower than initial_cwnd / initial_rtt. + // Without this floor, a transient delivery rate collapse (e.g., + // during loss recovery) creates a death spiral where the low + // pacing rate prevents sending, which prevents ACKs, which + // prevents the rate from recovering. + const min_rate: u64 = @intFromFloat( + @as(f64, @floatFromInt(INITIAL_CWND)) * 1_000_000_000.0 / + @as(f64, @floatFromInt(@max(self.min_rtt_ns, 1))), + ); + self.pacing.rate = @max(rate, min_rate); + } + + // ----------------------------------------------------------------------- + // Internal: cwnd + // ----------------------------------------------------------------------- + + fn updateCwnd(self: *Bbr, bytes_acked: u64) void { + if (self.state == .probe_rtt) { + self.cwnd = BBR_MIN_CWND; + return; + } + + // During Drain, use BDP × cwnd_gain as the target (same as ProbeBW) + // so bytes_in_flight can actually drop below BDP, allowing Drain to + // exit. Previously cwnd was locked to inflight_hi (the Startup peak), + // which kept bif far above BDP and trapped BBR in Drain permanently. + + // Target = BDP × cwnd_gain + extra_acked headroom. + var target_f: f64 = @as(f64, @floatFromInt(self.bdp())) * self.cwnd_gain + + @as(f64, @floatFromInt(self.extra_acked)); + + // In ProbeBW, cap by inflight_hi — except during UP phase where we + // intentionally probe above the current bound to discover more capacity. + if (self.state == .probe_bw and self.probe_bw_phase != .up) { + target_f = @min(target_f, @as(f64, @floatFromInt(self.inflight_hi))); + } + + const max_u64_f = @as(f64, @floatFromInt(std.math.maxInt(u64))); + const target: u64 = if (target_f >= max_u64_f) std.math.maxInt(u64) else @intFromFloat(@max(target_f, 0)); + const target_clamped = @max(target, BBR_MIN_CWND); + + if (self.filled_pipe) { + // Post-startup: grow toward target, don't exceed it. + self.cwnd = @min(self.cwnd +| bytes_acked, target_clamped); + } else { + // Startup: grow quickly (saturating to prevent overflow). 
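+            // Adding bytes_acked on each ACK roughly doubles cwnd once per RTT,
+            // the same exponential ramp as TCP slow start.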
+ self.cwnd +|= bytes_acked; + } + self.cwnd = @max(self.cwnd, BBR_MIN_CWND); + } + + // ----------------------------------------------------------------------- + // Internal: Startup state + // ----------------------------------------------------------------------- + + fn updateStartup(self: *Bbr, sample: DeliveryRateSample) void { + // Check loss on EVERY ACK, not just round_start. CUBIC detects loss + // immediately and reduces cwnd; BBR must do the same to avoid growing + // cwnd from 58 KB to 200+ KB during a single lossy round. Without + // this, the Startup burst overwhelms shallow queues and recovery + // from 200+ lost packets exceeds the 64 KB stream buffer. + if (self.isExcessiveLoss()) { + self.enterDrain(); + return; + } + + if (!sample.round_start) return; + + // Check for bandwidth plateau. + if (self.max_bw >= @as(u64, @intFromFloat(@as(f64, @floatFromInt(self.full_bw)) * BBR_FULL_BW_THRESHOLD))) { + // Still growing — reset counter. + self.full_bw = self.max_bw; + self.full_bw_count = 0; + } else { + self.full_bw_count += 1; + } + + if (self.full_bw_count >= BBR_FULL_BW_COUNT) { + self.enterDrain(); + } + } + + fn enterDrain(self: *Bbr) void { + self.state = .drain; + self.filled_pipe = true; + // Use 1.0× pacing gain during Drain instead of 0.346×. The cwnd + // target (BDP × cwnd_gain) already limits inflight; the ultra-low + // Drain rate (0.346×) makes retransmission recovery 6× slower than + // CUBIC's post-loss rate, causing the server to appear dead. + self.pacing_gain = 1.0; + self.cwnd_gain = BBR_CWND_GAIN; + // If Startup exited due to loss, the cwnd is massively inflated. + // Set inflight_hi to BDP so cwnd drains properly and ProbeBW starts + // with a reasonable bound. Without this, inflight_hi stays at the + // Startup peak and cwnd never converges to the actual capacity. + if (self.isExcessiveLoss()) { + self.inflight_hi = @max(self.bdp(), BBR_MIN_CWND); + } else { + self.inflight_hi = self.cwnd; + } + } + + // ----------------------------------------------------------------------- + // Internal: Drain state + // ----------------------------------------------------------------------- + + fn updateDrain(self: *Bbr, sample: DeliveryRateSample) void { + // Apply loss bounding during Drain — continued loss from the Startup + // burst should reduce inflight_hi toward BDP, not stay at the peak. + if (sample.round_start and self.isExcessiveLoss()) { + self.applyLossBounding(true); + } + // Exit Drain when post-ACK inflight ≤ BDP. Use prior_inflight + // minus bytes_acked: prior_inflight is captured BEFORE the ACK + // reduces bytes_in_flight, so it includes the just-ACKed data. + // Subtracting gives the actual pipe depth after draining. + if (sample.prior_inflight -| sample.bytes_acked <= self.bdp()) { + self.enterProbeBw(.down); + } + } + + // ----------------------------------------------------------------------- + // Internal: ProbeBW state (steady state) + // ----------------------------------------------------------------------- + + fn enterProbeBw(self: *Bbr, phase: ProbeBwPhase) void { + self.state = .probe_bw; + self.probe_bw_phase = phase; + self.probe_bw_rounds = 0; + self.probe_up_rounds = 0; + // Use cwnd_gain = 2.0 to target 2×BDP — provides headroom for + // retransmissions and ACK aggregation in real networks. + self.cwnd_gain = BBR_CWND_GAIN; + self.pacing_gain = switch (phase) { + .down => 1.0, // Use 1.0× instead of 0.9× — on shallow queues, + // 0.9× is too slow for loss recovery and causes server stalls. 
+ .cruise, .refill => 1.0, + .up => BBR_PROBE_BW_UP_PACING_GAIN, + }; + if (phase == .refill) { + // Reset bw_hi before probing up so previous reductions don't persist. + self.bw_hi = std.math.maxInt(u64); + } + } + + fn updateProbeBw(self: *Bbr, sample: DeliveryRateSample) void { + // Per-round loss bounding (applies to all phases). + const had_excessive_loss = sample.round_start and self.isExcessiveLoss(); + if (sample.round_start) { + self.applyLossBounding(had_excessive_loss); + self.probe_bw_rounds += 1; + } + + switch (self.probe_bw_phase) { + .down => { + // Same post-ACK inflight rationale as Drain exit. + if (sample.prior_inflight -| sample.bytes_acked <= self.bdp()) { + self.enterProbeBw(.cruise); + } + }, + .cruise => { + if (self.probe_bw_rounds >= 4) { + self.enterProbeBw(.refill); + } + }, + .refill => { + if (sample.round_start and self.probe_bw_rounds >= 1) { + self.enterProbeBw(.up); + } + }, + .up => { + if (sample.round_start) self.probe_up_rounds += 1; + // applyLossBounding already reduced inflight_hi; just transition on loss. + if (had_excessive_loss) { + self.enterProbeBw(.down); + } else if (self.probe_up_rounds >= 2) { + self.inflight_hi = @max(self.inflight_hi, sample.prior_inflight); + self.enterProbeBw(.down); + } + }, + } + } + + fn applyLossBounding(self: *Bbr, excessive_loss: bool) void { + if (excessive_loss) { + self.bw_hi = @max(applyBeta(self.bw_hi), self.max_bw); + // Floor at BDP to prevent spiral: repeated 0.7× reductions after + // blackhole recovery would collapse inflight_hi to near-zero. + self.inflight_hi = @max(applyBeta(self.inflight_hi), @max(self.bdp(), BBR_MIN_CWND)); + } + } + + // ----------------------------------------------------------------------- + // Internal: ProbeRTT state + // ----------------------------------------------------------------------- + + fn checkProbeRtt(self: *Bbr, now_ns: i64) void { + if (self.state == .probe_rtt) return; + if (self.min_rtt_ns == std.math.maxInt(u64)) return; + + // Enter ProbeRTT if min_rtt hasn't been updated for BBR_PROBE_RTT_INTERVAL_NS. + if (now_ns - self.min_rtt_stamp_ns >= BBR_PROBE_RTT_INTERVAL_NS) { + self.enterProbeRtt(); + } + } + + fn enterProbeRtt(self: *Bbr) void { + self.state = .probe_rtt; + self.pacing_gain = 1.0; + self.cwnd_gain = 1.0; + self.probe_rtt_done_ns = null; + self.probe_rtt_round_done = false; + } + + fn updateProbeRtt(self: *Bbr, sample: DeliveryRateSample, now_ns: i64) void { + // Wait for inflight to drain to min cwnd. + if (self.probe_rtt_done_ns == null) { + if (sample.prior_inflight <= BBR_MIN_CWND) { + // Inflight drained — start 200ms timer. + self.probe_rtt_done_ns = now_ns + BBR_PROBE_RTT_DURATION_NS; + self.probe_rtt_round_done = false; + } + return; + } + + // Wait for one full round. + if (sample.round_start) { + self.probe_rtt_round_done = true; + } + + // Exit when both 200ms elapsed AND one round completed. + if (self.probe_rtt_round_done and now_ns >= self.probe_rtt_done_ns.?) { + // Update min_rtt timestamp. + self.min_rtt_stamp_ns = now_ns; + self.exitProbeRtt(); + } + } + + fn exitProbeRtt(self: *Bbr) void { + if (!self.filled_pipe) { + self.state = .startup; + self.pacing_gain = BBR_STARTUP_PACING_GAIN; + self.cwnd_gain = BBR_CWND_GAIN; + } else { + self.enterProbeBw(.cruise); + } + } + + // ----------------------------------------------------------------------- + // Internal: Helpers + // ----------------------------------------------------------------------- + + /// True if >2% of bytes in the current round were lost. 
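+    /// For example, 400 bytes lost out of 15_000 in a round (~2.7%) counts as excessive.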
+ /// Uses `loss * 50 > bytes` (equivalent to `loss / bytes > 0.02`) to stay in u64. + fn isExcessiveLoss(self: *const Bbr) bool { + return self.bytes_in_round > 0 and + self.loss_in_round *| 50 > self.bytes_in_round; + } + + /// Apply BBR_BETA (0.7) reduction to a u64 value using integer arithmetic. + fn applyBeta(val: u64) u64 { + return val *| 7 / 10; + } + + // ----------------------------------------------------------------------- + // Internal: Extra ACKed tracking + // ----------------------------------------------------------------------- + + fn updateExtraAcked(self: *Bbr, sample: DeliveryRateSample) void { + // Reset interval on round boundary unconditionally (even if early returns below skip accumulation). + if (sample.round_start) { + self.extra_acked_filter.update(self.extra_acked_in_interval, self.round_count); + self.extra_acked = self.extra_acked_filter.get(); + self.extra_acked_in_interval = 0; + } + + if (sample.bytes_acked == 0) return; + if (self.max_bw == 0 or sample.rtt_ns == 0) return; + + // Expected delivery = max_bw × rtt_sample. + const expected: u64 = @intCast(@min( + @as(u128, self.max_bw) *| @as(u128, sample.rtt_ns) / 1_000_000_000, + std.math.maxInt(u64), + )); + + if (sample.bytes_acked > expected) { + self.extra_acked_in_interval += sample.bytes_acked - expected; + } + } +}; + +// --------------------------------------------------------------------------- +// Tests +// --------------------------------------------------------------------------- + +test "bbr: init sets startup state" { + const b = Bbr.init(); + const testing = std.testing; + try testing.expectEqual(State.startup, b.state); + try testing.expectEqual(INITIAL_CWND, b.cwnd); + try testing.expect(b.pacing_gain > 1.0); + try testing.expect(!b.filled_pipe); +} + +test "bbr: canSend" { + var b = Bbr.init(); + const testing = std.testing; + try testing.expect(b.canSend()); + b.cwnd = 0; + try testing.expect(!b.canSend()); +} + +test "bbr: bdp computation" { + var b = Bbr.init(); + // Set known values: 1 MB/s, 100ms RTT → BDP = 100,000 bytes. + b.max_bw = 1_000_000; + b.min_rtt_ns = 100_000_000; // 100ms + const expected: u64 = 100_000; // 1M × 0.1s + try std.testing.expectEqual(expected, b.bdp()); +} + +test "bbr: bdp returns initial cwnd when no samples" { + const b = Bbr.init(); + try std.testing.expectEqual(INITIAL_CWND, b.bdp()); +} + +test "bbr: startup exits on bandwidth plateau" { + var b = Bbr.init(); + b.max_bw = 1000; + b.full_bw = 1000; // Same as max_bw — no growth. + b.min_rtt_ns = 50_000_000; + + // Simulate 3 rounds without 25% growth. 
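+    // max_bw equals full_bw, so each round_start call bumps full_bw_count;
+    // the third one reaches BBR_FULL_BW_COUNT and triggers enterDrain().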
+ var i: u8 = 0; + while (i < 3) : (i += 1) { + b.updateStartup(.{ + .delivery_rate = 1000, + .round_start = true, + }); + } + try std.testing.expectEqual(State.drain, b.state); + try std.testing.expect(b.filled_pipe); +} + +test "bbr: startup exits on excessive loss" { + var b = Bbr.init(); + b.max_bw = 1_000_000; + b.full_bw = 1_000_000; + b.min_rtt_ns = 50_000_000; + b.bytes_in_round = 10000; + b.loss_in_round = 300; // 3% loss > 2% threshold + + b.updateStartup(.{ .delivery_rate = 1_000_000, .round_start = true }); + try std.testing.expectEqual(State.drain, b.state); +} + +test "bbr: drain exits when inflight <= bdp" { + var b = Bbr.init(); + b.state = .drain; + b.filled_pipe = true; + b.max_bw = 1_000_000; + b.min_rtt_ns = 100_000_000; // BDP = 100,000 + + b.updateDrain(.{ .prior_inflight = 90_000 }); // below BDP + try std.testing.expectEqual(State.probe_bw, b.state); + try std.testing.expectEqual(ProbeBwPhase.down, b.probe_bw_phase); +} + +test "bbr: probe_bw phase cycling" { + var b = Bbr.init(); + b.state = .probe_bw; + b.filled_pipe = true; + b.max_bw = 1_000_000; + b.min_rtt_ns = 50_000_000; + + // DOWN → CRUISE when inflight <= bdp + b.probe_bw_phase = .down; + b.pacing_gain = BBR_PROBE_BW_DOWN_PACING_GAIN; + b.updateProbeBw(.{ .prior_inflight = 1000, .round_start = true }); + try std.testing.expectEqual(ProbeBwPhase.cruise, b.probe_bw_phase); + + // CRUISE → REFILL after 4 rounds + b.probe_bw_rounds = 0; + var i: u8 = 0; + while (i < 4) : (i += 1) { + b.updateProbeBw(.{ .prior_inflight = 50_000, .round_start = true }); + } + try std.testing.expectEqual(ProbeBwPhase.refill, b.probe_bw_phase); + + // REFILL → UP after 1 round + b.probe_bw_rounds = 0; + b.updateProbeBw(.{ .prior_inflight = 50_000, .round_start = true }); + b.updateProbeBw(.{ .prior_inflight = 50_000, .round_start = true }); + try std.testing.expectEqual(ProbeBwPhase.up, b.probe_bw_phase); +} + +test "bbr: probe_rtt entry after interval" { + var b = Bbr.init(); + b.state = .probe_bw; + b.filled_pipe = true; + b.max_bw = 1_000_000; + b.min_rtt_ns = 50_000_000; + b.min_rtt_stamp_ns = 0; + + // After probe_rtt interval, should enter ProbeRTT. + b.checkProbeRtt(BBR_PROBE_RTT_INTERVAL_NS + 1); + try std.testing.expectEqual(State.probe_rtt, b.state); +} + +test "bbr: probe_rtt exit after 200ms + 1 round" { + var b = Bbr.init(); + b.state = .probe_rtt; + b.filled_pipe = true; + b.min_rtt_ns = 50_000_000; + b.max_bw = 1_000_000; + b.probe_rtt_done_ns = null; + b.probe_rtt_round_done = false; + + // Step 1: inflight drains to min cwnd — starts 200ms timer. + b.updateProbeRtt(.{ .prior_inflight = BBR_MIN_CWND, .round_start = false }, 1000); + try std.testing.expect(b.probe_rtt_done_ns != null); + try std.testing.expect(!b.probe_rtt_round_done); + + // Step 2: round completes. + b.updateProbeRtt(.{ .prior_inflight = BBR_MIN_CWND, .round_start = true }, 1000 + 100_000_000); + try std.testing.expect(b.probe_rtt_round_done); + + // Step 3: 200ms elapsed. + b.updateProbeRtt(.{ .prior_inflight = BBR_MIN_CWND, .round_start = true }, 1000 + BBR_PROBE_RTT_DURATION_NS + 1); + try std.testing.expectEqual(State.probe_bw, b.state); +} + +test "bbr: windowed filter tracks max" { + const Filter = WindowedFilter(u64, 2); + var f = Filter.init(0); + f.update(100, 1); + try std.testing.expectEqual(@as(u64, 100), f.get()); + f.update(200, 2); + try std.testing.expectEqual(@as(u64, 200), f.get()); + // Lower value doesn't displace max. 
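+    // A lower sample only lands in the second- or third-best slot.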
+ f.update(50, 2); + try std.testing.expectEqual(@as(u64, 200), f.get()); +} + +test "bbr: windowed filter expires old values" { + const Filter = WindowedFilter(u64, 2); + var f = Filter.init(0); + f.update(200, 1); + try std.testing.expectEqual(@as(u64, 200), f.get()); + // After window expires (round 4, window=2), old value should be replaced. + f.update(100, 4); + try std.testing.expectEqual(@as(u64, 100), f.get()); +} + +test "bbr: windowed filter demotes on new best, preventing simultaneous expiry" { + const Filter = WindowedFilter(u64, 10); + var f = Filter.init(0); + // Startup peak at round 5. + f.update(1000, 5); + try std.testing.expectEqual(@as(u64, 1000), f.get()); + // Slightly higher peak at round 8 (inflated Startup sample). + f.update(1050, 8); + try std.testing.expectEqual(@as(u64, 1050), f.get()); + // ProbeBW DOWN samples at 950 (below peak) — enter as second/third. + f.update(950, 12); + f.update(960, 15); + // After 10 rounds from peak (round 18): peak at round 8 expires. + // The demoted second-best (1000 from round 5) also expired (18-5=13>=10). + // But 960 from round 15 is still valid (18-15=3<10). + f.update(700, 18); + // Without demotion fix: all three expire → get() = 700. + // With demotion fix: 960 (round 15) survives → get() = 960. + try std.testing.expectEqual(@as(u64, 960), f.get()); +} + +test "bbr: loss bounding reduces inflight_hi" { + var b = Bbr.init(); + b.max_bw = 1_000_000; + b.min_rtt_ns = 50_000_000; + b.inflight_hi = 100_000; + b.bw_hi = 2_000_000; + + // 5% loss rate (> 2% threshold). + b.bytes_in_round = 10000; + b.loss_in_round = 500; + + const old_hi = b.inflight_hi; + b.applyLossBounding(true); + try std.testing.expect(b.inflight_hi < old_hi); +} + +test "bbr: pacing refill with known rate" { + var b = Bbr.init(); + b.pacing.rate = 1_000_000; // 1 MB/s + b.pacing.tokens = 0; + b.pacing.last_refill_ns = 1_000_000_000; // 1s + + const tokens = b.pacingRefill(1_001_000_000); // 1ms later + // 1 MB/s × 0.001s = 1000 bytes. + try std.testing.expectEqual(@as(u64, 1000), tokens); +} + +test "bbr: pacing consume" { + var b = Bbr.init(); + b.pacing.tokens = 5000; + b.pacingConsume(3000); + try std.testing.expectEqual(@as(u64, 2000), b.pacing.tokens); +} + +test "bbr: persistent congestion resets to startup" { + var b = Bbr.init(); + b.state = .probe_bw; + b.filled_pipe = true; + b.max_bw = 1_000_000; + b.cwnd = 100_000; + b.onPersistentCongestion(); + try std.testing.expectEqual(State.startup, b.state); + try std.testing.expect(!b.filled_pipe); + try std.testing.expectEqual(BBR_MIN_CWND, b.cwnd); + // max_bw is preserved so pacing stays reasonable during recovery. + try std.testing.expectEqual(@as(u64, 1_000_000), b.max_bw); +} + +test "bbr: ecn ce reduces inflight_hi" { + var b = Bbr.init(); + b.max_bw = 1_000_000; + b.min_rtt_ns = 50_000_000; + b.inflight_hi = 200_000; + + const old_hi = b.inflight_hi; + b.onEcnCe(1, 0); + try std.testing.expect(b.inflight_hi < old_hi); +} + +test "bbr: startup grows cwnd on ack" { + var b = Bbr.init(); + const initial = b.cwnd; + b.min_rtt_ns = 50_000_000; + b.onAckReceived(.{ + .delivery_rate = 500_000, + .rtt_ns = 50_000_000, + .bytes_acked = MSS, + .round_start = false, + }, 1_000_000_000); + // Startup grows cwnd by bytes_acked. + try std.testing.expect(b.cwnd > initial); +} + +test "bbr: full state machine startup to probe_bw" { + var b = Bbr.init(); + b.min_rtt_ns = 50_000_000; + b.min_rtt_stamp_ns = 0; + + // Simulate startup with growing bandwidth. 
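+    // 50% growth per round clears the 25% plateau threshold, so full_bw_count
+    // stays at 0 and BBR remains in Startup while bandwidth keeps rising.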
+ var bw: u64 = 100_000; + var round: u64 = 0; + while (b.state == .startup and round < 20) : (round += 1) { + bw = bw * 3 / 2; // 50% growth per round. + b.onAckReceived(.{ + .delivery_rate = bw, + .rtt_ns = 50_000_000, + .bytes_acked = 10 * MSS, + .round_start = true, + }, @intCast(round * 50_000_000)); + } + + // BW stabilizes — should plateau and exit startup. + const stable_bw = bw; + while (b.state == .startup and round < 40) : (round += 1) { + b.onAckReceived(.{ + .delivery_rate = stable_bw, + .rtt_ns = 50_000_000, + .bytes_acked = 10 * MSS, + .round_start = true, + }, @intCast(round * 50_000_000)); + } + // Should have transitioned through drain. + try std.testing.expect(b.filled_pipe); + + // Drain until inflight ≤ BDP. + while (b.state == .drain and round < 60) : (round += 1) { + b.onAckReceived(.{ + .delivery_rate = stable_bw, + .rtt_ns = 50_000_000, + .bytes_acked = 10 * MSS, + .prior_inflight = 1000, // way below BDP + .round_start = true, + }, @intCast(round * 50_000_000)); + } + try std.testing.expectEqual(State.probe_bw, b.state); +} + +// --------------------------------------------------------------------------- +// Regression tests (bugs found during code review) +// --------------------------------------------------------------------------- + +test "bbr: regression — persistent congestion resets filters with round 0" { + // Bug: onPersistentCongestion reset round_count to 0 AFTER calling + // max_bw_filter.reset(0, self.round_count), storing a stale round number. + // Future filter updates would not expire the old value for many rounds. + var b = Bbr.init(); + b.round_count = 100; + b.max_bw = 500_000; + b.max_bw_filter.update(500_000, 100); + + b.onPersistentCongestion(); + + // round_count and max_bw_filter are preserved so pacing stays reasonable. + try std.testing.expectEqual(@as(u64, 100), b.round_count); + // Filter retains the pre-congestion value. + try std.testing.expectEqual(@as(u64, 500_000), b.max_bw_filter.get()); + // A higher value updates normally. + b.max_bw_filter.update(600_000, 101); + try std.testing.expectEqual(@as(u64, 600_000), b.max_bw_filter.get()); +} + +test "bbr: regression — persistent congestion resets min_rtt and pacing" { + // Bug: onPersistentCongestion did not reset min_rtt_ns, min_rtt_stamp_ns, + // pacing state, or extra_acked_in_interval. Stale values leaked into + // the new Startup phase. + var b = Bbr.init(); + b.min_rtt_ns = 10_000_000; + b.min_rtt_stamp_ns = 5_000_000_000; + b.pacing.rate = 1_000_000; + b.pacing.tokens = 50_000; + b.extra_acked_in_interval = 9999; + + b.onPersistentCongestion(); + + try std.testing.expectEqual(std.math.maxInt(u64), b.min_rtt_ns); + try std.testing.expectEqual(@as(i64, 0), b.min_rtt_stamp_ns); + try std.testing.expectEqual(@as(u64, 0), b.pacing.rate); + try std.testing.expectEqual(INITIAL_CWND, b.pacing.tokens); // default Pacing init + try std.testing.expectEqual(@as(u64, 0), b.extra_acked_in_interval); +} + +test "bbr: regression — no double inflight_hi reduction in ProbeBW UP" { + // Bug: checkLossBounding reduced inflight_hi, then the UP branch applied + // applyBeta again, double-reducing it. + var b = Bbr.init(); + b.state = .probe_bw; + b.probe_bw_phase = .up; + b.filled_pipe = true; + b.max_bw = 1_000_000; + b.min_rtt_ns = 50_000_000; + b.inflight_hi = 200_000; + b.bw_hi = std.math.maxInt(u64); + + // Simulate excessive loss in a round. + b.bytes_in_round = 10000; + b.loss_in_round = 500; // 5% > 2% + + // One round_start ACK should reduce inflight_hi exactly once. 
+ b.updateProbeBw(.{ .prior_inflight = 100_000, .round_start = true }); + + // After single beta reduction: 200_000 * 7/10 = 140_000. + // BDP = 1M * 50ms = 50_000. So max(140_000, 50_000) = 140_000. + const expected = @max(Bbr.applyBeta(200_000), @as(u64, 50_000)); + try std.testing.expectEqual(expected, b.inflight_hi); + // Must have transitioned to DOWN. + try std.testing.expectEqual(ProbeBwPhase.down, b.probe_bw_phase); +} + +test "bbr: regression — bw_hi restored in ProbeBW refill" { + // Bug: bw_hi was only reduced, never restored. Once checkLossBounding + // reduced it, the pacing rate was permanently suppressed. + var b = Bbr.init(); + b.state = .probe_bw; + b.filled_pipe = true; + b.max_bw = 1_000_000; + b.min_rtt_ns = 50_000_000; + b.bw_hi = 500_000; // previously reduced + + // Entering refill should restore bw_hi to maxInt. + b.enterProbeBw(.refill); + try std.testing.expectEqual(std.math.maxInt(u64), b.bw_hi); +} + +test "bbr: regression — cwnd_gain is 2.0 in ProbeBW steady state" { + // cwnd_gain = 2.0 in ProbeBW provides 2×BDP headroom for retransmissions + // and ACK aggregation. + var b = Bbr.init(); + b.enterProbeBw(.cruise); + try std.testing.expectEqual(BBR_CWND_GAIN, b.cwnd_gain); + b.enterProbeBw(.down); + try std.testing.expectEqual(BBR_CWND_GAIN, b.cwnd_gain); + b.enterProbeBw(.up); + try std.testing.expectEqual(BBR_CWND_GAIN, b.cwnd_gain); + b.enterProbeBw(.refill); + try std.testing.expectEqual(BBR_CWND_GAIN, b.cwnd_gain); +} + +test "bbr: regression — inflight_hi initialized to maxInt" { + // Bug: inflight_hi was initialized to INITIAL_CWND, which would cap + // cwnd in ProbeBW before enterDrain had a chance to set it properly. + const b = Bbr.init(); + try std.testing.expectEqual(std.math.maxInt(u64), b.inflight_hi); +} + +test "bbr: regression — loss counters evaluated before reset on round boundary" { + // Bug: updateRoundCounters() zeroed loss_in_round/bytes_in_round before + // the state machine could evaluate them, making isExcessiveLoss() see + // only the current ACK's data instead of the full accumulated round. + var b = Bbr.init(); + b.state = .probe_bw; + b.probe_bw_phase = .up; + b.filled_pipe = true; + b.max_bw = 1_000_000; + b.min_rtt_ns = 50_000_000; + b.inflight_hi = 200_000; + b.bw_hi = std.math.maxInt(u64); + + // Accumulate loss data over several non-round-start ACKs. + b.onAckReceived(.{ .bytes_acked = 5000, .bytes_lost = 0 }, 100_000_000); + b.onAckReceived(.{ .bytes_acked = 5000, .bytes_lost = 0 }, 200_000_000); + b.onAckReceived(.{ .bytes_acked = 5000, .bytes_lost = 400 }, 300_000_000); + // Now: bytes_in_round=15000, loss_in_round=400 (2.67% > 2%) + try std.testing.expect(b.isExcessiveLoss()); + + // The round_start ACK should see the accumulated loss and transition. + const hi_before = b.inflight_hi; + b.onAckReceived(.{ .bytes_acked = 1000, .round_start = true }, 400_000_000); + + // inflight_hi must have been reduced (loss bounding triggered). + try std.testing.expect(b.inflight_hi < hi_before); + // Must have transitioned to DOWN. + try std.testing.expectEqual(ProbeBwPhase.down, b.probe_bw_phase); +} + +test "bbr: regression — persistent congestion resets loss and phase counters" { + // Bug: onPersistentCongestion didn't reset loss_in_round, bytes_in_round, + // probe_bw_rounds, probe_up_rounds. Stale loss data could trigger false + // Startup exit via isExcessiveLoss(). 
+ var b = Bbr.init(); + b.loss_in_round = 500; + b.bytes_in_round = 10000; + b.probe_bw_rounds = 5; + b.probe_up_rounds = 2; + + b.onPersistentCongestion(); + + try std.testing.expectEqual(@as(u64, 0), b.loss_in_round); + try std.testing.expectEqual(@as(u64, 0), b.bytes_in_round); + try std.testing.expectEqual(@as(u64, 0), b.probe_bw_rounds); + try std.testing.expectEqual(@as(u64, 0), b.probe_up_rounds); +} + +test "bbr: regression — extra_acked capped by inflight_hi" { + // Bug: extra_acked was added after inflight_hi cap, allowing cwnd to + // exceed the loss-based inflight bound. + var b = Bbr.init(); + b.state = .probe_bw; + b.filled_pipe = true; + b.max_bw = 1_000_000; + b.min_rtt_ns = 50_000_000; // BDP = 50,000 + b.cwnd_gain = BBR_CWND_GAIN; + b.inflight_hi = 60_000; + b.extra_acked = 50_000; // large headroom + + b.updateCwnd(MSS); + + // cwnd must not exceed inflight_hi. + try std.testing.expect(b.cwnd <= b.inflight_hi); +} + +test "bbr: regression — ProbeRTT only enters from ProbeBW" { + // Bug: checkProbeRtt could fire during Startup or Drain, entering + // ProbeRTT before the pipe was filled. + var b = Bbr.init(); + b.state = .startup; + b.min_rtt_ns = 50_000_000; + b.min_rtt_stamp_ns = 0; + + // After probe_rtt interval — would trigger ProbeRTT from ProbeBW. + // But from Startup, it should be ignored. + b.onAckReceived(.{ + .delivery_rate = 500_000, + .rtt_ns = 50_000_000, + .bytes_acked = MSS, + }, 10_000_000_001); + + // Must still be in Startup (or Drain if BW plateau hit), NOT ProbeRTT. + try std.testing.expect(b.state != .probe_rtt); +} + +// --------------------------------------------------------------------------- +// Tests for Startup pacing bypass, ProbeBW DOWN headroom, bdpHeadroom scaling +// --------------------------------------------------------------------------- + +test "bbr: shouldPace disabled during Startup, enabled after" { + var b = Bbr.init(); + // Startup: pacing disabled. + try std.testing.expect(!b.shouldPace()); + try std.testing.expect(!b.filled_pipe); + + // After enterDrain: filled_pipe = true, pacing enabled. + b.enterDrain(); + try std.testing.expect(b.shouldPace()); + try std.testing.expect(b.filled_pipe); + + // After persistent congestion: back to Startup, pacing disabled. + b.onPersistentCongestion(); + try std.testing.expect(!b.shouldPace()); + try std.testing.expect(!b.filled_pipe); +} + +test "bbr: ProbeBW DOWN exits using post-ACK inflight" { + var b = Bbr.init(); + b.state = .probe_bw; + b.probe_bw_phase = .down; + b.filled_pipe = true; + b.max_bw = 1_000_000; // 1 MB/s + b.min_rtt_ns = 100_000_000; // 100ms → BDP = 100,000 + + // Pre-ACK inflight = 2×BDP (cwnd full), bytes_acked = BDP+. + // Post-ACK = 2×BDP - (BDP+) < BDP → exits DOWN. + b.updateProbeBw(.{ .prior_inflight = 200_000, .bytes_acked = 110_000, .round_start = true }); + try std.testing.expectEqual(ProbeBwPhase.cruise, b.probe_bw_phase); + + // Reset. Post-ACK inflight still above BDP: stays in DOWN. + b.probe_bw_phase = .down; + b.probe_bw_rounds = 0; + b.updateProbeBw(.{ .prior_inflight = 200_000, .bytes_acked = 50_000, .round_start = true }); + try std.testing.expectEqual(ProbeBwPhase.down, b.probe_bw_phase); +} + +test "bbr: Drain exits using post-ACK inflight" { + var b = Bbr.init(); + b.state = .drain; + b.filled_pipe = true; + b.max_bw = 1_000_000; + b.min_rtt_ns = 100_000_000; // BDP = 100,000 + + // Pre-ACK inflight high (Startup peak), but post-ACK ≤ BDP. 
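+    // Post-ACK inflight = 300_000 - 210_000 = 90_000 <= BDP (100_000), so Drain exits.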
+ b.updateDrain(.{ .prior_inflight = 300_000, .bytes_acked = 210_000 }); + try std.testing.expectEqual(State.probe_bw, b.state); +} diff --git a/src/quic/congestion/cc.zig b/src/quic/congestion/cc.zig new file mode 100644 index 0000000..a1b85f5 --- /dev/null +++ b/src/quic/congestion/cc.zig @@ -0,0 +1,22 @@ +//! Congestion control algorithm abstraction layer. +//! +//! Provides a comptime switch between BBR v3 and CUBIC. The active algorithm +//! is selected at build time via `-Dcongestion=cubic` (default: bbr). +//! Both algorithms expose the same public API, so the rest of the stack +//! uses `cc.CongestionControl` without knowing which is active. + +const build_options = @import("build_options"); +const cubic = @import("cubic.zig"); +const bbr = @import("bbr.zig"); + +pub const DeliveryRateSample = @import("common.zig").DeliveryRateSample; + +pub const Algorithm = enum { cubic, bbr }; + +/// Selected at build time via `-Dcongestion=cubic` (default: bbr). +pub const selected: Algorithm = if (build_options.congestion_cubic) .cubic else .bbr; + +pub const CongestionControl = switch (selected) { + .cubic => cubic.Cubic, + .bbr => bbr.Bbr, +}; diff --git a/src/quic/congestion/common.zig b/src/quic/congestion/common.zig new file mode 100644 index 0000000..be6ea4d --- /dev/null +++ b/src/quic/congestion/common.zig @@ -0,0 +1,138 @@ +//! Shared types and constants for congestion control algorithms. +//! +//! Defined here (in the congestion directory) so that congestion modules +//! can import it without reaching outside their module path. + +const std = @import("std"); + +// --------------------------------------------------------------------------- +// Constants +// --------------------------------------------------------------------------- + +/// RFC 9002 §7.2: max_datagram_size for congestion control. +/// Matches MAX_SEND_PACKET_SIZE (1452) — the actual UDP payload we send. +pub const MSS: u64 = 1452; +/// RFC 9002 §7.2: initial_window = min(10 * mds, max(14720, 2 * mds)) +/// = min(14520, max(14720, 2904)) = 14520. +pub const INITIAL_CWND: u64 = @min(10 * MSS, @max(14720, 2 * MSS)); + +// --------------------------------------------------------------------------- +// Delivery Rate Sample +// --------------------------------------------------------------------------- + +/// Per-ACK delivery rate sample, computed by LossRecovery and passed to +/// the congestion controller. +pub const DeliveryRateSample = struct { + delivery_rate: u64 = 0, // bytes/sec + is_app_limited: bool = false, + rtt_ns: u64 = 0, // latest RTT sample + bytes_acked: u64 = 0, + bytes_lost: u64 = 0, + prior_inflight: u64 = 0, // bytes_in_flight before this ACK + round_start: bool = false, // did a new round start? +}; + +// --------------------------------------------------------------------------- +// Pacing — shared token bucket used by both BBR and CUBIC +// --------------------------------------------------------------------------- + +/// Token bucket pacer. Spread packets evenly across the RTT instead of +/// bursting. Embedded by both Bbr and Cubic. +pub const Pacing = struct { + /// Pacing rate in bytes per second. Updated by the congestion controller. + rate: u64 = 0, + /// Token bucket: bytes allowed to send now. + tokens: u64 = INITIAL_CWND, // allow initial burst + /// Timestamp of last token refill (ns). + last_refill_ns: i64 = 0, + + /// Refill tokens based on elapsed time. Returns bytes allowed to send. + /// Tokens are capped at 2×cwnd to allow modest bursts without unlimited accumulation. 
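+    /// For example, at rate = 1 MB/s, 1 ms of elapsed time adds 1_000 bytes of tokens.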
+ pub fn refill(self: *Pacing, cwnd: u64, now_ns: i64) u64 { + if (self.rate == 0) { + // No pacing rate yet (before first ACK) — allow full cwnd. + return cwnd; + } + if (self.last_refill_ns == 0) { + self.last_refill_ns = now_ns; + return self.tokens; + } + const elapsed_ns: u64 = @intCast(@max(now_ns - self.last_refill_ns, 0)); + // Only advance the timestamp when time has actually elapsed. + // Repeated calls with the same now_ns (within a drainSend batch) + // must NOT reset last_refill_ns, otherwise nextSendTime() computes + // a deadline that's already in the past, causing the event loop to + // spin instead of sleeping until enough tokens accumulate. + if (elapsed_ns > 0) { + self.last_refill_ns = now_ns; + } + // Use u128 to avoid saturation on fast links (e.g., 1 GB/s × 1s overflows u64). + const new_tokens: u64 = @intCast(@min( + @as(u128, self.rate) * elapsed_ns / 1_000_000_000, + std.math.maxInt(u64), + )); + self.tokens = @min(self.tokens +| new_tokens, cwnd *| 2); + return self.tokens; + } + + /// Consume tokens after sending a packet. + pub fn consume(self: *Pacing, bytes: u64) void { + self.tokens -|= bytes; + } + + /// Returns the nanosecond deadline when enough tokens will be available + /// to send one MSS-sized packet, or null if tokens are already sufficient + /// or pacing is not active (rate == 0). + pub fn nextSendTime(self: *const Pacing) ?i64 { + if (self.rate == 0) return null; + if (self.tokens >= MSS) return null; + const deficit = MSS - self.tokens; + const wait_ns: i64 = @intCast(@min( + @as(u128, deficit) * 1_000_000_000 / self.rate, + @as(u128, std.math.maxInt(i64)), + )); + return self.last_refill_ns +| wait_ns; + } +}; + +// --------------------------------------------------------------------------- +// Tests +// --------------------------------------------------------------------------- + +test "pacing: regression — u128 prevents overflow on fast links" { + // Bug: `rate *| elapsed_ns / 1_000_000_000` used u64 saturating multiply. + // At 1 GB/s with 1s elapsed, rate × elapsed = 1e18 which fits u64, but + // at 10 GB/s × 1s = 1e19 which overflows u64 (max ~1.8e19). With the old + // saturating mul, tokens would cap at maxInt instead of the correct value. + var p = Pacing{ + .rate = 10_000_000_000, // 10 GB/s + .tokens = 0, + .last_refill_ns = 1_000_000_000, + }; + const tokens = p.refill(20_000_000_000, 2_000_000_000); // 1s later + // Expected: 10 GB/s × 1s = 10,000,000,000 bytes. + try std.testing.expectEqual(@as(u64, 10_000_000_000), tokens); +} + +test "pacing: refill and consume basic" { + var p = Pacing{ + .rate = 1_000_000, // 1 MB/s + .tokens = 0, + .last_refill_ns = 1_000_000_000, + }; + _ = p.refill(1_000_000, 1_001_000_000); // 1ms later → 1000 bytes + try std.testing.expectEqual(@as(u64, 1000), p.tokens); + p.consume(600); + try std.testing.expectEqual(@as(u64, 400), p.tokens); +} + +test "pacing: tokens capped at 2*cwnd" { + var p = Pacing{ + .rate = 1_000_000_000, // 1 GB/s + .tokens = 0, + .last_refill_ns = 1_000_000_000, // initialized + }; + const cwnd: u64 = 100_000; + _ = p.refill(cwnd, 2_000_000_000); // 1s later → 1 GB, but capped at 200_000 + try std.testing.expectEqual(cwnd * 2, p.tokens); +} diff --git a/src/quic/congestion/cubic.zig b/src/quic/congestion/cubic.zig index 2c8d848..e0c6d06 100644 --- a/src/quic/congestion/cubic.zig +++ b/src/quic/congestion/cubic.zig @@ -6,18 +6,16 @@ //! C = 0.4. 
const std = @import("std"); +const common = @import("common.zig"); +const DeliveryRateSample = common.DeliveryRateSample; +const MSS = common.MSS; +const INITIAL_CWND = common.INITIAL_CWND; /// RFC 9438 §5.1: C = 0.4 (in segments). Since our cwnd is in bytes, /// scale by MSS to get the correct growth rate: C_bytes = 0.4 × MSS. /// Without this scaling, K is MSS× too large and CUBIC degenerates to AIMD. const C: f64 = 0.4 * @as(f64, @floatFromInt(MSS)); const BETA_CUBIC: f64 = 0.7; -/// RFC 9002 §7.2: max_datagram_size for congestion control. -/// Matches MAX_SEND_PACKET_SIZE (1452) — the actual UDP payload we send. -const MSS: u64 = 1452; -/// RFC 9002 §7.2: initial_window = min(10 * mds, max(14720, 2 * mds)) -/// = min(14520, max(14720, 2904)) = 14520. -const INITIAL_CWND: u64 = @min(10 * MSS, @max(14720, 2 * MSS)); pub const Cubic = struct { /// Congestion window in bytes. @@ -39,16 +37,8 @@ pub const Cubic = struct { /// growth when (target - cwnd) * MSS < cwnd. cwnd_remainder: u64, - // Pacing state: spread packets evenly across the RTT instead of bursting. - // Without pacing, all cwnd bytes are sent instantly on ACK, overflowing - // shallow queues and causing loss. Pacing targets ~95% link utilization. - /// Pacing rate in bytes per second. Updated on every ACK. - pacing_rate: u64, - /// Pacing token bucket: bytes allowed to send now. Refilled each tick - /// based on elapsed time × pacing_rate. - pacing_tokens: u64, - /// Timestamp of last token refill (ns). - pacing_last_refill_ns: i64, + /// Pacing state (shared token bucket). + pacing: common.Pacing, pub fn init() Cubic { return .{ @@ -60,9 +50,7 @@ pub const Cubic = struct { .cwnd_at_epoch = 0, .w_est = 0, .cwnd_remainder = 0, - .pacing_rate = 0, - .pacing_tokens = INITIAL_CWND, // allow initial burst - .pacing_last_refill_ns = 0, + .pacing = .{}, }; } @@ -71,26 +59,30 @@ pub const Cubic = struct { return self.cwnd > 0; } - /// Called when an ACK is received. - /// `bytes_acked` — bytes acknowledged. - /// `rtt_ns` — smoothed RTT in nanoseconds. - /// `now_ns` — current time in nanoseconds. - pub fn onAckReceived(self: *Cubic, bytes_acked: u64, rtt_ns: u64, now_ns: i64) void { + /// CUBIC always paces after the first ACK sets the pacing rate. + pub fn shouldPace(_: *const Cubic) bool { + return true; + } + + /// Called when an ACK is received with a delivery rate sample. + /// CUBIC uses only bytes_acked and rtt_ns from the sample. + pub fn onAckReceived(self: *Cubic, sample: DeliveryRateSample, now_ns: i64) void { + const bytes_acked = sample.bytes_acked; + const rtt_ns = sample.rtt_ns; if (self.cwnd < self.ssthresh) { // Slow start: double cwnd per RTT (exponential growth). self.cwnd += bytes_acked; } else { self.updateCwndCubic(bytes_acked, rtt_ns, now_ns); } - // Update pacing rate: cwnd / srtt (bytes per second). - // During slow start, pace at 2× to allow exponential growth. - // In congestion avoidance, pace at 1.25× cwnd/srtt for headroom. + // Update pacing rate: 2× cwnd/RTT. Enforced by the pacing gate + // in send() which uses wire-time accounting for bytes_in_flight. if (rtt_ns > 0) { - const base_rate = self.cwnd *| 1_000_000_000 / rtt_ns; - // Pace at 2× cwnd/RTT: allows CUBIC to probe above current cwnd - // without being throttled by the pacing rate. The congestion window - // is the real limit; pacing just smooths burst timing. 
- self.pacing_rate = base_rate *| 2; + const base_rate: u64 = @intCast(@min( + @as(u128, self.cwnd) * 1_000_000_000 / rtt_ns, + std.math.maxInt(u64), + )); + self.pacing.rate = base_rate *| 2; } } @@ -101,11 +93,15 @@ pub const Cubic = struct { self.ssthresh = self.cwnd; self.epoch_start_ns = null; self.cwnd_remainder = 0; + // Reset pacing so stale rate/tokens from the old path don't cause bursts. + self.pacing = .{}; } /// Called on packet loss (e.g., timeout or three duplicate ACKs). + /// `bytes_lost` — total bytes lost (unused by CUBIC, used by BBR). /// `now_ns` — current time in nanoseconds. - pub fn onPacketLost(self: *Cubic, now_ns: i64) void { + pub fn onPacketLost(self: *Cubic, bytes_lost: u64, now_ns: i64) void { + _ = bytes_lost; const MIN_CWND: u64 = 8 * MSS; self.w_max = @floatFromInt(self.cwnd); self.cwnd = @intFromFloat(@as(f64, @floatFromInt(self.cwnd)) * BETA_CUBIC); @@ -124,30 +120,20 @@ pub const Cubic = struct { self.k = computeK(self.w_max, self.cwnd_at_epoch); } - /// Refill pacing tokens based on elapsed time. Call at the start of each - /// send opportunity (tick or post-ACK). Returns the number of bytes - /// allowed to send. Tokens are capped at 2×cwnd to allow modest bursts - /// (e.g., after ACK batching) without unlimited accumulation. + /// Called on ECN CE marks. CUBIC treats ECN the same as packet loss. + pub fn onEcnCe(self: *Cubic, ce_count: u64, now_ns: i64) void { + _ = ce_count; + self.onPacketLost(0, now_ns); + } + + /// Refill pacing tokens. Delegates to shared Pacing. pub fn pacingRefill(self: *Cubic, now_ns: i64) u64 { - if (self.pacing_rate == 0) { - // No pacing rate yet (before first ACK) — allow full cwnd. - return self.cwnd; - } - if (self.pacing_last_refill_ns == 0) { - self.pacing_last_refill_ns = now_ns; - return self.pacing_tokens; - } - const elapsed_ns: u64 = @intCast(@max(now_ns - self.pacing_last_refill_ns, 0)); - self.pacing_last_refill_ns = now_ns; - // tokens += pacing_rate × elapsed_seconds - const new_tokens = self.pacing_rate *| elapsed_ns / 1_000_000_000; - self.pacing_tokens = @min(self.pacing_tokens +| new_tokens, self.cwnd *| 2); - return self.pacing_tokens; + return self.pacing.refill(self.cwnd, now_ns); } /// Consume pacing tokens after sending a packet. 
pub fn pacingConsume(self: *Cubic, bytes: u64) void { - self.pacing_tokens -|= bytes; + self.pacing.consume(bytes); } fn updateCwndCubic(self: *Cubic, bytes_acked: u64, rtt_ns: u64, now_ns: i64) void { @@ -217,7 +203,7 @@ test "cubic: slow start doubles" { const testing = std.testing; var c = Cubic.init(); const initial = c.cwnd; - c.onAckReceived(initial, 10_000_000, 0); + c.onAckReceived(.{ .bytes_acked = initial, .rtt_ns = 10_000_000 }, 0); try testing.expect(c.cwnd >= initial); } @@ -226,7 +212,7 @@ test "cubic: loss reduces window" { var c = Cubic.init(); c.cwnd = 100 * MSS; const before = c.cwnd; - c.onPacketLost(1_000_000_000); + c.onPacketLost(0, 1_000_000_000); try testing.expect(c.cwnd < before); try testing.expectEqual(c.cwnd, c.ssthresh); } @@ -235,14 +221,14 @@ test "cubic: cwnd grows after loss" { const testing = std.testing; var c = Cubic.init(); c.cwnd = 50 * MSS; - c.onPacketLost(0); + c.onPacketLost(0, 0); const after_loss = c.cwnd; const rtt_ns: u64 = 50_000_000; // 50ms // Simulate several ACK events var t: i64 = 100_000_000; var i: usize = 0; while (i < 10) : (i += 1) { - c.onAckReceived(MSS, rtt_ns, t); + c.onAckReceived(.{ .bytes_acked = MSS, .rtt_ns = rtt_ns }, t); t += @intCast(rtt_ns); } try testing.expect(c.cwnd >= after_loss); @@ -260,7 +246,7 @@ test "cubic: onAckReceived with zero bytes is a no-op" { const testing = std.testing; var c = Cubic.init(); const before = c.cwnd; - c.onAckReceived(0, 50_000_000, 1_000_000_000); + c.onAckReceived(.{ .bytes_acked = 0, .rtt_ns = 50_000_000 }, 1_000_000_000); try testing.expectEqual(before, c.cwnd); } @@ -269,9 +255,9 @@ test "cubic: slow start adds bytes_acked directly to cwnd" { var c = Cubic.init(); // ssthresh = maxInt(u64) by default — we are in slow start const initial = c.cwnd; - c.onAckReceived(MSS, 50_000_000, 1_000_000_000); + c.onAckReceived(.{ .bytes_acked = MSS, .rtt_ns = 50_000_000 }, 1_000_000_000); try testing.expectEqual(initial + MSS, c.cwnd); - c.onAckReceived(2 * MSS, 50_000_000, 1_050_000_000); + c.onAckReceived(.{ .bytes_acked = 2 * MSS, .rtt_ns = 50_000_000 }, 1_050_000_000); try testing.expectEqual(initial + 3 * MSS, c.cwnd); } @@ -280,11 +266,11 @@ test "cubic: epoch_start_ns null sentinel prevents spurious reset at clock=0" { var c = Cubic.init(); // Force into CUBIC phase by setting ssthresh below cwnd c.cwnd = 50 * MSS; - c.onPacketLost(0); // epoch_start_ns = Some(0), not null + c.onPacketLost(0, 0); // epoch_start_ns = Some(0), not null const cwnd_after_loss = c.cwnd; // ACK at t=1ms: epoch should NOT reinitialize (epoch_start_ns is Some(0), not null) - c.onAckReceived(MSS, 50_000_000, 1_000_000); // 1ms later + c.onAckReceived(.{ .bytes_acked = MSS, .rtt_ns = 50_000_000 }, 1_000_000); // 1ms later // cwnd must be >= post-loss cwnd (no spurious reset) try testing.expect(c.cwnd >= cwnd_after_loss); // epoch_start_ns must still be Some(0), not changed @@ -295,7 +281,7 @@ test "cubic: w_est accumulates across ACKs in CUBIC phase" { const testing = std.testing; var c = Cubic.init(); c.cwnd = 50 * MSS; - c.onPacketLost(0); + c.onPacketLost(0, 0); const w_est_after_loss = c.w_est; // Set up a scenario where w_cubic < w_est so TCP-friendly phase is active. 
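The C = 0.4 × MSS scaling documented at the top of cubic.zig sets how long the cubic curve takes to climb back to w_max after a loss. Below is a self-contained sketch of that relationship, assuming computeK follows the RFC 9438 §4.2 definition K = cbrt((w_max - cwnd_epoch) / C); the helper `kSeconds` is an illustrative stand-in, not the library's computeK.

```zig
const std = @import("std");

/// Illustrative stand-in: K = cbrt((w_max - cwnd_epoch) / C), in seconds,
/// with all arguments in bytes (C already scaled by MSS).
fn kSeconds(w_max: f64, cwnd_epoch: f64, c_bytes: f64) f64 {
    return std.math.cbrt((w_max - cwnd_epoch) / c_bytes);
}

test "worked example: K after a loss at cwnd = 100 * MSS" {
    const mss = 1452.0;
    const w_max = 100.0 * mss; // 145_200 bytes before the loss
    const cwnd_after = w_max * 0.7; // BETA_CUBIC reduction: 101_640 bytes
    // (145_200 - 101_640) / (0.4 * 1452) = 75, and cbrt(75) ≈ 4.22, so the
    // cubic curve regains w_max roughly 4.2 seconds after the loss.
    const k = kSeconds(w_max, cwnd_after, 0.4 * mss);
    try std.testing.expectApproxEqAbs(@as(f64, 4.217), k, 0.01);
}
```

These are the same inputs that onPacketLost records (w_max and cwnd_at_epoch) before calling computeK.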
@@ -306,7 +292,7 @@ test "cubic: w_est accumulates across ACKs in CUBIC phase" { c.cwnd_at_epoch = @floatFromInt(c.cwnd); c.w_est = @as(f64, @floatFromInt(c.cwnd)) + 1000.0; // w_est > w_cubic initially - c.onAckReceived(MSS, 50_000_000, 100_000_000); + c.onAckReceived(.{ .bytes_acked = MSS, .rtt_ns = 50_000_000 }, 100_000_000); try testing.expect(c.w_est > w_est_after_loss); } @@ -314,10 +300,10 @@ test "cubic: non-monotonic clock (negative t_ns) is a no-op" { const testing = std.testing; var c = Cubic.init(); c.cwnd = 50 * MSS; - c.onPacketLost(1_000_000_000); + c.onPacketLost(0, 1_000_000_000); const cwnd_before = c.cwnd; - c.onAckReceived(MSS, 50_000_000, 500_000_000); + c.onAckReceived(.{ .bytes_acked = MSS, .rtt_ns = 50_000_000 }, 500_000_000); try testing.expectEqual(cwnd_before, c.cwnd); } @@ -334,7 +320,7 @@ test "cubic: single loss event reduces cwnd by exactly BETA_CUBIC" { var c = Cubic.init(); c.cwnd = 100 * MSS; // 120000 bytes const before = c.cwnd; - c.onPacketLost(1_000_000_000); + c.onPacketLost(0, 1_000_000_000); // Expected: floor(120000 * 0.7) = 84000, but minimum is 8*MSS const expected: u64 = @intFromFloat(@as(f64, @floatFromInt(before)) * BETA_CUBIC); const MIN_CWND: u64 = 8 * MSS; @@ -356,7 +342,7 @@ test "cubic: large window growth does not stall" { const initial = c.cwnd; var i: u32 = 0; while (i < 100) : (i += 1) { - c.onAckReceived(MSS, 100_000_000, 10_000_000_000); + c.onAckReceived(.{ .bytes_acked = MSS, .rtt_ns = 100_000_000 }, 10_000_000_000); } try testing.expect(c.cwnd > initial + 100); } @@ -384,7 +370,7 @@ test "cubic: loss reduction is exactly BETA_CUBIC * cwnd" { const testing = std.testing; var c = Cubic.init(); c.cwnd = 10 * MSS; // 12000 bytes - c.onPacketLost(0); + c.onPacketLost(0, 0); // Expected: floor(12000 * 0.7) = 8400, but floored to MIN_CWND = 8*MSS = 9600. // When floor applies, w_max is clipped to MIN_CWND to prevent K ≈ 18s pathology. try testing.expectEqual(@as(u64, 8 * MSS), c.cwnd); @@ -406,7 +392,7 @@ test "cubic: cwnd_remainder uses saturating arithmetic on extreme target" { c.epoch_start_ns = 0; c.cwnd_at_epoch = @floatFromInt(c.cwnd); - c.onAckReceived(1, 10_000_000, 400_000 * 1_000_000_000); + c.onAckReceived(.{ .bytes_acked = 1, .rtt_ns = 10_000_000 }, 400_000 * 1_000_000_000); try testing.expect(c.cwnd >= MSS); try testing.expect(c.cwnd > MSS); diff --git a/src/quic/connection.zig b/src/quic/connection.zig index 241ab05..2304ed4 100644 --- a/src/quic/connection.zig +++ b/src/quic/connection.zig @@ -4,7 +4,7 @@ //! The connection is driven by: //! //! connection.receive(data, src) — feed a received UDP datagram -//! connection.send(out) — drain the next UDP datagram to transmit +//! connection.send(out, now_ns) — drain the next UDP datagram to transmit //! connection.nextTimeout() — nanosecond deadline for tick() //! connection.tick(now_ns) — drive timer-based events //! @@ -21,7 +21,7 @@ const varint = @import("varint.zig"); const cid_mod = @import("connection_id.zig"); const stream_mod = @import("stream.zig"); const flow_control = @import("flow_control.zig"); -const cubic_mod = @import("congestion/cubic.zig"); +const cc_mod = @import("congestion/cc.zig"); const loss_recovery_mod = @import("loss_recovery.zig"); const ConnectionId = cid_mod.ConnectionId; @@ -149,7 +149,11 @@ const CRYPTO_STAGE_DEPTH = 16; /// Maximum bytes in a single staged CRYPTO fragment (conservatively > max QUIC payload). pub const CRYPTO_STAGE_FRAG = 1400; /// Maximum number of pending stream retransmits when send queue is full. 
-const MAX_PENDING_RETX = 32; +/// Must be large enough to handle worst-case burst losses when pacing +/// keeps the send queue non-empty during loss detection. The epoch 2 +/// sent buffer holds up to 128 packets, each with up to 1 stream frame +/// in practice, so 128 covers the realistic worst case. +const MAX_PENDING_RETX = 128; /// A single buffered out-of-order CRYPTO fragment. const CryptoStagedFrag = struct { @@ -163,6 +167,22 @@ const SendSlot = struct { len: usize, }; +/// Per-slot metadata for deferred wire-time accounting. +/// Stored in parallel with SendSlot; consumed by send() to call +/// loss.onPacketSent at wire time rather than queue time. +const SendMeta = struct { + pn: u64 = 0, + epoch: u8 = 0, + size: u16 = 0, + ack_eliciting: bool = false, + /// Queue-time timestamp for delivery rate computation. Wire-time + /// (now_ns in send()) is used for loss detection timing, but delivery + /// rate must use queue-time to avoid pacing delays inflating + /// send_elapsed and depressing BBR's bandwidth estimate. + queued_ns: i64 = 0, + frame_info: loss_recovery_mod.SentFrameInfo = .{}, +}; + // --------------------------------------------------------------------------- // Configuration // --------------------------------------------------------------------------- @@ -247,6 +267,10 @@ pub fn Connection(comptime max_streams: usize) type { peer_scid: [20]u8 = [_]u8{0} ** 20, peer_scid_len: u8 = 0, peer_addr: SocketAddr, + /// Previous peer address (before last migration). Packets from this + /// address are silently accepted without triggering re-migration, since + /// they are late arrivals from the old path. + prev_peer_addr: ?SocketAddr, // Crypto initial_keys: crypto.InitialKeys, @@ -269,7 +293,7 @@ pub fn Connection(comptime max_streams: usize) type { conn_flow: flow_control.FlowController, // Congestion control - congestion: cubic_mod.Cubic, + congestion: cc_mod.CongestionControl, // Loss recovery (RTT estimation, sent-packet tracking, PTO) loss: loss_recovery_mod.LossRecovery, @@ -282,8 +306,13 @@ pub fn Connection(comptime max_streams: usize) type { // Send queue (ring buffer of ready-to-send packets) sq: [SEND_QUEUE_DEPTH]SendSlot, + sq_meta: [SEND_QUEUE_DEPTH]SendMeta, sq_head: usize, sq_tail: usize, + /// Bytes in the send queue (ack-eliciting only) that have not yet + /// been handed to the socket. Complements loss.bytes_in_flight which + /// counts wire-sent bytes only. 
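+        ///
+        /// Both counters feed the cwnd gate applied before queueing new stream
+        /// data (a sketch of that check; the +64 is the header/AEAD overhead
+        /// estimate it uses):
+        ///
+        ///     if (loss.bytes_in_flight + bytes_queued + data.len + 64 > congestion.cwnd)
+        ///         return error.CongestionWindowFull;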
+ bytes_queued: u64, // Timers idle_deadline_ns: ?i64, @@ -558,6 +587,7 @@ pub fn Connection(comptime max_streams: usize) type { .alt_local_reset_token = alt_local_reset_token, .peer_cid = ConnectionId.zero, .peer_addr = .{ .v4 = .{ .addr = [_]u8{0} ** 4, .port = 0 } }, + .prev_peer_addr = null, .initial_keys = .{ .client = .{ .key = [_]u8{0} ** 32, .iv = [_]u8{0} ** 12, .hp = [_]u8{0} ** 32, .suite = .aes_128_gcm }, .server = .{ .key = [_]u8{0} ** 32, .iv = [_]u8{0} ** 12, .hp = [_]u8{0} ** 32, .suite = .aes_128_gcm }, @@ -572,15 +602,17 @@ pub fn Connection(comptime max_streams: usize) type { config.initial_max_data, config.initial_max_data, ), - .congestion = cubic_mod.Cubic.init(), + .congestion = cc_mod.CongestionControl.init(), .loss = loss_recovery_mod.LossRecovery.init(), .current_time_ns = 0, .cached_max_ack_delay_ns = 25_000_000, .cached_ack_delay_exp = 3, .idle_timeout_i64 = idle_timeout_i64, .sq = undefined, + .sq_meta = [_]SendMeta{.{}} ** SEND_QUEUE_DEPTH, .sq_head = 0, .sq_tail = 0, + .bytes_queued = 0, .idle_deadline_ns = null, .pto_deadline_ns = null, .drain_deadline_ns = null, @@ -687,11 +719,12 @@ pub fn Connection(comptime max_streams: usize) type { // Path migration detection (RFC 9000 §9): only in established state, // and only when the peer has not disabled active migration. + // Ignore packets from the previous peer address — those are late + // arrivals from the old path and must not trigger re-migration. if (self.hot.state == .established and !self.peer_addr.eql(src)) { - if (!self.peer_disable_migration) { + const is_prev = if (self.prev_peer_addr) |prev| prev.eql(src) else false; + if (!is_prev and !self.peer_disable_migration) { if (SocketAddr.isPortOnlyChange(self.peer_addr, src)) { - // RFC 9000 §9.3.1: port-only change is likely NAT rebinding. - // Skip congestion reset and path validation to preserve throughput. self.onNatRebind(src, io) catch {}; } else { self.onPathMigration(src, io) catch {}; @@ -760,28 +793,119 @@ pub fn Connection(comptime max_streams: usize) type { } } + /// Store per-packet metadata for deferred wire-time accounting. + /// Called immediately after enqueueSend() succeeds (sq_tail already + /// advanced), so the metadata is written to the slot that was just filled. + fn storeSendMeta(self: *Self, pn: u64, epoch: u8, size: usize, ack_eliciting: bool, fi: loss_recovery_mod.SentFrameInfo) void { + const idx = (self.sq_tail - 1) & (SEND_QUEUE_DEPTH - 1); + const sz: u16 = @intCast(@min(size, 0xffff)); + self.sq_meta[idx] = .{ + .pn = pn, + .epoch = epoch, + .size = sz, + .ack_eliciting = ack_eliciting, + .queued_ns = self.current_time_ns, + .frame_info = fi, + }; + if (ack_eliciting) { + self.bytes_queued += sz; + } + } + /// Write the next UDP payload to `out`. Returns bytes written (0 = nothing pending). - pub fn send(self: *Self, out: []u8) usize { + /// `now_ns` is the wall-clock time used for wire-time accounting (loss recovery, + /// pacing, and PTO arming). + /// + /// RFC 9000 §12.2: coalesces consecutive long-header packets (Initial + + /// Handshake) into a single UDP datagram so they share one loss event + /// instead of being independently dropped. + pub fn send(self: *Self, out: []u8, now_ns: i64) usize { // RFC 9000 §10.2: draining state — must not send anything. 
if (self.hot.state == .draining) return 0; - if (self.sq_head == self.sq_tail) return 0; - const slot = &self.sq[self.sq_head & (SEND_QUEUE_DEPTH - 1)]; - const n = @min(slot.len, out.len); - @memcpy(out[0..n], slot.buf[0..n]); + if (self.sq_head == self.sq_tail) { + // Nothing to send — if cwnd has room, we are app-limited. + if (self.loss.bytes_in_flight + self.bytes_queued < self.congestion.cwnd) { + self.loss.delivery.app_limited = true; + } + return 0; + } + const mask = SEND_QUEUE_DEPTH - 1; + var meta = self.sq_meta[self.sq_head & mask]; + // Pacing gate: refill tokens and check if we can send. + // Bypass pacing when nothing is in flight — there is no congestion + // to pace for, and blocking here creates a death spiral where the + // delivery rate collapses (no data sent → no ACKs → rate drops → + // pacing blocks even harder). + const pacing_tokens = self.congestion.pacing.refill(self.congestion.cwnd, now_ns); + if (meta.ack_eliciting and pacing_tokens < meta.size and + self.congestion.pacing.rate > 0 and self.congestion.shouldPace() and + self.loss.bytes_in_flight > 0) + { + return 0; + } + const slot = &self.sq[self.sq_head & mask]; + var total = @min(slot.len, out.len); + @memcpy(out[0..total], slot.buf[0..total]); + // Wire-time accounting for the first packet. + self.loss.onPacketSent(meta.pn, meta.epoch, meta.size, meta.ack_eliciting, now_ns, meta.queued_ns, meta.frame_info); + if (meta.ack_eliciting) { + self.bytes_queued -|= meta.size; + self.pto_deadline_ns = self.loss.ptoDeadline(self.cached_max_ack_delay_ns); + } self.sq_head += 1; - self.bytes_sent += n; + + // Coalesce: append consecutive long-header packets (epoch 0/1) into + // the same UDP datagram (RFC 9000 §12.2). This halves handshake loss + // probability under lossy networks. Do NOT coalesce 1-RTT packets — + // that breaks connection migration (Handshake ACK + 1-RTT data in one + // datagram confuses path validation). + if (meta.epoch < 2) { + while (self.sq_head < self.sq_tail) { + const next_meta = self.sq_meta[self.sq_head & mask]; + if (next_meta.epoch >= 2) break; + const next_slot = &self.sq[self.sq_head & mask]; + if (total + next_slot.len > out.len) break; + @memcpy(out[total..][0..next_slot.len], next_slot.buf[0..next_slot.len]); + self.loss.onPacketSent(next_meta.pn, next_meta.epoch, next_meta.size, next_meta.ack_eliciting, now_ns, next_meta.queued_ns, next_meta.frame_info); + if (next_meta.ack_eliciting) { + self.bytes_queued -|= next_meta.size; + self.pto_deadline_ns = self.loss.ptoDeadline(self.cached_max_ack_delay_ns); + } + total += next_slot.len; + self.sq_head += 1; + } + } + + // RFC 9000 §14.1: datagrams carrying ack-eliciting Initial packets + // MUST be at least 1200 bytes. Pad after coalescing so the Handshake + // portion fills the datagram (reducing the number of separate packets + // needed for the cert chain) instead of wasting space on PADDING frames. + if (meta.epoch == 0 and meta.ack_eliciting and total < 1200 and out.len >= 1200) { + @memset(out[total..1200], 0); + total = 1200; + } + + if (self.congestion.pacing.rate > 0) { + self.congestion.pacing.consume(total); + } + self.bytes_sent += total; self.pkts_sent += 1; - return n; + return total; } /// Returns the nanosecond deadline when `tick()` must be called, - /// or null if no timer is active. + /// or null if no timer is active. Includes the pacing deadline when + /// the send queue is non-empty so the event loop wakes to drain it. 
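+        ///
+        /// Sans-I/O loop sketch (`running`, `clock`, `waitReadableUntil`, `sendTo`,
+        /// and `out_buf` are caller-owned, illustrative names):
+        ///
+        ///     while (running) {
+        ///         waitReadableUntil(conn.nextTimeout());
+        ///         const now = clock.nowNs();
+        ///         // feed any received datagrams via conn.receive(...), then:
+        ///         conn.tick(now);
+        ///         while (true) {
+        ///             const n = conn.send(&out_buf, now);
+        ///             if (n == 0) break;
+        ///             sendTo(out_buf[0..n], conn.peer_addr);
+        ///         }
+        ///     }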
pub fn nextTimeout(self: *const Self) ?i64 { const idle = self.idle_deadline_ns orelse std.math.maxInt(i64); const pto = self.pto_deadline_ns orelse std.math.maxInt(i64); const drain = self.drain_deadline_ns orelse std.math.maxInt(i64); const tl = self.time_loss_alarm_ns orelse std.math.maxInt(i64); - const m = @min(@min(@min(idle, pto), drain), tl); + const pacing: i64 = if (self.sq_head != self.sq_tail) + self.congestion.pacing.nextSendTime() orelse std.math.maxInt(i64) + else + std.math.maxInt(i64); + const m = @min(@min(@min(@min(idle, pto), drain), tl), pacing); return if (m == std.math.maxInt(i64)) null else m; } @@ -807,6 +931,12 @@ pub fn Connection(comptime max_streams: usize) type { } } + // Flush any Handshake CRYPTO that was buffered when amplification limit + // blocked the initial send. This must run on every tick — not just in + // receive() — because under high loss the client's packets may never + // arrive to trigger receive(), leaving the pending HS data unsent. + self.flushPendingHsCrypto(); + // Drain any deferred CRYPTO and stream retransmits before generating new traffic self.drainPendingCryptoRetx(); self.drainPendingStreamRetx(); @@ -826,7 +956,7 @@ pub fn Connection(comptime max_streams: usize) type { if (self.pto_deadline_ns) |d| { if (now_ns >= d) { self.loss.onPtoFired(); - if (self.app_keys != null) { + if (self.hot.state == .established) { // Post-handshake PTO: retransmit PATH_CHALLENGE if pending (RFC 9000 §9.2), // drain pending stream retransmits, probe with unacked stream data, // or send a 1-RTT PING probe (RFC 9002 §6.2). @@ -844,7 +974,7 @@ pub fn Connection(comptime max_streams: usize) type { // (not just our own previous PINGs). Without this guard, // PTO sends infinite PINGs after all transfers complete: // each PING creates in-flight state → PTO fires → PING → loop. - // Limit to 2 consecutive idle PINGs, then let idle timeout close. + // Limit to 6 consecutive idle PINGs, then let idle timeout close. if (self.idle_ping_count < 6) { self.queuePing() catch {}; self.idle_ping_count += 1; @@ -905,6 +1035,9 @@ pub fn Connection(comptime max_streams: usize) type { const tns = self.loss.timeThresholdNs(); var tl_result = loss_recovery_mod.AckResult{}; for (0..3) |epoch_idx| { + // Skip Initial/Handshake epochs once established — keys + // are zeroed, so any retransmit would panic on invalid suite. + if (self.hot.state == .established and epoch_idx < 2) continue; const la = self.loss.largest_acked[epoch_idx]; if (la == 0) continue; self.loss.sent.detectLoss( @@ -917,7 +1050,7 @@ pub fn Connection(comptime max_streams: usize) type { ); } if (tl_result.newly_lost > 0) { - self.congestion.onPacketLost(now_ns); + self.congestion.onPacketLost(tl_result.bytes_lost, now_ns); self.processLostFrames(tl_result); } // Reschedule if there are still candidates. @@ -1008,9 +1141,11 @@ pub fn Connection(comptime max_streams: usize) type { // Retransmissions (processLostFrames) bypass this check so loss recovery // is never blocked by a temporarily-reduced cwnd after a loss event. // Estimate packet size as data.len + 64 bytes of header/AEAD overhead. - if (self.loss.bytes_in_flight + data.len + 64 > self.congestion.cwnd) { + if (self.loss.bytes_in_flight + self.bytes_queued + data.len + 64 > self.congestion.cwnd) { return error.CongestionWindowFull; } + // Clear app-limited flag: we are actively sending. 
+ self.loss.delivery.app_limited = false; try self.queueStreamData(stream_id, data, fin); } @@ -1082,6 +1217,28 @@ pub fn Connection(comptime max_streams: usize) type { // Internal packet processing // ----------------------------------------------------------------------- + /// Compute the wire size of a long-header QUIC packet from its unprotected + /// header fields. Used to skip an unprocessable packet in a coalesced + /// datagram without dropping the subsequent packets. + fn skipLongHeaderPacket(data: []const u8, raw_dcid_len: u8, raw_pkt_type: packet.PacketType) usize { + // Position after: first_byte(1) + version(4) + dcid_len(1) + dcid + scid_len(1) + scid + var pos: usize = 6 + @as(usize, raw_dcid_len); + if (pos >= data.len) return data.len; + const scid_len = data[pos]; + pos += 1 + @as(usize, scid_len); + if (pos > data.len) return data.len; + // Initial packets carry a token before the Length field. + if (raw_pkt_type == .initial) { + const tok_r = varint.decode(data[pos..]) orelse return data.len; + pos += tok_r.len + @as(usize, @intCast(tok_r.value)); + if (pos > data.len) return data.len; + } + // Length varint: covers PN bytes + ciphertext + AEAD tag. + const len_r = varint.decode(data[pos..]) orelse return data.len; + pos += len_r.len; + return @min(pos + @as(usize, @intCast(len_r.value)), data.len); + } + pub fn processOnePacket(self: *Self, data: []u8, src: SocketAddr, io: std.Io) !usize { if (data.len == 0) return 0; @@ -1152,21 +1309,29 @@ pub fn Connection(comptime max_streams: usize) type { // In established state, all Initial packets (even with matching DCID) must be // silently dropped. This handles late/retransmitted Initial packets and new // connection attempts that happen to use the same server local_cid. + // Skip just this one packet so coalesced Handshake/1-RTT packets can proceed. if (raw_pkt_type == .initial and self.hot.state == .established) { - return data.len; + return skipLongHeaderPacket(data, raw_dcid_len, raw_pkt_type); } // For handshake state Initial packets, validate DCID against the client's original // DCID stored from the first Initial. RFC 9000 §7.2: a client MUST NOT change its // Destination CID before receiving the server's first Initial packet, so all Initial // retransmissions (including those carrying fragmented ClientHello bytes) must carry - // the same variable-length DCID. The old check compared against local_cid (fixed - // 8 bytes) and silently dropped every packet whose dcid_len > 8. + // the same variable-length DCID. However, once the client receives the server's + // first Initial, it switches to the server's SCID for all subsequent packets + // (RFC 9000 §7.2), so the coalesced Initial ACK uses our local_cid. + // Accept both the original DCID and our own local_cid/alt_local_cid. if (raw_pkt_type == .initial and self.hot.state == .handshake and self.first_initial_dcid_len > 0) { - if (!std.mem.eql(u8, raw_dcid, self.first_initial_dcid[0..self.first_initial_dcid_len])) { - return data.len; // Different DCID: belongs to a different connection. 
+ const matches_first = std.mem.eql(u8, raw_dcid, self.first_initial_dcid[0..self.first_initial_dcid_len]); + const matches_local = raw_dcid_len == cid_mod.len and std.mem.eql(u8, raw_dcid[0..cid_mod.len], &self.local_cid.bytes); + const matches_alt = raw_dcid_len == cid_mod.len and std.mem.eql(u8, raw_dcid[0..cid_mod.len], &self.alt_local_cid.bytes); + if (!matches_first and !matches_local and !matches_alt) { + // Different DCID: skip just this Initial packet (not the entire + // datagram) so coalesced Handshake/1-RTT packets can still be processed. + return skipLongHeaderPacket(data, raw_dcid_len, raw_pkt_type); } } @@ -2135,11 +2300,10 @@ pub fn Connection(comptime max_streams: usize) type { } } - // Feed acknowledgement data to CUBIC + // Feed acknowledgement data to congestion controller if (result.newly_acked > 0) { self.congestion.onAckReceived( - result.bytes_acked, - self.loss.rtt.smoothed_rtt, + result.delivery_rate_sample, self.current_time_ns, ); self.loss.resetPtoCount(); @@ -2147,7 +2311,7 @@ pub fn Connection(comptime max_streams: usize) type { // One congestion event per loss detection (RFC 9438 §5.6) if (result.newly_lost > 0) { - self.congestion.onPacketLost(self.current_time_ns); + self.congestion.onPacketLost(result.bytes_lost, self.current_time_ns); } // Persistent congestion: collapse cwnd when loss span > 3×PTO (RFC 9002 §6.1.2) @@ -2160,9 +2324,10 @@ pub fn Connection(comptime max_streams: usize) type { if (ack.has_ecn) { const ce: u62 = @intCast(@min(ack.ecn_ce, std.math.maxInt(u62))); if (ce > self.ecn_ce_seen[epoch]) { + const ce_delta = ce - self.ecn_ce_seen[epoch]; self.ecn_ce_seen[epoch] = ce; if (result.largest_acked_sent_ns) |_| { - self.congestion.onPacketLost(self.current_time_ns); + self.congestion.onEcnCe(ce_delta, self.current_time_ns); } } } @@ -2176,6 +2341,15 @@ pub fn Connection(comptime max_streams: usize) type { // Refresh PTO timer and time-loss alarm after any ACK. self.pto_deadline_ns = self.loss.ptoDeadline(max_ack_delay_ns); + // With wire-time accounting, retransmissions queued by processLostFrames + // are in bytes_queued (not bytes_in_flight). ptoDeadline returns null + // when bytes_in_flight == 0. Force-arm PTO when queued data exists so + // the server doesn't go silent while pacing drains retransmissions. + if (self.pto_deadline_ns == null and self.bytes_queued > 0) { + const pto_base = self.loss.rtt.ptoBase(max_ack_delay_ns); + const max_i64: u64 = @as(u64, std.math.maxInt(i64)); + self.pto_deadline_ns = self.current_time_ns +| @as(i64, @intCast(@min(pto_base, max_i64))); + } // RFC 9002 §6.2.2.1: server MUST keep PTO armed during handshake even // when bytes_in_flight == 0. The peer may have ACKed our Handshake CRYPTO // at the QUIC level but not yet processed it at the TLS level (e.g. gaps in @@ -2231,6 +2405,30 @@ pub fn Connection(comptime max_streams: usize) type { } } + /// Declare all in-flight packets in `epoch` as lost: invalidate their + /// sent-table entries, reset bytes_in_flight, and queue their stream + /// frames for retransmission. Used on path migration to clean up + /// packets that were sent to the old address and will never be ACKed. + fn declareEpochLost(self: *Self, epoch: u8) void { + const sent = &self.loss.sent; + for (&sent.slots, 0..) |*slot, idx| { + if (!slot.valid or slot.epoch != epoch) continue; + if (slot.in_flight) { + self.loss.bytes_in_flight -|= slot.size; + } + // Queue stream frames from this packet for retransmission. 
+ const fi = sent.frame_info[idx]; + for (fi.frames[0..fi.count]) |f| { + switch (f) { + .stream => |s| self.deferStreamRetx(s.stream_id, s.offset, s.len, s.fin), + else => {}, + } + } + slot.valid = false; + if (epoch < 3) sent.valid_per_epoch[epoch] -|= 1; + } + } + pub fn processLostFrames(self: *Self, result: loss_recovery_mod.AckResult) void { // Sized to MAX_SEND_PACKET_SIZE so getSendData never returns more bytes than // encryptAndEnqueueStreamFrame can encode into pkt_scratch without overflow. @@ -2245,23 +2443,22 @@ pub fn Connection(comptime max_streams: usize) type { // adjacent buffered data beyond the lost frame boundary). const n = @min(st.getSendData(s.offset, &stream_retx_buf), s.len); if (n > 0 or s.fin) { - self.encryptAndEnqueueStreamFrame( - s.stream_id, - s.offset, - stream_retx_buf[0..n], - s.fin, - ) catch { - // Send queue full — defer for retry in drainPendingStreamRetx() - if (self.stream_pending_retx_count < MAX_PENDING_RETX) { - self.stream_pending_retx[self.stream_pending_retx_count] = .{ - .stream_id = s.stream_id, - .offset = s.offset, - .len = @intCast(n), - .fin = s.fin, - }; - self.stream_pending_retx_count += 1; - } + // Cap retransmission queueing to avoid bytes_queued + // exceeding cwnd. When bytes_queued is already at + // or above cwnd, defer remaining retransmissions. + const enqueued = enq: { + if (self.bytes_queued + n + 64 > self.congestion.cwnd) break :enq false; + self.encryptAndEnqueueStreamFrame( + s.stream_id, + s.offset, + stream_retx_buf[0..n], + s.fin, + ) catch break :enq false; + break :enq true; }; + if (!enqueued) { + self.deferStreamRetx(s.stream_id, s.offset, @intCast(n), s.fin); + } } } }, @@ -2303,6 +2500,18 @@ pub fn Connection(comptime max_streams: usize) type { } } + fn deferStreamRetx(self: *Self, stream_id: u62, offset: u62, len: u16, fin: bool) void { + if (self.stream_pending_retx_count < MAX_PENDING_RETX) { + self.stream_pending_retx[self.stream_pending_retx_count] = .{ + .stream_id = stream_id, + .offset = offset, + .len = len, + .fin = fin, + }; + self.stream_pending_retx_count += 1; + } + } + fn drainPendingStreamRetx(self: *Self) void { if (self.stream_pending_retx_count == 0) return; var stream_retx_buf: [MAX_SEND_PACKET_SIZE]u8 = undefined; @@ -2448,8 +2657,9 @@ pub fn Connection(comptime max_streams: usize) type { const pn = self.hot.tx_pn[0]; self.hot.tx_pn[0] += 1; const ct_len = fpos + 16; + const slot_buf = try self.reserveSendSlot(ct_len + 30); const hdr_len = packet.encodeLongHeader( - &self.enc_scratch, + slot_buf, .initial, packet_version, self.peer_scid[0..self.peer_scid_len], @@ -2462,13 +2672,13 @@ pub fn Connection(comptime max_streams: usize) type { self.hot.tx_pn[0] -= 1; return error.PacketTooLarge; } - crypto.encryptPayload(ik, pn, self.enc_scratch[0..hdr_len], self.pkt_scratch[0..fpos], self.enc_scratch[hdr_len..][0..ct_len]); - crypto.applyHeaderProtection(ik, &self.enc_scratch[0], self.enc_scratch[hdr_len - 4 ..][0..4], self.enc_scratch[hdr_len..][0..16]); - try self.enqueueSend(self.enc_scratch[0 .. 
hdr_len + ct_len]); + crypto.encryptPayload(ik, pn, slot_buf[0..hdr_len], self.pkt_scratch[0..fpos], slot_buf[hdr_len..][0..ct_len]); + crypto.applyHeaderProtection(ik, &slot_buf[0], slot_buf[hdr_len - 4 ..][0..4], slot_buf[hdr_len..][0..16]); + self.commitSendSlot(hdr_len + ct_len); var fi = loss_recovery_mod.SentFrameInfo{}; fi.frames[0] = .{ .crypto_frame = .{ .offset = @intCast(offset), .len = @intCast(chunk.len) } }; fi.count = 1; - self.loss.onPacketSent(pn, 0, hdr_len + ct_len, true, self.current_time_ns, fi); + self.storeSendMeta(pn, 0, hdr_len + ct_len, true, fi); } fn sendCryptoChunkEpoch1(self: *Self, chunk: []const u8, offset: u62, fpos: usize) !void { @@ -2476,8 +2686,9 @@ pub fn Connection(comptime max_streams: usize) type { const pn = self.hot.tx_pn[1]; self.hot.tx_pn[1] += 1; const ct_len = fpos + 16; + const slot_buf = try self.reserveSendSlot(ct_len + 30); const hdr_len = packet.encodeLongHeader( - &self.enc_scratch, + slot_buf, .handshake, self.quic_version, self.peer_scid[0..self.peer_scid_len], @@ -2490,13 +2701,13 @@ pub fn Connection(comptime max_streams: usize) type { self.hot.tx_pn[1] -= 1; return error.PacketTooLarge; } - crypto.encryptPayload(hk.server, pn, self.enc_scratch[0..hdr_len], self.pkt_scratch[0..fpos], self.enc_scratch[hdr_len..][0..ct_len]); - crypto.applyHeaderProtection(hk.server, &self.enc_scratch[0], self.enc_scratch[hdr_len - 4 ..][0..4], self.enc_scratch[hdr_len..][0..16]); - try self.enqueueSend(self.enc_scratch[0 .. hdr_len + ct_len]); + crypto.encryptPayload(hk.server, pn, slot_buf[0..hdr_len], self.pkt_scratch[0..fpos], slot_buf[hdr_len..][0..ct_len]); + crypto.applyHeaderProtection(hk.server, &slot_buf[0], slot_buf[hdr_len - 4 ..][0..4], slot_buf[hdr_len..][0..16]); + self.commitSendSlot(hdr_len + ct_len); var fi = loss_recovery_mod.SentFrameInfo{}; fi.frames[0] = .{ .crypto_frame = .{ .offset = @intCast(offset), .len = @intCast(chunk.len) } }; fi.count = 1; - self.loss.onPacketSent(pn, 1, hdr_len + ct_len, true, self.current_time_ns, fi); + self.storeSendMeta(pn, 1, hdr_len + ct_len, true, fi); } // ----------------------------------------------------------------------- @@ -2504,30 +2715,37 @@ pub fn Connection(comptime max_streams: usize) type { // ----------------------------------------------------------------------- pub fn enqueueSend(self: *Self, data: []const u8) !void { - // Use monotonic head/tail subtraction (not modular comparison) to correctly - // detect full queue regardless of wrap-around. + const slot_buf = try self.reserveSendSlot(data.len); + const n = @min(data.len, MAX_SEND_PACKET_SIZE); + @memcpy(slot_buf[0..n], data[0..n]); + self.commitSendSlot(n); + } + + /// Reserve the next send queue slot for zero-copy writes. + /// Returns a pointer to the slot's buffer. The caller writes + /// directly into it (e.g. header encoding + AEAD encryption), + /// then calls commitSendSlot() with the actual length. + /// Checks queue capacity, idle timer, and amplification limit. + fn reserveSendSlot(self: *Self, size: usize) ![]u8 { if (self.sq_tail - self.sq_head >= SEND_QUEUE_DEPTH) return error.SendQueueFull; - // RFC 9000 §10.1.2: restart idle timer when sending a packet. if (self.idle_timeout_i64 > 0) { self.idle_deadline_ns = self.current_time_ns +| self.idle_timeout_i64; } - // Amplification limit: must not send more than 3× received before path - // validation. 
Only enforced once we have received at least one datagram - // (bytes_unvalidated_recv > 0) so that direct enqueueSend calls in tests are - // unaffected before any receive has happened (RFC 9000 §8.1.2). if (!self.path_validated and self.bytes_unvalidated_recv > 0) { - const new_sent = self.bytes_unvalidated_sent +| data.len; + const new_sent = self.bytes_unvalidated_sent +| size; if (new_sent > self.bytes_unvalidated_recv *| 3) { return error.AmplificationLimitExceeded; } self.bytes_unvalidated_sent = new_sent; } - const slot = &self.sq[self.sq_tail & (SEND_QUEUE_DEPTH - 1)]; - const n = @min(data.len, MAX_SEND_PACKET_SIZE); - @memcpy(slot.buf[0..n], data[0..n]); - slot.len = n; + return &self.sq[self.sq_tail & (SEND_QUEUE_DEPTH - 1)].buf; + } + + /// Commit a previously reserved send slot with the actual packet length. + fn commitSendSlot(self: *Self, len: usize) void { + self.sq[self.sq_tail & (SEND_QUEUE_DEPTH - 1)].len = len; self.sq_tail += 1; } @@ -2595,7 +2813,7 @@ pub fn Connection(comptime max_streams: usize) type { try self.enqueueSend(self.enc_scratch[0 .. hdr_len + ct_len]); var fi = loss_recovery_mod.SentFrameInfo{}; fi.count = 0; // ACK is not ack-eliciting; no frame info tracked - self.loss.onPacketSent(pn, 0, hdr_len + ct_len, false, self.current_time_ns, fi); + self.storeSendMeta(pn, 0, hdr_len + ct_len, false, fi); }, 1 => { // Handshake packet: Long Header, handshake keys @@ -2620,7 +2838,7 @@ pub fn Connection(comptime max_streams: usize) type { try self.enqueueSend(self.enc_scratch[0 .. hdr_len + ct_len]); var fi = loss_recovery_mod.SentFrameInfo{}; fi.count = 0; - self.loss.onPacketSent(pn, 1, hdr_len + ct_len, false, self.current_time_ns, fi); + self.storeSendMeta(pn, 1, hdr_len + ct_len, false, fi); }, 2 => { // 1-RTT packet: Short Header, app keys @@ -2652,26 +2870,24 @@ pub fn Connection(comptime max_streams: usize) type { const pn = self.hot.tx_pn[2]; self.hot.tx_pn[2] += 1; - const hdr_len = packet.encodeShortHeader(&self.enc_scratch, self.peer_scid[0..self.peer_scid_len], @intCast(pn), self.current_key_phase); const ct_len = plaintext_len + 16; + // Reserve a send queue slot and encrypt directly into it, + // eliminating a ~1452-byte memcpy per packet. 
+ const slot_buf = self.reserveSendSlot(ct_len + 20) catch |err| { + self.hot.tx_pn[2] -= 1; + return err; + }; + const hdr_len = packet.encodeShortHeader(slot_buf, self.peer_scid[0..self.peer_scid_len], @intCast(pn), self.current_key_phase); if (hdr_len + ct_len > MAX_SEND_PACKET_SIZE) { self.hot.tx_pn[2] -= 1; return error.PacketTooLarge; } - crypto.encryptPayload(ak.server, pn, self.enc_scratch[0..hdr_len], self.pkt_scratch[0..plaintext_len], self.enc_scratch[hdr_len..][0..ct_len]); - crypto.applyHeaderProtection(ak.server, &self.enc_scratch[0], self.enc_scratch[hdr_len - 4 ..][0..4], self.enc_scratch[hdr_len..][0..16]); + crypto.encryptPayload(ak.server, pn, slot_buf[0..hdr_len], self.pkt_scratch[0..plaintext_len], slot_buf[hdr_len..][0..ct_len]); + crypto.applyHeaderProtection(ak.server, &slot_buf[0], slot_buf[hdr_len - 4 ..][0..4], slot_buf[hdr_len..][0..16]); const out_len = hdr_len + ct_len; - self.enqueueSend(self.enc_scratch[0..out_len]) catch |err| { - self.hot.tx_pn[2] -= 1; - return err; - }; + self.commitSendSlot(out_len); - if (fi) |frame_info| { - self.loss.onPacketSent(pn, 2, out_len, ack_eliciting, self.current_time_ns, frame_info); - if (ack_eliciting) { - self.pto_deadline_ns = self.loss.ptoDeadline(self.cached_max_ack_delay_ns); - } - } + self.storeSendMeta(pn, 2, out_len, ack_eliciting, fi orelse .{}); return pn; } @@ -2903,7 +3119,7 @@ pub fn Connection(comptime max_streams: usize) type { } }; fi.count = 1; self.crypto_send_offset[0] += chunk_len; - self.loss.onPacketSent(pn, 0, hdr_len + ct_len, true, self.current_time_ns, fi); + self.storeSendMeta(pn, 0, hdr_len + ct_len, true, fi); }, 1 => { const hk = self.hs_keys.?.server; @@ -2940,7 +3156,7 @@ pub fn Connection(comptime max_streams: usize) type { } }; fi.count = 1; self.crypto_send_offset[1] += chunk_len; - self.loss.onPacketSent(pn, 1, hdr_len + ct_len, true, self.current_time_ns, fi); + self.storeSendMeta(pn, 1, hdr_len + ct_len, true, fi); }, else => unreachable, } @@ -2954,7 +3170,6 @@ pub fn Connection(comptime max_streams: usize) type { @memcpy(self.crypto_send_saved[epoch][old..end], chunk); self.crypto_send_saved_len[epoch] = @intCast(end); } - self.pto_deadline_ns = self.loss.ptoDeadline(self.cached_max_ack_delay_ns); return chunk_len; } @@ -3031,7 +3246,7 @@ pub fn Connection(comptime max_streams: usize) type { var fi = loss_recovery_mod.SentFrameInfo{}; fi.frames[0] = .{ .crypto_frame = .{ .offset = @intCast(sent), .len = @intCast(chunk.len) } }; fi.count = 1; - self.loss.onPacketSent(pn, 0, hdr_len + ct_len, true, self.current_time_ns, fi); + self.storeSendMeta(pn, 0, hdr_len + ct_len, true, fi); } else { const hk = self.hs_keys orelse break; const pn = self.hot.tx_pn[1]; @@ -3057,7 +3272,7 @@ pub fn Connection(comptime max_streams: usize) type { var fi = loss_recovery_mod.SentFrameInfo{}; fi.frames[0] = .{ .crypto_frame = .{ .offset = @intCast(sent), .len = @intCast(chunk.len) } }; fi.count = 1; - self.loss.onPacketSent(pn, 1, hdr_len + ct_len, true, self.current_time_ns, fi); + self.storeSendMeta(pn, 1, hdr_len + ct_len, true, fi); } sent += chunk.len; @@ -3263,6 +3478,23 @@ pub fn Connection(comptime max_streams: usize) type { var fpos: usize = 0; fpos += frame.encodeFrame(self.pkt_scratch[fpos..], .{ .path_challenge = .{ .data = data } }); _ = self.sendShortHeaderPacket(fpos, null, false) catch return; + self.moveLastToFront(); + } + + /// Move the last enqueued packet to the front of the send queue. 
+ /// Used for PATH_CHALLENGE so it is the first packet sent on a new + /// path, bypassing any pacing-blocked data without reordering the FIFO. + fn moveLastToFront(self: *Self) void { + if (self.sq_tail -% self.sq_head < 2) return; // only 0-1 items, nothing to move + const mask = SEND_QUEUE_DEPTH - 1; + const tail_idx = (self.sq_tail -% 1) & mask; + self.sq_head -%= 1; + const head_idx = self.sq_head & mask; + if (head_idx != tail_idx) { + self.sq[head_idx] = self.sq[tail_idx]; + self.sq_meta[head_idx] = self.sq_meta[tail_idx]; + } + self.sq_tail -%= 1; } /// Process a NEW_CONNECTION_ID frame: store the CID and retire entries below retire_prior_to. @@ -3503,13 +3735,34 @@ pub fn Connection(comptime max_streams: usize) type { /// Handle a source address change: reset congestion, request path validation. fn onPathMigration(self: *Self, new_addr: SocketAddr, io: std.Io) !void { - // RFC 9000 §9.4: reset congestion controller on path change. - self.congestion = cubic_mod.Cubic.init(); + // RFC 9000 §9.4 permits resetting congestion state on migration, + // but resetting cwnd to INITIAL_CWND kills throughput: the server + // must re-probe bandwidth from scratch after every address change. + // Instead, preserve the congestion controller. Reset smoothed_rtt + // and rtt_var (the new path may differ), but KEEP min_rtt — resetting + // it to the 10ms default causes time-loss thresholds (9/8 × 10ms) to + // fire before retransmitted packets can be ACKed on a 30ms path, + // creating an infinite retransmission loop. + const saved_min_rtt = self.loss.rtt.min_rtt; + self.loss.rtt = loss_recovery_mod.RttEstimator{}; + self.loss.rtt.min_rtt = saved_min_rtt; + self.loss.pto_count = 0; + // Don't proactively retransmit all in-flight packets — many may + // have already been received by the client (ACKs still in transit). + // Reset bytes_in_flight to unblock the cwnd check and clear + // in_flight flags so old packets don't subtract from the counter + // when later ACKed (which would desync bif and kill PTO). + self.loss.bytes_in_flight = 0; + self.loss.sent.clearInflight(); + self.time_loss_alarm_ns = null; + self.pto_deadline_ns = self.current_time_ns +| @as(i64, @intCast(self.loss.rtt.ptoBase(self.cached_max_ack_delay_ns))); // RFC 9000 §9.4: reset amplification limit for the new path (separate from old path tracking). // Each path must independently satisfy the 3x amplification limit until validated. self.bytes_unvalidated_recv = 0; self.bytes_unvalidated_sent = 0; // Immediately adopt new address (RFC 9000 §9.3.1). + // Save old address so late-arriving packets don't trigger re-migration. + self.prev_peer_addr = self.peer_addr; self.peer_addr = new_addr; // RFC 9000 §9.3: reset path validation on migration — must re-validate new path. self.path_validated = false; @@ -3525,6 +3778,7 @@ pub fn Connection(comptime max_streams: usize) type { /// source port changed. Preserves congestion state for throughput. fn onNatRebind(self: *Self, new_addr: SocketAddr, io: std.Io) !void { // Adopt new address without resetting congestion or path validation. + self.prev_peer_addr = self.peer_addr; self.peer_addr = new_addr; // Still send PATH_CHALLENGE to confirm reachability on the new port. 
var challenge: [8]u8 = undefined; diff --git a/src/quic/connection_test_basic.zig b/src/quic/connection_test_basic.zig index 19ad348..304257b 100644 --- a/src/quic/connection_test_basic.zig +++ b/src/quic/connection_test_basic.zig @@ -17,6 +17,7 @@ const frame = @import("frame.zig"); const loss_recovery_mod = @import("loss_recovery.zig"); const stream_mod = @import("stream.zig"); const tls = @import("tls.zig"); +const cc_mod = @import("congestion/cc.zig"); test "connection: hot struct is 64 bytes" { const testing = std.testing; @@ -39,7 +40,7 @@ test "connection: send returns 0 when queue empty" { var conn = try Connection(16).accept(.{}, io); var out: [MAX_PACKET_SIZE]u8 = undefined; const testing = std.testing; - try testing.expectEqual(@as(usize, 0), conn.send(&out)); + try testing.expectEqual(@as(usize, 0), conn.send(&out, 0)); } test "connection: enqueue and drain send queue" { @@ -49,7 +50,7 @@ test "connection: enqueue and drain send queue" { try conn.enqueueSend(&data); var out: [8]u8 = undefined; - const n = conn.send(&out); + const n = conn.send(&out, 0); const testing = std.testing; try testing.expectEqual(@as(usize, 4), n); try testing.expectEqualSlices(u8, &data, out[0..n]); @@ -84,7 +85,7 @@ test "connection: unknown version triggers VN response" { // A Version Negotiation packet should be queued. var out: [64]u8 = undefined; - const n = conn.send(&out); + const n = conn.send(&out, 0); try testing.expect(n > 0); // VN packet has version 0x00000000. @@ -115,7 +116,7 @@ test "connection: ver=0 packet does not trigger VN response" { // No VN response must be queued for a ver=0 packet. var out: [64]u8 = undefined; - const n = conn.send(&out); + const n = conn.send(&out, 0); try testing.expectEqual(@as(usize, 0), n); } @@ -151,7 +152,7 @@ test "loss: onPacketSent wires bytes_in_flight and pto_deadline" { const io = std.testing.io; var conn = try Connection(16).accept(.{}, io); conn.current_time_ns = 1_000_000; - conn.loss.onPacketSent(1, 0, 1200, true, conn.current_time_ns, .{}); + conn.loss.onPacketSent(1, 0, 1200, true, conn.current_time_ns, conn.current_time_ns, .{}); try testing.expectEqual(@as(u64, 1200), conn.loss.bytes_in_flight); try testing.expect(conn.loss.ptoDeadline(conn.cached_max_ack_delay_ns) != null); } @@ -180,7 +181,7 @@ test "loss: onAckReceived decrements bytes_in_flight" { const io = std.testing.io; var conn = try Connection(16).accept(.{}, io); conn.current_time_ns = 0; - conn.loss.onPacketSent(1, 0, 1200, true, 0, .{}); + conn.loss.onPacketSent(1, 0, 1200, true, 0, 0, .{}); try testing.expectEqual(@as(u64, 1200), conn.loss.bytes_in_flight); const ranges = [_]loss_recovery_mod.AckedRange{.{ .low = 1, .high = 1 }}; @@ -204,7 +205,7 @@ test "connection: send queue full returns SendQueueFull error" { // Drain one slot: now there is room again var out: [8]u8 = undefined; - _ = conn.send(&out); + _ = conn.send(&out, 0); try conn.enqueueSend(&data); // must succeed now } @@ -215,7 +216,7 @@ test "connection: processAck uses packet epoch not connection epoch" { conn.current_time_ns = 0; conn.hot.tx_pn[0] = 2; // pretend pn=0 and pn=1 were sent in epoch 0 - conn.loss.onPacketSent(1, 0, 1200, true, 0, .{}); + conn.loss.onPacketSent(1, 0, 1200, true, 0, 0, .{}); try testing.expectEqual(@as(u64, 1200), conn.loss.bytes_in_flight); const ack = frame.AckFrame{ @@ -314,7 +315,7 @@ test "connection: version 0 packet is silently ignored" { // No packet should be queued (VN response is NOT sent for version-0 packets). 
var out: [64]u8 = undefined; - try testing.expectEqual(@as(usize, 0), conn.send(&out)); + try testing.expectEqual(@as(usize, 0), conn.send(&out, 0)); } // --------------------------------------------------------------------------- @@ -573,7 +574,7 @@ test "close: draining state suppresses send()" { // Queue something try conn.enqueueSend(&[_]u8{0x01}); var out: [8]u8 = undefined; - try testing.expectEqual(@as(usize, 0), conn.send(&out)); + try testing.expectEqual(@as(usize, 0), conn.send(&out, 0)); } test "close: nextTimeout includes drain_deadline" { @@ -831,21 +832,21 @@ test "security: VN rate limit suppresses same version within 60s" { // First unknown version: send VN conn.receive(&pkt, src, 0, 0, io) catch {}; var out: [64]u8 = undefined; - try testing.expect(conn.send(&out) > 0); + try testing.expect(conn.send(&out, 0) > 0); // Same version within 60s: throttle (no VN) conn.receive(&pkt, src, 30_000_000_000, 0, io) catch {}; // +30s - try testing.expectEqual(@as(usize, 0), conn.send(&out)); + try testing.expectEqual(@as(usize, 0), conn.send(&out, 0)); // Different unknown version within 60s of first: send VN (different version) std.mem.writeInt(u32, pkt[1..5], 0x00000003, .big); // different version conn.receive(&pkt, src, 35_000_000_000, 0, io) catch {}; - try testing.expect(conn.send(&out) > 0); + try testing.expect(conn.send(&out, 0) > 0); // First version after 60s: send VN again (cooldown expired) std.mem.writeInt(u32, pkt[1..5], 0x00000002, .big); conn.receive(&pkt, src, 61_000_000_000, 0, io) catch {}; // +61s - try testing.expect(conn.send(&out) > 0); + try testing.expect(conn.send(&out, 0) > 0); } test "event_queue: wraparound maintains FIFO order" { @@ -965,7 +966,9 @@ test "loss: multi-packet loss triggers single congestion event" { conn.current_time_ns = 1_000_000_000; // Force CUBIC into congestion avoidance with a known large window. - conn.congestion.ssthresh = 0; // cwnd always > ssthresh=0 → CUBIC always used + if (cc_mod.selected == .cubic) { + conn.congestion.ssthresh = 0; // cwnd always > ssthresh=0 → CUBIC always used + } conn.congestion.cwnd = 100 * 1200; // 120000 bytes (100 × MSS) const initial_cwnd = conn.congestion.cwnd; @@ -973,7 +976,7 @@ test "loss: multi-packet loss triggers single congestion event" { conn.hot.tx_pn[0] = 11; // pretend pn=0..10 were sent var pn: u64 = 1; while (pn <= 10) : (pn += 1) { - conn.loss.onPacketSent(pn, 0, 1200, true, 0, .{}); + conn.loss.onPacketSent(pn, 0, 1200, true, 0, 0, .{}); } // ACK only pn=10; pn=1..7 satisfy K_PACKET_THRESHOLD and are declared lost. @@ -989,8 +992,14 @@ test "loss: multi-packet loss triggers single congestion event" { }; try conn.processAck(ack, 0); - const expected: u64 = @intFromFloat(@as(f64, @floatFromInt(initial_cwnd)) * 0.7); - try testing.expectEqual(expected, conn.congestion.cwnd); + if (cc_mod.selected == .cubic) { + // CUBIC: cwnd reduced by BETA_CUBIC (0.7). + const expected: u64 = @intFromFloat(@as(f64, @floatFromInt(initial_cwnd)) * 0.7); + try testing.expectEqual(expected, conn.congestion.cwnd); + } else { + // BBR: loss doesn't directly reduce cwnd (handled via delivery rate). 
+ try testing.expect(conn.congestion.cwnd > 0); + } } // --------------------------------------------------------------------------- @@ -1031,7 +1040,7 @@ test "connection: PATH_CHALLENGE without app_keys is silently consumed (no panic conn.processFrames(buf[0..n], 2, null) catch {}; var out: [64]u8 = undefined; - try testing.expectEqual(@as(usize, 0), conn.send(&out)); + try testing.expectEqual(@as(usize, 0), conn.send(&out, 0)); } test "connection: PATH_RESPONSE is silently consumed" { @@ -1046,7 +1055,7 @@ test "connection: PATH_RESPONSE is silently consumed" { // No event, no packet queued try testing.expectEqual(@as(?Event, null), conn.pollEvent()); var out: [64]u8 = undefined; - try testing.expectEqual(@as(usize, 0), conn.send(&out)); + try testing.expectEqual(@as(usize, 0), conn.send(&out, 0)); } // --------------------------------------------------------------------------- @@ -1189,7 +1198,7 @@ test "connection: Version Negotiation DCID echoes full client SCID (RFC 9000 §6 // Grab the VN packet from the send queue. var out: [256]u8 = undefined; - const n = conn.send(&out); + const n = conn.send(&out, 0); try testing.expect(n > 0); // First byte: long header (0x80 set). diff --git a/src/quic/connection_test_crypto.zig b/src/quic/connection_test_crypto.zig index fbb037f..4ff5b96 100644 --- a/src/quic/connection_test_crypto.zig +++ b/src/quic/connection_test_crypto.zig @@ -11,6 +11,7 @@ const SocketAddr = conn_mod.SocketAddr; const frame = @import("frame.zig"); const loss_recovery_mod = @import("loss_recovery.zig"); const tls = @import("tls.zig"); +const cc_mod = @import("congestion/cc.zig"); const packet = @import("packet.zig"); const crypto = @import("crypto.zig"); const transport_params = @import("transport_params.zig"); @@ -58,7 +59,7 @@ test "ecn: CE count increase triggers congestion event (cwnd reduces)" { // Record a sent packet so largest_acked_sent_ns is populated conn.hot.tx_pn[2] = 2; // pretend pn=0..1 were sent in epoch 2 - conn.loss.onPacketSent(1, 2, 1200, true, 1_000_000_000, .{}); + conn.loss.onPacketSent(1, 2, 1200, true, 1_000_000_000, 1_000_000_000, .{}); const initial_cwnd = conn.congestion.cwnd; @@ -76,8 +77,13 @@ test "ecn: CE count increase triggers congestion event (cwnd reduces)" { // CE count recorded try testing.expectEqual(@as(u62, 1), conn.ecn_ce_seen[2]); - // cwnd must have been reduced (congestion event) - try testing.expect(conn.congestion.cwnd < initial_cwnd); + // Congestion event: CUBIC reduces cwnd, BBR reduces inflight_hi. + if (cc_mod.selected == .cubic) { + try testing.expect(conn.congestion.cwnd < initial_cwnd); + } else { + // BBR: inflight_hi should have been reduced by onEcnCe. 
+ try testing.expect(conn.congestion.inflight_hi < std.math.maxInt(u64)); + } } test "ecn: CE count non-increase is ignored (monotonic guard)" { @@ -89,12 +95,12 @@ test "ecn: CE count non-increase is ignored (monotonic guard)" { conn_ecn.current_time_ns = 1_000_000_000; conn_ecn.ecn_ce_seen[2] = 5; // already seen 5 conn_ecn.hot.tx_pn[2] = 2; // pretend pn=0..1 were sent - conn_ecn.loss.onPacketSent(1, 2, 1200, true, 1_000_000_000, .{}); + conn_ecn.loss.onPacketSent(1, 2, 1200, true, 1_000_000_000, 1_000_000_000, .{}); var conn_plain = try Connection(1).accept(.{}, io); conn_plain.current_time_ns = 1_000_000_000; conn_plain.hot.tx_pn[2] = 2; - conn_plain.loss.onPacketSent(1, 2, 1200, true, 1_000_000_000, .{}); + conn_plain.loss.onPacketSent(1, 2, 1200, true, 1_000_000_000, 1_000_000_000, .{}); const ack_ecn = frame.AckFrame{ .largest_acked = 1, @@ -134,12 +140,12 @@ test "ecn: CE count = 0 with has_ecn=true is a no-op (no congestion)" { var conn_ecn = try Connection(1).accept(.{}, io); conn_ecn.current_time_ns = 1_000_000_000; conn_ecn.hot.tx_pn[2] = 2; - conn_ecn.loss.onPacketSent(1, 2, 1200, true, 1_000_000_000, .{}); + conn_ecn.loss.onPacketSent(1, 2, 1200, true, 1_000_000_000, 1_000_000_000, .{}); var conn_plain = try Connection(1).accept(.{}, io); conn_plain.current_time_ns = 1_000_000_000; conn_plain.hot.tx_pn[2] = 2; - conn_plain.loss.onPacketSent(1, 2, 1200, true, 1_000_000_000, .{}); + conn_plain.loss.onPacketSent(1, 2, 1200, true, 1_000_000_000, 1_000_000_000, .{}); const ack_ecn = frame.AckFrame{ .largest_acked = 1, @@ -178,7 +184,7 @@ test "ecn: has_ecn=false ACK does not touch ecn_ce_seen" { conn.current_time_ns = 1_000_000_000; conn.ecn_ce_seen[2] = 99; // pre-set to a non-zero value conn.hot.tx_pn[2] = 2; - conn.loss.onPacketSent(1, 2, 1200, true, 1_000_000_000, .{}); + conn.loss.onPacketSent(1, 2, 1200, true, 1_000_000_000, 1_000_000_000, .{}); const ack = frame.AckFrame{ .largest_acked = 1, @@ -677,7 +683,7 @@ test "connection: processAck multi-range gap decoding does not ack gap packets" // Register 8 in-flight packets (pn 0-7) in epoch 2 (1-RTT). conn.hot.tx_pn[2] = 8; // pretend pn 0-7 were sent for (0..8) |pn| { - conn.loss.onPacketSent(@intCast(pn), 2, 1200, true, conn.current_time_ns, .{}); + conn.loss.onPacketSent(@intCast(pn), 2, 1200, true, conn.current_time_ns, conn.current_time_ns, .{}); } try testing.expectEqual(@as(u64, 8 * 1200), conn.loss.bytes_in_flight); @@ -1930,7 +1936,7 @@ test "connection: ACK ack_delay scaled by cached_ack_delay_exp" { conn.hot.tx_pn[2] = 1; // pretend we sent packet #0 // Seed loss recovery with a sent packet so RTT can update. 
const fi = loss_recovery_mod.SentFrameInfo{}; - conn.loss.onPacketSent(0, 2, 100, true, 0, fi); + conn.loss.onPacketSent(0, 2, 100, true, 0, 0, fi); const ack_f: frame.Frame = .{ .ack = .{ diff --git a/src/quic/connection_test_frames.zig b/src/quic/connection_test_frames.zig index ed0faa4..9edfd17 100644 --- a/src/quic/connection_test_frames.zig +++ b/src/quic/connection_test_frames.zig @@ -15,6 +15,7 @@ const frame = @import("frame.zig"); const loss_recovery_mod = @import("loss_recovery.zig"); const stream_mod = @import("stream.zig"); const tls = @import("tls.zig"); +const cc_mod = @import("congestion/cc.zig"); const packet = @import("packet.zig"); const crypto = @import("crypto.zig"); const transport_params = @import("transport_params.zig"); @@ -64,16 +65,18 @@ test "connection: persistent congestion collapses cwnd to 2*MSS" { var conn = try Connection(16).accept(.{}, io); conn.congestion.cwnd = 100 * 1200; - conn.congestion.ssthresh = 0; // always in CUBIC phase + if (cc_mod.selected == .cubic) { + conn.congestion.ssthresh = 0; // always in CUBIC phase + } conn.current_time_ns = 0; conn.hot.tx_pn[0] = 9; // pretend pn=0..8 were sent - conn.loss.onPacketSent(1, 0, 1200, true, 0, .{}); - conn.loss.onPacketSent(2, 0, 1200, true, 0, .{}); - conn.loss.onPacketSent(3, 0, 1200, true, 0, .{}); - conn.loss.onPacketSent(4, 0, 1200, true, 0, .{}); - conn.loss.onPacketSent(5, 0, 1200, true, 3_200_000_000, .{}); - conn.loss.onPacketSent(8, 0, 1200, true, 3_200_000_000, .{}); + conn.loss.onPacketSent(1, 0, 1200, true, 0, 0, .{}); + conn.loss.onPacketSent(2, 0, 1200, true, 0, 0, .{}); + conn.loss.onPacketSent(3, 0, 1200, true, 0, 0, .{}); + conn.loss.onPacketSent(4, 0, 1200, true, 0, 0, .{}); + conn.loss.onPacketSent(5, 0, 1200, true, 3_200_000_000, 3_200_000_000, .{}); + conn.loss.onPacketSent(8, 0, 1200, true, 3_200_000_000, 3_200_000_000, .{}); const ack = frame.AckFrame{ .largest_acked = 8, @@ -88,8 +91,12 @@ test "connection: persistent congestion collapses cwnd to 2*MSS" { conn.current_time_ns = 3_200_000_000; try conn.processAck(ack, 0); - // Persistent congestion → cwnd = 2 * MSS = 2904 (MSS=1452) - try testing.expectEqual(@as(u64, 2 * 1452), conn.congestion.cwnd); + // Persistent congestion: CUBIC → cwnd = 2*MSS, BBR → cwnd = 4*MSS (BBR_MIN_CWND). + if (cc_mod.selected == .cubic) { + try testing.expectEqual(@as(u64, 2 * 1452), conn.congestion.cwnd); + } else { + try testing.expectEqual(@as(u64, 4 * 1452), conn.congestion.cwnd); + } } // --------------------------------------------------------------------------- @@ -156,7 +163,7 @@ test "security: amplification limit lifted after path_validated" { try conn.enqueueSend(&[_]u8{0x01} ** 100); // Verify the send queue actually accepted the bytes. var out: [MAX_PACKET_SIZE]u8 = undefined; - try testing.expect(conn.send(&out) > 0); + try testing.expect(conn.send(&out, 0) > 0); } // SEC-006: Frame-type per epoch enforcement @@ -607,8 +614,10 @@ test "connection: migration resets congestion" { const new_src = SocketAddr{ .v4 = .{ .addr = [4]u8{ 10, 0, 0, 1 }, .port = 5000 } }; var empty = [_]u8{}; try conn.receive(&empty, new_src, 0, 0, io); - // RFC 9002 §7.2: initial_window = min(10*1452, max(14720, 2*1452)) = 14520. - try testing.expectEqual(@as(u64, 14520), conn.congestion.cwnd); + // Congestion state (cwnd) is preserved across migration to avoid throughput + // collapse during rapid address changes. RTT and PTO are reset instead. 
+ try testing.expectEqual(@as(u64, 999_999), conn.congestion.cwnd); + try testing.expect(!conn.loss.rtt.initialized); // RTT was reset } test "connection: migration sets path_validated false" { diff --git a/src/quic/connection_test_handshakecorruption.zig b/src/quic/connection_test_handshakecorruption.zig index b948b5e..514da1e 100644 --- a/src/quic/connection_test_handshakecorruption.zig +++ b/src/quic/connection_test_handshakecorruption.zig @@ -177,7 +177,7 @@ test "time-loss alarm fires for STREAM pkn with sub-threshold gap" { conn.queuePing() catch {}; var buf: [1500]u8 = undefined; - while (conn.send(&buf) > 0) {} + while (conn.send(&buf, 0) > 0) {} // ACK [6..7],[1..3] — gap at [0,4,5]. pkn 5 gap = 2 < threshold 3. var ranges: [32]frame.AckRange = undefined; @@ -185,7 +185,7 @@ test "time-loss alarm fires for STREAM pkn with sub-threshold gap" { ranges[1] = .{ .gap = 1, .ack_range = 2 }; conn.current_time_ns = t0 + 100_000_000; conn.processAck(makeAck(7, 2, ranges), 2) catch {}; - while (conn.send(&buf) > 0) {} + while (conn.send(&buf, 0) > 0) {} try testing.expect(conn.time_loss_alarm_ns != null); const alarm = conn.time_loss_alarm_ns.?; @@ -194,7 +194,7 @@ test "time-loss alarm fires for STREAM pkn with sub-threshold gap" { var total: usize = 0; while (true) { - const n = conn.send(&buf); + const n = conn.send(&buf, 0); if (n == 0) break; total += n; } @@ -224,25 +224,25 @@ test "full retransmission lifecycle: loss → retransmit → PTO → re-probe" { conn.queuePing() catch {}; var buf: [1500]u8 = undefined; - while (conn.send(&buf) > 0) {} + while (conn.send(&buf, 0) > 0) {} var ranges: [32]frame.AckRange = undefined; ranges[0] = .{ .gap = 0, .ack_range = 1 }; ranges[1] = .{ .gap = 1, .ack_range = 2 }; conn.current_time_ns = t0 + 100_000_000; conn.processAck(makeAck(7, 2, ranges), 2) catch {}; - while (conn.send(&buf) > 0) {} + while (conn.send(&buf, 0) > 0) {} const alarm = conn.time_loss_alarm_ns orelse return error.TestUnexpectedResult; conn.tick(alarm + 1); - while (conn.send(&buf) > 0) {} + while (conn.send(&buf, 0) > 0) {} try testing.expect(conn.pto_deadline_ns != null); const pto1 = conn.pto_deadline_ns.?; conn.tick(pto1 + 1); var probe_sent = false; - while (conn.send(&buf) > 0) { + while (conn.send(&buf, 0) > 0) { probe_sent = true; } try testing.expect(probe_sent); @@ -276,7 +276,7 @@ test "PTO skips Initial retransmit when hs_keys exist to preserve budget for Han conn.retransmitCryptoSaved(1); var buf: [1500]u8 = undefined; - while (conn.send(&buf) > 0) {} + while (conn.send(&buf, 0) > 0) {} const remaining = (conn.bytes_unvalidated_recv *| 3) -| conn.bytes_unvalidated_sent; // Budget should be consumed by Handshake, not wasted on Initial @@ -326,6 +326,9 @@ test "sendShortHeaderPacket arms PTO for ack-eliciting packets" { conn.current_time_ns = 1_000_000_000; conn.pto_deadline_ns = null; conn.queuePing() catch {}; + // Move queued packet to wire so PTO is armed at wire-time. 
+ var buf: [1500]u8 = undefined; + _ = conn.send(&buf, 0); try testing.expect(conn.pto_deadline_ns != null); } @@ -338,7 +341,7 @@ test "processLostFrames retransmits STREAM directly when send queue has space" { conn.streamSend(0, &([_]u8{0xAA} ** 100), true) catch return error.TestUnexpectedResult; var buf: [1500]u8 = undefined; - while (conn.send(&buf) > 0) {} + while (conn.send(&buf, 0) > 0) {} var result = loss_recovery_mod.AckResult{}; result.lost_frame_count = 1; @@ -386,6 +389,9 @@ test "pending stream retransmit arms PTO when drained via tick" { conn.tick(t0 + 1); try testing.expectEqual(@as(u8, 0), conn.stream_pending_retx_count); + // Move queued packet to wire so bytes_in_flight and PTO are updated. + var buf2: [1500]u8 = undefined; + _ = conn.send(&buf2, 0); try testing.expect(conn.loss.bytes_in_flight > 0); try testing.expect(conn.pto_deadline_ns != null); } diff --git a/src/quic/connection_test_pmtud.zig b/src/quic/connection_test_pmtud.zig index 67fd9b6..b8861b4 100644 --- a/src/quic/connection_test_pmtud.zig +++ b/src/quic/connection_test_pmtud.zig @@ -396,14 +396,13 @@ test "PMTUD: probe packet is marked ack-eliciting" { // Queue probe at realistic size (< MAX_PACKET_SIZE) try conn.queuePmtudProbe(1200); - // Verify it was registered in loss recovery as ack-eliciting - // (The onPacketSent call in queuePmtudProbe passes true for ack_eliciting) + // Verify the probe was queued and its send-queue metadata is ack-eliciting. + // (onPacketSent records into loss.sent only when send() dequeues the packet; + // here we verify the queue metadata directly.) try testing.expect(conn.pmtud_probing != null); - const pn = conn.pmtud_probing.?.packet_number; - - // Look up in loss recovery to verify it was tracked - const sent_pkt = conn.loss.sent.get(pn, 2); // epoch 2 = 1-RTT - try testing.expect(sent_pkt != null); + try testing.expect(conn.sq_head < conn.sq_tail); // packet is in the send queue + const meta = conn.sq_meta[(conn.sq_tail -% 1) & (conn_mod.SEND_QUEUE_DEPTH - 1)]; + try testing.expect(meta.ack_eliciting); } test "PMTUD: doesn't probe if already at maximum" { @@ -779,7 +778,7 @@ test "retry: validate_addr=true, no token: retry_sent event and Retry packet que // A Retry packet must be in the send queue var out: [256]u8 = undefined; - const n = conn.send(&out); + const n = conn.send(&out, 0); try testing.expect(n > 0); // Retry first byte is 0xff (v1: type bits 0b11, unused=0xf) try testing.expectEqual(@as(u8, 0xff), out[0]); diff --git a/src/quic/fuzz.zig b/src/quic/fuzz.zig index 0791d0b..cc8feac 100644 --- a/src/quic/fuzz.zig +++ b/src/quic/fuzz.zig @@ -238,7 +238,7 @@ fn fuzzLossRecoveryLoop(_: void, input: FuzzInput) anyerror!void { switch (op) { 0 => { // Send a packet - lr.onPacketSent(pn, epoch, 1200, ack_eliciting, now_ns, .{}); + lr.onPacketSent(pn, epoch, 1200, ack_eliciting, now_ns, now_ns, .{}); pn += 1; now_ns += 1_000_000; // +1ms }, diff --git a/src/quic/loss_recovery.zig b/src/quic/loss_recovery.zig index bc67e8e..f3349f4 100644 --- a/src/quic/loss_recovery.zig +++ b/src/quic/loss_recovery.zig @@ -19,12 +19,15 @@ pub const K_GRANULARITY_NS: u64 = 1_000_000; // 1ms minimum timer granularity pub const K_INITIAL_RTT_NS: u64 = 10_000_000; // 10ms — balanced conservative estimate pub const MAX_SENT: usize = 256; // Ring buffer capacity pub const MAX_FRAMES_PER_PACKET: usize = 4; -// Per-ACK capacity for acked/lost frame tracking. +// Per-ACK capacity for lost frame tracking. 
// Lost frames: detectLoss defers packets that don't fit to the next alarm round // (see detectLoss — skips eviction instead of silently dropping retransmit info). -// Acked frames: each ACK typically covers only a few newly-acked packets in -// practice, so 64 is sufficient for acked_frames. pub const MAX_LOSS_EVENTS: usize = 64; +// Acked frames: must match the largest epoch's sent buffer (EPOCH_SIZES[2] = 128) +// so that a single ACK covering all in-flight packets never overflows. Overflow +// silently drops frame info, preventing send_acked from advancing (same class of +// bug as SACK overflow — permanent stream buffer stall). +pub const MAX_ACKED_FRAMES: usize = MAX_SENT / 2; // 128 — matches epoch 2 // --------------------------------------------------------------------------- // FrameInfo — per-frame metadata for retransmission @@ -111,12 +114,33 @@ pub const RttEstimator = struct { pub const SentPacket = struct { pn: u64, - sent_ns: i64, + sent_ns: i64, // wire time — for loss detection / RTT measurement + queued_ns: i64 = 0, // queue time — when the packet entered the send queue (rate snapshots use wire time; see onPacketSent) size: u16, epoch: u8, ack_eliciting: bool, in_flight: bool, valid: bool, // true = slot occupied + + // BBR delivery rate tracking: + delivered: u64 = 0, // total bytes delivered at send time + delivered_ns: i64 = 0, // timestamp of last delivery at send time + first_sent_ns: i64 = 0, // send time of first packet in current delivery sample + is_app_limited: bool = false, // was sender app-limited when this was sent? +}; + +/// Per-ACK delivery rate sample, passed to the congestion controller. +pub const DeliveryRateSample = @import("congestion/common.zig").DeliveryRateSample; + +/// Connection-level delivery tracking counters (lives on LossRecovery). +pub const DeliveryState = struct { + delivered: u64 = 0, // cumulative bytes delivered + delivered_ns: i64 = 0, // time of most recent delivery + first_sent_ns: i64 = 0, // send time of first undelivered packet + app_limited: bool = false, // currently app-limited? + + // Round-trip counting (BBR uses rounds, not wall clock). + next_round_delivered: u64 = 0, }; pub const AckedRange = struct { low: u64, high: u64 }; @@ -139,8 +163,23 @@ pub const AckResult = struct { lost_frame_count: usize = 0, /// Epoch for each lost packet (parallel to lost_frames) lost_epochs: [MAX_LOSS_EVENTS]u8 = undefined, - acked_frames: [MAX_LOSS_EVENTS]SentFrameInfo = undefined, + acked_frames: [MAX_ACKED_FRAMES]SentFrameInfo = undefined, acked_frame_count: usize = 0, + /// Delivery rate sample for BBR (computed by LossRecovery.onAckReceived). + delivery_rate_sample: DeliveryRateSample = .{}, + // Internal: delivery snapshot from the highest-pn acked packet. + // Used only within LossRecovery to compute delivery_rate_sample. + delivery_snap: DeliverySnapshot = .{}, +}; + +/// Internal snapshot of delivery metadata from the highest-pn acked packet. +const DeliverySnapshot = struct { + delivered: u64 = 0, + delivered_ns: i64 = 0, + first_sent_ns: i64 = 0, + sent_ns: i64 = 0, + is_app_limited: bool = false, + pn: u64 = 0, }; /// Returned by remove() — carries both the packet metadata and its frame info. @@ -198,6 +237,16 @@ pub const SentPacketTable = struct { return evicted; } + /// Clear in_flight on all valid packets. Used during path migration: + /// bytes_in_flight is reset to 0, so old packets must not subtract + /// from it when later ACKed. Packets remain valid for delivery rate + /// tracking and ACK processing.
+ pub fn clearInflight(self: *SentPacketTable) void { + for (&self.slots) |*slot| { + if (slot.valid) slot.in_flight = false; + } + } + /// O(1) lookup. Returns null if slot is empty or belongs to a different pn/epoch. pub fn get(self: *const SentPacketTable, pn: u64, epoch: u8) ?SentPacket { const idx = slotIndex(pn, epoch); @@ -229,10 +278,21 @@ pub const SentPacketTable = struct { if (entry.pkt.in_flight) { bif.* = if (bif.* >= entry.pkt.size) bif.* - entry.pkt.size else 0; } - if (result.acked_frame_count < MAX_LOSS_EVENTS) { + if (result.acked_frame_count < MAX_ACKED_FRAMES) { result.acked_frames[result.acked_frame_count] = entry.fi; result.acked_frame_count += 1; } + // Track the highest-pn acked packet's delivery metadata for rate computation. + if (entry.pkt.pn >= result.delivery_snap.pn) { + result.delivery_snap = .{ + .pn = entry.pkt.pn, + .delivered = entry.pkt.delivered, + .delivered_ns = entry.pkt.delivered_ns, + .first_sent_ns = entry.pkt.first_sent_ns, + .sent_ns = entry.pkt.sent_ns, // wire time + .is_app_limited = entry.pkt.is_app_limited, + }; + } } } } @@ -359,6 +419,8 @@ pub const LossRecovery = struct { largest_acked: [3]u64, // per epoch [Initial, Handshake, 1-RTT] last_ack_eliciting_ns: ?i64, pto_count: u32, + /// Delivery rate tracking for BBR. + delivery: DeliveryState = .{}, pub fn init() LossRecovery { return .{ @@ -368,10 +430,17 @@ pub const LossRecovery = struct { .largest_acked = [_]u64{0} ** 3, .last_ack_eliciting_ns = null, .pto_count = 0, + .delivery = .{}, }; } /// Record a newly-sent packet. + /// `now_ns` — wire time (when the packet actually leaves the machine). + /// Used for sent_ns (loss detection timing). + /// `queued_ns` — queue time (when the packet entered the send queue). + /// Stored on the sent packet as SentPacket.queued_ns; delivery + /// rate snapshots use wire time instead (see the comment in + /// the function body below). pub fn onPacketSent( self: *LossRecovery, pn: u64, @@ -379,20 +448,41 @@ pub const LossRecovery = struct { size: usize, ack_eliciting: bool, now_ns: i64, + queued_ns: i64, frame_info: SentFrameInfo, ) void { const sz: u16 = @intCast(@min(size, @as(usize, 0xffff))); + // Snapshot delivery state into the sent packet for delivery rate computation. + // All timestamps use wire-time (now_ns) — the moment the packet actually + // leaves the machine. An earlier approach used queue-time (queued_ns) to + // avoid pacing-delay inflation of send_elapsed, but that caused stale + // timestamps when packets sat in the send queue during recovery, collapsing + // the delivery rate and creating a death spiral. Wire-time may slightly + // underestimate bandwidth when pacing adds inter-packet delay, but the + // estimate self-corrects as the pacing rate converges to the true BW. + if (self.delivery.delivered_ns == 0) { + self.delivery.delivered_ns = now_ns; + } + // Update first_sent_ns if this is the first packet since last ACK. + if (self.delivery.first_sent_ns == 0) { + self.delivery.first_sent_ns = now_ns; + } // add() evicts any existing occupant at pn % MAX_SENT. // If the evicted packet was still in flight, subtract its size from bytes_in_flight // to avoid double-counting (the in-flight accounting for the evicted packet is lost).
if (self.sent.add(.{ .pn = pn, .sent_ns = now_ns, + .queued_ns = queued_ns, .size = sz, .epoch = epoch, .ack_eliciting = ack_eliciting, .in_flight = ack_eliciting, .valid = true, + .delivered = self.delivery.delivered, + .delivered_ns = self.delivery.delivered_ns, + .first_sent_ns = self.delivery.first_sent_ns, + .is_app_limited = self.delivery.app_limited, }, frame_info)) |evicted| { if (evicted.in_flight) { self.bytes_in_flight -|= evicted.size; @@ -434,11 +524,22 @@ pub const LossRecovery = struct { } } + // Capture inflight before ACKs for the delivery rate sample. + const prior_inflight = self.bytes_in_flight; + // 3. Remove all acknowledged packets for (ranges) |r| { self.sent.ackRange(r.low, r.high, epoch, &result, &self.bytes_in_flight); } + // 3b. Update delivery counters (needed before step 4-5, which don't use them). + if (result.newly_acked > 0) { + self.delivery.delivered += result.bytes_acked; + self.delivery.delivered_ns = now_ns; + // Reset first_sent_ns so the next send snapshot picks up fresh timing. + self.delivery.first_sent_ns = 0; + } + // 4. Compute time threshold: max(9/8 × max(srtt, latest_rtt), K_GRANULARITY_NS) const max_rtt = @max(self.rtt.smoothed_rtt, self.rtt.latest_rtt); const time_threshold_ns = @max( @@ -456,6 +557,36 @@ pub const LossRecovery = struct { &self.bytes_in_flight, ); + // 5b. Build delivery rate sample AFTER detectLoss so bytes_lost is populated. + if (result.newly_acked > 0) { + const snap = result.delivery_snap; + const delivered_delta = self.delivery.delivered -| snap.delivered; + const ack_elapsed: u64 = if (now_ns > snap.delivered_ns) + @intCast(now_ns - snap.delivered_ns) + else + 1; + const send_elapsed: u64 = if (snap.sent_ns > snap.first_sent_ns) + @intCast(snap.sent_ns - snap.first_sent_ns) + else + 1; + const interval = @max(ack_elapsed, send_elapsed); + + const round_start = snap.delivered >= self.delivery.next_round_delivered; + if (round_start) { + self.delivery.next_round_delivered = self.delivery.delivered; + } + + result.delivery_rate_sample = .{ + .delivery_rate = delivered_delta *| 1_000_000_000 / interval, + .is_app_limited = snap.is_app_limited, + .rtt_ns = if (self.rtt.initialized) self.rtt.smoothed_rtt else 0, + .bytes_acked = result.bytes_acked, + .bytes_lost = result.bytes_lost, + .prior_inflight = prior_inflight, + .round_start = round_start, + }; + } + // 6. Persistent congestion detection (RFC 9002 §6.1.2). // If the span between the earliest and latest ack-eliciting lost packets // exceeds 3×PTO, mark as persistent congestion. 
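Illustrative aside (editor's sketch, not part of the patch): step 5b divides the newly delivered bytes by the longer of the ACK-side and send-side elapsed times, so a burst of ACKs arriving close together cannot inflate the bandwidth estimate. The numbers below are made up.

```zig
const std = @import("std");

test "sketch: delivery rate uses the longer of ack/send elapsed" {
    // 36 KB newly delivered; the ACKs spanned 30 ms but the sends only 5 ms.
    const delivered_delta: u64 = 36_000;
    const ack_elapsed: u64 = 30 * std.time.ns_per_ms;
    const send_elapsed: u64 = 5 * std.time.ns_per_ms;
    // Same formula as step 5b above: interval = max(ack_elapsed, send_elapsed).
    const interval = @max(ack_elapsed, send_elapsed);
    const rate = delivered_delta *| std.time.ns_per_s / interval; // bytes per second
    // 1.2 MB/s; dividing by send_elapsed alone would claim 7.2 MB/s.
    try std.testing.expectEqual(@as(u64, 1_200_000), rate);
}
```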
@@ -626,7 +757,7 @@ test "sent_table: onPacketSent increments bytes_in_flight; ackRange decrements i const testing = std.testing; var lr = LossRecovery.init(); - lr.onPacketSent(5, 0, 1200, true, 0, .{}); + lr.onPacketSent(5, 0, 1200, true, 0, 0, .{}); try testing.expectEqual(@as(u64, 1200), lr.bytes_in_flight); var result = AckResult{}; @@ -643,7 +774,7 @@ test "loss_detection: packet threshold — pn 1-7 declared lost when largest_ack // Send pn 1..10 all at time 0 var pn: u64 = 1; while (pn <= 10) : (pn += 1) { - lr.onPacketSent(pn, 0, 1200, true, 0, .{}); + lr.onPacketSent(pn, 0, 1200, true, 0, 0, .{}); } // ACK only pn=10; all others remain unacked @@ -661,7 +792,7 @@ test "loss_detection: time threshold — old packet detected as lost" { var lr = LossRecovery.init(); // Send pn=1000 at time 0; pn=1 not sent (not in table) - lr.onPacketSent(1000, 0, 1200, true, 0, .{}); + lr.onPacketSent(1000, 0, 1200, true, 0, 0, .{}); // ACK pn=1 (not in table — no RTT update, initial values used) // Initial smoothed_rtt = 333ms, time_threshold ≈ 375ms @@ -702,7 +833,7 @@ test "sent_table: lastAckElicitingNs returns sent_ns of highest in-flight pn" { test "pto: deadline is clamped at 2^5 backoff" { const testing = std.testing; var lr = LossRecovery.init(); - lr.onPacketSent(1, 0, 1200, true, 0, .{}); + lr.onPacketSent(1, 0, 1200, true, 0, 0, .{}); const d0 = lr.ptoDeadline(25_000_000).?; @@ -731,7 +862,7 @@ test "rtt: ack_delay exceeding sample_ns does not underflow adjusted_rtt" { test "loss_recovery: onAckReceived with empty ranges slice is safe" { const testing = std.testing; var lr = LossRecovery.init(); - lr.onPacketSent(1, 0, 1200, true, 0, .{}); + lr.onPacketSent(1, 0, 1200, true, 0, 0, .{}); const result = lr.onAckReceived(1, 0, &[_]AckedRange{}, 0, 0, 25_000_000); // No ranges → nothing acked, nothing lost try testing.expectEqual(@as(u32, 0), result.newly_acked); @@ -746,14 +877,14 @@ test "sent_table: eviction decrements bytes_in_flight to avoid double-counting" const region = SentPacketTable.EPOCH_SIZES[2]; // 128 var pn: u64 = 0; while (pn < region) : (pn += 1) { - lr.onPacketSent(pn, 2, 1200, true, 0, .{}); + lr.onPacketSent(pn, 2, 1200, true, 0, 0, .{}); } const bif_after = lr.bytes_in_flight; try testing.expectEqual(@as(u64, region * 1200), bif_after); // Send pn=128: maps to same slot as pn=0, evicting it. // bytes_in_flight should stay the same (evict 1200, add 1200). 
- lr.onPacketSent(region, 2, 1200, true, 0, .{}); + lr.onPacketSent(region, 2, 1200, true, 0, 0, .{}); try testing.expectEqual(bif_after, lr.bytes_in_flight); } @@ -762,14 +893,14 @@ test "loss_detection: last_ack_eliciting_ns updated after packets declared lost" var lr = LossRecovery.init(); // Send one ack-eliciting packet - lr.onPacketSent(1, 0, 1200, true, 0, .{}); + lr.onPacketSent(1, 0, 1200, true, 0, 0, .{}); try testing.expect(lr.last_ack_eliciting_ns != null); // ACK a much higher pn to trigger loss via packet threshold for pn=1 // Send pn 2..5 so we have some acked var i: u64 = 2; while (i <= 10) : (i += 1) { - lr.onPacketSent(i, 0, 1200, true, 0, .{}); + lr.onPacketSent(i, 0, 1200, true, 0, 0, .{}); } const ranges = [_]AckedRange{.{ .low = 10, .high = 10 }}; _ = lr.onAckReceived(10, 0, &ranges, 0, 0, 25_000_000); @@ -783,7 +914,7 @@ test "loss_detection: last_ack_eliciting_ns updated after packets declared lost" test "pto: deadline saturates on extreme pto values" { const testing = std.testing; var lr = LossRecovery.init(); - lr.onPacketSent(1, 0, 1200, true, 0, .{}); + lr.onPacketSent(1, 0, 1200, true, 0, 0, .{}); // Force an extreme smoothed_rtt that would cause overflow without saturation lr.rtt.smoothed_rtt = std.math.maxInt(u64) / 4; @@ -801,7 +932,7 @@ test "pto: deadline doubles per onPtoFired; resets after resetPtoCount" { var lr = LossRecovery.init(); // Send one ack-eliciting packet at time 0 - lr.onPacketSent(1, 0, 1200, true, 0, .{}); + lr.onPacketSent(1, 0, 1200, true, 0, 0, .{}); const d0 = lr.ptoDeadline(25_000_000); try testing.expect(d0 != null); @@ -859,8 +990,8 @@ test "frame_info: detectLoss populates lost_frames in AckResult" { var fi = SentFrameInfo{}; fi.frames[0] = .{ .stream = .{ .stream_id = 0, .offset = 0, .len = 100, .fin = false } }; fi.count = 1; - lr.onPacketSent(1, 0, 100, true, 0, fi); - lr.onPacketSent(10, 0, 100, true, 0, .{}); + lr.onPacketSent(1, 0, 100, true, 0, 0, fi); + lr.onPacketSent(10, 0, 100, true, 0, 0, .{}); const ranges = [_]AckedRange{.{ .low = 10, .high = 10 }}; const result = lr.onAckReceived(10, 0, &ranges, 0, 0, 25_000_000); @@ -882,7 +1013,7 @@ test "frame_info: acked packets appear in acked_frames not lost_frames" { var fi = SentFrameInfo{}; fi.frames[0] = .ping; fi.count = 1; - lr.onPacketSent(1, 0, 50, true, 0, fi); + lr.onPacketSent(1, 0, 50, true, 0, 0, fi); const ranges = [_]AckedRange{.{ .low = 1, .high = 1 }}; const result = lr.onAckReceived(1, 0, &ranges, 0, 0, 25_000_000); @@ -905,7 +1036,7 @@ test "frame_info: MAX_LOSS_EVENTS caps lost_frames output" { const N: u64 = MAX_LOSS_EVENTS + 4; // 68 var pn: u64 = 0; while (pn < N) : (pn += 1) { - lr.onPacketSent(pn, 2, 100, true, 0, .{}); + lr.onPacketSent(pn, 2, 100, true, 0, 0, .{}); } const top_pn = N - 1; const ranges = [_]AckedRange{.{ .low = top_pn, .high = top_pn }}; @@ -922,10 +1053,10 @@ test "sent_table: power-of-two slot collision evicts correctly" { const testing = std.testing; var lr = LossRecovery.init(); - lr.onPacketSent(0, 0, 1200, true, 1_000, .{}); + lr.onPacketSent(0, 0, 1200, true, 1_000, 1_000, .{}); try testing.expect(lr.sent.get(0, 0) != null); - lr.onPacketSent(MAX_SENT, 0, 1200, true, 2_000, .{}); // maps to slot 0, evicts pn=0 + lr.onPacketSent(MAX_SENT, 0, 1200, true, 2_000, 2_000, .{}); // maps to slot 0, evicts pn=0 try testing.expectEqual(@as(?SentPacket, null), lr.sent.get(0, 0)); // pn=0 gone try testing.expect(lr.sent.get(MAX_SENT, 0) != null); // pn=256 present } @@ -946,14 +1077,14 @@ test "frame_info: ring buffer eviction preserves 
new packet frame info" { // Fill the ring buffer with MAX_SENT packets (no frame info) var pn: u64 = 0; while (pn < MAX_SENT) : (pn += 1) { - lr.onPacketSent(pn, 0, 100, true, 0, .{}); + lr.onPacketSent(pn, 0, 100, true, 0, 0, .{}); } // Send one more that evicts slot 0 (pn=0), record handshake_done frame info var fi = SentFrameInfo{}; fi.frames[0] = .handshake_done; fi.count = 1; - lr.onPacketSent(MAX_SENT, 0, 100, true, 0, fi); + lr.onPacketSent(MAX_SENT, 0, 100, true, 0, 0, fi); // The new packet's frame info should be stored at slot MAX_SENT % MAX_SENT = 0 const removed = lr.sent.remove(MAX_SENT, 0).?; @@ -974,7 +1105,7 @@ test "sent_table: 128 concurrent unacked packets coexist without eviction" { // Send 128 packets with distinct packet numbers 0..127 in epoch 2 var pn: u64 = 0; while (pn < 128) : (pn += 1) { - lr.onPacketSent(pn, 2, 1200, true, @as(i64, @intCast(pn)) * 1000, .{}); + lr.onPacketSent(pn, 2, 1200, true, @as(i64, @intCast(pn)) * 1000, @as(i64, @intCast(pn)) * 1000, .{}); } // All 128 must still be present (no eviction for pn < region size) @@ -1025,8 +1156,8 @@ test "valid_per_epoch: detectLoss decrements on loss" { const testing = std.testing; var lr = LossRecovery.init(); - lr.onPacketSent(1, 0, 100, true, 0, .{}); - lr.onPacketSent(5, 0, 100, true, 0, .{}); + lr.onPacketSent(1, 0, 100, true, 0, 0, .{}); + lr.onPacketSent(5, 0, 100, true, 0, 0, .{}); try testing.expectEqual(@as(u16, 2), lr.sent.valid_per_epoch[0]); // ACK pn=5, which triggers loss detection for pn=1 (pn+3 <= 5) @@ -1048,12 +1179,12 @@ test "persistent_congestion: loss span > 3xPTO sets flag" { const testing = std.testing; var lr = LossRecovery.init(); - lr.onPacketSent(1, 0, 1200, true, 0, .{}); - lr.onPacketSent(2, 0, 1200, true, 0, .{}); - lr.onPacketSent(3, 0, 1200, true, 0, .{}); - lr.onPacketSent(4, 0, 1200, true, 0, .{}); - lr.onPacketSent(5, 0, 1200, true, 3_200_000_000, .{}); - lr.onPacketSent(8, 0, 1200, true, 3_200_000_000, .{}); + lr.onPacketSent(1, 0, 1200, true, 0, 0, .{}); + lr.onPacketSent(2, 0, 1200, true, 0, 0, .{}); + lr.onPacketSent(3, 0, 1200, true, 0, 0, .{}); + lr.onPacketSent(4, 0, 1200, true, 0, 0, .{}); + lr.onPacketSent(5, 0, 1200, true, 3_200_000_000, 3_200_000_000, .{}); + lr.onPacketSent(8, 0, 1200, true, 3_200_000_000, 3_200_000_000, .{}); const ranges = [_]AckedRange{.{ .low = 8, .high = 8 }}; const result = lr.onAckReceived(8, 0, &ranges, 0, 3_200_000_000, 25_000_000); @@ -1067,10 +1198,10 @@ test "persistent_congestion: loss span <= 3xPTO does not set flag" { const testing = std.testing; var lr = LossRecovery.init(); - lr.onPacketSent(1, 0, 1200, true, 0, .{}); - lr.onPacketSent(2, 0, 1200, true, 0, .{}); - lr.onPacketSent(3, 0, 1200, true, 0, .{}); - lr.onPacketSent(8, 0, 1200, true, 0, .{}); + lr.onPacketSent(1, 0, 1200, true, 0, 0, .{}); + lr.onPacketSent(2, 0, 1200, true, 0, 0, .{}); + lr.onPacketSent(3, 0, 1200, true, 0, 0, .{}); + lr.onPacketSent(8, 0, 1200, true, 0, 0, .{}); const ranges = [_]AckedRange{.{ .low = 8, .high = 8 }}; const result = lr.onAckReceived(8, 0, &ranges, 0, 0, 25_000_000); @@ -1229,7 +1360,7 @@ test "time_loss_alarm: timeLossAlarmNs returns null when largest_acked is 0 in a var lr = LossRecovery.init(); // No packets have been acked yet → largest_acked = 0 for all epochs - lr.onPacketSent(1, 2, 100, true, 0, .{}); + lr.onPacketSent(1, 2, 100, true, 0, 0, .{}); try testing.expectEqual(@as(?i64, null), lr.timeLossAlarmNs(25_000_000)); } @@ -1242,8 +1373,8 @@ test "time_loss_alarm: timeLossAlarmNs fires after time threshold + max_ack_dela // 
pn=1 packet threshold check: 1+3=4 > 2 → NOT lost by pkt threshold. // time_threshold ≈ 9/8 × 40ms = 45ms; max_ack_delay = 25ms. // Alarm fires at 0 + 45ms + 25ms = 70ms. - lr.onPacketSent(1, 2, 100, true, 0, .{}); - lr.onPacketSent(2, 2, 100, true, 0, .{}); + lr.onPacketSent(1, 2, 100, true, 0, 0, .{}); + lr.onPacketSent(2, 2, 100, true, 0, 0, .{}); const ranges = [_]AckedRange{.{ .low = 2, .high = 2 }}; _ = lr.onAckReceived(2, 0, &ranges, 2, 40_000_000, 25_000_000); diff --git a/src/quic/stream.zig b/src/quic/stream.zig index aaa0e72..1d50b6d 100644 --- a/src/quic/stream.zig +++ b/src/quic/stream.zig @@ -276,7 +276,8 @@ pub const Stream = struct { /// Cumulative bytes acknowledged on the send side. send_acked: u64, /// Out-of-order (SACK) acknowledged ranges waiting for the gap to be filled. - /// Bounded by STREAM_BUF_SIZE / min_chunk ≈ 32 entries in practice. + /// Adjacent/overlapping entries are merged on insertion; when full, the two + /// closest ranges are coalesced so no ACK info is ever silently dropped. sack_ranges: [32]struct { offset: u64, end: u64 }, sack_count: u8, /// FIN has been queued for sending. @@ -532,12 +533,71 @@ pub const Stream = struct { // Drain any SACK ranges that are now contiguous. self.flushSackRanges(); } else { - // Out-of-order: save for when the gap is filled. - if (self.sack_count < self.sack_ranges.len) { - self.sack_ranges[self.sack_count] = .{ .offset = offset, .end = end }; - self.sack_count += 1; + // Out-of-order: merge with existing range or insert new entry. + var merged = false; + for (self.sack_ranges[0..self.sack_count]) |*r| { + // Merge if adjacent or overlapping. + if (offset <= r.end and end >= r.offset) { + r.offset = @min(r.offset, offset); + r.end = @max(r.end, end); + merged = true; + break; + } + } + if (!merged) { + if (self.sack_count < self.sack_ranges.len) { + self.sack_ranges[self.sack_count] = .{ .offset = offset, .end = end }; + self.sack_count += 1; + } else { + // Array full — coalesce the two closest ranges to make room. + // This guarantees no ACK information is ever silently dropped. + self.coalesceClosest(); + self.sack_ranges[self.sack_count] = .{ .offset = offset, .end = end }; + self.sack_count += 1; + } + } + } + } + + /// When the SACK array is full, merge the two closest (smallest gap) + /// ranges into one, freeing a slot. The merged range covers both + /// original ranges plus the gap between them — those gap bytes are + /// "optimistically" marked as acked. This is safe: the gap bytes were + /// either already acked (contiguous ACK we missed) or lost and will be + /// retransmitted (the retransmit ACK will be a no-op since the range + /// already covers them). The key guarantee: no ACK information is ever + /// silently dropped, so send_acked always advances and the send buffer + /// never permanently stalls. + fn coalesceClosest(self: *Stream) void { + if (self.sack_count < 2) return; + var best_gap: u64 = std.math.maxInt(u64); + var best_i: usize = 0; + var best_j: usize = 1; + for (0..self.sack_count) |i| { + for (i + 1..self.sack_count) |j| { + const a = self.sack_ranges[i]; + const b = self.sack_ranges[j]; + // Gap between two non-overlapping ranges. + const gap = if (a.end <= b.offset) + b.offset - a.end + else if (b.end <= a.offset) + a.offset - b.end + else + 0; // overlapping — merge for free + if (gap < best_gap) { + best_gap = gap; + best_i = i; + best_j = j; + } } } + // Merge j into i, remove j. 
+ self.sack_ranges[best_i] = .{ + .offset = @min(self.sack_ranges[best_i].offset, self.sack_ranges[best_j].offset), + .end = @max(self.sack_ranges[best_i].end, self.sack_ranges[best_j].end), + }; + self.sack_count -= 1; + self.sack_ranges[best_j] = self.sack_ranges[self.sack_count]; } /// Apply buffered SACK ranges that are now contiguous with send_acked. @@ -939,9 +999,9 @@ test "stream_send: multiple out-of-order SACK ranges resolved in one flush" { s.send_offset = 3600; s.onAcked(1200, 1200); // out-of-order - s.onAcked(2400, 1200); // out-of-order + s.onAcked(2400, 1200); // out-of-order, merged with [1200,2400) → [1200,3600) try testing.expectEqual(@as(u64, 0), s.send_acked); - try testing.expectEqual(@as(usize, 2), s.sack_count); + try testing.expectEqual(@as(usize, 1), s.sack_count); s.onAcked(0, 1200); // fills gap → cascades through 1200 and 2400 try testing.expectEqual(@as(u64, 3600), s.send_acked); diff --git a/src/root.zig b/src/root.zig index 1f406ea..98223db 100644 --- a/src/root.zig +++ b/src/root.zig @@ -11,7 +11,7 @@ //! // On datagram receipt: //! try conn.receive(udp_payload, src_addr, now_ns, io); //! // Drain outgoing datagrams: -//! while (conn.send(&out_buf)) |n| { socket.send(out_buf[0..n]); } +//! while (conn.send(&out_buf, now_ns)) |n| { socket.send(out_buf[0..n]); } //! // Timer: //! if (conn.nextTimeout()) |deadline_ns| { ... } //! conn.tick(now_ns); @@ -26,6 +26,8 @@ pub const stream = @import("quic/stream.zig"); pub const flow_control = @import("quic/flow_control.zig"); pub const congestion = struct { pub const cubic = @import("quic/congestion/cubic.zig"); + pub const bbr = @import("quic/congestion/bbr.zig"); + pub const cc = @import("quic/congestion/cc.zig"); }; pub const connection_id = @import("quic/connection_id.zig"); diff --git a/tools/Dockerfile b/tools/Dockerfile deleted file mode 100644 index 68d6ac7..0000000 --- a/tools/Dockerfile +++ /dev/null @@ -1,64 +0,0 @@ -# Multi-stage build for quic-interop-runner. -# -# Stage 1: Build the server binary (static musl target). -# Stage 2: Minimal Alpine runtime image. -# Supports multiple architectures (amd64, arm64). - -FROM debian:bookworm-slim AS builder - -RUN apt-get update && apt-get install -y --no-install-recommends wget xz-utils ca-certificates \ - && rm -rf /var/lib/apt/lists/* - -ENV ZIG_VERSION=0.16.0-dev.2676+4e2cec265 - -# Auto-detect architecture and download appropriate Zig binary -RUN set -e; \ - ARCH=$(uname -m); \ - if [ "$ARCH" = "x86_64" ]; then \ - ZIG_ARCH="x86_64"; \ - TARGET="x86_64-linux-musl"; \ - elif [ "$ARCH" = "aarch64" ]; then \ - ZIG_ARCH="aarch64"; \ - TARGET="aarch64-linux-musl"; \ - else \ - echo "Unsupported architecture: $ARCH"; \ - exit 1; \ - fi; \ - ZIG_TARBALL="zig-${ZIG_ARCH}-linux-${ZIG_VERSION}.tar.xz"; \ - wget -q "https://ziglang.org/builds/${ZIG_TARBALL}"; \ - tar xf "${ZIG_TARBALL}"; \ - rm "${ZIG_TARBALL}"; \ - ln -s "zig-${ZIG_ARCH}-linux-${ZIG_VERSION}" /zig; \ - echo "export TARGET=${TARGET}" > /build_env.sh; \ - echo "export PATH=/zig:\$PATH" >> /build_env.sh - -ENV PATH="/zig:${PATH}" - -WORKDIR /build -COPY . . - -RUN set -e; \ - . /build_env.sh; \ - zig build -Doptimize=ReleaseSafe -Dtarget="${TARGET}" - -# Stage 2: Runtime image with network simulator support. -FROM martenseemann/quic-network-simulator-endpoint:latest - -LABEL org.opencontainers.image.title="zquic-interop" \ - org.opencontainers.image.description="zquic interop testing image for quic-interop-runner. Not intended for production use." 
\ - org.opencontainers.image.source="https://github.com/ericsssan/zquic" \ - org.opencontainers.image.licenses="MIT" - -COPY --from=builder /build/zig-out/bin/server /server -COPY tools/run_endpoint.sh /run_endpoint.sh - -RUN chmod +x /run_endpoint.sh && mkdir -p /logs /certs - -EXPOSE 443/udp - -ENV PORT=443 -ENV TESTCASE=transfer -ENV CERTS=/certs -ENV WWW=/www - -ENTRYPOINT ["/run_endpoint.sh"] diff --git a/tools/server.zig b/tools/server.zig index 76b2373..0bed9dc 100644 --- a/tools/server.zig +++ b/tools/server.zig @@ -41,7 +41,7 @@ const PendingTransfer = struct { const ConnSlot = struct { conn: Conn, peer_addr: ?net.IpAddress = null, - /// When true, send responses through the CM socket (after preferred_address migration). + /// True when the most recent packet arrived on the CM socket. use_cm_sock: bool = false, transfers: [MAX_TRANSFERS]FileTransfer = [_]FileTransfer{.{}} ** MAX_TRANSFERS, /// Parsed requests deferred because all transfer slots were occupied. @@ -64,6 +64,10 @@ const supported_cases = [_][]const u8{ /// True when TESTCASE=http3 — uses H3 framing instead of HTTP/0.9. var g_is_h3: bool = false; +/// Accumulated SSLKEYLOG data for all connections. Written to /logs/keys.log +/// in full on each update so createFileAbsolute truncation doesn't lose data. +var g_keylog_buf: [65536]u8 = undefined; +var g_keylog_len: usize = 0; // IPv4/IPv6 addresses for preferred_address in connectionmigration test (interop runner addresses). // server4: 193.167.100.100 (0xc1, 0xa7, 0x64, 0x64) @@ -108,11 +112,16 @@ fn extractDcid(data: []const u8) ?[CID_LEN]u8 { } /// Find a connection slot by its local DCID. +/// Also checks first_initial_dcid so that retransmitted client Initials +/// (which use the original random DCID, not the server's SCID) are routed +/// to the existing connection instead of creating a duplicate. fn findConnByDcid(slots: *const [MAX_CONNS]?*ConnSlot, dcid: [CID_LEN]u8) ?*ConnSlot { for (slots.*) |slot_opt| { const slot = slot_opt orelse continue; if (std.mem.eql(u8, &slot.conn.local_cid.bytes, &dcid)) return slot; if (std.mem.eql(u8, &slot.conn.alt_local_cid.bytes, &dcid)) return slot; + if (slot.conn.first_initial_dcid_len == CID_LEN and + std.mem.eql(u8, slot.conn.first_initial_dcid[0..CID_LEN], &dcid)) return slot; } return null; } @@ -122,6 +131,7 @@ fn allocateSlot(slots: *[MAX_CONNS]?*ConnSlot, config: quic.Config, io: std.Io) for (slots) |*s_opt| { if (s_opt.* == null) { const slot = try page_allocator.create(ConnSlot); + errdefer page_allocator.destroy(slot); slot.* = .{ .conn = try Conn.accept(config, io), }; @@ -204,12 +214,11 @@ fn tickAllConnections(slots: *[MAX_CONNS]?*ConnSlot, sock: *const net.Socket, cm if (slot.peer_addr) |pa| { // Retry H3 control streams if initial send failed (queue was full). + const send_sock = slotSendSock(slot, sock, cm_sock_ptr); if (g_is_h3 and !slot.h3_control_sent and slot.conn.app_keys != null) { sendH3ControlStreams(slot); } - flushTransfers(slot, www_dir, io); - const send_sock = slotSendSock(slot, sock, cm_sock_ptr); - drainSend(&slot.conn, send_sock, io, &pa, send_bufs); + flushTransfers(slot, www_dir, io, send_sock, &pa, send_bufs); } } } @@ -228,7 +237,10 @@ pub fn main(init: std.process.Init) !void { // Determine the testcase; exit 127 if unsupported. // Check this FIRST before attempting to load certs, so that compliance // checks with unsupported testcases exit cleanly with 127. 
- const testcase = init.environ_map.get("TESTCASE") orelse "transfer"; + const testcase = init.environ_map.get("TESTCASE") orelse { + std.debug.print("TESTCASE not set, exiting\n", .{}); + std.process.exit(127); + }; var is_supported = false; for (supported_cases) |s| { if (std.mem.eql(u8, testcase, s)) { @@ -481,13 +493,15 @@ fn processPacket( // BEFORE processing the incoming packet, so PATH_CHALLENGE is the first // frame sent from the new address (required by interop test). if (is_cm_socket and !s.use_cm_sock) { - s.use_cm_sock = true; var challenge: [8]u8 = undefined; io.random(&challenge); s.conn.sendPathChallenge(challenge) catch {}; - } else if (is_cm_socket) { - // Already on CM socket, no action needed } + // Track the CURRENT socket — not a one-way flag. When the client + // rebinds back to the original path (or sim stops NAT'ing through CM), + // the server must follow. Without this, use_cm_sock stays true forever + // and data sent via CM socket can't reach clients on the original network. + if (s.use_cm_sock != is_cm_socket) s.use_cm_sock = is_cm_socket; const ecn_bits: u2 = 0; s.conn.receive(data, ipToSocketAddr(from), now_ns, ecn_bits, io) catch |err| { @@ -549,13 +563,19 @@ fn processPacket( } break; }, + .path_migrated => { + // Update send destination from the connection's authoritative + // peer address. Without this, late-arriving packets from the + // old address (via s.peer_addr = from) route sends to the + // stale address. + s.peer_addr = socketAddrToIp(s.conn.peer_addr); + }, else => {}, } } if (!slot_freed) { - flushTransfers(s, www_dir, io); - drainSend(&s.conn, active_sock, io, &from, send_bufs); + flushTransfers(s, www_dir, io, active_sock, &from, send_bufs); } } @@ -572,6 +592,7 @@ fn activatePending(transfers: *[MAX_TRANSFERS]FileTransfer, p: *const PendingTra t.active = true; t.stream_id = p.stream_id; t.offset = 0; + t.h3_headers_sent = false; @memcpy(t.path[0..p.path_len], p.path[0..p.path_len]); t.path_len = p.path_len; t.file = std.Io.Dir.openFileAbsolute(io, t.path[0..t.path_len], .{}) catch null; @@ -672,7 +693,7 @@ fn startTransfer(slot: *ConnSlot, stream_id: u62, www: []const u8, io: std.Io) v /// the congestion window is small (e.g. initial cwnd = 10 packets): without /// interleaving, stream 0 would fill the window and streams 4/8 would get no /// packets at all, stalling their offset-0 delivery. -fn flushTransfers(slot: *ConnSlot, www: []const u8, io: std.Io) void { +fn flushTransfers(slot: *ConnSlot, www: []const u8, io: std.Io, send_sock: *const net.Socket, dest: *const net.IpAddress, send_bufs: *SendBufs) void { const conn = &slot.conn; const transfers = &slot.transfers; _ = www; @@ -690,18 +711,30 @@ fn flushTransfers(slot: *ConnSlot, www: []const u8, io: std.Io) void { activatePending(transfers, &slot.pending[slot.pending_count], io); } // Outer loop: repeat passes until nothing was sent (CC/queue fully blocked). + // After each transfer advance, drain what pacing allows so bytes_in_flight + // stays current. Without this, bytes_in_flight=0 during the fill phase and + // the cwnd check is blind — either starving the pipe (with bytes_queued) or + // flooding the send queue (without it). 
while (true) { var sent_any = false; for (transfers) |*t| { if (!t.active) continue; - if (g_is_h3) { - if (advanceTransferOneH3(conn, t, io)) sent_any = true; - } else { - if (advanceTransferOne(conn, t, io)) sent_any = true; - } + const progress = if (g_is_h3) + advanceTransferOneH3(conn, t, io) + else + advanceTransferOne(conn, t, io); + if (progress) sent_any = true; } if (!sent_any) break; + // Drain pacing-gated packets after each round-robin pass so + // bytes_in_flight stays current for the next pass's cwnd check. + drainSend(conn, send_sock, io, dest, send_bufs); } + // Always drain: tick() and receive() may have enqueued PATH_CHALLENGE, + // ACKs, or retransmissions independent of transfer progress. Without + // this, those packets are stranded when all transfers are blocked + // (buffer full, amplification limit), causing path validation to stall. + drainSend(conn, send_sock, io, dest, send_bufs); } /// Send exactly one chunk from the transfer. Returns true if progress was made. @@ -728,9 +761,11 @@ fn advanceTransferGeneric(conn: *Conn, t: *FileTransfer, io: std.Io, is_h3: bool t.active = false; return true; } - // hq-interop: no file → already closed + // hq-interop: no file → send FIN so client gets a clean close + // instead of waiting until idle timeout. + conn.streamSend(t.stream_id, &.{}, true) catch return false; t.active = false; - return false; + return true; }; // H3: send HEADERS frame first (:status 200) @@ -799,30 +834,37 @@ fn advanceTransferGeneric(conn: *Conn, t: *FileTransfer, io: std.Io, is_h3: bool // --------------------------------------------------------------------------- /// Open the three server-initiated unidirectional streams required by RFC 9114. +/// Streams are sent individually so a partial failure (queue full) can be +/// retried without re-sending already-succeeded streams. fn sendH3ControlStreams(s: *ConnSlot) void { const conn = &s.conn; // Stream IDs: server-initiated unidirectional = 4*n + 3 → 3, 7, 11 + const stream_ids = [_]u62{ 3, 7, 11 }; + const stream_types = [_]u64{ + http3.StreamType.control, + http3.StreamType.qpack_encoder, + http3.StreamType.qpack_decoder, + }; - // 1. Control stream (type 0x00) + SETTINGS frame - var ctrl_buf: [64]u8 = undefined; - var pos: usize = 0; - // Stream type 0x00 (control) - pos += http3.varint.encode(ctrl_buf[pos..], http3.StreamType.control) catch return; - // SETTINGS frame (empty — all defaults) - pos += http3.frame.writeHeader(ctrl_buf[pos..], http3.FrameType.settings, 0) catch return; - conn.streamSend(3, ctrl_buf[0..pos], false) catch return; - - // 2. QPACK encoder stream (type 0x02) - var enc_buf: [4]u8 = undefined; - const enc_len = http3.varint.encode(&enc_buf, http3.StreamType.qpack_encoder) catch return; - conn.streamSend(7, enc_buf[0..enc_len], false) catch return; - - // 3. QPACK decoder stream (type 0x03) - var dec_buf: [4]u8 = undefined; - const dec_len = http3.varint.encode(&dec_buf, http3.StreamType.qpack_decoder) catch return; - conn.streamSend(11, dec_buf[0..dec_len], false) catch return; - - s.h3_control_sent = true; + var all_sent = true; + for (stream_ids, stream_types) |sid, stype| { + // Skip streams that were already sent in a previous partial attempt. + if (conn.streams.get(sid)) |st| { + if (st.send_offset > 0) continue; + } + var buf: [64]u8 = undefined; + var pos: usize = 0; + pos += http3.varint.encode(buf[pos..], stype) catch return; + // Control stream also needs an empty SETTINGS frame. 
+ if (stype == http3.StreamType.control) { + pos += http3.frame.writeHeader(buf[pos..], http3.FrameType.settings, 0) catch return; + } + conn.streamSend(sid, buf[0..pos], false) catch { + all_sent = false; + continue; + }; + } + if (all_sent) s.h3_control_sent = true; } /// Parse an H3 request from a bidirectional stream and register a FileTransfer. @@ -1033,10 +1075,11 @@ fn configureEcn(sock: *const net.Socket) !void { fn drainSend(conn: *Conn, sock: *const net.Socket, io: std.Io, dest: *const net.IpAddress, bufs: *SendBufs) void { var messages: [SEND_BATCH]net.OutgoingMessage = undefined; var count: usize = 0; + const now_ns: i64 = @truncate(std.Io.Clock.awake.now(io).nanoseconds); // Phase 1: collect all outgoing packets into separate buffers. while (count < SEND_BATCH) { - const n = conn.send(&bufs.bufs[count]); + const n = conn.send(&bufs.bufs[count], now_ns); if (n == 0) break; messages[count] = .{ .address = dest, @@ -1094,13 +1137,7 @@ fn updateKeyLog(conn: *const Conn, io: std.Io, _: u32) void { if (pos >= buf.len - 256) break; } - // Overwrite the keylog file with all generations (directory /logs created by Dockerfile) - const file = std.Io.Dir.createFileAbsolute(io, "/logs/keys.log", .{}) catch return; - defer file.close(io); - file.writePositionalAll(io, buf[0..pos], 0) catch return; - // Sync multiple times to guarantee disk flush before docker cp - file.sync(io) catch {}; - file.sync(io) catch {}; + appendKeyLog(io, buf[0..pos]); } /// Write an SSLKEYLOG file so network analyzers (Wireshark/tshark) can decrypt @@ -1128,12 +1165,18 @@ fn writeKeyLog(conn: *const Conn, io: std.Io) void { line = std.fmt.bufPrint(buf[pos..], "SERVER_TRAFFIC_SECRET_0 {s} {s}\n", .{ random_hex, std.fmt.bytesToHex(secrets_0.server, .lower) }) catch return; pos += line.len; - // Write keylog file (directory /logs created by Dockerfile) + appendKeyLog(io, buf[0..pos]); +} + +fn appendKeyLog(io: std.Io, data: []const u8) void { + // Accumulate in memory, write full buffer each time (createFileAbsolute truncates). + const n = @min(data.len, g_keylog_buf.len - g_keylog_len); + if (n == 0) return; + @memcpy(g_keylog_buf[g_keylog_len..][0..n], data[0..n]); + g_keylog_len += n; const file = std.Io.Dir.createFileAbsolute(io, "/logs/keys.log", .{}) catch return; defer file.close(io); - file.writePositionalAll(io, buf[0..pos], 0) catch return; - // Sync multiple times to guarantee disk flush before docker cp - file.sync(io) catch {}; + file.writePositionalAll(io, g_keylog_buf[0..g_keylog_len], 0) catch return; file.sync(io) catch {}; } @@ -1144,6 +1187,13 @@ fn ipToSocketAddr(addr: net.IpAddress) quic.SocketAddr { }; } +fn socketAddrToIp(addr: quic.SocketAddr) net.IpAddress { + return switch (addr) { + .v4 => |a| .{ .ip4 = .{ .bytes = a.addr, .port = a.port } }, + .v6 => |a| .{ .ip6 = .{ .bytes = a.addr, .port = a.port } }, + }; +} + // --------------------------------------------------------------------------- // Tests // ---------------------------------------------------------------------------
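Editor's note, a hedged usage sketch (not part of the patch): throughout this diff the old `conn.send(&buf)` becomes `conn.send(&buf, now_ns)`, and the pacing comments in tools/server.zig suggest that a zero return covers both an empty queue and a pacing-gated queue. Under that assumption, an application-side drain loop looks roughly like this; `UdpSocket` and `sendDatagram` are placeholders, not APIs from this codebase.

```zig
/// Placeholder socket type: the library is sans-I/O, so the application owns
/// the real UDP socket. `sendDatagram` stands in for whatever send call it uses.
const UdpSocket = struct {
    pub fn sendDatagram(_: *UdpSocket, _: []const u8) !void {}
};

/// Drain every datagram the connection is willing to emit at `now_ns`.
/// When send() returns 0 because of pacing (rather than an empty queue),
/// the caller waits for the next deadline from nextTimeout(), calls
/// tick(now_ns), and drains again.
fn drainToWire(conn: anytype, sock: *UdpSocket, now_ns: i64) !void {
    var buf: [1500]u8 = undefined;
    while (true) {
        const n = conn.send(&buf, now_ns);
        if (n == 0) break;
        try sock.sendDatagram(buf[0..n]);
    }
}
```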