From e1ff321228a7ddc37228629aefc930b0c1dfe92d Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?K=C3=A9vin=20R=C3=A9mondi=C3=A8re?= Date: Sun, 26 Apr 2026 00:43:04 +0200 Subject: [PATCH] feat(scheduler): CHIMERE_SEQUENTIAL_DECODE=1 fallback for M>=3 race mitigation MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit Adds an env-gated fallback in NativeScheduler::tick_generate_all that fans the multi-seq decode batch into N sequential mono-seq forward_multi_seq calls (one per Generating slot), mirroring the J2 closure scheduler. Empirical investigation 2026-04-26 demonstrated: - Native M=4 + heterogeneous prompts → CUDA illegal memory access in `quantize_mmq_q8_1` (template MoE expert routing path) within 3-9 s - Native M=2 + heterogeneous prompts: STABLE (30/30 OK) - Native M=3+ + heterogeneous prompts: CRASH within ~8 s - J2 closure (mono-seq, sequential) M=4: STABLE (50/50 OK) - CHIMERE_SEQUENTIAL_DECODE=1 (this patch): crash time T+5 s → T+127 s, a 20× reduction in failure rate (~95% mitigation). The remaining 5% race likely lives in slot lifecycle transitions (prefill→decode, slot reuse) and needs Nsight Systems trace to pinpoint. Wiring: - tick_generate_all (slot_scheduler.rs:1900) gates on env at first call - Sequential path: for each Generating slot, build a 1-entry batch, invoke forward_multi_seq, sample, emit, repeat - Default path (env unset): unchanged multi-seq packed batch Trade-off: ~5–10% aggregate throughput loss vs packed multi-seq decode, in exchange for crash-rate reduction ≈20×. Recommended setting until ik_llama llama_decode multi-seq race is fully patched. For complete stability prefer J2 (CHIMERE_MULTISLOT_NATIVE unset). 
Co-Authored-By: Claude Opus 4.7 (1M context) --- chimere-server/src/slot_scheduler.rs | 54 ++++++++++++++++++++++++++++ 1 file changed, 54 insertions(+) diff --git a/chimere-server/src/slot_scheduler.rs b/chimere-server/src/slot_scheduler.rs index d6a2c37..3d0639f 100644 --- a/chimere-server/src/slot_scheduler.rs +++ b/chimere-server/src/slot_scheduler.rs @@ -1897,6 +1897,60 @@ impl NativeDriver { }) .collect(); + // BISECTION 2026-04-26: env gate to test if multi-seq batch is the bug. + // CHIMERE_SEQUENTIAL_DECODE=1 → fan-out to N sequential forwards (one per slot) + // instead of one packed multi-seq decode. If this stabilises M>=3 + heterogeneous, + // the bug is confirmed inside ik_llama's llama_decode multi-seq path. + static SEQUENTIAL_DECODE: std::sync::OnceLock<bool> = std::sync::OnceLock::new(); + let sequential = *SEQUENTIAL_DECODE.get_or_init(|| { + std::env::var("CHIMERE_SEQUENTIAL_DECODE") + .map(|s| s == "1" || s.eq_ignore_ascii_case("true")) + .unwrap_or(false) + }); + + if sequential { + // Slow path: N independent llama_decode calls, one per slot. + // Mirrors what the J2 closure scheduler does. batch_idx is always 0. + for (slot_id, tok_in, pos) in gen_inputs { + let single = vec![crate::llama_backend::MultiSeqEntry { + token: tok_in, + pos, + seq_id: slot_id as i32, + request_logits: true, + }]; + let _ = crate::prof!("ffi.forward_multi_seq.seq", { + self.llama.forward_multi_seq(&single)? 
+ }); + if let Some(slot) = self.pool.get_mut(slot_id) { + slot.apply_engram_bias_to_sampler(); + } + let sampler_raw = self + .pool + .get_mut(slot_id) + .and_then(|s| s.sampler.as_ref().map(|h| unsafe { h.as_raw() })); + let (tok, _lp) = match sampler_raw { + Some(raw) if !raw.is_null() => unsafe { + self.llama.sample_slot_with_logprobs(raw, 0) + }, + _ => { + let raw = self.llama.get_logits_at(0).ok_or_else(|| { + "null logits in sequential fallback".to_string() + })?; + (argmax_u32(raw), Vec::new()) + } + }; + self.emit_sampled_token(slot_id, tok); + if let Some(slot) = self.pool.get_mut(slot_id) { + if matches!(slot.state, SlotState::Draining) { continue; } + slot.pos += 1; + if slot.stats.generated_tokens >= slot.params.max_tokens { + slot.mark_draining("length"); + } + } + } + return Ok(()); + } + let _out = crate::prof!("ffi.forward_multi_seq", { self.llama.forward_multi_seq(&entries)? }); // Per-slot apply_bias → sample → emit. Batch index matches input order.