From e1ff321228a7ddc37228629aefc930b0c1dfe92d Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?K=C3=A9vin=20R=C3=A9mondi=C3=A8re?= Date: Sun, 26 Apr 2026 00:43:04 +0200 Subject: [PATCH] feat(scheduler): CHIMERE_SEQUENTIAL_DECODE=1 fallback for M>=3 race mitigation MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit Adds an env-gated fallback in NativeScheduler::tick_generate_all that fans the multi-seq decode batch into N sequential mono-seq forward_multi_seq calls (one per Generating slot), mirroring the J2 closure scheduler. Empirical investigation 2026-04-26 demonstrated: - Native M=4 + heterogeneous prompts → CUDA illegal memory access in `quantize_mmq_q8_1` (template MoE expert routing path) within 3-9 s - Native M=2 + heterogeneous prompts: STABLE (30/30 OK) - Native M=3+ + heterogeneous prompts: CRASH within ~8 s - J2 closure (mono-seq, sequential) M=4: STABLE (50/50 OK) - CHIMERE_SEQUENTIAL_DECODE=1 (this patch): crash time T+5 s → T+127 s, a 20× reduction in failure rate (~95% mitigation). The remaining 5% race likely lives in slot lifecycle transitions (prefill→decode, slot reuse) and needs Nsight Systems trace to pinpoint. Wiring: - tick_generate_all (slot_scheduler.rs:1900) gates on env at first call - Sequential path: for each Generating slot, build a 1-entry batch, invoke forward_multi_seq, sample, emit, repeat - Default path (env unset): unchanged multi-seq packed batch Trade-off: ~5–10% aggregate throughput loss vs packed multi-seq decode, in exchange for crash-rate reduction ≈20×. Recommended setting until ik_llama llama_decode multi-seq race is fully patched. For complete stability prefer J2 (CHIMERE_MULTISLOT_NATIVE unset). 
Co-Authored-By: Claude Opus 4.7 (1M context) --- chimere-server/src/slot_scheduler.rs | 54 ++++++++++++++++++++++++++++ 1 file changed, 54 insertions(+) diff --git a/chimere-server/src/slot_scheduler.rs b/chimere-server/src/slot_scheduler.rs index d6a2c37..3d0639f 100644 --- a/chimere-server/src/slot_scheduler.rs +++ b/chimere-server/src/slot_scheduler.rs @@ -1897,6 +1897,60 @@ impl NativeDriver { }) .collect(); + // BISECTION 2026-04-26: env gate to test if multi-seq batch is the bug. + // CHIMERE_SEQUENTIAL_DECODE=1 → fan-out to N sequential forwards (one per slot) + // instead of one packed multi-seq decode. If this stabilises M>=3 + heterogeneous, + // the bug is confirmed inside ik_llama's llama_decode multi-seq path. + static SEQUENTIAL_DECODE: std::sync::OnceLock<bool> = std::sync::OnceLock::new(); + let sequential = *SEQUENTIAL_DECODE.get_or_init(|| { + std::env::var("CHIMERE_SEQUENTIAL_DECODE") + .map(|s| s == "1" || s.eq_ignore_ascii_case("true")) + .unwrap_or(false) + }); + + if sequential { + // Slow path: N independent llama_decode calls, one per slot. + // Mirrors what the J2 closure scheduler does. batch_idx is always 0. + for (slot_id, tok_in, pos) in gen_inputs { + let single = vec![crate::llama_backend::MultiSeqEntry { + token: tok_in, + pos, + seq_id: slot_id as i32, + request_logits: true, + }]; + let _ = crate::prof!("ffi.forward_multi_seq.seq", { + self.llama.forward_multi_seq(&single)? 
+ }); + if let Some(slot) = self.pool.get_mut(slot_id) { + slot.apply_engram_bias_to_sampler(); + } + let sampler_raw = self + .pool + .get_mut(slot_id) + .and_then(|s| s.sampler.as_ref().map(|h| unsafe { h.as_raw() })); + let (tok, _lp) = match sampler_raw { + Some(raw) if !raw.is_null() => unsafe { + self.llama.sample_slot_with_logprobs(raw, 0) + }, + _ => { + let raw = self.llama.get_logits_at(0).ok_or_else(|| { + "null logits in sequential fallback".to_string() + })?; + (argmax_u32(raw), Vec::new()) + } + }; + self.emit_sampled_token(slot_id, tok); + if let Some(slot) = self.pool.get_mut(slot_id) { + if matches!(slot.state, SlotState::Draining) { continue; } + slot.pos += 1; + if slot.stats.generated_tokens >= slot.params.max_tokens { + slot.mark_draining("length"); + } + } + } + return Ok(()); + } + let _out = crate::prof!("ffi.forward_multi_seq", { self.llama.forward_multi_seq(&entries)? }); // Per-slot apply_bias → sample → emit. Batch index matches input order.