diff --git a/voice/engine/src/audio_ml/vad.rs b/voice/engine/src/audio_ml/vad.rs index b757883..b16e739 100644 --- a/voice/engine/src/audio_ml/vad.rs +++ b/voice/engine/src/audio_ml/vad.rs @@ -14,6 +14,19 @@ pub enum VadEvent { SpeechEnd, } +/// Default VAD threshold used when the bot is silent / listening. +pub const VAD_THRESHOLD_IDLE: f32 = 0.70; + +/// VAD threshold used during bot playback on the standard Reactor path. +/// Audio is pre-filtered by the denoiser before reaching VAD, so 0.85 gives +/// meaningful noise suppression without requiring the user to shout to barge in. +pub const VAD_THRESHOLD_PLAYBACK: f32 = 0.85; + +/// VAD threshold used during bot playback whenever VAD sees raw (undenoised) +/// audio: the Gemini Live path, and the Reactor path with the denoiser disabled. +/// Higher than `VAD_THRESHOLD_PLAYBACK` because no denoiser pre-filters mic input. +pub const VAD_THRESHOLD_PLAYBACK_RAW: f32 = 0.90; + #[derive(Debug, Clone)] pub struct VadConfig { pub threshold: f32, @@ -27,7 +40,7 @@ pub struct VadConfig { impl Default for VadConfig { fn default() -> Self { Self { - threshold: 0.7, + threshold: VAD_THRESHOLD_IDLE, min_volume: 0.0035, silence_frames: 6, min_speech_frames: 6, // Increased from 3 (96ms) to 6 (192ms) to filter pops/echo @@ -59,6 +72,17 @@ pub struct SileroVad { } impl SileroVad { + pub fn threshold(&self) -> f32 { + self.config.threshold + } + + pub fn set_threshold(&mut self, threshold: f32) { + // No comparison happens here — this is a plain store. The exact f32 `!=` + // lives in VadStage::set_threshold and is safe only while callers pass the + // VAD_THRESHOLD_* constants; use an epsilon there if thresholds are ever computed. + self.config.threshold = threshold; + } + pub fn new(model_path: &str, config: VadConfig) -> Self { let ctx = config.context_size; Self { @@ -327,3 +351,75 @@ impl SileroVad { None } } + +#[cfg(test)] +mod tests { + use super::*; + + /// Build a synthetic PCM-16 frame at the given RMS amplitude (0.0–1.0). + /// 512 samples at 16kHz = 32ms, matching FRAME_SIZE. 
+ fn make_frame(amplitude: f32) -> Vec<u8> { + let n = FRAME_SIZE; + let sample = (amplitude * 32767.0) as i16; + let mut frame = Vec::with_capacity(n * 2); + for _ in 0..n { + frame.extend_from_slice(&sample.to_le_bytes()); + } + frame + } + + fn make_vad(threshold: f32) -> SileroVad { + SileroVad::new( + "", + VadConfig { + threshold, + min_volume: 0.0, // disable volume gate so tests focus on threshold + silence_frames: 6, + min_speech_frames: 1, // fire SpeechStart on the first positive frame + lookback_frames: 0, + context_size: 0, + }, + ) + } + + /// prob=0.80 should fire SpeechStart at the idle threshold (0.70) but be + /// treated as silence at the playback threshold (0.85). + #[test] + fn threshold_controls_speech_detection() { + let frame = make_frame(0.1); + + // At idle threshold: 0.80 >= 0.70 → speech + let mut vad = make_vad(VAD_THRESHOLD_IDLE); + let result = vad.process_with_prob(0.80, &frame); + assert_eq!(result, Some(VadEvent::SpeechStart)); + + // At playback threshold: 0.80 < 0.85 → silence, no event + let mut vad = make_vad(VAD_THRESHOLD_PLAYBACK); + let result = vad.process_with_prob(0.80, &frame); + assert_eq!(result, None); + } + + /// set_threshold mid-stream updates the comparison boundary immediately. 
+ #[test] + fn set_threshold_takes_effect_immediately() { + let frame = make_frame(0.1); + let mut vad = make_vad(VAD_THRESHOLD_IDLE); + + // Prime with sub-threshold prob so is_speaking stays false + vad.process_with_prob(0.50, &frame); + assert!(!vad.is_speaking()); + + // Elevate to playback threshold — 0.80 should now be below the gate + vad.set_threshold(VAD_THRESHOLD_PLAYBACK); + let result = vad.process_with_prob(0.80, &frame); + assert_eq!( + result, None, + "prob 0.80 should be below playback threshold 0.85" + ); + + // Drop back to idle — same prob should now trigger + vad.set_threshold(VAD_THRESHOLD_IDLE); + let result = vad.process_with_prob(0.80, &frame); + assert_eq!(result, Some(VadEvent::SpeechStart)); + } +} diff --git a/voice/engine/src/reactor/audio.rs b/voice/engine/src/reactor/audio.rs index c0cb425..b1a529f 100644 --- a/voice/engine/src/reactor/audio.rs +++ b/voice/engine/src/reactor/audio.rs @@ -24,7 +24,36 @@ impl Reactor { // SAFETY NOTE: The closure is sync and non-recursive; the reactor's // on_vad_event is async but is called *after* process_frames completes // (we collect the VAD result inside the closure, not await inside it). + + // True while TTS audio is actively being streamed to the client. + // bot_audio_sent: set on first TTS chunk, cleared on barge-in/cancel. + // tts.is_active(): true between start_ws/http() and mark_finished()/cancel(). + // Combined, this is narrower than is_pipeline_active(), which also covers LLM. + let is_playing = self.bot_audio_sent && self.tts.is_active(); let mut vad_event: Option<VadEvent> = None; + + // Threshold is a packet-level decision: is_playing doesn't change within a + // process_frames batch, so set it once here rather than once per frame. + // + // When the denoiser is disabled, audio reaches VAD unfiltered (raw mic), + // so we use VAD_THRESHOLD_PLAYBACK_RAW (0.90) instead of the denoised + // playback threshold (0.85) to maintain equivalent echo rejection. 
+ // + // Known behaviour: if is_playing flips true while the user is already + // mid-utterance, the threshold elevation can cause a premature SpeechEnd + // (~192 ms / 6 silence frames). This is acceptable because the bot does not + // normally start TTS while the user is speaking (barge-in clears TTS first); + // re-engagement prompts are the only realistic scenario. + self.vad.set_threshold(if is_playing { + if self.denoiser.is_enabled() { + crate::audio_ml::vad::VAD_THRESHOLD_PLAYBACK + } else { + crate::audio_ml::vad::VAD_THRESHOLD_PLAYBACK_RAW + } + } else { + crate::audio_ml::vad::VAD_THRESHOLD_IDLE + }); + self.ring_buffer.process_frames(&resampled, |frame| { // Denoise (inline ONNX, or passthrough if disabled). // denoiser.process() allocates for the model output; the frame diff --git a/voice/engine/src/reactor/proc/denoiser.rs b/voice/engine/src/reactor/proc/denoiser.rs index 69cdacd..14dc7e4 100644 --- a/voice/engine/src/reactor/proc/denoiser.rs +++ b/voice/engine/src/reactor/proc/denoiser.rs @@ -48,6 +48,11 @@ impl DenoiserStage { } } + /// Returns `true` when a denoiser model is active; `false` in passthrough mode. + pub fn is_enabled(&self) -> bool { + self.inner.is_some() + } + /// Process a single 16kHz PCM-16 LE audio frame. /// Returns cleaned audio (or the original if denoising is disabled/failed). pub fn process(&mut self, frame: &[u8]) -> Vec<u8> { diff --git a/voice/engine/src/reactor/proc/vad.rs b/voice/engine/src/reactor/proc/vad.rs index 6ae1988..9987d9b 100644 --- a/voice/engine/src/reactor/proc/vad.rs +++ b/voice/engine/src/reactor/proc/vad.rs @@ -14,6 +14,14 @@ pub struct VadStage { } impl VadStage { + pub fn set_threshold(&mut self, threshold: f32) { + // Safe to use != here: callers always supply named constants (VAD_THRESHOLD_*). + // See the note in SileroVad::set_threshold if arithmetic thresholds are ever added. 
+ if self.inner.threshold() != threshold { + self.inner.set_threshold(threshold); + } + } + pub fn new(model_path: &str, config: VadConfig) -> Self { Self { inner: SileroVad::new(model_path, config), diff --git a/voice/engine/src/session.rs b/voice/engine/src/session.rs index 3cbda1c..072e3a4 100644 --- a/voice/engine/src/session.rs +++ b/voice/engine/src/session.rs @@ -455,7 +455,6 @@ async fn run_native_multimodal( let mut hangup_target: Option = None; let mut hangup_max_target: Option = None; - // ── Main event loop ──────────────────────────────────────────── loop { tokio::select! { @@ -496,6 +495,19 @@ async fn run_native_multimodal( // Frame-align for VAD; collect audio to push async afterward. let mut pending_pcm: Vec<Vec<u8>> = Vec::new(); let mut vad_event: Option<VadEvent> = None; + + // Threshold is a packet-level decision: bot_speaking doesn't change + // within a process_frames batch, so set it once here. + // Raw (undenoised) audio goes to Gemini — raise threshold during playback + // to suppress background noise from falsely triggering a local barge-in. + if vad_ok { + vad.set_threshold(if bot_speaking { + crate::audio_ml::vad::VAD_THRESHOLD_PLAYBACK_RAW + } else { + crate::audio_ml::vad::VAD_THRESHOLD_IDLE + }); + } + ring.process_frames(&resampled, |frame| { if recording_enabled { tracer.emit(Event::UserAudio {