Skip to content
Merged
Show file tree
Hide file tree
Changes from all commits
Commits
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
98 changes: 97 additions & 1 deletion voice/engine/src/audio_ml/vad.rs
Original file line number Diff line number Diff line change
Expand Up @@ -14,6 +14,19 @@ pub enum VadEvent {
SpeechEnd,
}

/// Baseline VAD threshold, applied while the bot is silent and listening.
/// Also the value `VadConfig::default()` starts from.
pub const VAD_THRESHOLD_IDLE: f32 = 0.7;

/// VAD threshold applied while the bot is playing audio on the standard
/// Reactor path. On that path the denoiser pre-filters audio before VAD sees
/// it, so 0.85 suppresses noise meaningfully while still letting the user
/// barge in without shouting.
pub const VAD_THRESHOLD_PLAYBACK: f32 = 0.85;

/// VAD threshold applied while the bot is playing audio on the Gemini Live
/// path. Set above `VAD_THRESHOLD_PLAYBACK` because mic input on this path is
/// raw (undenoised): nothing pre-filters it before it reaches VAD.
pub const VAD_THRESHOLD_PLAYBACK_RAW: f32 = 0.9;

#[derive(Debug, Clone)]
pub struct VadConfig {
pub threshold: f32,
Expand All @@ -27,7 +40,7 @@ pub struct VadConfig {
impl Default for VadConfig {
fn default() -> Self {
Self {
threshold: 0.7,
threshold: VAD_THRESHOLD_IDLE,
min_volume: 0.0035,
silence_frames: 6,
min_speech_frames: 6, // Increased from 3 (96ms) to 6 (192ms) to filter pops/echo
Expand Down Expand Up @@ -59,6 +72,17 @@ pub struct SileroVad {
}

impl SileroVad {
/// Returns the threshold currently stored in this VAD's config.
pub fn threshold(&self) -> f32 {
self.config.threshold
}

/// Overwrites the threshold stored in this VAD's config with `threshold`.
pub fn set_threshold(&mut self, threshold: f32) {
// Unconditional store; no comparison happens in this method.
//
// Note for callers that skip redundant updates by comparing against
// `threshold()` (e.g. `VadStage::set_threshold`): exact float equality is
// safe only while both sides come from the named `VAD_THRESHOLD_*`
// constants. If a threshold is ever derived by arithmetic, switch to an
// epsilon comparison to avoid IEEE 754 surprises.
self.config.threshold = threshold;
}

pub fn new(model_path: &str, config: VadConfig) -> Self {
let ctx = config.context_size;
Self {
Expand Down Expand Up @@ -327,3 +351,75 @@ impl SileroVad {
None
}
}

#[cfg(test)]
mod tests {
    use super::*;

    /// Synthesises one PCM-16 LE frame in which every sample has the same
    /// value, i.e. a frame at the given RMS amplitude (0.0–1.0).
    /// 512 samples at 16kHz = 32ms, matching FRAME_SIZE.
    fn make_frame(amplitude: f32) -> Vec<u8> {
        let pcm = (amplitude * 32767.0) as i16;
        // Two little-endian bytes per sample, repeated FRAME_SIZE times.
        pcm.to_le_bytes().repeat(FRAME_SIZE)
    }

    /// Creates a VAD (empty model path) whose config isolates the threshold
    /// comparison from the other gates.
    fn make_vad(threshold: f32) -> SileroVad {
        let config = VadConfig {
            threshold,
            // Volume gate off so the tests exercise the threshold only.
            min_volume: 0.0,
            silence_frames: 6,
            // A single positive frame is enough to fire SpeechStart.
            min_speech_frames: 1,
            lookback_frames: 0,
            context_size: 0,
        };
        SileroVad::new("", config)
    }

    /// A probability of 0.80 lies between the idle (0.70) and playback (0.85)
    /// thresholds: it must start speech under the former and be ignored under
    /// the latter.
    #[test]
    fn threshold_controls_speech_detection() {
        let frame = make_frame(0.1);

        // Idle threshold: 0.80 >= 0.70 → speech.
        let mut idle_vad = make_vad(VAD_THRESHOLD_IDLE);
        assert_eq!(
            idle_vad.process_with_prob(0.80, &frame),
            Some(VadEvent::SpeechStart)
        );

        // Playback threshold: 0.80 < 0.85 → silence, so no event.
        let mut playback_vad = make_vad(VAD_THRESHOLD_PLAYBACK);
        assert_eq!(playback_vad.process_with_prob(0.80, &frame), None);
    }

    /// Changing the threshold mid-stream moves the comparison boundary for
    /// the very next frame.
    #[test]
    fn set_threshold_takes_effect_immediately() {
        let frame = make_frame(0.1);
        let mut vad = make_vad(VAD_THRESHOLD_IDLE);

        // A sub-threshold probability first, so is_speaking stays false.
        let _ = vad.process_with_prob(0.50, &frame);
        assert!(!vad.is_speaking());

        // Raised to the playback threshold, 0.80 falls below the gate.
        vad.set_threshold(VAD_THRESHOLD_PLAYBACK);
        let while_playing = vad.process_with_prob(0.80, &frame);
        assert_eq!(
            while_playing, None,
            "prob 0.80 should be below playback threshold 0.85"
        );

        // Lowered back to idle, the same probability now triggers speech.
        vad.set_threshold(VAD_THRESHOLD_IDLE);
        let after_reset = vad.process_with_prob(0.80, &frame);
        assert_eq!(after_reset, Some(VadEvent::SpeechStart));
    }
}
29 changes: 29 additions & 0 deletions voice/engine/src/reactor/audio.rs
Original file line number Diff line number Diff line change
Expand Up @@ -24,7 +24,36 @@ impl Reactor {
// SAFETY NOTE: The closure is sync and non-recursive; the reactor's
// on_vad_event is async but is called *after* process_frames completes
// (we collect the VAD result inside the closure, not await inside it).

// True while TTS audio is actively being streamed to the client.
// bot_audio_sent: set on first TTS chunk, cleared on barge-in/cancel.
// tts.is_active(): true between start_ws/http() and mark_finished()/cancel().
// Combined this is narrower than is_pipeline_active(), which also covers LLM.
let is_playing = self.bot_audio_sent && self.tts.is_active();
let mut vad_event: Option<crate::types::VadEvent> = None;

// Threshold is a packet-level decision: is_playing doesn't change within a
// process_frames batch, so set it once here rather than once per frame.
//
// When the denoiser is disabled, audio reaches VAD unfiltered (raw mic),
// so we use VAD_THRESHOLD_PLAYBACK_RAW (0.90) instead of the denoised
// playback threshold (0.85) to maintain equivalent echo rejection.
//
// Known behaviour: if is_playing flips true while the user is already
// mid-utterance, the threshold elevation can cause a premature SpeechEnd
// (~192 ms / 6 silence frames). This is acceptable because the bot does not
// normally start TTS while the user is speaking (barge-in clears TTS first);
// re-engagement prompts are the only realistic scenario.
self.vad.set_threshold(if is_playing {
if self.denoiser.is_enabled() {
crate::audio_ml::vad::VAD_THRESHOLD_PLAYBACK
} else {
crate::audio_ml::vad::VAD_THRESHOLD_PLAYBACK_RAW
}
} else {
crate::audio_ml::vad::VAD_THRESHOLD_IDLE
});

self.ring_buffer.process_frames(&resampled, |frame| {
// Denoise (inline ONNX, or passthrough if disabled).
// denoiser.process() allocates for the model output; the frame
Expand Down
5 changes: 5 additions & 0 deletions voice/engine/src/reactor/proc/denoiser.rs
Original file line number Diff line number Diff line change
Expand Up @@ -48,6 +48,11 @@ impl DenoiserStage {
}
}

/// Reports whether a denoiser model is active.
///
/// Returns `true` when `self.inner.is_some()` (a model is present) and
/// `false` otherwise, i.e. when the stage is in passthrough mode.
pub fn is_enabled(&self) -> bool {
self.inner.is_some()
}

/// Process a single 16kHz PCM-16 LE audio frame.
/// Returns cleaned audio (or the original if denoising is disabled/failed).
pub fn process(&mut self, frame: &[u8]) -> Vec<u8> {
Expand Down
8 changes: 8 additions & 0 deletions voice/engine/src/reactor/proc/vad.rs
Original file line number Diff line number Diff line change
Expand Up @@ -14,6 +14,14 @@ pub struct VadStage {
}

impl VadStage {
/// Forwards `threshold` to the wrapped `SileroVad`, skipping the write when
/// the VAD already holds exactly that value.
pub fn set_threshold(&mut self, threshold: f32) {
    // Exact float comparison is safe here: callers always supply named
    // constants (VAD_THRESHOLD_*). See the note in SileroVad::set_threshold
    // if arithmetic thresholds are ever added.
    let current = self.inner.threshold();
    if current == threshold {
        return;
    }
    self.inner.set_threshold(threshold);
}

pub fn new(model_path: &str, config: VadConfig) -> Self {
Self {
inner: SileroVad::new(model_path, config),
Expand Down
14 changes: 13 additions & 1 deletion voice/engine/src/session.rs
Original file line number Diff line number Diff line change
Expand Up @@ -455,7 +455,6 @@ async fn run_native_multimodal(
let mut hangup_target: Option<tokio::time::Instant> = None;
let mut hangup_max_target: Option<tokio::time::Instant> = None;


// ── Main event loop ────────────────────────────────────────────
loop {
tokio::select! {
Expand Down Expand Up @@ -496,6 +495,19 @@ async fn run_native_multimodal(
// Frame-align for VAD; collect audio to push async afterward.
let mut pending_pcm: Vec<Vec<i16>> = Vec::new();
let mut vad_event: Option<crate::types::VadEvent> = None;

// Threshold is a packet-level decision: bot_speaking doesn't change
// within a process_frames batch, so set it once here.
// Raw (undenoised) audio goes to Gemini — raise threshold during playback
// to suppress background noise from falsely triggering a local barge-in.
if vad_ok {
vad.set_threshold(if bot_speaking {
crate::audio_ml::vad::VAD_THRESHOLD_PLAYBACK_RAW
} else {
crate::audio_ml::vad::VAD_THRESHOLD_IDLE
});
}

ring.process_frames(&resampled, |frame| {
if recording_enabled {
tracer.emit(Event::UserAudio {
Expand Down
Loading