Skip to content
Open
Show file tree
Hide file tree
Changes from all commits
Commits
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
2 changes: 1 addition & 1 deletion crates/vad/src/continuous.rs
Original file line number Diff line number Diff line change
Expand Up @@ -7,7 +7,7 @@ use std::{

use futures_util::{future, Stream, StreamExt};
use kalosm_sound::AsyncSource;
use silero_rs::{VadConfig, VadSession, VadTransition};
pub use silero_rs::{VadConfig, VadSession, VadTransition};

#[derive(Debug, Clone)]
pub enum VadStreamItem {
Expand Down
47 changes: 41 additions & 6 deletions plugins/listener/src/fsm.rs
Original file line number Diff line number Diff line change
Expand Up @@ -68,6 +68,8 @@ struct AudioChannels {
process_mic_rx: flume::Receiver<Vec<f32>>,
process_speaker_tx: flume::Sender<Vec<f32>>,
process_speaker_rx: flume::Receiver<Vec<f32>>,
control_tx: flume::Sender<owhisper_interface::ControlMessage>,
control_rx: flume::Receiver<owhisper_interface::ControlMessage>,
}

impl AudioChannels {
Expand All @@ -81,6 +83,8 @@ impl AudioChannels {
let (process_speaker_tx, process_speaker_rx) =
flume::bounded::<Vec<f32>>(CHUNK_BUFFER_SIZE);

let (control_tx, control_rx) = flume::bounded::<owhisper_interface::ControlMessage>(8);

let (save_mic_raw_tx, save_mic_raw_rx) = if cfg!(debug_assertions) {
let (tx, rx) = flume::bounded::<Vec<f32>>(CHUNK_BUFFER_SIZE);
(Some(tx), Some(rx))
Expand Down Expand Up @@ -110,6 +114,8 @@ impl AudioChannels {
process_mic_rx,
process_speaker_tx,
process_speaker_rx,
control_tx,
control_rx,
}
}

Expand Down Expand Up @@ -293,13 +299,21 @@ impl Session {
let save_speaker_raw_tx = channels.save_speaker_raw_tx.clone();
let process_mic_tx = channels.process_mic_tx.clone();
let process_speaker_tx = channels.process_speaker_tx.clone();
let control_tx = channels.control_tx.clone();

async move {
let mut aec = hypr_aec::AEC::new().unwrap();
let mut mic_agc = hypr_agc::Agc::default();
let mut speaker_agc = hypr_agc::Agc::default();
let mut last_broadcast = Instant::now();

let mut vad = hypr_vad::VadSession::new(hypr_vad::VadConfig {
redemption_time: Duration::from_millis(70),
pre_speech_pad: Duration::from_millis(70),
..Default::default()
})
.unwrap();

loop {
let (mut mic_chunk_raw, mut speaker_chunk): (Vec<f32>, Vec<f32>) =
match tokio::join!(mic_rx.recv_async(), speaker_rx.recv_async()) {
Expand All @@ -325,6 +339,21 @@ impl Session {
let _ = rx.changed().await;
continue;
}
let mixed: Vec<f32> = mic_chunk
.iter()
.zip(speaker_chunk.iter())
.map(|(mic, speaker)| (mic + speaker).clamp(-1.0, 1.0))
.collect();

if let Ok(transitions) = vad.process(&mixed) {
for transition in transitions {
if let hypr_vad::VadTransition::SpeechEnd { .. } = transition {
let _ = control_tx
.send_async(owhisper_interface::ControlMessage::Finalize)
.await;
}
}
}

let processed_mic = mic_chunk.clone();
let processed_speaker = speaker_chunk.clone();
Expand Down Expand Up @@ -442,12 +471,18 @@ impl Session {
.into_stream()
.map(|v| hypr_audio_utils::f32_to_i16_bytes(v.into_iter()));

let combined_audio_stream =
mic_audio_stream
.zip(speaker_audio_stream)
.map(|(mic, speaker)| {
owhisper_interface::MixedMessage::Audio((mic.into(), speaker.into()))
});
let audio_stream = mic_audio_stream
.zip(speaker_audio_stream)
.map(|(mic, speaker)| {
owhisper_interface::MixedMessage::Audio((mic.into(), speaker.into()))
});

let control_stream = channels
.control_rx
.into_stream()
.map(|control| owhisper_interface::MixedMessage::Control(control));

let combined_audio_stream = futures_util::stream::select(audio_stream, control_stream);

tasks.spawn({
let app = self.app.clone();
Expand Down
37 changes: 29 additions & 8 deletions plugins/listener/src/manager.rs
Original file line number Diff line number Diff line change
Expand Up @@ -2,13 +2,15 @@
pub struct TranscriptManager {
id: uuid::Uuid,
partial_words: Vec<owhisper_interface::Word>,
final_words: Vec<owhisper_interface::Word>,
}

impl Default for TranscriptManager {
fn default() -> Self {
Self {
id: uuid::Uuid::new_v4(),
partial_words: Vec::new(),
final_words: Vec::new(),
}
}
}
Expand Down Expand Up @@ -93,16 +95,35 @@ impl TranscriptManager {
};

if is_final {
let last_final_word_end = words.last().unwrap().end;
self.partial_words = self
.partial_words
.iter()
.filter(|w| w.start > last_final_word_end)
.cloned()
.collect::<Vec<_>>();
// TODO: maybe we should replace.
let new_final_words: Vec<owhisper_interface::Word> = words
.into_iter()
.filter(|new_word| {
!self.final_words.iter().any(|existing_word| {
(existing_word.start - new_word.start).abs() < 0.001
&& (existing_word.end - new_word.end).abs() < 0.001
})
})
.collect();

self.final_words.extend(new_final_words.clone());
self.final_words.sort_by(|a, b| {
a.start
.partial_cmp(&b.start)
.unwrap_or(std::cmp::Ordering::Equal)
});

if let Some(last_final_word) = new_final_words.last() {
self.partial_words = self
.partial_words
.iter()
.filter(|w| w.start > last_final_word.end)
.cloned()
.collect::<Vec<_>>();
}

return Diff {
final_words: words.clone(),
final_words: new_final_words,
partial_words: self.partial_words.clone(),
};
} else if data.confidence > 0.6 {
Expand Down
Loading