From bfa522d16dffdae9d9e3e4f4e61e4f69febce0aa Mon Sep 17 00:00:00 2001 From: uqio <276879906+uqio@users.noreply.github.com> Date: Tue, 21 Apr 2026 16:37:01 +1200 Subject: [PATCH 1/4] feat(serde): support serde on `SessionOptions` and `SpeechOptions` --- CHANGELOG.md | 10 + Cargo.toml | 7 +- examples/streaming.rs | 2 +- src/detector.rs | 76 ++++---- src/options.rs | 420 +++++++++++++++++++++++++++++++++++------- src/session.rs | 15 +- 6 files changed, 417 insertions(+), 113 deletions(-) diff --git a/CHANGELOG.md b/CHANGELOG.md index 884db60..1412816 100644 --- a/CHANGELOG.md +++ b/CHANGELOG.md @@ -7,6 +7,16 @@ and this project adheres to [Semantic Versioning](https://semver.org/spec/v2.0.0 ## [Unreleased] +## [0.2.0] - 2026-04-21 + +### Added + +- `serde` support for `*Options` + +### Changed + +- Change `u32` ms to `Duration` in `SpeechOptions` + ## [0.1.0] - 2026-04-08 ### Added diff --git a/Cargo.toml b/Cargo.toml index 1e13fe1..d5b50d1 100644 --- a/Cargo.toml +++ b/Cargo.toml @@ -1,6 +1,6 @@ [package] name = "silero" -version = "0.1.1" +version = "0.2.0" edition = "2024" rust-version = "1.85" repository = "https://github.com/Findit-AI/silero" @@ -27,13 +27,18 @@ include = [ [features] default = ["bundled"] bundled = [] +serde = ["dep:serde", "dep:humantime-serde"] [dependencies] ort = "2.0.0-rc.12" thiserror = "2" +serde = { version = "1", optional = true, features = ["derive"] } +humantime-serde = { version = "1", optional = true } + [dev-dependencies] hound = "3" +serde_json = "1" [[example]] name = "detect_file" diff --git a/examples/streaming.rs b/examples/streaming.rs index 13be5bf..e531a7d 100644 --- a/examples/streaming.rs +++ b/examples/streaming.rs @@ -9,7 +9,7 @@ fn main() -> Result<(), Box> { let mut session = Session::from_memory(MODEL_BYTES)?; let config = SpeechOptions::default(); let mut stream = StreamState::new(config.sample_rate()); - let mut segmenter = SpeechSegmenter::new(config); + let mut segmenter = SpeechSegmenter::new(config.clone()); let synthetic_audio = vec![0.0_f32; config.sample_rate().chunk_samples() * 8]; segmenter.process_samples(&mut session, &mut stream, &synthetic_audio, |segment| { diff --git a/src/detector.rs b/src/detector.rs index 40770ad..fa4d7bf 100644 --- a/src/detector.rs +++ b/src/detector.rs @@ -14,7 +14,7 @@ pub struct SpeechSegment { impl SpeechSegment { /// Create a new speech segment with the given start and end samples and sample rate. - #[inline] + #[cfg_attr(not(tarpaulin), inline(always))] pub const fn new(start_sample: u64, end_sample: u64, sample_rate: SampleRate) -> Self { Self { start_sample, @@ -24,37 +24,37 @@ impl SpeechSegment { } /// Returns the start sample of this speech segment. - #[inline] + #[cfg_attr(not(tarpaulin), inline(always))] pub const fn start_sample(&self) -> u64 { self.start_sample } /// Returns the end sample of this speech segment. - #[inline] + #[cfg_attr(not(tarpaulin), inline(always))] pub const fn end_sample(&self) -> u64 { self.end_sample } /// Returns the sample rate of this speech segment. - #[inline] + #[cfg_attr(not(tarpaulin), inline(always))] pub const fn sample_rate(&self) -> SampleRate { self.sample_rate } /// Returns the number of samples in this speech segment. - #[inline] + #[cfg_attr(not(tarpaulin), inline(always))] pub const fn sample_count(&self) -> u64 { self.end_sample.saturating_sub(self.start_sample) } /// Returns the start time of this speech segment in seconds. - #[inline] + #[cfg_attr(not(tarpaulin), inline(always))] pub fn start_seconds(&self) -> f64 { self.start_sample as f64 / self.sample_rate.hz() as f64 } /// Returns the end time of this speech segment in seconds. - #[inline] + #[cfg_attr(not(tarpaulin), inline(always))] pub fn end_seconds(&self) -> f64 { self.end_sample as f64 / self.sample_rate.hz() as f64 } @@ -109,10 +109,10 @@ impl SpeechSegmenter { /// /// Changing sample rate starts a new logical timeline, so any /// in-flight segment state is cleared. - #[inline] + #[cfg_attr(not(tarpaulin), inline(always))] pub fn set_sample_rate(&mut self, sample_rate: SampleRate) { if self.sample_rate() != sample_rate { - self.options = self.options.with_sample_rate(sample_rate); + self.options.set_sample_rate(sample_rate); self.reset(); } } @@ -341,6 +341,8 @@ pub fn detect_speech( #[cfg(test)] mod tests { + use std::time::Duration; + use crate::{SampleRate, SpeechOptions}; use super::{SpeechSegment, SpeechSegmenter}; @@ -366,7 +368,7 @@ mod tests { #[test] fn closes_segment_after_confirmed_silence() { let config = SpeechOptions::default(); - let mut segmenter = SpeechSegmenter::new(config); + let mut segmenter = SpeechSegmenter::new(config.clone()); let mut probabilities = vec![0.9; frame_count(320, SampleRate::Rate16k)]; probabilities.extend(vec![0.0; frame_count(128, SampleRate::Rate16k)]); @@ -379,7 +381,7 @@ mod tests { #[test] fn drops_short_bursts() { let config = SpeechOptions::default(); - let mut segmenter = SpeechSegmenter::new(config); + let mut segmenter = SpeechSegmenter::new(config.clone()); let mut probabilities = vec![0.9; frame_count(64, SampleRate::Rate16k)]; probabilities.extend(vec![0.0; frame_count(160, SampleRate::Rate16k)]); let segments = collect(&mut segmenter, &probabilities); @@ -389,9 +391,9 @@ mod tests { #[test] fn middle_band_frames_do_not_reset_tentative_end() { let config = SpeechOptions::default() - .with_min_speech_duration_ms(0) - .with_speech_pad_ms(0) - .with_min_silence_duration_ms(100); + .with_min_speech_duration(Duration::ZERO) + .with_speech_pad(Duration::ZERO) + .with_min_silence_duration(Duration::from_millis(100)); let mut segmenter = SpeechSegmenter::new(config); let mut probabilities = vec![0.9; 4]; @@ -457,9 +459,9 @@ mod tests { #[test] fn force_splits_long_speech_when_max_duration_is_reached() { let config = SpeechOptions::default() - .with_min_speech_duration_ms(0) - .with_speech_pad_ms(0) - .with_max_speech_duration_ms(160); + .with_min_speech_duration(Duration::ZERO) + .with_speech_pad(Duration::ZERO) + .with_max_speech_duration(Duration::from_millis(160)); let mut segmenter = SpeechSegmenter::new(config); let probabilities = vec![0.9; 8]; @@ -474,11 +476,11 @@ mod tests { #[test] fn prefers_recorded_silence_when_splitting_long_speech() { let config = SpeechOptions::default() - .with_min_speech_duration_ms(0) - .with_speech_pad_ms(0) - .with_min_silence_duration_ms(300) - .with_min_silence_at_max_speech_ms(64) - .with_max_speech_duration_ms(256); + .with_min_speech_duration(Duration::ZERO) + .with_speech_pad(Duration::ZERO) + .with_min_silence_duration(Duration::from_millis(300)) + .with_min_silence_at_max_speech(Duration::from_millis(64)) + .with_max_speech_duration(Duration::from_millis(256)); let mut segmenter = SpeechSegmenter::new(config); let mut probabilities = vec![0.9; 4]; probabilities.extend(vec![0.0; 4]); @@ -495,11 +497,11 @@ mod tests { #[test] fn non_qualifying_silence_does_not_overwrite_next_start() { let config = SpeechOptions::default() - .with_min_speech_duration_ms(0) - .with_speech_pad_ms(0) - .with_min_silence_duration_ms(10_000) - .with_min_silence_at_max_speech_ms(64) - .with_max_speech_duration_ms(512); + .with_min_speech_duration(Duration::ZERO) + .with_speech_pad(Duration::ZERO) + .with_min_silence_duration(Duration::from_millis(10_000)) + .with_min_silence_at_max_speech(Duration::from_millis(64)) + .with_max_speech_duration(Duration::from_millis(512)); let mut segmenter = SpeechSegmenter::new(config); let mut probabilities = vec![0.9; 4]; @@ -516,11 +518,11 @@ mod tests { #[test] fn force_split_during_silence_closes_without_restarting() { let config = SpeechOptions::default() - .with_min_speech_duration_ms(0) - .with_speech_pad_ms(0) - .with_min_silence_duration_ms(10_000) - .with_min_silence_at_max_speech_ms(64) - .with_max_speech_duration_ms(224); + .with_min_speech_duration(Duration::ZERO) + .with_speech_pad(Duration::ZERO) + .with_min_silence_duration(Duration::from_millis(10_000)) + .with_min_silence_at_max_speech(Duration::from_millis(64)) + .with_max_speech_duration(Duration::from_millis(224)); let mut segmenter = SpeechSegmenter::new(config); let mut probabilities = vec![0.9; 4]; @@ -535,11 +537,11 @@ mod tests { #[test] fn force_split_applies_speech_pad_to_split_boundaries() { let config = SpeechOptions::default() - .with_min_speech_duration_ms(0) - .with_speech_pad_ms(32) - .with_min_silence_duration_ms(10_000) - .with_min_silence_at_max_speech_ms(64) - .with_max_speech_duration_ms(512); + .with_min_speech_duration(Duration::ZERO) + .with_speech_pad(Duration::from_millis(32)) + .with_min_silence_duration(Duration::from_millis(10_000)) + .with_min_silence_at_max_speech(Duration::from_millis(64)) + .with_max_speech_duration(Duration::from_millis(512)); let mut segmenter = SpeechSegmenter::new(config); let mut probabilities = vec![0.9; 4]; diff --git a/src/options.rs b/src/options.rs index 79133ce..d425db3 100644 --- a/src/options.rs +++ b/src/options.rs @@ -1,13 +1,89 @@ +use core::time::Duration; + pub use ort::session::builder::GraphOptimizationLevel; use crate::error::{Error, Result}; +#[cfg(feature = "serde")] +use serde::{Deserialize, Serialize}; + +#[cfg(feature = "serde")] +mod graph_optimization_level { + use super::GraphOptimizationLevel; + use serde::*; + + #[derive( + Debug, Default, Clone, Copy, Eq, PartialEq, Hash, Ord, PartialOrd, Serialize, Deserialize, + )] + #[serde(rename_all = "snake_case")] + enum OptimizationLevel { + Disable, + Level1, + Level2, + #[default] + Level3, + All, + } + + impl From for OptimizationLevel { + #[inline] + fn from(value: GraphOptimizationLevel) -> Self { + match value { + GraphOptimizationLevel::Disable => Self::Disable, + GraphOptimizationLevel::Level1 => Self::Level1, + GraphOptimizationLevel::Level2 => Self::Level2, + GraphOptimizationLevel::Level3 => Self::Level3, + GraphOptimizationLevel::All => Self::All, + } + } + } + + impl From for GraphOptimizationLevel { + #[inline] + fn from(value: OptimizationLevel) -> Self { + match value { + OptimizationLevel::Disable => Self::Disable, + OptimizationLevel::Level1 => Self::Level1, + OptimizationLevel::Level2 => Self::Level2, + OptimizationLevel::Level3 => Self::Level3, + OptimizationLevel::All => Self::All, + } + } + } + + #[cfg_attr(not(tarpaulin), inline(always))] + pub fn serialize(level: &GraphOptimizationLevel, serializer: S) -> Result + where + S: Serializer, + { + OptimizationLevel::from(*level).serialize(serializer) + } + + #[cfg_attr(not(tarpaulin), inline(always))] + pub fn deserialize<'de, D>(deserializer: D) -> Result + where + D: Deserializer<'de>, + { + OptimizationLevel::deserialize(deserializer).map(Into::into) + } + + #[cfg_attr(not(tarpaulin), inline(always))] + pub const fn default() -> GraphOptimizationLevel { + GraphOptimizationLevel::Disable + } +} + /// Sample rates directly supported by the Silero VAD model. -#[derive(Debug, Clone, Copy, PartialEq, Eq, Hash)] +#[derive(Debug, Default, Clone, Copy, PartialEq, Eq, Hash)] +#[cfg_attr(feature = "serde", derive(Serialize, Deserialize))] +#[cfg_attr(feature = "serde", serde(rename_all = "snake_case"))] pub enum SampleRate { /// 8 kHz sample rate, which uses smaller chunks and less context. + #[cfg_attr(feature = "serde", serde(rename = "8k"))] Rate8k, /// 16 kHz sample rate, which uses larger chunks and more context for better accuracy. + #[cfg_attr(feature = "serde", serde(rename = "16k"))] + #[default] Rate16k, } @@ -49,22 +125,22 @@ impl SampleRate { } } } - -impl Default for SampleRate { - #[inline] - fn default() -> Self { - Self::Rate16k - } -} - /// Options for constructing an ONNX session. /// /// This type intentionally stays small. Deployment-specific runtime /// policy such as `intra_threads` / `inter_threads` should normally be /// configured one layer up, then passed down via /// [`crate::Session::from_ort_session`]. -#[derive(Debug, Clone, Copy)] +#[derive(Debug, Clone)] +#[cfg_attr(feature = "serde", derive(Serialize, Deserialize))] pub struct SessionOptions { + #[cfg_attr( + feature = "serde", + serde( + default = "graph_optimization_level::default", + with = "graph_optimization_level" + ) + )] optimization_level: GraphOptimizationLevel, } @@ -80,7 +156,7 @@ impl SessionOptions { #[cfg_attr(not(tarpaulin), inline(always))] pub const fn new() -> Self { Self { - optimization_level: GraphOptimizationLevel::Disable, + optimization_level: GraphOptimizationLevel::Level3, } } @@ -98,20 +174,76 @@ impl SessionOptions { } } +#[cfg_attr(not(tarpaulin), inline(always))] +const fn default_start_threshold() -> f32 { + 0.5 +} + +#[cfg_attr(not(tarpaulin), inline(always))] +const fn default_min_speech_duration() -> Duration { + Duration::from_millis(250) +} + +#[cfg_attr(not(tarpaulin), inline(always))] +const fn default_min_silence_duration() -> Duration { + Duration::from_millis(100) +} + +#[cfg_attr(not(tarpaulin), inline(always))] +const fn default_min_silence_at_max_speech() -> Duration { + Duration::from_millis(98) +} + +#[cfg_attr(not(tarpaulin), inline(always))] +const fn default_speech_pad() -> Duration { + Duration::from_millis(30) +} + /// Configuration for turning frame probabilities into speech segments. -#[derive(Debug, Clone, Copy)] +#[derive(Debug, Clone)] +#[cfg_attr(feature = "serde", derive(Serialize, Deserialize))] pub struct SpeechOptions { + #[cfg_attr(feature = "serde", serde(default))] sample_rate: SampleRate, + #[cfg_attr(feature = "serde", serde(default = "default_start_threshold"))] start_threshold: f32, + #[cfg_attr(feature = "serde", serde(skip_serializing_if = "Option::is_none"))] end_threshold: Option, - min_speech_duration_ms: u32, - min_silence_duration_ms: u32, - min_silence_at_max_speech_ms: u32, - max_speech_duration_ms: Option, - speech_pad_ms: u32, + #[cfg_attr( + feature = "serde", + serde(default = "default_min_speech_duration", with = "humantime_serde") + )] + min_speech_duration: Duration, + #[cfg_attr( + feature = "serde", + serde(default = "default_min_silence_duration", with = "humantime_serde") + )] + min_silence_duration: Duration, + #[cfg_attr( + feature = "serde", + serde( + default = "default_min_silence_at_max_speech", + with = "humantime_serde" + ) + )] + min_silence_at_max_speech: Duration, + #[cfg_attr( + feature = "serde", + serde( + skip_serializing_if = "Option::is_none", + with = "humantime_serde::option" + ) + )] + max_speech_duration: Option, + #[cfg_attr( + feature = "serde", + serde(default = "default_speech_pad", with = "humantime_serde") + )] + speech_pad: Duration, } impl Default for SpeechOptions { + #[cfg_attr(not(tarpaulin), inline(always))] fn default() -> Self { Self::new() } @@ -123,14 +255,14 @@ impl SpeechOptions { pub const fn new() -> Self { Self { sample_rate: SampleRate::Rate16k, - start_threshold: 0.5, + start_threshold: default_start_threshold(), end_threshold: None, - min_speech_duration_ms: 250, - min_silence_duration_ms: 100, + min_speech_duration: default_min_speech_duration(), + min_silence_duration: default_min_silence_duration(), // Matches the upstream silero-vad Python default (0.098 s). - min_silence_at_max_speech_ms: 98, - max_speech_duration_ms: None, - speech_pad_ms: 30, + min_silence_at_max_speech: default_min_silence_at_max_speech(), + max_speech_duration: None, + speech_pad: default_speech_pad(), } } @@ -164,50 +296,50 @@ impl SpeechOptions { /// Returns the minimum duration of detected speech segments, in milliseconds. #[cfg_attr(not(tarpaulin), inline(always))] - pub const fn min_speech_duration_ms(&self) -> u32 { - self.min_speech_duration_ms + pub const fn min_speech_duration(&self) -> Duration { + self.min_speech_duration } /// Returns the minimum duration of silence required to close a detected speech segment, in milliseconds. #[cfg_attr(not(tarpaulin), inline(always))] - pub const fn min_silence_duration_ms(&self) -> u32 { - self.min_silence_duration_ms + pub const fn min_silence_duration(&self) -> Duration { + self.min_silence_duration } /// Returns the minimum silence duration used as a preferred split point when the maximum speech duration is reached. #[cfg_attr(not(tarpaulin), inline(always))] - pub const fn min_silence_at_max_speech_ms(&self) -> u32 { - self.min_silence_at_max_speech_ms + pub const fn min_silence_at_max_speech(&self) -> Duration { + self.min_silence_at_max_speech } /// Returns the maximum duration of a speech segment before the segmenter force-splits it. #[cfg_attr(not(tarpaulin), inline(always))] - pub const fn max_speech_duration_ms(&self) -> Option { - self.max_speech_duration_ms + pub const fn max_speech_duration(&self) -> Option { + self.max_speech_duration } /// Returns the amount of padding to add to the start of detected speech segments, in milliseconds. #[cfg_attr(not(tarpaulin), inline(always))] - pub const fn speech_pad_ms(&self) -> u32 { - self.speech_pad_ms + pub const fn speech_pad(&self) -> Duration { + self.speech_pad } /// Returns the minimum duration of detected speech segments, in samples. #[cfg_attr(not(tarpaulin), inline(always))] pub fn min_speech_samples(&self) -> u64 { - ms_to_samples(self.min_speech_duration_ms, self.sample_rate) + ms_to_samples(self.min_speech_duration, self.sample_rate) } /// Returns the minimum duration of silence required to close a detected speech segment, in samples. #[cfg_attr(not(tarpaulin), inline(always))] pub fn min_silence_samples(&self) -> u64 { - ms_to_samples(self.min_silence_duration_ms, self.sample_rate) + ms_to_samples(self.min_silence_duration, self.sample_rate) } /// Returns the minimum silence duration used as a preferred split point when max speech duration is reached, in samples. #[cfg_attr(not(tarpaulin), inline(always))] pub fn min_silence_at_max_speech_samples(&self) -> u64 { - ms_to_samples(self.min_silence_at_max_speech_ms, self.sample_rate) + ms_to_samples(self.min_silence_at_max_speech, self.sample_rate) } /// Returns the maximum speech duration before force-splitting, in samples. @@ -219,8 +351,8 @@ impl SpeechOptions { /// the current segment and the start of the next one #[cfg_attr(not(tarpaulin), inline(always))] pub fn max_speech_samples(&self) -> Option { - self.max_speech_duration_ms.map(|duration_ms| { - ms_to_samples(duration_ms, self.sample_rate) + self.max_speech_duration.map(|duration| { + ms_to_samples(duration, self.sample_rate) .saturating_sub(self.sample_rate.chunk_samples() as u64) .saturating_sub(self.speech_pad_samples().saturating_mul(2)) }) @@ -229,20 +361,20 @@ impl SpeechOptions { /// Returns the amount of padding to add to the start of detected speech segments, in samples. #[cfg_attr(not(tarpaulin), inline(always))] pub fn speech_pad_samples(&self) -> u64 { - ms_to_samples(self.speech_pad_ms, self.sample_rate) + ms_to_samples(self.speech_pad, self.sample_rate) } /// Set the sample rate to use for speech detection. #[cfg_attr(not(tarpaulin), inline(always))] pub const fn with_sample_rate(mut self, sample_rate: SampleRate) -> Self { - self.sample_rate = sample_rate; + self.set_sample_rate(sample_rate); self } /// Set the start threshold, which must be between 0 and 1. If not set, it defaults to 0.5. #[cfg_attr(not(tarpaulin), inline(always))] - pub fn with_start_threshold(mut self, threshold: f32) -> Self { - self.start_threshold = sanitize_probability(threshold); + pub const fn with_start_threshold(mut self, threshold: f32) -> Self { + self.set_start_threshold(threshold); self } @@ -254,8 +386,8 @@ impl SpeechOptions { /// fall back to the default derived hysteresis rule even if builder /// methods are called in a different order. #[cfg_attr(not(tarpaulin), inline(always))] - pub fn with_end_threshold(mut self, threshold: f32) -> Self { - self.end_threshold = Some(sanitize_probability(threshold)); + pub const fn with_end_threshold(mut self, threshold: f32) -> Self { + self.set_end_threshold(threshold); self } @@ -268,54 +400,116 @@ impl SpeechOptions { /// Set the minimum duration of detected speech segments, in milliseconds. #[cfg_attr(not(tarpaulin), inline(always))] - pub const fn with_min_speech_duration_ms(mut self, duration_ms: u32) -> Self { - self.min_speech_duration_ms = duration_ms; + pub const fn with_min_speech_duration(mut self, duration: Duration) -> Self { + self.set_min_silence_duration(duration); self } /// Set the minimum duration of silence required to close a detected speech segment, in milliseconds. #[cfg_attr(not(tarpaulin), inline(always))] - pub const fn with_min_silence_duration_ms(mut self, duration_ms: u32) -> Self { - self.min_silence_duration_ms = duration_ms; + pub const fn with_min_silence_duration(mut self, duration: Duration) -> Self { + self.set_min_silence_duration(duration); self } /// Set the minimum silence duration that can be used as a preferred split point when maximum speech duration is reached. #[cfg_attr(not(tarpaulin), inline(always))] - pub const fn with_min_silence_at_max_speech_ms(mut self, duration_ms: u32) -> Self { - self.min_silence_at_max_speech_ms = duration_ms; + pub const fn with_min_silence_at_max_speech(mut self, duration: Duration) -> Self { + self.set_min_silence_at_max_speech(duration); self } /// Set the maximum duration of a speech segment before the segmenter force-splits it. #[cfg_attr(not(tarpaulin), inline(always))] - pub const fn with_max_speech_duration_ms(mut self, duration_ms: u32) -> Self { - self.max_speech_duration_ms = Some(duration_ms); + pub const fn with_max_speech_duration(mut self, duration: Duration) -> Self { + self.set_max_speech_duration(duration); self } /// Clear the maximum speech duration, disabling force-splitting by segment length. #[cfg_attr(not(tarpaulin), inline(always))] pub const fn clear_max_speech_duration(mut self) -> Self { - self.max_speech_duration_ms = None; + self.max_speech_duration = None; self } /// Set the amount of padding to add to the start of detected speech segments, in milliseconds. #[cfg_attr(not(tarpaulin), inline(always))] - pub const fn with_speech_pad_ms(mut self, pad_ms: u32) -> Self { - self.speech_pad_ms = pad_ms; + pub const fn with_speech_pad(mut self, pad: Duration) -> Self { + self.set_speech_pad(pad); + self + } + + /// Set the sample rate to use for speech detection. + #[cfg_attr(not(tarpaulin), inline(always))] + pub const fn set_sample_rate(&mut self, sample_rate: SampleRate) -> &mut Self { + self.sample_rate = sample_rate; + self + } + + /// Set the start threshold, which must be between 0 and 1. If not set, it defaults to 0.5. + #[cfg_attr(not(tarpaulin), inline(always))] + pub const fn set_start_threshold(&mut self, threshold: f32) -> &mut Self { + self.start_threshold = sanitize_probability(threshold); + self + } + + /// Set the preferred end threshold. + /// + /// The stored value is sanitized into the `[0, 1]` range. When the + /// threshold is later read via [`Self::end_threshold`], it is also + /// checked against the current start threshold. Invalid combinations + /// fall back to the default derived hysteresis rule even if builder + /// methods are called in a different order. + #[cfg_attr(not(tarpaulin), inline(always))] + pub const fn set_end_threshold(&mut self, threshold: f32) -> &mut Self { + self.end_threshold = Some(sanitize_probability(threshold)); + self + } + + /// Set the minimum duration of detected speech segments, in milliseconds. + #[cfg_attr(not(tarpaulin), inline(always))] + pub const fn set_min_speech_duration(&mut self, duration: Duration) -> &mut Self { + self.min_speech_duration = duration; + self + } + + /// Set the minimum duration of silence required to close a detected speech segment, in milliseconds. + #[cfg_attr(not(tarpaulin), inline(always))] + pub const fn set_min_silence_duration(&mut self, duration: Duration) -> &mut Self { + self.min_silence_duration = duration; + self + } + + /// Set the minimum silence duration that can be used as a preferred split point when maximum speech duration is reached. + #[cfg_attr(not(tarpaulin), inline(always))] + pub const fn set_min_silence_at_max_speech(&mut self, duration: Duration) -> &mut Self { + self.min_silence_at_max_speech = duration; + self + } + + /// Set the maximum duration of a speech segment before the segmenter force-splits it. + #[cfg_attr(not(tarpaulin), inline(always))] + pub const fn set_max_speech_duration(&mut self, duration: Duration) -> &mut Self { + self.max_speech_duration = Some(duration); + self + } + + /// Set the amount of padding to add to the start of detected speech segments, in milliseconds. + #[cfg_attr(not(tarpaulin), inline(always))] + pub const fn set_speech_pad(&mut self, pad: Duration) -> &mut Self { + self.speech_pad = pad; self } } #[inline] -pub(crate) fn ms_to_samples(duration_ms: u32, sample_rate: SampleRate) -> u64 { - (u64::from(duration_ms) * u64::from(sample_rate.hz())) / 1_000 +pub(crate) const fn ms_to_samples(duration: Duration, sample_rate: SampleRate) -> u64 { + ((duration.as_millis() * (sample_rate.hz() as u128)) / 1_000) as u64 } #[inline] -fn sanitize_probability(value: f32) -> f32 { +const fn sanitize_probability(value: f32) -> f32 { if value.is_finite() { value.clamp(0.0, 1.0) } else { @@ -324,12 +518,12 @@ fn sanitize_probability(value: f32) -> f32 { } #[inline] -fn default_end_threshold(start_threshold: f32) -> f32 { +const fn default_end_threshold(start_threshold: f32) -> f32 { sanitize_probability((sanitize_probability(start_threshold) - 0.15).max(0.01)) } #[inline] -fn effective_end_threshold(start_threshold: f32, end_threshold: f32) -> f32 { +const fn effective_end_threshold(start_threshold: f32, end_threshold: f32) -> f32 { let start_threshold = sanitize_probability(start_threshold); let end_threshold = sanitize_probability(end_threshold); @@ -342,6 +536,8 @@ fn effective_end_threshold(start_threshold: f32, end_threshold: f32) -> f32 { #[cfg(test)] mod tests { + use std::time::Duration; + use ort::session::builder::GraphOptimizationLevel; use super::{SampleRate, SessionOptions, SpeechOptions, ms_to_samples}; @@ -360,17 +556,26 @@ mod tests { assert_eq!(config.sample_rate(), SampleRate::Rate16k); assert_eq!(config.start_threshold(), 0.5); assert_eq!(config.end_threshold(), 0.35); - assert_eq!(config.min_speech_duration_ms(), 250); - assert_eq!(config.min_silence_duration_ms(), 100); - assert_eq!(config.min_silence_at_max_speech_ms(), 98); - assert_eq!(config.max_speech_duration_ms(), None); - assert_eq!(config.speech_pad_ms(), 30); + assert_eq!(config.min_speech_duration(), Duration::from_millis(250)); + assert_eq!(config.min_silence_duration(), Duration::from_millis(100)); + assert_eq!( + config.min_silence_at_max_speech(), + Duration::from_millis(98) + ); + assert_eq!(config.max_speech_duration(), None); + assert_eq!(config.speech_pad(), Duration::from_millis(30)); } #[test] fn ms_to_samples_uses_stream_rate() { - assert_eq!(ms_to_samples(100, SampleRate::Rate16k), 1_600); - assert_eq!(ms_to_samples(100, SampleRate::Rate8k), 800); + assert_eq!( + ms_to_samples(Duration::from_millis(100), SampleRate::Rate16k), + 1_600 + ); + assert_eq!( + ms_to_samples(Duration::from_millis(100), SampleRate::Rate8k), + 800 + ); } #[test] @@ -405,10 +610,85 @@ mod tests { #[test] fn max_speech_duration_converts_to_samples_with_stream_lookahead_and_padding() { let options = SpeechOptions::default() - .with_speech_pad_ms(30) - .with_max_speech_duration_ms(1_000); - assert_eq!(options.max_speech_duration_ms(), Some(1_000)); + .with_speech_pad(Duration::from_millis(30)) + .with_max_speech_duration(Duration::from_millis(1_000)); + assert_eq!( + options.max_speech_duration(), + Some(Duration::from_millis(1_000)) + ); assert_eq!(options.min_silence_at_max_speech_samples(), 1_568); assert_eq!(options.max_speech_samples(), Some(14_528)); } + + #[cfg(feature = "serde")] + #[test] + fn test_serde() { + let opts = SessionOptions::default().with_optimization_level(GraphOptimizationLevel::Level2); + let serialized = serde_json::to_string(&opts).expect("serialize options"); + let deserialized: SessionOptions = + serde_json::from_str(&serialized).expect("deserialize options"); + assert_eq!(opts.optimization_level, deserialized.optimization_level); + + let default_deserialized: SessionOptions = + serde_json::from_str("{}").expect("deserialize default options"); + assert!(matches!( + default_deserialized.optimization_level, + GraphOptimizationLevel::Disable + )); + + // level1 + let level1_opts = + SessionOptions::default().with_optimization_level(GraphOptimizationLevel::Level1); + let level1_serialized = serde_json::to_string(&level1_opts).expect("serialize level1 options"); + let level1_deserialized: SessionOptions = + serde_json::from_str(&level1_serialized).expect("deserialize level1 options"); + assert!(matches!( + level1_deserialized.optimization_level, + GraphOptimizationLevel::Level1 + )); + + // level2 + let level2_opts = + SessionOptions::default().with_optimization_level(GraphOptimizationLevel::Level2); + let level2_serialized = serde_json::to_string(&level2_opts).expect("serialize level2 options"); + let level2_deserialized: SessionOptions = + serde_json::from_str(&level2_serialized).expect("deserialize level2 options"); + assert!(matches!( + level2_deserialized.optimization_level, + GraphOptimizationLevel::Level2 + )); + + // level3 + let level3_opts = + SessionOptions::default().with_optimization_level(GraphOptimizationLevel::Level3); + let level3_serialized = serde_json::to_string(&level3_opts).expect("serialize level3 options"); + let level3_deserialized: SessionOptions = + serde_json::from_str(&level3_serialized).expect("deserialize level3 options"); + assert!(matches!( + level3_deserialized.optimization_level, + GraphOptimizationLevel::Level3 + )); + + // all + let all_opts = SessionOptions::default().with_optimization_level(GraphOptimizationLevel::All); + let all_serialized = serde_json::to_string(&all_opts).expect("serialize all options"); + let all_deserialized: SessionOptions = + serde_json::from_str(&all_serialized).expect("deserialize all options"); + assert!(matches!( + all_deserialized.optimization_level, + GraphOptimizationLevel::All + )); + + // disable + let disable_opts = + SessionOptions::default().with_optimization_level(GraphOptimizationLevel::Disable); + let disable_serialized = + serde_json::to_string(&disable_opts).expect("serialize disable options"); + let disable_deserialized: SessionOptions = + serde_json::from_str(&disable_serialized).expect("deserialize disable options"); + assert!(matches!( + disable_deserialized.optimization_level, + GraphOptimizationLevel::Disable + )); + } } diff --git a/src/session.rs b/src/session.rs index 352074b..dec78dc 100644 --- a/src/session.rs +++ b/src/session.rs @@ -35,13 +35,13 @@ pub struct BatchInput<'a> { impl<'a> BatchInput<'a> { /// Returns the stream state associated with this batch input, which contains the recurrent memory and context for the stream that produced this chunk. - #[inline] + #[cfg_attr(not(tarpaulin), inline(always))] pub const fn state(&mut self) -> &mut StreamState { self.stream } /// Returns the chunk of audio samples for this batch input, which should be exactly the expected chunk size for the stream's sample rate. - #[inline] + #[cfg_attr(not(tarpaulin), inline(always))] pub const fn chunk(&self) -> &'a [f32] { self.chunk } @@ -71,6 +71,7 @@ impl Session { /// Create a session from the bundled Silero VAD model with default options. #[cfg(feature = "bundled")] #[cfg_attr(docsrs, doc(cfg(feature = "bundled")))] + #[cfg_attr(not(tarpaulin), inline(always))] pub fn bundled() -> Result { Self::bundled_with_options(SessionOptions::default()) } @@ -78,16 +79,19 @@ impl Session { /// Create a session from the bundled Silero VAD model with custom options. #[cfg(feature = "bundled")] #[cfg_attr(docsrs, doc(cfg(feature = "bundled")))] + #[cfg_attr(not(tarpaulin), inline(always))] pub fn bundled_with_options(options: SessionOptions) -> Result { Self::from_memory_with_options(BUNDLED_MODEL, options) } /// Create a session from an ONNX file at the given path with default options. + #[cfg_attr(not(tarpaulin), inline(always))] pub fn from_file(path: impl AsRef) -> Result { Self::from_file_with_options(path, SessionOptions::default()) } /// Create a session from an ONNX file at the given path with custom options. + #[cfg_attr(not(tarpaulin), inline(always))] pub fn from_file_with_options(path: impl AsRef, options: SessionOptions) -> Result { let path = path.as_ref(); let session = OrtSession::builder()? @@ -102,11 +106,13 @@ impl Session { } /// Create a session from an ONNX model loaded in memory with default options. + #[cfg_attr(not(tarpaulin), inline(always))] pub fn from_memory(model_bytes: &[u8]) -> Result { Self::from_memory_with_options(model_bytes, SessionOptions::default()) } /// Create a session from an ONNX model loaded in memory with custom options. + #[cfg_attr(not(tarpaulin), inline(always))] pub fn from_memory_with_options(model_bytes: &[u8], options: SessionOptions) -> Result { let session = OrtSession::builder()? .with_optimization_level(options.optimization_level()) @@ -116,7 +122,7 @@ impl Session { } /// Create a session directly from an existing ONNX Runtime session. - #[inline] + #[cfg_attr(not(tarpaulin), inline(always))] pub fn from_ort_session(inner: OrtSession) -> Self { Self { inner, @@ -127,6 +133,7 @@ impl Session { } /// Infer one chunk for one stream, returning the speech probability for that chunk. + #[cfg_attr(not(tarpaulin), inline(always))] pub fn infer_chunk(&mut self, stream: &mut StreamState, chunk: &[f32]) -> Result { Self::infer_chunk_with_scratch( &mut self.inner, @@ -367,7 +374,7 @@ impl Session { } } -#[inline] +#[cfg_attr(not(tarpaulin), inline(always))] fn validate_shape(tensor: &'static str, actual: &[i64], expected: &[i64]) -> Result<()> { if actual == expected { Ok(()) From 251146901e45b5d31a7d70206e7ef8f70dac816e Mon Sep 17 00:00:00 2001 From: uqio <276879906+uqio@users.noreply.github.com> Date: Tue, 21 Apr 2026 18:23:45 +1200 Subject: [PATCH 2/4] Apply suggestions from code review Co-authored-by: Copilot <175728472+Copilot@users.noreply.github.com> --- src/options.rs | 23 +++++++++++++++++------ 1 file changed, 17 insertions(+), 6 deletions(-) diff --git a/src/options.rs b/src/options.rs index d425db3..d05cf06 100644 --- a/src/options.rs +++ b/src/options.rs @@ -401,7 +401,7 @@ impl SpeechOptions { /// Set the minimum duration of detected speech segments, in milliseconds. #[cfg_attr(not(tarpaulin), inline(always))] pub const fn with_min_speech_duration(mut self, duration: Duration) -> Self { - self.set_min_silence_duration(duration); + self.set_min_speech_duration(duration); self } @@ -467,28 +467,33 @@ impl SpeechOptions { self } - /// Set the minimum duration of detected speech segments, in milliseconds. + /// Set the minimum duration of detected speech segments as a `Duration`. + /// Sub-second precision is supported according to the precision of `Duration`. #[cfg_attr(not(tarpaulin), inline(always))] pub const fn set_min_speech_duration(&mut self, duration: Duration) -> &mut Self { self.min_speech_duration = duration; self } - /// Set the minimum duration of silence required to close a detected speech segment, in milliseconds. + /// Set the minimum duration of silence required to close a detected speech segment as a `Duration`. + /// Sub-second precision is supported according to the precision of `Duration`. #[cfg_attr(not(tarpaulin), inline(always))] pub const fn set_min_silence_duration(&mut self, duration: Duration) -> &mut Self { self.min_silence_duration = duration; self } - /// Set the minimum silence duration that can be used as a preferred split point when maximum speech duration is reached. + /// Set the minimum silence duration, as a `Duration`, that can be used as a preferred split point + /// when maximum speech duration is reached. Sub-second precision is supported according to the + /// precision of `Duration`. #[cfg_attr(not(tarpaulin), inline(always))] pub const fn set_min_silence_at_max_speech(&mut self, duration: Duration) -> &mut Self { self.min_silence_at_max_speech = duration; self } - /// Set the maximum duration of a speech segment before the segmenter force-splits it. + /// Set the maximum duration of a speech segment, as a `Duration`, before the segmenter + /// force-splits it. Sub-second precision is supported according to the precision of `Duration`. #[cfg_attr(not(tarpaulin), inline(always))] pub const fn set_max_speech_duration(&mut self, duration: Duration) -> &mut Self { self.max_speech_duration = Some(duration); @@ -505,7 +510,13 @@ impl SpeechOptions { #[inline] pub(crate) const fn ms_to_samples(duration: Duration, sample_rate: SampleRate) -> u64 { - ((duration.as_millis() * (sample_rate.hz() as u128)) / 1_000) as u64 + let samples = (duration.as_millis() * (sample_rate.hz() as u128)) / 1_000; + + if samples > u64::MAX as u128 { + u64::MAX + } else { + samples as u64 + } } #[inline] From e11947b1cb14d4aa39944363ef54df9cfed348e5 Mon Sep 17 00:00:00 2001 From: uqio <276879906+uqio@users.noreply.github.com> Date: Tue, 21 Apr 2026 19:01:54 +1200 Subject: [PATCH 3/4] fix test --- src/options.rs | 5 +---- 1 file changed, 1 insertion(+), 4 deletions(-) diff --git a/src/options.rs b/src/options.rs index d05cf06..157724b 100644 --- a/src/options.rs +++ b/src/options.rs @@ -592,10 +592,7 @@ mod tests { #[test] fn session_options_default_to_unopinionated_core_settings() { let options = SessionOptions::default(); - assert_eq!( - options.optimization_level(), - GraphOptimizationLevel::Disable - ); + assert_eq!(options.optimization_level(), GraphOptimizationLevel::Level3,); } #[test] From e6b6ca8f46de7cced2b74ae8cab0614580106220 Mon Sep 17 00:00:00 2001 From: uqio <276879906+uqio@users.noreply.github.com> Date: Tue, 21 Apr 2026 19:13:11 +1200 Subject: [PATCH 4/4] fix test --- README.md | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/README.md b/README.md index a65b6ec..4edc25d 100644 --- a/README.md +++ b/README.md @@ -74,7 +74,7 @@ fn main() -> Result<(), silero::Error> { let mut session = Session::from_memory(model)?; let config = SpeechOptions::default(); let mut stream = StreamState::new(config.sample_rate()); - let mut segmenter = SpeechSegmenter::new(config); + let mut segmenter = SpeechSegmenter::new(config.clone()); let audio_chunk = vec![0.0_f32; config.sample_rate().chunk_samples()]; segmenter.process_samples(&mut session, &mut stream, &audio_chunk, |segment| {