Skip to content
Merged
Show file tree
Hide file tree
Changes from all commits
Commits
File filter

Filter by extension

Filter by extension


Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
5 changes: 3 additions & 2 deletions .github/workflows/ci.yml
Original file line number Diff line number Diff line change
Expand Up @@ -47,8 +47,9 @@ jobs:
- "webrtc"
- "silero"
- "ten-vad"
- "firered"
- "serde"
- "webrtc,silero,ten-vad,serde"
- "webrtc,silero,ten-vad,firered,serde"
steps:
- uses: actions/checkout@v6
- uses: dtolnay/rust-toolchain@stable
Expand All @@ -72,7 +73,7 @@ jobs:
- name: Run accuracy regression check
run: |
cargo test --release -p wavekat-vad --no-default-features \
--features webrtc,silero,ten-vad \
--features webrtc,silero,ten-vad,firered \
-- --ignored accuracy_report --nocapture 2>&1 | tee accuracy-output.txt
- name: Job summary
if: always()
Expand Down
2 changes: 1 addition & 1 deletion .github/workflows/release-plz.yml
Original file line number Diff line number Diff line change
Expand Up @@ -94,7 +94,7 @@ jobs:
- name: Run accuracy report
run: |
cargo test --release -p wavekat-vad --no-default-features \
--features webrtc,silero,ten-vad \
--features webrtc,silero,ten-vad,firered \
-- --ignored accuracy_report --nocapture 2>&1 | tee accuracy-output.txt

- name: Update README benchmark table
Expand Down
2 changes: 1 addition & 1 deletion .github/workflows/vad-accuracy.yml
Original file line number Diff line number Diff line change
Expand Up @@ -29,7 +29,7 @@ jobs:
- name: Run accuracy report
run: |
cargo test --release -p wavekat-vad --no-default-features \
--features webrtc,silero,ten-vad \
--features webrtc,silero,ten-vad,firered \
-- --ignored accuracy_report --nocapture 2>&1 | tee accuracy-output.txt

- name: Update README benchmark table
Expand Down
6 changes: 6 additions & 0 deletions .gitignore
Original file line number Diff line number Diff line change
Expand Up @@ -30,9 +30,15 @@ tools/vad-lab/frontend/dist/
*.wav
!testdata/**/*.wav

# Python virtual environments
.venv/

# Environment
.env

# macOS
.DS_Store

# ONNX models (downloaded at build time)
crates/wavekat-vad/models/

9 changes: 5 additions & 4 deletions Makefile
Original file line number Diff line number Diff line change
Expand Up @@ -71,19 +71,20 @@ ci:
cargo test -p wavekat-vad --no-default-features --features "webrtc"
cargo test -p wavekat-vad --no-default-features --features "silero"
cargo test -p wavekat-vad --no-default-features --features "ten-vad"
cargo test -p wavekat-vad --no-default-features --features "firered"
cargo test -p wavekat-vad --no-default-features --features "serde"
cargo test -p wavekat-vad --no-default-features --features "webrtc,silero,ten-vad,serde"
cargo test -p wavekat-vad --no-default-features --features "webrtc,silero,ten-vad,firered,serde"

# Run criterion benchmarks for per-frame inference timing
bench:
cargo bench -p wavekat-vad --no-default-features --features "webrtc,silero,ten-vad"
cargo bench -p wavekat-vad --no-default-features --features "webrtc,silero,ten-vad,firered"

# Run accuracy test against the TEN-VAD testset (30 labeled audio files)
accuracy:
cargo test --release -p wavekat-vad --no-default-features --features "webrtc,silero,ten-vad" \
cargo test --release -p wavekat-vad --no-default-features --features "webrtc,silero,ten-vad,firered" \
-- --ignored accuracy_report --nocapture

# Update accuracy-baseline.json with current best scores (only raises, never lowers)
accuracy-update-baseline:
cargo test --release -p wavekat-vad --no-default-features --features "webrtc,silero,ten-vad" \
cargo test --release -p wavekat-vad --no-default-features --features "webrtc,silero,ten-vad,firered" \
-- --ignored accuracy_update_baseline --nocapture
32 changes: 28 additions & 4 deletions README.md
Original file line number Diff line number Diff line change
Expand Up @@ -26,13 +26,15 @@ let probability = vad.process(&samples, 16000).unwrap();
| WebRTC | `webrtc` (default) | 8/16/32/48 kHz | 10, 20, or 30ms | Binary (0.0 or 1.0) |
| Silero | `silero` | 8/16 kHz | 32ms (256 or 512 samples) | Continuous (0.0–1.0) |
| TEN-VAD | `ten-vad` | 16 kHz only | 16ms (256 samples) | Continuous (0.0–1.0) |
| FireRedVAD | `firered` | 16 kHz only | 10ms (160 samples) | Continuous (0.0–1.0) |

```toml
[dependencies]
wavekat-vad = "0.1" # WebRTC only (default)
wavekat-vad = { version = "0.1", features = ["silero"] }
wavekat-vad = { version = "0.1", features = ["ten-vad"] }
wavekat-vad = { version = "0.1", features = ["webrtc", "silero", "ten-vad"] } # all backends
wavekat-vad = { version = "0.1", features = ["firered"] }
wavekat-vad = { version = "0.1", features = ["webrtc", "silero", "ten-vad", "firered"] } # all backends
```

### Benchmarks
Expand Down Expand Up @@ -98,6 +100,19 @@ let samples = vec![0i16; 256]; // 16ms at 16kHz
let probability = vad.process(&samples, 16000).unwrap(); // 0.0–1.0
```

### FireRedVAD

Xiaohongshu's FireRedVAD uses a DFSMN architecture with pure-Rust FBank preprocessing. It returns a continuous probability and supports 16 kHz audio only. It achieves the best overall F1 and AUC-ROC across the benchmarks.

```rust
use wavekat_vad::VoiceActivityDetector;
use wavekat_vad::backends::firered::FireRedVad;

let mut vad = FireRedVad::new().unwrap();
let samples = vec![0i16; 160]; // 10ms at 16kHz
let probability = vad.process(&samples, 16000).unwrap(); // 0.0–1.0
```

## The `VoiceActivityDetector` Trait

All backends implement a common trait, so you can write code that is generic over backends:
Expand Down Expand Up @@ -175,16 +190,18 @@ let cleaned = preprocessor.process(&raw_audio);
| `webrtc` | Yes | WebRTC VAD backend |
| `silero` | No | Silero VAD backend (ONNX model downloaded at build time) |
| `ten-vad` | No | TEN-VAD backend (ONNX model downloaded at build time) |
| `firered` | No | FireRedVAD backend (ONNX model downloaded at build time) |
| `denoise` | No | RNNoise-based noise suppression in the preprocessing pipeline |
| `serde` | No | `Serialize`/`Deserialize` for config types |

### ONNX Model Downloads

Silero and TEN-VAD models are downloaded automatically at build time. For offline or CI builds, point to a local model file:
Silero, TEN-VAD, and FireRedVAD models are downloaded automatically at build time. For offline or CI builds, point to a local model file:

```sh
SILERO_MODEL_PATH=/path/to/silero_vad.onnx cargo build --features silero
TEN_VAD_MODEL_PATH=/path/to/ten-vad.onnx cargo build --features ten-vad
FIRERED_MODEL_PATH=/path/to/fireredvad.onnx FIRERED_CMVN_PATH=/path/to/cmvn.ark cargo build --features firered
```

## Error Handling
Expand Down Expand Up @@ -223,6 +240,13 @@ Apache-2.0

The TEN-VAD ONNX model (used by the `ten-vad` feature) is licensed under Apache-2.0 with a non-compete clause by the TEN-framework / Agora. It restricts deployment that competes with Agora's offerings and limits deployment to "solely for your benefit and the benefit of your direct End Users." This is **not standard open-source** despite the Apache-2.0 label. Review the [TEN-VAD license](https://github.com/TEN-framework/ten-vad) before using in production.

### Third-party notices
### Acknowledgements

This project wraps and builds on several upstream projects:

This project uses [nnnoiseless](https://github.com/jneem/nnnoiseless) (BSD-3-Clause) for noise suppression via the `denoise` feature.
- [webrtc-vad](https://github.com/kaegi/webrtc-vad) — Rust bindings for Google's WebRTC VAD
- [Silero VAD](https://github.com/snakers4/silero-vad) — neural network VAD by the Silero team
- [TEN-VAD](https://github.com/TEN-framework/ten-vad) — lightweight VAD by TEN-framework / Agora
- [FireRedVAD](https://github.com/FireRedTeam/FireRedVAD) — DFSMN-based VAD by the FireRedTeam
- [ort](https://github.com/pykeio/ort) — ONNX Runtime bindings for Rust
- [nnnoiseless](https://github.com/jneem/nnnoiseless) — Rust port of RNNoise for noise suppression
5 changes: 5 additions & 0 deletions crates/wavekat-vad/Cargo.toml
Original file line number Diff line number Diff line change
Expand Up @@ -17,6 +17,7 @@ webrtc = ["dep:webrtc-vad"]
silero = ["dep:ort", "dep:ndarray", "dep:ureq"]
denoise = ["dep:nnnoiseless"]
ten-vad = ["dep:ort", "dep:ndarray", "dep:realfft", "dep:ureq"]
firered = ["dep:ort", "dep:ndarray", "dep:realfft", "dep:ureq"]
serde = ["dep:serde"]

[dependencies]
Expand Down Expand Up @@ -45,6 +46,10 @@ name = "detect_speech"
name = "ten_vad_file"
required-features = ["ten-vad"]

[[example]]
name = "firered_file"
required-features = ["firered"]

[[bench]]
name = "vad_comparison"
harness = false
Expand Down
17 changes: 17 additions & 0 deletions crates/wavekat-vad/benches/vad_comparison.rs
Original file line number Diff line number Diff line change
Expand Up @@ -45,6 +45,23 @@ fn vad_benchmarks(c: &mut Criterion) {
});
}

#[cfg(feature = "firered")]
{
use wavekat_vad::backends::firered::FireRedVad;
use wavekat_vad::VoiceActivityDetector;

let mut vad = FireRedVad::new().unwrap();
// FireRedVad needs 3 frames to produce the first result (buffering 400 samples)
let warmup = vec![0i16; 160];
let _ = vad.process(&warmup, 16000).unwrap();
let _ = vad.process(&warmup, 16000).unwrap();
let samples = vec![0i16; vad.capabilities().frame_size];

group.bench_function("fireredvad", |b| {
b.iter(|| vad.process(criterion::black_box(&samples), 16000).unwrap())
});
}

group.finish();
}

Expand Down
97 changes: 97 additions & 0 deletions crates/wavekat-vad/build.rs
Original file line number Diff line number Diff line change
Expand Up @@ -35,6 +35,18 @@ fn main() {
fs::write(&model_path, b"").expect("failed to write placeholder model");
}
}
#[cfg(feature = "firered")]
{
let out_dir = env::var("OUT_DIR").expect("OUT_DIR not set");
let model_path = Path::new(&out_dir).join("fireredvad_stream_vad_with_cache.onnx");
if !model_path.exists() {
fs::write(&model_path, b"").expect("failed to write placeholder model");
}
let cmvn_path = Path::new(&out_dir).join("firered_cmvn.ark");
if !cmvn_path.exists() {
fs::write(&cmvn_path, b"").expect("failed to write placeholder cmvn");
}
}
return;
}

Expand All @@ -43,6 +55,9 @@ fn main() {

#[cfg(feature = "ten-vad")]
setup_ten_vad_model();

#[cfg(feature = "firered")]
setup_firered_model();
}

#[cfg(feature = "silero")]
Expand Down Expand Up @@ -153,3 +168,85 @@ fn setup_ten_vad_model() {
model_dest.display()
);
}

/// One FireRedVAD build artifact that must end up inside `OUT_DIR`.
#[cfg(feature = "firered")]
struct FireRedAsset {
    /// Name used in build diagnostics (`"model"` or `"CMVN"`).
    label: &'static str,
    /// Environment variable that may point at a local copy of the file.
    env_var: &'static str,
    /// File name the artifact is stored under inside `OUT_DIR`.
    file_name: &'static str,
    /// Upstream location used when no local copy is supplied.
    url: &'static str,
}

/// Moves a fully written staging file onto its final destination.
///
/// Going through a rename means the destination only ever appears once its
/// content is complete, so an interrupted build cannot leave a truncated file
/// that a later build would mistake for a cached download.
#[cfg(feature = "firered")]
fn publish_firered_file(staging: &Path, dest: &Path) {
    fs::rename(staging, dest).unwrap_or_else(|e| {
        panic!(
            "failed to move {} to {}: {e}",
            staging.display(),
            dest.display()
        )
    });
}

/// Places a single FireRedVAD artifact into `out_dir`.
///
/// Resolution order:
/// 1. If `asset.env_var` is set, the file it names is copied in on every run
///    (and registered with `rerun-if-changed`).
/// 2. Otherwise a non-empty file already present in `out_dir` is reused.
/// 3. Otherwise the artifact is downloaded from `asset.url`.
///
/// # Panics
///
/// Panics (failing the build) if the environment variable names a missing
/// file, if the download fails or returns an empty body, or if any file
/// operation fails.
#[cfg(feature = "firered")]
fn install_firered_asset(out_dir: &Path, asset: &FireRedAsset) {
    let label = asset.label;
    let url = asset.url;
    let dest = out_dir.join(asset.file_name);
    // Staged in the same directory so the final rename stays on one filesystem.
    let staging = out_dir.join(format!("{}.partial", asset.file_name));

    if let Ok(local_path) = env::var(asset.env_var) {
        let local_path = Path::new(&local_path);
        if !local_path.exists() {
            panic!(
                "{} points to non-existent file: {}",
                asset.env_var,
                local_path.display()
            );
        }
        println!(
            "cargo:warning=Using local FireRedVAD {label}: {}",
            local_path.display()
        );
        fs::copy(local_path, &staging).unwrap_or_else(|e| {
            panic!(
                "failed to copy local FireRedVAD {label} file {}: {e}",
                local_path.display()
            )
        });
        publish_firered_file(&staging, &dest);
        println!("cargo:rerun-if-changed={}", local_path.display());
        return;
    }

    // A zero-length file is never a valid artifact: it is either the docs.rs
    // placeholder or the remains of an interrupted write, so fetch it again.
    let cached = fs::metadata(&dest).map(|m| m.len() > 0).unwrap_or(false);
    if cached {
        return;
    }

    println!("cargo:warning=Downloading FireRedVAD {label} from {url}");

    let response = ureq::get(url)
        .call()
        .unwrap_or_else(|e| panic!("failed to download FireRedVAD {label} from {url}: {e}"));

    let bytes = response
        .into_body()
        .read_to_vec()
        .unwrap_or_else(|e| panic!("failed to read FireRedVAD {label} bytes from {url}: {e}"));

    if bytes.is_empty() {
        panic!("downloaded FireRedVAD {label} from {url} is empty");
    }

    fs::write(&staging, &bytes).unwrap_or_else(|e| {
        panic!(
            "failed to write FireRedVAD {label} file {}: {e}",
            staging.display()
        )
    });
    publish_firered_file(&staging, &dest);

    println!(
        "cargo:warning=FireRedVAD {label} downloaded to {}",
        dest.display()
    );
}

/// Ensures the FireRedVAD ONNX model and its CMVN statistics file exist in
/// `OUT_DIR` so the crate can embed them.
///
/// `FIRERED_MODEL_PATH` and `FIRERED_CMVN_PATH` may point at local copies for
/// offline or CI builds; without them the files are downloaded from the
/// upstream FireRedVAD repository once and then reused from `OUT_DIR`.
///
/// # Panics
///
/// Panics if `OUT_DIR` is not set or if either artifact cannot be installed
/// (see [`install_firered_asset`]).
#[cfg(feature = "firered")]
fn setup_firered_model() {
    println!("cargo:rerun-if-env-changed=FIRERED_MODEL_PATH");
    println!("cargo:rerun-if-env-changed=FIRERED_CMVN_PATH");

    let out_dir = env::var("OUT_DIR").expect("OUT_DIR not set");
    let out_dir = Path::new(&out_dir);

    // Model first, then CMVN.
    let assets = [
        FireRedAsset {
            label: "model",
            env_var: "FIRERED_MODEL_PATH",
            file_name: "fireredvad_stream_vad_with_cache.onnx",
            url: "https://github.com/FireRedTeam/FireRedVAD/raw/main/pretrained_models/onnx_models/fireredvad_stream_vad_with_cache.onnx",
        },
        FireRedAsset {
            label: "CMVN",
            env_var: "FIRERED_CMVN_PATH",
            file_name: "firered_cmvn.ark",
            url: "https://github.com/FireRedTeam/FireRedVAD/raw/main/pretrained_models/onnx_models/cmvn.ark",
        },
    ];

    for asset in &assets {
        install_firered_asset(out_dir, asset);
    }
}
12 changes: 11 additions & 1 deletion crates/wavekat-vad/examples/detect_speech.rs
Original file line number Diff line number Diff line change
Expand Up @@ -9,6 +9,9 @@
//!
//! # TEN-VAD:
//! cargo run --example detect_speech --features ten-vad -- --backend ten-vad path/to/audio.wav
//!
//! # FireRedVAD:
//! cargo run --example detect_speech --features firered -- --backend firered path/to/audio.wav
//! ```

use std::env;
Expand All @@ -33,6 +36,11 @@ fn create_vad(backend: &str) -> Box<dyn VoiceActivityDetector> {
use wavekat_vad::backends::ten_vad::TenVad;
Box::new(TenVad::new().expect("failed to create TEN-VAD"))
}
#[cfg(feature = "firered")]
"firered" => {
use wavekat_vad::backends::firered::FireRedVad;
Box::new(FireRedVad::new().expect("failed to create FireRedVAD"))
}
other => {
eprintln!("Unknown or disabled backend: {other}");
eprintln!("Available backends:");
Expand All @@ -42,6 +50,8 @@ fn create_vad(backend: &str) -> Box<dyn VoiceActivityDetector> {
eprintln!(" silero");
#[cfg(feature = "ten-vad")]
eprintln!(" ten-vad");
#[cfg(feature = "firered")]
eprintln!(" firered");
std::process::exit(1);
}
}
Expand Down Expand Up @@ -75,7 +85,7 @@ fn main() {
}

let wav_path = wav_path.unwrap_or_else(|| {
eprintln!("Usage: detect_speech [--backend webrtc|silero|ten-vad] <wav-file>");
eprintln!("Usage: detect_speech [--backend webrtc|silero|ten-vad|firered] <wav-file>");
std::process::exit(1);
});

Expand Down
Loading