diff --git a/.github/workflows/turn-accuracy.yml b/.github/workflows/turn-accuracy.yml new file mode 100644 index 0000000..2860d32 --- /dev/null +++ b/.github/workflows/turn-accuracy.yml @@ -0,0 +1,86 @@ +name: Turn Accuracy + +on: + workflow_dispatch: + +permissions: + contents: write + pull-requests: write + +jobs: + accuracy: + runs-on: ubuntu-latest + steps: + - uses: actions/checkout@v4 + + - uses: dtolnay/rust-toolchain@stable + + - uses: Swatinem/rust-cache@v2 + + - name: Run accuracy report + run: | + cargo test --features pipecat --test accuracy \ + -- --ignored accuracy_report --nocapture 2>&1 | tee accuracy-output.txt + + - name: Update README benchmark table + run: | + python3 << 'PYEOF' + import re, sys + + with open('accuracy-output.txt') as f: + output = f.read() + + version = None + for line in output.split('\n'): + if line.startswith('BENCHMARK_VERSION='): + version = line.split('=', 1)[1].strip() + break + + if not version: + print("ERROR: No version found in test output") + sys.exit(1) + + table_lines = [l for l in output.split('\n') if l.startswith('|')] + if len(table_lines) < 3: + print("ERROR: No benchmark table found in test output") + sys.exit(1) + + table = '\n'.join(table_lines) + block = f'*v{version}*\n\n{table}' + + with open('README.md') as f: + readme = f.read() + + pattern = r'(<!-- accuracy:start -->).*?(<!-- accuracy:end -->)' + replacement = rf'\1\n{block}\n\2' + updated = re.sub(pattern, replacement, readme, flags=re.DOTALL) + + if updated == readme: + print("No changes to README") + else: + with open('README.md', 'w') as f: + f.write(updated) + print(f"README updated with v{version} benchmarks:") + print(table) + PYEOF + + - name: Job summary + if: always() + run: | + echo "## Turn Accuracy Report" >> "$GITHUB_STEP_SUMMARY" + if [ -f accuracy-output.txt ]; then + TABLE=$(grep '^|' accuracy-output.txt || true) + if [ -n "$TABLE" ]; then + echo "$TABLE" >> "$GITHUB_STEP_SUMMARY" + fi + fi + + - name: Create PR with updated benchmarks + uses: 
peter-evans/create-pull-request@v6 + with: + commit-message: "docs: update accuracy table" + title: "docs: update accuracy table" + body: | + Auto-generated accuracy update from the turn detection pipeline cross-validation. + branch: docs/update-accuracy + delete-branch: true diff --git a/.gitignore b/.gitignore index 99b3b4a..fb6236d 100644 --- a/.gitignore +++ b/.gitignore @@ -4,3 +4,13 @@ Cargo.lock *.swo .DS_Store .cargo/config.toml + +# Python tooling (scripts/) +scripts/.venv/ +scripts/__pycache__/ +scripts/*.onnx +__pycache__/ +*.pyc + +# Generated mel reference tensors (regenerate with scripts/gen_reference.py) +*.mel.npy diff --git a/Makefile b/Makefile index 1a66480..7065549 100644 --- a/Makefile +++ b/Makefile @@ -1,13 +1,15 @@ -.PHONY: help check test fmt lint doc ci +.PHONY: help check test fmt lint doc ci accuracy mel help: @echo "Available targets:" - @echo " check Check workspace compiles" - @echo " test Run all tests" - @echo " fmt Format code" - @echo " lint Run clippy with warnings as errors" - @echo " doc Build and open docs in browser" - @echo " ci Run all CI checks locally (fmt, clippy, test, doc, features)" + @echo " check Check workspace compiles" + @echo " test Run all tests" + @echo " accuracy Cross-validate Rust pipeline against Python reference" + @echo " mel Compare Rust vs Python mel spectrograms element-wise" + @echo " fmt Format code" + @echo " lint Run clippy with warnings as errors" + @echo " doc Build and open docs in browser" + @echo " ci Run all CI checks locally (fmt, clippy, test, doc, features)" # Check workspace compiles check: @@ -17,6 +19,14 @@ check: test: cargo test --workspace +# Cross-validate Rust mel+ONNX pipeline against Python reference probabilities +accuracy: + cargo test --features pipecat --test accuracy -- --ignored accuracy_report --nocapture + +# Compare Rust vs Python mel spectrograms element-wise (requires .npy fixtures) +mel: + cargo test --features pipecat -- mel_report --ignored --nocapture + # Format 
code fmt: cargo fmt --all diff --git a/README.md b/README.md index 8685e99..97d850d 100644 --- a/README.md +++ b/README.md @@ -87,6 +87,16 @@ wavekat-voice --> orchestrates VAD + turn + ASR + LLM + TTS - Text-based detectors depend on ASR transcript quality. Pair with a streaming ASR provider for best results. +## Accuracy + +Cross-validated against the original Python (Pipecat) pipeline on three fixture clips. +Tolerance: ±0.02 probability. + +<!-- accuracy:start --> +<!-- accuracy:end --> + +Run locally with `make accuracy`. See [`scripts/README.md`](scripts/README.md) for how to regenerate the Python reference. + ## License Licensed under [Apache 2.0](LICENSE). diff --git a/crates/wavekat-turn/Cargo.toml b/crates/wavekat-turn/Cargo.toml index 8f2bc82..9284d18 100644 --- a/crates/wavekat-turn/Cargo.toml +++ b/crates/wavekat-turn/Cargo.toml @@ -32,6 +32,9 @@ ureq = { version = "3", optional = true } [dev-dependencies] hound = "3.5" +ndarray-npy = "0.10" +serde = { version = "1", features = ["derive"] } +serde_json = "1" [package.metadata.docs.rs] all-features = true diff --git a/crates/wavekat-turn/src/audio/pipecat.rs b/crates/wavekat-turn/src/audio/pipecat.rs index 4554b1d..7b87c1a 100644 --- a/crates/wavekat-turn/src/audio/pipecat.rs +++ b/crates/wavekat-turn/src/audio/pipecat.rs @@ -143,12 +143,22 @@ impl MelExtractor { fn extract(&mut self, audio: &[f32], shift_frames: usize) -> Array2<f32> { debug_assert_eq!(audio.len(), RING_CAPACITY); - // ---- Center-pad: N_FFT/2 zeros on each side → 128 400 samples ---- - // This replicates librosa/PyTorch `center=True` STFT behaviour, which - // gives exactly N_FRAMES + 1 = 801 frames; we discard the last one. - let pad = N_FFT / 2; - let mut padded = vec![0.0f32; pad + audio.len() + pad]; - padded[pad..pad + audio.len()].copy_from_slice(audio); + // ---- Center-pad: N_FFT/2 reflect samples on each side → 128 400 samples ---- + // Matches WhisperFeatureExtractor: np.pad(waveform, n_fft//2, mode="reflect"). 
+ // Reflect (not zero) padding ensures the boundary frames match Python exactly. + // Gives exactly N_FRAMES + 1 = 801 frames; we discard the last one. + let pad = N_FFT / 2; // 200 + let n = audio.len(); // 128 000 + let mut padded = vec![0.0f32; pad + n + pad]; + padded[pad..pad + n].copy_from_slice(audio); + // Left reflect: padded[0..pad] = audio[pad..1] reversed (exclude edge) + for i in 0..pad { + padded[i] = audio[pad - i]; + } + // Right reflect: padded[pad+n..pad+n+pad] = audio[n-2..n-2-pad] reversed + for i in 0..pad { + padded[pad + n + i] = audio[n - 2 - i]; + } // n_total = (128 400 − 400) / 160 + 1 = 801 let n_total_frames = (padded.len() - N_FFT) / HOP_LENGTH + 1; @@ -527,3 +537,143 @@ impl AudioTurnDetector for PipecatSmartTurn { self.mel.invalidate_cache(); } } + +// --------------------------------------------------------------------------- +// Mel comparison tests (unit tests — need access to private MelExtractor) +// --------------------------------------------------------------------------- + +#[cfg(test)] +mod mel_tests { + use std::path::{Path, PathBuf}; + + use ndarray::Array2; + use ndarray_npy::ReadNpyExt; + + use super::{prepare_audio, MelExtractor, RING_CAPACITY, SAMPLE_RATE}; + + /// Max allowed element-wise absolute difference between Rust and Python mel. + const MEL_TOLERANCE: f32 = 0.05; + + fn fixtures_dir() -> PathBuf { + Path::new(env!("CARGO_MANIFEST_DIR")) + .parent() + .unwrap() // crates/ + .parent() + .unwrap() // repo root + .join("tests/fixtures") + } + + /// Load 16 kHz mono WAV as f32 in [-1, 1], normalised the same way as + /// Python's soundfile (divide by 32768, not i16::MAX). 
fn load_wav_f32(path: &Path) -> Vec<f32> { + let mut reader = hound::WavReader::open(path) + .unwrap_or_else(|e| panic!("failed to open {}: {}", path.display(), e)); + let spec = reader.spec(); + assert_eq!(spec.sample_rate, SAMPLE_RATE, "expected 16 kHz"); + assert_eq!(spec.channels, 1, "expected mono"); + match spec.sample_format { + hound::SampleFormat::Int => reader + .samples::<i16>() + .map(|s| s.unwrap() as f32 / 32768.0) + .collect(), + hound::SampleFormat::Float => reader.samples::<f32>().map(|s| s.unwrap()).collect(), + } + } + + fn load_python_mel(clip: &str) -> Array2<f32> { + let path = fixtures_dir().join(format!("{clip}.mel.npy")); + let file = std::fs::File::open(&path).unwrap_or_else(|_| { + panic!( + "missing {}: run `python scripts/gen_reference.py` first", + path.display() + ) + }); + Array2::<f32>::read_npy(file).expect("failed to parse .npy") + } + + struct MelDiff { + max_diff: f32, + mean_diff: f32, + /// (mel_bin, frame) of the single largest diff + max_at: (usize, usize), + /// fraction of elements with diff > 0.01 + outlier_frac: f32, + } + + fn compare_mel(clip: &str) -> MelDiff { + let samples = load_wav_f32(&fixtures_dir().join(clip)); + let audio = prepare_audio(&samples); + assert_eq!(audio.len(), RING_CAPACITY); + + let mut extractor = MelExtractor::new(); + let rust_mel = extractor.extract(&audio, 0); + let python_mel = load_python_mel(clip); + + assert_eq!( + rust_mel.shape(), + python_mel.shape(), + "{clip}: mel shape mismatch" + ); + + let shape = rust_mel.shape(); + let (n_mels, n_frames) = (shape[0], shape[1]); + + let mut max_diff = 0.0f32; + let mut max_at = (0, 0); + let mut sum_diff = 0.0f32; + let mut outliers = 0usize; + + for m in 0..n_mels { + for t in 0..n_frames { + let d = (rust_mel[[m, t]] - python_mel[[m, t]]).abs(); + sum_diff += d; + if d > max_diff { + max_diff = d; + max_at = (m, t); + } + if d > 0.01 { + outliers += 1; + } + } + } + + let total = (n_mels * n_frames) as f32; + MelDiff { + max_diff, + mean_diff: sum_diff / total, + 
max_at, + outlier_frac: outliers as f32 / total, + } + } + + /// Print a markdown table of mel-level diffs between Rust and Python. + /// Run with: `make mel` + #[test] + #[ignore] + fn mel_report() { + let clips = ["silence_2s.wav", "speech_finished.wav", "speech_mid.wav"]; + + println!(); + println!("MEL_TOLERANCE={MEL_TOLERANCE}"); + println!(); + println!("| Clip | Max Diff | Mean Diff | Max at (mel,frame) | Outliers >0.01 | Status |"); + println!("|------|----------|-----------|---------------------|----------------|--------|"); + for clip in clips { + let d = compare_mel(clip); + let status = if d.max_diff <= MEL_TOLERANCE { + "PASS" + } else { + "FAIL" + }; + println!( + "| `{clip}` | {:.6} | {:.6} | ({},{}) | {:.2}% | {status} |", + d.max_diff, + d.mean_diff, + d.max_at.0, + d.max_at.1, + d.outlier_frac * 100.0, + ); + } + println!(); + } +} diff --git a/crates/wavekat-turn/tests/accuracy.rs b/crates/wavekat-turn/tests/accuracy.rs new file mode 100644 index 0000000..dbb5efd --- /dev/null +++ b/crates/wavekat-turn/tests/accuracy.rs @@ -0,0 +1,219 @@ +//! Cross-validation accuracy test: Rust pipeline vs. Python reference. +//! +//! Verifies that our mel preprocessing and ONNX inference produce probabilities +//! within ±0.02 of the Python (Pipecat) reference for each fixture audio clip. +//! +//! Prerequisites: +//! 1. Run `python scripts/gen_reference.py` once to produce +//! `tests/fixtures/reference.json` and `tests/fixtures/silence_2s.wav`. +//! 2. Commit those files alongside the WAV clips. +//! +//! Run individual regression tests: `cargo test --features pipecat --test accuracy` +//! 
Run the full report table: `make accuracy` + +use std::path::PathBuf; + +const TOLERANCE: f32 = 0.02; + +// --------------------------------------------------------------------------- +// Shared helpers +// --------------------------------------------------------------------------- + +#[cfg(any(feature = "pipecat"))] +fn fixtures_dir() -> PathBuf { + std::path::Path::new(env!("CARGO_MANIFEST_DIR")) + .parent() + .unwrap() // crates/ + .parent() + .unwrap() // repo root + .join("tests/fixtures") +} + +// RefEntry and load_reference are used by backend mods and accuracy_report. +// Gate under any audio feature to avoid dead-code warnings in no-feature builds. +#[cfg(any(feature = "pipecat"))] +#[derive(serde::Deserialize)] +struct RefEntry { + file: String, + probability: f32, +} + +#[cfg(any(feature = "pipecat"))] +fn load_reference() -> Vec<RefEntry> { + let path = fixtures_dir().join("reference.json"); + let json = std::fs::read_to_string(&path).unwrap_or_else(|_| { + panic!( + "missing {}: run `python scripts/gen_reference.py` first", + path.display() + ) + }); + serde_json::from_str(&json).expect("invalid reference.json") +} + +// --------------------------------------------------------------------------- +// Report row — one entry per (backend, clip) +// --------------------------------------------------------------------------- + +struct Row { + backend: &'static str, + clip: String, + python_prob: f32, + rust_prob: f32, +} + +impl Row { + fn diff(&self) -> f32 { + (self.rust_prob - self.python_prob).abs() + } + + fn status(&self) -> &'static str { + if self.diff() <= TOLERANCE { + "PASS" + } else { + "FAIL" + } + } +} + +// --------------------------------------------------------------------------- +// Pipecat backend +// --------------------------------------------------------------------------- + +#[cfg(feature = "pipecat")] +mod pipecat { + use std::path::Path; + + use wavekat_turn::audio::PipecatSmartTurn; + use wavekat_turn::{AudioFrame, AudioTurnDetector, 
TurnPrediction, TurnState}; + + use super::{fixtures_dir, RefEntry, Row, TOLERANCE}; + + fn load_wav_f32(path: &Path) -> Vec<f32> { + let mut reader = hound::WavReader::open(path) + .unwrap_or_else(|e| panic!("failed to open {}: {}", path.display(), e)); + let spec = reader.spec(); + assert_eq!(spec.sample_rate, 16_000, "expected 16 kHz"); + assert_eq!(spec.channels, 1, "expected mono"); + match spec.sample_format { + hound::SampleFormat::Int => reader + .samples::<i16>() + .map(|s| s.unwrap() as f32 / 32768.0) // match soundfile's normalization + .collect(), + hound::SampleFormat::Float => reader.samples::<f32>().map(|s| s.unwrap()).collect(), + } + } + + fn reference_prob(entries: &[RefEntry], name: &str) -> f32 { + entries + .iter() + .find(|e| e.file == name) + .unwrap_or_else(|| panic!("no entry for '{}' in reference.json", name)) + .probability + } + + pub(super) fn rows(entries: &[RefEntry]) -> Vec<Row> { + entries + .iter() + .map(|entry| { + let samples = load_wav_f32(&fixtures_dir().join(&entry.file)); + let mut detector = PipecatSmartTurn::new().expect("failed to load model"); + for chunk in samples.chunks(1600) { + detector.push_audio(&AudioFrame::new(chunk, 16_000)); + } + let pred = detector.predict().expect("predict failed"); + let rust_prob = raw_prob(&pred); + Row { + backend: "pipecat", + clip: entry.file.clone(), + python_prob: entry.probability, + rust_prob, + } + }) + .collect() + } + + fn raw_prob(pred: &TurnPrediction) -> f32 { + match pred.state { + TurnState::Finished => pred.confidence, + TurnState::Unfinished => 1.0 - pred.confidence, + TurnState::Wait => unreachable!(), + } + } + + pub(super) fn run_regression(clip: &str) { + let entries = super::load_reference(); + let python_prob = reference_prob(&entries, clip); + let row = rows(&[RefEntry { + file: clip.to_string(), + probability: python_prob, + }]) + .remove(0); + let diff = row.diff(); + assert!( + diff <= TOLERANCE, + "{clip}: rust={:.4} python={:.4} diff={diff:.4} (limit {TOLERANCE})", + 
row.rust_prob, + row.python_prob, + ); + } + + #[test] + fn test_accuracy_silence() { + run_regression("silence_2s.wav"); + } + + #[test] + fn test_accuracy_speech_finished() { + run_regression("speech_finished.wav"); + } + + #[test] + fn test_accuracy_speech_mid() { + run_regression("speech_mid.wav"); + } +} + +// Add future audio backends here: +// +// #[cfg(feature = "livekit-audio")] +// mod livekit_audio { +// pub(super) fn rows(entries: &[super::RefEntry]) -> Vec<super::Row> { ... } +// } + +// --------------------------------------------------------------------------- +// Accuracy report — prints a markdown table covering all enabled backends +// --------------------------------------------------------------------------- + +/// Print a markdown table comparing Rust vs Python probabilities for all clips +/// across all enabled backends. +/// Run with: `make accuracy` +#[test] +#[ignore] +fn accuracy_report() { + let rows: Vec<Row> = { + #[allow(unused_mut)] + let mut r = Vec::new(); + #[cfg(feature = "pipecat")] + r.extend(pipecat::rows(&load_reference())); + r + }; + + let version = env!("CARGO_PKG_VERSION"); + println!(); + println!("BENCHMARK_VERSION={version}"); + println!(); + println!("| Backend | Clip | Python P(complete) | Rust P(complete) | Diff | Status |"); + println!("|---------|------|--------------------|------------------|------|--------|"); + for r in &rows { + println!( + "| {} | `{}` | {:.4} | {:.4} | {:.4} | {} |", + r.backend, + r.clip, + r.python_prob, + r.rust_prob, + r.diff(), + r.status(), + ); + } + println!(); +} diff --git a/scripts/README.md b/scripts/README.md new file mode 100644 index 0000000..b3774e5 --- /dev/null +++ b/scripts/README.md @@ -0,0 +1,33 @@ +# scripts/ + +## gen_reference.py + +Generates `tests/fixtures/reference.json` — the Python-side reference probabilities +used by the Rust accuracy test (`make accuracy`). 
+ +### Setup + +```sh +python3 -m venv scripts/.venv +scripts/.venv/bin/pip install transformers onnxruntime numpy soundfile +``` + +### Run + +```sh +scripts/.venv/bin/python3 scripts/gen_reference.py +``` + +### Re-run when + +- A fixture WAV changes (`tests/fixtures/*.wav`) +- The model version changes (bump `MODEL_VERSION` in `build.rs` at the same time) + +### What it produces + +| File | Description | +|------|-------------| +| `tests/fixtures/silence_2s.wav` | 2 s of zeros at 16 kHz (generated if missing) | +| `tests/fixtures/reference.json` | P(complete) for each fixture clip | + +Commit both files after re-running. diff --git a/scripts/gen_reference.py b/scripts/gen_reference.py new file mode 100644 index 0000000..0a1d1e1 --- /dev/null +++ b/scripts/gen_reference.py @@ -0,0 +1,134 @@ +#!/usr/bin/env python3 +"""Generate reference probabilities from the Pipecat Python pipeline. + +Outputs tests/fixtures/reference.json for use in the Rust accuracy test. + +Usage: + pip install transformers onnxruntime numpy soundfile + python scripts/gen_reference.py + +Re-run when: + - A fixture WAV changes + - The model version changes (bump MODEL_VERSION in build.rs at the same time) + +Speech fixture source: + speech_finished.wav and speech_mid.wav are original recordings of: + "Wavekat knows when you've finished speaking." + recorded at 16 kHz mono 16-bit PCM. 
+""" + +import json +import sys +import urllib.request +from pathlib import Path + +import numpy as np +import soundfile as sf +from transformers import WhisperFeatureExtractor + +# --------------------------------------------------------------------------- +# Configuration +# --------------------------------------------------------------------------- + +REPO_ROOT = Path(__file__).resolve().parent.parent +FIXTURES = REPO_ROOT / "tests" / "fixtures" +SCRIPTS = REPO_ROOT / "scripts" + +MODEL_URL = "https://huggingface.co/pipecat-ai/smart-turn-v3/resolve/main/smart-turn-v3.2-cpu.onnx" +MODEL_VERSION = "v3.2-cpu" +MODEL_CACHE = SCRIPTS / f"smart-turn-{MODEL_VERSION}.onnx" + +SAMPLE_RATE = 16_000 +BUFFER_SAMPLES = 128_000 # 8 seconds at 16 kHz (matches Rust ring buffer) + +CLIPS = ["silence_2s.wav", "speech_finished.wav", "speech_mid.wav"] + +# --------------------------------------------------------------------------- +# Helpers +# --------------------------------------------------------------------------- + + +def ensure_model() -> Path: + if MODEL_CACHE.exists(): + return MODEL_CACHE + print(f"Downloading model from {MODEL_URL} ...", flush=True) + SCRIPTS.mkdir(parents=True, exist_ok=True) + urllib.request.urlretrieve(MODEL_URL, MODEL_CACHE) + print(f"Saved to {MODEL_CACHE}", flush=True) + return MODEL_CACHE + + +def ensure_silence() -> None: + path = FIXTURES / "silence_2s.wav" + if not path.exists(): + print("Generating silence_2s.wav ...", flush=True) + FIXTURES.mkdir(parents=True, exist_ok=True) + sf.write(str(path), np.zeros(32_000, dtype=np.float32), SAMPLE_RATE, subtype="PCM_16") + + +def load_audio(path: Path) -> np.ndarray: + """Load WAV as mono float32 at 16 kHz, front-padded to 8 s.""" + audio, sr = sf.read(str(path), dtype="float32") + assert sr == SAMPLE_RATE, f"{path.name}: expected {SAMPLE_RATE} Hz, got {sr}" + assert audio.ndim == 1, f"{path.name}: expected mono audio" + # Front-pad with zeros to match Rust ring-buffer behaviour (shorter → zeros at 
front) + if len(audio) < BUFFER_SAMPLES: + audio = np.pad(audio, (BUFFER_SAMPLES - len(audio), 0)) + else: + audio = audio[-BUFFER_SAMPLES:] + return audio + + +def infer(audio: np.ndarray, session, extractor) -> tuple[float, np.ndarray]: + """Run the Pipecat pipeline on audio. + + Returns: + (probability, mel_tensor) where mel_tensor has shape [80, 800]. + """ + features = extractor(audio, sampling_rate=SAMPLE_RATE, return_tensors="np") + input_features = features["input_features"].astype(np.float32) # [1, 80, 800] + outputs = session.run(None, {"input_features": input_features}) + probability = float(np.squeeze(outputs[0])) # already a sigmoid probability in [0, 1] + mel = input_features[0] # [80, 800] + return probability, mel + + +# --------------------------------------------------------------------------- +# Main +# --------------------------------------------------------------------------- + + +def main() -> None: + try: + import onnxruntime as ort + except ImportError: + print("ERROR: onnxruntime not installed. 
Run: pip install onnxruntime", file=sys.stderr) + sys.exit(1) + + ensure_silence() + model_path = ensure_model() + + extractor = WhisperFeatureExtractor(chunk_length=8) + session = ort.InferenceSession(str(model_path)) + + results = [] + for name in CLIPS: + path = FIXTURES / name + if not path.exists(): + print(f"ERROR: missing fixture {path}", file=sys.stderr) + sys.exit(1) + audio = load_audio(path) + prob, mel = infer(audio, session, extractor) + np.save(str(FIXTURES / f"{name}.mel.npy"), mel) + print(f" {name}: probability = {prob:.4f}") + results.append({"file": name, "probability": round(prob, 6)}) + + out_path = FIXTURES / "reference.json" + with open(out_path, "w") as f: + json.dump(results, f, indent=2) + f.write("\n") + print(f"\nWrote {out_path}") + + +if __name__ == "__main__": + main() diff --git a/tests/fixtures/reference.json b/tests/fixtures/reference.json new file mode 100644 index 0000000..dfdf721 --- /dev/null +++ b/tests/fixtures/reference.json @@ -0,0 +1,14 @@ +[ + { + "file": "silence_2s.wav", + "probability": 0.987037 + }, + { + "file": "speech_finished.wav", + "probability": 0.984858 + }, + { + "file": "speech_mid.wav", + "probability": 0.047724 + } +] diff --git a/tests/fixtures/silence_2s.wav b/tests/fixtures/silence_2s.wav new file mode 100644 index 0000000..7f87b07 Binary files /dev/null and b/tests/fixtures/silence_2s.wav differ diff --git a/tests/fixtures/speech_finished.wav b/tests/fixtures/speech_finished.wav new file mode 100644 index 0000000..411d038 Binary files /dev/null and b/tests/fixtures/speech_finished.wav differ diff --git a/tests/fixtures/speech_mid.wav b/tests/fixtures/speech_mid.wav new file mode 100644 index 0000000..d65825e Binary files /dev/null and b/tests/fixtures/speech_mid.wav differ