diff --git a/.github/workflows/turn-accuracy.yml b/.github/workflows/turn-accuracy.yml new file mode 100644 index 0000000..2860d32 --- /dev/null +++ b/.github/workflows/turn-accuracy.yml @@ -0,0 +1,86 @@ +name: Turn Accuracy + +on: + workflow_dispatch: + +permissions: + contents: write + pull-requests: write + +jobs: + accuracy: + runs-on: ubuntu-latest + steps: + - uses: actions/checkout@v4 + + - uses: dtolnay/rust-toolchain@stable + + - uses: Swatinem/rust-cache@v2 + + - name: Run accuracy report + run: | + cargo test --features pipecat --test accuracy \ + -- --ignored accuracy_report --nocapture 2>&1 | tee accuracy-output.txt + + - name: Update README benchmark table + run: | + python3 << 'PYEOF' + import re, sys + + with open('accuracy-output.txt') as f: + output = f.read() + + version = None + for line in output.split('\n'): + if line.startswith('BENCHMARK_VERSION='): + version = line.split('=', 1)[1].strip() + break + + if not version: + print("ERROR: No version found in test output") + sys.exit(1) + + table_lines = [l for l in output.split('\n') if l.startswith('|')] + if len(table_lines) < 3: + print("ERROR: No benchmark table found in test output") + sys.exit(1) + + table = '\n'.join(table_lines) + block = f'*v{version}*\n\n{table}' + + with open('README.md') as f: + readme = f.read() + + pattern = r'(<!-- accuracy:start -->).*?(<!-- accuracy:end -->)' + replacement = rf'\1\n{block}\n\2' + updated = re.sub(pattern, replacement, readme, flags=re.DOTALL) + + if updated == readme: + print("No changes to README") + else: + with open('README.md', 'w') as f: + f.write(updated) + print(f"README updated with v{version} benchmarks:") + print(table) + PYEOF + + - name: Job summary + if: always() + run: | + echo "## Turn Accuracy Report" >> "$GITHUB_STEP_SUMMARY" + if [ -f accuracy-output.txt ]; then + TABLE=$(grep '^|' accuracy-output.txt || true) + if [ -n "$TABLE" ]; then + echo "$TABLE" >> "$GITHUB_STEP_SUMMARY" + fi + fi + + - name: Create PR with updated benchmarks + uses: 
peter-evans/create-pull-request@v6 + with: + commit-message: "docs: update accuracy table" + title: "docs: update accuracy table" + body: | + Auto-generated accuracy update from the turn detection pipeline cross-validation. + branch: docs/update-accuracy + delete-branch: true diff --git a/.gitignore b/.gitignore index 99b3b4a..fb6236d 100644 --- a/.gitignore +++ b/.gitignore @@ -4,3 +4,13 @@ Cargo.lock *.swo .DS_Store .cargo/config.toml + +# Python tooling (scripts/) +scripts/.venv/ +scripts/__pycache__/ +scripts/*.onnx +__pycache__/ +*.pyc + +# Generated mel reference tensors (regenerate with scripts/gen_reference.py) +*.mel.npy diff --git a/Makefile b/Makefile index 1a66480..7065549 100644 --- a/Makefile +++ b/Makefile @@ -1,13 +1,15 @@ -.PHONY: help check test fmt lint doc ci +.PHONY: help check test fmt lint doc ci accuracy mel help: @echo "Available targets:" - @echo " check Check workspace compiles" - @echo " test Run all tests" - @echo " fmt Format code" - @echo " lint Run clippy with warnings as errors" - @echo " doc Build and open docs in browser" - @echo " ci Run all CI checks locally (fmt, clippy, test, doc, features)" + @echo " check Check workspace compiles" + @echo " test Run all tests" + @echo " accuracy Cross-validate Rust pipeline against Python reference" + @echo " mel Compare Rust vs Python mel spectrograms element-wise" + @echo " fmt Format code" + @echo " lint Run clippy with warnings as errors" + @echo " doc Build and open docs in browser" + @echo " ci Run all CI checks locally (fmt, clippy, test, doc, features)" # Check workspace compiles check: @@ -17,6 +19,14 @@ check: test: cargo test --workspace +# Cross-validate Rust mel+ONNX pipeline against Python reference probabilities +accuracy: + cargo test --features pipecat --test accuracy -- --ignored accuracy_report --nocapture + +# Compare Rust vs Python mel spectrograms element-wise (requires .npy fixtures) +mel: + cargo test --features pipecat -- mel_report --ignored --nocapture + # Format 
code fmt: cargo fmt --all diff --git a/README.md b/README.md index 8685e99..97d850d 100644 --- a/README.md +++ b/README.md @@ -87,6 +87,16 @@ wavekat-voice --> orchestrates VAD + turn + ASR + LLM + TTS - Text-based detectors depend on ASR transcript quality. Pair with a streaming ASR provider for best results. +## Accuracy + +Cross-validated against the original Python (Pipecat) pipeline on three fixture clips. +Tolerance: ±0.02 probability. + +<!-- accuracy:start --> +<!-- accuracy:end --> + +Run locally with `make accuracy`. See [`scripts/README.md`](scripts/README.md) for how to regenerate the Python reference. + ## License Licensed under [Apache 2.0](LICENSE). diff --git a/crates/wavekat-turn/Cargo.toml b/crates/wavekat-turn/Cargo.toml index 8f2bc82..9284d18 100644 --- a/crates/wavekat-turn/Cargo.toml +++ b/crates/wavekat-turn/Cargo.toml @@ -32,6 +32,9 @@ ureq = { version = "3", optional = true } [dev-dependencies] hound = "3.5" +ndarray-npy = "0.10" +serde = { version = "1", features = ["derive"] } +serde_json = "1" [package.metadata.docs.rs] all-features = true diff --git a/crates/wavekat-turn/src/audio/pipecat.rs b/crates/wavekat-turn/src/audio/pipecat.rs index 4554b1d..7b87c1a 100644 --- a/crates/wavekat-turn/src/audio/pipecat.rs +++ b/crates/wavekat-turn/src/audio/pipecat.rs @@ -143,12 +143,22 @@ impl MelExtractor { fn extract(&mut self, audio: &[f32], shift_frames: usize) -> Array2<f32> { debug_assert_eq!(audio.len(), RING_CAPACITY); - // ---- Center-pad: N_FFT/2 zeros on each side → 128 400 samples ---- - // This replicates librosa/PyTorch `center=True` STFT behaviour, which - // gives exactly N_FRAMES + 1 = 801 frames; we discard the last one. - let pad = N_FFT / 2; - let mut padded = vec![0.0f32; pad + audio.len() + pad]; - padded[pad..pad + audio.len()].copy_from_slice(audio); + // ---- Center-pad: N_FFT/2 reflect samples on each side → 128 400 samples ---- + // Matches WhisperFeatureExtractor: np.pad(waveform, n_fft//2, mode="reflect"). 
+ // Reflect (not zero) padding ensures the boundary frames match Python exactly. + // Gives exactly N_FRAMES + 1 = 801 frames; we discard the last one. + let pad = N_FFT / 2; // 200 + let n = audio.len(); // 128 000 + let mut padded = vec![0.0f32; pad + n + pad]; + padded[pad..pad + n].copy_from_slice(audio); + // Left reflect: padded[0..pad] = audio[pad..1] reversed (exclude edge) + for i in 0..pad { + padded[i] = audio[pad - i]; + } + // Right reflect: padded[pad+n..pad+n+pad] = audio[n-2..n-2-pad] reversed + for i in 0..pad { + padded[pad + n + i] = audio[n - 2 - i]; + } // n_total = (128 400 − 400) / 160 + 1 = 801 let n_total_frames = (padded.len() - N_FFT) / HOP_LENGTH + 1; @@ -527,3 +537,143 @@ impl AudioTurnDetector for PipecatSmartTurn { self.mel.invalidate_cache(); } } + +// --------------------------------------------------------------------------- +// Mel comparison tests (unit tests — need access to private MelExtractor) +// --------------------------------------------------------------------------- + +#[cfg(test)] +mod mel_tests { + use std::path::{Path, PathBuf}; + + use ndarray::Array2; + use ndarray_npy::ReadNpyExt; + + use super::{prepare_audio, MelExtractor, RING_CAPACITY, SAMPLE_RATE}; + + /// Max allowed element-wise absolute difference between Rust and Python mel. + const MEL_TOLERANCE: f32 = 0.05; + + fn fixtures_dir() -> PathBuf { + Path::new(env!("CARGO_MANIFEST_DIR")) + .parent() + .unwrap() // crates/ + .parent() + .unwrap() // repo root + .join("tests/fixtures") + } + + /// Load 16 kHz mono WAV as f32 in [-1, 1], normalised the same way as + /// Python's soundfile (divide by 32768, not i16::MAX). 
fn load_wav_f32(path: &Path) -> Vec<f32> { + let mut reader = hound::WavReader::open(path) + .unwrap_or_else(|e| panic!("failed to open {}: {}", path.display(), e)); + let spec = reader.spec(); + assert_eq!(spec.sample_rate, SAMPLE_RATE, "expected 16 kHz"); + assert_eq!(spec.channels, 1, "expected mono"); + match spec.sample_format { + hound::SampleFormat::Int => reader + .samples::<i16>() + .map(|s| s.unwrap() as f32 / 32768.0) + .collect(), + hound::SampleFormat::Float => reader.samples::<f32>().map(|s| s.unwrap()).collect(), + } + } + + fn load_python_mel(clip: &str) -> Array2<f32> { + let path = fixtures_dir().join(format!("{clip}.mel.npy")); + let file = std::fs::File::open(&path).unwrap_or_else(|_| { + panic!( + "missing {}: run `python scripts/gen_reference.py` first", + path.display() + ) + }); + Array2::<f32>::read_npy(file).expect("failed to parse .npy") + } + + struct MelDiff { + max_diff: f32, + mean_diff: f32, + /// (mel_bin, frame) of the single largest diff + max_at: (usize, usize), + /// fraction of elements with diff > 0.01 + outlier_frac: f32, + } + + fn compare_mel(clip: &str) -> MelDiff { + let samples = load_wav_f32(&fixtures_dir().join(clip)); + let audio = prepare_audio(&samples); + assert_eq!(audio.len(), RING_CAPACITY); + + let mut extractor = MelExtractor::new(); + let rust_mel = extractor.extract(&audio, 0); + let python_mel = load_python_mel(clip); + + assert_eq!( + rust_mel.shape(), + python_mel.shape(), + "{clip}: mel shape mismatch" + ); + + let shape = rust_mel.shape(); + let (n_mels, n_frames) = (shape[0], shape[1]); + + let mut max_diff = 0.0f32; + let mut max_at = (0, 0); + let mut sum_diff = 0.0f32; + let mut outliers = 0usize; + + for m in 0..n_mels { + for t in 0..n_frames { + let d = (rust_mel[[m, t]] - python_mel[[m, t]]).abs(); + sum_diff += d; + if d > max_diff { + max_diff = d; + max_at = (m, t); + } + if d > 0.01 { + outliers += 1; + } + } + } + + let total = (n_mels * n_frames) as f32; + MelDiff { + max_diff, + mean_diff: sum_diff / total, + 
max_at, + outlier_frac: outliers as f32 / total, + } + } + + /// Print a markdown table of mel-level diffs between Rust and Python. + /// Run with: `make mel` + #[test] + #[ignore] + fn mel_report() { + let clips = ["silence_2s.wav", "speech_finished.wav", "speech_mid.wav"]; + + println!(); + println!("MEL_TOLERANCE={MEL_TOLERANCE}"); + println!(); + println!("| Clip | Max Diff | Mean Diff | Max at (mel,frame) | Outliers >0.01 | Status |"); + println!("|------|----------|-----------|---------------------|----------------|--------|"); + for clip in clips { + let d = compare_mel(clip); + let status = if d.max_diff <= MEL_TOLERANCE { + "PASS" + } else { + "FAIL" + }; + println!( + "| `{clip}` | {:.6} | {:.6} | ({},{}) | {:.2}% | {status} |", + d.max_diff, + d.mean_diff, + d.max_at.0, + d.max_at.1, + d.outlier_frac * 100.0, + ); + } + println!(); + } +} diff --git a/crates/wavekat-turn/tests/accuracy.rs b/crates/wavekat-turn/tests/accuracy.rs new file mode 100644 index 0000000..dbb5efd --- /dev/null +++ b/crates/wavekat-turn/tests/accuracy.rs @@ -0,0 +1,219 @@ +//! Cross-validation accuracy test: Rust pipeline vs. Python reference. +//! +//! Verifies that our mel preprocessing and ONNX inference produce probabilities +//! within ±0.02 of the Python (Pipecat) reference for each fixture audio clip. +//! +//! Prerequisites: +//! 1. Run `python scripts/gen_reference.py` once to produce +//! `tests/fixtures/reference.json` and `tests/fixtures/silence_2s.wav`. +//! 2. Commit those files alongside the WAV clips. +//! +//! Run individual regression tests: `cargo test --features pipecat --test accuracy` +//! 
Run the full report table: `make accuracy` + +use std::path::PathBuf; + +const TOLERANCE: f32 = 0.02; + +// --------------------------------------------------------------------------- +// Shared helpers +// --------------------------------------------------------------------------- + +#[cfg(any(feature = "pipecat"))] +fn fixtures_dir() -> PathBuf { + std::path::Path::new(env!("CARGO_MANIFEST_DIR")) + .parent() + .unwrap() // crates/ + .parent() + .unwrap() // repo root + .join("tests/fixtures") +} + +// RefEntry and load_reference are used by backend mods and accuracy_report. +// Gate under any audio feature to avoid dead-code warnings in no-feature builds. +#[cfg(any(feature = "pipecat"))] +#[derive(serde::Deserialize)] +struct RefEntry { + file: String, + probability: f32, +} + +#[cfg(any(feature = "pipecat"))] +fn load_reference() -> Vec<RefEntry> { + let path = fixtures_dir().join("reference.json"); + let json = std::fs::read_to_string(&path).unwrap_or_else(|_| { + panic!( + "missing {}: run `python scripts/gen_reference.py` first", + path.display() + ) + }); + serde_json::from_str(&json).expect("invalid reference.json") +} + +// --------------------------------------------------------------------------- +// Report row — one entry per (backend, clip) +// --------------------------------------------------------------------------- + +struct Row { + backend: &'static str, + clip: String, + python_prob: f32, + rust_prob: f32, +} + +impl Row { + fn diff(&self) -> f32 { + (self.rust_prob - self.python_prob).abs() + } + + fn status(&self) -> &'static str { + if self.diff() <= TOLERANCE { + "PASS" + } else { + "FAIL" + } + } +} + +// --------------------------------------------------------------------------- +// Pipecat backend +// --------------------------------------------------------------------------- + +#[cfg(feature = "pipecat")] +mod pipecat { + use std::path::Path; + + use wavekat_turn::audio::PipecatSmartTurn; + use wavekat_turn::{AudioFrame, AudioTurnDetector, 
TurnPrediction, TurnState}; + + use super::{fixtures_dir, RefEntry, Row, TOLERANCE}; + + fn load_wav_f32(path: &Path) -> Vec<f32> { + let mut reader = hound::WavReader::open(path) + .unwrap_or_else(|e| panic!("failed to open {}: {}", path.display(), e)); + let spec = reader.spec(); + assert_eq!(spec.sample_rate, 16_000, "expected 16 kHz"); + assert_eq!(spec.channels, 1, "expected mono"); + match spec.sample_format { + hound::SampleFormat::Int => reader + .samples::<i16>() + .map(|s| s.unwrap() as f32 / 32768.0) // match soundfile's normalization + .collect(), + hound::SampleFormat::Float => reader.samples::<f32>().map(|s| s.unwrap()).collect(), + } + } + + fn reference_prob(entries: &[RefEntry], name: &str) -> f32 { + entries + .iter() + .find(|e| e.file == name) + .unwrap_or_else(|| panic!("no entry for '{}' in reference.json", name)) + .probability + } + + pub(super) fn rows(entries: &[RefEntry]) -> Vec<Row> { + entries + .iter() + .map(|entry| { + let samples = load_wav_f32(&fixtures_dir().join(&entry.file)); + let mut detector = PipecatSmartTurn::new().expect("failed to load model"); + for chunk in samples.chunks(1600) { + detector.push_audio(&AudioFrame::new(chunk, 16_000)); + } + let pred = detector.predict().expect("predict failed"); + let rust_prob = raw_prob(&pred); + Row { + backend: "pipecat", + clip: entry.file.clone(), + python_prob: entry.probability, + rust_prob, + } + }) + .collect() + } + + fn raw_prob(pred: &TurnPrediction) -> f32 { + match pred.state { + TurnState::Finished => pred.confidence, + TurnState::Unfinished => 1.0 - pred.confidence, + TurnState::Wait => unreachable!(), + } + } + + pub(super) fn run_regression(clip: &str) { + let entries = super::load_reference(); + let python_prob = reference_prob(&entries, clip); + let row = rows(&[RefEntry { + file: clip.to_string(), + probability: python_prob, + }]) + .remove(0); + let diff = row.diff(); + assert!( + diff <= TOLERANCE, + "{clip}: rust={:.4} python={:.4} diff={diff:.4} (limit {TOLERANCE})", + 
row.rust_prob, + row.python_prob, + ); + } + + #[test] + fn test_accuracy_silence() { + run_regression("silence_2s.wav"); + } + + #[test] + fn test_accuracy_speech_finished() { + run_regression("speech_finished.wav"); + } + + #[test] + fn test_accuracy_speech_mid() { + run_regression("speech_mid.wav"); + } +} + +// Add future audio backends here: +// +// #[cfg(feature = "livekit-audio")] +// mod livekit_audio { +// pub(super) fn rows(entries: &[super::RefEntry]) -> Vec<super::Row> { ... } +// } + +// --------------------------------------------------------------------------- +// Accuracy report — prints a markdown table covering all enabled backends +// --------------------------------------------------------------------------- + +/// Print a markdown table comparing Rust vs Python probabilities for all clips +/// across all enabled backends. +/// Run with: `make accuracy` +#[test] +#[ignore] +fn accuracy_report() { + let rows: Vec<Row> = { + #[allow(unused_mut)] + let mut r = Vec::new(); + #[cfg(feature = "pipecat")] + r.extend(pipecat::rows(&load_reference())); + r + }; + + let version = env!("CARGO_PKG_VERSION"); + println!(); + println!("BENCHMARK_VERSION={version}"); + println!(); + println!("| Backend | Clip | Python P(complete) | Rust P(complete) | Diff | Status |"); + println!("|---------|------|--------------------|------------------|------|--------|"); + for r in &rows { + println!( + "| {} | `{}` | {:.4} | {:.4} | {:.4} | {} |", + r.backend, + r.clip, + r.python_prob, + r.rust_prob, + r.diff(), + r.status(), + ); + } + println!(); +} diff --git a/scripts/README.md b/scripts/README.md new file mode 100644 index 0000000..b3774e5 --- /dev/null +++ b/scripts/README.md @@ -0,0 +1,33 @@ +# scripts/ + +## gen_reference.py + +Generates `tests/fixtures/reference.json` — the Python-side reference probabilities +used by the Rust accuracy test (`make accuracy`). 
+ +### Setup + +```sh +python3 -m venv scripts/.venv +scripts/.venv/bin/pip install transformers onnxruntime numpy soundfile +``` + +### Run + +```sh +scripts/.venv/bin/python3 scripts/gen_reference.py +``` + +### Re-run when + +- A fixture WAV changes (`tests/fixtures/*.wav`) +- The model version changes (bump `MODEL_VERSION` in `build.rs` at the same time) + +### What it produces + +| File | Description | +|------|-------------| +| `tests/fixtures/silence_2s.wav` | 2 s of zeros at 16 kHz (generated if missing) | +| `tests/fixtures/reference.json` | P(complete) for each fixture clip | + +Commit both files after re-running. diff --git a/scripts/gen_reference.py b/scripts/gen_reference.py new file mode 100644 index 0000000..0a1d1e1 --- /dev/null +++ b/scripts/gen_reference.py @@ -0,0 +1,134 @@ +#!/usr/bin/env python3 +"""Generate reference probabilities from the Pipecat Python pipeline. + +Outputs tests/fixtures/reference.json for use in the Rust accuracy test. + +Usage: + pip install transformers onnxruntime numpy soundfile + python scripts/gen_reference.py + +Re-run when: + - A fixture WAV changes + - The model version changes (bump MODEL_VERSION in build.rs at the same time) + +Speech fixture source: + speech_finished.wav and speech_mid.wav are original recordings of: + "Wavekat knows when you've finished speaking." + recorded at 16 kHz mono 16-bit PCM. 
+""" + +import json +import sys +import urllib.request +from pathlib import Path + +import numpy as np +import soundfile as sf +from transformers import WhisperFeatureExtractor + +# --------------------------------------------------------------------------- +# Configuration +# --------------------------------------------------------------------------- + +REPO_ROOT = Path(__file__).resolve().parent.parent +FIXTURES = REPO_ROOT / "tests" / "fixtures" +SCRIPTS = REPO_ROOT / "scripts" + +MODEL_URL = "https://huggingface.co/pipecat-ai/smart-turn-v3/resolve/main/smart-turn-v3.2-cpu.onnx" +MODEL_VERSION = "v3.2-cpu" +MODEL_CACHE = SCRIPTS / f"smart-turn-{MODEL_VERSION}.onnx" + +SAMPLE_RATE = 16_000 +BUFFER_SAMPLES = 128_000 # 8 seconds at 16 kHz (matches Rust ring buffer) + +CLIPS = ["silence_2s.wav", "speech_finished.wav", "speech_mid.wav"] + +# --------------------------------------------------------------------------- +# Helpers +# --------------------------------------------------------------------------- + + +def ensure_model() -> Path: + if MODEL_CACHE.exists(): + return MODEL_CACHE + print(f"Downloading model from {MODEL_URL} ...", flush=True) + SCRIPTS.mkdir(parents=True, exist_ok=True) + urllib.request.urlretrieve(MODEL_URL, MODEL_CACHE) + print(f"Saved to {MODEL_CACHE}", flush=True) + return MODEL_CACHE + + +def ensure_silence() -> None: + path = FIXTURES / "silence_2s.wav" + if not path.exists(): + print("Generating silence_2s.wav ...", flush=True) + FIXTURES.mkdir(parents=True, exist_ok=True) + sf.write(str(path), np.zeros(32_000, dtype=np.float32), SAMPLE_RATE, subtype="PCM_16") + + +def load_audio(path: Path) -> np.ndarray: + """Load WAV as mono float32 at 16 kHz, front-padded to 8 s.""" + audio, sr = sf.read(str(path), dtype="float32") + assert sr == SAMPLE_RATE, f"{path.name}: expected {SAMPLE_RATE} Hz, got {sr}" + assert audio.ndim == 1, f"{path.name}: expected mono audio" + # Front-pad with zeros to match Rust ring-buffer behaviour (shorter → zeros at 
front) + if len(audio) < BUFFER_SAMPLES: + audio = np.pad(audio, (BUFFER_SAMPLES - len(audio), 0)) + else: + audio = audio[-BUFFER_SAMPLES:] + return audio + + +def infer(audio: np.ndarray, session, extractor) -> tuple[float, np.ndarray]: + """Run the Pipecat pipeline on audio. + + Returns: + (probability, mel_tensor) where mel_tensor has shape [80, 800]. + """ + features = extractor(audio, sampling_rate=SAMPLE_RATE, return_tensors="np") + input_features = features["input_features"].astype(np.float32) # [1, 80, 800] + outputs = session.run(None, {"input_features": input_features}) + probability = float(np.squeeze(outputs[0])) # already a sigmoid probability in [0, 1] + mel = input_features[0] # [80, 800] + return probability, mel + + +# --------------------------------------------------------------------------- +# Main +# --------------------------------------------------------------------------- + + +def main() -> None: + try: + import onnxruntime as ort + except ImportError: + print("ERROR: onnxruntime not installed. 
Run: pip install onnxruntime", file=sys.stderr) + sys.exit(1) + + ensure_silence() + model_path = ensure_model() + + extractor = WhisperFeatureExtractor(chunk_length=8) + session = ort.InferenceSession(str(model_path)) + + results = [] + for name in CLIPS: + path = FIXTURES / name + if not path.exists(): + print(f"ERROR: missing fixture {path}", file=sys.stderr) + sys.exit(1) + audio = load_audio(path) + prob, mel = infer(audio, session, extractor) + np.save(str(FIXTURES / f"{name}.mel.npy"), mel) + print(f" {name}: probability = {prob:.4f}") + results.append({"file": name, "probability": round(prob, 6)}) + + out_path = FIXTURES / "reference.json" + with open(out_path, "w") as f: + json.dump(results, f, indent=2) + f.write("\n") + print(f"\nWrote {out_path}") + + +if __name__ == "__main__": + main() diff --git a/tests/fixtures/reference.json b/tests/fixtures/reference.json new file mode 100644 index 0000000..dfdf721 --- /dev/null +++ b/tests/fixtures/reference.json @@ -0,0 +1,14 @@ +[ + { + "file": "silence_2s.wav", + "probability": 0.987037 + }, + { + "file": "speech_finished.wav", + "probability": 0.984858 + }, + { + "file": "speech_mid.wav", + "probability": 0.047724 + } +] diff --git a/tests/fixtures/silence_2s.wav b/tests/fixtures/silence_2s.wav new file mode 100644 index 0000000..7f87b07 Binary files /dev/null and b/tests/fixtures/silence_2s.wav differ diff --git a/tests/fixtures/speech_finished.wav b/tests/fixtures/speech_finished.wav new file mode 100644 index 0000000..411d038 Binary files /dev/null and b/tests/fixtures/speech_finished.wav differ diff --git a/tests/fixtures/speech_mid.wav b/tests/fixtures/speech_mid.wav new file mode 100644 index 0000000..d65825e Binary files /dev/null and b/tests/fixtures/speech_mid.wav differ