From f47213c15137577c8929d9675558d87e987cd8d9 Mon Sep 17 00:00:00 2001 From: Eason WaveKat Date: Tue, 7 Apr 2026 16:16:23 +1200 Subject: [PATCH 01/43] feat: CUDA/TensorRT/CoreML execution provider support Co-Authored-By: Claude Sonnet 4.6 --- Makefile | 3 + crates/wavekat-tts/Cargo.toml | 5 + .../wavekat-tts/src/backends/qwen3_tts/mod.rs | 11 +- .../src/backends/qwen3_tts/model.rs | 7 +- crates/wavekat-tts/src/lib.rs | 12 ++ docs/06-colab-cuda-gpu.md | 121 ++++++++++++++++++ 6 files changed, 156 insertions(+), 3 deletions(-) create mode 100644 docs/06-colab-cuda-gpu.md diff --git a/Makefile b/Makefile index d5fc95f..452c17b 100644 --- a/Makefile +++ b/Makefile @@ -18,6 +18,9 @@ test: ## Run tests (no features) test-qwen3: ## Run tests with qwen3-tts feature cargo test --features qwen3-tts +test-qwen3-cuda: ## Run tests with qwen3-tts + CUDA + cargo test --features "qwen3-tts,cuda" + test-all: ## Run tests with all features cargo test --all-features diff --git a/crates/wavekat-tts/Cargo.toml b/crates/wavekat-tts/Cargo.toml index 73b7c20..22c7429 100644 --- a/crates/wavekat-tts/Cargo.toml +++ b/crates/wavekat-tts/Cargo.toml @@ -15,6 +15,11 @@ default = [] qwen3-tts = ["dep:ort", "dep:ndarray", "dep:tokenizers", "dep:npyz", "dep:rand", "dep:hf-hub"] cosyvoice = ["dep:ort", "dep:ndarray"] +# Execution providers — composable with any ONNX backend feature +coreml = ["ort?/coreml"] +cuda = ["ort?/cuda"] +tensorrt = ["ort?/tensorrt"] + [dependencies] wavekat-core = { version = "0.0.5", features = ["wav"] } thiserror = "2" diff --git a/crates/wavekat-tts/src/backends/qwen3_tts/mod.rs b/crates/wavekat-tts/src/backends/qwen3_tts/mod.rs index eed072d..0f60c65 100644 --- a/crates/wavekat-tts/src/backends/qwen3_tts/mod.rs +++ b/crates/wavekat-tts/src/backends/qwen3_tts/mod.rs @@ -69,14 +69,21 @@ impl ModelPrecision { /// Selecting a provider that is unavailable at runtime causes an error at load /// time rather than silently falling back. 
Use [`ExecutionProvider::Cpu`] (the /// default) if you need guaranteed availability. +/// +/// Enable the corresponding Cargo feature to bundle the native libraries: +/// - `cuda` for [`Cuda`](ExecutionProvider::Cuda) +/// - `tensorrt` for [`TensorRt`](ExecutionProvider::TensorRt) +/// - `coreml` for [`CoreMl`](ExecutionProvider::CoreMl) #[derive(Debug, Clone, Copy, PartialEq, Eq, Default)] pub enum ExecutionProvider { /// CPU inference via ONNX Runtime. Always available. Default. #[default] Cpu, - /// NVIDIA CUDA GPU inference. Requires an ORT build with CUDA support. + /// NVIDIA CUDA GPU inference. Requires `cuda` feature. Cuda, - /// Apple CoreML (macOS / iOS). Requires an ORT build with CoreML support. + /// NVIDIA TensorRT. Requires `tensorrt` feature. + TensorRt, + /// Apple CoreML (macOS / iOS). Requires `coreml` feature. CoreMl, } diff --git a/crates/wavekat-tts/src/backends/qwen3_tts/model.rs b/crates/wavekat-tts/src/backends/qwen3_tts/model.rs index ca0d99a..15cfefc 100644 --- a/crates/wavekat-tts/src/backends/qwen3_tts/model.rs +++ b/crates/wavekat-tts/src/backends/qwen3_tts/model.rs @@ -639,12 +639,17 @@ fn apply_execution_provider( builder: ort::session::builder::SessionBuilder, ep: super::ExecutionProvider, ) -> Result { - use ort::execution_providers::{CUDAExecutionProvider, CoreMLExecutionProvider}; + use ort::execution_providers::{ + CUDAExecutionProvider, CoreMLExecutionProvider, TensorRTExecutionProvider, + }; match ep { super::ExecutionProvider::Cpu => Ok(builder), super::ExecutionProvider::Cuda => builder .with_execution_providers([CUDAExecutionProvider::default().build()]) .map_err(|e| TtsError::Model(format!("CUDA execution provider error: {e}"))), + super::ExecutionProvider::TensorRt => builder + .with_execution_providers([TensorRTExecutionProvider::default().build()]) + .map_err(|e| TtsError::Model(format!("TensorRT execution provider error: {e}"))), super::ExecutionProvider::CoreMl => builder 
.with_execution_providers([CoreMLExecutionProvider::default().build()]) .map_err(|e| TtsError::Model(format!("CoreML execution provider error: {e}"))), diff --git a/crates/wavekat-tts/src/lib.rs b/crates/wavekat-tts/src/lib.rs index eed98de..9aa1e97 100644 --- a/crates/wavekat-tts/src/lib.rs +++ b/crates/wavekat-tts/src/lib.rs @@ -22,11 +22,23 @@ //! //! # Feature flags //! +//! ## Backends +//! //! | Feature | Backend | Multilingual | Requires | //! |---------|---------|-------------|----------| //! | `qwen3-tts` | Qwen3-TTS (ONNX) | 10 languages | ONNX model download | //! | `cosyvoice` | CosyVoice (ONNX) | Yes | ONNX model download | //! +//! ## Execution providers +//! +//! Composable with any backend feature. Selects the inference hardware at build time. +//! +//! | Feature | Provider | Platform | +//! |---------|----------|----------| +//! | `cuda` | NVIDIA CUDA | Linux / Windows | +//! | `tensorrt` | NVIDIA TensorRT | Linux / Windows | +//! | `coreml` | Apple CoreML | macOS / iOS | +//! //! # Quick start //! //! ```toml diff --git a/docs/06-colab-cuda-gpu.md b/docs/06-colab-cuda-gpu.md new file mode 100644 index 0000000..6907d9a --- /dev/null +++ b/docs/06-colab-cuda-gpu.md @@ -0,0 +1,121 @@ +# CUDA Execution Provider + +## Status + +**In progress** — the `cuda` Cargo feature is being wired up. +CPU inference already works; this adds NVIDIA GPU acceleration via ORT's CUDA EP. + +## Goal + +Enable NVIDIA GPU inference for Qwen3-TTS (and future ONNX-based backends) +by activating ORT's built-in CUDA execution provider. The `TtsBackend` trait +surface is unchanged — callers opt in via `BackendConfig`. + +## Why CUDA over CPU + +ORT's CUDA EP offloads transformer operations (matmul, attention, KV cache +reads/writes) to the GPU. For a 1.7B-parameter model the decode loop is the +bottleneck; GPU parallelism reduces per-step latency by ~15× on a T4. + +CoreML is not viable here — see `05-mlx-backend.md` for why. 
+
+## Cargo feature
+
+```toml
+# crates/wavekat-tts/Cargo.toml
+[features]
+cuda = ["ort?/cuda"]
+tensorrt = ["ort?/tensorrt"]  # optional; higher throughput, longer compile
+```
+
+These compose with any ONNX-based backend feature:
+
+```toml
+# CPU only (default)
+wavekat-tts = { version = "0.0.1", features = ["qwen3-tts"] }
+
+# CUDA
+wavekat-tts = { version = "0.0.1", features = ["qwen3-tts", "cuda"] }
+
+# TensorRT (higher throughput, requires trtexec engine build)
+wavekat-tts = { version = "0.0.1", features = ["qwen3-tts", "tensorrt"] }
+```
+
+The prebuilt `ort` CUDA binaries bundle their own CUDA libraries — no manual
+`LD_LIBRARY_PATH` setup is needed given a compatible CUDA driver and glibc 2.38+.
+
+## Runtime API
+
+```rust
+use wavekat_tts::{BackendConfig, ExecutionProvider};
+
+let config = BackendConfig::default()
+    .with_provider(ExecutionProvider::Cuda);
+
+let tts = Qwen3Tts::with_config("models/qwen3-tts-1.7b", config)?;
+```
+
+ORT falls back to CPU automatically if no compatible GPU is found at runtime.
+Set `ORT_LOG_LEVEL=1` to confirm which EP is active:
+
+```
+[I:ort:session] [CUDAExecutionProvider] Created CUDA EP on device 0
+```
+
+### ExecutionProvider variants
+
+| Variant    | Cargo feature | Requirement                        |
+|------------|---------------|------------------------------------|
+| `Cpu`      | (always)      | —                                  |
+| `Cuda`     | `cuda`        | NVIDIA GPU, CUDA driver ≥ 11.8     |
+| `TensorRt` | `tensorrt`    | CUDA + TensorRT 8+ installed       |
+
+## Build
+
+```bash
+cargo build --release --features "qwen3-tts,cuda"
+cargo run --release --example synthesize --features "qwen3-tts,cuda" -- \
+  --text "Hello from GPU" --out output.wav
+```
+
+## Implementation
+
+The only files that change:
+
+| File | Change |
+|------|--------|
+| `crates/wavekat-tts/Cargo.toml` | Add `cuda` and `tensorrt` features |
+| `src/backends/qwen3_tts/model.rs` | Match `ExecutionProvider::Cuda` → add CUDA EP to session builder |
+| `src/backends/qwen3_tts/mod.rs` | Add `TensorRt` variant to the `ExecutionProvider` enum |
+
+`src/backends/qwen3_tts/model.rs` session builder (pseudocode):
+
+```rust
+let mut builder = Session::builder()?;
+match config.execution_provider {
+    ExecutionProvider::Cpu => {}
+    ExecutionProvider::CoreMl => { builder = builder.with_execution_providers([CoreMLExecutionProvider::default()])?; }
+    ExecutionProvider::Cuda => { builder = builder.with_execution_providers([CUDAExecutionProvider::default()])?; }
+    ExecutionProvider::TensorRt => { builder = builder.with_execution_providers([TensorRTExecutionProvider::default()])?; }
+}
+```
+
+## Expected performance (NVIDIA T4, 1.7B model)
+
+| Segment length | CPU     | CUDA (T4) | Speedup |
+|----------------|---------|-----------|---------|
+| 5 s audio      | ~120 s  | ~8 s      | ~15×    |
+| 30 s audio     | ~700 s  | ~45 s     | ~15×    |
+
+*Estimates based on ORT CUDA EP throughput for similarly-sized transformer
+decode loops. Actual numbers depend on VRAM bandwidth and batch size.*
+
+## Open questions
+
+- **ORT CUDA version pinning** — ORT 2.0.0-rc.12 bundles specific CUDA/cuDNN
+  versions.
Verify compatibility with the target driver before shipping. +- **TensorRT engine caching** — TRT requires a one-time engine build per + (model, GPU, precision) tuple. Decide whether to ship pre-built engines or + build on first run. +- **Multi-GPU** — `CUDAExecutionProvider::default()` uses device 0. + Expose a `device_id` field in `BackendConfig` if needed. From 0060e40d3f1e72e2a8caeb49e20cdd6b41e19f13 Mon Sep 17 00:00:00 2001 From: Eason WaveKat Date: Tue, 7 Apr 2026 16:16:39 +1200 Subject: [PATCH 02/43] docs: rename 06-colab-cuda-gpu to 06-cuda-provider Co-Authored-By: Claude Sonnet 4.6 --- docs/{06-colab-cuda-gpu.md => 06-cuda-provider.md} | 0 1 file changed, 0 insertions(+), 0 deletions(-) rename docs/{06-colab-cuda-gpu.md => 06-cuda-provider.md} (100%) diff --git a/docs/06-colab-cuda-gpu.md b/docs/06-cuda-provider.md similarity index 100% rename from docs/06-colab-cuda-gpu.md rename to docs/06-cuda-provider.md From 28060717740dda2a3eaf0b25a8006ce91972f21b Mon Sep 17 00:00:00 2001 From: Eason WaveKat Date: Tue, 7 Apr 2026 16:17:24 +1200 Subject: [PATCH 03/43] docs: add Google Colab setup section to cuda-provider doc Co-Authored-By: Claude Sonnet 4.6 --- docs/06-cuda-provider.md | 63 ++++++++++++++++++++++++++++++++++++++++ 1 file changed, 63 insertions(+) diff --git a/docs/06-cuda-provider.md b/docs/06-cuda-provider.md index 6907d9a..2ac0d5a 100644 --- a/docs/06-cuda-provider.md +++ b/docs/06-cuda-provider.md @@ -110,6 +110,69 @@ match config.execution_provider { *Estimates based on ORT CUDA EP throughput for similarly-sized transformer decode loops. Actual numbers depend on VRAM bandwidth and batch size.* +## Google Colab setup + +Colab's free tier provides a T4 GPU (CUDA 12.x) — enough to run the 1.7B model +at real-time or faster with no local NVIDIA hardware required. + +### 1. Enable GPU runtime + +Runtime → Change runtime type → Hardware accelerator: **T4 GPU** → Save. + +Verify: + +```python +!nvidia-smi +``` + +### 2. 
Install Rust + +```python +!curl --proto '=https' --tlsv1.2 -sSf https://sh.rustup.rs | sh -s -- -y --default-toolchain stable +import os +os.environ["PATH"] = os.path.expanduser("~/.cargo/bin") + ":" + os.environ["PATH"] +!rustc --version +``` + +### 3. Clone and build + +```python +!git clone https://github.com/wavekat/wavekat-tts.git +%cd wavekat-tts +!cargo build --release --features "qwen3-tts,cuda" +``` + +### 4. Model weights + +The HF Hub downloader will fetch weights automatically on first run. +To persist across sessions, mount Google Drive and set `WAVEKAT_MODEL_DIR`: + +```python +from google.colab import drive +drive.mount('/content/drive') +``` + +```python +import os +os.environ["WAVEKAT_MODEL_DIR"] = "/content/drive/MyDrive/wavekat-models" +!cargo run --release --example synthesize --features "qwen3-tts,cuda" -- \ + --text "Hello from GPU" --out /content/output.wav +``` + +### 5. Download output + +```python +from google.colab import files +files.download('/content/output.wav') +``` + +### Notes + +- `/content` is wiped on disconnect — pin model weights to Drive to avoid + re-downloading each session. +- ORT bundles its own CUDA libraries; no manual driver configuration is needed + beyond selecting the T4 runtime. 
+ ## Open questions - **ORT CUDA version pinning** — ORT 2.0.0-rc.12 bundles specific CUDA/cuDNN From c91aeb8fe98baea4925b51f35951b519b34ea7e2 Mon Sep 17 00:00:00 2001 From: Eason WaveKat Date: Tue, 7 Apr 2026 16:24:27 +1200 Subject: [PATCH 04/43] docs: add Colab badge and execution provider flags to README Co-Authored-By: Claude Sonnet 4.6 --- README.md | 18 ++++++++++++++++++ 1 file changed, 18 insertions(+) diff --git a/README.md b/README.md index 8541d65..f649c67 100644 --- a/README.md +++ b/README.md @@ -90,13 +90,31 @@ cargo run --example synthesize --features qwen3-tts -- --precision fp32 "Hello" cargo run --example synthesize --features qwen3-tts -- --model-dir /path/to/model --output hello.wav "Hello" ``` +## Try it on Google Colab + +No local GPU needed — run Qwen3-TTS on a free T4 in the browser: + +[![Open In Colab](https://colab.research.google.com/assets/colab-badge.svg)](https://colab.research.google.com/drive/1qtc6lAk9RsAsvF1ojft0ACO2-PzFX4pi?usp=sharing) + ## Feature flags +### Backends + | Flag | Default | Description | |------|---------|-------------| | `qwen3-tts` | off | Qwen3-TTS local ONNX inference | | `cosyvoice` | off | CosyVoice local ONNX inference (planned) | +### Execution providers + +Composable with any backend flag. Selects the inference hardware at build time. + +| Flag | Description | +|------|-------------| +| `cuda` | NVIDIA CUDA GPU | +| `tensorrt` | NVIDIA TensorRT | +| `coreml` | Apple CoreML (macOS) | + WAV I/O (`write_wav` / `from_wav`) is provided by `wavekat-core` via its `wav` feature flag. ## License From 81901da6b13d702211b918fcc39c8eaeb8d3a7df Mon Sep 17 00:00:00 2001 From: Eason WaveKat Date: Tue, 7 Apr 2026 16:24:53 +1200 Subject: [PATCH 05/43] docs: remove WAV I/O note from README Co-Authored-By: Claude Sonnet 4.6 --- README.md | 2 -- 1 file changed, 2 deletions(-) diff --git a/README.md b/README.md index f649c67..78d9722 100644 --- a/README.md +++ b/README.md @@ -115,8 +115,6 @@ Composable with any backend flag. 
Selects the inference hardware at build time. | `tensorrt` | NVIDIA TensorRT | | `coreml` | Apple CoreML (macOS) | -WAV I/O (`write_wav` / `from_wav`) is provided by `wavekat-core` via its `wav` feature flag. - ## License Licensed under [Apache 2.0](LICENSE). From d8ddf7c273ef0d8f87672f965b87ba8ba5ab9452 Mon Sep 17 00:00:00 2001 From: Eason WaveKat Date: Tue, 7 Apr 2026 16:28:38 +1200 Subject: [PATCH 06/43] docs: fix Colab glibc incompatibility with ORT_STRATEGY=system Co-Authored-By: Claude Sonnet 4.6 --- docs/06-cuda-provider.md | 23 ++++++++++++++++++++--- 1 file changed, 20 insertions(+), 3 deletions(-) diff --git a/docs/06-cuda-provider.md b/docs/06-cuda-provider.md index 2ac0d5a..4f91f35 100644 --- a/docs/06-cuda-provider.md +++ b/docs/06-cuda-provider.md @@ -134,7 +134,24 @@ os.environ["PATH"] = os.path.expanduser("~/.cargo/bin") + ":" + os.environ["PATH !rustc --version ``` -### 3. Clone and build +### 3. Install ORT system libraries + +The prebuilt ORT binaries bundled by `ort-sys` (CUDA variant) require glibc 2.38+, +but Colab runs Ubuntu 22.04 (glibc 2.35). Use `ORT_STRATEGY=system` with the +pip-installed `onnxruntime-gpu`, which is compiled for Ubuntu 22.04: + +```python +!pip install -q onnxruntime-gpu==1.20.1 + +import onnxruntime, os +os.environ["ORT_STRATEGY"] = "system" +os.environ["ORT_LIB_LOCATION"] = os.path.dirname(onnxruntime.__file__) +``` + +These env vars must be set before `cargo build` so that `ort-sys` finds the +system ORT instead of downloading its own prebuilt binaries. + +### 5. Clone and build ```python !git clone https://github.com/wavekat/wavekat-tts.git @@ -142,7 +159,7 @@ os.environ["PATH"] = os.path.expanduser("~/.cargo/bin") + ":" + os.environ["PATH !cargo build --release --features "qwen3-tts,cuda" ``` -### 4. Model weights +### 6. Model weights The HF Hub downloader will fetch weights automatically on first run. 
To persist across sessions, mount Google Drive and set `WAVEKAT_MODEL_DIR`: @@ -159,7 +176,7 @@ os.environ["WAVEKAT_MODEL_DIR"] = "/content/drive/MyDrive/wavekat-models" --text "Hello from GPU" --out /content/output.wav ``` -### 5. Download output +### 7. Download output ```python from google.colab import files From 0c42f4b5ad3658d0af05499c15b89f3d58be653e Mon Sep 17 00:00:00 2001 From: Eason WaveKat Date: Tue, 7 Apr 2026 16:30:14 +1200 Subject: [PATCH 07/43] docs: fix ORT_LIB_LOCATION to point to capi/ subdirectory Co-Authored-By: Claude Sonnet 4.6 --- docs/06-cuda-provider.md | 3 ++- 1 file changed, 2 insertions(+), 1 deletion(-) diff --git a/docs/06-cuda-provider.md b/docs/06-cuda-provider.md index 4f91f35..af1c813 100644 --- a/docs/06-cuda-provider.md +++ b/docs/06-cuda-provider.md @@ -145,7 +145,8 @@ pip-installed `onnxruntime-gpu`, which is compiled for Ubuntu 22.04: import onnxruntime, os os.environ["ORT_STRATEGY"] = "system" -os.environ["ORT_LIB_LOCATION"] = os.path.dirname(onnxruntime.__file__) +# libonnxruntime.so lives in the capi/ subdirectory, not the package root +os.environ["ORT_LIB_LOCATION"] = os.path.join(os.path.dirname(onnxruntime.__file__), "capi") ``` These env vars must be set before `cargo build` so that `ort-sys` finds the From d9f2912a0468fdf61624b8fd5a7137361d98a46a Mon Sep 17 00:00:00 2001 From: Eason WaveKat Date: Tue, 7 Apr 2026 16:31:39 +1200 Subject: [PATCH 08/43] docs: create libonnxruntime.so symlink for ort-sys system strategy Co-Authored-By: Claude Sonnet 4.6 --- docs/06-cuda-provider.md | 16 +++++++++++++--- 1 file changed, 13 insertions(+), 3 deletions(-) diff --git a/docs/06-cuda-provider.md b/docs/06-cuda-provider.md index af1c813..a0ae19f 100644 --- a/docs/06-cuda-provider.md +++ b/docs/06-cuda-provider.md @@ -144,9 +144,19 @@ pip-installed `onnxruntime-gpu`, which is compiled for Ubuntu 22.04: !pip install -q onnxruntime-gpu==1.20.1 import onnxruntime, os -os.environ["ORT_STRATEGY"] = "system" -# libonnxruntime.so lives in 
the capi/ subdirectory, not the package root -os.environ["ORT_LIB_LOCATION"] = os.path.join(os.path.dirname(onnxruntime.__file__), "capi") + +capi_dir = os.path.join(os.path.dirname(onnxruntime.__file__), "capi") + +# The pip package ships libonnxruntime.so.1.20.1 but no unversioned symlink. +# ort-sys requires libonnxruntime.so — create it if missing. +so_plain = os.path.join(capi_dir, "libonnxruntime.so") +so_versioned = os.path.join(capi_dir, f"libonnxruntime.so.{onnxruntime.__version__}") +if os.path.exists(so_versioned) and not os.path.exists(so_plain): + os.symlink(so_versioned, so_plain) + +os.environ["ORT_STRATEGY"] = "system" +os.environ["ORT_LIB_LOCATION"] = capi_dir +os.environ["LD_LIBRARY_PATH"] = capi_dir + ":" + os.environ.get("LD_LIBRARY_PATH", "") ``` These env vars must be set before `cargo build` so that `ort-sys` finds the From a9e3035f45c232b81209aafdde8aaa886120e90a Mon Sep 17 00:00:00 2001 From: Eason WaveKat Date: Tue, 7 Apr 2026 16:32:28 +1200 Subject: [PATCH 09/43] docs: add shell variant for ORT setup in Colab terminal Co-Authored-By: Claude Sonnet 4.6 --- docs/06-cuda-provider.md | 15 +++++++++++++++ 1 file changed, 15 insertions(+) diff --git a/docs/06-cuda-provider.md b/docs/06-cuda-provider.md index a0ae19f..2325cb6 100644 --- a/docs/06-cuda-provider.md +++ b/docs/06-cuda-provider.md @@ -140,6 +140,8 @@ The prebuilt ORT binaries bundled by `ort-sys` (CUDA variant) require glibc 2.38 but Colab runs Ubuntu 22.04 (glibc 2.35). 
Use `ORT_STRATEGY=system` with the pip-installed `onnxruntime-gpu`, which is compiled for Ubuntu 22.04: +**Notebook cell:** + ```python !pip install -q onnxruntime-gpu==1.20.1 @@ -159,6 +161,19 @@ os.environ["ORT_LIB_LOCATION"] = capi_dir os.environ["LD_LIBRARY_PATH"] = capi_dir + ":" + os.environ.get("LD_LIBRARY_PATH", "") ``` +**Terminal (Colab shell):** + +```bash +pip install -q onnxruntime-gpu==1.20.1 + +CAPI=/usr/local/lib/python3.12/dist-packages/onnxruntime/capi +ln -sf $CAPI/libonnxruntime.so.1.20.1 $CAPI/libonnxruntime.so + +export ORT_STRATEGY=system +export ORT_LIB_LOCATION=$CAPI +export LD_LIBRARY_PATH=$CAPI:$LD_LIBRARY_PATH +``` + These env vars must be set before `cargo build` so that `ort-sys` finds the system ORT instead of downloading its own prebuilt binaries. From e2c392b48ce291336f1a3790532f431b38ca8e1c Mon Sep 17 00:00:00 2001 From: Eason WaveKat Date: Tue, 7 Apr 2026 16:36:36 +1200 Subject: [PATCH 10/43] docs: add ORT_PREFER_DYNAMIC_LINK=1 to fix static link fallback Co-Authored-By: Claude Sonnet 4.6 --- docs/06-cuda-provider.md | 8 +++++--- 1 file changed, 5 insertions(+), 3 deletions(-) diff --git a/docs/06-cuda-provider.md b/docs/06-cuda-provider.md index 2325cb6..1c1e901 100644 --- a/docs/06-cuda-provider.md +++ b/docs/06-cuda-provider.md @@ -156,9 +156,10 @@ so_versioned = os.path.join(capi_dir, f"libonnxruntime.so.{onnxruntime.__version if os.path.exists(so_versioned) and not os.path.exists(so_plain): os.symlink(so_versioned, so_plain) -os.environ["ORT_STRATEGY"] = "system" -os.environ["ORT_LIB_LOCATION"] = capi_dir -os.environ["LD_LIBRARY_PATH"] = capi_dir + ":" + os.environ.get("LD_LIBRARY_PATH", "") +os.environ["ORT_STRATEGY"] = "system" +os.environ["ORT_LIB_LOCATION"] = capi_dir +os.environ["ORT_PREFER_DYNAMIC_LINK"] = "1" +os.environ["LD_LIBRARY_PATH"] = capi_dir + ":" + os.environ.get("LD_LIBRARY_PATH", "") ``` **Terminal (Colab shell):** @@ -171,6 +172,7 @@ ln -sf $CAPI/libonnxruntime.so.1.20.1 $CAPI/libonnxruntime.so export 
ORT_STRATEGY=system export ORT_LIB_LOCATION=$CAPI +export ORT_PREFER_DYNAMIC_LINK=1 export LD_LIBRARY_PATH=$CAPI:$LD_LIBRARY_PATH ``` From f7081d2370dd26b85bec5756211f1ce3fbb1d159 Mon Sep 17 00:00:00 2001 From: Eason WaveKat Date: Tue, 7 Apr 2026 16:38:37 +1200 Subject: [PATCH 11/43] docs: add libonnxruntime.so.1 SONAME symlink for runtime linker Co-Authored-By: Claude Sonnet 4.6 --- docs/06-cuda-provider.md | 8 +++++++- 1 file changed, 7 insertions(+), 1 deletion(-) diff --git a/docs/06-cuda-provider.md b/docs/06-cuda-provider.md index 1c1e901..48b093e 100644 --- a/docs/06-cuda-provider.md +++ b/docs/06-cuda-provider.md @@ -151,10 +151,15 @@ capi_dir = os.path.join(os.path.dirname(onnxruntime.__file__), "capi") # The pip package ships libonnxruntime.so.1.20.1 but no unversioned symlink. # ort-sys requires libonnxruntime.so — create it if missing. -so_plain = os.path.join(capi_dir, "libonnxruntime.so") so_versioned = os.path.join(capi_dir, f"libonnxruntime.so.{onnxruntime.__version__}") +# ort-sys needs libonnxruntime.so (unversioned) for linking +so_plain = os.path.join(capi_dir, "libonnxruntime.so") if os.path.exists(so_versioned) and not os.path.exists(so_plain): os.symlink(so_versioned, so_plain) +# runtime linker resolves the SONAME libonnxruntime.so.1 (major version only) +so_major = os.path.join(capi_dir, "libonnxruntime.so.1") +if os.path.exists(so_versioned) and not os.path.exists(so_major): + os.symlink(so_versioned, so_major) os.environ["ORT_STRATEGY"] = "system" os.environ["ORT_LIB_LOCATION"] = capi_dir @@ -168,6 +173,7 @@ os.environ["LD_LIBRARY_PATH"] = capi_dir + ":" + os.environ.get("LD_LIBRAR pip install -q onnxruntime-gpu==1.20.1 CAPI=/usr/local/lib/python3.12/dist-packages/onnxruntime/capi +ln -sf $CAPI/libonnxruntime.so.1.20.1 $CAPI/libonnxruntime.so.1 ln -sf $CAPI/libonnxruntime.so.1.20.1 $CAPI/libonnxruntime.so export ORT_STRATEGY=system From 016b455f3694cdad5fd5f8a1518057bcff843e7a Mon Sep 17 00:00:00 2001 From: Eason WaveKat Date: Tue, 7 Apr 
2026 16:42:00 +1200 Subject: [PATCH 12/43] docs: fix ORT API version mismatch, remove version pin Co-Authored-By: Claude Sonnet 4.6 --- docs/06-cuda-provider.md | 48 ++++++++++++++++++++++++++-------------- 1 file changed, 31 insertions(+), 17 deletions(-) diff --git a/docs/06-cuda-provider.md b/docs/06-cuda-provider.md index 48b093e..65ac611 100644 --- a/docs/06-cuda-provider.md +++ b/docs/06-cuda-provider.md @@ -136,45 +136,59 @@ os.environ["PATH"] = os.path.expanduser("~/.cargo/bin") + ":" + os.environ["PATH ### 3. Install ORT system libraries -The prebuilt ORT binaries bundled by `ort-sys` (CUDA variant) require glibc 2.38+, -but Colab runs Ubuntu 22.04 (glibc 2.35). Use `ORT_STRATEGY=system` with the -pip-installed `onnxruntime-gpu`, which is compiled for Ubuntu 22.04: +The prebuilt ORT CUDA binaries from `ort-sys` require glibc 2.38+, but Colab +runs Ubuntu 22.04 (glibc 2.35). Use `ORT_STRATEGY=system` with the pip-installed +`onnxruntime-gpu` instead, which is compiled for Ubuntu 22.04. + +**Version requirement:** `ort-sys` 2.0.0-rc.12 requests ORT C API version 24. +The pip package version must match — `onnxruntime-gpu` N.x ships API version N +(e.g. `1.24.x` → API 24). Install the matching version: + +```bash +pip install onnxruntime-gpu==1.24.0 # adjust patch if needed +``` + +Or install the latest and verify: + +```bash +pip install -U onnxruntime-gpu +python -c "import onnxruntime; print(onnxruntime.__version__)" +``` + +After installing, create the symlinks the linker needs (the pip package ships +only the versioned `.so`, not the plain or major-version names): **Notebook cell:** ```python -!pip install -q onnxruntime-gpu==1.20.1 - import onnxruntime, os capi_dir = os.path.join(os.path.dirname(onnxruntime.__file__), "capi") - -# The pip package ships libonnxruntime.so.1.20.1 but no unversioned symlink. -# ort-sys requires libonnxruntime.so — create it if missing. 
so_versioned = os.path.join(capi_dir, f"libonnxruntime.so.{onnxruntime.__version__}") -# ort-sys needs libonnxruntime.so (unversioned) for linking + +# ort-sys build script needs libonnxruntime.so (unversioned) so_plain = os.path.join(capi_dir, "libonnxruntime.so") if os.path.exists(so_versioned) and not os.path.exists(so_plain): os.symlink(so_versioned, so_plain) -# runtime linker resolves the SONAME libonnxruntime.so.1 (major version only) + +# runtime linker resolves the ELF SONAME libonnxruntime.so.1 (major version) so_major = os.path.join(capi_dir, "libonnxruntime.so.1") if os.path.exists(so_versioned) and not os.path.exists(so_major): os.symlink(so_versioned, so_major) -os.environ["ORT_STRATEGY"] = "system" -os.environ["ORT_LIB_LOCATION"] = capi_dir +os.environ["ORT_STRATEGY"] = "system" +os.environ["ORT_LIB_LOCATION"] = capi_dir os.environ["ORT_PREFER_DYNAMIC_LINK"] = "1" -os.environ["LD_LIBRARY_PATH"] = capi_dir + ":" + os.environ.get("LD_LIBRARY_PATH", "") +os.environ["LD_LIBRARY_PATH"] = capi_dir + ":" + os.environ.get("LD_LIBRARY_PATH", "") ``` **Terminal (Colab shell):** ```bash -pip install -q onnxruntime-gpu==1.20.1 - CAPI=/usr/local/lib/python3.12/dist-packages/onnxruntime/capi -ln -sf $CAPI/libonnxruntime.so.1.20.1 $CAPI/libonnxruntime.so.1 -ln -sf $CAPI/libonnxruntime.so.1.20.1 $CAPI/libonnxruntime.so +ORT_VER=$(python -c "import onnxruntime; print(onnxruntime.__version__)") +ln -sf $CAPI/libonnxruntime.so.$ORT_VER $CAPI/libonnxruntime.so.1 +ln -sf $CAPI/libonnxruntime.so.$ORT_VER $CAPI/libonnxruntime.so export ORT_STRATEGY=system export ORT_LIB_LOCATION=$CAPI From c7bc938892d0cfdf4009663157716aebb902433a Mon Sep 17 00:00:00 2001 From: Eason WaveKat Date: Tue, 7 Apr 2026 16:44:33 +1200 Subject: [PATCH 13/43] docs: use lexists to handle stale symlinks on ORT version upgrade Co-Authored-By: Claude Sonnet 4.6 --- docs/06-cuda-provider.md | 18 +++++++++--------- 1 file changed, 9 insertions(+), 9 deletions(-) diff --git a/docs/06-cuda-provider.md 
b/docs/06-cuda-provider.md index 65ac611..9b8be1b 100644 --- a/docs/06-cuda-provider.md +++ b/docs/06-cuda-provider.md @@ -166,15 +166,15 @@ import onnxruntime, os capi_dir = os.path.join(os.path.dirname(onnxruntime.__file__), "capi") so_versioned = os.path.join(capi_dir, f"libonnxruntime.so.{onnxruntime.__version__}") -# ort-sys build script needs libonnxruntime.so (unversioned) -so_plain = os.path.join(capi_dir, "libonnxruntime.so") -if os.path.exists(so_versioned) and not os.path.exists(so_plain): - os.symlink(so_versioned, so_plain) - -# runtime linker resolves the ELF SONAME libonnxruntime.so.1 (major version) -so_major = os.path.join(capi_dir, "libonnxruntime.so.1") -if os.path.exists(so_versioned) and not os.path.exists(so_major): - os.symlink(so_versioned, so_major) +# Force-create symlinks — os.path.lexists catches stale/broken symlinks +# that os.path.exists would miss (e.g. left over from a previous ORT version). +for link in [ + os.path.join(capi_dir, "libonnxruntime.so"), # ort-sys build script + os.path.join(capi_dir, "libonnxruntime.so.1"), # runtime ELF SONAME +]: + if os.path.lexists(link): + os.remove(link) + os.symlink(so_versioned, link) os.environ["ORT_STRATEGY"] = "system" os.environ["ORT_LIB_LOCATION"] = capi_dir From 7a9d9b7345b0476f551085960226b2c9d2495807 Mon Sep 17 00:00:00 2001 From: Eason WaveKat Date: Tue, 7 Apr 2026 16:53:19 +1200 Subject: [PATCH 14/43] feat: download into WAVEKAT_MODEL_DIR when files are missing Co-Authored-By: Claude Sonnet 4.6 --- .../src/backends/qwen3_tts/download.rs | 20 ++++++++++++++++--- 1 file changed, 17 insertions(+), 3 deletions(-) diff --git a/crates/wavekat-tts/src/backends/qwen3_tts/download.rs b/crates/wavekat-tts/src/backends/qwen3_tts/download.rs index 480ea01..c426678 100644 --- a/crates/wavekat-tts/src/backends/qwen3_tts/download.rs +++ b/crates/wavekat-tts/src/backends/qwen3_tts/download.rs @@ -85,15 +85,29 @@ pub fn resolve_model_dir(config: &super::ModelConfig) -> Result { + let path = 
PathBuf::from(&dir); + if path.join("config.json").exists() { + return Ok(path); + } + Some(path) + } + Err(_) => None, + }; let precision = config.precision; // from_env() reads HF_HOME / HF_ENDPOINT. // Bridge HF_TOKEN which hf-hub doesn't read from the environment natively. let mut builder = ApiBuilder::from_env(); + if let Some(ref dir) = cache_dir_override { + builder = builder.with_cache_dir(dir.clone()); + } if let Ok(token) = std::env::var("HF_TOKEN") { if !token.is_empty() { builder = builder.with_token(Some(token)); From 8857d45acfe5b399b9eb26b236c83c2715ba7964 Mon Sep 17 00:00:00 2001 From: Eason WaveKat Date: Tue, 7 Apr 2026 17:03:00 +1200 Subject: [PATCH 15/43] docs: fix ORT symlink escapes model dir via cp -rL from Drive Co-Authored-By: Claude Sonnet 4.6 --- docs/06-cuda-provider.md | 35 +++++++++++++++++++++++++++++------ 1 file changed, 29 insertions(+), 6 deletions(-) diff --git a/docs/06-cuda-provider.md b/docs/06-cuda-provider.md index 9b8be1b..6bec801 100644 --- a/docs/06-cuda-provider.md +++ b/docs/06-cuda-provider.md @@ -209,18 +209,41 @@ system ORT instead of downloading its own prebuilt binaries. ### 6. Model weights -The HF Hub downloader will fetch weights automatically on first run. -To persist across sessions, mount Google Drive and set `WAVEKAT_MODEL_DIR`: +Mount Drive for persistent storage, then copy the model to local `/content/` +before loading. ORT 1.24 added a security check that rejects `.onnx.data` +external data paths resolving outside the model directory — HF Hub stores files +as symlinks (`int4/talker_prefill.onnx.data → ../../blobs/...`) which trigger +this check when accessed via the Drive FUSE mount. Copying with `cp -rL` +dereferences symlinks into real files. `/content/` is local NVMe so loading is +also faster than from Drive. 
```python from google.colab import drive drive.mount('/content/drive') ``` -```python -import os -os.environ["WAVEKAT_MODEL_DIR"] = "/content/drive/MyDrive/wavekat-models" -!cargo run --release --example synthesize --features "qwen3-tts,cuda" -- \ +```bash +LOCAL=/content/wavekat-model + +# First run: download to Drive. Subsequent runs: find the cached snapshot. +if [ ! -f "$LOCAL/config.json" ]; then + SNAPSHOT=$(ls -d /content/drive/MyDrive/wavekat-models/models--*/snapshots/*/ 2>/dev/null | head -1) + if [ -n "$SNAPSHOT" ]; then + echo "Copying model Drive → local (resolving symlinks)..." + cp -rL "$SNAPSHOT/." "$LOCAL/" + else + # No Drive cache yet — let WAVEKAT_MODEL_DIR trigger a fresh download + echo "Drive cache not found, will download..." + export WAVEKAT_MODEL_DIR=/content/drive/MyDrive/wavekat-models + fi +fi + +# Once local copy exists, point directly at it +[ -f "$LOCAL/config.json" ] && export WAVEKAT_MODEL_DIR=$LOCAL +``` + +```bash +cargo run --release --example synthesize --features "qwen3-tts,cuda" -- \ --text "Hello from GPU" --out /content/output.wav ``` From b623c6076873797d6c8e12c12a13e4f5d9e130a3 Mon Sep 17 00:00:00 2001 From: Eason WaveKat Date: Tue, 7 Apr 2026 17:05:05 +1200 Subject: [PATCH 16/43] docs: convert model copy script to Python notebook cell Co-Authored-By: Claude Sonnet 4.6 --- docs/06-cuda-provider.md | 38 ++++++++++++++++++++------------------ 1 file changed, 20 insertions(+), 18 deletions(-) diff --git a/docs/06-cuda-provider.md b/docs/06-cuda-provider.md index 6bec801..1a4cc43 100644 --- a/docs/06-cuda-provider.md +++ b/docs/06-cuda-provider.md @@ -222,24 +222,26 @@ from google.colab import drive drive.mount('/content/drive') ``` -```bash -LOCAL=/content/wavekat-model - -# First run: download to Drive. Subsequent runs: find the cached snapshot. -if [ ! 
-f "$LOCAL/config.json" ]; then - SNAPSHOT=$(ls -d /content/drive/MyDrive/wavekat-models/models--*/snapshots/*/ 2>/dev/null | head -1) - if [ -n "$SNAPSHOT" ]; then - echo "Copying model Drive → local (resolving symlinks)..." - cp -rL "$SNAPSHOT/." "$LOCAL/" - else - # No Drive cache yet — let WAVEKAT_MODEL_DIR trigger a fresh download - echo "Drive cache not found, will download..." - export WAVEKAT_MODEL_DIR=/content/drive/MyDrive/wavekat-models - fi -fi - -# Once local copy exists, point directly at it -[ -f "$LOCAL/config.json" ] && export WAVEKAT_MODEL_DIR=$LOCAL +```python +import glob, os, shutil + +LOCAL = "/content/wavekat-model" +DRIVE = "/content/drive/MyDrive/wavekat-models" + +if not os.path.isfile(f"{LOCAL}/config.json"): + snapshots = glob.glob(f"{DRIVE}/models--*/snapshots/*/") + if snapshots: + snapshot = snapshots[0] + print(f"Copying {snapshot} → {LOCAL} (resolving symlinks)...") + shutil.copytree(snapshot, LOCAL, symlinks=False, dirs_exist_ok=True) + print("Done.") + else: + print("Drive cache not found — will download to Drive on first run.") + os.environ["WAVEKAT_MODEL_DIR"] = DRIVE + +if os.path.isfile(f"{LOCAL}/config.json"): + os.environ["WAVEKAT_MODEL_DIR"] = LOCAL + print(f"WAVEKAT_MODEL_DIR={LOCAL}") ``` ```bash From 1d57379f87e054e711bae67be20e2a29825597c3 Mon Sep 17 00:00:00 2001 From: Eason WaveKat Date: Tue, 7 Apr 2026 17:07:57 +1200 Subject: [PATCH 17/43] feat: add --provider flag to synthesize example Co-Authored-By: Claude Sonnet 4.6 --- crates/wavekat-tts/examples/synthesize.rs | 22 ++++++++++++++++++++-- 1 file changed, 20 insertions(+), 2 deletions(-) diff --git a/crates/wavekat-tts/examples/synthesize.rs b/crates/wavekat-tts/examples/synthesize.rs index f0b3ca4..93e9e9e 100644 --- a/crates/wavekat-tts/examples/synthesize.rs +++ b/crates/wavekat-tts/examples/synthesize.rs @@ -6,6 +6,7 @@ //! Options: //! --model-dir Model directory (default: auto-download to cache) //! --precision Model precision: int4 (default) or fp32 +//! 
--provider Execution provider: cpu (default), cuda, tensorrt, coreml //! --language Language code (default: en) //! --instruction Voice style instruction (VoiceDesign prompt) //! Default: "Speak naturally and clearly." @@ -29,7 +30,7 @@ use std::io::{self, BufRead, Write}; use std::path::PathBuf; -use wavekat_tts::backends::qwen3_tts::{ModelConfig, ModelPrecision, Qwen3Tts}; +use wavekat_tts::backends::qwen3_tts::{ExecutionProvider, ModelConfig, ModelPrecision, Qwen3Tts}; use wavekat_tts::{SynthesizeRequest, TtsBackend}; const DEFAULT_INSTRUCTION: &str = "Speak naturally and clearly."; @@ -39,6 +40,7 @@ fn main() { let mut model_dir: Option = None; let mut precision = ModelPrecision::Int4; + let mut provider = ExecutionProvider::Cpu; let mut language = "en".to_string(); let mut instruction: Option = None; let mut output = PathBuf::from("output.wav"); @@ -63,6 +65,19 @@ fn main() { } }; } + "--provider" => { + i += 1; + provider = match args[i].as_str() { + "cpu" => ExecutionProvider::Cpu, + "cuda" => ExecutionProvider::Cuda, + "tensorrt" => ExecutionProvider::TensorRt, + "coreml" => ExecutionProvider::CoreMl, + other => { + eprintln!("error: unknown provider \"{other}\", expected cpu, cuda, tensorrt, or coreml"); + std::process::exit(1); + } + }; + } "--language" => { i += 1; language = args[i].clone(); @@ -86,6 +101,7 @@ fn main() { eprintln!("Usage: synthesize [OPTIONS] [TEXT]"); eprintln!(" --model-dir Model directory (default: auto-download)"); eprintln!(" --precision Model precision: int4 (default) or fp32"); + eprintln!(" --provider Execution provider: cpu (default), cuda, tensorrt, coreml"); eprintln!(" --language Language code (default: en)"); eprintln!(" --instruction Voice style instruction (VoiceDesign prompt)"); eprintln!(" Default: \"{DEFAULT_INSTRUCTION}\""); @@ -100,7 +116,9 @@ fn main() { } eprintln!("Loading model ..."); - let mut config = ModelConfig::default().with_precision(precision); + let mut config = ModelConfig::default() + 
.with_precision(precision) + .with_execution_provider(provider); if let Some(dir) = model_dir { config = config.with_dir(dir); } From adeba06e7c7bdbf26e3418f84fc1d329b46b9b35 Mon Sep 17 00:00:00 2001 From: Eason WaveKat Date: Tue, 7 Apr 2026 17:59:43 +1200 Subject: [PATCH 18/43] fix: use SameAsRequested arena strategy for CUDA EP ORT's default kNextPowerOfTwo doubles the GPU memory arena on each extension, causing monotonic growth across synthesis calls. Switching to SameAsRequested limits allocation to actual peak usage. Co-Authored-By: Claude Sonnet 4.6 --- crates/wavekat-tts/src/backends/qwen3_tts/model.rs | 7 +++++-- 1 file changed, 5 insertions(+), 2 deletions(-) diff --git a/crates/wavekat-tts/src/backends/qwen3_tts/model.rs b/crates/wavekat-tts/src/backends/qwen3_tts/model.rs index 15cfefc..b8dc96a 100644 --- a/crates/wavekat-tts/src/backends/qwen3_tts/model.rs +++ b/crates/wavekat-tts/src/backends/qwen3_tts/model.rs @@ -640,12 +640,15 @@ fn apply_execution_provider( ep: super::ExecutionProvider, ) -> Result { use ort::execution_providers::{ - CUDAExecutionProvider, CoreMLExecutionProvider, TensorRTExecutionProvider, + ArenaExtendStrategy, CUDAExecutionProvider, CoreMLExecutionProvider, + TensorRTExecutionProvider, }; match ep { super::ExecutionProvider::Cpu => Ok(builder), super::ExecutionProvider::Cuda => builder - .with_execution_providers([CUDAExecutionProvider::default().build()]) + .with_execution_providers([CUDAExecutionProvider::default() + .with_arena_extend_strategy(ArenaExtendStrategy::SameAsRequested) + .build()]) .map_err(|e| TtsError::Model(format!("CUDA execution provider error: {e}"))), super::ExecutionProvider::TensorRt => builder .with_execution_providers([TensorRTExecutionProvider::default().build()]) From 0ddf74605bd0627de1fd0ecfd5bfda0c7cb33f9f Mon Sep 17 00:00:00 2001 From: Eason WaveKat Date: Tue, 7 Apr 2026 21:11:40 +1200 Subject: [PATCH 19/43] fix: shim missing glibc 2.38 C23 strto* symbols for Colab Co-Authored-By: Claude Sonnet 4.6 
--- crates/wavekat-tts/src/glibc_compat.rs | 39 ++++++++++++++++++++++++++ crates/wavekat-tts/src/lib.rs | 5 ++++ 2 files changed, 44 insertions(+) create mode 100644 crates/wavekat-tts/src/glibc_compat.rs diff --git a/crates/wavekat-tts/src/glibc_compat.rs b/crates/wavekat-tts/src/glibc_compat.rs new file mode 100644 index 0000000..b3c79db --- /dev/null +++ b/crates/wavekat-tts/src/glibc_compat.rs @@ -0,0 +1,39 @@ +// glibc 2.38 introduced C23 variants of the strto* family (ISO C23 §7.22.1). +// ORT prebuilt binaries compiled on newer toolchains emit references to these +// symbols, but Ubuntu 22.04 (glibc 2.35) — used by Google Colab and many CI +// hosts — does not provide them. Define thin wrappers so the linker is happy. +use std::ffi::c_char; +use std::os::raw::{c_int, c_long, c_longlong, c_ulonglong}; + +extern "C" { + fn strtol(nptr: *const c_char, endptr: *mut *mut c_char, base: c_int) -> c_long; + fn strtoll(nptr: *const c_char, endptr: *mut *mut c_char, base: c_int) -> c_longlong; + fn strtoull(nptr: *const c_char, endptr: *mut *mut c_char, base: c_int) -> c_ulonglong; +} + +#[no_mangle] +pub unsafe extern "C" fn __isoc23_strtol( + nptr: *const c_char, + endptr: *mut *mut c_char, + base: c_int, +) -> c_long { + strtol(nptr, endptr, base) +} + +#[no_mangle] +pub unsafe extern "C" fn __isoc23_strtoll( + nptr: *const c_char, + endptr: *mut *mut c_char, + base: c_int, +) -> c_longlong { + strtoll(nptr, endptr, base) +} + +#[no_mangle] +pub unsafe extern "C" fn __isoc23_strtoull( + nptr: *const c_char, + endptr: *mut *mut c_char, + base: c_int, +) -> c_ulonglong { + strtoull(nptr, endptr, base) +} diff --git a/crates/wavekat-tts/src/lib.rs b/crates/wavekat-tts/src/lib.rs index 9aa1e97..e2557ae 100644 --- a/crates/wavekat-tts/src/lib.rs +++ b/crates/wavekat-tts/src/lib.rs @@ -60,6 +60,11 @@ mod error; mod traits; mod types; +// Provide missing glibc 2.38+ C23 strto* symbols for older Linux hosts (e.g. 
+// Ubuntu 22.04 / Colab) when linking ORT prebuilt CUDA/TensorRT binaries. +#[cfg(all(target_os = "linux", any(feature = "qwen3-tts", feature = "cosyvoice")))] +mod glibc_compat; + pub mod backends; pub use error::TtsError; From 547d2148e877c886d53eb3daa16cc79a32fc51be Mon Sep 17 00:00:00 2001 From: Eason WaveKat Date: Tue, 7 Apr 2026 21:16:56 +1200 Subject: [PATCH 20/43] fix: error on EP failure instead of silent CPU fallback Co-Authored-By: Claude Sonnet 4.6 --- crates/wavekat-tts/src/backends/qwen3_tts/model.rs | 11 ++++++++--- 1 file changed, 8 insertions(+), 3 deletions(-) diff --git a/crates/wavekat-tts/src/backends/qwen3_tts/model.rs b/crates/wavekat-tts/src/backends/qwen3_tts/model.rs index b8dc96a..e9d7653 100644 --- a/crates/wavekat-tts/src/backends/qwen3_tts/model.rs +++ b/crates/wavekat-tts/src/backends/qwen3_tts/model.rs @@ -648,13 +648,18 @@ fn apply_execution_provider( super::ExecutionProvider::Cuda => builder .with_execution_providers([CUDAExecutionProvider::default() .with_arena_extend_strategy(ArenaExtendStrategy::SameAsRequested) - .build()]) + .build() + .error_on_failure()]) .map_err(|e| TtsError::Model(format!("CUDA execution provider error: {e}"))), super::ExecutionProvider::TensorRt => builder - .with_execution_providers([TensorRTExecutionProvider::default().build()]) + .with_execution_providers([TensorRTExecutionProvider::default() + .build() + .error_on_failure()]) .map_err(|e| TtsError::Model(format!("TensorRT execution provider error: {e}"))), super::ExecutionProvider::CoreMl => builder - .with_execution_providers([CoreMLExecutionProvider::default().build()]) + .with_execution_providers([CoreMLExecutionProvider::default() + .build() + .error_on_failure()]) .map_err(|e| TtsError::Model(format!("CoreML execution provider error: {e}"))), } } From 92c8182bb9f22e0b2c8ae5d50fc0a429688fc2d3 Mon Sep 17 00:00:00 2001 From: Eason WaveKat Date: Tue, 7 Apr 2026 21:20:42 +1200 Subject: [PATCH 21/43] feat: add load-dynamic feature for glibc 2.35 
compat (Colab) Co-Authored-By: Claude Sonnet 4.6 --- crates/wavekat-tts/Cargo.toml | 6 ++++++ crates/wavekat-tts/src/lib.rs | 11 ++++++++--- 2 files changed, 14 insertions(+), 3 deletions(-) diff --git a/crates/wavekat-tts/Cargo.toml b/crates/wavekat-tts/Cargo.toml index 22c7429..12b4901 100644 --- a/crates/wavekat-tts/Cargo.toml +++ b/crates/wavekat-tts/Cargo.toml @@ -20,6 +20,12 @@ coreml = ["ort?/coreml"] cuda = ["ort?/cuda"] tensorrt = ["ort?/tensorrt"] +# Use dlopen to load a system/pip-installed libonnxruntime.so at runtime instead +# of the bundled prebuilt binary. Required on Linux hosts with glibc < 2.38 +# (e.g. Ubuntu 22.04 / Google Colab) because the bundled CUDA EP plugin is +# built against glibc 2.38. Set ORT_DYLIB_PATH to the library path at runtime. +load-dynamic = ["ort?/load-dynamic"] + [dependencies] wavekat-core = { version = "0.0.5", features = ["wav"] } thiserror = "2" diff --git a/crates/wavekat-tts/src/lib.rs b/crates/wavekat-tts/src/lib.rs index e2557ae..424b857 100644 --- a/crates/wavekat-tts/src/lib.rs +++ b/crates/wavekat-tts/src/lib.rs @@ -60,9 +60,14 @@ mod error; mod traits; mod types; -// Provide missing glibc 2.38+ C23 strto* symbols for older Linux hosts (e.g. -// Ubuntu 22.04 / Colab) when linking ORT prebuilt CUDA/TensorRT binaries. -#[cfg(all(target_os = "linux", any(feature = "qwen3-tts", feature = "cosyvoice")))] +// Provide missing glibc 2.38+ C23 strto* symbols when statically linking the +// bundled ORT on older Linux hosts (e.g. Ubuntu 22.04 / Colab). +// Not needed with `load-dynamic`, which skips the bundled static ORT entirely. 
+#[cfg(all( + target_os = "linux", + any(feature = "qwen3-tts", feature = "cosyvoice"), + not(feature = "load-dynamic"), +))] mod glibc_compat; pub mod backends; From f34e153e8203739e305885e0d31e6f86fb28da5c Mon Sep 17 00:00:00 2001 From: Eason WaveKat Date: Tue, 7 Apr 2026 21:26:59 +1200 Subject: [PATCH 22/43] revert: remove glibc compat workarounds Minimum supported Linux is Ubuntu 24.04 (glibc 2.38+). Co-Authored-By: Claude Sonnet 4.6 --- crates/wavekat-tts/Cargo.toml | 6 --- .../src/backends/qwen3_tts/model.rs | 11 ++---- crates/wavekat-tts/src/glibc_compat.rs | 39 ------------------- crates/wavekat-tts/src/lib.rs | 10 ----- 4 files changed, 3 insertions(+), 63 deletions(-) delete mode 100644 crates/wavekat-tts/src/glibc_compat.rs diff --git a/crates/wavekat-tts/Cargo.toml b/crates/wavekat-tts/Cargo.toml index 12b4901..22c7429 100644 --- a/crates/wavekat-tts/Cargo.toml +++ b/crates/wavekat-tts/Cargo.toml @@ -20,12 +20,6 @@ coreml = ["ort?/coreml"] cuda = ["ort?/cuda"] tensorrt = ["ort?/tensorrt"] -# Use dlopen to load a system/pip-installed libonnxruntime.so at runtime instead -# of the bundled prebuilt binary. Required on Linux hosts with glibc < 2.38 -# (e.g. Ubuntu 22.04 / Google Colab) because the bundled CUDA EP plugin is -# built against glibc 2.38. Set ORT_DYLIB_PATH to the library path at runtime. 
-load-dynamic = ["ort?/load-dynamic"] - [dependencies] wavekat-core = { version = "0.0.5", features = ["wav"] } thiserror = "2" diff --git a/crates/wavekat-tts/src/backends/qwen3_tts/model.rs b/crates/wavekat-tts/src/backends/qwen3_tts/model.rs index e9d7653..b8dc96a 100644 --- a/crates/wavekat-tts/src/backends/qwen3_tts/model.rs +++ b/crates/wavekat-tts/src/backends/qwen3_tts/model.rs @@ -648,18 +648,13 @@ fn apply_execution_provider( super::ExecutionProvider::Cuda => builder .with_execution_providers([CUDAExecutionProvider::default() .with_arena_extend_strategy(ArenaExtendStrategy::SameAsRequested) - .build() - .error_on_failure()]) + .build()]) .map_err(|e| TtsError::Model(format!("CUDA execution provider error: {e}"))), super::ExecutionProvider::TensorRt => builder - .with_execution_providers([TensorRTExecutionProvider::default() - .build() - .error_on_failure()]) + .with_execution_providers([TensorRTExecutionProvider::default().build()]) .map_err(|e| TtsError::Model(format!("TensorRT execution provider error: {e}"))), super::ExecutionProvider::CoreMl => builder - .with_execution_providers([CoreMLExecutionProvider::default() - .build() - .error_on_failure()]) + .with_execution_providers([CoreMLExecutionProvider::default().build()]) .map_err(|e| TtsError::Model(format!("CoreML execution provider error: {e}"))), } } diff --git a/crates/wavekat-tts/src/glibc_compat.rs b/crates/wavekat-tts/src/glibc_compat.rs deleted file mode 100644 index b3c79db..0000000 --- a/crates/wavekat-tts/src/glibc_compat.rs +++ /dev/null @@ -1,39 +0,0 @@ -// glibc 2.38 introduced C23 variants of the strto* family (ISO C23 §7.22.1). -// ORT prebuilt binaries compiled on newer toolchains emit references to these -// symbols, but Ubuntu 22.04 (glibc 2.35) — used by Google Colab and many CI -// hosts — does not provide them. Define thin wrappers so the linker is happy. 
-use std::ffi::c_char; -use std::os::raw::{c_int, c_long, c_longlong, c_ulonglong}; - -extern "C" { - fn strtol(nptr: *const c_char, endptr: *mut *mut c_char, base: c_int) -> c_long; - fn strtoll(nptr: *const c_char, endptr: *mut *mut c_char, base: c_int) -> c_longlong; - fn strtoull(nptr: *const c_char, endptr: *mut *mut c_char, base: c_int) -> c_ulonglong; -} - -#[no_mangle] -pub unsafe extern "C" fn __isoc23_strtol( - nptr: *const c_char, - endptr: *mut *mut c_char, - base: c_int, -) -> c_long { - strtol(nptr, endptr, base) -} - -#[no_mangle] -pub unsafe extern "C" fn __isoc23_strtoll( - nptr: *const c_char, - endptr: *mut *mut c_char, - base: c_int, -) -> c_longlong { - strtoll(nptr, endptr, base) -} - -#[no_mangle] -pub unsafe extern "C" fn __isoc23_strtoull( - nptr: *const c_char, - endptr: *mut *mut c_char, - base: c_int, -) -> c_ulonglong { - strtoull(nptr, endptr, base) -} diff --git a/crates/wavekat-tts/src/lib.rs b/crates/wavekat-tts/src/lib.rs index 424b857..9aa1e97 100644 --- a/crates/wavekat-tts/src/lib.rs +++ b/crates/wavekat-tts/src/lib.rs @@ -60,16 +60,6 @@ mod error; mod traits; mod types; -// Provide missing glibc 2.38+ C23 strto* symbols when statically linking the -// bundled ORT on older Linux hosts (e.g. Ubuntu 22.04 / Colab). -// Not needed with `load-dynamic`, which skips the bundled static ORT entirely. 
-#[cfg(all( - target_os = "linux", - any(feature = "qwen3-tts", feature = "cosyvoice"), - not(feature = "load-dynamic"), -))] -mod glibc_compat; - pub mod backends; pub use error::TtsError; From 8c0b3941e5b927fa8fe88a05dd035c3404aea3d0 Mon Sep 17 00:00:00 2001 From: Eason WaveKat Date: Tue, 7 Apr 2026 21:38:12 +1200 Subject: [PATCH 23/43] fix: hard-link HF Hub symlinks so ORT external data validation passes Co-Authored-By: Claude Sonnet 4.6 --- .../src/backends/qwen3_tts/model.rs | 47 ++++++++++++++++++- 1 file changed, 46 insertions(+), 1 deletion(-) diff --git a/crates/wavekat-tts/src/backends/qwen3_tts/model.rs b/crates/wavekat-tts/src/backends/qwen3_tts/model.rs index b8dc96a..7e83682 100644 --- a/crates/wavekat-tts/src/backends/qwen3_tts/model.rs +++ b/crates/wavekat-tts/src/backends/qwen3_tts/model.rs @@ -77,8 +77,9 @@ impl Model { /// - `model_dir/{int4,fp32}/talker_prefill.onnx` (+ .data), etc. /// - `model_dir/embeddings/text_embedding.npy`, etc. pub fn load(model_dir: &Path, config: &super::ModelConfig) -> Result { + let onnx_dir = prepare_onnx_dir(&model_dir.join(config.precision.subdir()))?; let load_session = |name: &str| -> Result { - let path = model_dir.join(config.precision.subdir()).join(name); + let path = onnx_dir.join(name); let builder = Session::builder() .map_err(|e| TtsError::Model(format!("session builder error: {e}")))?; apply_execution_provider(builder, config.execution_provider)? @@ -631,6 +632,50 @@ impl Model { // Helpers // --------------------------------------------------------------------------- +/// Ensure ORT can load ONNX models with external data from `onnx_dir`. +/// +/// HuggingFace Hub snapshot directories store files as symlinks into a +/// `blobs/` directory. ORT's external-data path validation resolves symlinks +/// and rejects any `.onnx.data` file whose real path escapes the model +/// directory, even though the symlink itself sits right next to the `.onnx`. 
+/// +/// When symlinks are detected, this function creates a sibling directory +/// (`{onnx_dir}.ort`) populated with hard links to the same inodes. Hard +/// links are free (no data is copied) and have no symlink targets, so ORT's +/// validation passes. Falls back to a full copy only when hard links are not +/// supported (cross-device mount). +/// +/// Returns `onnx_dir` unchanged if no symlinks are present. +fn prepare_onnx_dir(onnx_dir: &Path) -> Result { + let entries: Vec<_> = std::fs::read_dir(onnx_dir) + .map_err(|e| TtsError::Model(format!("cannot read {}: {e}", onnx_dir.display())))? + .filter_map(|e| e.ok()) + .collect(); + + let has_symlinks = entries.iter().any(|e| e.path().is_symlink()); + if !has_symlinks { + return Ok(onnx_dir.to_path_buf()); + } + + let resolved = onnx_dir.with_extension("ort"); + std::fs::create_dir_all(&resolved) + .map_err(|e| TtsError::Model(format!("cannot create {}: {e}", resolved.display())))?; + + for entry in &entries { + let src = entry.path(); + let dst = resolved.join(entry.file_name()); + if dst.exists() { + continue; + } + if std::fs::hard_link(&src, &dst).is_err() { + std::fs::copy(&src, &dst) + .map_err(|e| TtsError::Model(format!("cannot copy {}: {e}", src.display())))?; + } + } + + Ok(resolved) +} + /// Register the requested execution provider on a session builder. /// /// CPU is the ORT default — no registration needed. 
CUDA and CoreML require From 5a7631134a39fb4b82b3db37a9dd2fb817126615 Mon Sep 17 00:00:00 2001 From: Eason WaveKat Date: Tue, 7 Apr 2026 21:43:02 +1200 Subject: [PATCH 24/43] fix: hard-link symlink target, not symlink inode, for ORT compat Co-Authored-By: Claude Sonnet 4.6 --- .../src/backends/qwen3_tts/model.rs | 21 ++++++++++++++----- 1 file changed, 16 insertions(+), 5 deletions(-) diff --git a/crates/wavekat-tts/src/backends/qwen3_tts/model.rs b/crates/wavekat-tts/src/backends/qwen3_tts/model.rs index 7e83682..f8638cd 100644 --- a/crates/wavekat-tts/src/backends/qwen3_tts/model.rs +++ b/crates/wavekat-tts/src/backends/qwen3_tts/model.rs @@ -640,10 +640,12 @@ impl Model { /// directory, even though the symlink itself sits right next to the `.onnx`. /// /// When symlinks are detected, this function creates a sibling directory -/// (`{onnx_dir}.ort`) populated with hard links to the same inodes. Hard -/// links are free (no data is copied) and have no symlink targets, so ORT's -/// validation passes. Falls back to a full copy only when hard links are not -/// supported (cross-device mount). +/// (`{onnx_dir}.ort`) where each symlink is replaced by a hard link to the +/// symlink's *target* (resolved via `canonicalize`). Hard-linking the target +/// (not the symlink inode) is critical: on some filesystems hard-linking a +/// symlink succeeds but produces another symlink, which would fail ORT's +/// validation again. Hard links are free (no data is copied). Falls back to +/// a full copy only on cross-device mounts. /// /// Returns `onnx_dir` unchanged if no symlinks are present. fn prepare_onnx_dir(onnx_dir: &Path) -> Result { @@ -667,7 +669,16 @@ fn prepare_onnx_dir(onnx_dir: &Path) -> Result { if dst.exists() { continue; } - if std::fs::hard_link(&src, &dst).is_err() { + // Hard-link the symlink's *target*, not the symlink inode itself. 
+ // On some filesystems hard-linking a symlink succeeds but produces + // another symlink, which would fail ORT's path validation again. + let link_src = if src.is_symlink() { + src.canonicalize() + .map_err(|e| TtsError::Model(format!("cannot resolve {}: {e}", src.display())))? + } else { + src.clone() + }; + if std::fs::hard_link(&link_src, &dst).is_err() { std::fs::copy(&src, &dst) .map_err(|e| TtsError::Model(format!("cannot copy {}: {e}", src.display())))?; } From c29a6ca75a7604631803faca3d803782925f1419 Mon Sep 17 00:00:00 2001 From: Eason WaveKat Date: Tue, 7 Apr 2026 22:14:29 +1200 Subject: [PATCH 25/43] fix: add error_on_failure to CUDA/TRT/CoreML EPs; add Azure Ubuntu 24.04 setup docs Co-Authored-By: Claude Sonnet 4.6 --- .../src/backends/qwen3_tts/model.rs | 9 +- docs/06-cuda-provider.md | 104 +++++++++++++++++- 2 files changed, 109 insertions(+), 4 deletions(-) diff --git a/crates/wavekat-tts/src/backends/qwen3_tts/model.rs b/crates/wavekat-tts/src/backends/qwen3_tts/model.rs index f8638cd..51443f9 100644 --- a/crates/wavekat-tts/src/backends/qwen3_tts/model.rs +++ b/crates/wavekat-tts/src/backends/qwen3_tts/model.rs @@ -704,13 +704,18 @@ fn apply_execution_provider( super::ExecutionProvider::Cuda => builder .with_execution_providers([CUDAExecutionProvider::default() .with_arena_extend_strategy(ArenaExtendStrategy::SameAsRequested) + .error_on_failure() .build()]) .map_err(|e| TtsError::Model(format!("CUDA execution provider error: {e}"))), super::ExecutionProvider::TensorRt => builder - .with_execution_providers([TensorRTExecutionProvider::default().build()]) + .with_execution_providers([TensorRTExecutionProvider::default() + .error_on_failure() + .build()]) .map_err(|e| TtsError::Model(format!("TensorRT execution provider error: {e}"))), super::ExecutionProvider::CoreMl => builder - .with_execution_providers([CoreMLExecutionProvider::default().build()]) + .with_execution_providers([CoreMLExecutionProvider::default() + .error_on_failure() + .build()]) 
.map_err(|e| TtsError::Model(format!("CoreML execution provider error: {e}"))),
     }
 }
diff --git a/docs/06-cuda-provider.md b/docs/06-cuda-provider.md
index 1a4cc43..044ca74 100644
--- a/docs/06-cuda-provider.md
+++ b/docs/06-cuda-provider.md
@@ -75,7 +75,7 @@ Set `ORT_LOG_LEVEL=1` to confirm which EP is active:
 ```bash
 cargo build --release --features "qwen3-tts,cuda"
 cargo run --release --example synthesize --features "qwen3-tts,cuda" -- \
-  --text "Hello from GPU" --out output.wav
+  --provider cuda --output output.wav "Hello from GPU"
 ```
 
 ## Implementation
@@ -246,7 +246,7 @@
 
 ```bash
 cargo run --release --example synthesize --features "qwen3-tts,cuda" -- \
-  --text "Hello from GPU" --out /content/output.wav
+  --provider cuda --output /content/output.wav "Hello from GPU"
 ```
 
 ### 7. Download output
@@ -263,6 +263,106 @@ files.download('/content/output.wav')
 
 - ORT bundles its own CUDA libraries; no manual driver configuration is needed
   beyond selecting the T4 runtime.
 
+## Azure Ubuntu 24.04 GPU (T4) setup
+
+Azure's **Standard_NC4as_T4_v3** SKU provides a single NVIDIA T4 (16 GB VRAM)
+on Ubuntu 24.04 LTS. Ubuntu 24.04 ships glibc 2.39, so ORT's prebuilt CUDA
+binaries work without the `ORT_STRATEGY=system` workaround required on Colab.
+
+### 1. Provision the VM
+
+```bash
+az vm create \
+  --resource-group <resource-group> \
+  --name wavekat-gpu \
+  --image Ubuntu2404 \
+  --size Standard_NC4as_T4_v3 \
+  --admin-username azureuser \
+  --generate-ssh-keys
+```
+
+Open SSH if needed:
+
+```bash
+az vm open-port --resource-group <resource-group> --name wavekat-gpu --port 22
+```
+
+### 2. Install NVIDIA drivers
+
+```bash
+ssh azureuser@<vm-public-ip>
+
+sudo apt update
+sudo apt install -y ubuntu-drivers-common
+sudo ubuntu-drivers install
+sudo reboot
+```
+
+After reconnecting, verify:
+
+```bash
+nvidia-smi
+```
+
+Expected output includes `Tesla T4` and the CUDA driver version (≥ 11.8).
+
+### 3. 
Install Rust
+
+```bash
+curl --proto '=https' --tlsv1.2 -sSf https://sh.rustup.rs | sh -s -- -y --default-toolchain stable
+source "$HOME/.cargo/env"
+rustc --version
+```
+
+### 4. Install build dependencies
+
+```bash
+sudo apt install -y pkg-config libssl-dev
+```
+
+### 5. Clone and build
+
+```bash
+git clone https://github.com/wavekat/wavekat-tts.git
+cd wavekat-tts
+cargo build --release --features "qwen3-tts,cuda"
+```
+
+ORT will download its prebuilt CUDA libraries automatically (no extra env vars
+needed on Ubuntu 24.04).
+
+### 6. Model weights
+
+Download weights from Hugging Face Hub:
+
+```bash
+pip install huggingface-hub
+huggingface-cli download Qwen/Qwen3-TTS-1.7B --local-dir ~/models/qwen3-tts-1.7b
+export WAVEKAT_MODEL_DIR=~/models/qwen3-tts-1.7b
+```
+
+### 7. Run
+
+```bash
+cargo run --release --example synthesize --features "qwen3-tts,cuda" -- \
+  --provider cuda --output ~/output.wav "Hello from Azure GPU"
+```
+
+Confirm the CUDA EP is active:
+
+```bash
+ORT_LOG_LEVEL=1 cargo run --release --example synthesize --features "qwen3-tts,cuda" -- \
+  --provider cuda --output ~/output.wav "Hello from Azure GPU" 2>&1 | grep -i cuda
+# [I:ort:session] [CUDAExecutionProvider] Created CUDA EP on device 0
+```
+
+### Notes
+
+- Ubuntu 24.04 has glibc 2.39 — no `ORT_STRATEGY=system` or symlink patching needed.
+- The `Standard_NC4as_T4_v3` SKU is available in East US, West US 2, and several
+  European regions. Check availability with `az vm list-skus`.
+- Stop the VM when idle to avoid billing: `az vm deallocate --resource-group <resource-group> --name wavekat-gpu`. 
+ ## Open questions - **ORT CUDA version pinning** — ORT 2.0.0-rc.12 bundles specific CUDA/cuDNN From 52f912cf9446a28c255427bce6e82578b0aec2b7 Mon Sep 17 00:00:00 2001 From: Eason WaveKat Date: Tue, 7 Apr 2026 22:16:04 +1200 Subject: [PATCH 26/43] fix: call error_on_failure() on ExecutionProviderDispatch, not EP builder Co-Authored-By: Claude Sonnet 4.6 --- crates/wavekat-tts/src/backends/qwen3_tts/model.rs | 12 ++++++------ 1 file changed, 6 insertions(+), 6 deletions(-) diff --git a/crates/wavekat-tts/src/backends/qwen3_tts/model.rs b/crates/wavekat-tts/src/backends/qwen3_tts/model.rs index 51443f9..0691441 100644 --- a/crates/wavekat-tts/src/backends/qwen3_tts/model.rs +++ b/crates/wavekat-tts/src/backends/qwen3_tts/model.rs @@ -704,18 +704,18 @@ fn apply_execution_provider( super::ExecutionProvider::Cuda => builder .with_execution_providers([CUDAExecutionProvider::default() .with_arena_extend_strategy(ArenaExtendStrategy::SameAsRequested) - .error_on_failure() - .build()]) + .build() + .error_on_failure()]) .map_err(|e| TtsError::Model(format!("CUDA execution provider error: {e}"))), super::ExecutionProvider::TensorRt => builder .with_execution_providers([TensorRTExecutionProvider::default() - .error_on_failure() - .build()]) + .build() + .error_on_failure()]) .map_err(|e| TtsError::Model(format!("TensorRT execution provider error: {e}"))), super::ExecutionProvider::CoreMl => builder .with_execution_providers([CoreMLExecutionProvider::default() - .error_on_failure() - .build()]) + .build() + .error_on_failure()]) .map_err(|e| TtsError::Model(format!("CoreML execution provider error: {e}"))), } } From 95dc3e1b3d440aacb5aa4839c2843289a0bd05ee Mon Sep 17 00:00:00 2001 From: Eason WaveKat Date: Tue, 7 Apr 2026 22:18:51 +1200 Subject: [PATCH 27/43] docs: fix ORT CUDA library bundling claim; add CUDA runtime install to Azure steps ORT bundles libonnxruntime_providers_cuda.so but not cuBLAS/cuDNN. 
Azure step 2 now installs cuda-libraries-12-6 and libcudnn9-cuda-12 from the NVIDIA CUDA apt repository. Co-Authored-By: Claude Sonnet 4.6 --- docs/06-cuda-provider.md | 30 +++++++++++++++++++++--------- 1 file changed, 21 insertions(+), 9 deletions(-) diff --git a/docs/06-cuda-provider.md b/docs/06-cuda-provider.md index 044ca74..4cec5fa 100644 --- a/docs/06-cuda-provider.md +++ b/docs/06-cuda-provider.md @@ -41,8 +41,9 @@ wavekat-tts = { version = "0.0.1", features = ["qwen3-tts", "cuda"] } wavekat-tts = { version = "0.0.1", features = ["qwen3-tts", "tensorrt"] } ``` -The `ort` crate bundles its own CUDA libraries — no manual `LD_LIBRARY_PATH` -configuration is needed as long as the host has a compatible CUDA driver. +The `ort` crate bundles `libonnxruntime_providers_cuda.so` but **not** the +underlying CUDA runtime libraries (cuBLAS, cuDNN, cuFFT, etc.). Those must be +installed on the host separately — see the platform-specific setup sections below. ## Runtime API @@ -260,8 +261,8 @@ files.download('/content/output.wav') - `/content` is wiped on disconnect — pin model weights to Drive to avoid re-downloading each session. -- ORT bundles its own CUDA libraries; no manual driver configuration is needed - beyond selecting the T4 runtime. +- Colab's T4 runtime includes cuBLAS, cuDNN, and the CUDA driver pre-installed — + no extra CUDA library setup needed beyond selecting the GPU runtime. ## Azure Ubuntu 24.04 GPU (T4) setup @@ -287,7 +288,7 @@ Open SSH if needed: az vm open-port --resource-group --name wavekat-gpu --port 22 ``` -### 2. Install NVIDIA drivers +### 2. 
Install NVIDIA drivers and CUDA runtime libraries ```bash ssh azureuser@ @@ -298,13 +299,22 @@ sudo ubuntu-drivers install sudo reboot ``` -After reconnecting, verify: +After reconnecting, verify the driver and add the NVIDIA CUDA repository to install +the runtime libraries that ORT's CUDA provider requires (cuBLAS, cuDNN, etc.): ```bash -nvidia-smi -``` +# Verify driver +nvidia-smi # expect "Tesla T4", CUDA driver ≥ 12.x + +# Add NVIDIA CUDA 12 repository +wget https://developer.download.nvidia.com/compute/cuda/repos/ubuntu2404/x86_64/cuda-keyring_1.1-1_all.deb +sudo dpkg -i cuda-keyring_1.1-1_all.deb && rm cuda-keyring_1.1-1_all.deb +sudo apt update -Expected output includes `Tesla T4` and the CUDA driver version (≥ 11.8). +# Install CUDA 12 runtime libraries (cuBLAS, cuDNN, cuFFT, cuSolver, cuSPARSE) +# ort-sys 2.0.0-rc.12 targets cu12 by default; any 12.x minor version works. +sudo apt install -y cuda-libraries-12-6 libcudnn9-cuda-12 +``` ### 3. Install Rust @@ -359,6 +369,8 @@ ORT_LOG_LEVEL=1 cargo run --release --example synthesize --features "qwen3-tts,c ### Notes - Ubuntu 24.04 has glibc 2.39 — no `ORT_STRATEGY=system` or symlink patching needed. +- ORT bundles `libonnxruntime_providers_cuda.so` but **not** cuBLAS/cuDNN. Step 2 + installs those from the NVIDIA CUDA repository. - The `Standard_NC4as_T4_v3` SKU is available in East US, West US 2, and several European regions. Check availability with `az vm list-skus`. - Stop the VM when idle to avoid billing: `az vm deallocate --resource-group --name wavekat-gpu`. 
From 756a6ad53193509e2cd41ff3f999e4ff1b626dc3 Mon Sep 17 00:00:00 2001 From: Eason WaveKat Date: Tue, 7 Apr 2026 22:25:15 +1200 Subject: [PATCH 28/43] feat: replace empty-line quit with /quit command in interactive mode Co-Authored-By: Claude Sonnet 4.6 --- crates/wavekat-tts/examples/synthesize.rs | 10 ++++++---- 1 file changed, 6 insertions(+), 4 deletions(-) diff --git a/crates/wavekat-tts/examples/synthesize.rs b/crates/wavekat-tts/examples/synthesize.rs index 93e9e9e..ded58cb 100644 --- a/crates/wavekat-tts/examples/synthesize.rs +++ b/crates/wavekat-tts/examples/synthesize.rs @@ -20,7 +20,8 @@ //! /instruct Reset instruction to default //! /status Show current settings //! /help Show this command list -//! Empty line or Ctrl-D Quit +//! /quit Quit +//! Ctrl-C Quit //! //! Example: //! cargo run --example synthesize --features qwen3-tts -- "Hello, world!" @@ -144,7 +145,7 @@ fn run_interactive( .flat_map(|v| v.languages) .collect(); - eprintln!("Interactive mode. Type text to synthesize, /help for commands, empty line to quit."); + eprintln!("Interactive mode. 
Type text to synthesize, /help for commands, /quit or Ctrl-C to quit."); eprintln!(" language={language} instruction=\"{instruction}\""); let stdin = io::stdin(); @@ -160,7 +161,7 @@ fn run_interactive( } let input = line.trim(); if input.is_empty() { - break; + continue; } if let Some(rest) = input.strip_prefix('/') { @@ -196,6 +197,7 @@ fn run_interactive( eprintln!(" instruction=\"{instruction}\""); eprintln!(" supported languages: {}", supported_langs.join(", ")); } + "quit" | "exit" | "q" => break, "help" => { eprintln!(" /lang Switch language"); eprintln!(" /langs List supported language codes"); @@ -203,7 +205,7 @@ fn run_interactive( eprintln!(" /instruct Reset instruction to default"); eprintln!(" /status Show current settings"); eprintln!(" /help Show this help"); - eprintln!(" Empty line Quit"); + eprintln!(" /quit Quit (or Ctrl-C)"); } other => eprintln!("unknown command: /{other} (type /help for commands)"), } From ab5657426a0cd5f60b4984f4dc75a8f59c108a6a Mon Sep 17 00:00:00 2001 From: Eason WaveKat Date: Tue, 7 Apr 2026 22:34:27 +1200 Subject: [PATCH 29/43] feat: add bench_rtf example for qwen3-tts RTF benchmarking Co-Authored-By: Claude Sonnet 4.6 --- Makefile | 11 +- crates/wavekat-tts/Cargo.toml | 4 + crates/wavekat-tts/examples/bench_rtf.rs | 336 +++++++++++++++++++++++ docs/06-cuda-provider.md | 9 +- 4 files changed, 355 insertions(+), 5 deletions(-) create mode 100644 crates/wavekat-tts/examples/bench_rtf.rs diff --git a/Makefile b/Makefile index 452c17b..14fa84a 100644 --- a/Makefile +++ b/Makefile @@ -1,4 +1,4 @@ -.PHONY: help check test test-qwen3 test-all fmt clippy doc +.PHONY: help check test test-qwen3 test-all fmt clippy doc bench-rtf bench-rtf-cuda bench-rtf-trt help: ## Show this help @grep -E '^[a-zA-Z_-]+:.*?## .*$$' $(MAKEFILE_LIST) | awk 'BEGIN {FS = ":.*?## "}; {printf " \033[36m%-14s\033[0m %s\n", $$1, $$2}' @@ -26,3 +26,12 @@ test-all: ## Run tests with all features doc: ## Build and open docs cargo doc --all-features --no-deps 
--open + +bench-rtf: ## RTF benchmark on CPU (int4) + cargo run --release --example bench_rtf --features qwen3-tts + +bench-rtf-cuda: ## RTF benchmark on CUDA (int4) — for Azure T4 + cargo run --release --example bench_rtf --features "qwen3-tts,cuda" -- --provider cuda + +bench-rtf-trt: ## RTF benchmark on TensorRT (int4) — for Azure T4 + cargo run --release --example bench_rtf --features "qwen3-tts,tensorrt" -- --provider tensorrt diff --git a/crates/wavekat-tts/Cargo.toml b/crates/wavekat-tts/Cargo.toml index 22c7429..f9723b9 100644 --- a/crates/wavekat-tts/Cargo.toml +++ b/crates/wavekat-tts/Cargo.toml @@ -37,3 +37,7 @@ hf-hub = { version = "0.5", optional = true, default-features = false, features [[example]] name = "synthesize" required-features = ["qwen3-tts"] + +[[example]] +name = "bench_rtf" +required-features = ["qwen3-tts"] diff --git a/crates/wavekat-tts/examples/bench_rtf.rs b/crates/wavekat-tts/examples/bench_rtf.rs new file mode 100644 index 0000000..8073f97 --- /dev/null +++ b/crates/wavekat-tts/examples/bench_rtf.rs @@ -0,0 +1,336 @@ +//! RTF benchmark for Qwen3-TTS. +//! +//! Measures Real-Time Factor (RTF = synthesis_time / audio_duration) across +//! different text lengths. RTF < 1.0 means faster-than-real-time synthesis. +//! +//! Usage: +//! cargo run --release --example bench_rtf --features "qwen3-tts,cuda" -- --provider cuda +//! +//! Options: +//! --model-dir Model directory (default: auto-download) +//! --precision int4 (default) | fp32 +//! --provider cpu (default) | cuda | tensorrt | coreml +//! --iterations Measured runs per sample (default: 5) +//! --warmup Warmup runs before measurement (default: 1) +//! --language Language code (default: en) +//! --instruction Voice instruction +//! --csv Emit CSV rows instead of summary table +//! +//! Examples: +//! cargo run --release --example bench_rtf --features qwen3-tts +//! cargo run --release --example bench_rtf --features "qwen3-tts,cuda" -- --provider cuda +//! 
cargo run --release --example bench_rtf --features "qwen3-tts,cuda" -- --provider cuda --csv > results.csv + +use std::path::PathBuf; +use std::time::Instant; + +use wavekat_tts::backends::qwen3_tts::{ExecutionProvider, ModelConfig, ModelPrecision, Qwen3Tts}; +use wavekat_tts::{SynthesizeRequest, TtsBackend}; + +struct Sample { + label: &'static str, + text: &'static str, +} + +const SAMPLES: &[Sample] = &[ + Sample { + label: "short", + text: "Hello, world! This is a quick test of the speech synthesis system.", + }, + Sample { + label: "medium", + text: "The quick brown fox jumps over the lazy dog. \ + Speech synthesis has improved dramatically over the past few years. \ + Modern neural TTS systems can produce highly natural-sounding speech \ + that is nearly indistinguishable from human voice recordings.", + }, + Sample { + label: "long", + text: "Artificial intelligence is transforming the way we interact with computers. \ + Voice interfaces powered by text-to-speech technology are now commonplace \ + in smartphones, smart speakers, and automotive systems. \ + The latest generation of neural TTS models uses transformer architectures \ + trained on thousands of hours of human speech to capture the subtle nuances \ + of natural spoken language, including prosody, rhythm, and intonation. 
\ + These models can generate high-quality audio at sample rates of twenty-four \ + kilohertz or higher, enabling crisp and clear voice output across a wide range \ + of applications from accessibility tools to interactive voice assistants.", + }, +]; + +fn main() { + let args: Vec = std::env::args().skip(1).collect(); + + let mut model_dir: Option = None; + let mut precision = ModelPrecision::Int4; + let mut provider = ExecutionProvider::Cpu; + let mut iterations: usize = 5; + let mut warmup: usize = 1; + let mut language = "en".to_string(); + let mut instruction = "Speak naturally and clearly.".to_string(); + let mut csv_mode = false; + + let mut i = 0; + while i < args.len() { + match args[i].as_str() { + "--model-dir" => { + i += 1; + model_dir = Some(PathBuf::from(&args[i])); + } + "--precision" => { + i += 1; + precision = match args[i].as_str() { + "int4" => ModelPrecision::Int4, + "fp32" => ModelPrecision::Fp32, + other => { + eprintln!("error: unknown precision \"{other}\", expected int4 or fp32"); + std::process::exit(1); + } + }; + } + "--provider" => { + i += 1; + provider = match args[i].as_str() { + "cpu" => ExecutionProvider::Cpu, + "cuda" => ExecutionProvider::Cuda, + "tensorrt" => ExecutionProvider::TensorRt, + "coreml" => ExecutionProvider::CoreMl, + other => { + eprintln!( + "error: unknown provider \"{other}\", \ + expected cpu, cuda, tensorrt, or coreml" + ); + std::process::exit(1); + } + }; + } + "--iterations" => { + i += 1; + iterations = args[i] + .parse() + .expect("--iterations must be a positive integer"); + } + "--warmup" => { + i += 1; + warmup = args[i] + .parse() + .expect("--warmup must be a non-negative integer"); + } + "--language" => { + i += 1; + language = args[i].clone(); + } + "--instruction" => { + i += 1; + instruction = args[i].clone(); + } + "--csv" => csv_mode = true, + "--help" | "-h" => { + print_usage(); + return; + } + other => { + eprintln!("error: unknown argument \"{other}\" (use --help for usage)"); + 
std::process::exit(1); + } + } + i += 1; + } + + eprintln!( + "Loading model (precision={:?}, provider={:?}) ...", + precision, provider + ); + let mut config = ModelConfig::default() + .with_precision(precision) + .with_execution_provider(provider); + if let Some(dir) = model_dir { + config = config.with_dir(dir); + } + let tts = Qwen3Tts::from_config(config).expect("failed to load model"); + eprintln!("Model loaded.\n"); + + if csv_mode { + println!("sample,chars,iteration,synth_secs,audio_secs,rtf"); + } else { + eprintln!( + "Benchmark: {} warmup + {} measured iterations per sample\n", + warmup, iterations + ); + } + + let mut summary: Vec<(&'static str, usize, Vec)> = Vec::new(); + + for sample in SAMPLES { + let request = SynthesizeRequest::new(sample.text) + .with_language(&language) + .with_instruction(&instruction); + + // Warmup runs (not counted). + for w in 0..warmup { + eprint!( + " [{:6}] warmup {}/{} ...\r", + sample.label, + w + 1, + warmup + ); + tts.synthesize(&request).expect("warmup synthesis failed"); + } + if warmup > 0 { + eprintln!(); + } + + // Measured runs. 
+ let mut runs = Vec::with_capacity(iterations); + for it in 0..iterations { + let t0 = Instant::now(); + let audio = tts.synthesize(&request).expect("synthesis failed"); + let synth_secs = t0.elapsed().as_secs_f64(); + let audio_secs = audio.duration_secs(); + let rtf = synth_secs / audio_secs; + + eprintln!( + " [{:6}] iter {}/{}: synth={:.3}s audio={:.2}s RTF={:.3}", + sample.label, + it + 1, + iterations, + synth_secs, + audio_secs, + rtf, + ); + + if csv_mode { + println!( + "{},{},{},{:.6},{:.6},{:.6}", + sample.label, + sample.text.len(), + it + 1, + synth_secs, + audio_secs, + rtf, + ); + } + + runs.push(RunResult { + synth_secs, + audio_secs, + rtf, + }); + } + + summary.push((sample.label, sample.text.len(), runs)); + eprintln!(); + } + + if !csv_mode { + print_table(&summary); + } +} + +struct RunResult { + synth_secs: f64, + audio_secs: f64, + rtf: f64, +} + +struct Stats { + mean: f64, + std: f64, + min: f64, + p50: f64, + p95: f64, + max: f64, +} + +fn stats(values: &[f64]) -> Stats { + let n = values.len() as f64; + let mean = values.iter().sum::() / n; + let std = (values.iter().map(|v| (v - mean).powi(2)).sum::() / n).sqrt(); + + let mut sorted = values.to_vec(); + sorted.sort_by(|a, b| a.partial_cmp(b).unwrap()); + + let pct = |p: f64| -> f64 { + let idx = ((p / 100.0) * (sorted.len() - 1) as f64).round() as usize; + sorted[idx.min(sorted.len() - 1)] + }; + + Stats { + mean, + std, + min: sorted[0], + p50: pct(50.0), + p95: pct(95.0), + max: *sorted.last().unwrap(), + } +} + +fn print_table(summary: &[(&'static str, usize, Vec)]) { + let w = 82; + println!("\n{}", "=".repeat(w)); + println!(" Qwen3-TTS RTF Benchmark"); + println!("{}", "=".repeat(w)); + println!( + "{:<8} {:>5} {:>7} {:>7} {:>7} {:>7} {:>7} {:>7} {:>8} {:>8}", + "sample", "chars", "rtf_mean", "rtf_std", "rtf_min", "rtf_p50", "rtf_p95", "rtf_max", + "audio_s", "synth_s" + ); + println!("{}", "-".repeat(w)); + + for (label, chars, runs) in summary { + let rtf_vals: Vec = 
runs.iter().map(|r| r.rtf).collect(); + let audio_vals: Vec = runs.iter().map(|r| r.audio_secs).collect(); + let synth_vals: Vec = runs.iter().map(|r| r.synth_secs).collect(); + + let rtf = stats(&rtf_vals); + let audio_mean = audio_vals.iter().sum::() / audio_vals.len() as f64; + let synth_mean = synth_vals.iter().sum::() / synth_vals.len() as f64; + + println!( + "{:<8} {:>5} {:>7.3} {:>7.3} {:>7.3} {:>7.3} {:>7.3} {:>7.3} {:>8.2} {:>8.2}", + label, + chars, + rtf.mean, + rtf.std, + rtf.min, + rtf.p50, + rtf.p95, + rtf.max, + audio_mean, + synth_mean, + ); + } + + println!("{}", "=".repeat(w)); + println!("RTF < 1.0 = faster-than-real-time. synth_s / audio_s = RTF."); +} + +fn print_usage() { + eprintln!( + "bench_rtf — RTF benchmark for Qwen3-TTS + +Usage: + cargo run --release --example bench_rtf --features qwen3-tts [-- OPTIONS] + +Options: + --model-dir Model directory (default: auto-download to HF cache) + --precision int4 (default) | fp32 + --provider cpu (default) | cuda | tensorrt | coreml + --iterations Measured runs per sample (default: 5) + --warmup Warmup runs before measurement (default: 1) + --language Language code (default: en) + --instruction Voice instruction (default: \"Speak naturally and clearly.\") + --csv Emit CSV rows to stdout instead of summary table + +Examples: + # CPU benchmark + cargo run --release --example bench_rtf --features qwen3-tts + + # CUDA benchmark (T4) + cargo run --release --example bench_rtf --features \"qwen3-tts,cuda\" -- --provider cuda + + # Save CSV for further analysis + cargo run --release --example bench_rtf --features \"qwen3-tts,cuda\" \\ + -- --provider cuda --csv > results.csv" + ); +} diff --git a/docs/06-cuda-provider.md b/docs/06-cuda-provider.md index 4cec5fa..8789c26 100644 --- a/docs/06-cuda-provider.md +++ b/docs/06-cuda-provider.md @@ -343,12 +343,13 @@ needed on Ubuntu 24.04). ### 6. 
Model weights -Download weights from Hugging Face Hub: +The app auto-downloads weights from Hugging Face Hub on first run. Point +`HF_HOME` at a data disk so the cache doesn't fill the small OS disk on `/`: ```bash -pip install huggingface-hub -huggingface-cli download Qwen/Qwen3-TTS-1.7B --local-dir ~/models/qwen3-tts-1.7b -export WAVEKAT_MODEL_DIR=~/models/qwen3-tts-1.7b +sudo mkdir -p /checkpoints/huggingface +sudo chown -R $USER:$USER /checkpoints +export HF_HOME=/checkpoints/huggingface ``` ### 7. Run From 9fcfe67bff7c1845a6f8e2fbcc0bc8a48836dfeb Mon Sep 17 00:00:00 2001 From: Eason WaveKat Date: Tue, 7 Apr 2026 22:51:34 +1200 Subject: [PATCH 30/43] feat: auto-update README benchmark table from CSV results Co-Authored-By: Claude Sonnet 4.6 --- .github/workflows/update-bench.yml | 36 ++++++++ Makefile | 14 ++- README.md | 6 ++ bench/results/.gitkeep | 0 scripts/update_bench_table.py | 143 +++++++++++++++++++++++++++++ 5 files changed, 198 insertions(+), 1 deletion(-) create mode 100644 .github/workflows/update-bench.yml create mode 100644 bench/results/.gitkeep create mode 100644 scripts/update_bench_table.py diff --git a/.github/workflows/update-bench.yml b/.github/workflows/update-bench.yml new file mode 100644 index 0000000..5897fdc --- /dev/null +++ b/.github/workflows/update-bench.yml @@ -0,0 +1,36 @@ +name: Update benchmark table + +on: + push: + branches: [main] + paths: + - 'bench/results/**.csv' + workflow_dispatch: + +permissions: + contents: write + +jobs: + update-bench-table: + runs-on: ubuntu-latest + steps: + - uses: actions/checkout@v6 + + - uses: actions/setup-python@v5 + with: + python-version: '3.11' + + - name: Regenerate benchmark table in README.md + run: python scripts/update_bench_table.py + + - name: Commit if changed + run: | + git config user.name "github-actions[bot]" + git config user.email "github-actions[bot]@users.noreply.github.com" + if git diff --quiet README.md; then + echo "README.md unchanged — nothing to commit." 
+ else + git add README.md + git commit -m "docs: update benchmark table from bench/results" + git push + fi diff --git a/Makefile b/Makefile index 14fa84a..568ae3c 100644 --- a/Makefile +++ b/Makefile @@ -1,4 +1,4 @@ -.PHONY: help check test test-qwen3 test-all fmt clippy doc bench-rtf bench-rtf-cuda bench-rtf-trt +.PHONY: help check test test-qwen3 test-all fmt clippy doc bench-rtf bench-rtf-cuda bench-rtf-trt bench-csv bench-csv-cuda bench-csv-trt help: ## Show this help @grep -E '^[a-zA-Z_-]+:.*?## .*$$' $(MAKEFILE_LIST) | awk 'BEGIN {FS = ":.*?## "}; {printf " \033[36m%-14s\033[0m %s\n", $$1, $$2}' @@ -35,3 +35,15 @@ bench-rtf-cuda: ## RTF benchmark on CUDA (int4) — for Azure T4 bench-rtf-trt: ## RTF benchmark on TensorRT (int4) — for Azure T4 cargo run --release --example bench_rtf --features "qwen3-tts,tensorrt" -- --provider tensorrt + +bench-csv: ## RTF benchmark on CPU, save CSV to bench/results/ + @mkdir -p bench/results + cargo run --release --example bench_rtf --features qwen3-tts -- --csv > bench/results/cpu-int4.csv + +bench-csv-cuda: ## RTF benchmark on CUDA (T4), save CSV to bench/results/ + @mkdir -p bench/results + cargo run --release --example bench_rtf --features "qwen3-tts,cuda" -- --provider cuda --csv > bench/results/cuda-t4-int4.csv + +bench-csv-trt: ## RTF benchmark on TensorRT (T4), save CSV to bench/results/ + @mkdir -p bench/results + cargo run --release --example bench_rtf --features "qwen3-tts,tensorrt" -- --provider tensorrt --csv > bench/results/trt-t4-int4.csv diff --git a/README.md b/README.md index 78d9722..cbedcc3 100644 --- a/README.md +++ b/README.md @@ -90,6 +90,12 @@ cargo run --example synthesize --features qwen3-tts -- --precision fp32 "Hello" cargo run --example synthesize --features qwen3-tts -- --model-dir /path/to/model --output hello.wav "Hello" ``` +## Performance + + +_No results yet. 
Run `make bench-csv-cuda` on a T4, commit `bench/results/`, and the table will appear here automatically._ + + ## Try it on Google Colab No local GPU needed — run Qwen3-TTS on a free T4 in the browser: diff --git a/bench/results/.gitkeep b/bench/results/.gitkeep new file mode 100644 index 0000000..e69de29 diff --git a/scripts/update_bench_table.py b/scripts/update_bench_table.py new file mode 100644 index 0000000..13dffa7 --- /dev/null +++ b/scripts/update_bench_table.py @@ -0,0 +1,143 @@ +#!/usr/bin/env python3 +""" +Update the benchmark table in README.md from bench/results/*.csv. + +CSV format (produced by `bench_rtf --csv`): + sample,chars,iteration,synth_secs,audio_secs,rtf + +Usage: + python scripts/update_bench_table.py + python scripts/update_bench_table.py --check # exit 1 if README would change +""" + +import csv +import re +import sys +from pathlib import Path +from statistics import mean + +REPO_ROOT = Path(__file__).resolve().parent.parent +RESULTS_DIR = REPO_ROOT / "bench" / "results" +README = REPO_ROOT / "README.md" + +START_MARKER = "" +END_MARKER = "" + +# Canonical sample order. +SAMPLES = ["short", "medium", "long"] + +# Pretty labels for known config names. Unknown names fall back to the stem. 
+LABEL_MAP = { + "cpu-int4": "CPU · int4", + "cpu-fp32": "CPU · fp32", + "cuda-t4-int4": "CUDA T4 · int4", + "cuda-t4-fp32": "CUDA T4 · fp32", + "trt-t4-int4": "TensorRT T4 · int4", + "trt-t4-fp32": "TensorRT T4 · fp32", +} + +SORT_ORDER = list(LABEL_MAP.keys()) + + +def label_for(stem: str) -> str: + return LABEL_MAP.get(stem, stem.replace("-", " ").title()) + + +def read_csv(path: Path) -> dict: + """Return {sample: {rtf: [floats], synth_secs: [floats]}}.""" + data: dict = {} + with open(path, newline="") as f: + for row in csv.DictReader(f): + s = row["sample"] + data.setdefault(s, {"rtf": [], "synth_secs": []}) + data[s]["rtf"].append(float(row["rtf"])) + data[s]["synth_secs"].append(float(row["synth_secs"])) + return data + + +def build_table(configs: list) -> str: + present = [s for s in SAMPLES if any(s in d for _, d in configs)] + if not present: + return "_No results yet._" + + header = "| Config |" + "".join(f" RTF {s} |" for s in present) + sep = "|--------|" + "".join(":-----------:|" for _ in present) + + rows = [] + for stem, data in configs: + cells = [] + for s in present: + if s in data and data[s]["rtf"]: + rtf = mean(data[s]["rtf"]) + # Bold values below real-time. + cells.append(f"**{rtf:.2f}**" if rtf < 1.0 else f"{rtf:.2f}") + else: + cells.append("—") + rows.append(f"| {label_for(stem)} |" + "".join(f" {c} |" for c in cells)) + + lines = [header, sep] + rows + [ + "", + "_RTF < 1.0 = faster-than-real-time. Lower is better._ ", + "_To update: run `make bench-csv-cuda` on a T4, then commit `bench/results/`._", + ] + return "\n".join(lines) + + +def update_readme(table: str, check: bool = False) -> bool: + """Replace the section between markers. Returns True if the file changed.""" + text = README.read_text() + pattern = re.compile( + re.escape(START_MARKER) + r".*?" 
+ re.escape(END_MARKER), + re.DOTALL, + ) + replacement = f"{START_MARKER}\n{table}\n{END_MARKER}" + new_text, n = pattern.subn(replacement, text) + if n == 0: + print(f"error: markers not found in {README.name}", file=sys.stderr) + print(f" Add {START_MARKER!r} and {END_MARKER!r} to README.md", file=sys.stderr) + sys.exit(1) + if new_text == text: + return False + if not check: + README.write_text(new_text) + return True + + +def main(): + check_mode = "--check" in sys.argv + + csvs = sorted(RESULTS_DIR.glob("*.csv")) + if not csvs: + print("No CSV files found in bench/results/ — nothing to do.", file=sys.stderr) + sys.exit(0) + + configs = [] + for path in csvs: + try: + configs.append((path.stem, read_csv(path))) + except Exception as exc: + print(f"warning: skipping {path.name}: {exc}", file=sys.stderr) + + def sort_key(item): + try: + return (SORT_ORDER.index(item[0]), item[0]) + except ValueError: + return (len(SORT_ORDER), item[0]) + + configs.sort(key=sort_key) + + table = build_table(configs) + changed = update_readme(table, check=check_mode) + + if check_mode: + if changed: + print("README.md is out of date — run `python scripts/update_bench_table.py`") + sys.exit(1) + else: + print("README.md is up to date.") + else: + print("README.md updated." if changed else "README.md unchanged.") + + +if __name__ == "__main__": + main() From b409f875118c226c866cc5adf4ef14d1a2a09809 Mon Sep 17 00:00:00 2001 From: Eason WaveKat Date: Tue, 7 Apr 2026 22:53:13 +1200 Subject: [PATCH 31/43] bench: add CPU int4 baseline results (RTF ~2.0x) Co-Authored-By: Claude Sonnet 4.6 --- README.md | 7 ++++++- bench/results/cpu-int4.csv | 16 ++++++++++++++++ 2 files changed, 22 insertions(+), 1 deletion(-) create mode 100644 bench/results/cpu-int4.csv diff --git a/README.md b/README.md index cbedcc3..264c710 100644 --- a/README.md +++ b/README.md @@ -93,7 +93,12 @@ cargo run --example synthesize --features qwen3-tts -- --model-dir /path/to/mode ## Performance -_No results yet. 
Run `make bench-csv-cuda` on a T4, commit `bench/results/`, and the table will appear here automatically._ +| Config | RTF short | RTF medium | RTF long | +|--------|:-----------:|:-----------:|:-----------:| +| CPU · int4 | 1.98 | 2.04 | 2.34 | + +_RTF < 1.0 = faster-than-real-time. Lower is better._ +_To update: run `make bench-csv-cuda` on a T4, then commit `bench/results/`._ ## Try it on Google Colab diff --git a/bench/results/cpu-int4.csv b/bench/results/cpu-int4.csv new file mode 100644 index 0000000..b8b52f4 --- /dev/null +++ b/bench/results/cpu-int4.csv @@ -0,0 +1,16 @@ +sample,chars,iteration,synth_secs,audio_secs,rtf +short,66,1,7.882000,4.040000,1.950000 +short,66,2,7.036000,3.580000,1.967000 +short,66,3,7.033000,3.560000,1.974000 +short,66,4,9.177000,4.440000,2.065000 +short,66,5,8.204000,4.210000,1.949000 +medium,243,1,46.086000,22.290000,2.068000 +medium,243,2,45.984000,22.530000,2.041000 +medium,243,3,37.957000,18.840000,2.015000 +medium,243,4,38.513000,18.890000,2.039000 +medium,243,5,41.841000,20.650000,2.026000 +long,655,1,85.839000,37.480000,2.290000 +long,655,2,122.740000,51.160000,2.399000 +long,655,3,154.986000,61.930000,2.503000 +long,655,4,27.911000,12.940000,2.157000 +long,655,5,119.948000,50.500000,2.375000 From 04971ec8ada9fd6cf67b95a740d856d34f2c8191 Mon Sep 17 00:00:00 2001 From: Eason WaveKat Date: Tue, 7 Apr 2026 22:54:44 +1200 Subject: [PATCH 32/43] docs: add benchmarking guide (07-benchmarking.md) Co-Authored-By: Claude Sonnet 4.6 --- docs/07-benchmarking.md | 167 ++++++++++++++++++++++++++++++++++++++++ 1 file changed, 167 insertions(+) create mode 100644 docs/07-benchmarking.md diff --git a/docs/07-benchmarking.md b/docs/07-benchmarking.md new file mode 100644 index 0000000..b4fada6 --- /dev/null +++ b/docs/07-benchmarking.md @@ -0,0 +1,167 @@ +# Benchmarking + +## What is RTF + +**Real-Time Factor (RTF)** = synthesis time / audio duration. 
+ +- RTF `0.35` → the model produces 1 s of audio in 0.35 s (2.9× faster than real-time) +- RTF `2.0` → the model takes 2 s to produce 1 s of audio (2× slower than real-time) +- RTF `1.0` → exactly real-time + +RTF is the primary performance metric because it is independent of text length and +directly answers "can this hardware keep up with a live conversation?" + +--- + +## Running the benchmark + +### Quick human-readable run + +```bash +make bench-rtf # CPU (int4) +make bench-rtf-cuda # CUDA — requires --features "qwen3-tts,cuda" +make bench-rtf-trt # TensorRT — requires --features "qwen3-tts,tensorrt" +``` + +Output is a summary table printed to stdout: + +``` +sample chars rtf_mean rtf_std rtf_min rtf_p50 rtf_p95 rtf_max audio_s synth_s +short 66 1.981 0.043 1.949 1.967 2.065 2.065 3.97 7.87 +medium 243 2.038 0.018 2.015 2.039 2.068 2.068 20.64 42.08 +long 655 2.345 0.116 2.157 2.375 2.503 2.503 42.80 102.28 +``` + +### Saving results as CSV + +```bash +make bench-csv # saves to bench/results/cpu-int4.csv +make bench-csv-cuda # saves to bench/results/cuda-t4-int4.csv +make bench-csv-trt # saves to bench/results/trt-t4-int4.csv +``` + +These targets pipe `--csv` output to the appropriate file in `bench/results/`. +Cargo/bench progress goes to stderr; only the CSV rows go to the file. 
+
+### Advanced options
+
+All options are passed after `--`:
+
+```bash
+cargo run --release --example bench_rtf --features "qwen3-tts,cuda" -- \
+  --provider cuda \
+  --precision fp32 \
+  --warmup 2 \
+  --iterations 10 \
+  --language zh \
+  --csv > bench/results/cuda-t4-fp32.csv
+```
+
+| Flag | Default | Description |
+|------|---------|-------------|
+| `--provider` | `cpu` | `cpu` \| `cuda` \| `tensorrt` \| `coreml` |
+| `--precision` | `int4` | `int4` \| `fp32` |
+| `--iterations` | `5` | Measured runs per sample |
+| `--warmup` | `1` | Warmup runs (not counted) |
+| `--language` | `en` | Language code |
+| `--instruction` | `"Speak naturally and clearly."` | VoiceDesign prompt |
+| `--model-dir` | _(auto-download)_ | Local model directory |
+| `--csv` | _(off)_ | Emit CSV to stdout |
+
+---
+
+## Text samples
+
+The benchmark uses three fixed English samples:
+
+| Label | Chars | ~Words | ~Audio duration |
+|-------|------:|------:|----------------|
+| `short` | 66 | 11 | 3–5 s |
+| `medium` | 243 | 42 | 18–25 s |
+| `long` | 655 | 109 | 35–65 s |
+
+These are intentionally varied to expose whether RTF scales with sequence length
+(it does slightly, due to KV cache growth in the decode loop).
+
+---
+
+## Saving and committing results
+
+1. Run on the target machine (e.g. Azure T4):
+   ```bash
+   make bench-csv-cuda
+   ```
+2. Commit the CSV:
+   ```bash
+   git add bench/results/cuda-t4-int4.csv
+   git commit -m "bench: add T4 CUDA int4 results"
+   git push
+   ```
+3. On push to `main`, the `update-bench` GitHub Actions workflow
+   (`.github/workflows/update-bench.yml`) detects the changed CSV, runs
+   `scripts/update_bench_table.py`, and commits an updated `## Performance`
+   table to `README.md` automatically.
+
+CSV files live in `bench/results/` and are named `<provider>-<hardware>-<precision>.csv`.
+Known names and their README labels:
+
+| Filename | Label |
+|----------|-------|
+| `cpu-int4.csv` | CPU · int4 |
+| `cpu-fp32.csv` | CPU · fp32 |
+| `cuda-t4-int4.csv` | CUDA T4 · int4 |
+| `cuda-t4-fp32.csv` | CUDA T4 · fp32 |
+| `trt-t4-int4.csv` | TensorRT T4 · int4 |
+| `trt-t4-fp32.csv` | TensorRT T4 · fp32 |
+
+Unknown filenames fall back to a title-cased label derived from the stem.
+
+---
+
+## Reading the results
+
+### Capacity planning
+
+A single GPU running at RTF `0.35` spends 0.35 s of compute per 1 s of audio.
+The theoretical maximum concurrent streams on one GPU is:
+
+```
+max_streams ≈ floor(1 / RTF)
+```
+
+At RTF `0.35` → **2 concurrent requests** (floor(1 / 0.35) = 2) before queuing builds up.
+At RTF `2.0` (CPU baseline) the model cannot keep up with a single real-time stream.
+
+### Latency
+
+`synth_s` (mean) for a given sample length is the wall-clock latency a caller
+experiences. Use the `p95` column for SLA planning — it bounds worst-case latency
+under normal conditions.
+
+### Provider comparison
+
+| Provider | Expected RTF (T4, int4) | Notes |
+|----------|:----------------------:|-------|
+| CPU | ~2.0 | Baseline, no GPU needed |
+| CUDA | ~0.3–0.5 | ORT CUDA EP, requires CUDA 12 + cuDNN 9 |
+| TensorRT | ~0.15–0.3 | Higher setup cost, best throughput |
+
+CUDA results to be added after T4 run. See `06-cuda-provider.md` for setup.
+
+---
+
+## Updating the README table manually
+
+If you need to regenerate the table without pushing:
+
+```bash
+python3 scripts/update_bench_table.py
+```
+
+To check whether README.md is in sync with the current CSVs (useful in CI):
+
+```bash
+python3 scripts/update_bench_table.py --check
+```
+
+Exit code `1` means the table is stale; `0` means it is up to date.
From 9da95af225188d0c79826ae9afcb495bba38c262 Mon Sep 17 00:00:00 2001 From: Eason WaveKat Date: Tue, 7 Apr 2026 22:59:55 +1200 Subject: [PATCH 33/43] docs: add TensorRT install step for Azure T4 Co-Authored-By: Claude Sonnet 4.6 --- docs/06-cuda-provider.md | 38 ++++++++++++++++++++++++++++++-------- 1 file changed, 30 insertions(+), 8 deletions(-) diff --git a/docs/06-cuda-provider.md b/docs/06-cuda-provider.md index 8789c26..9d311b8 100644 --- a/docs/06-cuda-provider.md +++ b/docs/06-cuda-provider.md @@ -316,7 +316,27 @@ sudo apt update sudo apt install -y cuda-libraries-12-6 libcudnn9-cuda-12 ``` -### 3. Install Rust +### 3. (Optional) Install TensorRT + +Required only when building with the `tensorrt` feature. ORT bundles +`libonnxruntime_providers_tensorrt.so` but dynamically loads +`libnvinfer.so.10` at runtime — install TensorRT 10 from the NVIDIA CUDA +repository added in step 2: + +```bash +sudo apt install -y libnvinfer10 libnvinfer-plugin10 libnvonnxparsers10 +``` + +Verify: + +```bash +ldconfig -p | grep nvinfer +# expect libnvinfer.so.10 → /usr/lib/x86_64-linux-gnu/libnvinfer.so.10 +``` + +Skip this step if you only need the `cuda` feature. + +### 4. Install Rust ```bash curl --proto '=https' --tlsv1.2 -sSf https://sh.rustup.rs | sh -s -- -y --default-toolchain stable @@ -324,13 +344,13 @@ source "$HOME/.cargo/env" rustc --version ``` -### 4. Install build dependencies +### 5. Install build dependencies ```bash sudo apt install -y pkg-config libssl-dev ``` -### 5. Clone and build +### 6. Clone and build ```bash git clone https://github.com/wavekat/wavekat-tts.git @@ -339,9 +359,10 @@ cargo build --release --features "qwen3-tts,cuda" ``` ORT will download its prebuilt CUDA libraries automatically (no extra env vars -needed on Ubuntu 24.04). +needed on Ubuntu 24.04). For TensorRT, swap the feature flag for +`"qwen3-tts,tensorrt"` after completing step 3. -### 6. Model weights +### 7. 
Model weights The app auto-downloads weights from Hugging Face Hub on first run. Point `HF_HOME` at a data disk so the cache doesn't fill the small OS disk on `/`: @@ -352,7 +373,7 @@ sudo chown -R $USER:$USER /checkpoints export HF_HOME=/checkpoints/huggingface ``` -### 7. Run +### 8. Run ```bash cargo run --release --example synthesize --features "qwen3-tts,cuda" -- \ @@ -370,8 +391,9 @@ ORT_LOG_LEVEL=1 cargo run --release --example synthesize --features "qwen3-tts,c ### Notes - Ubuntu 24.04 has glibc 2.39 — no `ORT_STRATEGY=system` or symlink patching needed. -- ORT bundles `libonnxruntime_providers_cuda.so` but **not** cuBLAS/cuDNN. Step 2 - installs those from the NVIDIA CUDA repository. +- ORT bundles `libonnxruntime_providers_cuda.so` and + `libonnxruntime_providers_tensorrt.so` but **not** cuBLAS/cuDNN or + `libnvinfer`. Steps 2 and 3 install those from the NVIDIA CUDA repository. - The `Standard_NC4as_T4_v3` SKU is available in East US, West US 2, and several European regions. Check availability with `az vm list-skus`. - Stop the VM when idle to avoid billing: `az vm deallocate --resource-group --name wavekat-gpu`. From ffba867fcf49208cad0b5c1d0ce075c3c9c7ee45 Mon Sep 17 00:00:00 2001 From: Eason WaveKat Date: Tue, 7 Apr 2026 23:13:28 +1200 Subject: [PATCH 34/43] fix: use NextPowerOfTwo arena strategy for CUDA EP SameAsRequested caused one cudaMalloc per unique KV-cache size. After a few synthesis iterations the growing KV cache produced 100+ different-sized CUDA allocations that fragmented the virtual address space, making later contiguous allocations (e.g. a 36 MB concat buffer) fail with OOM even though total free VRAM was sufficient. NextPowerOfTwo (ORT default) doubles the arena on extension, so the same decode loop needs only ~7 cudaMalloc calls total. All KV-cache allocations come from one contiguous block, eliminating fragmentation. 
Co-Authored-By: Claude Sonnet 4.6 --- crates/wavekat-tts/src/backends/qwen3_tts/model.rs | 4 +--- 1 file changed, 1 insertion(+), 3 deletions(-) diff --git a/crates/wavekat-tts/src/backends/qwen3_tts/model.rs b/crates/wavekat-tts/src/backends/qwen3_tts/model.rs index 0691441..94993ea 100644 --- a/crates/wavekat-tts/src/backends/qwen3_tts/model.rs +++ b/crates/wavekat-tts/src/backends/qwen3_tts/model.rs @@ -696,14 +696,12 @@ fn apply_execution_provider( ep: super::ExecutionProvider, ) -> Result { use ort::execution_providers::{ - ArenaExtendStrategy, CUDAExecutionProvider, CoreMLExecutionProvider, - TensorRTExecutionProvider, + CUDAExecutionProvider, CoreMLExecutionProvider, TensorRTExecutionProvider, }; match ep { super::ExecutionProvider::Cpu => Ok(builder), super::ExecutionProvider::Cuda => builder .with_execution_providers([CUDAExecutionProvider::default() - .with_arena_extend_strategy(ArenaExtendStrategy::SameAsRequested) .build() .error_on_failure()]) .map_err(|e| TtsError::Model(format!("CUDA execution provider error: {e}"))), From 270505140e64c0b785f3d00ccc225c056b3b7959 Mon Sep 17 00:00:00 2001 From: Eason WaveKat Date: Tue, 7 Apr 2026 23:21:53 +1200 Subject: [PATCH 35/43] feat: self-describing CSV with backend/precision/provider/hardware/date Co-Authored-By: Claude Sonnet 4.6 --- Makefile | 12 +- README.md | 8 +- bench/results/cpu-int4.csv | 32 +++--- crates/wavekat-tts/examples/bench_rtf.rs | 85 ++++++++++++-- scripts/update_bench_table.py | 139 ++++++++++++++++------- 5 files changed, 199 insertions(+), 77 deletions(-) diff --git a/Makefile b/Makefile index 568ae3c..e876c22 100644 --- a/Makefile +++ b/Makefile @@ -36,14 +36,16 @@ bench-rtf-cuda: ## RTF benchmark on CUDA (int4) — for Azure T4 bench-rtf-trt: ## RTF benchmark on TensorRT (int4) — for Azure T4 cargo run --release --example bench_rtf --features "qwen3-tts,tensorrt" -- --provider tensorrt -bench-csv: ## RTF benchmark on CPU, save CSV to bench/results/ +bench-csv: ## RTF benchmark on CPU 
(int4), save CSV to bench/results/ @mkdir -p bench/results cargo run --release --example bench_rtf --features qwen3-tts -- --csv > bench/results/cpu-int4.csv -bench-csv-cuda: ## RTF benchmark on CUDA (T4), save CSV to bench/results/ +bench-csv-cuda: ## RTF benchmark on CUDA T4 (int4), save CSV to bench/results/ @mkdir -p bench/results - cargo run --release --example bench_rtf --features "qwen3-tts,cuda" -- --provider cuda --csv > bench/results/cuda-t4-int4.csv + cargo run --release --example bench_rtf --features "qwen3-tts,cuda" -- \ + --provider cuda --hardware t4 --csv > bench/results/cuda-t4-int4.csv -bench-csv-trt: ## RTF benchmark on TensorRT (T4), save CSV to bench/results/ +bench-csv-trt: ## RTF benchmark on TensorRT T4 (int4), save CSV to bench/results/ @mkdir -p bench/results - cargo run --release --example bench_rtf --features "qwen3-tts,tensorrt" -- --provider tensorrt --csv > bench/results/trt-t4-int4.csv + cargo run --release --example bench_rtf --features "qwen3-tts,tensorrt" -- \ + --provider tensorrt --hardware t4 --csv > bench/results/trt-t4-int4.csv diff --git a/README.md b/README.md index 264c710..73abcf3 100644 --- a/README.md +++ b/README.md @@ -93,12 +93,12 @@ cargo run --example synthesize --features qwen3-tts -- --model-dir /path/to/mode ## Performance -| Config | RTF short | RTF medium | RTF long | -|--------|:-----------:|:-----------:|:-----------:| -| CPU · int4 | 1.98 | 2.04 | 2.34 | +| Backend | Precision | Provider | Hardware | RTF short | RTF medium | RTF long | Date | +|---------|-----------|----------|----------|:-----------:|:-----------:|:-----------:|------| +| qwen3-tts | int4 | CPU | — | 1.98 | 2.04 | 2.34 | 2026-04-07 | _RTF < 1.0 = faster-than-real-time. 
Lower is better._ -_To update: run `make bench-csv-cuda` on a T4, then commit `bench/results/`._ +_To update: run `make bench-csv-cuda` on target hardware, then commit `bench/results/`._ ## Try it on Google Colab diff --git a/bench/results/cpu-int4.csv b/bench/results/cpu-int4.csv index b8b52f4..b0f9e33 100644 --- a/bench/results/cpu-int4.csv +++ b/bench/results/cpu-int4.csv @@ -1,16 +1,16 @@ -sample,chars,iteration,synth_secs,audio_secs,rtf -short,66,1,7.882000,4.040000,1.950000 -short,66,2,7.036000,3.580000,1.967000 -short,66,3,7.033000,3.560000,1.974000 -short,66,4,9.177000,4.440000,2.065000 -short,66,5,8.204000,4.210000,1.949000 -medium,243,1,46.086000,22.290000,2.068000 -medium,243,2,45.984000,22.530000,2.041000 -medium,243,3,37.957000,18.840000,2.015000 -medium,243,4,38.513000,18.890000,2.039000 -medium,243,5,41.841000,20.650000,2.026000 -long,655,1,85.839000,37.480000,2.290000 -long,655,2,122.740000,51.160000,2.399000 -long,655,3,154.986000,61.930000,2.503000 -long,655,4,27.911000,12.940000,2.157000 -long,655,5,119.948000,50.500000,2.375000 +backend,precision,provider,hardware,date,sample,chars,iteration,synth_secs,audio_secs,rtf +qwen3-tts,int4,cpu,unknown,2026-04-07,short,66,1,7.882000,4.040000,1.950000 +qwen3-tts,int4,cpu,unknown,2026-04-07,short,66,2,7.036000,3.580000,1.967000 +qwen3-tts,int4,cpu,unknown,2026-04-07,short,66,3,7.033000,3.560000,1.974000 +qwen3-tts,int4,cpu,unknown,2026-04-07,short,66,4,9.177000,4.440000,2.065000 +qwen3-tts,int4,cpu,unknown,2026-04-07,short,66,5,8.204000,4.210000,1.949000 +qwen3-tts,int4,cpu,unknown,2026-04-07,medium,243,1,46.086000,22.290000,2.068000 +qwen3-tts,int4,cpu,unknown,2026-04-07,medium,243,2,45.984000,22.530000,2.041000 +qwen3-tts,int4,cpu,unknown,2026-04-07,medium,243,3,37.957000,18.840000,2.015000 +qwen3-tts,int4,cpu,unknown,2026-04-07,medium,243,4,38.513000,18.890000,2.039000 +qwen3-tts,int4,cpu,unknown,2026-04-07,medium,243,5,41.841000,20.650000,2.026000 
+qwen3-tts,int4,cpu,unknown,2026-04-07,long,655,1,85.839000,37.480000,2.290000 +qwen3-tts,int4,cpu,unknown,2026-04-07,long,655,2,122.740000,51.160000,2.399000 +qwen3-tts,int4,cpu,unknown,2026-04-07,long,655,3,154.986000,61.930000,2.503000 +qwen3-tts,int4,cpu,unknown,2026-04-07,long,655,4,27.911000,12.940000,2.157000 +qwen3-tts,int4,cpu,unknown,2026-04-07,long,655,5,119.948000,50.500000,2.375000 diff --git a/crates/wavekat-tts/examples/bench_rtf.rs b/crates/wavekat-tts/examples/bench_rtf.rs index 8073f97..6941afc 100644 --- a/crates/wavekat-tts/examples/bench_rtf.rs +++ b/crates/wavekat-tts/examples/bench_rtf.rs @@ -8,8 +8,10 @@ //! //! Options: //! --model-dir Model directory (default: auto-download) +//! --backend Backend identifier written to CSV (default: qwen3-tts) //! --precision int4 (default) | fp32 //! --provider cpu (default) | cuda | tensorrt | coreml +//! --hardware Hardware label written to CSV, e.g. t4, a10g (default: unknown) //! --iterations Measured runs per sample (default: 5) //! --warmup Warmup runs before measurement (default: 1) //! --language Language code (default: en) @@ -18,8 +20,8 @@ //! //! Examples: //! cargo run --release --example bench_rtf --features qwen3-tts -//! cargo run --release --example bench_rtf --features "qwen3-tts,cuda" -- --provider cuda -//! cargo run --release --example bench_rtf --features "qwen3-tts,cuda" -- --provider cuda --csv > results.csv +//! cargo run --release --example bench_rtf --features "qwen3-tts,cuda" -- --provider cuda --hardware t4 +//! 
cargo run --release --example bench_rtf --features "qwen3-tts,cuda" -- --provider cuda --hardware t4 --csv > results.csv use std::path::PathBuf; use std::time::Instant; @@ -62,8 +64,10 @@ fn main() { let args: Vec = std::env::args().skip(1).collect(); let mut model_dir: Option = None; + let mut backend = "qwen3-tts".to_string(); let mut precision = ModelPrecision::Int4; let mut provider = ExecutionProvider::Cpu; + let mut hardware = "unknown".to_string(); let mut iterations: usize = 5; let mut warmup: usize = 1; let mut language = "en".to_string(); @@ -77,6 +81,14 @@ fn main() { i += 1; model_dir = Some(PathBuf::from(&args[i])); } + "--backend" => { + i += 1; + backend = args[i].clone(); + } + "--hardware" => { + i += 1; + hardware = args[i].clone(); + } "--precision" => { i += 1; precision = match args[i].as_str() { @@ -150,8 +162,20 @@ fn main() { let tts = Qwen3Tts::from_config(config).expect("failed to load model"); eprintln!("Model loaded.\n"); + let precision_str = match precision { + ModelPrecision::Int4 => "int4", + ModelPrecision::Fp32 => "fp32", + }; + let provider_str = match provider { + ExecutionProvider::Cpu => "cpu", + ExecutionProvider::Cuda => "cuda", + ExecutionProvider::TensorRt => "tensorrt", + ExecutionProvider::CoreMl => "coreml", + }; + let date = today_iso(); + if csv_mode { - println!("sample,chars,iteration,synth_secs,audio_secs,rtf"); + println!("backend,precision,provider,hardware,date,sample,chars,iteration,synth_secs,audio_secs,rtf"); } else { eprintln!( "Benchmark: {} warmup + {} measured iterations per sample\n", @@ -201,7 +225,12 @@ fn main() { if csv_mode { println!( - "{},{},{},{:.6},{:.6},{:.6}", + "{},{},{},{},{},{},{},{},{:.6},{:.6},{:.6}", + backend, + precision_str, + provider_str, + hardware, + date, sample.label, sample.text.len(), it + 1, @@ -314,8 +343,10 @@ Usage: Options: --model-dir Model directory (default: auto-download to HF cache) + --backend Backend label in CSV (default: qwen3-tts) --precision int4 (default) | 
fp32 --provider cpu (default) | cuda | tensorrt | coreml + --hardware Hardware label in CSV, e.g. t4, a10g (default: unknown) --iterations Measured runs per sample (default: 5) --warmup Warmup runs before measurement (default: 1) --language Language code (default: en) @@ -326,11 +357,49 @@ Examples: # CPU benchmark cargo run --release --example bench_rtf --features qwen3-tts - # CUDA benchmark (T4) - cargo run --release --example bench_rtf --features \"qwen3-tts,cuda\" -- --provider cuda + # CUDA benchmark on a T4 + cargo run --release --example bench_rtf --features \"qwen3-tts,cuda\" \\ + -- --provider cuda --hardware t4 - # Save CSV for further analysis + # Save CSV for tracking and README auto-update cargo run --release --example bench_rtf --features \"qwen3-tts,cuda\" \\ - -- --provider cuda --csv > results.csv" + -- --provider cuda --hardware t4 --csv > bench/results/cuda-t4-int4.csv" ); } + +/// Return today's date as YYYY-MM-DD (UTC) without any external dependency. +fn today_iso() -> String { + let secs = std::time::SystemTime::now() + .duration_since(std::time::UNIX_EPOCH) + .unwrap_or_default() + .as_secs(); + + let mut days = secs / 86400; + let mut year = 1970u32; + loop { + let in_year = if is_leap(year) { 366 } else { 365 }; + if days < in_year { + break; + } + days -= in_year; + year += 1; + } + let month_lengths = if is_leap(year) { + [31u64, 29, 31, 30, 31, 30, 31, 31, 30, 31, 30, 31] + } else { + [31u64, 28, 31, 30, 31, 30, 31, 31, 30, 31, 30, 31] + }; + let mut month = 1u32; + for &ml in &month_lengths { + if days < ml { + break; + } + days -= ml; + month += 1; + } + format!("{:04}-{:02}-{:02}", year, month, days + 1) +} + +fn is_leap(year: u32) -> bool { + (year % 4 == 0 && year % 100 != 0) || year % 400 == 0 +} diff --git a/scripts/update_bench_table.py b/scripts/update_bench_table.py index 13dffa7..2963535 100644 --- a/scripts/update_bench_table.py +++ b/scripts/update_bench_table.py @@ -3,6 +3,9 @@ Update the benchmark table in README.md from 
bench/results/*.csv. CSV format (produced by `bench_rtf --csv`): + backend,precision,provider,hardware,date,sample,chars,iteration,synth_secs,audio_secs,rtf + +Legacy format (no metadata columns) is also accepted: sample,chars,iteration,synth_secs,audio_secs,rtf Usage: @@ -13,6 +16,7 @@ import csv import re import sys +from collections import defaultdict from pathlib import Path from statistics import mean @@ -23,68 +27,122 @@ START_MARKER = "" END_MARKER = "" -# Canonical sample order. SAMPLES = ["short", "medium", "long"] -# Pretty labels for known config names. Unknown names fall back to the stem. -LABEL_MAP = { - "cpu-int4": "CPU · int4", - "cpu-fp32": "CPU · fp32", - "cuda-t4-int4": "CUDA T4 · int4", - "cuda-t4-fp32": "CUDA T4 · fp32", - "trt-t4-int4": "TensorRT T4 · int4", - "trt-t4-fp32": "TensorRT T4 · fp32", +# Display order for known (provider, hardware) combinations. +SORT_KEY = { + ("cpu", "unknown"): (0, ""), + ("cuda", "t4"): (1, ""), + ("cuda", "a10g"): (2, ""), + ("tensorrt", "t4"): (3, ""), + ("tensorrt", "a10g"): (4, ""), + ("coreml", "unknown"): (5, ""), } -SORT_ORDER = list(LABEL_MAP.keys()) - -def label_for(stem: str) -> str: - return LABEL_MAP.get(stem, stem.replace("-", " ").title()) +def sort_key(config: dict) -> tuple: + k = (config["provider"], config["hardware"]) + order, _ = SORT_KEY.get(k, (99, "")) + return (config["backend"], order, config["precision"], config["hardware"]) -def read_csv(path: Path) -> dict: - """Return {sample: {rtf: [floats], synth_secs: [floats]}}.""" - data: dict = {} +def read_csv(path: Path) -> list[dict]: + """ + Return a list of row dicts. Adds default metadata for legacy files + that don't have the new columns. 
+ """ + rows = [] with open(path, newline="") as f: - for row in csv.DictReader(f): - s = row["sample"] - data.setdefault(s, {"rtf": [], "synth_secs": []}) - data[s]["rtf"].append(float(row["rtf"])) - data[s]["synth_secs"].append(float(row["synth_secs"])) - return data - - -def build_table(configs: list) -> str: - present = [s for s in SAMPLES if any(s in d for _, d in configs)] + reader = csv.DictReader(f) + for row in reader: + if "backend" not in row: + # Legacy format — derive metadata from filename + stem = path.stem # e.g. "cuda-t4-int4" + parts = stem.split("-") + row["backend"] = "qwen3-tts" + row["precision"] = parts[-1] if parts[-1] in ("int4", "fp32") else "int4" + row["provider"] = parts[0] if parts else "cpu" + row["hardware"] = parts[1] if len(parts) > 2 else "unknown" + row["date"] = "" + rows.append(row) + return rows + + +def group_rows(all_rows: list[dict]) -> list[tuple[dict, dict]]: + """ + Group rows by (backend, precision, provider, hardware). + Returns list of (config_dict, {sample: {rtf: [], synth_secs: []}}). + """ + groups: dict[tuple, dict] = defaultdict(lambda: defaultdict(lambda: {"rtf": [], "synth_secs": []})) + configs: dict[tuple, dict] = {} + + for row in all_rows: + key = (row["backend"], row["precision"], row["provider"], row["hardware"]) + groups[key][row["sample"]]["rtf"].append(float(row["rtf"])) + groups[key][row["sample"]]["synth_secs"].append(float(row["synth_secs"])) + if key not in configs: + configs[key] = { + "backend": row["backend"], + "precision": row["precision"], + "provider": row["provider"], + "hardware": row["hardware"], + "date": row.get("date", ""), + } + elif row.get("date"): + # Keep the most recent date seen for this config. 
+ if row["date"] > configs[key]["date"]: + configs[key]["date"] = row["date"] + + result = [(configs[k], dict(groups[k])) for k in groups] + result.sort(key=lambda x: sort_key(x[0])) + return result + + +def hardware_label(hw: str) -> str: + return {"t4": "T4", "a10g": "A10G", "unknown": "—"}.get(hw, hw.upper()) + + +def provider_label(pv: str) -> str: + return {"cpu": "CPU", "cuda": "CUDA", "tensorrt": "TensorRT", "coreml": "CoreML"}.get(pv, pv) + + +def build_table(groups: list[tuple[dict, dict]]) -> str: + present = [s for s in SAMPLES if any(s in data for _, data in groups)] if not present: return "_No results yet._" - header = "| Config |" + "".join(f" RTF {s} |" for s in present) - sep = "|--------|" + "".join(":-----------:|" for _ in present) + rtf_headers = "".join(f" RTF {s} |" for s in present) + header = f"| Backend | Precision | Provider | Hardware |{rtf_headers} Date |" + sep = f"|---------|-----------|----------|----------|" + ":-----------:|" * len(present) + "------|" rows = [] - for stem, data in configs: + for config, data in groups: cells = [] for s in present: if s in data and data[s]["rtf"]: rtf = mean(data[s]["rtf"]) - # Bold values below real-time. cells.append(f"**{rtf:.2f}**" if rtf < 1.0 else f"{rtf:.2f}") else: cells.append("—") - rows.append(f"| {label_for(stem)} |" + "".join(f" {c} |" for c in cells)) + + hw = hardware_label(config["hardware"]) + prov = provider_label(config["provider"]) + date = config["date"] or "—" + rows.append( + f"| {config['backend']} | {config['precision']} | {prov} | {hw} |" + + "".join(f" {c} |" for c in cells) + + f" {date} |" + ) lines = [header, sep] + rows + [ "", "_RTF < 1.0 = faster-than-real-time. 
Lower is better._ ", - "_To update: run `make bench-csv-cuda` on a T4, then commit `bench/results/`._", + "_To update: run `make bench-csv-cuda` on target hardware, then commit `bench/results/`._", ] return "\n".join(lines) def update_readme(table: str, check: bool = False) -> bool: - """Replace the section between markers. Returns True if the file changed.""" text = README.read_text() pattern = re.compile( re.escape(START_MARKER) + r".*?" + re.escape(END_MARKER), @@ -111,22 +169,15 @@ def main(): print("No CSV files found in bench/results/ — nothing to do.", file=sys.stderr) sys.exit(0) - configs = [] + all_rows = [] for path in csvs: try: - configs.append((path.stem, read_csv(path))) + all_rows.extend(read_csv(path)) except Exception as exc: print(f"warning: skipping {path.name}: {exc}", file=sys.stderr) - def sort_key(item): - try: - return (SORT_ORDER.index(item[0]), item[0]) - except ValueError: - return (len(SORT_ORDER), item[0]) - - configs.sort(key=sort_key) - - table = build_table(configs) + groups = group_rows(all_rows) + table = build_table(groups) changed = update_readme(table, check=check_mode) if check_mode: From 4842c17c1ec7a8359f7c8c95ea6a5ae175526871 Mon Sep 17 00:00:00 2001 From: Eason WaveKat Date: Tue, 7 Apr 2026 23:24:53 +1200 Subject: [PATCH 36/43] fix: tag cpu-int4 benchmark with Standard_NC4as_T4_v3 hardware Co-Authored-By: Claude Sonnet 4.6 --- Makefile | 3 ++- README.md | 2 +- bench/results/cpu-int4.csv | 30 +++++++++++++++--------------- scripts/update_bench_table.py | 2 +- 4 files changed, 19 insertions(+), 18 deletions(-) diff --git a/Makefile b/Makefile index e876c22..b10e82c 100644 --- a/Makefile +++ b/Makefile @@ -38,7 +38,8 @@ bench-rtf-trt: ## RTF benchmark on TensorRT (int4) — for Azure T4 bench-csv: ## RTF benchmark on CPU (int4), save CSV to bench/results/ @mkdir -p bench/results - cargo run --release --example bench_rtf --features qwen3-tts -- --csv > bench/results/cpu-int4.csv + cargo run --release --example bench_rtf 
--features qwen3-tts -- \ + --hardware Standard_NC4as_T4_v3 --csv > bench/results/cpu-int4.csv bench-csv-cuda: ## RTF benchmark on CUDA T4 (int4), save CSV to bench/results/ @mkdir -p bench/results diff --git a/README.md b/README.md index 73abcf3..300e4d7 100644 --- a/README.md +++ b/README.md @@ -95,7 +95,7 @@ cargo run --example synthesize --features qwen3-tts -- --model-dir /path/to/mode | Backend | Precision | Provider | Hardware | RTF short | RTF medium | RTF long | Date | |---------|-----------|----------|----------|:-----------:|:-----------:|:-----------:|------| -| qwen3-tts | int4 | CPU | — | 1.98 | 2.04 | 2.34 | 2026-04-07 | +| qwen3-tts | int4 | CPU | Standard_NC4as_T4_v3 | 1.98 | 2.04 | 2.34 | 2026-04-07 | _RTF < 1.0 = faster-than-real-time. Lower is better._ _To update: run `make bench-csv-cuda` on target hardware, then commit `bench/results/`._ diff --git a/bench/results/cpu-int4.csv b/bench/results/cpu-int4.csv index b0f9e33..f0f77ab 100644 --- a/bench/results/cpu-int4.csv +++ b/bench/results/cpu-int4.csv @@ -1,16 +1,16 @@ backend,precision,provider,hardware,date,sample,chars,iteration,synth_secs,audio_secs,rtf -qwen3-tts,int4,cpu,unknown,2026-04-07,short,66,1,7.882000,4.040000,1.950000 -qwen3-tts,int4,cpu,unknown,2026-04-07,short,66,2,7.036000,3.580000,1.967000 -qwen3-tts,int4,cpu,unknown,2026-04-07,short,66,3,7.033000,3.560000,1.974000 -qwen3-tts,int4,cpu,unknown,2026-04-07,short,66,4,9.177000,4.440000,2.065000 -qwen3-tts,int4,cpu,unknown,2026-04-07,short,66,5,8.204000,4.210000,1.949000 -qwen3-tts,int4,cpu,unknown,2026-04-07,medium,243,1,46.086000,22.290000,2.068000 -qwen3-tts,int4,cpu,unknown,2026-04-07,medium,243,2,45.984000,22.530000,2.041000 -qwen3-tts,int4,cpu,unknown,2026-04-07,medium,243,3,37.957000,18.840000,2.015000 -qwen3-tts,int4,cpu,unknown,2026-04-07,medium,243,4,38.513000,18.890000,2.039000 -qwen3-tts,int4,cpu,unknown,2026-04-07,medium,243,5,41.841000,20.650000,2.026000 
-qwen3-tts,int4,cpu,unknown,2026-04-07,long,655,1,85.839000,37.480000,2.290000 -qwen3-tts,int4,cpu,unknown,2026-04-07,long,655,2,122.740000,51.160000,2.399000 -qwen3-tts,int4,cpu,unknown,2026-04-07,long,655,3,154.986000,61.930000,2.503000 -qwen3-tts,int4,cpu,unknown,2026-04-07,long,655,4,27.911000,12.940000,2.157000 -qwen3-tts,int4,cpu,unknown,2026-04-07,long,655,5,119.948000,50.500000,2.375000 +qwen3-tts,int4,cpu,Standard_NC4as_T4_v3,2026-04-07,short,66,1,7.882000,4.040000,1.950000 +qwen3-tts,int4,cpu,Standard_NC4as_T4_v3,2026-04-07,short,66,2,7.036000,3.580000,1.967000 +qwen3-tts,int4,cpu,Standard_NC4as_T4_v3,2026-04-07,short,66,3,7.033000,3.560000,1.974000 +qwen3-tts,int4,cpu,Standard_NC4as_T4_v3,2026-04-07,short,66,4,9.177000,4.440000,2.065000 +qwen3-tts,int4,cpu,Standard_NC4as_T4_v3,2026-04-07,short,66,5,8.204000,4.210000,1.949000 +qwen3-tts,int4,cpu,Standard_NC4as_T4_v3,2026-04-07,medium,243,1,46.086000,22.290000,2.068000 +qwen3-tts,int4,cpu,Standard_NC4as_T4_v3,2026-04-07,medium,243,2,45.984000,22.530000,2.041000 +qwen3-tts,int4,cpu,Standard_NC4as_T4_v3,2026-04-07,medium,243,3,37.957000,18.840000,2.015000 +qwen3-tts,int4,cpu,Standard_NC4as_T4_v3,2026-04-07,medium,243,4,38.513000,18.890000,2.039000 +qwen3-tts,int4,cpu,Standard_NC4as_T4_v3,2026-04-07,medium,243,5,41.841000,20.650000,2.026000 +qwen3-tts,int4,cpu,Standard_NC4as_T4_v3,2026-04-07,long,655,1,85.839000,37.480000,2.290000 +qwen3-tts,int4,cpu,Standard_NC4as_T4_v3,2026-04-07,long,655,2,122.740000,51.160000,2.399000 +qwen3-tts,int4,cpu,Standard_NC4as_T4_v3,2026-04-07,long,655,3,154.986000,61.930000,2.503000 +qwen3-tts,int4,cpu,Standard_NC4as_T4_v3,2026-04-07,long,655,4,27.911000,12.940000,2.157000 +qwen3-tts,int4,cpu,Standard_NC4as_T4_v3,2026-04-07,long,655,5,119.948000,50.500000,2.375000 diff --git a/scripts/update_bench_table.py b/scripts/update_bench_table.py index 2963535..0b69f3e 100644 --- a/scripts/update_bench_table.py +++ b/scripts/update_bench_table.py @@ -99,7 +99,7 @@ def 
group_rows(all_rows: list[dict]) -> list[tuple[dict, dict]]: def hardware_label(hw: str) -> str: - return {"t4": "T4", "a10g": "A10G", "unknown": "—"}.get(hw, hw.upper()) + return {"t4": "T4", "a10g": "A10G", "unknown": "—"}.get(hw, hw) def provider_label(pv: str) -> str: From d7bc1a67b8c10bdeffb2cb51a66ac045f135c0fd Mon Sep 17 00:00:00 2001 From: Eason WaveKat Date: Tue, 7 Apr 2026 23:25:41 +1200 Subject: [PATCH 37/43] feat: add CUDA int4 benchmark results for Standard_NC4as_T4_v3 Co-Authored-By: Claude Sonnet 4.6 --- README.md | 1 + bench/results/cuda-t4-int4.csv | 16 ++++++++++++++++ 2 files changed, 17 insertions(+) create mode 100644 bench/results/cuda-t4-int4.csv diff --git a/README.md b/README.md index 300e4d7..46a7b15 100644 --- a/README.md +++ b/README.md @@ -96,6 +96,7 @@ cargo run --example synthesize --features qwen3-tts -- --model-dir /path/to/mode | Backend | Precision | Provider | Hardware | RTF short | RTF medium | RTF long | Date | |---------|-----------|----------|----------|:-----------:|:-----------:|:-----------:|------| | qwen3-tts | int4 | CPU | Standard_NC4as_T4_v3 | 1.98 | 2.04 | 2.34 | 2026-04-07 | +| qwen3-tts | int4 | CUDA | Standard_NC4as_T4_v3 | **0.78** | **0.85** | 1.07 | 2026-04-07 | _RTF < 1.0 = faster-than-real-time. 
Lower is better._ _To update: run `make bench-csv-cuda` on target hardware, then commit `bench/results/`._ diff --git a/bench/results/cuda-t4-int4.csv b/bench/results/cuda-t4-int4.csv new file mode 100644 index 0000000..e0a111f --- /dev/null +++ b/bench/results/cuda-t4-int4.csv @@ -0,0 +1,16 @@ +backend,precision,provider,hardware,date,sample,chars,iteration,synth_secs,audio_secs,rtf +qwen3-tts,int4,cuda,Standard_NC4as_T4_v3,2026-04-07,short,66,1,3.181000,4.057792,0.783924 +qwen3-tts,int4,cuda,Standard_NC4as_T4_v3,2026-04-07,short,66,2,3.790286,4.806042,0.788650 +qwen3-tts,int4,cuda,Standard_NC4as_T4_v3,2026-04-07,short,66,3,3.114891,3.978583,0.782915 +qwen3-tts,int4,cuda,Standard_NC4as_T4_v3,2026-04-07,short,66,4,3.171773,4.053667,0.782446 +qwen3-tts,int4,cuda,Standard_NC4as_T4_v3,2026-04-07,short,66,5,3.472673,4.462417,0.778205 +qwen3-tts,int4,cuda,Standard_NC4as_T4_v3,2026-04-07,medium,243,1,19.550738,22.549375,0.867019 +qwen3-tts,int4,cuda,Standard_NC4as_T4_v3,2026-04-07,medium,243,2,16.825036,19.462083,0.864503 +qwen3-tts,int4,cuda,Standard_NC4as_T4_v3,2026-04-07,medium,243,3,15.813848,18.711875,0.845124 +qwen3-tts,int4,cuda,Standard_NC4as_T4_v3,2026-04-07,medium,243,4,16.724455,19.930542,0.839137 +qwen3-tts,int4,cuda,Standard_NC4as_T4_v3,2026-04-07,medium,243,5,17.720516,20.934250,0.846484 +qwen3-tts,int4,cuda,Standard_NC4as_T4_v3,2026-04-07,long,655,1,46.378647,43.818125,1.058435 +qwen3-tts,int4,cuda,Standard_NC4as_T4_v3,2026-04-07,long,655,2,53.422495,49.133000,1.087304 +qwen3-tts,int4,cuda,Standard_NC4as_T4_v3,2026-04-07,long,655,3,46.295702,44.013667,1.051848 +qwen3-tts,int4,cuda,Standard_NC4as_T4_v3,2026-04-07,long,655,4,48.479280,45.570792,1.063824 +qwen3-tts,int4,cuda,Standard_NC4as_T4_v3,2026-04-07,long,655,5,47.073837,44.205833,1.064878 From c4b2fbd9d0d442236748a62f34da44590440d5fe Mon Sep 17 00:00:00 2001 From: Eason WaveKat Date: Tue, 7 Apr 2026 23:28:15 +1200 Subject: [PATCH 38/43] chore: add update-readme Make target Co-Authored-By: Claude Sonnet 
4.6 --- Makefile | 5 ++++- 1 file changed, 4 insertions(+), 1 deletion(-) diff --git a/Makefile b/Makefile index b10e82c..50cb5bf 100644 --- a/Makefile +++ b/Makefile @@ -1,4 +1,4 @@ -.PHONY: help check test test-qwen3 test-all fmt clippy doc bench-rtf bench-rtf-cuda bench-rtf-trt bench-csv bench-csv-cuda bench-csv-trt +.PHONY: help check test test-qwen3 test-all fmt clippy doc bench-rtf bench-rtf-cuda bench-rtf-trt bench-csv bench-csv-cuda bench-csv-trt update-readme help: ## Show this help @grep -E '^[a-zA-Z_-]+:.*?## .*$$' $(MAKEFILE_LIST) | awk 'BEGIN {FS = ":.*?## "}; {printf " \033[36m%-14s\033[0m %s\n", $$1, $$2}' @@ -46,6 +46,9 @@ bench-csv-cuda: ## RTF benchmark on CUDA T4 (int4), save CSV to bench/results/ cargo run --release --example bench_rtf --features "qwen3-tts,cuda" -- \ --provider cuda --hardware t4 --csv > bench/results/cuda-t4-int4.csv +update-readme: ## Update README benchmark table from bench/results/*.csv + python3 scripts/update_bench_table.py + bench-csv-trt: ## RTF benchmark on TensorRT T4 (int4), save CSV to bench/results/ @mkdir -p bench/results cargo run --release --example bench_rtf --features "qwen3-tts,tensorrt" -- \ From 655216eaf0ca7c692d39c4cfd156d2f39768bdc9 Mon Sep 17 00:00:00 2001 From: Eason WaveKat Date: Tue, 7 Apr 2026 23:34:32 +1200 Subject: [PATCH 39/43] feat: rotate through text pool per iteration in bench_rtf Each iteration now uses a different sentence from the pool (short: 6 variants, medium: 5, long: 5), cycling if iterations exceed pool size. CSV chars column reports actual per-iteration length; summary table shows the average. 
Co-Authored-By: Claude Sonnet 4.6 --- crates/wavekat-tts/examples/bench_rtf.rs | 115 ++++++++++++++++++----- 1 file changed, 92 insertions(+), 23 deletions(-) diff --git a/crates/wavekat-tts/examples/bench_rtf.rs b/crates/wavekat-tts/examples/bench_rtf.rs index 6941afc..5e3c949 100644 --- a/crates/wavekat-tts/examples/bench_rtf.rs +++ b/crates/wavekat-tts/examples/bench_rtf.rs @@ -31,32 +31,93 @@ use wavekat_tts::{SynthesizeRequest, TtsBackend}; struct Sample { label: &'static str, - text: &'static str, + /// Pool of texts rotated across iterations so each run uses different content. + texts: &'static [&'static str], } const SAMPLES: &[Sample] = &[ Sample { label: "short", - text: "Hello, world! This is a quick test of the speech synthesis system.", + texts: &[ + "Hello, world! This is a quick test of the speech synthesis system.", + "The weather today is sunny with a high of twenty-three degrees Celsius.", + "Your package has been delivered to the front door of your building.", + "Please turn left in five hundred meters, then continue for two miles.", + "Your appointment is confirmed for Tuesday at three fifteen in the afternoon.", + "An update is available for your device. Would you like to install it now?", + ], }, Sample { label: "medium", - text: "The quick brown fox jumps over the lazy dog. \ - Speech synthesis has improved dramatically over the past few years. \ - Modern neural TTS systems can produce highly natural-sounding speech \ - that is nearly indistinguishable from human voice recordings.", + texts: &[ + "The quick brown fox jumps over the lazy dog. \ + Speech synthesis has improved dramatically over the past few years. \ + Modern neural TTS systems can produce highly natural-sounding speech \ + that is nearly indistinguishable from human voice recordings.", + "Scientists have confirmed that regular physical activity reduces the risk \ + of chronic conditions including heart disease and diabetes. 
\ + A thirty-minute walk each day can improve cardiovascular health \ + and support mental well-being across all age groups.", + "The global demand for renewable energy is accelerating as countries commit \ + to reducing carbon emissions. Solar and wind installations have grown rapidly \ + in recent years, making clean electricity more affordable than ever \ + for homes and businesses worldwide.", + "Advances in robotics are transforming manufacturing, logistics, and healthcare. \ + Modern robots can perform delicate surgical procedures, navigate warehouse \ + environments, and assist elderly patients with daily tasks, \ + often working alongside human colleagues.", + "The history of the internet stretches back to the nineteen sixties, \ + when researchers first connected computers across university campuses. \ + What began as a small academic network has grown into a global infrastructure \ + connecting billions of people and devices.", + ], }, Sample { label: "long", - text: "Artificial intelligence is transforming the way we interact with computers. \ - Voice interfaces powered by text-to-speech technology are now commonplace \ - in smartphones, smart speakers, and automotive systems. \ - The latest generation of neural TTS models uses transformer architectures \ - trained on thousands of hours of human speech to capture the subtle nuances \ - of natural spoken language, including prosody, rhythm, and intonation. \ - These models can generate high-quality audio at sample rates of twenty-four \ - kilohertz or higher, enabling crisp and clear voice output across a wide range \ - of applications from accessibility tools to interactive voice assistants.", + texts: &[ + "Artificial intelligence is transforming the way we interact with computers. \ + Voice interfaces powered by text-to-speech technology are now commonplace \ + in smartphones, smart speakers, and automotive systems. 
\ + The latest generation of neural TTS models uses transformer architectures \ + trained on thousands of hours of human speech to capture the subtle nuances \ + of natural spoken language, including prosody, rhythm, and intonation. \ + These models can generate high-quality audio at sample rates of twenty-four \ + kilohertz or higher, enabling crisp and clear voice output across a wide range \ + of applications from accessibility tools to interactive voice assistants.", + "Ocean exploration remains one of the most challenging frontiers in modern science. \ + More than eighty percent of the world's oceans have never been mapped, explored, \ + or studied in detail, leaving vast regions of our planet largely unknown. \ + Deep sea research vessels and remotely operated underwater vehicles are slowly \ + changing this picture, discovering new ecosystems, geological formations, \ + and species previously unknown to science. These discoveries have important \ + implications for medicine, materials science, and our understanding of how \ + life evolved on Earth and potentially on other worlds in the solar system.", + "Urban transportation networks are undergoing a fundamental transformation \ + driven by electrification, automation, and new mobility services. \ + Electric buses, trams, and bicycles are replacing fossil-fuel vehicles \ + in many cities, reducing air pollution and greenhouse gas emissions. \ + Ride-sharing platforms and micro-mobility services are changing how people \ + think about car ownership, particularly among younger generations who prefer \ + flexible access over the fixed costs of owning a vehicle. 
\ + City planners are reimagining streets to prioritize pedestrians and cyclists, \ + creating more livable environments while reducing traffic congestion.", + "The development of quantum computing promises to solve problems that are \ + intractable for classical computers, including simulating molecular interactions \ + for drug discovery and breaking certain cryptographic algorithms. \ + Current quantum processors must operate near absolute zero to maintain \ + the fragile quantum states that give them their computational power. \ + Researchers around the world are racing to build systems with enough \ + stable qubits to demonstrate a clear advantage over classical hardware \ + in real-world applications, a milestone often referred to as quantum advantage.", + "Throughout history, libraries have served as the guardians of human knowledge, \ + preserving texts and manuscripts that might otherwise have been lost to time. \ + The transition from physical collections to digital archives has dramatically \ + expanded access to information, allowing anyone with an internet connection \ + to read documents that were once available only to scholars. \ + Digitization projects at major institutions have made millions of books, \ + maps, and historical records freely available online, democratizing access \ + to cultural heritage and enabling new forms of research across disciplines.", + ], }, ]; @@ -186,12 +247,12 @@ fn main() { let mut summary: Vec<(&'static str, usize, Vec)> = Vec::new(); for sample in SAMPLES { - let request = SynthesizeRequest::new(sample.text) - .with_language(&language) - .with_instruction(&instruction); - - // Warmup runs (not counted). + // Warmup runs (not counted) — rotate through the text pool. 
for w in 0..warmup { + let text = sample.texts[w % sample.texts.len()]; + let request = SynthesizeRequest::new(text) + .with_language(&language) + .with_instruction(&instruction); eprint!( " [{:6}] warmup {}/{} ...\r", sample.label, @@ -204,9 +265,15 @@ fn main() { eprintln!(); } - // Measured runs. + // Measured runs — each iteration uses a different text from the pool. let mut runs = Vec::with_capacity(iterations); + let mut total_chars: usize = 0; for it in 0..iterations { + let text = sample.texts[it % sample.texts.len()]; + let request = SynthesizeRequest::new(text) + .with_language(&language) + .with_instruction(&instruction); + let t0 = Instant::now(); let audio = tts.synthesize(&request).expect("synthesis failed"); let synth_secs = t0.elapsed().as_secs_f64(); @@ -232,7 +299,7 @@ fn main() { hardware, date, sample.label, - sample.text.len(), + text.len(), it + 1, synth_secs, audio_secs, @@ -240,6 +307,7 @@ fn main() { ); } + total_chars += text.len(); runs.push(RunResult { synth_secs, audio_secs, @@ -247,7 +315,8 @@ fn main() { }); } - summary.push((sample.label, sample.text.len(), runs)); + let avg_chars = if iterations > 0 { total_chars / iterations } else { 0 }; + summary.push((sample.label, avg_chars, runs)); eprintln!(); } From 44052f525e178714c0f0cff74fff391df5cf790b Mon Sep 17 00:00:00 2001 From: Eason WaveKat Date: Tue, 7 Apr 2026 23:52:23 +1200 Subject: [PATCH 40/43] chore: add make ci target mirroring GitHub Actions Co-Authored-By: Claude Sonnet 4.6 --- Makefile | 12 +++++++++++- 1 file changed, 11 insertions(+), 1 deletion(-) diff --git a/Makefile b/Makefile index 50cb5bf..9364b7f 100644 --- a/Makefile +++ b/Makefile @@ -1,10 +1,20 @@ -.PHONY: help check test test-qwen3 test-all fmt clippy doc bench-rtf bench-rtf-cuda bench-rtf-trt bench-csv bench-csv-cuda bench-csv-trt update-readme +.PHONY: help check ci test test-qwen3 test-all fmt clippy doc bench-rtf bench-rtf-cuda bench-rtf-trt bench-csv bench-csv-cuda bench-csv-trt update-readme help: ## 
Show this help @grep -E '^[a-zA-Z_-]+:.*?## .*$$' $(MAKEFILE_LIST) | awk 'BEGIN {FS = ":.*?## "}; {printf " \033[36m%-14s\033[0m %s\n", $$1, $$2}' check: fmt clippy test ## Run fmt, clippy, and test (no features) +ci: ## Run the full GitHub Actions CI check locally + cargo fmt --all -- --check + cargo clippy --workspace -- -D warnings + cargo test --workspace + cargo doc --no-deps -p wavekat-tts --all-features + cargo test -p wavekat-tts --no-default-features --features "" + cargo test -p wavekat-tts --no-default-features --features "qwen3-tts" + cargo test -p wavekat-tts --no-default-features --features "cosyvoice" + cargo test -p wavekat-tts --no-default-features --features "qwen3-tts,cosyvoice" + fmt: ## Check formatting cargo fmt --all -- --check From b4cc122a43b9310808c0d08b60faf145d9bf87de Mon Sep 17 00:00:00 2001 From: Eason WaveKat Date: Tue, 7 Apr 2026 23:52:27 +1200 Subject: [PATCH 41/43] style: apply cargo fmt Co-Authored-By: Claude Sonnet 4.6 --- crates/wavekat-tts/examples/bench_rtf.rs | 25 ++++++++++++------- crates/wavekat-tts/examples/synthesize.rs | 14 +++++++---- .../src/backends/qwen3_tts/model.rs | 4 +-- 3 files changed, 26 insertions(+), 17 deletions(-) diff --git a/crates/wavekat-tts/examples/bench_rtf.rs b/crates/wavekat-tts/examples/bench_rtf.rs index 5e3c949..c5052f4 100644 --- a/crates/wavekat-tts/examples/bench_rtf.rs +++ b/crates/wavekat-tts/examples/bench_rtf.rs @@ -253,12 +253,7 @@ fn main() { let request = SynthesizeRequest::new(text) .with_language(&language) .with_instruction(&instruction); - eprint!( - " [{:6}] warmup {}/{} ...\r", - sample.label, - w + 1, - warmup - ); + eprint!(" [{:6}] warmup {}/{} ...\r", sample.label, w + 1, warmup); tts.synthesize(&request).expect("warmup synthesis failed"); } if warmup > 0 { @@ -315,7 +310,11 @@ fn main() { }); } - let avg_chars = if iterations > 0 { total_chars / iterations } else { 0 }; + let avg_chars = if iterations > 0 { + total_chars / iterations + } else { + 0 + }; 
summary.push((sample.label, avg_chars, runs)); eprintln!(); } @@ -370,8 +369,16 @@ fn print_table(summary: &[(&'static str, usize, Vec)]) { println!("{}", "=".repeat(w)); println!( "{:<8} {:>5} {:>7} {:>7} {:>7} {:>7} {:>7} {:>7} {:>8} {:>8}", - "sample", "chars", "rtf_mean", "rtf_std", "rtf_min", "rtf_p50", "rtf_p95", "rtf_max", - "audio_s", "synth_s" + "sample", + "chars", + "rtf_mean", + "rtf_std", + "rtf_min", + "rtf_p50", + "rtf_p95", + "rtf_max", + "audio_s", + "synth_s" ); println!("{}", "-".repeat(w)); diff --git a/crates/wavekat-tts/examples/synthesize.rs b/crates/wavekat-tts/examples/synthesize.rs index ded58cb..768efbe 100644 --- a/crates/wavekat-tts/examples/synthesize.rs +++ b/crates/wavekat-tts/examples/synthesize.rs @@ -69,10 +69,10 @@ fn main() { "--provider" => { i += 1; provider = match args[i].as_str() { - "cpu" => ExecutionProvider::Cpu, - "cuda" => ExecutionProvider::Cuda, + "cpu" => ExecutionProvider::Cpu, + "cuda" => ExecutionProvider::Cuda, "tensorrt" => ExecutionProvider::TensorRt, - "coreml" => ExecutionProvider::CoreMl, + "coreml" => ExecutionProvider::CoreMl, other => { eprintln!("error: unknown provider \"{other}\", expected cpu, cuda, tensorrt, or coreml"); std::process::exit(1); @@ -102,7 +102,9 @@ fn main() { eprintln!("Usage: synthesize [OPTIONS] [TEXT]"); eprintln!(" --model-dir Model directory (default: auto-download)"); eprintln!(" --precision Model precision: int4 (default) or fp32"); - eprintln!(" --provider Execution provider: cpu (default), cuda, tensorrt, coreml"); + eprintln!( + " --provider Execution provider: cpu (default), cuda, tensorrt, coreml" + ); eprintln!(" --language Language code (default: en)"); eprintln!(" --instruction Voice style instruction (VoiceDesign prompt)"); eprintln!(" Default: \"{DEFAULT_INSTRUCTION}\""); @@ -145,7 +147,9 @@ fn run_interactive( .flat_map(|v| v.languages) .collect(); - eprintln!("Interactive mode. 
Type text to synthesize, /help for commands, /quit or Ctrl-C to quit."); + eprintln!( + "Interactive mode. Type text to synthesize, /help for commands, /quit or Ctrl-C to quit." + ); eprintln!(" language={language} instruction=\"{instruction}\""); let stdin = io::stdin(); diff --git a/crates/wavekat-tts/src/backends/qwen3_tts/model.rs b/crates/wavekat-tts/src/backends/qwen3_tts/model.rs index 94993ea..41d064c 100644 --- a/crates/wavekat-tts/src/backends/qwen3_tts/model.rs +++ b/crates/wavekat-tts/src/backends/qwen3_tts/model.rs @@ -701,9 +701,7 @@ fn apply_execution_provider( match ep { super::ExecutionProvider::Cpu => Ok(builder), super::ExecutionProvider::Cuda => builder - .with_execution_providers([CUDAExecutionProvider::default() - .build() - .error_on_failure()]) + .with_execution_providers([CUDAExecutionProvider::default().build().error_on_failure()]) .map_err(|e| TtsError::Model(format!("CUDA execution provider error: {e}"))), super::ExecutionProvider::TensorRt => builder .with_execution_providers([TensorRTExecutionProvider::default() From c951985b54b70ff5c901fba64176a16818f46627 Mon Sep 17 00:00:00 2001 From: Eason WaveKat Date: Wed, 8 Apr 2026 07:55:37 +1200 Subject: [PATCH 42/43] fix: use is_multiple_of() in is_leap to satisfy clippy Co-Authored-By: Claude Sonnet 4.6 --- crates/wavekat-tts/examples/bench_rtf.rs | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/crates/wavekat-tts/examples/bench_rtf.rs b/crates/wavekat-tts/examples/bench_rtf.rs index c5052f4..010a0c0 100644 --- a/crates/wavekat-tts/examples/bench_rtf.rs +++ b/crates/wavekat-tts/examples/bench_rtf.rs @@ -477,5 +477,5 @@ fn today_iso() -> String { } fn is_leap(year: u32) -> bool { - (year % 4 == 0 && year % 100 != 0) || year % 400 == 0 + (year.is_multiple_of(4) && !year.is_multiple_of(100)) || year.is_multiple_of(400) } From 40d8c5781659300df29d877b3d15f73ce43e6630 Mon Sep 17 00:00:00 2001 From: Eason WaveKat Date: Wed, 8 Apr 2026 07:57:28 +1200 Subject: [PATCH 43/43] docs: 
mark CUDA provider doc as implemented Co-Authored-By: Claude Sonnet 4.6 --- docs/06-cuda-provider.md | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) diff --git a/docs/06-cuda-provider.md b/docs/06-cuda-provider.md index 9d311b8..6798f7e 100644 --- a/docs/06-cuda-provider.md +++ b/docs/06-cuda-provider.md @@ -2,8 +2,8 @@ ## Status -**In progress** — the `cuda` Cargo feature is being wired up. -CPU inference already works; this adds NVIDIA GPU acceleration via ORT's CUDA EP. +**Implemented** — `cuda`, `tensorrt`, and `coreml` Cargo features are available. +Select the provider at load time via `ModelConfig::with_execution_provider`. ## Goal