Skip to content
Merged
Show file tree
Hide file tree
Changes from all commits
Commits
File filter

Filter by extension

Filter by extension


Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
5 changes: 0 additions & 5 deletions CLAUDE.md
Original file line number Diff line number Diff line change
Expand Up @@ -32,8 +32,3 @@ make test-all # test all backends
use internal tokio runtimes.
4. **Feature flags per backend** — same pattern as wavekat-vad/turn.

## Pending wavekat-core change

`AudioFrame::from_owned(Vec<f32>, u32) -> AudioFrame<'static>` — avoids the
borrow-then-clone path when creating frames from producer-owned data (TTS output).
Currently uses `AudioFrame::new(slice, rate).into_owned()` as a workaround.
43 changes: 29 additions & 14 deletions README.md
Original file line number Diff line number Diff line change
Expand Up @@ -18,10 +18,10 @@ Same pattern as

## Backends

| Backend | Feature flag | License |
|---------|-------------|---------|
| [Qwen3-TTS](https://huggingface.co/Qwen/Qwen3-TTS) | `qwen3-tts` | Apache 2.0 |
| [CosyVoice](https://github.com/FunAudioLLM/CosyVoice) | `cosyvoice` | Apache 2.0 |
| Backend | Feature flag | Status | License |
|---------|-------------|--------|---------|
| [Qwen3-TTS](https://huggingface.co/Qwen/Qwen3-TTS) | `qwen3-tts` | ✅ Available | Apache 2.0 |
| [CosyVoice](https://github.com/FunAudioLLM/CosyVoice) | `cosyvoice` | 🚧 Planned | Apache 2.0 |

## Quick start

Expand All @@ -31,21 +31,33 @@ cargo add wavekat-tts --features qwen3-tts

```rust
use wavekat_tts::{TtsBackend, SynthesizeRequest};
use wavekat_tts::backends::qwen3_tts::Qwen3Tts;
use wavekat_tts::backends::qwen3_tts::{Qwen3Tts, ModelConfig, ModelPrecision, ExecutionProvider};

// Auto-downloads model files (~3.8 GB) on first run:
// Auto-downloads INT4 model files on first run, runs on CPU (default):
let tts = Qwen3Tts::new()?;

// Or load from an explicit directory:
// let tts = Qwen3Tts::from_dir("models/qwen3-tts-0.6b")?;
// Or FP32 on CPU:
// let tts = Qwen3Tts::from_config(ModelConfig::default().with_precision(ModelPrecision::Fp32))?;

let request = SynthesizeRequest::new("Hello, world");
// Or INT4 from a local directory on CUDA:
// let tts = Qwen3Tts::from_config(
// ModelConfig::default()
// .with_dir("models/qwen3-tts-1.7b")
// .with_execution_provider(ExecutionProvider::Cuda),
// )?;

let request = SynthesizeRequest::new("Hello, world")
.with_instruction("Speak naturally and clearly.");
let audio = tts.synthesize(&request)?;

// Save to WAV (wavekat-core includes WAV I/O via the `wav` feature):
audio.write_wav("output.wav")?;

println!("{}s at {} Hz", audio.duration_secs(), audio.sample_rate());
```

Model files are cached at `$WAVEKAT_MODEL_DIR` or `~/.cache/wavekat/qwen3-tts-0.6b/`.
Model files are cached by the HF Hub client at `$HF_HOME/hub/` (default `~/.cache/huggingface/hub/`).
Set `WAVEKAT_MODEL_DIR` to load from a local directory and skip all downloads.

All backends produce `AudioFrame<'static>` from [`wavekat-core`](https://github.com/wavekat/wavekat-core) — the same
type consumed by `wavekat-vad` and `wavekat-turn`.
Expand All @@ -72,17 +84,20 @@ Two trait families:
Generate a WAV file from text (model files are auto-downloaded on first run):

```sh
cargo run --example synthesize --features qwen3-tts,hound -- "Hello, world\!"
cargo run --example synthesize --features qwen3-tts,hound -- --language zh "你好世界"
cargo run --example synthesize --features qwen3-tts,hound -- --model-dir /path/to/model --output hello.wav "Hello"
cargo run --example synthesize --features qwen3-tts -- 'Hello, world!'
cargo run --example synthesize --features qwen3-tts -- --instruction "Speak in a warm, friendly tone." "Give every small business the voice of a big one."
cargo run --example synthesize --features qwen3-tts -- --precision fp32 "Hello"
cargo run --example synthesize --features qwen3-tts -- --model-dir /path/to/model --output hello.wav "Hello"
```

## Feature flags

| Flag | Default | Description |
|------|---------|-------------|
| `qwen3-tts` | off | Qwen3-TTS local ONNX inference |
| `cosyvoice` | off | CosyVoice local ONNX inference |
| `cosyvoice` | off | CosyVoice local ONNX inference (planned) |

WAV I/O (`write_wav` / `from_wav`) is provided by `wavekat-core` via its `wav` feature flag.

## License

Expand Down
12 changes: 4 additions & 8 deletions crates/wavekat-tts/Cargo.toml
Original file line number Diff line number Diff line change
Expand Up @@ -12,11 +12,11 @@ categories = ["multimedia::audio"]
default = []

# Local inference backends (all ONNX-based)
qwen3-tts = ["dep:ort", "dep:ndarray", "dep:tokenizers", "dep:npyz", "dep:rand", "dep:ureq"]
qwen3-tts = ["dep:ort", "dep:ndarray", "dep:tokenizers", "dep:npyz", "dep:rand", "dep:hf-hub"]
cosyvoice = ["dep:ort", "dep:ndarray"]

[dependencies]
wavekat-core = "0.0.3"
wavekat-core = { version = "0.0.5", features = ["wav"] }
thiserror = "2"
serde = { version = "1", features = ["derive"] }
serde_json = "1"
Expand All @@ -27,12 +27,8 @@ ndarray = { version = "0.17", optional = true }
tokenizers = { version = "0.21", optional = true, default-features = false, features = ["onig"] }
npyz = { version = "0.8", optional = true }
rand = { version = "0.9", optional = true }
ureq = { version = "2", optional = true }
hound = { version = "3.5", optional = true }

[dev-dependencies]
hound = "3.5"
hf-hub = { version = "0.5", optional = true, default-features = false, features = ["ureq"] }

[[example]]
name = "synthesize"
required-features = ["qwen3-tts", "hound"]
required-features = ["qwen3-tts"]
178 changes: 139 additions & 39 deletions crates/wavekat-tts/examples/synthesize.rs
Original file line number Diff line number Diff line change
@@ -1,29 +1,46 @@
//! Synthesize text to a WAV file using Qwen3-TTS.
//!
//! Usage:
//! cargo run --example synthesize --features qwen3-tts,hound -- [OPTIONS] [TEXT]
//! cargo run --example synthesize --features qwen3-tts -- [OPTIONS] [TEXT]
//!
//! Options:
//! --model-dir <PATH> Model directory (default: auto-download to cache)
//! --language <LANG> Language code (default: en)
//! --output <PATH> Output WAV path (default: output.wav)
//! -i, --interactive Interactive mode: keep model loaded, read text from stdin
//! --model-dir <PATH> Model directory (default: auto-download to cache)
//! --precision <PREC> Model precision: int4 (default) or fp32
//! --language <LANG> Language code (default: en)
//! --instruction <TEXT> Voice style instruction (VoiceDesign prompt)
//! Default: "Speak naturally and clearly."
//! --output <PATH> Output WAV path (default: output.wav)
//! -i, --interactive Interactive mode: keep model loaded, read text from stdin
//!
//! Interactive commands (prefix with /):
//! /lang <code> Switch language (e.g. /lang ja)
//! /langs List supported language codes
//! /instruct <text> Change voice instruction (e.g. /instruct Speak slowly.)
//! /instruct Reset instruction to default
//! /status Show current settings
//! /help Show this command list
//! Empty line or Ctrl-D Quit
//!
//! Example:
//! cargo run --example synthesize --features qwen3-tts,hound -- "Hello, world!"
//! cargo run --example synthesize --features qwen3-tts,hound -- -i
//! cargo run --example synthesize --features qwen3-tts -- "Hello, world!"
//! cargo run --example synthesize --features qwen3-tts -- -i
//! cargo run --example synthesize --features qwen3-tts -- --precision fp32 -i

use std::io::{self, BufRead, Write};
use std::path::PathBuf;

use wavekat_tts::backends::qwen3_tts::Qwen3Tts;
use wavekat_tts::backends::qwen3_tts::{ModelConfig, ModelPrecision, Qwen3Tts};
use wavekat_tts::{SynthesizeRequest, TtsBackend};

const DEFAULT_INSTRUCTION: &str = "Speak naturally and clearly.";

fn main() {
let args: Vec<String> = std::env::args().skip(1).collect();

let mut model_dir: Option<PathBuf> = None;
let mut precision = ModelPrecision::Int4;
let mut language = "en".to_string();
let mut instruction: Option<String> = None;
let mut output = PathBuf::from("output.wav");
let mut interactive = false;
let mut text_parts: Vec<String> = Vec::new();
Expand All @@ -35,10 +52,25 @@ fn main() {
i += 1;
model_dir = Some(PathBuf::from(&args[i]));
}
"--precision" => {
i += 1;
precision = match args[i].as_str() {
"int4" => ModelPrecision::Int4,
"fp32" => ModelPrecision::Fp32,
other => {
eprintln!("error: unknown precision \"{other}\", expected int4 or fp32");
std::process::exit(1);
}
};
}
"--language" => {
i += 1;
language = args[i].clone();
}
"--instruction" => {
i += 1;
instruction = Some(args[i].clone());
}
"--output" => {
i += 1;
output = PathBuf::from(&args[i]);
Expand All @@ -52,28 +84,51 @@ fn main() {
let text = text_parts.join(" ");
if text.is_empty() && !interactive {
eprintln!("Usage: synthesize [OPTIONS] [TEXT]");
eprintln!(" --model-dir <PATH> Model directory (default: auto-download)");
eprintln!(" --language <LANG> Language code (default: en)");
eprintln!(" --output <PATH> Output WAV path (default: output.wav)");
eprintln!(" -i, --interactive Interactive mode (read from stdin)");
eprintln!(" --model-dir <PATH> Model directory (default: auto-download)");
eprintln!(" --precision <PREC> Model precision: int4 (default) or fp32");
eprintln!(" --language <LANG> Language code (default: en)");
eprintln!(" --instruction <TEXT> Voice style instruction (VoiceDesign prompt)");
eprintln!(" Default: \"{DEFAULT_INSTRUCTION}\"");
eprintln!(" --output <PATH> Output WAV path (default: output.wav)");
eprintln!(" -i, --interactive Interactive mode (read from stdin)");
std::process::exit(1);
}

if instruction.is_none() {
eprintln!("note: no --instruction given, using default: \"{DEFAULT_INSTRUCTION}\"");
instruction = Some(DEFAULT_INSTRUCTION.to_string());
}

eprintln!("Loading model ...");
let tts = match model_dir {
Some(dir) => Qwen3Tts::from_dir(dir).expect("failed to load model"),
None => Qwen3Tts::new().expect("failed to load model"),
};
let mut config = ModelConfig::default().with_precision(precision);
if let Some(dir) = model_dir {
config = config.with_dir(dir);
}
let tts = Qwen3Tts::from_config(config).expect("failed to load model");

if interactive {
run_interactive(&tts, &language, &output);
run_interactive(&tts, language, instruction.unwrap(), &output);
} else {
synthesize_one(&tts, &text, &language, &output);
synthesize_one(&tts, &text, &language, instruction.as_deref(), &output);
}
}

fn run_interactive(tts: &Qwen3Tts, language: &str, default_output: &PathBuf) {
eprintln!("Interactive mode. Type text to synthesize, empty line to quit.");
fn run_interactive(
tts: &Qwen3Tts,
mut language: String,
mut instruction: String,
default_output: &PathBuf,
) {
let supported_langs: Vec<String> = tts
.voices()
.unwrap_or_default()
.into_iter()
.flat_map(|v| v.languages)
.collect();

eprintln!("Interactive mode. Type text to synthesize, /help for commands, empty line to quit.");
eprintln!(" language={language} instruction=\"{instruction}\"");

let stdin = io::stdin();
let mut count = 0u32;

Expand All @@ -85,32 +140,88 @@ fn run_interactive(tts: &Qwen3Tts, language: &str, default_output: &PathBuf) {
if stdin.lock().read_line(&mut line).unwrap_or(0) == 0 {
break;
}
let text = line.trim();
if text.is_empty() {
let input = line.trim();
if input.is_empty() {
break;
}

if let Some(rest) = input.strip_prefix('/') {
let (cmd, arg) = rest
.split_once(' ')
.map_or((rest, ""), |(c, a)| (c, a.trim()));
match cmd {
"lang" | "language" => {
if arg.is_empty() {
eprintln!("usage: /lang <code> — type /langs to list supported codes");
} else if !supported_langs.iter().any(|l| l == arg) {
eprintln!("unsupported language: \"{arg}\"");
eprintln!("supported: {}", supported_langs.join(", "));
} else {
language = arg.to_string();
eprintln!("language set to: {language}");
}
}
"langs" | "languages" => {
eprintln!("supported languages: {}", supported_langs.join(", "));
}
"instruct" | "instruction" => {
if arg.is_empty() {
instruction = DEFAULT_INSTRUCTION.to_string();
eprintln!("instruction reset to default: \"{instruction}\"");
} else {
instruction = arg.to_string();
eprintln!("instruction set to: \"{instruction}\"");
}
}
"status" => {
eprintln!(" language={language}");
eprintln!(" instruction=\"{instruction}\"");
eprintln!(" supported languages: {}", supported_langs.join(", "));
}
"help" => {
eprintln!(" /lang <code> Switch language");
eprintln!(" /langs List supported language codes");
eprintln!(" /instruct <text> Change voice instruction");
eprintln!(" /instruct Reset instruction to default");
eprintln!(" /status Show current settings");
eprintln!(" /help Show this help");
eprintln!(" Empty line Quit");
}
other => eprintln!("unknown command: /{other} (type /help for commands)"),
}
continue;
}

count += 1;
let output = if *default_output == PathBuf::from("output.wav") {
let output = if default_output == std::path::Path::new("output.wav") {
PathBuf::from(format!("output_{count:03}.wav"))
} else {
default_output.clone()
};

synthesize_one(tts, text, language, &output);
synthesize_one(tts, input, &language, Some(&instruction), &output);
}
}

fn synthesize_one(tts: &Qwen3Tts, text: &str, language: &str, output: &PathBuf) {
let request = SynthesizeRequest::new(text).with_language(language);
fn synthesize_one(
tts: &Qwen3Tts,
text: &str,
language: &str,
instruction: Option<&str>,
output: &PathBuf,
) {
let mut request = SynthesizeRequest::new(text).with_language(language);
if let Some(instr) = instruction {
request = request.with_instruction(instr);
}

eprintln!("Synthesizing: \"{text}\" (language={language})");
let start = std::time::Instant::now();
let audio = tts.synthesize(&request).expect("synthesis failed");
let elapsed = start.elapsed();

let duration = audio.duration_secs();
let rtf = elapsed.as_secs_f64() / duration as f64;
let rtf = elapsed.as_secs_f64() / duration;

eprintln!(
"Generated {} samples at {} Hz ({:.2}s) in {:.2}s (RTF: {:.2})",
Expand All @@ -121,18 +232,7 @@ fn synthesize_one(tts: &Qwen3Tts, text: &str, language: &str, output: &PathBuf)
rtf,
);

// Write WAV
let spec = hound::WavSpec {
channels: 1,
sample_rate: audio.sample_rate(),
bits_per_sample: 32,
sample_format: hound::SampleFormat::Float,
};
let mut writer = hound::WavWriter::create(output, spec).expect("failed to create WAV file");
for &sample in audio.samples() {
writer.write_sample(sample).expect("failed to write sample");
}
writer.finalize().expect("failed to finalize WAV");
audio.write_wav(output).expect("failed to write WAV");

eprintln!("Wrote {}", output.display());
}
Loading