Skip to content
Open
Show file tree
Hide file tree
Changes from all commits
Commits
File filter

Filter by extension

Filter by extension


Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
5 changes: 4 additions & 1 deletion .gitignore
Original file line number Diff line number Diff line change
Expand Up @@ -241,4 +241,7 @@ marimo/_lsp/
__marimo__/

# Streamlit
.streamlit/secrets.toml
.streamlit/secrets.toml


kitten-tts-mini-0.8/
3 changes: 3 additions & 0 deletions .gitmodules
Original file line number Diff line number Diff line change
@@ -0,0 +1,3 @@
[submodule "kittentts-rs/crates/espeak-ng-sys/espeak-ng"]
path = kittentts-rs/crates/espeak-ng-sys/espeak-ng
url = https://github.com/espeak-ng/espeak-ng
7 changes: 7 additions & 0 deletions kittentts-rs/.gitignore
Original file line number Diff line number Diff line change
@@ -0,0 +1,7 @@
/target/
Cargo.lock
espeak-ng.dll
*.wav
*.onnx
*.npz
.DS_Store
20 changes: 20 additions & 0 deletions kittentts-rs/Cargo.toml
Original file line number Diff line number Diff line change
@@ -0,0 +1,20 @@
[package]
name = "kittentts-rs"
version = "0.1.0"
edition = "2021"
description = "Ultra-lightweight text-to-speech inference in Rust — port of KittenTTS"
license = "MIT"

[dependencies]
ort = { version = "2.0.0-rc.11", features = ["ndarray"] }
ndarray = "0.16"
ndarray-npy = { version = "0.9", features = ["npz"] }
hound = "3.5"
serde = { version = "1.0", features = ["derive"] }
serde_json = "1.0"
regex = "1.10"
clap = { version = "4.4", features = ["derive"] }
anyhow = "1.0"
once_cell = "1.19"
fancy-regex = "0.13"
espeak-ng-sys = { path = "crates/espeak-ng-sys" }
43 changes: 43 additions & 0 deletions kittentts-rs/README.md
Original file line number Diff line number Diff line change
@@ -0,0 +1,43 @@
# KittenTTS-RS

Ultra-lightweight text-to-speech inference in Rust using ONNX Runtime and eSpeak-NG.

This is a Rust port of the KittenTTS project.

## Features

- **Fast & Efficient**: Low-latency synthesis via ONNX Runtime (`ort`).
- **High Quality**: Accurate pronunciation using IPA phonemes via eSpeak-NG.
- **Self-Contained**: eSpeak-NG is built-in as a submodule—no system installation required.
- **Configurable**: Multiple voices, speed control, and text preprocessing.

## Prerequisites

- **Rust**: [Install Rust](https://www.rust-lang.org/tools/install)
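- **Build tools**: CMake, a C/C++ compiler, and libclang (required by `bindgen`) — the bundled eSpeak-NG sources are compiled as part of `cargo build`.
- **Submodules**: Fetch the eSpeak-NG sources with `git submodule update --init --recursive` before building.
- **ONNX Model**: A directory containing `config.json`, the `.onnx` model file, and `voices.npz`.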

## Running Locally

```bash
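# download the model weights (ONNX) into the repository root
git clone https://huggingface.co/KittenML/kitten-tts-mini-0.8

# run from the crate directory so that "../kitten-tts-mini-0.8" resolves to the clone above
cd kittentts-rs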

cargo run -- \
--text "Hello, I am KittenTTS-RS!" \
--model-dir "../kitten-tts-mini-0.8" \
--voice "Luna" \
--output "output.wav"
```

### Options

- `--text`: Text to synthesize.
- `--model-dir`: Directory containing ONNX model files.
- `--voice`: Voice name (e.g., "Luna", "Leo").
- `--speed`: Speech speed (default: 1.0).
- `--output`: Output file path (default: `output.wav`).
- `--no-clean`: Skip text preprocessing.

---

*Coded with ❤️ by Antigravity.*
21 changes: 21 additions & 0 deletions kittentts-rs/crates/espeak-ng-sys/Cargo.toml
Original file line number Diff line number Diff line change
@@ -0,0 +1,21 @@
[package]
name = "espeak-ng-sys"
version = "0.1.0"
edition = "2021"
links = "espeak-ng"
include = [
"src/**/*",
"espeak-ng/espeak-ng-data/**/*",
"espeak-ng/dictsource/**/*",
"build.rs",
"wrapper.h",
]

[dependencies]
# Piper uses these for path resolution
once_cell = "1.19"

[build-dependencies]
cmake = "0.1"
bindgen = "0.70"
glob = "0.3"
68 changes: 68 additions & 0 deletions kittentts-rs/crates/espeak-ng-sys/build.rs
Original file line number Diff line number Diff line change
@@ -0,0 +1,68 @@
use std::env;
use std::path::PathBuf;

/// Build script for `espeak-ng-sys`.
///
/// Compiles the vendored `espeak-ng` sources as static libraries via CMake,
/// emits the Cargo link directives for them, and generates Rust bindings from
/// `wrapper.h` into `$OUT_DIR/bindings.rs`.
///
/// # Panics
///
/// Panics if binding generation fails, if `OUT_DIR` is not set, or if the
/// bindings file cannot be written.
fn main() {
    // A build script is compiled for and executed on the *host*, so
    // `cfg!(target_os = ...)` would describe the host machine. Cargo exposes the
    // real compilation target through `CARGO_CFG_TARGET_OS`, which keeps the link
    // directives correct when cross-compiling. Fall back to the host OS when the
    // variable is absent (i.e. when not invoked by Cargo).
    let target_os =
        env::var("CARGO_CFG_TARGET_OS").unwrap_or_else(|_| env::consts::OS.to_string());

    let dst = cmake::Config::new("espeak-ng")
        .define("BUILD_SHARED_LIBS", "OFF")
        .define("USE_LIBPCAUDIO", "OFF")
        .define("USE_KLATT", "OFF")
        .define("USE_MBROLA", "OFF")
        .define("USE_ASYNC", "OFF")
        .define("ENABLE_TESTS", "OFF")
        .build();

    let profile = env::var("PROFILE").unwrap_or_else(|_| "debug".to_string());
    let cmake_config = if profile == "release" {
        "Release"
    } else {
        "Debug"
    };

    // espeak-ng builds sub-libraries in specific subdirectories
    let mut search_paths = vec![
        dst.join("lib"),
        dst.join("build/src/ucd-tools"),
        dst.join("build/src/speechPlayer"),
    ];

    if target_os == "windows" {
        // MSVC adds a config-specific subdirectory
        search_paths.push(dst.join("build/src/ucd-tools").join(cmake_config));
        search_paths.push(dst.join("build/src/speechPlayer").join(cmake_config));
        // Some versions might put espeak-ng.lib in lib/Debug too
        search_paths.push(dst.join("lib").join(cmake_config));
    }

    // Only advertise directories that the CMake build actually produced.
    for path in &search_paths {
        if path.exists() {
            println!("cargo:rustc-link-search=native={}", path.display());
        }
    }

    println!("cargo:rustc-link-lib=static=espeak-ng");
    println!("cargo:rustc-link-lib=static=ucd");
    // Piper links speechPlayer too
    println!("cargo:rustc-link-lib=static=speechPlayer");

    // Platform libraries needed by the static eSpeak-NG build on each target.
    match target_os.as_str() {
        "windows" => {
            println!("cargo:rustc-link-lib=dylib=user32");
            println!("cargo:rustc-link-lib=dylib=shell32");
        }
        "linux" => {
            println!("cargo:rustc-link-lib=dylib=stdc++");
        }
        "macos" => {
            println!("cargo:rustc-link-lib=framework=Foundation");
            println!("cargo:rustc-link-lib=dylib=c++");
        }
        _ => {}
    }

    let bindings = bindgen::Builder::default()
        .header("wrapper.h")
        .clang_arg("-Iespeak-ng/src/include")
        .parse_callbacks(Box::new(bindgen::CargoCallbacks::new()))
        .generate()
        .expect("Unable to generate bindings");

    let out_path = PathBuf::from(env::var("OUT_DIR").unwrap());
    bindings
        .write_to_file(out_path.join("bindings.rs"))
        .expect("Couldn't write bindings!");
}
1 change: 1 addition & 0 deletions kittentts-rs/crates/espeak-ng-sys/espeak-ng
Submodule espeak-ng added at c204e6
5 changes: 5 additions & 0 deletions kittentts-rs/crates/espeak-ng-sys/src/lib.rs
Original file line number Diff line number Diff line change
@@ -0,0 +1,5 @@
// The included bindings are generated by bindgen from C headers, so the Rust
// naming-convention lints are silenced for the whole crate.
#![allow(non_upper_case_globals)]
#![allow(non_camel_case_types)]
#![allow(non_snake_case)]

// Pull in the `bindings.rs` file that the build script writes into `OUT_DIR`.
include!(concat!(env!("OUT_DIR"), "/bindings.rs"));
2 changes: 2 additions & 0 deletions kittentts-rs/crates/espeak-ng-sys/wrapper.h
Original file line number Diff line number Diff line change
@@ -0,0 +1,2 @@
#include "espeak-ng/speak_lib.h"
#include "espeak-ng/espeak_ng.h"
31 changes: 31 additions & 0 deletions kittentts-rs/src/config.rs
Original file line number Diff line number Diff line change
@@ -0,0 +1,31 @@
use serde::Deserialize;
use std::collections::HashMap;
use std::path::Path;

/// Model configuration matching the JSON config.json format.
///
/// Deserialized with serde; `speed_priors` and `voice_aliases` are optional in
/// the JSON, every other field is required.
#[derive(Debug, Deserialize)]
pub struct ModelConfig {
    /// Model name (free-form string as far as this type is concerned).
    pub name: String,
    /// Model version string.
    pub version: String,
    /// Model format tag, stored under the JSON key `type`.
    /// `ModelConfig::load` only accepts `"ONNX1"` and `"ONNX2"`.
    #[serde(rename = "type")]
    pub model_type: String,
    /// NOTE(review): presumably a model identifier — confirm against a real config.json.
    pub model: String,
    /// NOTE(review): presumably the voices archive (e.g. `voices.npz`) — confirm against the loader.
    pub voices: String,
    /// NOTE(review): presumably the file name of the `.onnx` model — confirm against the loader.
    pub model_file: String,
    /// Optional map of names to `f32` values; empty when the key is absent.
    /// NOTE(review): presumably per-voice speed factors — confirm against the model code.
    #[serde(default)]
    pub speed_priors: HashMap<String, f32>,
    /// Optional string-to-string map; empty when the key is absent.
    /// NOTE(review): presumably maps user-facing voice names to internal voice keys — confirm.
    #[serde(default)]
    pub voice_aliases: HashMap<String, String>,
}

impl ModelConfig {
    /// Reads the JSON file at `path`, deserializes it into a [`ModelConfig`],
    /// and checks that its model type is one this crate supports.
    ///
    /// # Errors
    ///
    /// Fails when the file cannot be read, when its contents are not valid
    /// JSON for this struct, or when `type` is neither `"ONNX1"` nor `"ONNX2"`.
    pub fn load(path: &Path) -> anyhow::Result<Self> {
        let raw = std::fs::read_to_string(path)?;
        let parsed = serde_json::from_str::<ModelConfig>(&raw)?;

        // Guard clause: reject anything other than the two known ONNX formats.
        let supported = matches!(parsed.model_type.as_str(), "ONNX1" | "ONNX2");
        if !supported {
            anyhow::bail!("Unsupported model type: {}", parsed.model_type);
        }

        Ok(parsed)
    }
}
87 changes: 87 additions & 0 deletions kittentts-rs/src/espeak.rs
Original file line number Diff line number Diff line change
@@ -0,0 +1,87 @@
use espeak_ng_sys as espeak_ng;
use std::ffi::{CStr, CString};
use std::ptr;

/// Handle to an initialised eSpeak-NG library.
///
/// Carries no data of its own: it is created by `Espeak::new`, which calls
/// `espeak_Initialize`, and its `Drop` implementation calls `espeak_Terminate`.
pub struct Espeak {
    // Private unit field so the struct cannot be built with a literal outside
    // this module; construction has to go through `Espeak::new`.
    _private: (),
}

impl Espeak {
    /// Initialises eSpeak-NG with the `AUDIO_OUTPUT_RETRIEVAL` output mode.
    ///
    /// `data_path` is forwarded to `espeak_Initialize` as its data-path argument;
    /// when it is `None` a null pointer is passed instead.
    ///
    /// # Errors
    ///
    /// Returns an error if `data_path` contains an interior NUL byte, or if
    /// `espeak_Initialize` returns a value that is not a positive sample rate.
    pub fn new(data_path: Option<&str>) -> anyhow::Result<Self> {
        // Propagate an interior-NUL error instead of panicking on caller input,
        // matching how `set_voice` and `text_to_phonemes` treat their arguments.
        let c_path = data_path.map(CString::new).transpose()?;

        // `c_path` stays alive until the end of this function, so the pointer
        // remains valid for the duration of the FFI call below.
        let path_ptr = c_path.as_ref().map_or(ptr::null(), |s| s.as_ptr());

        // output = Retrieval, buflength = 0, options = 0
        // SAFETY: `path_ptr` is either null or points to a NUL-terminated string
        // owned by `c_path`, which outlives the call.
        let sample_rate = unsafe {
            espeak_ng::espeak_Initialize(
                espeak_ng::espeak_AUDIO_OUTPUT_AUDIO_OUTPUT_RETRIEVAL,
                0,
                path_ptr,
                0,
            )
        };

        if sample_rate <= 0 {
            anyhow::bail!("Failed to initialize eSpeak-NG (returned {})", sample_rate);
        }

        Ok(Espeak { _private: () })
    }

    /// Selects the eSpeak-NG voice called `name`.
    ///
    /// # Errors
    ///
    /// Returns an error if `name` contains an interior NUL byte or if
    /// `espeak_SetVoiceByName` does not return `EE_OK`.
    pub fn set_voice(&self, name: &str) -> anyhow::Result<()> {
        let c_name = CString::new(name)?;
        // SAFETY: `c_name` is a valid NUL-terminated string that outlives the call.
        let result = unsafe { espeak_ng::espeak_SetVoiceByName(c_name.as_ptr()) };
        if result == espeak_ng::espeak_ERROR_EE_OK {
            Ok(())
        } else {
            anyhow::bail!("Failed to set eSpeak voice to {}", name)
        }
    }

    /// Converts `text` into a phoneme string.
    ///
    /// `espeak_TextToPhonemes` is called repeatedly; each call advances the text
    /// pointer, and the loop stops when the pointer becomes null, reaches the
    /// terminating NUL, or fails to advance. The returned chunks are concatenated,
    /// with a single space inserted between two chunks when neither side already
    /// provides one.
    ///
    /// # Errors
    ///
    /// Returns an error if `text` contains an interior NUL byte or if no
    /// phonemes were produced at all.
    pub fn text_to_phonemes(&self, text: &str) -> anyhow::Result<String> {
        let c_text = CString::new(text)?;
        let mut text_ptr = c_text.as_ptr() as *const std::os::raw::c_char;
        let mut all_phonemes = String::new();

        // SAFETY (loop condition): `text_ptr` is checked for null first and
        // otherwise points into the NUL-terminated buffer owned by `c_text`.
        // NOTE(review): this relies on eSpeak-NG only ever advancing the pointer
        // within that buffer — confirm against the eSpeak-NG API documentation.
        while !text_ptr.is_null() && unsafe { *text_ptr != 0 } {
            let initial_ptr = text_ptr;
            let mut current_ptr = text_ptr as *const std::os::raw::c_void;

            // textmode 1 = UTF8, phonememode 66 = IPA + Punctuation
            // SAFETY: `current_ptr` points to valid NUL-terminated text, and the
            // `&mut` reference is valid for the duration of the call.
            let result_ptr = unsafe { espeak_ng::espeak_TextToPhonemes(&mut current_ptr, 1, 66) };

            if !result_ptr.is_null() {
                // SAFETY: `result_ptr` is non-null.
                // NOTE(review): assumes eSpeak-NG returns a NUL-terminated string
                // that stays valid until the next call — confirm. The contents
                // are copied into `all_phonemes` before the next call is made.
                let c_str = unsafe { CStr::from_ptr(result_ptr) };
                let phonemes = c_str.to_string_lossy();
                if !all_phonemes.is_empty()
                    && !all_phonemes.ends_with(' ')
                    && !phonemes.starts_with(' ')
                {
                    all_phonemes.push(' ');
                }
                all_phonemes.push_str(&phonemes);
            }

            text_ptr = current_ptr as *const std::os::raw::c_char;

            // Stop when the input is exhausted, or when no progress was made
            // (which would otherwise loop forever).
            if text_ptr.is_null() || text_ptr == initial_ptr {
                break;
            }
        }

        if all_phonemes.is_empty() {
            anyhow::bail!("eSpeak failed to generate phonemes");
        }

        Ok(all_phonemes)
    }
}

impl Drop for Espeak {
    /// Shuts the eSpeak-NG library down when the handle goes out of scope.
    fn drop(&mut self) {
        // SAFETY: plain FFI call without arguments. An `Espeak` value is only
        // handed out by `Espeak::new` after `espeak_Initialize` succeeded.
        // The status code returned by the call is intentionally discarded.
        let _ = unsafe { espeak_ng::espeak_Terminate() };
    }
}
15 changes: 15 additions & 0 deletions kittentts-rs/src/lib.rs
Original file line number Diff line number Diff line change
@@ -0,0 +1,15 @@
//! # kittentts-rs
//!
//! Ultra-lightweight text-to-speech inference in Rust.
//!
//! A Rust port of the [KittenTTS](https://github.com/kittenml/kittentts) Python
//! package, using `ort` for ONNX Runtime inference and eSpeak-NG (via the
//! bundled `espeak-ng-sys` crate) for phonemisation.

pub mod config;
pub mod espeak;
pub mod model;
pub mod preprocess;
pub mod text_cleaner;

// Re-export the main model type for convenience.
pub use model::KittenTTSModel;
Loading