diff --git a/.dockerignore b/.dockerignore new file mode 100644 index 0000000..85eacda --- /dev/null +++ b/.dockerignore @@ -0,0 +1,41 @@ +__pycache__/ +*.py[cod] +*$py.class +*.so + +.git/ +.gitignore +.github/ + +venv/ +.venv/ +env/ +.env + +*.egg-info/ +dist/ +build/ +.eggs/ + +*.wav +*.mp3 +*.ogg + +models/ +cache/ +.huggingface/ + +.idea/ +.vscode/ +*.swp +*.swo + +.pytest_cache/ +.mypy_cache/ +.coverage +htmlcov/ + +.DS_Store +Thumbs.db + +*.log diff --git a/AGENTS.md b/AGENTS.md new file mode 100644 index 0000000..4872864 --- /dev/null +++ b/AGENTS.md @@ -0,0 +1,290 @@ +# KittenTTS Project Guide + +This document provides essential information for AI coding agents working on the KittenTTS project. + +## Project Overview + +KittenTTS is an open-source, ultra-lightweight text-to-speech (TTS) model designed for CPU-optimized, high-quality voice synthesis without requiring a GPU. The project provides both a Python library and a web interface. + +**Key Characteristics:** +- Model sizes range from 15M to 80M parameters +- ONNX-based inference for cross-platform compatibility +- Models downloaded from Hugging Face at runtime (not bundled) +- 8 distinct voices with speed control support +- Target: Real-time speech synthesis on consumer hardware + +**Available Models:** +| Model | Params | Size | HuggingFace Repo | +|-------|--------|------|------------------| +| kitten-tts-mini | 80M | 80MB | KittenML/kitten-tts-mini-0.8 | +| kitten-tts-micro | 40M | 41MB | KittenML/kitten-tts-micro-0.8 | +| kitten-tts-nano | 15M | 56MB | KittenML/kitten-tts-nano-0.8-fp32 | +| kitten-tts-nano-int8 | 15M | 19MB | KittenML/kitten-tts-nano-0.8-int8 | + +**Available Voices:** Bella, Jasper, Luna, Bruno, Rosie, Hugo, Kiki, Leo (4 male, 4 female) + +## Technology Stack + +**Core Dependencies:** +- Python 3.8+ (recommended 3.12) +- `onnxruntime` - Model inference engine +- `phonemizer` + `espeak-ng` - Text-to-phoneme conversion +- `misaki[en]` - English text processing +- `spacy` - NLP 
processing +- `soundfile` - Audio I/O +- `huggingface_hub` - Model downloading + +**WebUI Dependencies:** +- `fastapi` - Web framework +- `uvicorn` - ASGI server +- `python-multipart` - Form parsing + +**Build System:** +- `setuptools` with `pyproject.toml` (primary) and legacy `setup.py` +- `ruff` for linting (cache directory `.ruff_cache/` present) + +## Project Structure + +``` +. +├── kittentts/ # Core library package +│ ├── __init__.py # Package exports (KittenTTS, get_model) +│ ├── __index__.py # Legacy exports +│ ├── get_model.py # Model download & main KittenTTS class +│ ├── onnx_model.py # ONNX inference engine (KittenTTS_1_Onnx) +│ └── preprocess.py # Text preprocessing pipeline +│ +├── webui/ # Web interface +│ ├── __init__.py +│ ├── server.py # FastAPI application & endpoints +│ ├── templates/ +│ │ └── index.html # Main web interface +│ └── static/ +│ ├── style.css # UI styling +│ ├── app.js # Frontend JavaScript +│ └── favicon.svg # Branding icon +│ +├── pyproject.toml # Modern Python packaging config +├── setup.py # Legacy packaging (keep in sync) +├── requirements.txt # Base dependencies +├── MANIFEST.in # Package distribution includes +├── Dockerfile # Container build +├── run_webui.py # WebUI entry point +└── example.py # Usage example +``` + +## Key Module Details + +### 1. `kittentts/get_model.py` +- **KittenTTS** class: Main user-facing API + - `__init__(model_name, cache_dir)` - Downloads model from HF if needed + - `generate(text, voice, speed)` - Returns numpy array of audio + - `generate_to_file(text, output_path, ...)` - Saves to WAV file + - `available_voices` property - Lists supported voices +- **download_from_huggingface()** - Downloads config, model ONNX, and voice embeddings + +### 2. 
`kittentts/onnx_model.py` +- **KittenTTS_1_Onnx** class: Low-level ONNX inference + - Loads ONNX model and voice embeddings (NPZ format) + - Uses EspeakBackend for phonemization (language: "en-us") + - **TextCleaner** class: Maps phonemes to token IDs + - **chunk_text()**: Splits long text at sentence/word boundaries (400 char limit) + - Handles speed adjustments via voice-specific priors +- **StreamingTTS** class: Sentence-level streaming for real-time TTS + - Buffers incoming text and yields audio when complete sentences are detected + - `add_text(text)`: Add text chunk, yields audio for complete sentences + - `flush()`: Synthesize any remaining buffered text + - `reset()`: Clear buffer without generating audio + - `buffered_text` property: View current buffered text + +### 3. `kittentts/preprocess.py` +- **TextPreprocessor** class: Comprehensive text normalization + - Number-to-words conversion (integers, floats, ordinals, fractions) + - Currency expansion ($, €, £, ¥, ₹, ₩, ₿) + - Time format expansion (3:30pm → "three thirty pm") + - Unit expansion (km, kg, GB, °C, etc.) + - Scientific notation, Roman numerals, phone numbers, IP addresses + - Model name normalization (GPT-3 → "GPT 3") + - HTML/URL/email removal, contraction expansion + - Configurable pipeline via constructor flags + +### 4. 
`webui/server.py` +- FastAPI application with CORS enabled +- Endpoints: + - `GET /` - Serves HTML template + - `GET /api/models` - Returns model metadata + - `GET /api/voices` - Returns voice metadata + - `POST /api/generate` - Generates speech (returns base64 WAV) + - `GET /api/health` - Health check with loaded models + - `POST /api/stream/start` - Start a streaming TTS session + - `POST /api/stream/chunk` - Add text to streaming session, get audio for complete sentences + - `DELETE /api/stream/end/{session_id}` - End streaming session +- **Model lazy-loading**: Models loaded on first request and cached +- **Streaming sessions**: In-memory session cache for streaming TTS + +## Build and Installation + +**Development Installation:** +```bash +pip install -e . +# Or with WebUI support: +pip install -e . fastapi uvicorn python-multipart +``` + +**Building Wheel:** +```bash +python -m build +``` + +**Docker Build:** +```bash +docker build -t kittentts-webui . +docker run -d -p 7860:7860 -v ~/.cache/huggingface:/root/.cache/huggingface kittentts-webui +``` + +## Running the Application + +**Python API:** +```python +from kittentts import KittenTTS +import soundfile as sf + +model = KittenTTS("KittenML/kitten-tts-mini-0.8") +audio = model.generate("Hello world", voice="Jasper", speed=1.0) +sf.write("output.wav", audio, 24000) +``` + +**WebUI:** +```bash +python run_webui.py --host 0.0.0.0 --port 7860 +``` + +**Streaming TTS (for LLM integration):** +```python +from kittentts import KittenTTS, StreamingTTS +import soundfile as sf + +# Initialize model +model = KittenTTS("KittenML/kitten-tts-mini-0.8") + +# Create a streaming instance +streamer = model.create_streamer(voice="Jasper", speed=1.0) + +# Simulate streaming from an LLM +llm_tokens = ["Hello", " there", "! 
How", " are", " you", " today", "?"] + +for token in llm_tokens: + # add_text() yields audio chunks when complete sentences are detected + for audio_chunk in streamer.add_text(token): + sf.write("chunk.wav", audio_chunk, 24000) + # Or play immediately for real-time output + +# Don't forget to flush remaining buffered text +for audio_chunk in streamer.flush(): + sf.write("final_chunk.wav", audio_chunk, 24000) +``` + +**Streaming via Web API:** +```python +import requests +import json + +BASE_URL = "http://localhost:7860" + +# Start a streaming session +response = requests.post(f"{BASE_URL}/api/stream/start?model=kitten-tts-nano&voice=Jasper&speed=1.0") +session_id = response.json()["session_id"] + +# Stream text chunks +for token in ["Hello", " there", "! How", " are", " you", "?"]: + response = requests.post( + f"{BASE_URL}/api/stream/chunk?session_id={session_id}", + json={"text": token, "flush": False} + ) + result = response.json() + for audio_base64 in result["audio_chunks"]: + # Decode and play audio + pass + +# Flush remaining text and end session +response = requests.post( + f"{BASE_URL}/api/stream/chunk?session_id={session_id}", + json={"text": "", "flush": True} +) +requests.delete(f"{BASE_URL}/api/stream/end/{session_id}") +``` + +## Development Conventions + +**Code Style:** +- Project uses `ruff` for linting (evidenced by `.ruff_cache/`) +- Follow PEP 8 conventions +- Use type hints where appropriate (FastAPI models use Pydantic) + +**Text Processing Order:** +When modifying `preprocess.py`, maintain the processing order in `TextPreprocessor.process()`: +1. Unicode normalization +2. Content removal (HTML, URLs, emails) +3. Contraction expansion +4. IP addresses (before decimal normalization) +5. Currency/percentages/scientific notation +6. Time, ordinals, units, fractions, decades +7. Phone numbers (before ranges) +8. Ranges, model names, Roman numerals +9. Generic number replacement +10. 
Final cleanup (accents, punctuation, lowercase) + +**Voice Aliases:** +The WebUI uses friendly names (Bella, Jasper, etc.) that map to internal voice IDs (expr-voice-2-f, expr-voice-2-m, etc.). Maintain this mapping in both `webui/server.py` and model configs. + +## Testing + +**Current State:** No test suite is currently present in the repository. + +**Recommended Testing Approach:** +- Add unit tests for `TextPreprocessor` with various input cases +- Test ONNX model inference with dummy inputs +- Integration tests for HuggingFace model downloading +- WebUI API endpoint testing with `TestClient` from FastAPI + +## Deployment Considerations + +**System Requirements:** +- Python 3.12 recommended (3.8 minimum) +- `espeak-ng` system package required (installed in Dockerfile) +- HuggingFace cache directory should be persisted for faster restarts +- Models are downloaded on-demand (~80MB per model variant) + +**Security:** +- WebUI runs with CORS allow-all (`["*"]`) - configure appropriately for production +- No authentication implemented in default WebUI +- Input validation present for speed range (0.25-3.0) and empty text + +**Environment Variables:** +- `PYTHONUNBUFFERED=1` set in Docker +- HF cache location follows HuggingFace hub defaults (`~/.cache/huggingface`) + +## Common Tasks + +**Adding a New Voice:** +1. Add voice embeddings to model's voices.npz on HuggingFace +2. Update `available_voices` in `onnx_model.py` +3. Add voice alias mapping in `webui/server.py` +4. Update voice metadata in `VOICES` list in `server.py` + +**Adding a New Model:** +1. Upload ONNX model and config to HuggingFace +2. Add entry to `MODELS` dict in `webui/server.py` +3. Add metadata to `MODEL_INFO` list +4. Ensure config.json has correct `type`, `model_file`, `voices` keys + +**Modifying Text Preprocessing:** +1. Add new regex pattern near other `_RE_*` definitions +2. Create expansion function with docstring and examples +3. Add config flag to `TextPreprocessor.__init__` +4. 
Insert call in `process()` method at appropriate position +5. Add test case in `if __name__ == "__main__"` block + +## License + +Apache License 2.0 - See LICENSE file for details. diff --git a/AUDIO_QUALITY.md b/AUDIO_QUALITY.md new file mode 100644 index 0000000..d8e1bd1 --- /dev/null +++ b/AUDIO_QUALITY.md @@ -0,0 +1,202 @@ +# KittenTTS Audio Quality Optimization Guide + +This guide explains how to get the best audio quality from KittenTTS. + +## 🎯 Model Selection (Most Important!) + +**The #1 factor for audio quality is model precision, not model size.** + +### Model Precision Comparison + +| Model | Params | Size | Precision | Quality | Use Case | +|-------|--------|------|-----------|---------|----------| +| **Nano (FP32)** | 15M | 56MB | 32-bit float | ⭐⭐⭐⭐⭐ **Best** | Recommended for best quality | +| Mini (INT8) | 80M | 80MB | 8-bit int | ⭐⭐⭐⭐ Good | Long-form content | +| Micro (INT8) | 40M | 41MB | 8-bit int | ⭐⭐⭐ Good | Balanced | +| Nano (INT8) | 15M | 19MB | 8-bit int | ⭐⭐ Basic | Resource-constrained | + +### Why FP32 Sounds Better Than Larger INT8 Models + +The neural network generates continuous audio waveforms. Precision matters: + +- **FP32 (32-bit float)**: Smooth, continuous curves → natural speech +- **INT8 (8-bit integer)**: Stepped approximations → subtle artifacts + +A smaller FP32 model (15M params, 56MB) produces smoother audio than a larger INT8 model (80M params, 80MB) because: +1. **No quantization artifacts** - Full precision preserves subtle prosody +2. **Smoother waveforms** - No stepped approximations in output +3. 
**Better pitch/rhythm** - Floating point preserves continuous variations + +**Recommendation: Always use `kitten-tts-nano` (FP32) for best quality.** + +## 🔧 Environment Setup + +### Required Dependencies + +For optimal audio quality, ensure all dependencies are properly installed: + +```bash +# Install all dependencies +pip install -r requirements.txt +python -m spacy download en_core_web_sm +``` + +### Critical Components + +| Component | Purpose | Quality Impact | +|-----------|---------|----------------| +| **ONNX Runtime** | Model inference engine | High - affects synthesis speed & stability | +| **Phonemizer** | Text → phoneme conversion | Critical - wrong phonemes = gibberish speech | +| **Espeak-ng** | Backend for phonemizer | Critical - version must match training environment | +| **NumPy < 2.0** | Array operations | Medium - version 2.x may have precision issues | + +## 🎙️ Best Practices + +### 1. Voice Selection + +Different voices work better for different content: + +| Voice | Gender | Best For | Notes | +|-------|--------|----------|-------| +| **Jasper** | Male | General use, clarity | Most consistent across environments | +| **Bella** | Female | Warm, friendly content | Good for conversational text | +| **Luna** | Female | Soft, melodic speech | Best for poetry/artistic content | +| **Bruno** | Male | Deep, authoritative | Good for announcements | + +### 2. Speed Settings + +- **1.0x (default)**: Most natural speech +- **0.8-0.9x**: More deliberate, clearer pronunciation +- **1.1-1.2x**: Faster but still natural +- **>1.5x**: May become distorted + +### 3. Text Preparation + +```python +# Good: Punctuation helps with prosody +text = "Hello! How are you today? I hope you're doing well." + +# Avoid: Missing punctuation +text = "hello how are you today i hope youre doing well" + +# Good: Numbers written out or let preprocessor handle them +text = "I have 3 cats and 2 dogs." 
# Auto-converted to "three" and "two" + +# Good: End sentences with punctuation +# This helps the model know when to pause +``` + +### 4. Audio Output Settings + +When saving files, use appropriate bit depth: + +```python +# Standard quality (recommended) +model.generate_to_file(text, "output.wav", subtype='PCM_16') + +# Higher quality (larger file) +model.generate_to_file(text, "output.wav", subtype='PCM_24') +``` + +## 🔍 Troubleshooting + +### Audio Sounds Robotic/Muffled + +**Cause**: Wrong phonemization (espeak version mismatch) + +**Fix**: +```bash +# Check espeak version +espeak-ng --version # Should be 1.51 or later + +# Reinstall phonemizer dependencies +pip install --force-reinstall phonemizer espeakng-loader +``` + +### Audio Has Static/Noise + +**Cause**: Clipping or float precision issues + +**Fix**: +- Lower the speed slightly (try 0.95x) +- Check NumPy version: `pip install "numpy<2.0"` + +### Generation Is Slow + +**Cause**: ONNX Runtime not using optimal settings + +**Fix**: The model now auto-configures ONNX Runtime. If still slow: +```python +# Check available providers +import onnxruntime as ort +print(ort.get_available_providers()) + +# Should show ['CPUExecutionProvider'] at minimum +``` + +### Voice Sounds Wrong (Wrong Pitch/Gender) + +**Cause**: Voice embeddings not loading correctly + +**Fix**: +1. Delete cached model: `~/.cache/huggingface/hub/KittenML_*` +2. Re-download: The model will re-download on next use + +## 📊 Testing Your Setup + +Run the diagnostic script: + +```bash +# Check environment +python check_environment.py + +# Test audio generation +python test_tts.py + +# With speed benchmark +python test_tts.py --benchmark +``` + +## 🏗️ How It Works + +``` +Text Input + ↓ +Text Preprocessor (numbers → words, contractions, etc.) + ↓ +Phonemizer (espeak-ng) → IPA phonemes + ↓ +Tokenization → Integer IDs + ↓ +ONNX Model Inference + ↓ +Audio Trimming & Normalization + ↓ +24kHz Mono WAV Output +``` + +Each step can affect quality. 
The most critical is **phonemization** - if espeak produces different phonemes than expected, the neural network receives unfamiliar input. + +## 🐛 Known Issues + +1. **NumPy 2.0+**: May cause subtle audio differences. Stick to 1.24-1.26 for best results. + +2. **Windows Espeak**: Sometimes requires manual PATH configuration. The `espeakng-loader` package helps but may need: + ```python + import espeakng_loader + espeakng_loader.load_espeakng_library() + ``` + +3. **Long Text**: Automatically chunked at 400 characters. Very long sentences may have slight discontinuities at chunk boundaries. + +## 📈 Performance Metrics + +On a modern CPU, you should expect: + +| Model | Size | RTF (Real-Time Factor) | Quality | +|-------|------|----------------------|---------| +| Nano (FP32) | 15M | 5-10x | Best | +| Micro (INT8) | 40M | 3-5x | Good | +| Mini (INT8) | 80M | 1-2x | Good | + +RTF > 1 means faster than real-time (good for streaming). Note that quality follows precision, not parameter count — see the Model Precision Comparison table above. diff --git a/Dockerfile b/Dockerfile new file mode 100644 index 0000000..81e90b0 --- /dev/null +++ b/Dockerfile @@ -0,0 +1,23 @@ +FROM python:3.12-slim + +WORKDIR /app + +RUN apt-get update && apt-get install -y --no-install-recommends \ + espeak-ng \ + && rm -rf /var/lib/apt/lists/* + +COPY requirements.txt . +RUN pip install --no-cache-dir -r requirements.txt + +RUN pip install --no-cache-dir \ + "fastapi>=0.104.0" \ + "uvicorn>=0.24.0" \ + "python-multipart>=0.0.6" + +COPY . .
+ +EXPOSE 7860 + +ENV PYTHONUNBUFFERED=1 + +CMD ["python", "run_webui.py"] diff --git a/KITTENTTS_API_REFERENCE.md b/KITTENTTS_API_REFERENCE.md new file mode 100644 index 0000000..eada78a --- /dev/null +++ b/KITTENTTS_API_REFERENCE.md @@ -0,0 +1,649 @@ +# KittenTTS — API Reference + +> **Service**: KittenTTS Ultra-Lightweight Text-to-Speech +> **Base URL**: `http://localhost:7860` +> **Protocol**: REST (JSON) +> **CORS**: Enabled for all origins +> **Model**: KittenTTS — 15M to 80M param neural TTS, 24kHz sample rate + +--- + +## Quick Start + +```javascript +// Simplest usage — generate audio from text +const response = await fetch('http://localhost:7860/api/generate', { + method: 'POST', + headers: { 'Content-Type': 'application/json' }, + body: JSON.stringify({ + text: 'Hello world!', + voice: 'Jasper', + speed: 1.0, + model: 'kitten-tts-nano' + }) +}); +const result = await response.json(); +const audioBlob = await fetch(`data:audio/wav;base64,${result.audio_base64}`).then(r => r.blob()); +const audioUrl = URL.createObjectURL(audioBlob); +``` + +```python +# Python equivalent +import requests +import base64 +import io +import soundfile as sf + +response = requests.post('http://localhost:7860/api/generate', json={ + 'text': 'Hello world!', + 'voice': 'Jasper', + 'speed': 1.0, + 'model': 'kitten-tts-nano' +}) +result = response.json() + +# Decode and save audio +audio_bytes = base64.b64decode(result['audio_base64']) +audio, sr = sf.read(io.BytesIO(audio_bytes)) +sf.write('output.wav', audio, sr) +``` + +--- + +## Available Models + +| Model ID | Name | Params | Size | Precision | Quality | Description | +|----------|------|--------|------|-----------|---------|-------------| +| `kitten-tts-nano` ⭐ | Nano (FP32) | 15M | 56MB | FP32 | **Best** | Full 32-bit precision, highest quality | +| `kitten-tts-mini` | Mini (INT8) | 80M | 80MB | INT8 | Good | Largest model, quantized | +| `kitten-tts-micro` | Micro (INT8) | 40M | 41MB | INT8 | Good | Balanced size/performance 
| +| `kitten-tts-nano-int8` | Nano (INT8) | 15M | 19MB | INT8 | Basic | Smallest footprint | + +> **💡 Quality Tip:** The FP32 nano model (56MB) produces the best audio quality. Use `kitten-tts-nano` for optimal results. + +--- + +## Available Voices + +KittenTTS includes 8 expressive voices — 4 male and 4 female: + +| Voice ID | Name | Gender | Description | +|----------|------|--------|-------------| +| `Bella` | Bella | Female | Warm & gentle | +| `Jasper` | Jasper | Male | Clear & professional | +| `Luna` | Luna | Female | Soft & melodic | +| `Bruno` | Bruno | Male | Deep & resonant | +| `Rosie` | Rosie | Female | Bright & cheerful | +| `Hugo` | Hugo | Male | Confident & steady | +| `Kiki` | Kiki | Female | Playful & energetic | +| `Leo` | Leo | Male | Friendly & warm | + +### Recommended Voices + +| Use Case | Recommended Voice | Notes | +|----------|-------------------|-------| +| **Professional/Narration** | `Jasper` | Clear, professional tone | +| **Warm/Conversational** | `Bella` | Gentle, welcoming | +| **Energetic/Cheerful** | `Kiki` | Playful, upbeat | +| **Deep/Authoritative** | `Bruno` | Resonant, commanding | + +--- + +## Endpoints + +### `GET /api/health` + +Health check and system info. + +**Response:** +```json +{ + "status": "healthy", + "loaded_models": ["kitten-tts-nano"], + "cache_dir": "/home/user/.cache/kittentts", + "cache_size_mb": 56.2 +} +``` + +--- + +### `GET /api/models` + +List available models. + +**Response:** +```json +{ + "models": [ + { + "id": "kitten-tts-nano", + "name": "Nano (FP32)", + "params": "15M", + "size": "56MB", + "description": "⭐ Best quality - Full 32-bit precision", + "quality": "best", + "precision": "FP32" + } + ] +} +``` + +--- + +### `GET /api/voices` + +List available voices. 
+ +**Response:** +```json +{ + "voices": [ + { + "id": "Bella", + "name": "Bella", + "gender": "female", + "description": "Warm & gentle" + }, + { + "id": "Jasper", + "name": "Jasper", + "gender": "male", + "description": "Clear & professional" + } + ] +} +``` + +--- + +### `GET /api/stats` + +Get detailed generation statistics. + +**Response:** +```json +{ + "generation_stats": { + "total_requests": 42, + "avg_generation_time": 0.156, + "avg_rtf": 12.5, + "total_audio_generated": 125.4, + "recent_requests": [] + }, + "system": { + "cache_directory": "/home/user/.cache/kittentts", + "cache_size_mb": 56.2, + "loaded_models": ["kitten-tts-nano"], + "model_load_times": {"kitten-tts-nano": 2.34}, + "python_version": "3.12.0", + "memory_usage_mb": 245.6 + }, + "available_models": ["kitten-tts-nano", "kitten-tts-mini", "kitten-tts-micro", "kitten-tts-nano-int8"], + "available_voices": ["Bella", "Jasper", "Luna", "Bruno", "Rosie", "Hugo", "Kiki", "Leo"] +} +``` + +--- + +### `POST /api/generate` + +Synthesize speech from text. Returns JSON with base64-encoded WAV audio. + +**Request Body:** + +| Field | Type | Default | Description | +|-------|------|---------|-------------| +| `text` | string | *required* | Text to speak | +| `voice` | string | `Bella` | Voice ID | +| `speed` | float | `1.0` | Speech rate (0.25–3.0) | +| `model` | string | `kitten-tts-nano` | Model ID | + +**Response:** +```json +{ + "audio_base64": "UklGRi4AAABXQVZFZm10...", + "sample_rate": 24000, + "duration": 2.45, + "debug_info": { + "model_load_time": 2.34, + "generation_time": 0.156, + "total_time": 2.496, + "real_time_factor": 15.7, + "audio_samples": 58800, + "sample_rate": 24000 + } +} +``` + +**Example:** +```javascript +const res = await fetch('http://localhost:7860/api/generate', { + method: 'POST', + headers: { 'Content-Type': 'application/json' }, + body: JSON.stringify({ + text: 'Hello world! 
This is a test.', + voice: 'Jasper', + speed: 1.0, + model: 'kitten-tts-nano' + }) +}); +const result = await res.json(); +console.log(`Generated ${result.duration}s of audio`); +``` + +--- + +## Streaming API (for LLM Integration) + +The streaming API enables real-time text-to-speech for conversational AI applications. Audio generation starts as soon as complete sentences are detected. + +### `POST /api/stream/start` + +Start a new streaming TTS session. + +**Query Parameters:** + +| Param | Type | Default | Description | +|-------|------|---------|-------------| +| `model` | string | `kitten-tts-nano` | Model ID | +| `voice` | string | `Bella` | Voice ID | +| `speed` | float | `1.0` | Speech rate (0.25–3.0) | + +**Response:** +```json +{ + "session_id": "550e8400-e29b-41d4-a716-446655440000", + "status": "created" +} +``` + +**Example:** +```javascript +const res = await fetch('http://localhost:7860/api/stream/start?voice=Jasper&speed=1.0', { + method: 'POST' +}); +const { session_id } = await res.json(); +``` + +--- + +### `POST /api/stream/chunk` + +Add text to a streaming session and receive audio for complete sentences. + +**Query Parameters:** + +| Param | Type | Required | Description | +|-------|------|----------|-------------| +| `session_id` | string | *required* | Session ID from `/api/stream/start` | + +**Request Body:** + +| Field | Type | Default | Description | +|-------|------|---------|-------------| +| `text` | string | `""` | Text chunk to add | +| `flush` | boolean | `false` | Set `true` on final chunk to flush remaining text | + +**Response:** +```json +{ + "audio_chunks": [ + "UklGRi4AAABXQVZFZm10...", + "UklGRi4AAABXQVZFZm10..." 
+ ], + "sample_rate": 24000, + "buffered_text": " remaining text", + "status": "streaming" +} +``` + +**Status Values:** +- `streaming` — Session is active, more chunks expected +- `flushed` — Session was flushed, no more buffered text + +**Example:** +```javascript +// Stream text from an LLM +const tokens = ["Hello", " there", "! How", " are", " you", " today", "?"]; + +for (const token of tokens) { + const res = await fetch(`http://localhost:7860/api/stream/chunk?session_id=${sessionId}`, { + method: 'POST', + headers: { 'Content-Type': 'application/json' }, + body: JSON.stringify({ text: token, flush: false }) + }); + const result = await res.json(); + + // Play each audio chunk immediately + for (const audioBase64 of result.audio_chunks) { + const audioBlob = await fetch(`data:audio/wav;base64,${audioBase64}`).then(r => r.blob()); + playAudio(audioBlob); + } +} + +// Flush remaining text +const finalRes = await fetch(`http://localhost:7860/api/stream/chunk?session_id=${sessionId}`, { + method: 'POST', + headers: { 'Content-Type': 'application/json' }, + body: JSON.stringify({ text: "", flush: true }) +}); +``` + +--- + +### `DELETE /api/stream/end/{session_id}` + +End a streaming session and release resources. 
+ +**Path Parameters:** + +| Param | Type | Description | +|-------|------|-------------| +| `session_id` | string | Session ID to terminate | + +**Response:** +```json +{ + "status": "ended", + "session_id": "550e8400-e29b-41d4-a716-446655440000" +} +``` + +**Example:** +```javascript +await fetch(`http://localhost:7860/api/stream/end/${sessionId}`, { + method: 'DELETE' +}); +``` + +--- + +## Complete Streaming Example + +### JavaScript (Browser) + +```javascript +class KittenTTSStreamer { + constructor(baseUrl = 'http://localhost:7860') { + this.baseUrl = baseUrl; + this.sessionId = null; + } + + async start(voice = 'Jasper', speed = 1.0, model = 'kitten-tts-nano') { + const res = await fetch( + `${this.baseUrl}/api/stream/start?voice=${voice}&speed=${speed}&model=${model}`, + { method: 'POST' } + ); + const data = await res.json(); + this.sessionId = data.session_id; + return this.sessionId; + } + + async addText(text, flush = false) { + if (!this.sessionId) throw new Error('Session not started'); + + const res = await fetch( + `${this.baseUrl}/api/stream/chunk?session_id=${this.sessionId}`, + { + method: 'POST', + headers: { 'Content-Type': 'application/json' }, + body: JSON.stringify({ text, flush }) + } + ); + return await res.json(); + } + + async end() { + if (!this.sessionId) return; + await fetch(`${this.baseUrl}/api/stream/end/${this.sessionId}`, { + method: 'DELETE' + }); + this.sessionId = null; + } +} + +// Usage with Web Audio API for immediate playback +const audioCtx = new AudioContext(); +let nextStartTime = 0; + +async function playChunk(audioBase64) { + const response = await fetch(`data:audio/wav;base64,${audioBase64}`); + const arrayBuffer = await response.arrayBuffer(); + const audioBuffer = await audioCtx.decodeAudioData(arrayBuffer); + + const source = audioCtx.createBufferSource(); + source.buffer = audioBuffer; + source.connect(audioCtx.destination); + + const startTime = Math.max(audioCtx.currentTime, nextStartTime); + 
source.start(startTime); + nextStartTime = startTime + audioBuffer.duration; +} + +// Main streaming loop +async function streamFromLLM(llmStream) { + const streamer = new KittenTTSStreamer(); + await streamer.start('Jasper'); + + for await (const token of llmStream) { + const result = await streamer.addText(token); + for (const chunk of result.audio_chunks) { + await playChunk(chunk); + } + } + + // Flush remaining text + const final = await streamer.addText('', true); + for (const chunk of final.audio_chunks) { + await playChunk(chunk); + } + + await streamer.end(); +} +``` + +### Python + +```python +import requests +import base64 +import io +import soundfile as sf +import sounddevice as sd + +class KittenTTSStreamer: + def __init__(self, base_url='http://localhost:7860'): + self.base_url = base_url + self.session_id = None + + def start(self, voice='Jasper', speed=1.0, model='kitten-tts-nano'): + res = requests.post( + f'{self.base_url}/api/stream/start', + params={'voice': voice, 'speed': speed, 'model': model} + ) + self.session_id = res.json()['session_id'] + return self.session_id + + def add_text(self, text, flush=False): + if not self.session_id: + raise RuntimeError('Session not started') + + res = requests.post( + f'{self.base_url}/api/stream/chunk', + params={'session_id': self.session_id}, + json={'text': text, 'flush': flush} + ) + return res.json() + + def end(self): + if self.session_id: + requests.delete(f'{self.base_url}/api/stream/end/{self.session_id}') + self.session_id = None + +def play_audio_chunk(audio_base64, sample_rate=24000): + """Play audio chunk immediately using sounddevice.""" + audio_bytes = base64.b64decode(audio_base64) + audio, sr = sf.read(io.BytesIO(audio_bytes)) + sd.play(audio, sr) + sd.wait() + +# Usage example +def stream_from_llm(llm_generator): + streamer = KittenTTSStreamer() + streamer.start(voice='Jasper', speed=1.0) + + try: + for token in llm_generator: + result = streamer.add_text(token) + for chunk in 
result['audio_chunks']: + play_audio_chunk(chunk) + + # Flush remaining text + final = streamer.add_text('', flush=True) + for chunk in final['audio_chunks']: + play_audio_chunk(chunk) + finally: + streamer.end() + +# Simulate LLM stream +def mock_llm_stream(): + tokens = ["Hello", " there", "! How", " are", " you", " today", "?"] + for token in tokens: + yield token + +if __name__ == '__main__': + stream_from_llm(mock_llm_stream()) +``` + +--- + +## Python Library API + +For direct Python usage without the web server: + +### Basic Usage + +```python +from kittentts import KittenTTS +import soundfile as sf + +# Initialize model (downloads from HuggingFace if needed) +model = KittenTTS("KittenML/kitten-tts-nano-0.8-fp32") + +# Generate audio +audio = model.generate( + text="Hello world! This is a test.", + voice="Jasper", + speed=1.0 +) + +# Save to file +sf.write('output.wav', audio, 24000) + +# Or use the convenience method +model.generate_to_file( + text="Hello world!", + output_path='output.wav', + voice="Jasper", + speed=1.0 +) + +# List available voices +print(model.available_voices) +# ['expr-voice-2-m', 'expr-voice-2-f', 'expr-voice-3-m', ...] +``` + +### Streaming API + +```python +from kittentts import KittenTTS +import soundfile as sf + +model = KittenTTS("KittenML/kitten-tts-nano-0.8-fp32") + +# Create a streaming instance +streamer = model.create_streamer(voice="Jasper", speed=1.0) + +# Simulate LLM stream +llm_tokens = ["Hello", " there", "! 
How", " are", " you", " today", "?"] + +for token in llm_tokens: + # add_text() yields audio chunks when complete sentences are detected + for audio_chunk in streamer.add_text(token): + sf.write("chunk.wav", audio_chunk, 24000) + # Or play immediately for real-time output + +# Flush remaining buffered text +for audio_chunk in streamer.flush(): + sf.write("final_chunk.wav", audio_chunk, 24000) + +# Check buffered text at any time +print(streamer.buffered_text) # Shows text waiting for sentence completion + +# Reset buffer without generating (optional) +streamer.reset() +``` + +--- + +## Error Handling + +All endpoints return appropriate HTTP status codes: + +| Status | Description | +|--------|-------------| +| `200` | Success | +| `400` | Bad request (invalid parameters, empty text, speed out of range) | +| `404` | Not found (invalid session ID) | +| `500` | Server error (model loading failed, inference error) | + +**Error Response Format:** +```json +{ + "detail": "Error description here" +} +``` + +**Example Error Handling:** +```javascript +const res = await fetch('http://localhost:7860/api/generate', { + method: 'POST', + headers: { 'Content-Type': 'application/json' }, + body: JSON.stringify({ text: '', voice: 'Jasper' }) +}); + +if (!res.ok) { + const error = await res.json(); + console.error(`Error ${res.status}: ${error.detail}`); +} +``` + +--- + +## Performance Notes + +- **Real-Time Factor (RTF)**: Typically 10-20x real-time on modern CPUs +- **First Request Latency**: ~2-3 seconds (model loading) +- **Subsequent Requests**: ~100-300ms for typical sentences +- **Streaming Latency**: Audio available within ~100-300ms of sentence completion +- **Memory Usage**: ~200-300MB depending on model + +--- + +## Docker Deployment + +```bash +# Build image +docker build -t kittentts-webui . 
+ +# Run container +docker run -d -p 7860:7860 -v ~/.cache/huggingface:/root/.cache/huggingface kittentts-webui + +# Access at http://localhost:7860 +``` + +--- + +## License + +Apache License 2.0 diff --git a/README.md b/README.md index a3e3955..2449c30 100644 --- a/README.md +++ b/README.md @@ -28,14 +28,16 @@ Email the creators with any questions : info@stellonlabs.com ## Models -| Model | Params | Size | Link | -|-------|--------|------|------| -| kitten-tts-mini | 80M | 80MB | 🤗 [KittenML/kitten-tts-mini-0.8](https://huggingface.co/KittenML/kitten-tts-mini-0.8) | -| kitten-tts-micro | 40M | 41MB | 🤗 [KittenML/kitten-tts-micro-0.8](https://huggingface.co/KittenML/kitten-tts-micro-0.8) | -| kitten-tts-nano | 15M | 56MB | 🤗 [KittenML/kitten-tts-nano-0.8](https://huggingface.co/KittenML/kitten-tts-nano-0.8-fp32) | -| kitten-tts-nano-int8 quantized | 15M | 19MB | 🤗 [KittenML/kitten-tts-nano-0.8-int8](https://huggingface.co/KittenML/kitten-tts-nano-0.8-int8) | +| Model | Params | Size | Precision | Quality | Link | +|-------|--------|------|-----------|---------|------| +| **kitten-tts-nano** ⭐ | 15M | 56MB | FP32 | **Best** | 🤗 [KittenML/kitten-tts-nano-0.8-fp32](https://huggingface.co/KittenML/kitten-tts-nano-0.8-fp32) | +| kitten-tts-mini | 80M | 80MB | INT8 | Good | 🤗 [KittenML/kitten-tts-mini-0.8](https://huggingface.co/KittenML/kitten-tts-mini-0.8) | +| kitten-tts-micro | 40M | 41MB | INT8 | Good | 🤗 [KittenML/kitten-tts-micro-0.8](https://huggingface.co/KittenML/kitten-tts-micro-0.8) | +| kitten-tts-nano-int8 | 15M | 19MB | INT8 | Basic | 🤗 [KittenML/kitten-tts-nano-0.8-int8](https://huggingface.co/KittenML/kitten-tts-nano-0.8-int8) | -> Some users are facing minor issues with the kitten-tts-nano-int8 model. We are looking into it. Please report to us if you face any issues. +> **💡 Quality Tip:** The FP32 nano model (56MB) produces the best audio quality because it uses full 32-bit floating point precision. 
Larger models (mini, micro) use INT8 quantization which can introduce subtle artifacts. **For best results, use `kitten-tts-nano` (FP32).** + +> Some users are facing minor issues with the kitten-tts-nano-int8 model. We are looking into it. Please report to us if you face any issues. ## Demo Video @@ -58,7 +60,9 @@ pip install https://github.com/KittenML/KittenTTS/releases/download/0.8/kittentt ``` from kittentts import KittenTTS -m = KittenTTS("KittenML/kitten-tts-mini-0.8") + +# Use FP32 model for best quality (recommended) +m = KittenTTS("KittenML/kitten-tts-nano-0.8-fp32") audio = m.generate("This high quality TTS model works without a GPU", voice='Jasper' ) @@ -70,20 +74,197 @@ sf.write('output.wav', audio, 24000) ``` +## Streaming TTS (for LLM Integration) + +KittenTTS supports sentence-level streaming, ideal for real-time conversational AI applications. Audio generation starts as soon as complete sentences are detected from streaming text. + +### Python API + +```python +from kittentts import KittenTTS +import soundfile as sf + +# Initialize model +model = KittenTTS("KittenML/kitten-tts-nano-0.8-fp32") + +# Create a streaming instance +streamer = model.create_streamer(voice="Jasper", speed=1.0) + +# Simulate streaming from an LLM +llm_tokens = ["Hello", " there", "! 
How", " are", " you", " today", "?"] + +for token in llm_tokens: + # add_text() yields audio chunks when complete sentences are detected + for audio_chunk in streamer.add_text(token): + sf.write("chunk.wav", audio_chunk, 24000) + # Or play immediately for real-time output + +# Don't forget to flush remaining buffered text +for audio_chunk in streamer.flush(): + sf.write("final_chunk.wav", audio_chunk, 24000) +``` + +### Web API + +For remote applications, use the streaming endpoints: + +```python +import requests +import base64 +import soundfile as sf +import io + +BASE_URL = "http://localhost:7860" + +# Start a streaming session +response = requests.post( + f"{BASE_URL}/api/stream/start", + params={"model": "kitten-tts-nano", "voice": "Jasper", "speed": 1.0} +) +session_id = response.json()["session_id"] + +# Stream text chunks (e.g., from an LLM) +for token in ["Hello", " there", "! How", " are", " you", "?"]: + response = requests.post( + f"{BASE_URL}/api/stream/chunk?session_id={session_id}", + json={"text": token, "flush": False} + ) + result = response.json() + + # Process audio chunks for complete sentences + for audio_base64 in result["audio_chunks"]: + audio_bytes = base64.b64decode(audio_base64) + audio, sr = sf.read(io.BytesIO(audio_bytes)) + # Play or save audio + +# Flush remaining text and end session +response = requests.post( + f"{BASE_URL}/api/stream/chunk?session_id={session_id}", + json={"text": "", "flush": True} +) +# Process final audio chunks... + +# Clean up +requests.delete(f"{BASE_URL}/api/stream/end/{session_id}") +``` + +### Streaming API Endpoints + +| Endpoint | Method | Description | +|----------|--------|-------------| +| `/api/stream/start` | POST | Start a new streaming session | +| `/api/stream/chunk?session_id={id}` | POST | Add text chunk, get audio for complete sentences | +| `/api/stream/end/{session_id}` | DELETE | End session and release resources | + ## System Requirements -Works literally everywhere. Needs python3.12. 
We recommend using conda. +Works literally everywhere. Needs python3.8+. We recommend using python3.12 with conda. + +### Audio Quality Note + +The model performance may vary based on your environment (OS, espeak-ng version, ONNX Runtime provider). For best results: + +```bash +# Check your environment +python check_environment.py + +# Test audio generation +python test_tts.py +``` + +See [AUDIO_QUALITY.md](AUDIO_QUALITY.md) for detailed optimization guide. + + + +## WebUI + +KittenTTS includes a cute kitten-themed web interface for easy text-to-speech generation. +### Quick Start with Conda (Recommended) + +```bash +# Create and activate a conda environment +conda create -n kittentts python=3.12 -y +conda activate kittentts + +# Install KittenTTS +pip install https://github.com/KittenML/KittenTTS/releases/download/0.8/kittentts-0.8.0-py3-none-any.whl + +# Install additional WebUI dependencies +pip install fastapi uvicorn python-multipart + +# Run the WebUI +python run_webui.py +``` + +### Quick Start with pip + +```bash +# Install additional dependencies +pip install fastapi uvicorn python-multipart + +# Run the WebUI +python run_webui.py +``` + +Open your browser and navigate to `http://localhost:7860` + +### Features + +- **4 Models**: Choose from Mini, Micro, Nano, and Nano INT8 variants +- **8 Voices**: Select from Bella, Jasper, Luna, Bruno, Rosie, Hugo, Kiki, and Leo +- **Speed Control**: Adjust speech speed from 0.5x to 2.0x +- **Dark/Light Mode**: Toggle between themes with automatic system detection +- **Audio Download**: Save generated audio as WAV files + +### Command Line Options + +```bash +python run_webui.py --host 0.0.0.0 --port 7860 +``` + +## Docker Usage + +Run KittenTTS WebUI in a containerized environment. + +### Build the Image + +```bash +docker build -t kittentts-webui . 
+
+```
+
+### Run the Container
+
+```bash
+docker run -d -p 7860:7860 -v ~/.cache/huggingface:/root/.cache/huggingface kittentts-webui
+```
+
+The `-v` flag mounts the Hugging Face cache directory to persist downloaded models between container restarts.
+
+### Access the WebUI
+
+Open `http://localhost:7860` in your browser.
+
+### Stop the Container
+
+```bash
+# List running containers to find the container ID
+docker ps
+
+# Stop the container
+docker stop <container-id>
+```
+Or if you ran without `-d` (detached mode), press `Ctrl+C` in the terminal to stop.

## Checklist

- [x] Release a preview model
- [ ] Release the fully trained model weights
- [ ] Release mobile SDK
-- [ ] Release web version
+- [ ] Release web version
diff --git a/check_environment.py b/check_environment.py
new file mode 100644
index 0000000..49a41b1
--- /dev/null
+++ b/check_environment.py
@@ -0,0 +1,173 @@
+#!/usr/bin/env python3
+"""
+KittenTTS Environment Check Script
+Run this to verify your environment is properly configured for best audio quality.
+""" + +import sys +import subprocess + +def check_import(module_name, package_name=None): + """Check if a module can be imported.""" + package_name = package_name or module_name + try: + __import__(module_name) + print(f" [OK] {package_name}") + return True + except ImportError: + print(f" [MISSING] {package_name} (pip install {package_name})") + return False + +def check_version(module_name, attr='__version__'): + """Get version of a module.""" + try: + mod = __import__(module_name) + version = getattr(mod, attr, 'unknown') + if callable(version): + version = version() + return version + except: + return 'unknown' + +def main(): + print("=" * 60) + print("KittenTTS Environment Check") + print("=" * 60) + + # Check if in conda environment + if sys.prefix == sys.base_prefix: + print("\n[NOTE] Not running in a virtual environment.") + print(" For best results, use a conda environment:") + print(" conda activate kittentts") + else: + print(f"\n[OK] Running in virtual environment: {sys.prefix}") + + all_ok = True + + # Check Python version + print(f"\nPython Version: {sys.version}") + if sys.version_info < (3, 8): + print(" [WARN] Python 3.8+ recommended") + all_ok = False + else: + print(" [OK] Python version") + + # Check core dependencies + print("\nCore Dependencies:") + deps = [ + ('numpy', 'numpy'), + ('onnxruntime', 'onnxruntime'), + ('soundfile', 'soundfile'), + ('phonemizer', 'phonemizer'), + ('spacy', 'spacy'), + ('num2words', 'num2words'), + ('huggingface_hub', 'huggingface-hub'), + ] + + for module, package in deps: + if not check_import(module, package): + all_ok = False + + # Check specific versions + print("\nVersions:") + try: + import numpy as np + print(f" NumPy: {np.__version__}") + if int(np.__version__.split('.')[0]) >= 2: + print(" [WARN] NumPy 2.x detected - use numpy<2.0 for best compatibility") + except: + pass + + try: + import onnxruntime as ort + print(f" ONNX Runtime: {ort.__version__}") + providers = ort.get_available_providers() + 
print(f" Providers: {providers}") + if 'CPUExecutionProvider' not in providers: + print(" [WARN] CPUExecutionProvider not available") + except: + pass + + # Check espeak + print("\n[Espeak Phonemizer Backend]") + try: + # Try to load espeak-ng library if available (needed on Windows) + try: + import espeakng_loader + espeakng_loader.load_library() + import os + if 'ESPEAK_DATA_PATH' not in os.environ: + os.environ['ESPEAK_DATA_PATH'] = str(espeakng_loader.get_data_path()) + # Tell phonemizer where to find the espeak library + from phonemizer.backend.espeak.base import BaseEspeakBackend + BaseEspeakBackend.set_library(str(espeakng_loader.get_library_path())) + except: + pass + + from phonemizer.backend import BACKENDS + EspeakBackend = BACKENDS.get('espeak') or BACKENDS.get('espeak-ng') + if EspeakBackend is None: + raise RuntimeError("No espeak backend available") + + # Try to create a backend + backend = EspeakBackend('en-us') + print(f" [OK] Espeak backend working (language: {backend.language})") + + # Test phonemization + test = backend.phonemize(["hello world"]) + # Encoding-safe print (Windows console may not support IPA chars) + try: + print(f" [OK] Phonemization test: {test}") + except UnicodeEncodeError: + print(f" [OK] Phonemization working (output contains IPA characters)") + except Exception as e: + print(f" [ERROR] Espeak backend: {e}") + print(" Fix: pip install espeakng-loader phonemizer") + all_ok = False + + # Check spacy model + print("\nSpacy English Model:") + try: + import spacy + try: + nlp = spacy.load('en_core_web_sm') + print(" [OK] en_core_web_sm loaded") + except: + print(" [MISSING] en_core_web_sm not found") + print(" Run: python -m spacy download en_core_web_sm") + all_ok = False + except: + pass + + # ONNX Runtime optimization check + print("\nONNX Runtime Configuration:") + try: + import onnxruntime as ort + sess_options = ort.SessionOptions() + print(f" Default threads: {sess_options.intra_op_num_threads}") + print(f" Graph 
optimization: {sess_options.graph_optimization_level}") + + # Check if we can load a test session + print(" [OK] ONNX Runtime session options accessible") + except Exception as e: + print(f" [ERROR] ONNX: {e}") + + # Summary + print("\n" + "=" * 60) + if all_ok: + print("[SUCCESS] Environment looks good! KittenTTS should work well.") + print("\nTips for best audio quality:") + print(" 1. Use speed=1.0 for most natural speech") + print(" 2. Keep sentences under 400 characters") + print(" 3. End sentences with punctuation for better prosody") + print(" 4. Use 'Jasper' or 'Bella' for clearest speech") + else: + print("[ERROR] Some issues found. Please install missing dependencies:") + print(" pip install -r requirements.txt") + print(" python -m spacy download en_core_web_sm") + print("=" * 60) + + return 0 if all_ok else 1 + +if __name__ == "__main__": + sys.exit(main()) diff --git a/kittentts/__index__.py b/kittentts/__index__.py deleted file mode 100644 index e178a13..0000000 --- a/kittentts/__index__.py +++ /dev/null @@ -1,3 +0,0 @@ -from kittentts.get_model import get_model - - \ No newline at end of file diff --git a/kittentts/__init__.py b/kittentts/__init__.py index 9cf1a2d..e63ca99 100644 --- a/kittentts/__init__.py +++ b/kittentts/__init__.py @@ -1,7 +1,8 @@ from kittentts.get_model import get_model, KittenTTS +from kittentts.onnx_model import StreamingTTS __version__ = "0.1.0" __author__ = "KittenML" __description__ = "Ultra-lightweight text-to-speech model with just 15 million parameters" -__all__ = ["get_model", "KittenTTS"] +__all__ = ["get_model", "KittenTTS", "StreamingTTS"] diff --git a/kittentts/get_model.py b/kittentts/get_model.py index b0d47b2..8b53f5f 100644 --- a/kittentts/get_model.py +++ b/kittentts/get_model.py @@ -1,7 +1,7 @@ import json import os from huggingface_hub import hf_hub_download -from .onnx_model import KittenTTS_1_Onnx +from .onnx_model import KittenTTS_1_Onnx, StreamingTTS class KittenTTS: @@ -52,6 +52,18 @@ def 
generate_to_file(self, text, output_path, voice="expr-voice-5-m", speed=1.0, def available_voices(self): """Get list of available voices.""" return self.model.available_voices + + def create_streamer(self, voice="expr-voice-5-m", speed=1.0): + """Create a streaming TTS instance for real-time generation. + + Args: + voice: Voice to use for synthesis + speed: Speech speed (1.0 = normal) + + Returns: + StreamingTTS: A streaming TTS instance + """ + return StreamingTTS(self.model, voice=voice, speed=speed) def download_from_huggingface(repo_id="KittenML/kitten-tts-nano-0.1", cache_dir=None): diff --git a/kittentts/onnx_model.py b/kittentts/onnx_model.py index 7ea20b3..012c57d 100644 --- a/kittentts/onnx_model.py +++ b/kittentts/onnx_model.py @@ -1,6 +1,23 @@ -from misaki import en, espeak +# Try to load espeak-ng library if available (needed on Windows) +# This sets up the espeak-ng library and data paths properly +try: + import espeakng_loader + espeakng_loader.load_library() + # Set the data path environment variable required by espeak + import os + if 'ESPEAK_DATA_PATH' not in os.environ: + os.environ['ESPEAK_DATA_PATH'] = str(espeakng_loader.get_data_path()) + # Tell phonemizer where to find the espeak library + from phonemizer.backend.espeak.base import BaseEspeakBackend + BaseEspeakBackend.set_library(str(espeakng_loader.get_library_path())) +except Exception: + # If loader fails, phonemizer might still find system espeak + pass + import numpy as np + import phonemizer +from phonemizer.backend import BACKENDS import soundfile as sf import onnxruntime as ort from .preprocess import TextPreprocessor @@ -86,10 +103,44 @@ def __init__(self, model_path="kitten_tts_nano_preview.onnx", voices_path="voice voices_path: Path to the voices NPZ file """ self.model_path = model_path - self.voices = np.load(voices_path) - self.session = ort.InferenceSession(model_path) + self.voices = np.load(voices_path) + + # Configure ONNX Runtime for best audio quality and performance + 
sess_options = ort.SessionOptions() + + # Use all available cores for parallel processing + sess_options.intra_op_num_threads = 0 # 0 = use all cores + sess_options.inter_op_num_threads = 0 + + # Graph optimizations for better inference + sess_options.graph_optimization_level = ort.GraphOptimizationLevel.ORT_ENABLE_ALL + + # Enable memory pattern optimization + sess_options.enable_mem_pattern = True + + # Get available providers (prefer CPUExecutionProvider for consistency) + available_providers = ort.get_available_providers() + providers = [] - self.phonemizer = phonemizer.backend.EspeakBackend( + # For TTS quality/consistency, CPU is often more deterministic than GPU + if 'CPUExecutionProvider' in available_providers: + providers.append('CPUExecutionProvider') + elif 'AzureExecutionProvider' in available_providers: + providers.append('AzureExecutionProvider') + + # Create session with optimized settings + self.session = ort.InferenceSession( + model_path, + sess_options=sess_options, + providers=providers + ) + + # Use the BACKENDS dict to get EspeakBackend (handles API differences across versions) + EspeakBackend = BACKENDS.get('espeak') or BACKENDS.get('espeak-ng') + if EspeakBackend is None: + raise RuntimeError("No espeak backend available. 
Install espeak-ng and phonemizer.") + + self.phonemizer = EspeakBackend( language="en-us", preserve_punctuation=True, with_stress=True ) self.text_cleaner = TextCleaner() @@ -159,14 +210,56 @@ def generate_single_chunk(self, text: str, voice: str = "expr-voice-5-m", speed: onnx_inputs = self._prepare_inputs(text, voice, speed) outputs = self.session.run(None, onnx_inputs) + audio = outputs[0] - # Trim audio - audio = outputs[0][..., :-5000] + # Smart trimming: remove trailing silence while preserving actual audio content + audio = self._smart_trim_trailing_silence(audio) + + # Normalize audio to prevent clipping and ensure consistent volume + max_val = np.max(np.abs(audio)) + if max_val > 0: + # Soft normalization: don't over-compress, just prevent clipping + if max_val > 0.95: + audio = audio * (0.95 / max_val) return audio + def _smart_trim_trailing_silence(self, audio: np.ndarray, threshold: float = 0.01, + padding_ms: float = 50.0, sample_rate: int = 24000) -> np.ndarray: + """Trim trailing silence while preserving audio content. 
+ + Args: + audio: Audio data as numpy array + threshold: Amplitude threshold for silence detection + padding_ms: Milliseconds of padding to keep after audio ends + sample_rate: Audio sample rate + + Returns: + Trimmed audio data + """ + if audio.shape[-1] < 8000: # Don't trim very short audio + return audio + + # Find the last sample above the threshold + energy = np.abs(audio) + above_threshold = np.where(energy > threshold)[0] + + if len(above_threshold) == 0: + # All silence, return original + return audio + + # Find the end of the last audio segment + last_audio_sample = above_threshold[-1] + + # Add padding (default 50ms) to avoid cutting off decay + padding_samples = int(padding_ms / 1000.0 * sample_rate) + end_sample = min(last_audio_sample + padding_samples, audio.shape[-1]) + + return audio[..., :end_sample] + def generate_to_file(self, text: str, output_path: str, voice: str = "expr-voice-5-m", - speed: float = 1.0, sample_rate: int = 24000, clean_text: bool=True) -> None: + speed: float = 1.0, sample_rate: int = 24000, clean_text: bool=True, + subtype: str = 'PCM_16') -> None: """Synthesize speech and save to file. Args: @@ -176,8 +269,111 @@ def generate_to_file(self, text: str, output_path: str, voice: str = "expr-voice speed: Speech speed (1.0 = normal) sample_rate: Audio sample rate clean_text: If true, it will cleanup the text. Eg. replace numbers with words. 
+ subtype: SoundFile subtype for quality (PCM_16, PCM_24, FLOAT) """ audio = self.generate(text, voice, speed, clean_text=clean_text) - sf.write(output_path, audio, sample_rate) - print(f"Audio saved to {output_path}") + + # Ensure audio is float32 for best compatibility + if audio.dtype != np.float32: + audio = audio.astype(np.float32) + + # Write with specified subtype for quality + sf.write(output_path, audio, sample_rate, subtype=subtype) + print(f"Audio saved to {output_path} ({len(audio)/sample_rate:.2f}s at {sample_rate}Hz)") + + +class StreamingTTS: + """Sentence-level streaming TTS for real-time text-to-speech generation. + + Buffers incoming text and yields audio chunks as complete sentences are detected. + Ideal for use with streaming LLMs for conversational AI applications. + + Example: + >>> streamer = StreamingTTS(model) + >>> for token in llm_stream: + ... for audio_chunk in streamer.add_text(token): + ... play_audio(audio_chunk) + >>> # Don't forget to flush remaining text + >>> for audio_chunk in streamer.flush(): + ... play_audio(audio_chunk) + """ + + # Sentence-ending punctuation that triggers audio generation + SENTENCE_ENDINGS = '.!?' + + def __init__(self, tts_model: KittenTTS_1_Onnx, voice: str = "expr-voice-5-m", + speed: float = 1.0, clean_text: bool = True): + """Initialize the streaming TTS. + + Args: + tts_model: An initialized KittenTTS_1_Onnx model instance + voice: Voice to use for synthesis + speed: Speech speed (1.0 = normal) + clean_text: Whether to preprocess text before synthesis + """ + self.tts = tts_model + self.voice = voice + self.speed = speed + self.clean_text = clean_text + self._buffer = "" + + def add_text(self, text: str): + """Add text to the buffer and yield audio for any complete sentences. 
+ + Args: + text: Text chunk to add (e.g., a token from an LLM stream) + + Yields: + numpy.ndarray: Audio chunks for complete sentences + """ + self._buffer += text + + # Find complete sentences + while True: + # Find the earliest sentence ending + earliest_end = -1 + for ending in self.SENTENCE_ENDINGS: + pos = self._buffer.find(ending) + if pos != -1 and (earliest_end == -1 or pos < earliest_end): + earliest_end = pos + + if earliest_end == -1: + break + + # Extract the complete sentence (include the punctuation) + sentence = self._buffer[:earliest_end + 1].strip() + self._buffer = self._buffer[earliest_end + 1:].lstrip() + + if sentence: + audio = self.tts.generate_single_chunk(sentence, self.voice, self.speed) + yield audio + + def flush(self): + """Flush any remaining text in the buffer. + + Call this when the text stream is complete to synthesize + any remaining text that hasn't formed a complete sentence. + + Yields: + numpy.ndarray: Audio chunk for remaining text (if any) + """ + if self._buffer.strip(): + # Ensure the text ends with punctuation for natural prosody + text = self._buffer.strip() + if text[-1] not in self.SENTENCE_ENDINGS: + text += '.' 
+ + audio = self.tts.generate_single_chunk(text, self.voice, self.speed) + yield audio + + self._buffer = "" + + def reset(self): + """Clear the buffer without generating audio.""" + self._buffer = "" + + @property + def buffered_text(self) -> str: + """Return the current buffered text that hasn't been synthesized yet.""" + return self._buffer diff --git a/pyproject.toml b/pyproject.toml index c2d1e5c..b6f7eb8 100644 --- a/pyproject.toml +++ b/pyproject.toml @@ -18,14 +18,16 @@ classifiers = [ "Topic :: Scientific/Engineering :: Artificial Intelligence", ] dependencies = [ - "num2words", - "spacy", - "espeakng_loader", + "num2words>=0.5.13", + "spacy>=3.7.0", + "phonemizer>=3.3.0", + "espeakng-loader>=0.1.0", "misaki[en]>=0.9.4", - "onnxruntime", - "soundfile", - "numpy", - "huggingface_hub", + "onnxruntime>=1.16.0", + "soundfile>=0.12.0", + "numpy>=1.24.0,<2.0.0", + "huggingface-hub>=0.20.0", + "psutil>=5.9.0", ] [project.urls] diff --git a/requirements.txt b/requirements.txt index 37bfbb3..c8aa768 100644 --- a/requirements.txt +++ b/requirements.txt @@ -1,8 +1,20 @@ -num2words -spacy -espeakng_loader +# Core TTS dependencies +num2words>=0.5.13 +spacy>=3.7.0 +phonemizer>=3.3.0 +espeakng-loader>=0.1.0 misaki[en]>=0.9.4 -onnxruntime -soundfile -numpy -huggingface_hub + +# ML/Audio +onnxruntime>=1.16.0 +soundfile>=0.12.0 +numpy>=1.24.0,<2.0.0 + +# Model download +huggingface-hub>=0.20.0 + +# System monitoring +psutil>=5.9.0 + +# Optional: Better phonemization support +# espeak-ng (system package - see README for installation) diff --git a/run_webui.py b/run_webui.py new file mode 100644 index 0000000..4be69d5 --- /dev/null +++ b/run_webui.py @@ -0,0 +1,24 @@ +#!/usr/bin/env python3 +"""Entry point for KittenTTS WebUI.""" + +import argparse +from webui.server import run_server + + +def main(): + parser = argparse.ArgumentParser( + description="KittenTTS WebUI - A cute kitten-themed text-to-speech interface" + ) + parser.add_argument( + "--host", type=str, 
default="0.0.0.0", help="Host to bind to (default: 0.0.0.0)" + ) + parser.add_argument( + "--port", type=int, default=7880, help="Port to bind to (default: 7880)" + ) + args = parser.parse_args() + + run_server(host=args.host, port=args.port) + + +if __name__ == "__main__": + main() diff --git a/test_tts.py b/test_tts.py new file mode 100644 index 0000000..a71237c --- /dev/null +++ b/test_tts.py @@ -0,0 +1,158 @@ +#!/usr/bin/env python3 +""" +KittenTTS Audio Quality Test Script +Tests TTS generation and reports on audio quality metrics. +""" + +import sys +import time + +# Try to load espeak-ng library if available (needed on Windows) +try: + import espeakng_loader + espeakng_loader.load_library() + import os + if 'ESPEAK_DATA_PATH' not in os.environ: + os.environ['ESPEAK_DATA_PATH'] = str(espeakng_loader.get_data_path()) + from phonemizer.backend.espeak.base import BaseEspeakBackend + BaseEspeakBackend.set_library(str(espeakng_loader.get_library_path())) +except: + pass + +import numpy as np + +def test_basic_generation(): + """Test basic TTS generation.""" + print("=" * 60) + print("🐱 KittenTTS Audio Quality Test") + print("=" * 60) + + # Test imports + print("\n1. Testing imports...") + try: + from kittentts import KittenTTS + print(" ✓ KittenTTS imported successfully") + except Exception as e: + print(f" ✗ Import failed: {e}") + return False + + # Test model loading + print("\n2. Loading model (this may take a moment)...") + try: + # Use nano model for quick testing + model = KittenTTS("KittenML/kitten-tts-nano-0.8-fp32") + print(f" ✓ Model loaded") + print(f" Available voices: {model.available_voices}") + except Exception as e: + print(f" ✗ Model loading failed: {e}") + print(f" Error details: {type(e).__name__}") + import traceback + traceback.print_exc() + return False + + # Test audio generation + print("\n3. 
Generating test audio...") + test_texts = [ + ("Hello, this is a test of KittenTTS.", "Jasper"), + ("The quick brown fox jumps over the lazy dog.", "Bella"), + ] + + for text, voice in test_texts: + print(f"\n Testing with voice '{voice}':") + print(f" Text: \"{text}\"") + + try: + start = time.time() + audio = model.generate(text, voice=voice, speed=1.0) + duration = time.time() - start + + # Analyze audio + audio_duration = len(audio) / 24000 # 24kHz sample rate + max_amplitude = np.max(np.abs(audio)) + rms = np.sqrt(np.mean(audio**2)) + + print(f" ✓ Generated {audio_duration:.2f}s audio in {duration:.2f}s") + print(f" Max amplitude: {max_amplitude:.4f}") + print(f" RMS level: {rms:.4f}") + print(f" Real-time factor: {audio_duration/duration:.2f}x") + + # Quality checks + if max_amplitude > 1.0: + print(f" ⚠️ Warning: Audio clipping detected!") + elif max_amplitude < 0.1: + print(f" ⚠️ Warning: Audio level very low!") + else: + print(f" ✓ Audio levels OK") + + except Exception as e: + print(f" ✗ Generation failed: {e}") + import traceback + traceback.print_exc() + return False + + # Test file saving + print("\n4. Testing file output...") + try: + model.generate_to_file( + "This is a test file.", + "test_output.wav", + voice="Jasper", + subtype='PCM_16' + ) + print(" ✓ File saved to test_output.wav") + except Exception as e: + print(f" ✗ File saving failed: {e}") + return False + + print("\n" + "=" * 60) + print("✅ All tests passed! Audio quality looks good.") + print("=" * 60) + return True + +def benchmark_speed(): + """Benchmark generation speed.""" + print("\n" + "=" * 60) + print("⚡ Speed Benchmark") + print("=" * 60) + + try: + from kittentts import KittenTTS + model = KittenTTS("KittenML/kitten-tts-nano-0.8-fp32") + + text = "This is a benchmark test to measure generation speed." 
+ + # Warmup + print("Warming up...") + model.generate(text, voice="Jasper") + + # Benchmark + print("Running benchmark...") + times = [] + for _ in range(3): + start = time.time() + audio = model.generate(text, voice="Jasper") + times.append(time.time() - start) + + avg_time = np.mean(times) + audio_duration = len(audio) / 24000 + rtf = audio_duration / avg_time + + print(f"Average generation time: {avg_time:.3f}s") + print(f"Audio duration: {audio_duration:.2f}s") + print(f"Real-time factor: {rtf:.2f}x") + + if rtf > 1.0: + print("✅ Faster than real-time!") + else: + print("⚠️ Slower than real-time - expect delays") + + except Exception as e: + print(f"Benchmark failed: {e}") + +if __name__ == "__main__": + success = test_basic_generation() + + if success and '--benchmark' in sys.argv: + benchmark_speed() + + sys.exit(0 if success else 1) diff --git a/webui/__init__.py b/webui/__init__.py new file mode 100644 index 0000000..bf9fc8b --- /dev/null +++ b/webui/__init__.py @@ -0,0 +1,3 @@ +from .server import create_app, run_server + +__all__ = ["create_app", "run_server"] diff --git a/webui/server.py b/webui/server.py new file mode 100644 index 0000000..cd3128a --- /dev/null +++ b/webui/server.py @@ -0,0 +1,500 @@ +import io +import base64 +import tempfile +import time +import os +import uuid +import json +from typing import Optional, Dict, Any, Generator +from pathlib import Path +from datetime import datetime + +import numpy as np +import soundfile as sf +from fastapi import FastAPI, HTTPException, BackgroundTasks +from fastapi.responses import HTMLResponse, JSONResponse, FileResponse, StreamingResponse +from fastapi.staticfiles import StaticFiles +from fastapi.middleware.cors import CORSMiddleware +from pydantic import BaseModel + +# Configuration +CACHE_DIR = Path.home() / ".cache" / "kittentts" +CACHE_DIR.mkdir(parents=True, exist_ok=True) + +MODELS = { + "kitten-tts-mini": "KittenML/kitten-tts-mini-0.8", + "kitten-tts-micro": "KittenML/kitten-tts-micro-0.8", + 
"kitten-tts-nano": "KittenML/kitten-tts-nano-0.8-fp32", + "kitten-tts-nano-int8": "KittenML/kitten-tts-nano-0.8-int8", +} + +VOICE_ALIASES = { + "Bella": "expr-voice-2-f", + "Jasper": "expr-voice-2-m", + "Luna": "expr-voice-3-f", + "Bruno": "expr-voice-3-m", + "Rosie": "expr-voice-4-f", + "Hugo": "expr-voice-4-m", + "Kiki": "expr-voice-5-f", + "Leo": "expr-voice-5-m", +} + +VOICES = [ + { + "id": "Bella", + "name": "Bella", + "gender": "female", + "description": "Warm & gentle", + }, + { + "id": "Jasper", + "name": "Jasper", + "gender": "male", + "description": "Clear & professional", + }, + {"id": "Luna", "name": "Luna", "gender": "female", "description": "Soft & melodic"}, + { + "id": "Bruno", + "name": "Bruno", + "gender": "male", + "description": "Deep & resonant", + }, + { + "id": "Rosie", + "name": "Rosie", + "gender": "female", + "description": "Bright & cheerful", + }, + { + "id": "Hugo", + "name": "Hugo", + "gender": "male", + "description": "Confident & steady", + }, + { + "id": "Kiki", + "name": "Kiki", + "gender": "female", + "description": "Playful & energetic", + }, + {"id": "Leo", "name": "Leo", "gender": "male", "description": "Friendly & warm"}, +] + +MODEL_INFO = [ + { + "id": "kitten-tts-nano", + "name": "Nano (FP32)", + "params": "15M", + "size": "56MB", + "description": "⭐ Best quality - Full 32-bit precision", + "quality": "best", + "precision": "FP32", + }, + { + "id": "kitten-tts-mini", + "name": "Mini (INT8)", + "params": "80M", + "size": "80MB", + "description": "Largest model, INT8 quantized", + "quality": "good", + "precision": "INT8", + }, + { + "id": "kitten-tts-micro", + "name": "Micro (INT8)", + "params": "40M", + "size": "41MB", + "description": "Balanced size, INT8 quantized", + "quality": "good", + "precision": "INT8", + }, + { + "id": "kitten-tts-nano-int8", + "name": "Nano (INT8)", + "params": "15M", + "size": "19MB", + "description": "Smallest, INT8 quantized", + "quality": "basic", + "precision": "INT8", + }, +] + +# In-memory 
model cache +loaded_models: Dict[str, Any] = {} +model_load_times: Dict[str, float] = {} + +# Streaming session cache (session_id -> StreamingTTS instance) +streaming_sessions: Dict[str, Any] = {} + +# Stats tracking +class StatsTracker: + def __init__(self): + self.total_requests = 0 + self.total_generation_time = 0.0 + self.total_audio_duration = 0.0 + self.request_history: list = [] + self.max_history = 50 + + def record_request(self, model_id: str, voice: str, text_length: int, + generation_time: float, audio_duration: float, + load_time: float = 0.0, preprocessing_time: float = 0.0): + self.total_requests += 1 + self.total_generation_time += generation_time + self.total_audio_duration += audio_duration + + entry = { + "timestamp": datetime.now().isoformat(), + "model": model_id, + "voice": voice, + "text_length": text_length, + "generation_time": round(generation_time, 3), + "audio_duration": round(audio_duration, 3), + "load_time": round(load_time, 3), + "preprocessing_time": round(preprocessing_time, 3), + "rtf": round(audio_duration / generation_time, 3) if generation_time > 0 else 0, + } + + self.request_history.insert(0, entry) + if len(self.request_history) > self.max_history: + self.request_history = self.request_history[:self.max_history] + + def get_stats(self): + avg_gen_time = (self.total_generation_time / self.total_requests) if self.total_requests > 0 else 0 + avg_rtf = (self.total_audio_duration / self.total_generation_time) if self.total_generation_time > 0 else 0 + + return { + "total_requests": self.total_requests, + "avg_generation_time": round(avg_gen_time, 3), + "avg_rtf": round(avg_rtf, 3), + "total_audio_generated": round(self.total_audio_duration, 2), + "recent_requests": self.request_history[:10], + } + +stats_tracker = StatsTracker() + + +class GenerateRequest(BaseModel): + text: str + model: str = "kitten-tts-nano" # Default to FP32 for best quality + voice: str = "Bella" + speed: float = 1.0 + + +class StreamChunkRequest(BaseModel): 
+ """Request model for streaming TTS endpoint.""" + text: str + model: str = "kitten-tts-nano" + voice: str = "Bella" + speed: float = 1.0 + flush: bool = False # Set True on final chunk to flush remaining text + + +class GenerateResponse(BaseModel): + audio_base64: str + sample_rate: int + duration: float + debug_info: Optional[Dict[str, Any]] = None + + +def get_cache_size(): + """Calculate total cache size in MB.""" + total_size = 0 + if CACHE_DIR.exists(): + for dirpath, dirnames, filenames in os.walk(CACHE_DIR): + for f in filenames: + fp = os.path.join(dirpath, f) + total_size += os.path.getsize(fp) + return round(total_size / (1024 * 1024), 2) + + +def get_model(model_id: str): + """Get or load a model with caching and timing.""" + if model_id not in MODELS: + raise ValueError(f"Unknown model: {model_id}") + + if model_id not in loaded_models: + from kittentts import KittenTTS + + start_time = time.time() + repo_id = MODELS[model_id] + loaded_models[model_id] = KittenTTS(repo_id, cache_dir=str(CACHE_DIR)) + load_time = time.time() - start_time + model_load_times[model_id] = load_time + print(f"[Model Load] {model_id} loaded in {load_time:.2f}s") + + return loaded_models[model_id], model_load_times.get(model_id, 0.0) + + +def create_app() -> FastAPI: + app = FastAPI( + title="KittenTTS WebUI", + description="A cute kitten-themed text-to-speech web interface", + version="1.0.0", + ) + + app.add_middleware( + CORSMiddleware, + allow_origins=["*"], + allow_credentials=True, + allow_methods=["*"], + allow_headers=["*"], + ) + + static_dir = Path(__file__).parent / "static" + if static_dir.exists(): + app.mount("/static", StaticFiles(directory=str(static_dir)), name="static") + + @app.get("/", response_class=HTMLResponse) + async def index(): + template_path = Path(__file__).parent / "templates" / "index.html" + if template_path.exists(): + return HTMLResponse(content=template_path.read_text(encoding="utf-8")) + raise HTTPException(status_code=404, 
detail="Template not found") + + @app.get("/api/models") + async def get_models(): + return {"models": MODEL_INFO} + + @app.get("/api/voices") + async def get_voices(): + return {"voices": VOICES} + + @app.get("/api/health") + async def health_check(): + return { + "status": "healthy", + "loaded_models": list(loaded_models.keys()), + "cache_dir": str(CACHE_DIR), + "cache_size_mb": get_cache_size(), + } + + @app.get("/api/stats") + async def get_stats(): + """Get detailed stats for debugging.""" + import sys + + # Try to get memory usage, fallback if psutil not available + try: + import psutil + process = psutil.Process() + memory_info = process.memory_info() + memory_mb = round(memory_info.rss / (1024 * 1024), 2) + except: + memory_mb = "N/A" + + return { + "generation_stats": stats_tracker.get_stats(), + "system": { + "cache_directory": str(CACHE_DIR), + "cache_size_mb": get_cache_size(), + "loaded_models": list(loaded_models.keys()), + "model_load_times": {k: round(v, 3) for k, v in model_load_times.items()}, + "python_version": sys.version.split()[0], + "memory_usage_mb": memory_mb, + }, + "available_models": list(MODELS.keys()), + "available_voices": [v["id"] for v in VOICES], + } + + @app.post("/api/generate", response_model=GenerateResponse) + async def generate_audio(request: GenerateRequest): + if not request.text.strip(): + raise HTTPException(status_code=400, detail="Text cannot be empty") + + if request.speed < 0.25 or request.speed > 3.0: + raise HTTPException( + status_code=400, detail="Speed must be between 0.25 and 3.0" + ) + + try: + # Load model with timing + model_start = time.time() + model, load_time = get_model(request.model) + model_load_elapsed = time.time() - model_start + + voice_id = VOICE_ALIASES.get(request.voice, request.voice) + + # Generate audio with timing + gen_start = time.time() + audio = model.generate( + text=request.text, voice=voice_id, speed=request.speed + ) + generation_time = time.time() - gen_start + + if 
isinstance(audio, np.ndarray): + audio_array = audio + else: + audio_array = np.array(audio) + + if audio_array.ndim > 1: + audio_array = audio_array.squeeze() + + sample_rate = 24000 + duration = len(audio_array) / sample_rate + + # Ensure proper audio format for web playback + if audio_array.dtype != np.float32: + audio_array = audio_array.astype(np.float32) + + # Normalize if needed to prevent clipping + max_val = np.max(np.abs(audio_array)) + if max_val > 0.99: + audio_array = audio_array * (0.99 / max_val) + + buffer = io.BytesIO() + sf.write(buffer, audio_array, sample_rate, format="WAV", subtype='PCM_16') + buffer.seek(0) + audio_base64 = base64.b64encode(buffer.read()).decode("utf-8") + + # Record stats + stats_tracker.record_request( + model_id=request.model, + voice=request.voice, + text_length=len(request.text), + generation_time=generation_time, + audio_duration=duration, + load_time=load_time if load_time > 0 else model_load_elapsed, + ) + + # Debug info for API response + debug_info = { + "model_load_time": round(load_time if load_time > 0 else model_load_elapsed, 3), + "generation_time": round(generation_time, 3), + "total_time": round(load_time + generation_time, 3), + "real_time_factor": round(duration / generation_time, 3) if generation_time > 0 else 0, + "audio_samples": len(audio_array), + "sample_rate": sample_rate, + } + + return GenerateResponse( + audio_base64=audio_base64, + sample_rate=sample_rate, + duration=round(duration, 2), + debug_info=debug_info, + ) + + except Exception as e: + raise HTTPException(status_code=500, detail=str(e)) + + @app.post("/api/stream/start") + async def start_streaming_session(model: str = "kitten-tts-nano", voice: str = "Bella", speed: float = 1.0): + """Start a new streaming TTS session. + + Returns a session_id to use for subsequent streaming requests. 
+ """ + if speed < 0.25 or speed > 3.0: + raise HTTPException(status_code=400, detail="Speed must be between 0.25 and 3.0") + + try: + tts_model, _ = get_model(model) + from kittentts import StreamingTTS + + voice_id = VOICE_ALIASES.get(voice, voice) + streamer = tts_model.create_streamer(voice=voice_id, speed=speed) + + session_id = str(uuid.uuid4()) + streaming_sessions[session_id] = { + "streamer": streamer, + "model": model, + "voice": voice, + "created_at": datetime.now().isoformat(), + } + + return {"session_id": session_id, "status": "created"} + except Exception as e: + raise HTTPException(status_code=500, detail=str(e)) + + @app.post("/api/stream/chunk") + async def stream_chunk(request: StreamChunkRequest, session_id: str): + """Add text to a streaming session and get audio for complete sentences. + + Args: + session_id: The streaming session ID from /api/stream/start + request: Contains text chunk and flush flag + + Returns: + JSON with audio_base64 chunks for any complete sentences + """ + if session_id not in streaming_sessions: + raise HTTPException(status_code=404, detail="Session not found. 
Start a new session with /api/stream/start") + + session = streaming_sessions[session_id] + streamer = session["streamer"] + + try: + audio_chunks = [] + + # Process incoming text and get audio for complete sentences + for audio in streamer.add_text(request.text): + audio_chunks.append(audio) + + # If flush is True, also get any remaining buffered text + if request.flush: + for audio in streamer.flush(): + audio_chunks.append(audio) + + # Convert audio chunks to base64 + sample_rate = 24000 + audio_base64_chunks = [] + + for audio in audio_chunks: + if isinstance(audio, np.ndarray): + audio_array = audio + else: + audio_array = np.array(audio) + + if audio_array.ndim > 1: + audio_array = audio_array.squeeze() + + if audio_array.dtype != np.float32: + audio_array = audio_array.astype(np.float32) + + # Normalize if needed + max_val = np.max(np.abs(audio_array)) + if max_val > 0.99: + audio_array = audio_array * (0.99 / max_val) + + buffer = io.BytesIO() + sf.write(buffer, audio_array, sample_rate, format="WAV", subtype='PCM_16') + buffer.seek(0) + audio_base64_chunks.append(base64.b64encode(buffer.read()).decode("utf-8")) + + return { + "audio_chunks": audio_base64_chunks, + "sample_rate": sample_rate, + "buffered_text": streamer.buffered_text, + "status": "flushed" if request.flush else "streaming", + } + + except Exception as e: + raise HTTPException(status_code=500, detail=str(e)) + + @app.delete("/api/stream/end/{session_id}") + async def end_streaming_session(session_id: str): + """End a streaming session and release resources.""" + if session_id in streaming_sessions: + del streaming_sessions[session_id] + return {"status": "ended", "session_id": session_id} + raise HTTPException(status_code=404, detail="Session not found") + + @app.get("/favicon.ico") + async def favicon(): + return FileResponse( + Path(__file__).parent / "static" / "favicon.svg", media_type="image/svg+xml" + ) + + return app + + +def run_server(host: str = "0.0.0.0", port: int = 7860): + 
import uvicorn + + app = create_app() + print(f"\n🐱 KittenTTS WebUI starting at http://{host}:{port}") + print(f"📁 Cache directory: {CACHE_DIR}") + print("Press Ctrl+C to stop\n") + uvicorn.run(app, host=host, port=port) + + +if __name__ == "__main__": + run_server() diff --git a/webui/static/app.js b/webui/static/app.js new file mode 100644 index 0000000..03e152e --- /dev/null +++ b/webui/static/app.js @@ -0,0 +1,429 @@ +// KittenTTS WebUI - Cute kitten-themed interactions 🐱 + +const models = [ + { id: 'kitten-tts-nano', name: 'Nano (FP32)', params: '15M', size: '56MB', precision: 'FP32', quality: 'best', description: '⭐ Best quality - Full 32-bit precision', emoji: '⭐' }, + { id: 'kitten-tts-mini', name: 'Mini (INT8)', params: '80M', size: '80MB', precision: 'INT8', quality: 'good', description: 'Largest model, INT8 quantized', emoji: '🐱' }, + { id: 'kitten-tts-micro', name: 'Micro (INT8)', params: '40M', size: '41MB', precision: 'INT8', quality: 'good', description: 'Balanced size, INT8 quantized', emoji: '🐈' }, + { id: 'kitten-tts-nano-int8', name: 'Nano (INT8)', params: '15M', size: '19MB', precision: 'INT8', quality: 'basic', description: 'Smallest, INT8 quantized', emoji: '💫' } +]; + +const voices = [ + { id: 'Bella', name: 'Bella', gender: 'female', description: 'Warm & gentle', emoji: '👩' }, + { id: 'Jasper', name: 'Jasper', gender: 'male', description: 'Clear & professional', emoji: '👨' }, + { id: 'Luna', name: 'Luna', gender: 'female', description: 'Soft & melodic', emoji: '🌙' }, + { id: 'Bruno', name: 'Bruno', gender: 'male', description: 'Deep & resonant', emoji: '🐻' }, + { id: 'Rosie', name: 'Rosie', gender: 'female', description: 'Bright & cheerful', emoji: '🌸' }, + { id: 'Hugo', name: 'Hugo', gender: 'male', description: 'Confident & steady', emoji: '💼' }, + { id: 'Kiki', name: 'Kiki', gender: 'female', description: 'Playful & energetic', emoji: '🎀' }, + { id: 'Leo', name: 'Leo', gender: 'male', description: 'Friendly & warm', emoji: '🦁' } +]; + 
// Placeholder texts; one is picked at random on page load.
const sampleTexts = [
    "Hello! I'm KittenTTS, your cute and lightweight text-to-speech companion! 🐱",
    "The quick brown fox jumps over the lazy dog. Meow! 🐾",
    "Welcome to KittenTTS! For kittens, by kittens. 💕",
    "Did you know? KittenTTS can run entirely on your CPU without a GPU! ✨",
    "Purrr-fect speech synthesis at your fingertips! 🎙️"
];

// Declared for audio effects; not populated anywhere visible in this file —
// NOTE(review): possibly leftover scaffolding, confirm before removing.
let audioContext = null;
let meowSounds = [];

// Entry point: wires up the whole UI. Called from the DOMContentLoaded handler.
function init() {
    initializeTheme();
    populateModels();
    populateVoices();
    setupEventListeners();
    addFloatingPaws();

    // Randomly select a sample text (only as placeholder, never as value)
    const textarea = document.getElementById('textInput');
    if (textarea && !textarea.value) {
        textarea.placeholder = sampleTexts[Math.floor(Math.random() * sampleTexts.length)];
    }

    // Load initial stats
    loadDebugStats();
}

// Add floating paw decorations (purely cosmetic, inserted before .container)
function addFloatingPaws() {
    const container = document.querySelector('.container');
    for (let i = 0; i < 5; i++) {
        const paw = document.createElement('div');
        paw.className = 'paw-bg';
        paw.innerHTML = '🐾';
        // Random position/size/delay so each paw drifts independently.
        paw.style.left = `${Math.random() * 90}%`;
        paw.style.top = `${Math.random() * 90}%`;
        paw.style.animationDelay = `${Math.random() * -20}s`;
        paw.style.fontSize = `${6 + Math.random() * 6}rem`;
        document.body.insertBefore(paw, container);
    }
}

// Resolve the initial theme: saved preference wins, else follow the OS setting.
function initializeTheme() {
    const savedTheme = localStorage.getItem('kitten-tts-theme');
    const prefersDark = window.matchMedia('(prefers-color-scheme: dark)').matches;
    const theme = savedTheme || (prefersDark ? 'dark' : 'light');
    setTheme(theme);
}

// Apply a theme ('light' | 'dark'), persist it, and refresh the toggle icon.
function setTheme(theme) {
    document.documentElement.setAttribute('data-theme', theme);
    localStorage.setItem('kitten-tts-theme', theme);
    updateThemeIcon(theme);
}

// Sync the theme-toggle button's icon and accessible label with the theme.
function updateThemeIcon(theme) {
    const btn = document.getElementById('themeToggle');
    btn.innerHTML = theme === 'dark' ? '☀️' : '🌙';
    btn.setAttribute('aria-label', theme === 'dark' ?
'Switch to light mode' : 'Switch to dark mode'); +} + +function toggleTheme() { + const current = document.documentElement.getAttribute('data-theme'); + setTheme(current === 'dark' ? 'light' : 'dark'); +} + +function populateModels() { + const select = document.getElementById('modelSelect'); + select.innerHTML = models.map(m => { + const qualityIcon = m.quality === 'best' ? '⭐' : m.quality === 'good' ? '✓' : '○'; + return ``; + }).join(''); + + // Add change handler to update info display + select.addEventListener('change', updateModelInfo); + updateModelInfo(); +} + +function updateModelInfo() { + const select = document.getElementById('modelSelect'); + const model = models.find(m => m.id === select.value); + const infoEl = document.getElementById('modelInfo'); + if (infoEl && model) { + const qualityBadge = model.quality === 'best' ? '⭐ Best Quality' : + model.quality === 'good' ? '✓ Good' : '○ Basic'; + infoEl.textContent = `${model.params} params • ${model.size} • ${model.precision} precision • ${qualityBadge}`; + } +} + +function populateVoices() { + const select = document.getElementById('voiceSelect'); + select.innerHTML = voices.map(v => + `` + ).join(''); + + select.addEventListener('change', updateVoiceInfo); + updateVoiceInfo(); +} + +function updateVoiceInfo() { + const select = document.getElementById('voiceSelect'); + const voice = voices.find(v => v.id === select.value); + const infoEl = document.getElementById('voiceInfo'); + if (infoEl && voice) { + const genderEmoji = voice.gender === 'female' ? 
'♀️' : '♂️';
        infoEl.textContent = `${genderEmoji} ${voice.gender} voice • ${voice.description}`;
    }
}

// Attach all UI event handlers. Debug-panel controls are optional in the DOM.
function setupEventListeners() {
    document.getElementById('themeToggle').addEventListener('click', toggleTheme);
    document.getElementById('speedSlider').addEventListener('input', updateSpeedDisplay);
    document.getElementById('generateBtn').addEventListener('click', handleGenerate);

    // Debug panel toggle
    const debugToggle = document.getElementById('debugToggle');
    if (debugToggle) {
        debugToggle.addEventListener('click', toggleDebugPanel);
    }

    // Debug refresh button
    const refreshBtn = document.getElementById('refreshStatsBtn');
    if (refreshBtn) {
        refreshBtn.addEventListener('click', loadDebugStats);
    }
}

// Reflect the speed slider's value (and a speed-themed emoji) in the label.
function updateSpeedDisplay() {
    const slider = document.getElementById('speedSlider');
    const value = slider.value;
    // Only replace the text node so the emoji span next to it is preserved.
    document.getElementById('speedValue').childNodes[0].textContent = `${value}x `;

    // Change emoji based on speed
    let emoji = '🐱';
    if (value < 0.8) emoji = '🐢'; // Slow
    else if (value > 1.5) emoji = '⚡'; // Fast
    else if (value > 1.2) emoji = '🐇'; // Quick

    const emojiEl = document.getElementById('speedEmoji');
    if (emojiEl) emojiEl.textContent = emoji;
}

// Toggle the generate button between idle and busy (spinner) states.
function showLoading(show) {
    const btn = document.getElementById('generateBtn');
    const btnText = document.getElementById('btnText');
    const spinner = document.getElementById('btnSpinner');

    btn.disabled = show;
    btnText.style.display = show ? 'none' : 'inline';
    spinner.style.display = show ? 'inline-block' : 'none';

    if (show) {
        btn.classList.add('generating');
    } else {
        btn.classList.remove('generating');
    }
}

// Show a transient (5s) error banner.
function showError(message) {
    const el = document.getElementById('errorMessage');
    el.textContent = '😿 ' + message;
    el.classList.add('visible');
    setTimeout(() => el.classList.remove('visible'), 5000);
}

// Load generated audio into the player, wire up the download link, and autoplay.
function showOutput(audioBase64, duration) {
    const section = document.getElementById('outputSection');
    const audio = document.getElementById('audioPlayer');
    const durationEl = document.getElementById('audioDuration');
    const downloadBtn = document.getElementById('downloadBtn');

    audio.src = `data:audio/wav;base64,${audioBase64}`;
    durationEl.textContent = `⏱️ Duration: ${duration}s`;

    // Object URL for the download link.
    // NOTE(review): URL.createObjectURL is never revoked, so repeated
    // generations leak blob URLs for the page's lifetime — confirm intent.
    const blob = base64ToBlob(audioBase64, 'audio/wav');
    const url = URL.createObjectURL(blob);
    downloadBtn.href = url;
    downloadBtn.download = `kitten-tts-${Date.now()}.wav`;

    section.classList.add('visible');
    section.scrollIntoView({ behavior: 'smooth', block: 'nearest' });

    // Auto-play the audio
    audio.play().catch(() => {
        // Auto-play blocked, user will need to click
    });
}

// Decode a base64 string into a Blob of the given MIME type.
function base64ToBlob(base64, mimeType) {
    const byteChars = atob(base64);
    const byteNumbers = new Array(byteChars.length);
    for (let i = 0; i < byteChars.length; i++) {
        byteNumbers[i] = byteChars.charCodeAt(i);
    }
    const byteArray = new Uint8Array(byteNumbers);
    return new Blob([byteArray], { type: mimeType });
}

// Main generate flow: read form state, POST to /api/generate, render results.
async function handleGenerate() {
    const text = document.getElementById('textInput').value.trim();
    const model = document.getElementById('modelSelect').value;
    const voice = document.getElementById('voiceSelect').value;
    const speed = parseFloat(document.getElementById('speedSlider').value);

    if (!text) {
        showError('Please enter some text to generate speech! 🐾');
        return;
    }

    showLoading(true);
    document.getElementById('errorMessage').classList.remove('visible');

    try {
        const response = await fetch('/api/generate', {
            method: 'POST',
            headers: { 'Content-Type': 'application/json' },
            body: JSON.stringify({ text, model, voice, speed })
        });

        const data = await response.json();
        console.log('Generation response:', data);

        if (!response.ok) {
            // FastAPI error payloads carry the message in `detail`.
            throw new Error(data.detail || 'Generation failed 😿');
        }

        showOutput(data.audio_base64, data.duration);

        // Update debug stats if available
        if (data.debug_info) {
            console.log('Debug info received:', data.debug_info);
            updateCurrentStats(data.debug_info);
        } else {
            console.log('No debug_info in response');
        }

        // Refresh session stats
        await loadDebugStats();

        // Success animation on button
        const btn = document.getElementById('generateBtn');
        btn.style.transform = 'scale(0.98)';
        setTimeout(() => btn.style.transform = '', 200);

    } catch (error) {
        showError(error.message || 'An error occurred during generation 😿');
        console.error('Generation error:', error);
    } finally {
        showLoading(false);
    }
}

// Debug Panel Functions
// Fill the per-request stat fields from the /api/generate debug_info payload.
function updateCurrentStats(debugInfo) {
    console.log('Updating current stats:', debugInfo);
    const modelLoadEl = document.getElementById('statModelLoad');
    const genTimeEl = document.getElementById('statGenTime');
    const totalTimeEl = document.getElementById('statTotalTime');
    const rtfEl = document.getElementById('statRTF');

    if (modelLoadEl) modelLoadEl.textContent = debugInfo.model_load_time ? `${debugInfo.model_load_time}s` : '-';
    if (genTimeEl) genTimeEl.textContent = debugInfo.generation_time ? `${debugInfo.generation_time}s` : '-';
    if (totalTimeEl) totalTimeEl.textContent = debugInfo.total_time ?
`${debugInfo.total_time}s` : '-'; + if (rtfEl) rtfEl.textContent = debugInfo.real_time_factor || '-'; +} + +async function loadDebugStats() { + try { + console.log('Loading debug stats...'); + const response = await fetch('/api/stats'); + if (!response.ok) { + console.error('Stats fetch failed:', response.status); + return; + } + + const data = await response.json(); + console.log('Stats received:', data); + + // Update session stats + if (data.generation_stats) { + const sessionRequests = document.getElementById('sessionRequests'); + const sessionAvgGen = document.getElementById('sessionAvgGen'); + const sessionAvgRTF = document.getElementById('sessionAvgRTF'); + const sessionAudio = document.getElementById('sessionAudio'); + + if (sessionRequests) sessionRequests.textContent = data.generation_stats.total_requests || 0; + if (sessionAvgGen) sessionAvgGen.textContent = `${data.generation_stats.avg_generation_time || 0}s`; + if (sessionAvgRTF) sessionAvgRTF.textContent = data.generation_stats.avg_rtf || 0; + if (sessionAudio) sessionAudio.textContent = `${data.generation_stats.total_audio_generated || 0}s`; + } + + // Update system info + if (data.system) { + const systemInfoEl = document.getElementById('systemInfo'); + if (systemInfoEl) { + systemInfoEl.innerHTML = ` +
+ Cache Directory + ${data.system.cache_directory || 'N/A'} +
+
+ Cache Size + ${data.system.cache_size_mb || 0} MB +
+
+ Memory Usage + ${data.system.memory_usage_mb || 'N/A'} MB +
+
+ Python Version + ${data.system.python_version || 'N/A'} +
+
+ Loaded Models + ${(data.system.loaded_models || []).join(', ') || 'None'} +
+ `; + } + } + + // Update recent requests + const requestsEl = document.getElementById('recentRequests'); + if (requestsEl && data.generation_stats && data.generation_stats.recent_requests) { + if (data.generation_stats.recent_requests.length === 0) { + requestsEl.innerHTML = '
No recent generations
'; + } else { + requestsEl.innerHTML = data.generation_stats.recent_requests.map(req => ` +
+ ${req.model || '?'} + ${req.voice || '?'} + ${req.generation_time || 0}s + RTF: ${req.rtf || 0} +
+ `).join(''); + } + } + console.log('Debug stats updated successfully'); + } catch (error) { + console.error('Failed to load debug stats:', error); + } +} + +function toggleDebugPanel() { + const panel = document.getElementById('debugPanel'); + const content = document.getElementById('debugContent'); + + console.log('Toggling debug panel'); + if (content.style.display === 'none') { + content.style.display = 'block'; + panel.classList.add('expanded'); + loadDebugStats(); + } else { + content.style.display = 'none'; + panel.classList.remove('expanded'); + } +} + +// Easter egg: Meow on logo click +document.addEventListener('DOMContentLoaded', () => { + init(); + + const logoIcon = document.querySelector('.logo-icon'); + if (logoIcon) { + logoIcon.addEventListener('click', () => { + // Visual feedback + logoIcon.style.transform = 'scale(1.1) rotate(-10deg)'; + setTimeout(() => { + logoIcon.style.transform = ''; + }, 300); + + // Create a cute popup + const popup = document.createElement('div'); + popup.textContent = 'Meow! 
🐱'; + popup.style.cssText = ` + position: fixed; + top: 100px; + left: 50%; + transform: translateX(-50%); + background: linear-gradient(135deg, var(--accent-primary), var(--accent-secondary)); + color: white; + padding: 0.75rem 1.5rem; + border-radius: 9999px; + font-weight: 600; + font-size: 1rem; + box-shadow: 0 4px 12px rgba(232, 146, 160, 0.4); + z-index: 1000; + animation: popIn 0.3s ease, fadeOut 0.3s ease 1s forwards; + pointer-events: none; + `; + document.body.appendChild(popup); + setTimeout(() => popup.remove(), 1500); + }); + } +}); + +// Add popIn animation +const style = document.createElement('style'); +style.textContent = ` + @keyframes popIn { + from { opacity: 0; transform: translateX(-50%) scale(0.8) translateY(10px); } + to { opacity: 1; transform: translateX(-50%) scale(1) translateY(0); } + } + @keyframes fadeOut { + from { opacity: 1; transform: translateX(-50%) scale(1); } + to { opacity: 0; transform: translateX(-50%) scale(0.8) translateY(-10px); } + } +`; +document.head.appendChild(style); diff --git a/webui/static/favicon.svg b/webui/static/favicon.svg new file mode 100644 index 0000000..63b36b1 --- /dev/null +++ b/webui/static/favicon.svg @@ -0,0 +1,51 @@ + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + diff --git a/webui/static/icon.png b/webui/static/icon.png new file mode 100644 index 0000000..fc7499a Binary files /dev/null and b/webui/static/icon.png differ diff --git a/webui/static/style.css b/webui/static/style.css new file mode 100644 index 0000000..7e8a091 --- /dev/null +++ b/webui/static/style.css @@ -0,0 +1,1090 @@ +/* KittenTTS - Cute Kitten Theme 🐱 */ +/* Inspired by the GitHub repo design */ + +:root { + /* Soft kitten color palette */ + --bg-primary: #faf7f5; + --bg-secondary: #ffffff; + --bg-tertiary: #f5f0ed; + --bg-card: #fff9f7; + + --text-primary: #2d2a35; + --text-secondary: #6b6570; + --text-muted: #9a959f; + --text-accent: #8b5a6b; + + /* Soft pink/lavender 
accents */ + --accent-primary: #e892a0; + --accent-secondary: #d67b8a; + --accent-light: #fce8eb; + --accent-lavender: #c9b8d4; + --accent-lavender-light: #f0e8f5; + + --border-color: #ebe5e0; + --border-accent: #f0d5da; + + --shadow-sm: 0 2px 8px rgba(139, 90, 107, 0.06); + --shadow-md: 0 4px 16px rgba(139, 90, 107, 0.1); + --shadow-lg: 0 8px 32px rgba(139, 90, 107, 0.14); + --shadow-glow: 0 0 20px rgba(232, 146, 160, 0.3); + + --radius-sm: 12px; + --radius-md: 16px; + --radius-lg: 24px; + --radius-xl: 32px; + --radius-full: 9999px; + + --transition: 0.25s cubic-bezier(0.4, 0, 0.2, 1); + --transition-bounce: 0.4s cubic-bezier(0.68, -0.55, 0.265, 1.55); +} + +[data-theme="dark"] { + --bg-primary: #1a1820; + --bg-secondary: #252230; + --bg-tertiary: #2d2a38; + --bg-card: #2a2733; + + --text-primary: #f5f0f7; + --text-secondary: #b5adb8; + --text-muted: #7a7580; + --text-accent: #e8b8c8; + + --accent-primary: #e892a0; + --accent-secondary: #f0a0b0; + --accent-light: #3d3035; + --accent-lavender: #8b7a9a; + --accent-lavender-light: #3d3545; + + --border-color: #3d3845; + --border-accent: #4d3842; + + --shadow-sm: 0 2px 8px rgba(0, 0, 0, 0.25); + --shadow-md: 0 4px 16px rgba(0, 0, 0, 0.35); + --shadow-lg: 0 8px 32px rgba(0, 0, 0, 0.45); + --shadow-glow: 0 0 30px rgba(232, 146, 160, 0.2); +} + +* { + margin: 0; + padding: 0; + box-sizing: border-box; +} + +html { + font-size: 22.4px; + /* 16px * 1.4 = 22.4px for 140% scale */ + scroll-behavior: smooth; +} + +body { + font-family: 'Inter', -apple-system, BlinkMacSystemFont, 'Segoe UI', Roboto, sans-serif; + background: var(--bg-primary); + color: var(--text-primary); + min-height: 100vh; + line-height: 1.6; + transition: background var(--transition), color var(--transition); + overflow-x: hidden; +} + +/* Floating paw background decorations */ +.paw-bg { + position: fixed; + pointer-events: none; + opacity: 0.04; + font-size: 8rem; + z-index: 0; + animation: float 20s ease-in-out infinite; + color: #8b5a6b; + /* Pink-ish 
color for light mode */ +} + +/* Dark mode paw colors - more visible */ +[data-theme="dark"] .paw-bg { + opacity: 0.15; + color: #e892a0; + /* Accent pink color for better visibility */ +} + +.paw-bg:nth-child(1) { + top: 5%; + left: 5%; + animation-delay: 0s; +} + +.paw-bg:nth-child(2) { + top: 20%; + right: 8%; + animation-delay: -5s; +} + +.paw-bg:nth-child(3) { + top: 50%; + left: 3%; + animation-delay: -10s; +} + +.paw-bg:nth-child(4) { + bottom: 20%; + right: 5%; + animation-delay: -15s; +} + +.paw-bg:nth-child(5) { + bottom: 10%; + left: 15%; + animation-delay: -7s; +} + +@keyframes float { + + 0%, + 100% { + transform: translateY(0) rotate(0deg); + } + + 50% { + transform: translateY(-20px) rotate(5deg); + } +} + +.container { + position: relative; + z-index: 1; + max-width: 1600px; + margin: 0 auto; + padding: 2rem 3rem; + width: 95%; +} + +/* Header with cute logo */ +header { + display: flex; + align-items: center; + justify-content: space-between; + margin-bottom: 2rem; + padding-bottom: 1.5rem; + border-bottom: 2px dashed var(--border-color); +} + +.logo { + display: flex; + align-items: center; + gap: 1rem; +} + +.logo-icon { + width: 64px; + height: 64px; + background: linear-gradient(135deg, var(--accent-primary), var(--accent-secondary)); + border-radius: var(--radius-lg); + display: flex; + align-items: center; + justify-content: center; + box-shadow: var(--shadow-md), var(--shadow-glow); + transition: transform var(--transition-bounce); + position: relative; + overflow: hidden; +} + +.logo-icon::before { + content: ''; + position: absolute; + inset: 0; + background: linear-gradient(135deg, transparent 40%, rgba(255, 255, 255, 0.3) 50%, transparent 60%); + animation: shimmer 3s infinite; +} + +@keyframes shimmer { + 0% { + transform: translateX(-100%); + } + + 100% { + transform: translateX(100%); + } +} + +.logo-icon:hover { + transform: scale(1.05) rotate(-3deg); +} + +.logo-icon svg { + width: 44px; + height: 44px; + filter: drop-shadow(0 2px 
4px rgba(0, 0, 0, 0.1)); +} + +.logo-text h1 { + font-size: 1.75rem; + font-weight: 800; + letter-spacing: -0.03em; + background: linear-gradient(135deg, var(--accent-primary), var(--accent-lavender), var(--accent-secondary)); + background-size: 200% 200%; + -webkit-background-clip: text; + -webkit-text-fill-color: transparent; + background-clip: text; + animation: gradient 5s ease infinite; +} + +@keyframes gradient { + + 0%, + 100% { + background-position: 0% 50%; + } + + 50% { + background-position: 100% 50%; + } +} + +.logo-text .tagline { + font-size: 0.8rem; + color: var(--text-muted); + font-weight: 500; + letter-spacing: 0.08em; + margin-top: 0.25rem; +} + +.theme-toggle { + width: 48px; + height: 48px; + border: 2px solid var(--border-color); + border-radius: var(--radius-full); + background: var(--bg-secondary); + cursor: pointer; + display: flex; + align-items: center; + justify-content: center; + font-size: 1.25rem; + transition: all var(--transition-bounce); + color: var(--text-secondary); + position: relative; +} + +.theme-toggle:hover { + background: var(--accent-light); + border-color: var(--accent-primary); + color: var(--accent-secondary); + transform: scale(1.1) rotate(15deg); +} + +.theme-toggle:active { + transform: scale(0.95); +} + +/* Main content */ +main { + display: flex; + flex-direction: column; + gap: 1.25rem; +} + +/* Cards with cute styling */ +.card { + background: var(--bg-secondary); + border: 2px solid var(--border-color); + border-radius: var(--radius-lg); + padding: 1.5rem; + box-shadow: var(--shadow-sm); + transition: all var(--transition); + position: relative; +} + +.card::before { + content: ''; + position: absolute; + top: 0; + left: 1.5rem; + right: 1.5rem; + height: 3px; + background: linear-gradient(90deg, var(--accent-primary), var(--accent-lavender), var(--accent-primary)); + border-radius: 0 0 4px 4px; + opacity: 0; + transition: opacity var(--transition); +} + +.card:hover { + box-shadow: var(--shadow-md); + 
border-color: var(--border-accent); +} + +.card:hover::before { + opacity: 1; +} + +.card-header { + display: flex; + align-items: center; + gap: 0.75rem; + margin-bottom: 1.25rem; + padding-bottom: 0.75rem; + border-bottom: 1px dashed var(--border-color); +} + +.card-header-icon { + width: 32px; + height: 32px; + background: var(--accent-light); + border-radius: var(--radius-md); + display: flex; + align-items: center; + justify-content: center; + font-size: 1rem; +} + +.card-header h2 { + font-size: 0.9375rem; + font-weight: 700; + letter-spacing: 0.02em; + color: var(--text-accent); +} + +/* Form elements */ +.form-group { + margin-bottom: 1.25rem; +} + +.form-group:last-child { + margin-bottom: 0; +} + +label { + display: flex; + align-items: center; + gap: 0.375rem; + font-size: 0.8125rem; + font-weight: 600; + color: var(--text-secondary); + margin-bottom: 0.625rem; +} + +label .label-icon { + font-size: 0.875rem; +} + +select, +input[type="range"] { + width: 100%; +} + +select { + appearance: none; + background: var(--bg-tertiary); + border: 2px solid var(--border-color); + border-radius: var(--radius-md); + padding: 0.875rem 2.75rem 0.875rem 1rem; + font-size: 0.9375rem; + font-weight: 500; + color: var(--text-primary); + cursor: pointer; + background-image: url("data:image/svg+xml,%3Csvg xmlns='http://www.w3.org/2000/svg' width='18' height='18' viewBox='0 0 24 24' fill='none' stroke='%23e892a0' stroke-width='2.5' stroke-linecap='round' stroke-linejoin='round'%3E%3Cpath d='m6 9 6 6 6-6'/%3E%3C/svg%3E"); + background-repeat: no-repeat; + background-position: right 1rem center; + transition: all var(--transition); +} + +select:hover { + border-color: var(--accent-primary); + background-color: var(--bg-card); +} + +select:focus { + outline: none; + border-color: var(--accent-primary); + box-shadow: 0 0 0 4px var(--accent-light); +} + +.settings-row { + display: grid; + grid-template-columns: repeat(auto-fit, minmax(280px, 1fr)); + gap: 1.5rem; +} + +/* Voice 
selector with avatars */ +.voice-select-wrapper { + position: relative; +} + +.voice-avatar { + display: inline-flex; + align-items: center; + justify-content: center; + width: 20px; + height: 20px; + border-radius: 50%; + font-size: 0.75rem; + margin-right: 0.5rem; +} + +.voice-avatar.female { + background: var(--accent-light); + color: var(--accent-secondary); +} + +.voice-avatar.male { + background: var(--accent-lavender-light); + color: var(--accent-lavender); +} + +/* Speed slider with cute styling */ +.speed-container { + display: flex; + flex-direction: column; + gap: 0.625rem; +} + +.speed-header { + display: flex; + justify-content: space-between; + align-items: center; +} + +.speed-value { + font-size: 0.875rem; + font-weight: 700; + color: var(--accent-secondary); + background: linear-gradient(135deg, var(--accent-light), var(--bg-card)); + padding: 0.375rem 1rem; + border-radius: var(--radius-full); + border: 1px solid var(--border-accent); + transition: all var(--transition); +} + +.speed-value:hover { + transform: scale(1.05); +} + +input[type="range"] { + -webkit-appearance: none; + appearance: none; + height: 8px; + background: linear-gradient(90deg, var(--bg-tertiary), var(--accent-light)); + border-radius: var(--radius-full); + cursor: pointer; + border: 1px solid var(--border-color); +} + +input[type="range"]::-webkit-slider-thumb { + -webkit-appearance: none; + appearance: none; + width: 24px; + height: 24px; + background: linear-gradient(135deg, var(--accent-primary), var(--accent-secondary)); + border-radius: 50%; + cursor: pointer; + box-shadow: var(--shadow-sm), 0 0 10px rgba(232, 146, 160, 0.4); + transition: all var(--transition-bounce); + border: 3px solid white; +} + +input[type="range"]::-webkit-slider-thumb:hover { + transform: scale(1.2); + box-shadow: var(--shadow-md), 0 0 15px rgba(232, 146, 160, 0.6); +} + +input[type="range"]::-moz-range-thumb { + width: 24px; + height: 24px; + background: linear-gradient(135deg, 
var(--accent-primary), var(--accent-secondary)); + border-radius: 50%; + cursor: pointer; + border: 3px solid white; + box-shadow: var(--shadow-sm); +} + +/* Textarea with cute styling */ +textarea { + width: 100%; + min-height: 140px; + background: var(--bg-tertiary); + border: 2px solid var(--border-color); + border-radius: var(--radius-md); + padding: 1.125rem; + font-size: 0.9375rem; + font-family: inherit; + color: var(--text-primary); + resize: vertical; + transition: all var(--transition); + line-height: 1.7; +} + +textarea::placeholder { + color: var(--text-muted); + font-style: italic; +} + +textarea:hover { + border-color: var(--accent-primary); + background: var(--bg-card); +} + +textarea:focus { + outline: none; + border-color: var(--accent-primary); + box-shadow: 0 0 0 4px var(--accent-light); +} + +/* Generate button - the star of the show! */ +.generate-btn { + width: 100%; + padding: 1.125rem 2rem; + background: linear-gradient(135deg, var(--accent-primary), var(--accent-secondary)); + border: none; + border-radius: var(--radius-md); + font-size: 1.0625rem; + font-weight: 700; + color: white; + cursor: pointer; + transition: all var(--transition-bounce); + display: flex; + align-items: center; + justify-content: center; + gap: 0.625rem; + box-shadow: var(--shadow-md), 0 4px 0 #c96b7a; + position: relative; + overflow: hidden; + text-shadow: 0 1px 2px rgba(0, 0, 0, 0.1); +} + +.generate-btn::before { + content: ''; + position: absolute; + inset: 0; + background: linear-gradient(135deg, rgba(255, 255, 255, 0.2), transparent); + opacity: 0; + transition: opacity var(--transition); +} + +.generate-btn:hover:not(:disabled) { + transform: translateY(-3px); + box-shadow: var(--shadow-lg), 0 7px 0 #c96b7a, var(--shadow-glow); +} + +.generate-btn:hover::before { + opacity: 1; +} + +.generate-btn:active:not(:disabled) { + transform: translateY(2px); + box-shadow: var(--shadow-sm), 0 2px 0 #c96b7a; +} + +.generate-btn:disabled { + opacity: 0.7; + cursor: 
not-allowed; + box-shadow: var(--shadow-sm); + transform: none; +} + +.generate-btn .btn-icon { + font-size: 1.25rem; + animation: wiggle 2s ease-in-out infinite; +} + +@keyframes wiggle { + + 0%, + 100% { + transform: rotate(-3deg); + } + + 50% { + transform: rotate(3deg); + } +} + +.generate-btn:disabled .btn-icon { + animation: none; +} + +.generate-btn .spinner { + width: 22px; + height: 22px; + border: 3px solid rgba(255, 255, 255, 0.3); + border-top-color: white; + border-radius: 50%; + animation: spin 0.8s linear infinite; +} + +@keyframes spin { + to { + transform: rotate(360deg); + } +} + +/* Output section */ +.output-section { + display: none; + border-color: var(--accent-lavender); + background: linear-gradient(135deg, var(--bg-secondary), var(--accent-lavender-light)); +} + +.output-section.visible { + display: block; + animation: slideUp 0.4s var(--transition-bounce); +} + +@keyframes slideUp { + from { + opacity: 0; + transform: translateY(20px) scale(0.98); + } + + to { + opacity: 1; + transform: translateY(0) scale(1); + } +} + +.audio-player { + background: var(--bg-secondary); + border-radius: var(--radius-md); + padding: 1.25rem; + margin-bottom: 1rem; + border: 1px solid var(--border-color); +} + +.audio-player audio { + width: 100%; + height: 48px; + border-radius: var(--radius-sm); +} + +.audio-info { + display: flex; + justify-content: space-between; + align-items: center; + margin-top: 1rem; + padding-top: 1rem; + border-top: 1px dashed var(--border-color); + font-size: 0.8125rem; + color: var(--text-muted); +} + +.download-btn { + display: inline-flex; + align-items: center; + gap: 0.5rem; + padding: 0.625rem 1.25rem; + background: var(--bg-tertiary); + border: 2px solid var(--border-color); + border-radius: var(--radius-full); + font-size: 0.8125rem; + font-weight: 600; + color: var(--text-secondary); + cursor: pointer; + text-decoration: none; + transition: all var(--transition-bounce); +} + +.download-btn:hover { + background: 
var(--accent-light); + border-color: var(--accent-primary); + color: var(--accent-secondary); + transform: translateY(-2px); +} + +/* Error message */ +.error-message { + background: linear-gradient(135deg, #fee2e2, #fef2f2); + border: 2px solid #fecaca; + border-radius: var(--radius-md); + padding: 1.125rem; + color: #dc2626; + font-size: 0.875rem; + font-weight: 500; + display: none; + animation: shake 0.5s ease; +} + +[data-theme="dark"] .error-message { + background: linear-gradient(135deg, #3f1f25, #2d1a1f); + border-color: #5c2626; + color: #fca5a5; +} + +.error-message.visible { + display: block; +} + +@keyframes shake { + + 0%, + 100% { + transform: translateX(0); + } + + 25% { + transform: translateX(-5px); + } + + 75% { + transform: translateX(5px); + } +} + +/* Footer */ +footer { + text-align: center; + margin-top: 2.5rem; + padding-top: 1.5rem; + border-top: 2px dashed var(--border-color); + color: var(--text-muted); + font-size: 0.8125rem; +} + +footer a { + color: var(--accent-primary); + text-decoration: none; + font-weight: 600; + transition: all var(--transition); +} + +footer a:hover { + color: var(--accent-secondary); + text-decoration: underline; + text-underline-offset: 3px; +} + +footer .heart { + color: var(--accent-primary); + display: inline-block; + animation: heartbeat 1.5s ease-in-out infinite; +} + +@keyframes heartbeat { + + 0%, + 100% { + transform: scale(1); + } + + 50% { + transform: scale(1.15); + } +} + +/* Bottom paw decoration */ +.paw-decoration { + display: flex; + justify-content: center; + gap: 1.5rem; + margin: 1.5rem 0; + opacity: 0.12; +} + +.paw-decoration span { + font-size: 1.75rem; + animation: pawWalk 2s ease-in-out infinite; +} + +.paw-decoration span:nth-child(2) { + animation-delay: 0.2s; +} + +.paw-decoration span:nth-child(3) { + animation-delay: 0.4s; +} + +@keyframes pawWalk { + + 0%, + 100% { + transform: translateY(0); + } + + 50% { + transform: translateY(-8px); + } +} + +/* Model & Voice info */ 
+.model-info, +.voice-description { + font-size: 0.75rem; + color: var(--text-muted); + margin-top: 0.375rem; + display: flex; + align-items: center; + gap: 0.375rem; +} + +.model-info::before { + content: '📦'; +} + +.voice-description::before { + content: '🔊'; +} + +/* Quick actions */ +.quick-actions { + display: flex; + flex-wrap: wrap; + gap: 0.5rem; + margin-top: 0.75rem; +} + +.quick-action-btn { + padding: 0.375rem 0.875rem; + background: var(--bg-tertiary); + border: 1px solid var(--border-color); + border-radius: var(--radius-full); + font-size: 0.75rem; + font-weight: 500; + color: var(--text-secondary); + cursor: pointer; + transition: all var(--transition); +} + +.quick-action-btn:hover { + background: var(--accent-light); + border-color: var(--accent-primary); + color: var(--accent-secondary); + transform: translateY(-1px); +} + +/* Debug Panel */ +.debug-panel { + border-color: var(--accent-lavender); + background: linear-gradient(135deg, var(--bg-secondary), var(--accent-lavender-light)); +} + +.debug-header { + cursor: pointer; + user-select: none; + transition: all var(--transition); +} + +.debug-header:hover { + color: var(--accent-lavender); +} + +.debug-arrow { + margin-left: auto; + transition: transform var(--transition); + font-size: 0.875rem; +} + +.debug-panel.expanded .debug-arrow { + transform: rotate(180deg); +} + +.debug-content { + animation: slideDown 0.3s ease; +} + +@keyframes slideDown { + from { + opacity: 0; + transform: translateY(-10px); + } + + to { + opacity: 1; + transform: translateY(0); + } +} + +.debug-section { + margin-bottom: 1.25rem; + padding-bottom: 1.25rem; + border-bottom: 1px dashed var(--border-color); +} + +.debug-section:last-of-type { + border-bottom: none; + margin-bottom: 0; +} + +.debug-section h3 { + font-size: 0.75rem; + font-weight: 700; + color: var(--text-secondary); + text-transform: uppercase; + letter-spacing: 0.05em; + margin-bottom: 0.75rem; +} + +.debug-stats-grid { + display: grid; + 
grid-template-columns: repeat(auto-fit, minmax(120px, 1fr)); + gap: 0.75rem; +} + +.debug-stat { + background: var(--bg-tertiary); + border-radius: var(--radius-sm); + padding: 0.75rem; + text-align: center; + border: 1px solid var(--border-color); +} + +.debug-label { + display: block; + font-size: 0.6875rem; + color: var(--text-muted); + text-transform: uppercase; + letter-spacing: 0.03em; + margin-bottom: 0.25rem; +} + +.debug-value { + display: block; + font-size: 1rem; + font-weight: 700; + color: var(--accent-primary); + font-family: 'SF Mono', Monaco, monospace; +} + +.debug-info-list { + display: flex; + flex-direction: column; + gap: 0.5rem; +} + +.debug-info-item { + display: flex; + justify-content: space-between; + font-size: 0.8125rem; + padding: 0.375rem 0; + border-bottom: 1px solid var(--border-color); +} + +.debug-info-item:last-child { + border-bottom: none; +} + +.debug-info-key { + color: var(--text-secondary); +} + +.debug-info-value { + color: var(--text-primary); + font-family: 'SF Mono', Monaco, monospace; + font-size: 0.75rem; +} + +.debug-requests { + max-height: 200px; + overflow-y: auto; + background: var(--bg-tertiary); + border-radius: var(--radius-sm); + border: 1px solid var(--border-color); +} + +.debug-request-item { + display: grid; + grid-template-columns: 1fr auto auto auto; + gap: 0.75rem; + padding: 0.625rem 0.75rem; + font-size: 0.75rem; + border-bottom: 1px solid var(--border-color); + align-items: center; +} + +.debug-request-item:last-child { + border-bottom: none; +} + +.debug-request-model { + font-weight: 600; + color: var(--text-primary); +} + +.debug-request-voice { + color: var(--text-secondary); +} + +.debug-request-time { + font-family: 'SF Mono', Monaco, monospace; + color: var(--accent-primary); +} + +.debug-request-rtf { + font-family: 'SF Mono', Monaco, monospace; + color: var(--text-muted); + font-size: 0.6875rem; +} + +.debug-refresh-btn { + width: 100%; + padding: 0.75rem; + background: var(--bg-tertiary); + 
border: 1px solid var(--border-color); + border-radius: var(--radius-sm); + font-size: 0.8125rem; + font-weight: 600; + color: var(--text-secondary); + cursor: pointer; + transition: all var(--transition); + margin-top: 1rem; +} + +.debug-refresh-btn:hover { + background: var(--accent-light); + border-color: var(--accent-primary); + color: var(--accent-secondary); +} + +/* Responsive */ +@media (max-width: 640px) { + .container { + padding: 1.25rem; + } + + header { + flex-direction: column; + align-items: flex-start; + gap: 1rem; + position: relative; + } + + .logo-icon { + width: 52px; + height: 52px; + } + + .logo-icon svg { + width: 36px; + height: 36px; + } + + .logo-text h1 { + font-size: 1.5rem; + } + + .theme-toggle { + position: absolute; + top: 0; + right: 0; + width: 44px; + height: 44px; + } + + .settings-row { + grid-template-columns: 1fr; + } + + .card { + padding: 1.25rem; + } + + textarea { + min-height: 120px; + padding: 1rem; + } + + .generate-btn { + padding: 1rem 1.5rem; + font-size: 1rem; + } +} + +@media (prefers-reduced-motion: reduce) { + * { + animation: none !important; + transition-duration: 0.01ms !important; + } +} \ No newline at end of file diff --git a/webui/templates/index.html b/webui/templates/index.html new file mode 100644 index 0000000..61035f3 --- /dev/null +++ b/webui/templates/index.html @@ -0,0 +1,206 @@ + + + + + + + KittenTTS 🐱 Text to Speech + + + + + + + +
+
+ + +
+ +
+ +
+
+
⚙️
+

Settings

+
+ +
+
+ + +
+
+ +
+ + +
+
+
+ +
+
+
+ + 1.0x 🐱 +
+ +
+
+
+ + +
+
+
✏️
+

Text Input

+
+ +
+ +
+ + +
+ + +
+ + +
+
+
🎵
+

Your Audio

+
+
+ +
+ ⏱️ Duration: 0s + + 💾 Download WAV + +
+
+
+ + +
+
+
🔧
+

Debug Stats

+ +
+ +
+ + +
+ 🐾 + 🐾 + 🐾 + 🐾 +
+
+ + +
+ + + +