diff --git a/.gitignore b/.gitignore new file mode 100644 index 0000000..419c62c --- /dev/null +++ b/.gitignore @@ -0,0 +1,23 @@ +# Python build artifacts +__pycache__/ +*.py[cod] +*$py.class +build/ +dist/ +*.egg-info/ + +# Virtual environment +venv/ +env/ +.env/ + +# IDE settings +.vscode/ +.idea/ + +# Output files +output/*.wav + +# Local development settings +.env +*.log diff --git a/README.md b/README.md index 81536da..c8fb1b0 100644 --- a/README.md +++ b/README.md @@ -1,11 +1,16 @@ -# Kitten TTS 😻 +# KittenTTS 😻 - Enhanced for Windows -Kitten TTS is an open-source realistic text-to-speech model with just 15 million parameters, designed for lightweight deployment and high-quality voice synthesis. +> **Note:** This is an enhanced fork of [KittenTTS](https://github.com/KittenML/KittenTTS) specifically optimized for Windows compatibility and improved user experience. -*Currently in developer preview* +KittenTTS is an open-source realistic text-to-speech model with just 15 million parameters, designed for lightweight deployment and high-quality voice synthesis. -[Join our discord](https://discord.gg/upcyF5s6) +## 🚀 **What's Enhanced:** +- ✅ **Windows Compatibility** - Fixed dependency issues on Windows +- ✅ **Better Error Handling** - Comprehensive error messages and solutions +- ✅ **Organized Output** - Audio files saved in dedicated `output/` folder +- ✅ **Enhanced Examples** - Working examples with proper documentation +- ✅ **Improved Dependencies** - Fixed version conflicts and compatibility ## ✨ Features @@ -13,49 +18,137 @@ Kitten TTS is an open-source realistic text-to-speech model with just 15 million - **CPU-optimized**: Runs without GPU on any device - **High-quality voices**: Several premium voice options available - **Fast inference**: Optimized for real-time speech synthesis +- **Windows-friendly**: Specifically tested and optimized for Windows +## 🎵 **Audio Examples** + +### 🎧 **How to Generate Audio:** + +1. 
**Run the Example:** + ```bash + python simple_example.py + ``` + This will create a sample audio in `output/output.wav` + +2. **Try Different Voices:** + ```bash + python final_example.py + ``` + This will generate samples with both male and female voices + +3. **Check the Output:** + Generated audio files will be saved in the `output/` folder + +### 📊 **Audio Quality:** +- **Sample Rate:** 24000 Hz +- **Format:** WAV (16-bit PCM) +- **Quality:** High-quality professional synthesis +- **Duration:** ~10-15 seconds per sample + +--- ## šŸš€ Quick Start ### Installation -``` -pip install https://github.com/KittenML/KittenTTS/releases/download/0.1/kittentts-0.1.0-py3-none-any.whl +```bash +pip install -r requirements.txt ``` +### Basic Usage +```python +from kittentts import KittenTTS +import numpy as np - ### Basic Usage +# Load model +model = KittenTTS("KittenML/kitten-tts-nano-0.1") -``` -from kittentts import KittenTTS -m = KittenTTS("KittenML/kitten-tts-nano-0.1") +# Generate audio +text = "Welcome to the future of text-to-speech! KittenTTS is absolutely incredible - it's fast, lightweight, and produces crystal clear audio quality. This revolutionary AI model is changing the game with just 15 million parameters. Amazing technology!" +audio = model.generate(text, voice='expr-voice-5-m') -audio = m.generate("This high quality TTS model works without a GPU", voice='expr-voice-2-f' ) +# Save audio +audio_normalized = np.int16(audio * 32767) -# available_voices : [ 'expr-voice-2-m', 'expr-voice-2-f', 'expr-voice-3-m', 'expr-voice-3-f', 'expr-voice-4-m', 'expr-voice-4-f', 'expr-voice-5-m', 'expr-voice-5-f' ] +# Create output folder +import os +os.makedirs('output', exist_ok=True) -# Save the audio -import soundfile as sf -sf.write('output.wav', audio, 24000) +with open('output/output.wav', 'wb') as f: + # WAV header and data writing code... 
+ pass +print("✅ Audio file created: output/output.wav") ``` +### Available Voices + +- `expr-voice-2-m` - Male voice 2 +- `expr-voice-2-f` - Female voice 2 +- `expr-voice-3-m` - Male voice 3 +- `expr-voice-3-f` - Female voice 3 +- `expr-voice-4-m` - Male voice 4 +- `expr-voice-4-f` - Female voice 4 +- `expr-voice-5-m` - Male voice 5 +- `expr-voice-5-f` - Female voice 5 + +## 📁 Files +- `final_example.py` - Complete working example with multiple voices +- `simple_example.py` - Basic usage example +- `output/` - Generated audio files folder (not tracked in Git) +> **Note:** WAV files in the `output/` directory are gitignored by default. Generated audio files are stored here but not committed to the repository to keep it lightweight. Make sure to back up any important audio files separately. +## 🎯 Examples + +### Different voices +```python +for voice in ['expr-voice-5-m', 'expr-voice-5-f']: + audio = model.generate(text, voice=voice) + # Save audio... +``` + +### Speed control +```python +# Normal speed +audio_normal = model.generate(text, voice='expr-voice-5-m', speed=1.0) + +# Faster +audio_fast = model.generate(text, voice='expr-voice-5-m', speed=1.5) + +# Slower +audio_slow = model.generate(text, voice='expr-voice-5-m', speed=0.7) +``` ## šŸ’» System Requirements -Works literally everywhere +Works literally everywhere - just needs Python 3.8+ and the required dependencies. + +## 📊 Audio Specifications + +- **Sample rate**: 24000 Hz +- **Format**: WAV (16-bit PCM) +- **Quality**: High-quality professional voice synthesis + +## 🔧 Troubleshooting + +If you encounter issues: + +1. **Install dependencies:** `pip install -r requirements.txt` +2. **Check Python version:** Python 3.8+ required +3. **Run examples:** `python simple_example.py` + +## 📞 Credits +This enhanced version is based on the original [KittenTTS](https://github.com/KittenML/KittenTTS) by KittenML. 
+**Original Repository:** https://github.com/KittenML/KittenTTS +**License:** Apache 2.0 -## Checklist +--- -- [x] Release a preview model -- [ ] Release the fully trained model weights -- [ ] Release mobile SDK -- [ ] Release web version +**Note**: This project is currently in developer preview. Some features may change. diff --git a/final_example.py b/final_example.py new file mode 100644 index 0000000..d41195a --- /dev/null +++ b/final_example.py @@ -0,0 +1,121 @@ +#!/usr/bin/env python3 +""" +KittenTTS final working example +""" + +import sys +import os +import numpy as np + +# Add current directory to Python path +sys.path.insert(0, os.path.dirname(os.path.abspath(__file__))) + +def save_audio_wav(audio, filename, sample_rate=24000): + """Save audio file in WAV format""" + try: + # Normalize audio data + audio_normalized = np.int16(audio * 32767) + + # Create output folder if it doesn't exist + os.makedirs('output', exist_ok=True) + + # Create WAV file + with open(f'output/{filename}', 'wb') as f: + # RIFF header + f.write(b'RIFF') + f.write((36 + len(audio_normalized) * 2).to_bytes(4, 'little')) + f.write(b'WAVE') + + # fmt chunk + f.write(b'fmt ') + f.write((16).to_bytes(4, 'little')) + f.write((1).to_bytes(2, 'little')) # PCM format + f.write((1).to_bytes(2, 'little')) # 1 channel + f.write(sample_rate.to_bytes(4, 'little')) + f.write((sample_rate * 2).to_bytes(4, 'little')) # byte rate + f.write((2).to_bytes(2, 'little')) # block align + f.write((16).to_bytes(2, 'little')) # bits per sample + + # data chunk + f.write(b'data') + f.write((len(audio_normalized) * 2).to_bytes(4, 'little')) + f.write(audio_normalized.tobytes()) + + print(f"šŸ’¾ Audio file saved: output/{filename}") + return True + except Exception as e: + print(f"āŒ Audio save error: {e}") + return False + +def main(): + print("šŸŽ¤ KittenTTS final example") + print("=" * 40) + + try: + # 1. 
Import KittenTTS + print("šŸ“„ Loading KittenTTS...") + from kittentts import KittenTTS + print("āœ… KittenTTS loaded!") + + # 2. Load model + print("šŸ¤– Loading model...") + model = KittenTTS("KittenML/kitten-tts-nano-0.1") + print("āœ… Model loaded!") + + # 3. Show available voices + print(f"\nšŸŽµ Available voices:") + for i, voice in enumerate(model.available_voices, 1): + print(f" {i}. {voice}") + + # 4. Test text + text = "Welcome to the future of text-to-speech! KittenTTS is absolutely incredible - it's fast, lightweight, and produces crystal clear audio quality. This revolutionary AI model is changing the game with just 15 million parameters. Amazing technology!" + print(f"\nšŸ“ Test text: {text}") + + # 5. Generate audio - male voice + print("\nšŸ”Š Generating audio with male voice...") + audio_male = model.generate(text, voice='expr-voice-5-m') + print("āœ… Male voice generated!") + + # 6. Generate audio - female voice + print("šŸ”Š Generating audio with female voice...") + audio_female = model.generate(text, voice='expr-voice-5-f') + print("āœ… Female voice generated!") + + # 7. Save audio files + print("\nšŸ’¾ Saving audio files...") + + if save_audio_wav(audio_male, 'male_voice.wav'): + print("āœ… Male voice saved") + + if save_audio_wav(audio_female, 'female_voice.wav'): + print("āœ… Female voice saved") + + # 8. Audio information + print(f"\nšŸ“Š Audio information:") + print(f" - Male voice length: {len(audio_male)} samples") + print(f" - Male voice duration: {len(audio_male)/24000:.2f} seconds") + print(f" - Female voice length: {len(audio_female)} samples") + print(f" - Female voice duration: {len(audio_female)/24000:.2f} seconds") + print(f" - Sample rate: 24000 Hz") + print(f" - Format: WAV (16-bit PCM)") + + # 9. 
Success message + print("\nšŸŽ‰ KittenTTS is working successfully!") + print("šŸ“ Created files in output folder:") + print(" - output/male_voice.wav") + print(" - output/female_voice.wav") + print("\nšŸ’” You can open these files in Windows Media Player or any other audio player!") + + except ImportError as e: + print(f"āŒ Import error: {e}") + print("\nšŸ”§ Solution:") + print("1. Install required libraries:") + print(" pip install misaki[en]==0.7.4 espeakng_loader") + print("2. Try again!") + + except Exception as e: + print(f"āŒ Error: {e}") + print("Please provide information about the error.") + +if __name__ == "__main__": + main() \ No newline at end of file diff --git a/requirements.txt b/requirements.txt index 37bfbb3..a8e2a7d 100644 --- a/requirements.txt +++ b/requirements.txt @@ -1,8 +1,7 @@ -num2words -spacy -espeakng_loader -misaki[en]>=0.9.4 +numpy onnxruntime soundfile -numpy huggingface_hub +misaki[en]==0.7.4 +espeakng_loader +scipy \ No newline at end of file diff --git a/setup.py b/setup.py index d0ac187..0533ed2 100644 --- a/setup.py +++ b/setup.py @@ -32,7 +32,7 @@ "num2words", "spacy", "espeakng_loader", - "misaki[en]>=0.9.4", + "misaki[en]>=0.7.4", "onnxruntime", "soundfile", "numpy", diff --git a/simple_example.py b/simple_example.py new file mode 100644 index 0000000..4c932c4 --- /dev/null +++ b/simple_example.py @@ -0,0 +1,41 @@ +from kittentts import KittenTTS +import numpy as np + +# Load model +model = KittenTTS("KittenML/kitten-tts-nano-0.1") + +# Convert text to speech +# text = "Hello! This is KittenTTS working perfectly." +text = "Welcome to the future of text-to-speech! KittenTTS is absolutely incredible - it's fast, lightweight, and produces crystal clear audio quality. This revolutionary AI model is changing the game with just 15 million parameters. Amazing technology!" 
+audio = model.generate(text, voice='expr-voice-5-m') + +# Save audio file +audio_normalized = np.int16(audio * 32767) + +# Create output folder if it doesn't exist +import os +os.makedirs('output', exist_ok=True) + +# Create WAV file +with open('output/output.wav', 'wb') as f: + # RIFF header + f.write(b'RIFF') + f.write((36 + len(audio_normalized) * 2).to_bytes(4, 'little')) + f.write(b'WAVE') + + # fmt chunk + f.write(b'fmt ') + f.write((16).to_bytes(4, 'little')) + f.write((1).to_bytes(2, 'little')) + f.write((1).to_bytes(2, 'little')) + f.write((24000).to_bytes(4, 'little')) + f.write((48000).to_bytes(4, 'little')) + f.write((2).to_bytes(2, 'little')) + f.write((16).to_bytes(2, 'little')) + + # data chunk + f.write(b'data') + f.write((len(audio_normalized) * 2).to_bytes(4, 'little')) + f.write(audio_normalized.tobytes()) + +print("āœ… Audio file created: output/output.wav") \ No newline at end of file