diff --git a/README.md b/README.md index 3bea197..2eefa9b 100644 --- a/README.md +++ b/README.md @@ -12,6 +12,8 @@ Kitten TTS is an open-source realistic text-to-speech model with just 15 million parameters, designed for lightweight deployment and high-quality voice synthesis. +> **UI and enhancement made by Muhammad Umer** + *Currently in developer preview* [Join our discord](https://discord.com/invite/VJ86W4SURW) @@ -59,7 +61,28 @@ pip install https://github.com/KittenML/KittenTTS/releases/download/0.8/kittentt - ### Basic Usage +``` + +### Graphical Interface (Windows) + +We have provided a native, modern UI designed specifically for Windows! +To use the UI: +1. Double-click **`Run_KittenTTS.bat`** from your File Explorer. +2. It will automatically build an environment if needed, and open the KittenTTS Desktop app. +3. Type your text, select your voice and model, and click **Generate & Play ▶**! + +### Terminal CLI Usage + +You can also use KittenTTS directly from your terminal/command line! +```bash +# Generate immediate audio +kittentts "Hello! I am speaking to you directly from the terminal" --voice Jasper --output out.wav + +# View all dynamically loaded voices +kittentts --list-voices +``` + + ### Basic Python Usage ``` from kittentts import KittenTTS diff --git a/Run_KittenTTS.bat b/Run_KittenTTS.bat new file mode 100644 index 0000000..8160dec --- /dev/null +++ b/Run_KittenTTS.bat @@ -0,0 +1,38 @@ +@echo off +echo ============================================== +echo KittenTTS Graphical Interface +echo ============================================== +echo. + +:: Check if virtual environment exists +if not exist "venv\Scripts\python.exe" ( + echo [INFO] Virtual environment not found. Setting it up now... + python -m venv venv + if errorlevel 1 ( + echo [ERROR] Failed to create virtual environment. Ensure Python is installed and in PATH. 
+ pause + exit /b 1 + ) +) + +:: Activate the environment and install/update dependencies quietly +echo [INFO] Updating dependencies... +call "venv\Scripts\activate.bat" +if errorlevel 1 ( + echo [ERROR] Failed to activate virtual environment. + pause + exit /b 1 +) + +venv\Scripts\python.exe -m pip install -e . -q +if errorlevel 1 ( + echo [ERROR] Initialization failed while installing dependencies. + pause + exit /b 1 +) + +:: Run the UI application +echo [INFO] Launching UI... +venv\Scripts\python.exe -m kittentts.ui + +pause diff --git a/example.py b/example.py deleted file mode 100644 index 0b2db06..0000000 --- a/example.py +++ /dev/null @@ -1,10 +0,0 @@ -from kittentts import KittenTTS -m = KittenTTS("KittenML/kitten-tts-nano-0.2") - -audio = m.generate("This high quality TTS model works without a GPU it's 2026", voice='expr-voice-2-f' ) - -# available_voices : [ 'expr-voice-2-m', 'expr-voice-2-f', 'expr-voice-3-m', 'expr-voice-3-f', 'expr-voice-4-m', 'expr-voice-4-f', 'expr-voice-5-m', 'expr-voice-5-f' ] - -# Save the audio -import soundfile as sf -sf.write('output.wav', audio, 24000) diff --git a/kittentts/cli.py b/kittentts/cli.py new file mode 100644 index 0000000..afb34a4 --- /dev/null +++ b/kittentts/cli.py @@ -0,0 +1,45 @@ +import argparse +import sys +from kittentts import KittenTTS + +def main(): + parser = argparse.ArgumentParser(description="KittenTTS: High-quality, ultra-lightweight Text-to-Speech") + parser.add_argument("text", nargs="?", help="The text to synthesize into speech") + parser.add_argument("--voice", default="Jasper", help="Voice to use (e.g., Bella, Jasper, Luna). Default: Jasper") + parser.add_argument("--model", default="KittenML/kitten-tts-nano-0.1", help="Model ID to use. Default: KittenML/kitten-tts-nano-0.1") + parser.add_argument("--output", default="output.wav", help="Output audio file path. Default: output.wav") + parser.add_argument("--speed", type=float, default=1.0, help="Speech speed (1.0 = normal). 
Default: 1.0") + parser.add_argument("--list-voices", action="store_true", help="List available voices for the model and exit") + + args = parser.parse_args() + + if not args.list_voices and not args.text: + parser.error("The 'text' argument is required unless --list-voices is specified.") + + # Initialize model + print(f"Loading model: {args.model}...") + model = KittenTTS(model_name=args.model) + + if model.model is None: + sys.exit(1) + + if args.list_voices: + print("\n=== Available Voices ===") + print("Aliases:") + for alias in model.model.voice_aliases.keys(): + print(f" - {alias}") + print("\nNative Voices:") + for voice in model.available_voices: + if voice not in model.model.voice_aliases.values(): + print(f" - {voice}") + sys.exit(0) + + + print(f"Generating audio with voice '{args.voice}'...") + try: + model.generate_to_file(args.text, args.output, voice=args.voice, speed=args.speed) + except ValueError as e: + sys.exit(f"Error: {e}") # exit status 1 with the reason on stderr instead of failing silently + +if __name__ == "__main__": + main() diff --git a/kittentts/get_model.py b/kittentts/get_model.py index b0d47b2..3fe030f 100644 --- a/kittentts/get_model.py +++ b/kittentts/get_model.py @@ -64,37 +64,46 @@ def download_from_huggingface(repo_id="KittenML/kitten-tts-nano-0.1", cache_dir= Returns: KittenTTS_1_Onnx: Instantiated model ready for use """ - # Download config file first - config_path = hf_hub_download( - repo_id=repo_id, - filename="config.json", - cache_dir=cache_dir - ) - - # Load config - with open(config_path, 'r') as f: - config = json.load(f) + try: + # Download config file first + config_path = hf_hub_download( + repo_id=repo_id, + filename="config.json", + cache_dir=cache_dir + ) + + # Load config + with open(config_path, 'r', encoding='utf-8') as f: + config = json.load(f) - if config.get("type") not in ["ONNX1", "ONNX2"]: - raise ValueError("Unsupported model type.") + if config.get("type") not in ["ONNX1", "ONNX2"]: + print(f"Error: Unsupported model type '{config.get('type')}'.") + return None - # Download model and voices files 
based on config - model_path = hf_hub_download( - repo_id=repo_id, - filename=config["model_file"], - cache_dir=cache_dir - ) - - voices_path = hf_hub_download( - repo_id=repo_id, - filename=config["voices"], - cache_dir=cache_dir - ) - - # Instantiate and return model - model = KittenTTS_1_Onnx(model_path=model_path, voices_path=voices_path, speed_priors=config.get("speed_priors", {}) , voice_aliases=config.get("voice_aliases", {})) - - return model + # Download model and voices files based on config + model_path = hf_hub_download( + repo_id=repo_id, + filename=config["model_file"], + cache_dir=cache_dir + ) + + voices_path = hf_hub_download( + repo_id=repo_id, + filename=config["voices"], + cache_dir=cache_dir + ) + + # Instantiate and return model + model = KittenTTS_1_Onnx(model_path=model_path, voices_path=voices_path, speed_priors=config.get("speed_priors", {}) , voice_aliases=config.get("voice_aliases", {})) + + return model + + except Exception as e: + print(f"\n❌ Failed to load model '{repo_id}'.") + print(f"Error details: {str(e)}") + print("Please check your internet connection or verify the model name is correct.") + print("Example valid models: 'KittenML/kitten-tts-nano-0.1', 'KittenML/kitten-tts-mini-0.8'\n") + return None def get_model(repo_id="KittenML/kitten-tts-nano-0.1", cache_dir=None): diff --git a/kittentts/onnx_model.py b/kittentts/onnx_model.py index 7ea20b3..1867aec 100644 --- a/kittentts/onnx_model.py +++ b/kittentts/onnx_model.py @@ -95,23 +95,42 @@ def __init__(self, model_path="kitten_tts_nano_preview.onnx", voices_path="voice self.text_cleaner = TextCleaner() self.speed_priors = speed_priors - # Available voices - self.available_voices = [ - 'expr-voice-2-m', 'expr-voice-2-f', 'expr-voice-3-m', 'expr-voice-3-f', - 'expr-voice-4-m', 'expr-voice-4-f', 'expr-voice-5-m', 'expr-voice-5-f' - ] - self.voice_aliases = voice_aliases + # Available voices dynamically loaded from the .npz file + self.available_voices = list(self.voices.keys()) + 
+ # Default fallback aliases, can be extended by config + default_aliases = { + 'Bella': 'expr-voice-2-f', + 'Jasper': 'expr-voice-2-m', + 'Luna': 'expr-voice-3-f', + 'Bruno': 'expr-voice-3-m', + 'Rosie': 'expr-voice-4-f', + 'Hugo': 'expr-voice-4-m', + 'Kiki': 'expr-voice-5-f', + 'Leo': 'expr-voice-5-m' + } + self.voice_aliases = default_aliases + self.voice_aliases.update(voice_aliases or {}) + self.preprocessor = TextPreprocessor() def _prepare_inputs(self, text: str, voice: str, speed: float = 1.0) -> dict: """Prepare ONNX model inputs from text and voice parameters.""" + # Try to resolve alias if necessary if voice in self.voice_aliases: voice = self.voice_aliases[voice] - if voice not in self.available_voices: - raise ValueError(f"Voice '{voice}' not available. Choose from: {self.available_voices}") - + # Check if the requested voice exists in the dynamic voice dictionary + if voice not in self.voices: + fallback = next((alias for alias, target in self.voice_aliases.items() if target in self.voices), None) or (self.available_voices[0] if self.available_voices else None) # suggest an alias that resolves to a loaded voice, else the first native voice + error_msg = f"\n❌ Voice '{voice}' not found." + error_msg += f"\n👉 Available native voices: {self.available_voices}" + error_msg += f"\n👉 Available voice aliases: {list(self.voice_aliases.keys())}" + if fallback: + error_msg += f"\nPlease try using a valid voice like '{fallback}'." 
+ print(error_msg) + raise ValueError(f"Voice '{voice}' not available.") if voice in self.speed_priors: speed = speed * self.speed_priors[voice] diff --git a/kittentts/ui.py b/kittentts/ui.py new file mode 100644 index 0000000..23902a4 --- /dev/null +++ b/kittentts/ui.py @@ -0,0 +1,266 @@ +import os +import sys +import threading +import warnings +import winsound +from tkinter import filedialog +import customtkinter as ctk + +from kittentts import get_model, KittenTTS + +# Ignore warnings from some dependencies +warnings.filterwarnings("ignore") + +ctk.set_appearance_mode("System") # Modes: "System" (standard), "Dark", "Light" +ctk.set_default_color_theme("blue") # Themes: "blue" (standard), "green", "dark-blue" + +class KittenTTSApp(ctk.CTk): + def __init__(self): + super().__init__() + + self.title("KittenTTS - Ultra-Lightweight Voice Generator") + self.geometry("700x550") + self.minsize(600, 500) + + self.current_model_id = "KittenML/kitten-tts-nano-0.1" + self.tts_model = None + self.output_file = "output.wav" + + # --- Grid Layout --- + self.grid_columnconfigure(0, weight=1) + self.grid_rowconfigure(0, weight=1) + + # --- Main Frame --- + self.main_frame = ctk.CTkFrame(self) + self.main_frame.grid(row=0, column=0, padx=20, pady=20, sticky="nsew") + self.main_frame.grid_columnconfigure(0, weight=1) + self.main_frame.grid_rowconfigure(3, weight=1) + + # --- Header --- + self.header_label = ctk.CTkLabel( + self.main_frame, + text="🐈 KittenTTS Generator", + font=ctk.CTkFont(size=24, weight="bold") + ) + self.header_label.grid(row=0, column=0, padx=20, pady=(20, 10), sticky="nw") + + # --- Controls Frame --- + self.controls_frame = ctk.CTkFrame(self.main_frame, fg_color="transparent") + self.controls_frame.grid(row=1, column=0, padx=20, pady=10, sticky="ew") + self.controls_frame.grid_columnconfigure((0, 1, 2), weight=1) + + # Model Selection + self.model_label = ctk.CTkLabel(self.controls_frame, text="Model:") + self.model_label.grid(row=0, column=0, padx=(0, 10), 
pady=0, sticky="w") + self.model_dropdown = ctk.CTkOptionMenu( + self.controls_frame, + values=["KittenML/kitten-tts-nano-0.1", "KittenML/kitten-tts-mini-0.8"], + command=self.change_model + ) + self.model_dropdown.grid(row=1, column=0, padx=(0, 10), pady=5, sticky="ew") + + # Voice Selection + self.voice_label = ctk.CTkLabel(self.controls_frame, text="Voice:") + self.voice_label.grid(row=0, column=1, padx=10, pady=0, sticky="w") + self.voice_dropdown = ctk.CTkOptionMenu( + self.controls_frame, + values=["Loading..."] + ) + self.voice_dropdown.grid(row=1, column=1, padx=10, pady=5, sticky="ew") + + # Speed Selection + self.speed_label = ctk.CTkLabel(self.controls_frame, text="Speed: 1.0x") + self.speed_label.grid(row=0, column=2, padx=(10, 0), pady=0, sticky="w") + self.speed_slider = ctk.CTkSlider( + self.controls_frame, + from_=0.5, to=2.0, + number_of_steps=15, + command=self.update_speed_label + ) + self.speed_slider.set(1.0) + self.speed_slider.grid(row=1, column=2, padx=(10, 0), pady=5, sticky="ew") + + # --- Text Input --- + self.text_label = ctk.CTkLabel(self.main_frame, text="Text to Synthesize:") + self.text_label.grid(row=2, column=0, padx=20, pady=(10, 0), sticky="w") + self.text_input = ctk.CTkTextbox(self.main_frame, height=150, font=ctk.CTkFont(size=14)) + self.text_input.grid(row=3, column=0, padx=20, pady=(5, 20), sticky="nsew") + self.text_input.insert("0.0", "Welcome to KittenTTS! 
Type something here and click Generate to hear it spoken out loud.") + + # --- Action Buttons Frame --- + self.action_frame = ctk.CTkFrame(self.main_frame, fg_color="transparent") + self.action_frame.grid(row=4, column=0, padx=20, pady=(0, 20), sticky="ew") + self.action_frame.grid_columnconfigure(0, weight=1) + + # Status Label + self.status_label = ctk.CTkLabel(self.action_frame, text="Status: Ready", text_color="gray") + self.status_label.grid(row=0, column=0, padx=10, pady=10, sticky="w") + + # Generate Button + self.generate_btn = ctk.CTkButton( + self.action_frame, + text="Generate & Play ▶", + command=self.generate_audio, + height=40, + font=ctk.CTkFont(size=15, weight="bold") + ) + self.generate_btn.grid(row=0, column=1, padx=(10, 0), pady=10, sticky="e") + + # Export Button + self.export_btn = ctk.CTkButton( + self.action_frame, + text="Export Audio 💾", + command=self.export_audio, + height=40, + fg_color="transparent", + border_width=2, + font=ctk.CTkFont(size=15, weight="bold") + ) + self.export_btn.grid(row=0, column=2, padx=(10, 0), pady=10, sticky="e") + + # Load initial model asynchronously + threading.Thread(target=self.load_model_bg, args=(self.current_model_id,), daemon=True).start() + + def update_speed_label(self, value): + self.speed_label.configure(text=f"Speed: {value:.1f}x") + + def change_model(self, selected_model): + if selected_model == self.current_model_id: + return + + self.current_model_id = selected_model + # Disable generation while loading + self.generate_btn.configure(state="disabled") + self.export_btn.configure(state="disabled") + self.status_label.configure(text=f"Status: Downloading/Loading {selected_model}...") + self.voice_dropdown.configure(values=["Loading..."]) + self.voice_dropdown.set("Loading...") + + threading.Thread(target=self.load_model_bg, args=(selected_model,), daemon=True).start() + + def load_model_bg(self, model_id): + self.update_status(f"Loading '{model_id}'...") + 
 self.after(0, lambda: self.generate_btn.configure(state="disabled")) # worker thread: hand widget changes to the Tk main loop + self.after(0, lambda: self.export_btn.configure(state="disabled")) + + try: + model = KittenTTS(model_name=model_id) + if model.model is None: + raise Exception("Failed to load model architecture.") + + self.tts_model = model + + # Extract aliases for dropdown + if hasattr(self.tts_model.model, 'voice_aliases'): + aliases = list(self.tts_model.model.voice_aliases.keys()) + else: + aliases = ["Default"] + + # Update UI from main thread + self.after(0, self.update_voices_ui, aliases) + + except Exception as e: + self.update_status(f"Error loading model: {str(e)}", err=True) + self.after(0, lambda: self.generate_btn.configure(state="normal")) + self.after(0, lambda: self.export_btn.configure(state="normal")) + + def update_voices_ui(self, aliases): + self.voice_dropdown.configure(values=aliases) + if aliases: + self.voice_dropdown.set("Jasper" if "Jasper" in aliases else aliases[0]) # Default to Jasper if possible + self.update_status("Model loaded. Ready to generate.") + self.generate_btn.configure(state="normal") + self.export_btn.configure(state="normal") + + def update_status(self, text, err=False): + color = "red" if err else "gray" + self.after(0, lambda: self.status_label.configure(text=f"Status: {text}", text_color=color)) + + def generate_audio(self): + text = self.text_input.get("0.0", "end").strip() + if not text: + self.update_status("Error: Input text is empty.", err=True) + return + + voice = self.voice_dropdown.get() + speed = self.speed_slider.get() + + if not self.tts_model: + self.update_status("Error: Model not loaded yet.", err=True) + return + + self.generate_btn.configure(state="disabled", text="Generating...") + self.export_btn.configure(state="disabled") + self.update_status("Generating speech audio...") + + # Run in background to prevent UI freeze + threading.Thread(target=self.generate_audio_bg, args=(text, voice, speed), daemon=True).start() + + def generate_audio_bg(self, text, voice, speed): + try: + 
 self.tts_model.generate_to_file(text, self.output_file, voice=voice, speed=speed) + self.update_status("Audio generated successfully. Playing...") + + # Play sound using built-in Windows sound engine + # SND_FILENAME: The sound parameter is the name of a WAV file + # SND_ASYNC: The sound is played asynchronously, and the function returns immediately after beginning the sound + winsound.PlaySound(self.output_file, winsound.SND_FILENAME | winsound.SND_ASYNC) + + # Re-enable button + self.after(0, lambda: self.generate_btn.configure(state="normal", text="Generate & Play ▶")) + self.after(0, lambda: self.export_btn.configure(state="normal")) + self.update_status("Ready") + + except Exception as e: + self.update_status(f"Generation error: {str(e)}", err=True) + self.after(0, lambda: self.generate_btn.configure(state="normal", text="Generate & Play ▶")) + self.after(0, lambda: self.export_btn.configure(state="normal")) + + def export_audio(self): + text = self.text_input.get("0.0", "end").strip() + if not text: + self.update_status("Error: Input text is empty.", err=True) + return + + voice = self.voice_dropdown.get() + speed = self.speed_slider.get() + + if not self.tts_model: + self.update_status("Error: Model not loaded yet.", err=True) + return + + save_path = filedialog.asksaveasfilename( + defaultextension=".wav", + filetypes=[("WAV Audio", "*.wav"), ("All Files", "*.*")], + title="Export Audio As" + ) + + if not save_path: + return + + self.generate_btn.configure(state="disabled") + self.export_btn.configure(state="disabled", text="Exporting...") + self.update_status(f"Exporting speech audio to {os.path.basename(save_path)}...") + + threading.Thread(target=self.export_audio_bg, args=(text, voice, speed, save_path), daemon=True).start() + + def export_audio_bg(self, text, voice, speed, save_path): + try: + self.tts_model.generate_to_file(text, save_path, voice=voice, speed=speed) + self.update_status(f"Successfully exported to {os.path.basename(save_path)}") + 
+ # Re-enable buttons + self.after(0, lambda: self.generate_btn.configure(state="normal")) + self.after(0, lambda: self.export_btn.configure(state="normal", text="Export Audio 💾")) + + except Exception as e: + self.update_status(f"Export error: {str(e)}", err=True) + self.after(0, lambda: self.generate_btn.configure(state="normal")) + self.after(0, lambda: self.export_btn.configure(state="normal", text="Export Audio 💾")) + +def main(): + app = KittenTTSApp() + app.mainloop() + +if __name__ == "__main__": + main() diff --git a/setup.py b/setup.py index d0ac187..6a65595 100644 --- a/setup.py +++ b/setup.py @@ -37,10 +37,17 @@ "soundfile", "numpy", "huggingface_hub", + "customtkinter; sys_platform == 'win32'", ], keywords="text-to-speech, tts, speech-synthesis, neural-networks, onnx", project_urls={ "Bug Reports": "https://github.com/kittenml/kittentts/issues", "Source": "https://github.com/kittenml/kittentts", }, + entry_points={ + "console_scripts": [ + "kittentts=kittentts.cli:main", + "kittentts-ui=kittentts.ui:main", + ], + }, )