From 4fc8d05e2d81553003d858df4cc20132ec2cf9bf Mon Sep 17 00:00:00 2001 From: biswas445 Date: Fri, 9 Jan 2026 12:48:14 +0545 Subject: [PATCH 01/27] Enhanced CLI and API modules for Soprano TTS --- CLI/CLI_DOCUMENTATION.md | 159 +++++++ CLI/soprano_cli.py | 184 ++++++++ README.md | 190 -------- Soprano.bat | 2 + pyproject.toml | 6 + soprano/server.py | 47 -- soprano/server/api.py | 444 ++++++++++++++++++ soprano/server/api_test.py | 70 +++ soprano/server/docs/architecture.md | 158 +++++++ soprano/server/docs/configuration.md | 71 +++ soprano/server/docs/endpoints.md | 100 ++++ .../server/docs/errors_and_troubleshooting.md | 109 +++++ soprano/server/docs/index.md | 44 ++ soprano/server/docs/overview.md | 43 ++ soprano/server/docs/usage_examples.md | 229 +++++++++ soprano/soprano_cli.py | 38 -- 16 files changed, 1619 insertions(+), 275 deletions(-) create mode 100644 CLI/CLI_DOCUMENTATION.md create mode 100644 CLI/soprano_cli.py delete mode 100644 README.md create mode 100644 Soprano.bat delete mode 100644 soprano/server.py create mode 100644 soprano/server/api.py create mode 100644 soprano/server/api_test.py create mode 100644 soprano/server/docs/architecture.md create mode 100644 soprano/server/docs/configuration.md create mode 100644 soprano/server/docs/endpoints.md create mode 100644 soprano/server/docs/errors_and_troubleshooting.md create mode 100644 soprano/server/docs/index.md create mode 100644 soprano/server/docs/overview.md create mode 100644 soprano/server/docs/usage_examples.md delete mode 100644 soprano/soprano_cli.py diff --git a/CLI/CLI_DOCUMENTATION.md b/CLI/CLI_DOCUMENTATION.md new file mode 100644 index 0000000..b24e641 --- /dev/null +++ b/CLI/CLI_DOCUMENTATION.md @@ -0,0 +1,159 @@ +# Soprano TTS CLI Documentation + +## Overview + +Soprano TTS is an ultra-realistic text-to-speech system that generates high-quality audio from text input. 
The CLI provides an interactive interface to utilize the Soprano TTS engine with built-in voice parameters tuned for naturalistic speech synthesis. + +## Features + +- Interactive menu-driven interface +- Real-time audio playback without file saving +- File-based audio generation with automatically numbered output files +- Built-in voice parameters tuned for naturalistic speech +- Automatic device selection (CUDA fallback to CPU) +- Progress indicators during audio playback + +## Installation + +```bash +pip install soprano-tts +``` + +## Usage + +Run the CLI with default settings: + +```bash +python soprano_cli.py +``` + +With optional parameters: + +```bash +python soprano_cli.py --model-path /path/to/model --backend auto --cache-size 10 +``` + +### Command Line Arguments + +- `--model-path` or `-m`: Path to local model directory (optional, defaults to Hugging Face model) +- `--backend` or `-b`: Backend to use for inference (options: auto, transformers, lmdeploy; default: auto) +- `--cache-size` or `-c`: Cache size in MB for lmdeploy backend (default: 10) + +## Interactive Menu Options + +### Option 1: Input Text for Synthesis (with file saving) + +Generates audio from input text and saves it to a WAV file in the `audio_output` directory. The system automatically creates this directory if it doesn't exist and uses incremental naming: +- First file: `output_audio.wav` +- Second file: `output_audio1.wav` +- And so on... + +### Option 2: Real-time Audio Playback (no file saving) + +Generates audio from input text and plays it directly without saving to disk. This option: +- Generates audio in real-time +- Plays audio through system speakers +- Waits for complete playback before returning to menu + +### Option 3: View Saved Audio Files + +Displays a list of all audio files saved in the `audio_output` directory with their filenames. + +### Option 4: Exit + +Terminates the CLI application. 
+ +## Visual Pipeline + +``` +┌─────────────────────────────────────────────────────────────────┐ +│ Soprano TTS CLI │ +├─────────────────────────────────────────────────────────────────┤ +│ ┌─────────────────┐ ┌─────────────────┐ ┌─────────────────┐ │ +│ │ User Input │ │ Model Load │ │ Device Check │ │ +│ │ & Validation │ │ & Init │ │ & Select │ │ +│ └─────────────────┘ └─────────────────┘ └─────────────────┘ │ +│ │ │ │ │ +│ ▼ ▼ ▼ │ +│ ┌─────────────────────────────────────────────────────────────┤ +│ │ Main Menu Loop │ +│ │ ┌─────────────────┐ ┌─────────────────┐ ┌─────────────┐ │ +│ │ │ Option 1: Save │ │ Option 2: Play │ │ Option 3: │ │ +│ │ │ to File │ │ to Speaker │ │View Audio │ │ +│ │ └─────────────────┘ └─────────────────┘ │ Files │ │ +│ └─────────────────────────────────────────────────────────────┤ +│ │ │ │ │ +│ ▼ ▼ ▼ │ +│ ┌─────────────────┐ ┌─────────────────┐ ┌─────────────────┤ +│ │ Text Pre- │ │ Text Pre- │ │ List │ +│ │ processing │ │ processing │ │ Audio │ +│ └─────────────────┘ └─────────────────┘ │ Files │ +│ │ │ └─────────────────┤ +│ ▼ ▼ │ │ +│ ┌─────────────────┐ ┌─────────────────┐ │ │ +│ │ Model Inference│ │ Model Inference│ │ │ +│ │ (Generate │ │ (Generate │ │ │ +│ │ Audio Data) │ │ Audio Data) │ │ │ +│ └─────────────────┘ └─────────────────┘ │ │ +│ │ │ │ │ +│ ▼ ▼ │ │ +│ ┌─────────────────┐ ┌─────────────────┐ │ │ +│ │ Save to │ │ Audio │ │ │ +│ │ File (.wav) │ │ Playback │ │ │ +│ │in audio_output │ │(real-time) │ │ │ +│ │ directory │ │ │ │ │ +│ └─────────────────┘ └─────────────────┘ │ │ +│ │ │ +│ ▼ │ +│ ┌─────────────────┤ +│ │ Return to │ +│ │ Main Menu │ +│ └─────────────────┤ +└─────────────────────────────────────────────────────────────────┘ +``` + +## Voice Characteristics + +The system uses optimized default parameters for naturalistic speech: + +- **Temperature**: 0.7 (provides natural variation and creativity) +- **Top-p**: 0.9 (balances coherent speech with natural variation) +- **Repetition Penalty**: 1.05 (minimizes 
repetition while maintaining quality) + +These parameters are built-in and optimized for the most natural, human-like voice output. + +## Technical Details + +### Audio Specifications +- Sample Rate: 32,000 Hz +- Format: WAV (for saved files) +- Real-time playback through system audio + +### Model Architecture +- Uses Soprano-80M model by default +- Vocos-based decoder for high-quality audio synthesis +- Support for both LMDeploy and Transformers backends + +### Supported Platforms +- Windows, macOS, Linux +- CUDA-compatible GPUs (recommended) or CPU +- Python 3.10+ + +## Troubleshooting + +### Common Issues: + +1. **No audio output**: Ensure `sounddevice` is installed: + ```bash + pip install sounddevice + ``` + +2. **CUDA unavailable**: The system will automatically fall back to CPU + +3. **Long text processing**: Text is limited to 1000 characters per input + +4. **Model loading errors**: Check internet connection for downloading models from Hugging Face + +## License + +This project is licensed under the terms specified in the LICENSE file. \ No newline at end of file diff --git a/CLI/soprano_cli.py b/CLI/soprano_cli.py new file mode 100644 index 0000000..83007a0 --- /dev/null +++ b/CLI/soprano_cli.py @@ -0,0 +1,184 @@ + +""" +Soprano TTS Command Line Interface +""" +import argparse +import sys +import torch +from soprano import SopranoTTS + +try: + import sounddevice as sd + SOUNDDEVICE_AVAILABLE = True +except ImportError: + SOUNDDEVICE_AVAILABLE = False + +def get_device(): + """Determine the best available device (CUDA if available, otherwise CPU)""" + return 'cuda' if torch.cuda.is_available() else 'cpu' + +def play_audio(audio_tensor): + """Play audio tensor using sounddevice""" + if not SOUNDDEVICE_AVAILABLE: + print("Error: sounddevice library not available. 
Install it with 'pip install sounddevice'") + return + + import numpy as np + audio_np = audio_tensor.cpu().numpy() if isinstance(audio_tensor, torch.Tensor) else audio_tensor + + duration = len(audio_np) / 32000 + print(f"Playing audio ({duration:.2f}s)...") + + sample_rate = 32000 + sd.play(audio_np, samplerate=sample_rate) + + import time + time.sleep(duration + 0.5) + + try: + if sd.get_status().playing: + sd.wait() + except: + time.sleep(0.5) + +def validate_text(text): + """Validate input text""" + stripped_text = text.strip() if text else "" + if not stripped_text: + print("Error: Text cannot be empty.") + return False + if len(stripped_text) > 1000: + print("Error: Text is too long (max 1000 characters).") + return False + return True + +def get_validated_input(prompt, validator_func, error_msg=None): + """Get validated input from user""" + while True: + user_input = input(prompt).strip() + if validator_func(user_input): + return user_input + else: + if error_msg: + print(error_msg) + else: + print("Invalid input, please try again.") + +def get_next_filename(base_name="output_audio", ext=".wav"): + """Generate next available filename with incremental numbering""" + import os + + output_dir = "audio_output" + if not os.path.exists(output_dir): + os.makedirs(output_dir) + + counter = 0 + while True: + if counter == 0: + filename = f"{base_name}{ext}" + else: + filename = f"{base_name}{counter}{ext}" + + full_path = os.path.join(output_dir, filename) + if not os.path.exists(full_path): + return full_path + counter += 1 + +def main(): + parser = argparse.ArgumentParser(description='Soprano Text-to-Speech CLI') + parser.add_argument('--model-path', '-m', help='Path to local model directory (optional)') + parser.add_argument('--backend', '-b', default='auto', + choices=['auto', 'transformers', 'lmdeploy'], + help='Backend to use for inference') + parser.add_argument('--cache-size', '-c', type=int, default=10, + help='Cache size in MB (for lmdeploy backend)') + + 
args = parser.parse_args() + + device = get_device() + + try: + import io + import contextlib + + with contextlib.redirect_stdout(io.StringIO()), contextlib.redirect_stderr(io.StringIO()): + tts = SopranoTTS( + backend=args.backend, + device=device, + cache_size_mb=args.cache_size, + model_path=args.model_path + ) + except Exception as e: + print(f"Error initializing model: {e}") + sys.exit(1) + + print("Soprano TTS is ready. Starting interactive menu...") + + while True: + print("\n" + "="*50) + print(" SOPRANO TTS MENU") + print("="*50) + print("1. Input text for synthesis (with file saving)") + print("2. Real-time audio playback (no file saving)") + print("3. View saved audio files") + print("4. Exit") + print("="*50) + + choice = input("Enter your choice (1-4): ").strip() + + if choice == '1': + text = get_validated_input( + "Enter text to synthesize: ", + validate_text, + "Text must not be empty and must be under 1000 characters." + ) + + output_path = get_next_filename() + print(f"Using output path: {output_path}") + + print(f"Generating speech for: '{text[:50]}{'...' if len(text) > 50 else ''}'") + try: + tts.infer(text, out_path=output_path) + print(f"✓ Audio saved to: {output_path}") + except Exception as e: + print(f"✗ Error generating audio: {e}") + + elif choice == '2': + text = get_validated_input( + "Enter text for real-time playback: ", + validate_text, + "Text must not be empty and must be under 1000 characters." + ) + + print(f"Generating real-time audio for: '{text[:50]}{'...' 
if len(text) > 50 else ''}'") + try: + audio_tensor = tts.infer(text) + print("Playing audio...") + play_audio(audio_tensor) + print("✓ Playback finished.") + except Exception as e: + print(f"✗ Error during playback: {e}") + + elif choice == '3': + import os + output_dir = "audio_output" + if os.path.exists(output_dir): + files = [f for f in os.listdir(output_dir) if f.lower().endswith('.wav')] + if files: + print(f"Found {len(files)} audio file(s) in {output_dir}/:") + for i, file in enumerate(sorted(files), 1): + print(f" {i}. {file}") + else: + print(f"No audio files found in {output_dir}/") + else: + print(f"No {output_dir}/ directory exists yet.") + + elif choice == '4': + print("Thank you for using Soprano TTS. Goodbye!") + break + + else: + print("✗ Invalid choice. Please enter 1, 2, 3, or 4.") + +if __name__ == "__main__": + main() \ No newline at end of file diff --git a/README.md b/README.md deleted file mode 100644 index 9c2950b..0000000 --- a/README.md +++ /dev/null @@ -1,190 +0,0 @@ - -
- - # Soprano: Instant, Ultra‑Realistic Text‑to‑Speech - - [![Alt Text](https://img.shields.io/badge/HuggingFace-Model-orange?logo=huggingface)](https://huggingface.co/ekwek/Soprano-80M) - [![Alt Text](https://img.shields.io/badge/HuggingFace-Demo-yellow?logo=huggingface)](https://huggingface.co/spaces/ekwek/Soprano-TTS) -
- -https://github.com/user-attachments/assets/525cf529-e79e-4368-809f-6be620852826 - ---- - -## Overview - -**Soprano** is an ultra‑lightweight, open‑source text‑to‑speech (TTS) model designed for real‑time, high‑fidelity speech synthesis at unprecedented speed, all while remaining compact and easy to deploy at **under 1 GB VRAM usage**. - -With only **80M parameters**, Soprano achieves a real‑time factor (RTF) of **~2000×**, capable of generating **10 hours of audio in under 20 seconds**. Soprano uses a **seamless streaming** technique that enables true real‑time synthesis in **<15 ms**, multiple orders of magnitude faster than existing TTS pipelines. - ---- - -## Installation - -**Requirements**: Linux or Windows, CUDA‑enabled GPU required (CPU support coming soon!). - -### Install with wheel - -```bash -pip install soprano-tts -pip uninstall -y torch -pip install torch==2.8.0 --index-url https://download.pytorch.org/whl/cu126 -``` - -### Install from source - -```bash -git clone https://github.com/ekwek1/soprano.git -cd soprano -pip install -e . -pip uninstall -y torch -pip install torch==2.8.0 --index-url https://download.pytorch.org/whl/cu126 -``` - -> **Note**: Soprano uses **LMDeploy** to accelerate inference by default. If LMDeploy cannot be installed in your environment, Soprano can fall back to the HuggingFace **transformers** backend (with slower performance). To enable this, pass `backend='transformers'` when creating the TTS model. - ---- - -## Usage - -```python -from soprano import SopranoTTS - -model = SopranoTTS(backend='auto', device='cuda', cache_size_mb=10, decoder_batch_size=1) -``` - -> **Tip**: You can increase cache_size_mb and decoder_batch_size to increase inference speed at the cost of higher memory usage. - -### Basic inference - -```python -out = model.infer("Soprano is an extremely lightweight text to speech model.") # can achieve 2000x real-time with sufficiently long input! 
-``` - -### Save output to a file - -```python -out = model.infer("Soprano is an extremely lightweight text to speech model.", "out.wav") -``` - -### Custom sampling parameters - -```python -out = model.infer( - "Soprano is an extremely lightweight text to speech model.", - temperature=0.3, - top_p=0.95, - repetition_penalty=1.2, -) -``` - -### Batched inference - -```python -out = model.infer_batch(["Soprano is an extremely lightweight text to speech model."] * 10) # can achieve 2000x real-time with sufficiently large input size! -``` - -#### Save batch outputs to a directory - -```python -out = model.infer_batch(["Soprano is an extremely lightweight text to speech model."] * 10, "/dir") -``` - -### Streaming inference - -```python -import torch - -stream = model.infer_stream("Soprano is an extremely lightweight text to speech model.", chunk_size=1) - -# Audio chunks can be accessed via an iterator -chunks = [] -for chunk in stream: - chunks.append(chunk) # first chunk arrives in <15 ms! - -out = torch.cat(chunks) -``` - -### Serve endpoint - -``` -uvicorn soprano.server:app --host 0.0.0.0 --port 8000 -``` - -Compatible with OpenAI speech API. Use the endpoint like this: - -```bash -curl http://localhost:8000/v1/audio/speech \ - -H "Content-Type: application/json" \ - -d '{ - "input": "The quick brown fox jumped over the lazy dog." - }' \ - --output speech.wav -``` - -## Usage tips: - -* Soprano works best when each sentence is between 2 and 15 seconds long. -* Although Soprano recognizes numbers and some special characters, it occasionally mispronounces them. Best results can be achieved by converting these into their phonetic form. (1+1 -> one plus one, etc) -* If Soprano produces unsatisfactory results, you can easily regenerate it for a new, potentially better generation. You may also change the sampling settings for more varied results. -* Avoid improper grammar such as not using contractions, multiple spaces, etc. - ---- - -## Key Features - -### 1. 
High‑fidelity 32 kHz audio - -Soprano synthesizes speech at **32 kHz**, delivering quality that is perceptually indistinguishable from 44.1/48 kHz audio and significantly sharper and clearer than the 24 kHz output used by many existing TTS models. - -### 2. Vocoder‑based neural decoder - -Instead of slow diffusion decoders, Soprano uses a **vocoder‑based decoder** with a Vocos architecture, enabling **orders‑of‑magnitude faster** waveform generation while maintaining comparable perceptual quality. - -### 3. Seamless Streaming - -Soprano leverages the decoder’s finite receptive field to losslessly stream audio with ultra‑low latency. The streamed output is acoustically identical to offline synthesis, and streaming can begin after generating just 5 audio tokens, enabling **<15 ms latency**. - -### 4. State‑of‑the‑art neural audio codec - -Speech is represented using a **neural codec** that compresses audio to **~15 tokens/sec** at just **0.2 kbps**, allowing extremely fast generation and efficient memory usage without sacrificing quality. - -### 5. Sentence‑level streaming for infinite context - -Each sentence is generated independently, enabling **effectively infinite generation length** while maintaining stability and real‑time performance for long‑form generation. - ---- - -## Limitations - -I’m a second-year undergrad who’s just started working on TTS models, so I wanted to start small. Soprano was only pretrained on 1000 hours of audio (~100x less than other TTS models), so its stability and quality will improve tremendously as I train it on more data. Also, I optimized Soprano purely for speed, which is why it lacks bells and whistles like voice cloning, style control, and multilingual support. Now that I have experience creating TTS models, I have a lot of ideas for how to make Soprano even better in the future, so stay tuned for those! 
- ---- - -## Roadmap - -* [x] Add model and inference code -* [x] Seamless streaming -* [x] Batched inference -* [x] Command-line interface (CLI) -* [x] CPU support -* [x] Server / API inference -* [ ] Additional LLM backends -* [ ] Voice cloning -* [ ] Multilingual support - ---- - -## Acknowledgements - -Soprano uses and/or is inspired by the following projects: - -* [Vocos](https://github.com/gemelo-ai/vocos) -* [XTTS](https://github.com/coqui-ai/TTS) -* [LMDeploy](https://github.com/InternLM/lmdeploy) - ---- - -## License - -This project is licensed under the **Apache-2.0** license. See `LICENSE` for details. diff --git a/Soprano.bat b/Soprano.bat new file mode 100644 index 0000000..d3a3f90 --- /dev/null +++ b/Soprano.bat @@ -0,0 +1,2 @@ +cd CLI +python soprano_cli.py \ No newline at end of file diff --git a/pyproject.toml b/pyproject.toml index 5e917c2..9b97140 100644 --- a/pyproject.toml +++ b/pyproject.toml @@ -23,8 +23,9 @@ dependencies = [ "scipy", "torch", "unidecode", + "sounddevice", "uvicorn", "inflect" ] license = {file = "LICENSE"} diff --git a/soprano/server.py b/soprano/server.py deleted file mode 100644 index 937c89a..0000000 --- a/soprano/server.py +++ /dev/null @@ -1,47 +0,0 @@ -import base64 -import io -import json -from typing import Generator - -import numpy as np -from fastapi import FastAPI, HTTPException -from fastapi.responses import Response -from scipy.io.wavfile import write -from torch import Tensor - -from soprano.tts import SopranoTTS - -# Load model at startup -tts = SopranoTTS(cache_size_mb = 100) - -app = FastAPI(title="Soprano TTS API") - -def _tensor_to_wav_bytes(tensor: Tensor) -> bytes: - """ - Convert a 1D fp32 torch tensor to a WAV byte stream. 
- """ - # convert to int16 - audio_int16 = (np.clip(tensor.numpy(), -1.0, 1.0) * 32767).astype(np.int16) - - wav_io = io.BytesIO() - write(wav_io, 32000, audio_int16) # 32kHz sample rate - wav_io.seek(0) - return wav_io.read() - - -@app.post("/v1/audio/speech") -async def create_speech(payload: dict): - """ - Minimal implementation of OpenAI's Speech endpoint. - Fields: - - input: string - text to synthesize - - model, voice, etc. are accepted but ignored. - - response_format: str - ignored, only support wav. - """ - text = payload.get("input") - if not isinstance(text, str) or not text.strip(): - raise HTTPException(status_code=400, detail="`input` field must be a non-empty string.") - - audio_tensor = tts.infer(text) - wav_bytes = _tensor_to_wav_bytes(audio_tensor) - return Response(content=wav_bytes, media_type="audio/wav", headers={"Content-Disposition": 'attachment; filename="speech.wav"'}) diff --git a/soprano/server/api.py b/soprano/server/api.py new file mode 100644 index 0000000..0cabebc --- /dev/null +++ b/soprano/server/api.py @@ -0,0 +1,444 @@ +import asyncio +import io +import logging +import os +import time +from typing import Optional, Dict, Any, AsyncGenerator +import numpy as np +from fastapi import FastAPI, HTTPException, Depends +from fastapi.responses import Response +from pydantic import BaseModel, Field +from scipy.io.wavfile import write +from torch import Tensor +import torch +from contextlib import asynccontextmanager + +from soprano.tts import SopranoTTS + + +# Set up logging +logging.basicConfig(level=logging.INFO) +logger = logging.getLogger(__name__) + + +class CircuitBreaker: + """ + Circuit breaker implementation to handle external dependency failures. 
+ """ + def __init__(self, failure_threshold=5, recovery_timeout=60): + self.failure_threshold = failure_threshold + self.recovery_timeout = recovery_timeout + self.failure_count = 0 + self.last_failure_time = None + self.state = "CLOSED" # CLOSED, OPEN, HALF_OPEN + + def call(self, func, *args, **kwargs): + if self.state == "OPEN": + if time.time() - self.last_failure_time > self.recovery_timeout: + self.state = "HALF_OPEN" + else: + raise Exception("Circuit breaker is in OPEN state and not accepting requests") + + if self.state == "HALF_OPEN": + try: + result = func(*args, **kwargs) + self._success() + return result + except Exception as e: + self._failure() + raise e + + try: + result = func(*args, **kwargs) + return result + except Exception as e: + self._failure() + raise e + + def _failure(self): + self.failure_count += 1 + self.last_failure_time = time.time() + if self.failure_count >= self.failure_threshold: + self.state = "OPEN" + + def _success(self): + self.failure_count = 0 + self.state = "CLOSED" + + +def retry(func, retries=3, delay=1, backoff=2): + """ + Retry decorator with exponential backoff for transient failures. + """ + def wrapper(*args, **kwargs): + current_delay = delay + for attempt in range(retries): + try: + return func(*args, **kwargs) + except Exception as e: + if attempt == retries - 1: # Last attempt + raise e + logger.warning(f"Attempt {attempt + 1} failed: {e}. Retrying in {current_delay} seconds...") + time.sleep(current_delay) + current_delay *= backoff + return None + return wrapper + + +class SpeechRequest(BaseModel): + """ + Request model for text-to-speech conversion following OpenAI API format. 
+ """ + input: str = Field(..., min_length=1, max_length=1000, description="Text to synthesize") + model: Optional[str] = Field(None, description="Model to use (ignored, using default model)") + voice: Optional[str] = Field(None, description="Voice to use (ignored, using default voice)") + response_format: Optional[str] = Field("wav", description="Response format (only wav supported)") + speed: Optional[float] = Field(None, ge=0.1, le=2.0, description="Speech speed (not implemented yet)") + temperature: Optional[float] = Field(0.3, ge=0.0, le=1.0, description="Generation temperature") + top_p: Optional[float] = Field(0.95, ge=0.0, le=1.0, description="Top-p sampling parameter") + repetition_penalty: Optional[float] = Field(1.2, ge=0.1, le=2.0, description="Repetition penalty") + + +class TTSManager: + """ + Singleton manager for TTS model lifecycle and inference. + """ + _instance = None + _lock = asyncio.Lock() + + def __new__(cls): + if cls._instance is None: + cls._instance = super().__new__(cls) + return cls._instance + + def __init__(self): + if not hasattr(self, 'initialized'): + self.initialized = True + self.tts: Optional[SopranoTTS] = None + # Prioritize CUDA, fallback to CPU only if CUDA is not available + if torch.cuda.is_available(): + self.device = 'cuda' + logger.info("CUDA is available, using GPU for TTS processing") + else: + self.device = 'cpu' + logger.info("CUDA is not available, falling back to CPU for TTS processing") + self.circuit_breaker = CircuitBreaker(failure_threshold=3, recovery_timeout=30) + logger.info(f"Initializing TTS on device: {self.device}") + + async def initialize_model(self): + """ + Initialize the TTS model asynchronously to avoid blocking the event loop. 
+ """ + async with self._lock: + if self.tts is None: + logger.info("Loading Soprano TTS model...") + try: + # Run model initialization in a thread pool to avoid blocking + loop = asyncio.get_event_loop() + + # Use retry mechanism for model loading + def load_model(): + return SopranoTTS( + cache_size_mb=100, + device=self.device + ) + + self.tts = await loop.run_in_executor( + None, + retry(load_model, retries=3, delay=2, backoff=2) + ) + logger.info("Soprano TTS model loaded successfully") + except Exception as e: + logger.error(f"Failed to load Soprano TTS model: {e}", exc_info=True) + raise RuntimeError(f"Failed to initialize TTS model: {str(e)}") from e + + def get_model(self) -> SopranoTTS: + """ + Get the initialized TTS model instance. + """ + if self.tts is None: + raise RuntimeError("TTS model not initialized. Call initialize_model() first.") + return self.tts + + def generate_audio(self, text: str, top_p: float, temperature: float, repetition_penalty: float): + """ + Generate audio with circuit breaker protection and retry mechanism. + """ + def _generate(): + return self.tts.infer( + text=text, + top_p=top_p, + temperature=temperature, + repetition_penalty=repetition_penalty + ) + + # Use circuit breaker to protect against repeated failures + return self.circuit_breaker.call( + retry(_generate, retries=2, delay=1, backoff=2) + ) + + +@asynccontextmanager +async def lifespan(app: FastAPI) -> AsyncGenerator[None, None]: + """ + Lifespan event handler for startup and shutdown events. 
+ """ + logger.info("Starting up Soprano TTS API server...") + try: + tts_manager = TTSManager() + await tts_manager.initialize_model() + + # Perform initial cleanup of old files + output_dir = "audio_output" + os.makedirs(output_dir, exist_ok=True) + cleanup_old_files(output_dir) + + logger.info("Soprano TTS API server started successfully") + yield + except Exception as e: + logger.error(f"Failed to start Soprano TTS API server: {e}", exc_info=True) + raise + finally: + logger.info("Shutting down Soprano TTS API server...") + + +# Create FastAPI app with metadata +app = FastAPI( + title="Soprano TTS API", + description="Ultra-realistic Text-to-Speech API based on Soprano model", + version="1.0.0", + contact={ + "name": "Soprano TTS", + "url": "https://github.com/ekwek1/soprano", + }, + lifespan=lifespan +) + + +def _tensor_to_wav_bytes(tensor: Tensor) -> bytes: + """ + Convert a 1D fp32 torch tensor to a WAV byte stream. + """ + # Convert to numpy array + audio_np = tensor.cpu().numpy() + + # Normalize to int16 range if needed + if audio_np.dtype != np.int16: + # Ensure values are in the range [-1, 1] + audio_np = np.clip(audio_np, -1.0, 1.0) + # Convert to int16 + audio_np = (audio_np * 32767).astype(np.int16) + + # Create in-memory WAV file + wav_io = io.BytesIO() + write(wav_io, 32000, audio_np) # 32kHz sample rate + wav_io.seek(0) + return wav_io.read() + + +def cleanup_old_files(directory: str, max_age_hours: int = 24, max_files: int = 100): + """ + Clean up old files in the specified directory to prevent unlimited growth. 
+ + Args: + directory: Directory to clean up + max_age_hours: Maximum age of files in hours + max_files: Maximum number of files to keep + """ + import time + try: + files = [] + for filename in os.listdir(directory): + filepath = os.path.join(directory, filename) + if os.path.isfile(filepath): + files.append((filepath, os.path.getctime(filepath))) + + # Sort by creation time (oldest first) + files.sort(key=lambda x: x[1]) + + current_time = time.time() + cutoff_time = current_time - (max_age_hours * 3600) + + # Remove files older than cutoff time + removed_count = 0 + for filepath, creation_time in files: + if creation_time < cutoff_time: + try: + os.remove(filepath) + removed_count += 1 + logger.info(f"Removed old file: {filepath}") + except OSError as e: + logger.error(f"Failed to remove old file {filepath}: {e}") + + # If still too many files, remove oldest ones beyond the limit + remaining_files = len(files) - removed_count + if remaining_files > max_files: + excess_count = remaining_files - max_files + for i in range(excess_count): + if i < len(files) - removed_count: + filepath = files[i][0] + try: + os.remove(filepath) + removed_count += 1 + logger.info(f"Removed excess file: {filepath}") + except OSError as e: + logger.error(f"Failed to remove excess file {filepath}: {e}") + + if removed_count > 0: + logger.info(f"Cleaned up {removed_count} old files from {directory}") + except Exception as e: + logger.error(f"Error during file cleanup: {e}") + + + + +@app.post("/v1/audio/speech", + response_class=Response, + summary="Generate speech from text", + description="Convert input text to audio using Soprano TTS model") +async def create_speech(request: SpeechRequest): + """ + Generate speech from input text following OpenAI's Speech endpoint format. + """ + try: + # Validate input text + if not request.input or not request.input.strip(): + raise HTTPException( + status_code=400, + detail="`input` field must be a non-empty string." 
+ ) + + # Check text length + if len(request.input) > 1000: + raise HTTPException( + status_code=400, + detail="Input text exceeds maximum length of 1000 characters." + ) + + # Get TTS manager and generate audio using circuit breaker and retry + tts_manager = TTSManager() + + logger.info(f"Processing TTS request for text: '{request.input[:50]}{'...' if len(request.input) > 50 else ''}'") + + try: + # Generate audio with circuit breaker and retry mechanism + audio_tensor = tts_manager.generate_audio( + text=request.input, + top_p=request.top_p, + temperature=request.temperature, + repetition_penalty=request.repetition_penalty + ) + except Exception as e: + logger.error(f"Circuit breaker or retry mechanism failed: {str(e)}", exc_info=True) + raise HTTPException( + status_code=503, + detail=f"Service temporarily unavailable due to TTS processing error: {str(e)}" + ) + + # Convert tensor to WAV bytes + wav_bytes = _tensor_to_wav_bytes(audio_tensor) + + # Create audio_output directory if it doesn't exist + output_dir = "audio_output" + os.makedirs(output_dir, exist_ok=True) + + # Generate unique filename with sequential numbering + file_counter = 1 + while True: + filename = f"output_{file_counter}.wav" + filepath = os.path.join(output_dir, filename) + if not os.path.exists(filepath): + break + file_counter += 1 + + # Prevent path traversal by ensuring filename is safe + if '..' in filename or '/' in filename or '\\' in filename: + raise HTTPException( + status_code=400, + detail="Invalid characters in text that could lead to path traversal." 
+ ) + + # Save the audio file to the audio_output directory with error handling + try: + with open(filepath, 'wb') as f: + f.write(wav_bytes) + logger.info(f"Audio saved to: {filepath}") + except OSError as e: + logger.error(f"Failed to save audio file: {e}") + raise HTTPException( + status_code=500, + detail=f"Failed to save audio file: {str(e)}" + ) + + logger.info(f"TTS generation completed successfully.") + + # Return WAV response + return Response( + content=wav_bytes, + media_type="audio/wav", + headers={ + "Content-Disposition": f'attachment; filename="{filename}"', + "Content-Length": str(len(wav_bytes)) + } + ) + + except HTTPException: + # Re-raise HTTP exceptions as-is + raise + except Exception as e: + logger.error(f"Error during TTS generation: {str(e)}", exc_info=True) + raise HTTPException( + status_code=500, + detail=f"Internal server error during TTS generation: {str(e)}" + ) + + +@app.get("/", + summary="Root endpoint", + description="Provides information about the Soprano TTS API") +async def root(): + """ + Root endpoint to provide API information. + """ + return { + "message": "Soprano TTS API", + "version": "1.0.0", + "description": "Ultra-realistic Text-to-Speech API based on Soprano model", + "endpoints": { + "tts": "/v1/audio/speech", + "health": "/health" + } + } + + +@app.get("/health", + summary="Health check endpoint", + description="Check if the server and TTS model are running properly") +async def health_check(): + """ + Health check endpoint to verify the server and model are operational. 
+ """ + try: + tts_manager = TTSManager() + tts = tts_manager.get_model() + return {"status": "healthy", "device": tts.device} + except Exception as e: + logger.error(f"Health check failed: {str(e)}") + raise HTTPException(status_code=503, detail="Service unavailable") + + +if __name__ == "__main__": + import uvicorn + import torch + + print("Starting Soprano TTS API Server...") + print(f"Available device: {'CUDA (GPU)' if torch.cuda.is_available() else 'CPU'}") + + # Start the server + print("Server starting on http://localhost:8000") + uvicorn.run( + "soprano.server.api:app", + host="localhost", + port=8000, + reload=False + ) diff --git a/soprano/server/api_test.py b/soprano/server/api_test.py new file mode 100644 index 0000000..bba4cdf --- /dev/null +++ b/soprano/server/api_test.py @@ -0,0 +1,70 @@ +import asyncio +import aiohttp +import time +from pathlib import Path + +async def send_tts_request(text): + """ + Send a TTS request to the API server with custom text + """ + base_url = "http://localhost:8000" + + payload = { + "input": text, + "temperature": 0.3, + "top_p": 0.95, + "repetition_penalty": 1.2 + } + + try: + async with aiohttp.ClientSession() as session: + async with session.post(f"{base_url}/v1/audio/speech", json=payload) as response: + status = response.status + if status == 200: + audio_content = await response.read() + + # The API already saves the file, so we just confirm success + print(f"Audio generated successfully. 
Check the audio_output folder for the file.") + return True + else: + error_text = await response.text() + print(f"Request failed with status {status}") + print(f"Error: {error_text}") + return False + except Exception as e: + print(f"Request failed with error: {e}") + return False + +async def main(): + print("Soprano TTS API Request Sender") + print("Make sure the API server is running on http://localhost:8000 before executing this.") + print() + + while True: + text = input("Enter text to convert to speech (or 'quit' to exit): ") + if text.lower() == 'quit': + break + + if not text.strip(): + print("Text cannot be empty. Please enter some text.") + continue + + print(f"Sending request with text: '{text[:50]}{'...' if len(text) > 50 else ''}'") + success = await send_tts_request(text) + + if success: + print("Request completed successfully!") + else: + print("Request failed!") + + print() + +if __name__ == "__main__": + # Check if required packages are available + try: + import aiohttp + except ImportError: + print("Error: aiohttp is not installed. 
Please install it with: pip install aiohttp") + exit(1) + + asyncio.run(main()) \ No newline at end of file diff --git a/soprano/server/docs/architecture.md b/soprano/server/docs/architecture.md new file mode 100644 index 0000000..6ecf6a4 --- /dev/null +++ b/soprano/server/docs/architecture.md @@ -0,0 +1,158 @@ +# API Architecture and Implementation Details + +## System Architecture + +### High-Level Architecture +The Soprano TTS API follows a layered architecture: + +``` +┌─────────────────┐ +│ API Layer │ ← FastAPI endpoints +├─────────────────┤ +│ Business Logic │ ← TTSManager, Circuit Breaker, Retry +├─────────────────┤ +│ Model Layer │ ← SopranoTTS, Neural Processing +├─────────────────┤ +│ Utilities │ ← Audio Processing, File Management +└─────────────────┘ +``` + +### Component Breakdown + +#### API Layer (FastAPI) +- **Framework**: FastAPI for high-performance API +- **Features**: Automatic validation, documentation, async support +- **Endpoints**: RESTful API following OpenAI format + +#### Business Logic Layer +- **TTSManager**: Singleton pattern for model lifecycle management +- **CircuitBreaker**: Fault tolerance for external dependencies +- **Retry Mechanism**: Exponential backoff for transient failures + +#### Model Layer +- **SopranoTTS**: Integration with the core TTS model +- **Backend Selection**: Auto-detection of optimal backend (lmdeploy/transformers) + +#### Utilities Layer +- **Audio Processing**: WAV conversion and normalization +- **File Management**: Sequential naming and cleanup +- **Logging**: Comprehensive system logging + +## Key Implementation Details + +### Singleton Pattern (TTSManager) +The TTSManager implements a singleton pattern to ensure: +- Model loaded only once at startup +- Efficient resource utilization +- Thread-safe access to the model + +```python +class TTSManager: + _instance = None + _lock = asyncio.Lock() + + def __new__(cls): + if cls._instance is None: + cls._instance = super().__new__(cls) + return cls._instance 
+``` + +### Circuit Breaker Implementation +The circuit breaker prevents cascading failures with three states: +- **CLOSED**: Normal operation +- **OPEN**: Tripped after threshold failures +- **HALF_OPEN**: Testing recovery + +### Retry Mechanism +The retry mechanism uses exponential backoff: +- Configurable number of retries +- Initial delay with backoff factor +- Proper error propagation after final attempt + +## Performance Optimizations + +### Model Loading +- Asynchronous initialization to prevent blocking +- Single model instance shared across requests +- Backend auto-detection for optimal performance + +### Audio Processing +- Efficient tensor-to-WAV conversion +- Memory-efficient processing +- Proper normalization for audio quality + +### File Management +- Sequential file naming to prevent conflicts +- Automatic cleanup of old files +- Safe filename generation to prevent path traversal + +## Security Considerations + +### Input Validation +- Comprehensive request validation using Pydantic +- Character filtering for safe filenames +- Length restrictions to prevent abuse + +### File Security +- Path traversal prevention +- Safe character filtering for filenames +- Proper file permissions handling + +## Error Handling Strategy + +### Circuit Breaker Pattern +- Prevents repeated calls to failing services +- Automatic recovery after timeout +- State management for different failure scenarios + +### Retry Mechanism +- Exponential backoff for transient failures +- Configurable retry parameters +- Proper error propagation after final attempt + +### Graceful Degradation +- Fallback to CPU when CUDA not available +- Proper error responses for clients +- Comprehensive logging for debugging + +## Scalability Considerations + +### Current Limitations +- Single model instance (not multi-tenant) +- Sequential file naming (not distributed) + +### Potential Improvements +- Model instance pooling for higher throughput +- Distributed file naming for multi-server setups +- 
Caching for repeated requests + +## Technology Stack + +### Core Technologies +- **Python 3.10+**: Primary programming language +- **FastAPI**: High-performance web framework +- **Pydantic**: Data validation and settings management +- **Torch**: Deep learning framework +- **Uvicorn**: ASGI server for FastAPI + +### Additional Libraries +- **NumPy**: Numerical operations +- **SciPy**: Scientific computing (audio processing) +- **aiohttp**: For client-side testing + +## Development Patterns + +### Async/Sync Considerations +- Async endpoints for non-blocking operations +- Thread pool execution for model loading +- Proper async/await patterns throughout + +### Logging Strategy +- Structured logging with appropriate levels +- Contextual information for debugging +- Performance monitoring through logs + +### Testing Approach +- Unit tests for individual components +- Integration tests for API functionality +- Error condition testing \ No newline at end of file diff --git a/soprano/server/docs/configuration.md b/soprano/server/docs/configuration.md new file mode 100644 index 0000000..b8586d3 --- /dev/null +++ b/soprano/server/docs/configuration.md @@ -0,0 +1,71 @@ +# Configuration and Setup + +## System Requirements + +### Hardware Requirements +- **CPU**: Modern multi-core processor +- **GPU**: NVIDIA GPU with CUDA support (optional but recommended) +- **RAM**: 8GB minimum, 16GB+ recommended +- **Storage**: Sufficient space for model files and audio output + +### Software Requirements +- **Operating System**: Windows, macOS, or Linux +- **Python**: Version 3.10 or higher +- **CUDA**: Version 11.3 or higher (for GPU acceleration) + +## Installation + +### Prerequisites +1. Install Python 3.10 or higher +2. Install pip package manager + +### Installation Steps +1. Install the Soprano TTS package: + ``` + pip install soprano-tts + ``` + +2. 
The installation will automatically handle all dependencies + +## Running the API Server + +### Starting the Server +To start the API server, execute: +``` +python soprano\server\api.py +``` + +The server will: +- Detect available hardware (CUDA/CPU) +- Load the TTS model +- Start on `http://localhost:8000` +- Initialize file cleanup processes + +### Environment Variables +The API does not require specific environment variables, but you can configure: + +- **CUDA_VISIBLE_DEVICES**: To specify which GPU(s) to use +- **TORCH_DEVICE**: To force a specific device (though the API will auto-detect) + +## Configuration Options + +### Model Configuration +The API uses the Soprano-80M model by default, which will be downloaded automatically on first use. + +### File Output Configuration +- **Output Directory**: `audio_output` (created automatically) +- **File Naming**: Sequential (output_1.wav, output_2.wav, etc.) +- **File Cleanup**: Automatic cleanup of files older than 24 hours + +## Performance Optimization + +### GPU Usage +The API automatically detects and uses CUDA when available: +- Prioritizes GPU for faster processing +- Falls back to CPU if GPU is not available +- Uses appropriate backend (lmdeploy for GPU, transformers for CPU) + +### Memory Management +- Model loaded once at startup +- Efficient tensor processing +- Automatic cleanup of temporary resources \ No newline at end of file diff --git a/soprano/server/docs/endpoints.md b/soprano/server/docs/endpoints.md new file mode 100644 index 0000000..8ac0b8d --- /dev/null +++ b/soprano/server/docs/endpoints.md @@ -0,0 +1,100 @@ +# API Endpoints + +## Base URL +All endpoints are relative to `http://localhost:8000` + +## Available Endpoints + +### Root Endpoint +- **URL**: `GET /` +- **Description**: Provides information about the Soprano TTS API +- **Response**: + ```json + { + "message": "Soprano TTS API", + "version": "1.0.0", + "description": "Ultra-realistic Text-to-Speech API based on Soprano model", + 
"endpoints": { + "tts": "/v1/audio/speech", + "health": "/health" + } + } + ``` + +### Text-to-Speech Generation +- **URL**: `POST /v1/audio/speech` +- **Description**: Generate speech from input text following OpenAI's Speech endpoint format +- **Request Body**: + ```json + { + "input": "string (required, min 1, max 1000 characters)", + "model": "string (optional)", + "voice": "string (optional)", + "response_format": "string (optional, default: 'wav')", + "temperature": "float (optional, default: 0.3)", + "top_p": "float (optional, default: 0.95)", + "repetition_penalty": "float (optional, default: 1.2)" + } + ``` +- **Response**: WAV audio file +- **Headers**: + - `Content-Disposition: attachment; filename="output_N.wav"` + - `Content-Length: {size}` + +### Health Check +- **URL**: `GET /health` +- **Description**: Check if the server and TTS model are running properly +- **Response**: + ```json + { + "status": "healthy", + "device": "cuda" or "cpu" + } + ``` + +### API Documentation +- **URL**: `GET /docs` +- **Description**: Interactive API documentation with Swagger UI + +- **URL**: `GET /redoc` +- **Description**: Alternative API documentation with ReDoc + +- **URL**: `GET /openapi.json` +- **Description**: OpenAPI schema specification + +## Request Examples + +### cURL Example +```bash +curl -X POST "http://localhost:8000/v1/audio/speech" \ + -H "Content-Type: application/json" \ + -d '{ + "input": "Hello world, this is a test of the Soprano TTS system.", + "temperature": 0.3, + "top_p": 0.95, + "repetition_penalty": 1.2 + }' \ + --output output.wav +``` + +### Python Example +```python +import requests + +url = "http://localhost:8000/v1/audio/speech" +payload = { + "input": "Hello world, this is a test of the Soprano TTS system.", + "temperature": 0.3, + "top_p": 0.95, + "repetition_penalty": 1.2 +} + +response = requests.post(url, json=payload) + +if response.status_code == 200: + with open("output.wav", "wb") as f: + f.write(response.content) + 
print("Audio saved successfully") +else: + print(f"Request failed with status {response.status_code}") +``` \ No newline at end of file diff --git a/soprano/server/docs/errors_and_troubleshooting.md b/soprano/server/docs/errors_and_troubleshooting.md new file mode 100644 index 0000000..ace0438 --- /dev/null +++ b/soprano/server/docs/errors_and_troubleshooting.md @@ -0,0 +1,109 @@ +# Error Handling and Troubleshooting + +## Error Handling + +### HTTP Status Codes +The API returns standard HTTP status codes: + +- **200 OK**: Request successful +- **400 Bad Request**: Invalid request parameters +- **404 Not Found**: Endpoint not found +- **422 Unprocessable Entity**: Validation error in request body +- **500 Internal Server Error**: Server-side error +- **503 Service Unavailable**: Service temporarily unavailable + +### Common Error Scenarios + +#### 400 Bad Request +- **Cause**: Invalid characters in text that could lead to path traversal +- **Solution**: Remove special characters like '../', '/', or '\' from input text + +#### 422 Unprocessable Entity +- **Cause**: Validation errors in request body +- **Examples**: + - Empty input text + - Input text exceeding 1000 characters + - Invalid parameter types or values +- **Solution**: Ensure input meets validation requirements + +#### 503 Service Unavailable +- **Cause**: Circuit breaker triggered due to repeated failures +- **Solution**: Wait for recovery period (30 seconds by default) or restart service + +## Circuit Breaker Pattern + +The API implements a circuit breaker to prevent cascading failures: + +- **Threshold**: 3 consecutive failures +- **Recovery Timeout**: 30 seconds +- **States**: CLOSED (normal operation), OPEN (tripped), HALF_OPEN (testing recovery) + +## Retry Mechanism + +Transient failures are handled with a retry mechanism: + +- **Retries**: 2 attempts +- **Initial Delay**: 1 second +- **Backoff Factor**: 2 (exponential backoff) + +## Troubleshooting + +### Common Issues and Solutions + +#### Issue: 
"CUDA is not available, falling back to CPU" +- **Description**: GPU not detected or CUDA not properly installed +- **Solution**: + 1. Verify NVIDIA GPU is installed + 2. Install/update CUDA drivers + 3. Ensure CUDA version is 11.3 or higher + +#### Issue: "Connection refused" when accessing API +- **Description**: API server not running +- **Solution**: + 1. Ensure server is started with `python soprano\server\api.py` + 2. Check that port 8000 is available + 3. Verify firewall settings + +#### Issue: "Failed to save audio file" +- **Description**: Permission or disk space issues +- **Solution**: + 1. Verify write permissions to `audio_output` directory + 2. Check available disk space + 3. Ensure directory path is valid + +#### Issue: High memory usage +- **Description**: Model consuming excessive memory +- **Solution**: + 1. Monitor memory usage during operation + 2. Consider reducing concurrent requests + 3. Close other applications to free memory + +### Debugging Tips + +#### Enable Verbose Logging +The API uses INFO level logging by default. For more detailed debugging: +1. Modify the logging level in the source code if needed +2. Check the console output for detailed error messages + +#### Check Model Loading +If experiencing slow responses on first request: +1. Verify model download completed successfully +2. Check internet connectivity during initial model loading + +#### Monitor File System +To monitor file creation: +1. Watch the `audio_output` directory +2. Verify sequential file naming is working correctly +3. 
Check for any permission issues + +### Performance Monitoring + +#### API Response Times +- First request after startup may be slower due to model loading +- Subsequent requests should be faster +- Monitor for any degradation over time + +#### Resource Utilization +- CPU/GPU usage during processing +- Memory consumption +- Disk I/O for file operations \ No newline at end of file diff --git a/soprano/server/docs/index.md b/soprano/server/docs/index.md new file mode 100644 index 0000000..7f7e5c5 --- /dev/null +++ b/soprano/server/docs/index.md @@ -0,0 +1,44 @@ +# Soprano TTS API Documentation + +Welcome to the comprehensive documentation for the Soprano TTS API. This documentation provides detailed information about the API, its usage, configuration, and implementation details. + +## Table of Contents + +1. [Overview](overview.md) - Introduction to the Soprano TTS API +2. [API Endpoints](endpoints.md) - Detailed information about all API endpoints +3. [Configuration](configuration.md) - Setup and configuration instructions +4. [Usage Examples](usage_examples.md) - Practical examples and use cases +5. [Architecture](architecture.md) - Technical architecture and implementation details +6. [Error Handling & Troubleshooting](errors_and_troubleshooting.md) - Error handling and troubleshooting guide + +## Quick Start + +### Prerequisites +- Python 3.10 or higher +- CUDA-compatible GPU (optional, CPU fallback available) + +### Installation +``` +pip install soprano-tts +``` + +### Running the API Server +``` +python soprano\server\api.py +``` + +The server will start on `http://localhost:8000` and automatically detect available hardware. + +### Making Your First Request +``` +curl -X POST "http://localhost:8000/v1/audio/speech" \ + -H "Content-Type: application/json" \ + -d '{ + "input": "Hello world, this is a test of the Soprano TTS system." 
+ }' \ + --output output.wav +``` + +## Support + +For support, please refer to the troubleshooting section or create an issue in the project repository. \ No newline at end of file diff --git a/soprano/server/docs/overview.md b/soprano/server/docs/overview.md new file mode 100644 index 0000000..e425db4 --- /dev/null +++ b/soprano/server/docs/overview.md @@ -0,0 +1,43 @@ +# Soprano TTS API Documentation + +## Overview + +The Soprano TTS API is a high-performance text-to-speech service that converts text to realistic audio using advanced neural models. The API follows OpenAI's speech endpoint format for compatibility and ease of use. + +## Features + +- **High-Quality Audio**: Uses state-of-the-art neural models for realistic speech synthesis +- **GPU Acceleration**: Automatically utilizes CUDA when available for faster processing +- **Error Handling**: Comprehensive error handling with circuit breaker and retry mechanisms +- **File Management**: Automatic sequential file naming and cleanup of old files +- **OpenAI Compatible**: Follows OpenAI's speech endpoint format + +## Architecture + +The API is built using FastAPI and follows a modular architecture: + +- **API Layer**: FastAPI endpoints with request/response handling +- **Business Logic**: TTSManager with singleton pattern and resource management +- **Model Layer**: SopranoTTS with neural model integration +- **Utilities**: Audio processing, file management, and error handling components + +## Requirements + +- Python 3.10+ +- CUDA-compatible GPU (optional, CPU fallback available) +- Required Python packages (see pyproject.toml) + +## Installation + +1. Clone the repository +2. Install dependencies: `pip install soprano-tts` +3. Run the API server + +## API Server Execution + +To start the API server, run: +``` +python soprano\server\api.py +``` + +The server will start on `http://localhost:8000` and automatically detect CUDA availability. 
\ No newline at end of file diff --git a/soprano/server/docs/usage_examples.md b/soprano/server/docs/usage_examples.md new file mode 100644 index 0000000..8d367e2 --- /dev/null +++ b/soprano/server/docs/usage_examples.md @@ -0,0 +1,229 @@ +# Usage Examples + +## Basic Usage + +### Command Line Interface +The API can be tested using command line tools: + +#### Using cURL +```bash +# Basic text-to-speech conversion +curl -X POST "http://localhost:8000/v1/audio/speech" \ + -H "Content-Type: application/json" \ + -d '{ + "input": "Hello world, this is a test of the Soprano TTS system." + }' \ + --output output.wav + +# With custom parameters +curl -X POST "http://localhost:8000/v1/audio/speech" \ + -H "Content-Type: application/json" \ + -d '{ + "input": "This is a test with custom parameters.", + "temperature": 0.5, + "top_p": 0.9, + "repetition_penalty": 1.1 + }' \ + --output custom_output.wav +``` + +#### Using Python requests +```python +import requests + +# Basic request +url = "http://localhost:8000/v1/audio/speech" +payload = { + "input": "Hello world, this is a test of the Soprano TTS system." 
+} + +response = requests.post(url, json=payload) + +if response.status_code == 200: + with open("output.wav", "wb") as f: + f.write(response.content) + print("Audio saved successfully") +else: + print(f"Request failed with status {response.status_code}") + print(response.text) +``` + +## Advanced Usage + +### Custom Parameters +The API supports various parameters to customize the output: + +```python +import requests + +url = "http://localhost:8000/v1/audio/speech" +payload = { + "input": "This is a test with custom parameters.", + "temperature": 0.3, # Controls randomness (0.0-1.0) + "top_p": 0.95, # Controls diversity (0.0-1.0) + "repetition_penalty": 1.2 # Controls repetition (0.1-2.0) +} + +response = requests.post(url, json=payload) + +if response.status_code == 200: + with open("custom_output.wav", "wb") as f: + f.write(response.content) + print("Custom audio saved successfully") +``` + +### Batch Processing +To process multiple texts: + +```python +import requests +import time + +def process_texts(texts): + url = "http://localhost:8000/v1/audio/speech" + + for i, text in enumerate(texts): + payload = { + "input": text, + "temperature": 0.3, + "top_p": 0.95, + "repetition_penalty": 1.2 + } + + response = requests.post(url, json=payload) + + if response.status_code == 200: + filename = f"batch_output_{i+1}.wav" + with open(filename, "wb") as f: + f.write(response.content) + print(f"Saved {filename}") + else: + print(f"Failed to process text {i+1}: {response.status_code}") + + # Optional: Add delay between requests + time.sleep(1) + +# Example usage +texts = [ + "This is the first text.", + "This is the second text.", + "This is the third text." 
+] + +process_texts(texts) +``` + +## Integration Examples + +### Web Application Integration +Example of integrating with a web application: + +```python +from flask import Flask, request, send_file +import requests +import tempfile +import os + +app = Flask(__name__) + +@app.route('/tts', methods=['POST']) +def text_to_speech(): + data = request.json + text = data.get('text', '') + + if not text: + return {'error': 'Text is required'}, 400 + + # Call the Soprano TTS API + tts_url = "http://localhost:8000/v1/audio/speech" + payload = { + "input": text, + "temperature": 0.3, + "top_p": 0.95, + "repetition_penalty": 1.2 + } + + response = requests.post(tts_url, json=payload) + + if response.status_code == 200: + # Return the audio file + return send_file( + io.BytesIO(response.content), + mimetype='audio/wav', + as_attachment=True, + download_name='output.wav' + ) + else: + return {'error': 'TTS generation failed'}, response.status_code + +if __name__ == '__main__': + app.run(debug=True) +``` + +### Using with the Test Client +The provided `api_test.py` client (in `soprano/server/`) allows for interactive usage: + +1. Start the API server +2. Run `python soprano/server/api_test.py` +3. Enter text when prompted +4. Check the audio_output folder for generated files + +## File Management + +### Output Files +- Files are saved in the `audio_output` directory +- Files use sequential naming: `output_1.wav`, `output_2.wav`, etc. +- Old files are automatically cleaned up after 24 hours + +### File Access +Generated files can be accessed directly from the `audio_output` directory or through the API response. 
+ +## Best Practices + +### Input Validation +Always validate input text: +- Ensure text is not empty +- Keep text under 1000 characters +- Avoid special characters that might cause path traversal issues + +### Error Handling +Implement proper error handling in your client applications: + +```python +import requests + +def safe_tts_request(text): + url = "http://localhost:8000/v1/audio/speech" + payload = {"input": text} + + try: + response = requests.post(url, json=payload, timeout=60) + + if response.status_code == 200: + return response.content + elif response.status_code == 422: + print(f"Validation error: {response.text}") + return None + elif response.status_code == 503: + print("Service temporarily unavailable") + return None + else: + print(f"Request failed with status {response.status_code}") + return None + except requests.exceptions.RequestException as e: + print(f"Request error: {e}") + return None + +# Example usage +audio_data = safe_tts_request("Hello world") +if audio_data: + with open("output.wav", "wb") as f: + f.write(audio_data) + print("Audio saved successfully") +``` + +### Performance Considerations +- The first request after startup may take longer due to model loading +- Subsequent requests will be faster +- Consider the computational requirements for longer texts +- Monitor resource usage during heavy usage \ No newline at end of file diff --git a/soprano/soprano_cli.py b/soprano/soprano_cli.py deleted file mode 100644 index e600bdd..0000000 --- a/soprano/soprano_cli.py +++ /dev/null @@ -1,38 +0,0 @@ -#!/usr/bin/env python3 -""" -Soprano TTS Command Line Interface -""" -import argparse -import os -from soprano import SopranoTTS - -def main(): - parser = argparse.ArgumentParser(description='Soprano Text-to-Speech CLI') - parser.add_argument('text', help='Text to synthesize') - parser.add_argument('--output', '-o', default='output.wav', help='Output audio file path') - parser.add_argument('--model-path', '-m', help='Path to local model 
directory (optional)') - parser.add_argument('--device', '-d', default='cpu', choices=['cuda', 'cpu'], - help='Device to use for inference') - parser.add_argument('--backend', '-b', default='auto', - choices=['auto', 'transformers', 'lmdeploy'], - help='Backend to use for inference') - parser.add_argument('--cache-size', '-c', type=int, default=10, - help='Cache size in MB (for lmdeploy backend)') - - args = parser.parse_args() - - # Initialize TTS - tts = SopranoTTS( - backend=args.backend, - device=args.device, - cache_size_mb=args.cache_size, - model_path=args.model_path - ) - - # Generate speech - print(f"Generating speech for: '{args.text}'") - tts.infer(args.text, out_path=args.output) - print(f"Audio saved to: {args.output}") - -if __name__ == "__main__": - main() \ No newline at end of file From 90264ba00aea168bb7b8885ad00e2287bba5355e Mon Sep 17 00:00:00 2001 From: Biswas Poudel <142727455+biswas445@users.noreply.github.com> Date: Fri, 9 Jan 2026 13:07:50 +0545 Subject: [PATCH 02/27] Add initial README.md for Soprano TTS project Added detailed documentation for Soprano TTS including features, installation, usage examples, architecture, future roadmap, contributing guidelines, and license information. --- README.md | 128 ++++++++++++++++++++++++++++++++++++++++++++++++++++++ 1 file changed, 128 insertions(+) create mode 100644 README.md diff --git a/README.md b/README.md new file mode 100644 index 0000000..314b7d6 --- /dev/null +++ b/README.md @@ -0,0 +1,128 @@ +
+ +# Soprano TTS + +**Ultra-realistic Text-to-Speech System** + +[![License](https://img.shields.io/github/license/ekwek1/soprano)](LICENSE) +[![Python Version](https://img.shields.io/pypi/pyversions/soprano-tts)](https://pypi.org/project/soprano-tts/) +[![PyPI Version](https://img.shields.io/pypi/v/soprano-tts)](https://pypi.org/project/soprano-tts/) +[![GitHub](https://img.shields.io/badge/GitHub-Original%20Repo-blue?logo=github)](https://github.com/ekwek1/soprano) +[![Hugging Face](https://img.shields.io/badge/%F0%9F%A4%97-Hugging%20Face-orange)](https://huggingface.co/ekwek1/soprano) +[![Gradio](https://img.shields.io/badge/Demo-Live%20on%20HF-green)](https://huggingface.co/spaces/ekwek1/soprano) + +*Soprano delivers high-quality, natural-sounding speech synthesis with minimal latency using cutting-edge deep learning techniques.* + +
+ +## Key Features + +- **High-fidelity audio** - Crystal-clear speech synthesis +- **GPU acceleration** - Support for both CPU and CUDA +- **Multiple backends** - Transformers & LMDeploy with auto-selection +- **REST API** - Easy HTTP integration +- **Interactive CLI** - Command-line interface for quick usage +- **Streaming support** - Real-time audio generation capabilities + +## Installation + +```bash +pip install soprano-tts +``` + +## Quick Start + +### Using the CLI + +```bash +# Launch the interactive CLI +soprano + +# Customize backend and cache size +soprano --backend transformers --cache-size 50 +``` + +### Using the Python API + +```python +from soprano import SopranoTTS + +# Initialize the TTS model +tts = SopranoTTS(device='cuda') # Use 'cpu' if CUDA is not available + +# Generate speech +audio = tts.infer("Hello, welcome to Soprano TTS!") + +# Save to file +tts.infer("Hello world!", out_path="output.wav") +``` + +### Using the REST API + +```bash +# Start the API server +cd soprano/server +python api.py + +# Make requests to the API +curl -X POST http://localhost:8000/v1/audio/speech \ + -H "Content-Type: application/json" \ + -d '{"input": "Hello, this is Soprano TTS!", "model": "soprano"}' \ + --output output.wav +``` + +## Architecture + +Soprano TTS is built with a modular architecture: + +- **Core Engine** - Advanced synthesis engine for text processing +- **Adaptive Backends** - Multiple inference options with smart selection +- **Custom Decoder** - Specialized vocoder for high-quality audio +- **RESTful API** - FastAPI-powered HTTP interface +- **Interactive CLI** - User-friendly command-line experience + +## Future Roadmap + +We're constantly enhancing Soprano TTS with innovative features: + +### WebUI for Visual Voice Management + +An intuitive web-based interface for comprehensive voice control: + +- **Visual Parameter Adjustment** - Drag-and-drop controls for pitch, speed, and tone +- **Real-time Previews** - Instant playback of voice 
modifications +- **Profile Management** - Save and share custom voice configurations +- **Advanced Editing Tools** - Format and segment text with ease +- **Batch Processing** - Handle multiple text inputs simultaneously +- **Multi-format Export** - Download in various audio formats + +### WebSocket Integration for Real-Time Streaming + +Low-latency audio streaming for interactive applications: + +- **Ultra-low Latency** - Sub-millisecond response times for live streaming +- **Bidirectional Communication** - Full-duplex interaction capabilities +- **Streaming Synthesis** - Continuous audio generation for long texts +- **Real-time Feedback** - Dynamic adjustments during playback +- **Optimized Buffering** - Consistent quality with intelligent caching +- **Resilient Connections** - Automatic recovery from network interruptions + +These enhancements will extend the existing REST API in `soprano/server/api.py`, providing both traditional HTTP endpoints and real-time streaming capabilities for diverse use cases. + +## Contributing + +We welcome contributions! Please see our contributing guidelines for details on how to participate in the project. + +## License + +This project is licensed under the terms specified in the [LICENSE](LICENSE) file. + +--- + +
+ +For the open-source community + +[GitHub](https://github.com/ekwek1/soprano) • [Issues](https://github.com/ekwek1/soprano/issues) + +
From 6a80f544c603ae5eb99de08c41f1ba65be46868c Mon Sep 17 00:00:00 2001 From: Biswas Poudel <142727455+biswas445@users.noreply.github.com> Date: Fri, 9 Jan 2026 13:18:11 +0545 Subject: [PATCH 03/27] Update README.md --- README.md | 73 +++---------------------------------------------------- 1 file changed, 4 insertions(+), 69 deletions(-) diff --git a/README.md b/README.md index 314b7d6..1290ec2 100644 --- a/README.md +++ b/README.md @@ -24,52 +24,7 @@ - **Interactive CLI** - Command-line interface for quick usage - **Streaming support** - Real-time audio generation capabilities -## Installation -```bash -pip install soprano-tts -``` - -## Quick Start - -### Using the CLI - -```bash -# Launch the interactive CLI -soprano - -# Customize backend and cache size -soprano --backend transformers --cache-size 50 -``` - -### Using the Python API - -```python -from soprano import SopranoTTS - -# Initialize the TTS model -tts = SopranoTTS(device='cuda') # Use 'cpu' if CUDA is not available - -# Generate speech -audio = tts.infer("Hello, welcome to Soprano TTS!") - -# Save to file -tts.infer("Hello world!", out_path="output.wav") -``` - -### Using the REST API - -```bash -# Start the API server -cd soprano/server -python api.py - -# Make requests to the API -curl -X POST http://localhost:8000/v1/audio/speech \ - -H "Content-Type: application/json" \ - -d '{"input": "Hello, this is Soprano TTS!", "model": "soprano"}' \ - --output output.wav -``` ## Architecture @@ -82,32 +37,12 @@ Soprano TTS is built with a modular architecture: - **Interactive CLI** - User-friendly command-line experience ## Future Roadmap +- [ ] WebSocket for real-time streaming and audio playback. 
+- [ ] Web UI for User Interaction +- [ ] LLM Intregation -We're constantly enhancing Soprano TTS with innovative features: - -### WebUI for Visual Voice Management - -An intuitive web-based interface for comprehensive voice control: - -- **Visual Parameter Adjustment** - Drag-and-drop controls for pitch, speed, and tone -- **Real-time Previews** - Instant playback of voice modifications -- **Profile Management** - Save and share custom voice configurations -- **Advanced Editing Tools** - Format and segment text with ease -- **Batch Processing** - Handle multiple text inputs simultaneously -- **Multi-format Export** - Download in various audio formats - -### WebSocket Integration for Real-Time Streaming - -Low-latency audio streaming for interactive applications: - -- **Ultra-low Latency** - Sub-millisecond response times for live streaming -- **Bidirectional Communication** - Full-duplex interaction capabilities -- **Streaming Synthesis** - Continuous audio generation for long texts -- **Real-time Feedback** - Dynamic adjustments during playback -- **Optimized Buffering** - Consistent quality with intelligent caching -- **Resilient Connections** - Automatic recovery from network interruptions -These enhancements will extend the existing REST API in `soprano/server/api.py`, providing both traditional HTTP endpoints and real-time streaming capabilities for diverse use cases. +These features will build upon the recently added REST API located in `soprano/server/api.py`. 
## Contributing From b8a8fa5d30075246546fa2f15134ecb0b61b4288 Mon Sep 17 00:00:00 2001 From: Biswas Poudel <142727455+biswas445@users.noreply.github.com> Date: Fri, 9 Jan 2026 20:16:00 +0545 Subject: [PATCH 04/27] Update pyproject.toml --- pyproject.toml | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/pyproject.toml b/pyproject.toml index 9b97140..ed27cd3 100644 --- a/pyproject.toml +++ b/pyproject.toml @@ -39,4 +39,4 @@ Homepage = "https://github.com/ekwek1/soprano" Issues = "https://github.com/ekwek1/soprano/issues" [project.scripts] -soprano = "soprano.soprano_cli:main" \ No newline at end of file +soprano = "soprano.soprano_cli:main" From 5b6ceaddd7cb6b78c042c34dd4bc92815ae04299 Mon Sep 17 00:00:00 2001 From: Biswas Poudel <142727455+biswas445@users.noreply.github.com> Date: Sat, 10 Jan 2026 10:14:27 +0545 Subject: [PATCH 05/27] Fix merge conflict in pyproject.toml Resolved merge conflict in dependencies section. --- pyproject.toml | 9 ++------- 1 file changed, 2 insertions(+), 7 deletions(-) diff --git a/pyproject.toml b/pyproject.toml index ed27cd3..1cea9c3 100644 --- a/pyproject.toml +++ b/pyproject.toml @@ -23,14 +23,9 @@ dependencies = [ "scipy", "torch", "unidecode", -<<<<<<< HEAD "inflect", - "scipy", - "sounddevice" -======= - "uvicorn", - "inflect" ->>>>>>> upstream/main + "sounddevice", + "uvicorn" ] license = {file = "LICENSE"} From 0c177686284d29fd0983d119b99b5d16fdd4e01e Mon Sep 17 00:00:00 2001 From: biswas445 Date: Sat, 10 Jan 2026 15:53:17 +0545 Subject: [PATCH 06/27] Initial commit: Complete Soprano TTS implementation with API and WebSocket streaming --- CLI/CLI_DOCUMENTATION.md | 159 -------- CLI/__init__.py | 1 + README.md | 74 ++-- Soprano.bat | 110 +++++- pyproject.toml | 8 +- soprano/server/README.md | 30 ++ soprano/server/Run_API.bat | 17 + soprano/server/Run_WebSocket.bat | 29 ++ soprano/server/api.py | 130 ++----- soprano/server/api_test.py | 70 ---- soprano/server/docs/api_readme.md | 70 ++++ 
soprano/server/docs/architecture.md | 158 -------- soprano/server/docs/configuration.md | 71 ---- soprano/server/docs/endpoints.md | 100 ------ .../server/docs/errors_and_troubleshooting.md | 109 ------ soprano/server/docs/index.md | 44 --- soprano/server/docs/overview.md | 43 --- soprano/server/docs/test_readme.md | 42 +++ soprano/server/docs/usage_examples.md | 229 ------------ soprano/server/docs/websocket_readme.md | 100 ++++++ soprano/server/test_api.py | 138 +++++++ soprano/server/test_websocket.py | 81 +++++ soprano/server/websocket.py | 338 ++++++++++++++++++ soprano/tts.py | 18 +- 24 files changed, 1022 insertions(+), 1147 deletions(-) delete mode 100644 CLI/CLI_DOCUMENTATION.md create mode 100644 CLI/__init__.py create mode 100644 soprano/server/README.md create mode 100644 soprano/server/Run_API.bat create mode 100644 soprano/server/Run_WebSocket.bat delete mode 100644 soprano/server/api_test.py create mode 100644 soprano/server/docs/api_readme.md delete mode 100644 soprano/server/docs/architecture.md delete mode 100644 soprano/server/docs/configuration.md delete mode 100644 soprano/server/docs/endpoints.md delete mode 100644 soprano/server/docs/errors_and_troubleshooting.md delete mode 100644 soprano/server/docs/index.md delete mode 100644 soprano/server/docs/overview.md create mode 100644 soprano/server/docs/test_readme.md delete mode 100644 soprano/server/docs/usage_examples.md create mode 100644 soprano/server/docs/websocket_readme.md create mode 100644 soprano/server/test_api.py create mode 100644 soprano/server/test_websocket.py create mode 100644 soprano/server/websocket.py diff --git a/CLI/CLI_DOCUMENTATION.md b/CLI/CLI_DOCUMENTATION.md deleted file mode 100644 index b24e641..0000000 --- a/CLI/CLI_DOCUMENTATION.md +++ /dev/null @@ -1,159 +0,0 @@ -# Soprano TTS CLI Documentation - -## Overview - -Soprano TTS is an ultra-realistic text-to-speech system that generates high-quality audio from text input. 
The CLI provides an interactive interface to utilize the Soprano TTS engine with customizable voice parameters for naturalistic speech synthesis. - -## Features - -- Interactive menu-driven interface -- Real-time audio playback without file saving -- File-based audio generation with customizable output paths -- Adjustable voice parameters for naturalistic speech -- Automatic device selection (CUDA fallback to CPU) -- Progress indicators during audio playback - -## Installation - -```bash -pip install soprano-tts -``` - -## Usage - -Run the CLI with default settings: - -```bash -python soprano_cli.py -``` - -With optional parameters: - -```bash -python soprano_cli.py --model-path /path/to/model --backend auto --cache-size 10 -``` - -### Command Line Arguments - -- `--model-path` or `-m`: Path to local model directory (optional, defaults to Hugging Face model) -- `--backend`: Backend to use for inference (options: auto, transformers, lmdeploy; default: auto) -- `--cache-size` or `-c`: Cache size in MB for lmdeploy backend (default: 10) - -## Interactive Menu Options - -### Option 1: Input Text for Synthesis (with file saving) - -Generates audio from input text and saves it to a WAV file in the `audio_output` directory. The system automatically creates this directory if it doesn't exist and uses incremental naming: -- First file: `output_audio.wav` -- Second file: `output_audio1.wav` -- And so on... - -### Option 2: Real-time Audio Playback (no file saving) - -Generates audio from input text and plays it directly without saving to disk. This option: -- Generates audio in real-time -- Plays audio through system speakers -- Waits for complete playback before returning to menu - -### Option 3: View Saved Audio Files - -Displays a list of all audio files saved in the `audio_output` directory with their filenames. - -### Option 4: Exit - -Terminates the CLI application. 
- -## Visual Pipeline - -``` -┌─────────────────────────────────────────────────────────────────┐ -│ Soprano TTS CLI │ -├─────────────────────────────────────────────────────────────────┤ -│ ┌─────────────────┐ ┌─────────────────┐ ┌─────────────────┐ │ -│ │ User Input │ │ Model Load │ │ Device Check │ │ -│ │ & Validation │ │ & Init │ │ & Select │ │ -│ └─────────────────┘ └─────────────────┘ └─────────────────┘ │ -│ │ │ │ │ -│ ▼ ▼ ▼ │ -│ ┌─────────────────────────────────────────────────────────────┤ -│ │ Main Menu Loop │ -│ │ ┌─────────────────┐ ┌─────────────────┐ ┌─────────────┐ │ -│ │ │ Option 1: Save │ │ Option 2: Play │ │ Option 3: │ │ -│ │ │ to File │ │ to Speaker │ │View Audio │ │ -│ │ └─────────────────┘ └─────────────────┘ │ Files │ │ -│ └─────────────────────────────────────────────────────────────┤ -│ │ │ │ │ -│ ▼ ▼ ▼ │ -│ ┌─────────────────┐ ┌─────────────────┐ ┌─────────────────┤ -│ │ Text Pre- │ │ Text Pre- │ │ List │ -│ │ processing │ │ processing │ │ Audio │ -│ └─────────────────┘ └─────────────────┘ │ Files │ -│ │ │ └─────────────────┤ -│ ▼ ▼ │ │ -│ ┌─────────────────┐ ┌─────────────────┐ │ │ -│ │ Model Inference│ │ Model Inference│ │ │ -│ │ (Generate │ │ (Generate │ │ │ -│ │ Audio Data) │ │ Audio Data) │ │ │ -│ └─────────────────┘ └─────────────────┘ │ │ -│ │ │ │ │ -│ ▼ ▼ │ │ -│ ┌─────────────────┐ ┌─────────────────┐ │ │ -│ │ Save to │ │ Audio │ │ │ -│ │ File (.wav) │ │ Playback │ │ │ -│ │in audio_output │ │(real-time) │ │ │ -│ │ directory │ │ │ │ │ -│ └─────────────────┘ └─────────────────┘ │ │ -│ │ │ -│ ▼ │ -│ ┌─────────────────┤ -│ │ Return to │ -│ │ Main Menu │ -│ └─────────────────┤ -└─────────────────────────────────────────────────────────────────┘ -``` - -## Voice Characteristics - -The system uses optimized default parameters for naturalistic speech: - -- **Temperature**: 0.7 (provides natural variation and creativity) -- **Top-p**: 0.9 (balances coherent speech with natural variation) -- **Repetition Penalty**: 1.05 (minimizes 
repetition while maintaining quality) - -These parameters are built-in and optimized for the most natural, human-like voice output. - -## Technical Details - -### Audio Specifications -- Sample Rate: 32,000 Hz -- Format: WAV (for saved files) -- Real-time playback through system audio - -### Model Architecture -- Uses Soprano-80M model by default -- Vocos-based decoder for high-quality audio synthesis -- Support for both LMDeploy and Transformers backends - -### Supported Platforms -- Windows, macOS, Linux -- CUDA-compatible GPUs (recommended) or CPU -- Python 3.10+ - -## Troubleshooting - -### Common Issues: - -1. **No audio output**: Ensure `sounddevice` is installed: - ```bash - pip install sounddevice - ``` - -2. **CUDA unavailable**: The system will automatically fall back to CPU - -3. **Long text processing**: Text is limited to 1000 characters per input - -4. **Model loading errors**: Check internet connection for downloading models from Hugging Face - -## License - -This project is licensed under the terms specified in the LICENSE file. \ No newline at end of file diff --git a/CLI/__init__.py b/CLI/__init__.py new file mode 100644 index 0000000..e0e4938 --- /dev/null +++ b/CLI/__init__.py @@ -0,0 +1 @@ +"""CLI package for Soprano.""" \ No newline at end of file diff --git a/README.md b/README.md index 1290ec2..5324961 100644 --- a/README.md +++ b/README.md @@ -1,63 +1,47 @@ -
- # Soprano TTS -**Ultra-realistic Text-to-Speech System** - -[![License](https://img.shields.io/github/license/ekwek1/soprano)](LICENSE) -[![Python Version](https://img.shields.io/pypi/pyversions/soprano-tts)](https://pypi.org/project/soprano-tts/) -[![PyPI Version](https://img.shields.io/pypi/v/soprano-tts)](https://pypi.org/project/soprano-tts/) -[![GitHub](https://img.shields.io/badge/GitHub-Original%20Repo-blue?logo=github)](https://github.com/ekwek1/soprano) -[![Hugging Face](https://img.shields.io/badge/%F0%9F%A4%97-Hugging%20Face-orange)](https://huggingface.co/ekwek1/soprano) -[![Gradio](https://img.shields.io/badge/Demo-Live%20on%20HF-green)](https://huggingface.co/spaces/ekwek1/soprano) +Soprano is an ultra-realistic Text-to-Speech system that provides both REST API and WebSocket streaming capabilities. -*Soprano delivers high-quality, natural-sounding speech synthesis with minimal latency using cutting-edge deep learning techniques.* +## Features -
+- **High Quality Audio**: Generates ultra-realistic speech using advanced TTS models +- **Multiple Interfaces**: REST API and WebSocket streaming options +- **OpenAI Compatible**: Follows OpenAI's speech endpoint format +- **Real-time Streaming**: WebSocket support for real-time audio streaming +- **Configurable Parameters**: Supports temperature, top_p, repetition_penalty, and min_text_length controls -## Key Features +## Components -- **High-fidelity audio** - Crystal-clear speech synthesis -- **GPU acceleration** - Support for both CPU and CUDA -- **Multiple backends** - Transformers & LMDeploy with auto-selection -- **REST API** - Easy HTTP integration -- **Interactive CLI** - Command-line interface for quick usage -- **Streaming support** - Real-time audio generation capabilities +- **API Server**: RESTful API with OpenAI-compatible endpoints +- **WebSocket Server**: Real-time audio streaming via WebSocket +- **CLI Interface**: Interactive command-line interface +- **Test Clients**: Dedicated test clients for both API and WebSocket +## Quick Start +### Using the Launcher +Run `Soprano.bat` to access the main menu with options to launch any component. -## Architecture +### API Server +Start the API server and send requests to `http://localhost:8000/v1/audio/speech` -Soprano TTS is built with a modular architecture: +### WebSocket Server +Start the WebSocket server and connect to `ws://localhost:8001/ws/tts` -- **Core Engine** - Advanced synthesis engine for text processing -- **Adaptive Backends** - Multiple inference options with smart selection -- **Custom Decoder** - Specialized vocoder for high-quality audio -- **RESTful API** - FastAPI-powered HTTP interface -- **Interactive CLI** - User-friendly command-line experience +## Endpoints -## Future Roadmap -- [ ] Web Socket for real time streaming and audio playback. 
-- [ ] Web UI for User Interaction -- [ ] LLM Intregation +### API +- `POST /v1/audio/speech` - Generate speech from text +- `GET /health` - Health check endpoint +- `GET /` - Root endpoint with API information +### WebSocket +- `ws://localhost:8001/ws/tts` - Real-time TTS streaming -These features will build upon the recently added REST API located in `soprano/server/api.py`. +## Integration -## Contributing - -We welcome contributions! Please see our contributing guidelines for details on how to participate in the project. +The API is designed for easy integration with workflow automation platforms like n8n, Zapier, and other systems that can make HTTP requests. ## License -This project is licensed under the terms specified in the [LICENSE](LICENSE) file. - ---- - -
- -For the open-source community - -[GitHub](https://github.com/ekwek1/soprano) • [Issues](https://github.com/ekwek1/soprano/issues) - -
+Licensed under the Apache License 2.0 - see the [LICENSE](LICENSE) file for details. \ No newline at end of file diff --git a/Soprano.bat b/Soprano.bat index d3a3f90..d04ca0d 100644 --- a/Soprano.bat +++ b/Soprano.bat @@ -1,2 +1,108 @@ -cd CLI -python soprano_cli.py \ No newline at end of file +@echo off +title Soprano TTS Launcher +color 0A + +:menu +cls +echo ================================================ +echo SOPRANO TTS LAUNCHER +echo ================================================ +echo. +echo Select an option: +echo. +echo 1. Launch API Server +echo 2. Launch CLI Interface +echo 3. Launch API Test Client +echo 4. Launch WebSocket Server +echo 5. Launch WebSocket Test Client +echo 6. Exit +echo. +set /p choice="Enter your choice (1-6): " + +if "%choice%"=="1" goto api_server +if "%choice%"=="2" goto cli +if "%choice%"=="3" goto api_test +if "%choice%"=="4" goto websocket_server +if "%choice%"=="5" goto websocket_test +if "%choice%"=="6" goto exit +goto invalid_choice + +:invalid_choice +echo. +echo Invalid choice. Please enter a number from 1 to 6. +timeout /t 2 /nobreak >nul +goto menu + +:api_server +echo. +echo Starting Soprano TTS API Server... +echo. +if exist venv\Scripts\activate.bat ( + call venv\Scripts\activate.bat +) else if exist env\Scripts\activate.bat ( + call env\Scripts\activate.bat +) +cd soprano\server +call Run_API.bat +cd ..\.. +goto end + +:cli +echo. +echo Starting Soprano TTS CLI Interface... +echo. +if exist venv\Scripts\activate.bat ( + call venv\Scripts\activate.bat +) else if exist env\Scripts\activate.bat ( + call env\Scripts\activate.bat +) +python -m CLI.soprano_cli +goto end + +:api_test +echo. +echo Starting Soprano TTS API Test Client... +echo. +if exist venv\Scripts\activate.bat ( + call venv\Scripts\activate.bat +) else if exist env\Scripts\activate.bat ( + call env\Scripts\activate.bat +) +cd soprano\server +call Run_API_Test.bat "Hello, this is a test of the Soprano TTS API. The system is working properly." +cd ..\.. 
+goto end + +:websocket_server +echo. +echo Starting Soprano TTS WebSocket Server... +echo. +if exist venv\Scripts\activate.bat ( + call venv\Scripts\activate.bat +) else if exist env\Scripts\activate.bat ( + call env\Scripts\activate.bat +) +cd soprano\server +call Run_WebSocket.bat +cd ..\.. +goto end + +:websocket_test +echo. +echo Starting Soprano TTS WebSocket Test Client... +echo. +if exist venv\Scripts\activate.bat ( + call venv\Scripts\activate.bat +) else if exist env\Scripts\activate.bat ( + call env\Scripts\activate.bat +) +cd soprano\server +call Run_WebSocket_Test.bat "Hello, this is a test of the WebSocket TTS system. Audio is streaming in real-time." +cd ..\.. +goto end + +:exit +exit /b + +:end +pause \ No newline at end of file diff --git a/pyproject.toml b/pyproject.toml index 1cea9c3..64bcf28 100644 --- a/pyproject.toml +++ b/pyproject.toml @@ -27,11 +27,15 @@ dependencies = [ "sounddevice", "uvicorn" ] -license = {file = "LICENSE"} +license = {text = "MIT"} [project.urls] Homepage = "https://github.com/ekwek1/soprano" Issues = "https://github.com/ekwek1/soprano/issues" [project.scripts] -soprano = "soprano.soprano_cli:main" +soprano = "CLI.soprano_cli:main" + +[tool.setuptools.packages.find] +where = ["."] +include = ["soprano*", "CLI*"] diff --git a/soprano/server/README.md b/soprano/server/README.md new file mode 100644 index 0000000..4c1a9a5 --- /dev/null +++ b/soprano/server/README.md @@ -0,0 +1,30 @@ +# Soprano TTS Server + +This directory contains the server components for the Soprano TTS system, including the API server and test utilities. 
+ +## Components + +- **API Server** (`api.py`): OpenAI-compatible text-to-speech API server +- **WebSocket Server** (`websocket.py`): Real-time streaming TTS via WebSocket +- **API Test Client** (`test_api.py`): Test client for the API server +- **WebSocket Test Client** (`test_websocket.py`): Test client for the WebSocket server +- **Documentation**: README files explaining usage and integration + +## API Compatibility + +The API server implements OpenAI-compatible endpoints, making it easy to integrate with existing applications and services that expect OpenAI's speech API format. + +## Quick Start + +1. Start the API server: `python -m soprano.server.api` +2. Start the WebSocket server: `python -m soprano.server.websocket` +3. Test with the clients: `python -m soprano.server.test_api` or `python -m soprano.server.test_websocket` +4. Or use directly with HTTP requests to `http://localhost:8000/v1/audio/speech` + +## Integration Ready + +The server is designed for seamless integration with: +- Workflow automation tools (like n8n) +- Web and mobile applications +- Voice-enabled systems +- Any system capable of making HTTP requests or WebSocket connections \ No newline at end of file diff --git a/soprano/server/Run_API.bat b/soprano/server/Run_API.bat new file mode 100644 index 0000000..2e7a969 --- /dev/null +++ b/soprano/server/Run_API.bat @@ -0,0 +1,17 @@ +@echo off +title Soprano TTS API Server +echo Starting Soprano TTS API Server... +echo. 
+ +REM Activate virtual environment if present +if exist venv\Scripts\activate.bat ( + call venv\Scripts\activate.bat +) else if exist env\Scripts\activate.bat ( + call env\Scripts\activate.bat +) + +REM Start the API server with performance optimizations +echo Starting server on http://localhost:8000 +python -c "import uvicorn; import torch; print('Starting Soprano TTS API Server...'); device = 'CUDA (GPU)' if torch.cuda.is_available() else 'CPU'; print(f'Available device: {device}'); uvicorn.run('soprano.server.api:app', host='localhost', port=8000, workers=1, log_level='info')" + +pause \ No newline at end of file diff --git a/soprano/server/Run_WebSocket.bat b/soprano/server/Run_WebSocket.bat new file mode 100644 index 0000000..b8aa1ea --- /dev/null +++ b/soprano/server/Run_WebSocket.bat @@ -0,0 +1,29 @@ +@echo off +title Soprano TTS WebSocket Server +echo Starting Soprano TTS WebSocket Server... +echo. + +REM Activate virtual environment if present +if exist venv\Scripts\activate.bat ( + call venv\Scripts\activate.bat +) else if exist env\Scripts\activate.bat ( + call env\Scripts\activate.bat +) + +REM Start the WebSocket server +echo Starting WebSocket server on ws://localhost:8001/ws/tts +python -c " +import uvicorn +import torch +print('Starting Soprano TTS WebSocket Server...') +device = 'CUDA (GPU)' if torch.cuda.is_available() else 'CPU' +print(f'Available device: {device}') +uvicorn.run( + 'soprano.server.websocket:app', + host='localhost', + port=8001, + log_level='info' +) +" + +pause \ No newline at end of file diff --git a/soprano/server/api.py b/soprano/server/api.py index 0cabebc..909bb1a 100644 --- a/soprano/server/api.py +++ b/soprano/server/api.py @@ -1,7 +1,6 @@ import asyncio import io import logging -import os import time from typing import Optional, Dict, Any, AsyncGenerator import numpy as np @@ -95,8 +94,9 @@ class SpeechRequest(BaseModel): response_format: Optional[str] = Field("wav", description="Response format (only wav supported)") 
speed: Optional[float] = Field(None, ge=0.1, le=2.0, description="Speech speed (not implemented yet)") temperature: Optional[float] = Field(0.3, ge=0.0, le=1.0, description="Generation temperature") - top_p: Optional[float] = Field(0.95, ge=0.0, le=1.0, description="Top-p sampling parameter") + top_p: Optional[float] = Field(1.0, ge=0.0, le=1.0, description="Top-p sampling parameter") repetition_penalty: Optional[float] = Field(1.2, ge=0.1, le=2.0, description="Repetition penalty") + min_text_length: Optional[int] = Field(30, ge=1, le=1000, description="Minimum text length for processing (default 30)") class TTSManager: @@ -134,7 +134,7 @@ async def initialize_model(self): logger.info("Loading Soprano TTS model...") try: # Run model initialization in a thread pool to avoid blocking - loop = asyncio.get_event_loop() + loop = asyncio.get_running_loop() # Use retry mechanism for model loading def load_model(): @@ -160,7 +160,7 @@ def get_model(self) -> SopranoTTS: raise RuntimeError("TTS model not initialized. Call initialize_model() first.") return self.tts - def generate_audio(self, text: str, top_p: float, temperature: float, repetition_penalty: float): + def generate_audio(self, text: str, top_p: float, temperature: float, repetition_penalty: float, min_text_length: int = 30): """ Generate audio with circuit breaker protection and retry mechanism. 
""" @@ -169,7 +169,8 @@ def _generate(): text=text, top_p=top_p, temperature=temperature, - repetition_penalty=repetition_penalty + repetition_penalty=repetition_penalty, + min_text_length=min_text_length ) # Use circuit breaker to protect against repeated failures @@ -188,11 +189,6 @@ async def lifespan(app: FastAPI) -> AsyncGenerator[None, None]: tts_manager = TTSManager() await tts_manager.initialize_model() - # Perform initial cleanup of old files - output_dir = "audio_output" - os.makedirs(output_dir, exist_ok=True) - cleanup_old_files(output_dir) - logger.info("Soprano TTS API server started successfully") yield except Exception as e: @@ -202,7 +198,7 @@ async def lifespan(app: FastAPI) -> AsyncGenerator[None, None]: logger.info("Shutting down Soprano TTS API server...") -# Create FastAPI app with metadata +# Create FastAPI app with performance optimizations app = FastAPI( title="Soprano TTS API", description="Ultra-realistic Text-to-Speech API based on Soprano model", @@ -211,83 +207,29 @@ async def lifespan(app: FastAPI) -> AsyncGenerator[None, None]: "name": "Soprano TTS", "url": "https://github.com/ekwek1/soprano", }, - lifespan=lifespan + lifespan=lifespan, + # Performance optimizations + timeout=60, # Increase timeout for longer texts ) def _tensor_to_wav_bytes(tensor: Tensor) -> bytes: """ - Convert a 1D fp32 torch tensor to a WAV byte stream. + Convert a 1D fp32 torch tensor to a WAV byte stream efficiently. 
""" # Convert to numpy array audio_np = tensor.cpu().numpy() - # Normalize to int16 range if needed - if audio_np.dtype != np.int16: - # Ensure values are in the range [-1, 1] - audio_np = np.clip(audio_np, -1.0, 1.0) - # Convert to int16 - audio_np = (audio_np * 32767).astype(np.int16) + # Ensure values are in the range [-1, 1] and convert to int16 in one step + audio_np = np.clip(audio_np, -1.0, 1.0) + audio_np = (audio_np * 32767).astype(np.int16) - # Create in-memory WAV file + # Create in-memory WAV file directly without intermediate buffer wav_io = io.BytesIO() write(wav_io, 32000, audio_np) # 32kHz sample rate - wav_io.seek(0) - return wav_io.read() - + return wav_io.getvalue() # Use getvalue() instead of seek() + read() -def cleanup_old_files(directory: str, max_age_hours: int = 24, max_files: int = 100): - """ - Clean up old files in the specified directory to prevent unlimited growth. - Args: - directory: Directory to clean up - max_age_hours: Maximum age of files in hours - max_files: Maximum number of files to keep - """ - import time - try: - files = [] - for filename in os.listdir(directory): - filepath = os.path.join(directory, filename) - if os.path.isfile(filepath): - files.append((filepath, os.path.getctime(filepath))) - - # Sort by creation time (oldest first) - files.sort(key=lambda x: x[1]) - - current_time = time.time() - cutoff_time = current_time - (max_age_hours * 3600) - - # Remove files older than cutoff time - removed_count = 0 - for filepath, creation_time in files: - if creation_time < cutoff_time: - try: - os.remove(filepath) - removed_count += 1 - logger.info(f"Removed old file: {filepath}") - except OSError as e: - logger.error(f"Failed to remove old file {filepath}: {e}") - - # If still too many files, remove oldest ones beyond the limit - remaining_files = len(files) - removed_count - if remaining_files > max_files: - excess_count = remaining_files - max_files - for i in range(excess_count): - if i < len(files) - removed_count: - 
filepath = files[i][0] - try: - os.remove(filepath) - removed_count += 1 - logger.info(f"Removed excess file: {filepath}") - except OSError as e: - logger.error(f"Failed to remove excess file {filepath}: {e}") - - if removed_count > 0: - logger.info(f"Cleaned up {removed_count} old files from {directory}") - except Exception as e: - logger.error(f"Error during file cleanup: {e}") @@ -326,7 +268,8 @@ async def create_speech(request: SpeechRequest): text=request.input, top_p=request.top_p, temperature=request.temperature, - repetition_penalty=request.repetition_penalty + repetition_penalty=request.repetition_penalty, + min_text_length=request.min_text_length ) except Exception as e: logger.error(f"Circuit breaker or retry mechanism failed: {str(e)}", exc_info=True) @@ -338,41 +281,12 @@ async def create_speech(request: SpeechRequest): # Convert tensor to WAV bytes wav_bytes = _tensor_to_wav_bytes(audio_tensor) - # Create audio_output directory if it doesn't exist - output_dir = "audio_output" - os.makedirs(output_dir, exist_ok=True) - - # Generate unique filename with sequential numbering - file_counter = 1 - while True: - filename = f"output_{file_counter}.wav" - filepath = os.path.join(output_dir, filename) - if not os.path.exists(filepath): - break - file_counter += 1 - - # Prevent path traversal by ensuring filename is safe - if '..' in filename or '/' in filename or '\\' in filename: - raise HTTPException( - status_code=400, - detail="Invalid characters in text that could lead to path traversal." 
- ) - - # Save the audio file to the audio_output directory with error handling - try: - with open(filepath, 'wb') as f: - f.write(wav_bytes) - logger.info(f"Audio saved to: {filepath}") - except OSError as e: - logger.error(f"Failed to save audio file: {e}") - raise HTTPException( - status_code=500, - detail=f"Failed to save audio file: {str(e)}" - ) - logger.info(f"TTS generation completed successfully.") - # Return WAV response + # Generate a generic filename for the response + filename = "speech_output.wav" + + # Return WAV response directly to client without saving on server return Response( content=wav_bytes, media_type="audio/wav", diff --git a/soprano/server/api_test.py b/soprano/server/api_test.py deleted file mode 100644 index bba4cdf..0000000 --- a/soprano/server/api_test.py +++ /dev/null @@ -1,70 +0,0 @@ -import asyncio -import aiohttp -import time -from pathlib import Path - -async def send_tts_request(text): - """ - Send a TTS request to the API server with custom text - """ - base_url = "http://localhost:8000" - - payload = { - "input": text, - "temperature": 0.3, - "top_p": 0.95, - "repetition_penalty": 1.2 - } - - try: - async with aiohttp.ClientSession() as session: - async with session.post(f"{base_url}/v1/audio/speech", json=payload) as response: - status = response.status - if status == 200: - audio_content = await response.read() - - # The API already saves the file, so we just confirm success - print(f"Audio generated successfully. 
Check the audio_output folder for the file.") - return True - else: - error_text = await response.text() - print(f"Request failed with status {status}") - print(f"Error: {error_text}") - return False - except Exception as e: - print(f"Request failed with error: {e}") - return False - -async def main(): - print("Soprano TTS API Request Sender") - print("Make sure the API server is running on http://localhost:8000 before executing this.") - print() - - while True: - text = input("Enter text to convert to speech (or 'quit' to exit): ") - if text.lower() == 'quit': - break - - if not text.strip(): - print("Text cannot be empty. Please enter some text.") - continue - - print(f"Sending request with text: '{text[:50]}{'...' if len(text) > 50 else ''}'") - success = await send_tts_request(text) - - if success: - print("Request completed successfully!") - else: - print("Request failed!") - - print() - -if __name__ == "__main__": - # Check if required packages are available - try: - import aiohttp - except ImportError: - print("Error: aiohttp is not installed. Please install it with: pip install aiohttp") - exit(1) - - asyncio.run(main()) \ No newline at end of file diff --git a/soprano/server/docs/api_readme.md b/soprano/server/docs/api_readme.md new file mode 100644 index 0000000..4bbfa02 --- /dev/null +++ b/soprano/server/docs/api_readme.md @@ -0,0 +1,70 @@ +# Soprano TTS API + +The Soprano TTS API provides a high-quality, ultra-realistic text-to-speech service with OpenAI-compatible endpoints. This API allows you to convert text to natural-sounding speech using the Soprano model. 
+ +## Features + +- **OpenAI Compatible**: Follows OpenAI's speech endpoint format for easy integration +- **High Quality Audio**: Generates ultra-realistic speech using advanced TTS models +- **Configurable Parameters**: Supports temperature, top_p, repetition_penalty, and min_text_length controls +- **Fast Processing**: Model loaded once at startup for optimal performance +- **Production Ready**: Includes health checks and error handling + +## Endpoints + +### Generate Speech +- **URL**: `POST /v1/audio/speech` +- **Description**: Convert text to speech +- **Request Body**: + ```json + { + "input": "Text to synthesize (required, 1-1000 chars)", + "model": "Model to use (optional, ignored)", + "voice": "Voice to use (optional, ignored)", + "response_format": "Response format (optional, default: 'wav')", + "speed": "Speech speed (optional, not implemented)", + "temperature": "Generation temperature (optional, default: 0.3, range: 0.0-1.0)", + "top_p": "Top-p sampling parameter (optional, default: 1.0, range: 0.0-1.0)", + "repetition_penalty": "Repetition penalty (optional, default: 1.2, range: 0.1-2.0)", + "min_text_length": "Minimum text length for processing (optional, default: 30, range: 1-1000)" + } + ``` +- **Response**: WAV audio file as binary data + +### Health Check +- **URL**: `GET /health` +- **Description**: Check if the server and TTS model are running properly +- **Response**: Status and device information + +### Root Endpoint +- **URL**: `GET /` +- **Description**: API information and available endpoints + +## Integration + +This API is designed for easy integration with various systems including: +- Automation platforms (like n8n) +- Web applications +- Mobile applications +- Voice assistants +- Any system that can make HTTP requests + +## Performance + +- Model is loaded once at startup for optimal performance +- Efficient audio processing with minimal overhead +- Designed for concurrent requests with proper error handling + +## Usage Example + 
+```bash +curl -X POST http://localhost:8000/v1/audio/speech \ + -H "Content-Type: application/json" \ + -d '{ + "input": "Hello, this is a test.", + "temperature": 0.3, + "top_p": 1.0, + "repetition_penalty": 1.2 + }' \ + --output output.wav +``` \ No newline at end of file diff --git a/soprano/server/docs/architecture.md b/soprano/server/docs/architecture.md deleted file mode 100644 index 6ecf6a4..0000000 --- a/soprano/server/docs/architecture.md +++ /dev/null @@ -1,158 +0,0 @@ -# API Architecture and Implementation Details - -## System Architecture - -### High-Level Architecture -The Soprano TTS API follows a layered architecture: - -``` -┌─────────────────┐ -│ API Layer │ ← FastAPI endpoints -├─────────────────┤ -│ Business Logic │ ← TTSManager, Circuit Breaker, Retry -├─────────────────┤ -│ Model Layer │ ← SopranoTTS, Neural Processing -├─────────────────┤ -│ Utilities │ ← Audio Processing, File Management -└─────────────────┘ -``` - -### Component Breakdown - -#### API Layer (FastAPI) -- **Framework**: FastAPI for high-performance API -- **Features**: Automatic validation, documentation, async support -- **Endpoints**: RESTful API following OpenAI format - -#### Business Logic Layer -- **TTSManager**: Singleton pattern for model lifecycle management -- **CircuitBreaker**: Fault tolerance for external dependencies -- **Retry Mechanism**: Exponential backoff for transient failures - -#### Model Layer -- **SopranoTTS**: Integration with the core TTS model -- **Backend Selection**: Auto-detection of optimal backend (lmdeploy/transformers) - -#### Utilities Layer -- **Audio Processing**: WAV conversion and normalization -- **File Management**: Sequential naming and cleanup -- **Logging**: Comprehensive system logging - -## Key Implementation Details - -### Singleton Pattern (TTSManager) -The TTSManager implements a singleton pattern to ensure: -- Model loaded only once at startup -- Efficient resource utilization -- Thread-safe access to the model - -```python 
-class TTSManager: - _instance = None - _lock = asyncio.Lock() - - def __new__(cls): - if cls._instance is None: - cls._instance = super().__new__(cls) - return cls._instance -``` - -### Circuit Breaker Implementation -The circuit breaker prevents cascading failures with three states: -- **CLOSED**: Normal operation -- **OPEN**: Tripped after threshold failures -- **HALF_OPEN**: Testing recovery - -### Retry Mechanism -The retry mechanism uses exponential backoff: -- Configurable number of retries -- Initial delay with backoff factor -- Proper error propagation after final attempt - -## Performance Optimizations - -### Model Loading -- Asynchronous initialization to prevent blocking -- Single model instance shared across requests -- Backend auto-detection for optimal performance - -### Audio Processing -- Efficient tensor-to-WAV conversion -- Memory-efficient processing -- Proper normalization for audio quality - -### File Management -- Sequential file naming to prevent conflicts -- Automatic cleanup of old files -- Safe filename generation to prevent path traversal - -## Security Considerations - -### Input Validation -- Comprehensive request validation using Pydantic -- Character filtering for safe filenames -- Length restrictions to prevent abuse - -### File Security -- Path traversal prevention -- Safe character filtering for filenames -- Proper file permissions handling - -## Error Handling Strategy - -### Circuit Breaker Pattern -- Prevents repeated calls to failing services -- Automatic recovery after timeout -- State management for different failure scenarios - -### Retry Mechanism -- Exponential backoff for transient failures -- Configurable retry parameters -- Proper error propagation after final attempt - -### Graceful Degradation -- Fallback to CPU when CUDA not available -- Proper error responses for clients -- Comprehensive logging for debugging - -## Scalability Considerations - -### Current Limitations -- Single model instance (not multi-tenant) -- 
Sequential file naming (not distributed) - -### Potential Improvements -- Model instance pooling for higher throughput -- Distributed file naming for multi-server setups -- Caching for repeated requests - -## Technology Stack - -### Core Technologies -- **Python 3.10+**: Primary programming language -- **FastAPI**: High-performance web framework -- **Pydantic**: Data validation and settings management -- **Torch**: Deep learning framework -- **Uvicorn**: ASGI server for FastAPI - -### Additional Libraries -- **NumPy**: Numerical operations -- **SciPy**: Scientific computing (audio processing) -- **aiohttp**: For client-side testing - -## Development Patterns - -### Async/Sync Considerations -- Async endpoints for non-blocking operations -- Thread pool execution for model loading -- Proper async/await patterns throughout - -### Logging Strategy -- Structured logging with appropriate levels -- Contextual information for debugging -- Performance monitoring through logs - -### Testing Approach -- Unit tests for individual components -- Integration tests for API functionality -- Error condition testing \ No newline at end of file diff --git a/soprano/server/docs/configuration.md b/soprano/server/docs/configuration.md deleted file mode 100644 index b8586d3..0000000 --- a/soprano/server/docs/configuration.md +++ /dev/null @@ -1,71 +0,0 @@ -# Configuration and Setup - -## System Requirements - -### Hardware Requirements -- **CPU**: Modern multi-core processor -- **GPU**: NVIDIA GPU with CUDA support (optional but recommended) -- **RAM**: 8GB minimum, 16GB+ recommended -- **Storage**: Sufficient space for model files and audio output - -### Software Requirements -- **Operating System**: Windows, macOS, or Linux -- **Python**: Version 3.10 or higher -- **CUDA**: Version 11.3 or higher (for GPU acceleration) - -## Installation - -### Prerequisites -1. Install Python 3.10 or higher -2. Install pip package manager - -### Installation Steps -1. 
Install the Soprano TTS package: - ``` - pip install soprano-tts - ``` - -2. The installation will automatically handle all dependencies - -## Running the API Server - -### Starting the Server -To start the API server, execute: -``` -python soprano\server\api.py -``` - -The server will: -- Detect available hardware (CUDA/CPU) -- Load the TTS model -- Start on `http://localhost:8000` -- Initialize file cleanup processes - -### Environment Variables -The API does not require specific environment variables, but you can configure: - -- **CUDA_VISIBLE_DEVICES**: To specify which GPU(s) to use -- **TORCH_DEVICE**: To force a specific device (though the API will auto-detect) - -## Configuration Options - -### Model Configuration -The API uses the Soprano-80M model by default, which will be downloaded automatically on first use. - -### File Output Configuration -- **Output Directory**: `audio_output` (created automatically) -- **File Naming**: Sequential (output_1.wav, output_2.wav, etc.) -- **File Cleanup**: Automatic cleanup of files older than 24 hours - -## Performance Optimization - -### GPU Usage -The API automatically detects and uses CUDA when available: -- Prioritizes GPU for faster processing -- Falls back to CPU if GPU is not available -- Uses appropriate backend (lmdeploy for GPU, transformers for CPU) - -### Memory Management -- Model loaded once at startup -- Efficient tensor processing -- Automatic cleanup of temporary resources \ No newline at end of file diff --git a/soprano/server/docs/endpoints.md b/soprano/server/docs/endpoints.md deleted file mode 100644 index 8ac0b8d..0000000 --- a/soprano/server/docs/endpoints.md +++ /dev/null @@ -1,100 +0,0 @@ -# API Endpoints - -## Base URL -All endpoints are relative to `http://localhost:8000` - -## Available Endpoints - -### Root Endpoint -- **URL**: `GET /` -- **Description**: Provides information about the Soprano TTS API -- **Response**: - ```json - { - "message": "Soprano TTS API", - "version": "1.0.0", - 
"description": "Ultra-realistic Text-to-Speech API based on Soprano model", - "endpoints": { - "tts": "/v1/audio/speech", - "health": "/health" - } - } - ``` - -### Text-to-Speech Generation -- **URL**: `POST /v1/audio/speech` -- **Description**: Generate speech from input text following OpenAI's Speech endpoint format -- **Request Body**: - ```json - { - "input": "string (required, min 1, max 1000 characters)", - "model": "string (optional)", - "voice": "string (optional)", - "response_format": "string (optional, default: 'wav')", - "temperature": "float (optional, default: 0.3)", - "top_p": "float (optional, default: 0.95)", - "repetition_penalty": "float (optional, default: 1.2)" - } - ``` -- **Response**: WAV audio file -- **Headers**: - - `Content-Disposition: attachment; filename="output_N.wav"` - - `Content-Length: {size}` - -### Health Check -- **URL**: `GET /health` -- **Description**: Check if the server and TTS model are running properly -- **Response**: - ```json - { - "status": "healthy", - "device": "cuda" or "cpu" - } - ``` - -### API Documentation -- **URL**: `GET /docs` -- **Description**: Interactive API documentation with Swagger UI - -- **URL**: `GET /redoc` -- **Description**: Alternative API documentation with ReDoc - -- **URL**: `GET /openapi.json` -- **Description**: OpenAPI schema specification - -## Request Examples - -### cURL Example -```bash -curl -X POST "http://localhost:8000/v1/audio/speech" \ - -H "Content-Type: application/json" \ - -d '{ - "input": "Hello world, this is a test of the Soprano TTS system.", - "temperature": 0.3, - "top_p": 0.95, - "repetition_penalty": 1.2 - }' \ - --output output.wav -``` - -### Python Example -```python -import requests - -url = "http://localhost:8000/v1/audio/speech" -payload = { - "input": "Hello world, this is a test of the Soprano TTS system.", - "temperature": 0.3, - "top_p": 0.95, - "repetition_penalty": 1.2 -} - -response = requests.post(url, json=payload) - -if response.status_code == 200: 
- with open("output.wav", "wb") as f: - f.write(response.content) - print("Audio saved successfully") -else: - print(f"Request failed with status {response.status_code}") -``` \ No newline at end of file diff --git a/soprano/server/docs/errors_and_troubleshooting.md b/soprano/server/docs/errors_and_troubleshooting.md deleted file mode 100644 index ace0438..0000000 --- a/soprano/server/docs/errors_and_troubleshooting.md +++ /dev/null @@ -1,109 +0,0 @@ -# Error Handling and Troubleshooting - -## Error Handling - -### HTTP Status Codes -The API returns standard HTTP status codes: - -- **200 OK**: Request successful -- **400 Bad Request**: Invalid request parameters -- **404 Not Found**: Endpoint not found -- **422 Unprocessable Entity**: Validation error in request body -- **500 Internal Server Error**: Server-side error -- **503 Service Unavailable**: Service temporarily unavailable - -### Common Error Scenarios - -#### 400 Bad Request -- **Cause**: Invalid characters in text that could lead to path traversal -- **Solution**: Remove special characters like '../', '/', or '\' from input text - -#### 422 Unprocessable Entity -- **Cause**: Validation errors in request body -- **Examples**: - - Empty input text - - Input text exceeding 1000 characters - - Invalid parameter types or values -- **Solution**: Ensure input meets validation requirements - -#### 503 Service Unavailable -- **Cause**: Circuit breaker triggered due to repeated failures -- **Solution**: Wait for recovery period (30 seconds by default) or restart service - -## Circuit Breaker Pattern - -The API implements a circuit breaker to prevent cascading failures: - -- **Threshold**: 3 consecutive failures -- **Recovery Timeout**: 30 seconds -- **States**: CLOSED (normal operation), OPEN (tripped), HALF_OPEN (testing recovery) - -## Retry Mechanism - -Transient failures are handled with a retry mechanism: - -- **Retries**: 2 attempts -- **Initial Delay**: 1 second -- **Backoff Factor**: 2 (exponential backoff) 
- -## Troubleshooting - -### Common Issues and Solutions - -#### Issue: "CUDA is not available, falling back to CPU" -- **Description**: GPU not detected or CUDA not properly installed -- **Solution**: - 1. Verify NVIDIA GPU is installed - 2. Install/update CUDA drivers - 3. Ensure CUDA version is 11.3 or higher - -#### Issue: "Connection refused" when accessing API -- **Description**: API server not running -- **Solution**: - 1. Ensure server is started with `python soprano\server\api.py` - 2. Check that port 8000 is available - 3. Verify firewall settings - -#### Issue: "Failed to save audio file" -- **Description**: Permission or disk space issues -- **Solution**: - 1. Verify write permissions to `audio_output` directory - 2. Check available disk space - 3. Ensure directory path is valid - -#### Issue: High memory usage -- **Description**: Model consuming excessive memory -- **Solution**: - 1. Monitor memory usage during operation - 2. Consider reducing concurrent requests - 3. Close other applications to free memory - -### Debugging Tips - -#### Enable Verbose Logging -The API uses INFO level logging by default. For more detailed debugging: -1. Modify the logging level in the source code if needed -2. Check the console output for detailed error messages - -#### Check Model Loading -If experiencing slow responses on first request: -1. Verify model download completed successfully -2. Check internet connectivity during initial model loading - -#### Monitor File System -To monitor file creation: -1. Watch the `audio_output` directory -2. Verify sequential file naming is working correctly -3. 
Check for any permission issues - -### Performance Monitoring - -#### API Response Times -- First request after startup may be slower due to model loading -- Subsequent requests should be faster -- Monitor for any degradation over time - -#### Resource Utilization -- CPU/GPU usage during processing -- Memory consumption -- Disk I/O for file operations \ No newline at end of file diff --git a/soprano/server/docs/index.md b/soprano/server/docs/index.md deleted file mode 100644 index 7f7e5c5..0000000 --- a/soprano/server/docs/index.md +++ /dev/null @@ -1,44 +0,0 @@ -# Soprano TTS API Documentation - -Welcome to the comprehensive documentation for the Soprano TTS API. This documentation provides detailed information about the API, its usage, configuration, and implementation details. - -## Table of Contents - -1. [Overview](overview.md) - Introduction to the Soprano TTS API -2. [API Endpoints](endpoints.md) - Detailed information about all API endpoints -3. [Configuration](configuration.md) - Setup and configuration instructions -4. [Usage Examples](usage_examples.md) - Practical examples and use cases -5. [Architecture](architecture.md) - Technical architecture and implementation details -6. [Error Handling & Troubleshooting](errors_and_troubleshooting.md) - Error handling and troubleshooting guide - -## Quick Start - -### Prerequisites -- Python 3.10 or higher -- CUDA-compatible GPU (optional, CPU fallback available) - -### Installation -``` -pip install soprano-tts -``` - -### Running the API Server -``` -python soprano\server\api.py -``` - -The server will start on `http://localhost:8000` and automatically detect available hardware. - -### Making Your First Request -``` -curl -X POST "http://localhost:8000/v1/audio/speech" \ - -H "Content-Type: application/json" \ - -d '{ - "input": "Hello world, this is a test of the Soprano TTS system." 
- }' \ - --output output.wav -``` - -## Support - -For support, please refer to the troubleshooting section or create an issue in the project repository. \ No newline at end of file diff --git a/soprano/server/docs/overview.md b/soprano/server/docs/overview.md deleted file mode 100644 index e425db4..0000000 --- a/soprano/server/docs/overview.md +++ /dev/null @@ -1,43 +0,0 @@ -# Soprano TTS API Documentation - -## Overview - -The Soprano TTS API is a high-performance text-to-speech service that converts text to realistic audio using advanced neural models. The API follows OpenAI's speech endpoint format for compatibility and ease of use. - -## Features - -- **High-Quality Audio**: Uses state-of-the-art neural models for realistic speech synthesis -- **GPU Acceleration**: Automatically utilizes CUDA when available for faster processing -- **Error Handling**: Comprehensive error handling with circuit breaker and retry mechanisms -- **File Management**: Automatic sequential file naming and cleanup of old files -- **OpenAI Compatible**: Follows OpenAI's speech endpoint format - -## Architecture - -The API is built using FastAPI and follows a modular architecture: - -- **API Layer**: FastAPI endpoints with request/response handling -- **Business Logic**: TTSManager with singleton pattern and resource management -- **Model Layer**: SopranoTTS with neural model integration -- **Utilities**: Audio processing, file management, and error handling components - -## Requirements - -- Python 3.10+ -- CUDA-compatible GPU (optional, CPU fallback available) -- Required Python packages (see pyproject.toml) - -## Installation - -1. Clone the repository -2. Install dependencies: `pip install soprano-tts` -3. Run the API server - -## API Server Execution - -To start the API server, run: -``` -python soprano\server\api.py -``` - -The server will start on `http://localhost:8000` and automatically detect CUDA availability. 
\ No newline at end of file diff --git a/soprano/server/docs/test_readme.md b/soprano/server/docs/test_readme.md new file mode 100644 index 0000000..eb54fe5 --- /dev/null +++ b/soprano/server/docs/test_readme.md @@ -0,0 +1,42 @@ +# Soprano TTS Test Clients + +This directory contains test clients for both the API and WebSocket servers. + +## API Test Client + +The API test client allows you to test the REST API server functionality. + +### Usage +```bash +python -m soprano.server.test_api "Your text here" +``` + +### Features +- Tests the main TTS endpoint +- Includes health check functionality +- Saves received audio to audio_output directory +- Handles connection errors gracefully +- Uses aiohttp for async HTTP requests + +## WebSocket Test Client + +The WebSocket test client allows you to test the WebSocket streaming server functionality. + +### Usage +```bash +python -m soprano.server.test_websocket "Your text here" +``` + +### Features +- Tests WebSocket connection and streaming +- Real-time audio playback using PyAudio +- Connection testing with ping/pong +- Proper audio stream management +- Comprehensive error handling + +## Prerequisites + +- For API tests: `pip install aiohttp` +- For WebSocket tests: `pip install websockets pyaudio` +- Running API server on http://localhost:8000 +- Running WebSocket server on ws://localhost:8001/ws/tts \ No newline at end of file diff --git a/soprano/server/docs/usage_examples.md b/soprano/server/docs/usage_examples.md deleted file mode 100644 index 8d367e2..0000000 --- a/soprano/server/docs/usage_examples.md +++ /dev/null @@ -1,229 +0,0 @@ -# Usage Examples - -## Basic Usage - -### Command Line Interface -The API can be tested using command line tools: - -#### Using cURL -```bash -# Basic text-to-speech conversion -curl -X POST "http://localhost:8000/v1/audio/speech" \ - -H "Content-Type: application/json" \ - -d '{ - "input": "Hello world, this is a test of the Soprano TTS system." 
- }' \ - --output output.wav - -# With custom parameters -curl -X POST "http://localhost:8000/v1/audio/speech" \ - -H "Content-Type: application/json" \ - -d '{ - "input": "This is a test with custom parameters.", - "temperature": 0.5, - "top_p": 0.9, - "repetition_penalty": 1.1 - }' \ - --output custom_output.wav -``` - -#### Using Python requests -```python -import requests - -# Basic request -url = "http://localhost:8000/v1/audio/speech" -payload = { - "input": "Hello world, this is a test of the Soprano TTS system." -} - -response = requests.post(url, json=payload) - -if response.status_code == 200: - with open("output.wav", "wb") as f: - f.write(response.content) - print("Audio saved successfully") -else: - print(f"Request failed with status {response.status_code}") - print(response.text) -``` - -## Advanced Usage - -### Custom Parameters -The API supports various parameters to customize the output: - -```python -import requests - -url = "http://localhost:8000/v1/audio/speech" -payload = { - "input": "This is a test with custom parameters.", - "temperature": 0.3, # Controls randomness (0.0-1.0) - "top_p": 0.95, # Controls diversity (0.0-1.0) - "repetition_penalty": 1.2 # Controls repetition (0.1-2.0) -} - -response = requests.post(url, json=payload) - -if response.status_code == 200: - with open("custom_output.wav", "wb") as f: - f.write(response.content) - print("Custom audio saved successfully") -``` - -### Batch Processing -To process multiple texts: - -```python -import requests -import time - -def process_texts(texts): - url = "http://localhost:8000/v1/audio/speech" - - for i, text in enumerate(texts): - payload = { - "input": text, - "temperature": 0.3, - "top_p": 0.95, - "repetition_penalty": 1.2 - } - - response = requests.post(url, json=payload) - - if response.status_code == 200: - filename = f"batch_output_{i+1}.wav" - with open(filename, "wb") as f: - f.write(response.content) - print(f"Saved {filename}") - else: - print(f"Failed to process text 
{i+1}: {response.status_code}") - - # Optional: Add delay between requests - time.sleep(1) - -# Example usage -texts = [ - "This is the first text.", - "This is the second text.", - "This is the third text." -] - -process_texts(texts) -``` - -## Integration Examples - -### Web Application Integration -Example of integrating with a web application: - -```python -from flask import Flask, request, send_file -import requests -import tempfile -import os - -app = Flask(__name__) - -@app.route('/tts', methods=['POST']) -def text_to_speech(): - data = request.json - text = data.get('text', '') - - if not text: - return {'error': 'Text is required'}, 400 - - # Call the Soprano TTS API - tts_url = "http://localhost:8000/v1/audio/speech" - payload = { - "input": text, - "temperature": 0.3, - "top_p": 0.95, - "repetition_penalty": 1.2 - } - - response = requests.post(tts_url, json=payload) - - if response.status_code == 200: - # Return the audio file - return send_file( - io.BytesIO(response.content), - mimetype='audio/wav', - as_attachment=True, - download_name='output.wav' - ) - else: - return {'error': 'TTS generation failed'}, response.status_code - -if __name__ == '__main__': - app.run(debug=True) -``` - -### Using with the Test Client -The provided test.py file allows for interactive usage: - -1. Start the API server -2. Run `python test.py` -3. Enter text when prompted -4. Check the audio_output folder for generated files - -## File Management - -### Output Files -- Files are saved in the `audio_output` directory -- Files use sequential naming: `output_1.wav`, `output_2.wav`, etc. -- Old files are automatically cleaned up after 24 hours - -### File Access -Generated files can be accessed directly from the `audio_output` directory or through the API response. 
- -## Best Practices - -### Input Validation -Always validate input text: -- Ensure text is not empty -- Keep text under 1000 characters -- Avoid special characters that might cause path traversal issues - -### Error Handling -Implement proper error handling in your client applications: - -```python -import requests - -def safe_tts_request(text): - url = "http://localhost:8000/v1/audio/speech" - payload = {"input": text} - - try: - response = requests.post(url, json=payload, timeout=60) - - if response.status_code == 200: - return response.content - elif response.status_code == 422: - print(f"Validation error: {response.text}") - return None - elif response.status_code == 503: - print("Service temporarily unavailable") - return None - else: - print(f"Request failed with status {response.status_code}") - return None - except requests.exceptions.RequestException as e: - print(f"Request error: {e}") - return None - -# Example usage -audio_data = safe_tts_request("Hello world") -if audio_data: - with open("output.wav", "wb") as f: - f.write(audio_data) - print("Audio saved successfully") -``` - -### Performance Considerations -- The first request after startup may take longer due to model loading -- Subsequent requests will be faster -- Consider the computational requirements for longer texts -- Monitor resource usage during heavy usage \ No newline at end of file diff --git a/soprano/server/docs/websocket_readme.md b/soprano/server/docs/websocket_readme.md new file mode 100644 index 0000000..057631a --- /dev/null +++ b/soprano/server/docs/websocket_readme.md @@ -0,0 +1,100 @@ +# Soprano TTS WebSocket + +The Soprano TTS WebSocket provides real-time streaming text-to-speech functionality. This WebSocket server allows you to generate audio in real-time and stream it to clients as it's produced. 
+ +## Features + +- **Real-time Streaming**: Generate and stream audio in real-time +- **Raw PCM Frames**: Outputs raw PCM frames suitable for playback via PyAudio +- **Metadata Support**: Sends audio format metadata at the start +- **Small Chunks**: Streams audio in small chunks (~1024 samples) for low latency +- **End Signal**: Sends "end" signal when synthesis finishes + +## Connection + +- **Endpoint**: `ws://localhost:8001/ws/tts` +- **Protocol**: WebSocket with JSON control messages and binary audio frames + +## Message Format + +### Client to Server +```json +{ + "type": "synthesize", + "text": "Your text here", + "stream": true, + "min_text_length": 30 +} +``` + +### Server to Client +- **Metadata** (JSON): +```json +{ + "type": "metadata", + "sample_rate": 32000, + "channels": 1, + "format": "int16" +} +``` + +- **Audio Data** (Binary): Raw PCM audio frames +- **End Signal** (JSON): +```json +{ + "type": "end" +} +``` + +- **Error** (JSON): +```json +{ + "type": "error", + "message": "Error description" +} +``` + +## Integration + +The WebSocket server is ideal for: +- Real-time voice assistants +- Interactive applications +- Live broadcasting systems +- Gaming applications +- Any system requiring immediate audio feedback + +## Usage Example + +```javascript +const ws = new WebSocket('ws://localhost:8001/ws/tts'); + +ws.onopen = () => { + ws.send(JSON.stringify({ + type: "synthesize", + text: "Hello, this is a real-time audio stream", + stream: true, + min_text_length: 30 + })); +}; + +ws.onmessage = (event) => { + if (typeof event.data === 'string') { + const message = JSON.parse(event.data); + if (message.type === 'metadata') { + // Handle audio format info + } else if (message.type === 'end') { + // Streaming finished + } + } else { + // Binary audio data - play with audio API + playAudioChunk(event.data); + } +}; +``` + +## Performance + +- Implements backpressure handling to manage slow clients +- Uses asyncio queues to decouple audio generation from 
network transmission +- Supports graceful shutdown with proper task cancellation +- Optimized for real-time performance with minimal latency \ No newline at end of file diff --git a/soprano/server/test_api.py b/soprano/server/test_api.py new file mode 100644 index 0000000..886b91b --- /dev/null +++ b/soprano/server/test_api.py @@ -0,0 +1,138 @@ +import asyncio +import aiohttp +import json +import sys +import os +from pathlib import Path + +async def test_api(text="Hello, this is a test of the Soprano TTS API."): + """ + Test the Soprano TTS API endpoint + """ + base_url = "http://localhost:8000" + endpoint = "/v1/audio/speech" + + payload = { + "input": text, + "temperature": 0.3, + "top_p": 1.0, + "repetition_penalty": 1.2, + "min_text_length": 30 + } + + print(f"Testing API at {base_url}{endpoint}") + print(f"Sending text: '{text[:50]}{'...' if len(text) > 50 else ''}'") + + try: + async with aiohttp.ClientSession() as session: + async with session.post(f"{base_url}{endpoint}", json=payload) as response: + status = response.status + print(f"Response status: {status}") + + if status == 200: + # Read the audio content + audio_content = await response.read() + print(f"Received audio data: {len(audio_content)} bytes") + + # Save the audio to a file + output_dir = "audio_output" + os.makedirs(output_dir, exist_ok=True) + + # Generate unique filename + file_counter = 1 + while True: + filename = f"api_test_output_{file_counter}.wav" + filepath = os.path.join(output_dir, filename) + if not os.path.exists(filepath): + break + file_counter += 1 + + with open(filepath, 'wb') as f: + f.write(audio_content) + + print(f"Audio saved to: {filepath}") + return True + else: + error_text = await response.text() + print(f"Request failed with status {status}") + print(f"Error: {error_text}") + return False + except aiohttp.ClientConnectorError: + print("Error: Could not connect to API server. 
Make sure it's running on http://localhost:8000") + return False + except Exception as e: + print(f"Request failed with error: {e}") + return False + + +async def test_health(): + """ + Test the health check endpoint + """ + base_url = "http://localhost:8000" + endpoint = "/health" + + print(f"Testing health endpoint at {base_url}{endpoint}") + + try: + async with aiohttp.ClientSession() as session: + async with session.get(f"{base_url}{endpoint}") as response: + status = response.status + print(f"Health check status: {status}") + + if status == 200: + health_data = await response.json() + print(f"Health check result: {health_data}") + return True + else: + error_text = await response.text() + print(f"Health check failed with status {status}") + print(f"Error: {error_text}") + return False + except Exception as e: + print(f"Health check failed with error: {e}") + return False + + +async def main(): + print("Soprano TTS API Test Client") + print("Make sure the API server is running on http://localhost:8000 before executing this.") + print() + + # Test health endpoint first + print("Testing health endpoint...") + health_ok = await test_health() + if not health_ok: + print("Health check failed. Exiting.") + return + + print() + + # Get text from command line arguments or use default + if len(sys.argv) > 1: + text = " ".join(sys.argv[1:]) + else: + text = "Hello, this is a test of the Soprano TTS API. The system is working properly." + + if not text.strip(): + print("Text cannot be empty. Please enter some text.") + return + + print("Testing TTS API...") + success = await test_api(text) + + if success: + print("API test completed successfully!") + else: + print("API test failed!") + + +if __name__ == "__main__": + # Check if required packages are available + try: + import aiohttp + except ImportError: + print("Error: aiohttp is not installed. 
Please install it with: pip install aiohttp") + exit(1) + + asyncio.run(main()) \ No newline at end of file diff --git a/soprano/server/test_websocket.py b/soprano/server/test_websocket.py new file mode 100644 index 0000000..b8c304e --- /dev/null +++ b/soprano/server/test_websocket.py @@ -0,0 +1,81 @@ +import asyncio +import websockets +import json +import pyaudio +import sys + +class SopranoWSClient: + def __init__(self): + self.p = pyaudio.PyAudio() + self.stream = None + # Default settings (will be updated by metadata from server) + self.rate = 32000 + self.channels = 1 + + def open_stream(self, rate, channels): + """Opens the audio device for live playback.""" + self.stream = self.p.open( + format=pyaudio.paInt16, + channels=channels, + rate=rate, + output=True + ) + + async def start_test(self, text): + uri = "ws://localhost:8001/ws/tts" + + try: + async with websockets.connect(uri) as ws: + # 1. Send the synthesis request + payload = { + "type": "synthesize", + "text": text, + "stream": True, + "min_text_length": 30 + } + await ws.send(json.dumps(payload)) + print(f">>> Sent text to server. Waiting for audio...") + + # 2. 
Listen for chunks + while True: + message = await ws.recv() + + # Handle Audio Bytes + if isinstance(message, bytes): + if self.stream: + self.stream.write(message) + print(".", end="", flush=True) + + # Handle JSON Messages + else: + data = json.loads(message) + if data["type"] == "metadata": + print(f"\n[Metadata] Rate: {data['sample_rate']}Hz") + self.open_stream(data['sample_rate'], data['channels']) + + elif data["type"] == "end": + print("\n[Finished] Server signaled end of stream.") + break + + elif data["type"] == "error": + print(f"\n[Error] {data['message']}") + break + + except Exception as e: + print(f"Connection Error: {e}") + finally: + self.cleanup() + + def cleanup(self): + if self.stream: + self.stream.stop_stream() + self.stream.close() + self.p.terminate() + +if __name__ == "__main__": + input_text = "Testing the live websocket stream. I should hear this almost immediately." + if len(sys.argv) > 1: + input_text = " ".join(sys.argv[1:]) + + client = SopranoWSClient() + asyncio.run(client.start_test(input_text)) \ No newline at end of file diff --git a/soprano/server/websocket.py b/soprano/server/websocket.py new file mode 100644 index 0000000..5bb0723 --- /dev/null +++ b/soprano/server/websocket.py @@ -0,0 +1,338 @@ +import asyncio +import json +import logging +from typing import AsyncGenerator +import numpy as np +from fastapi import FastAPI, WebSocket, WebSocketDisconnect +import torch +from contextlib import asynccontextmanager +from websockets.exceptions import ConnectionClosedOK, ConnectionClosedError +from asyncio import Queue, QueueEmpty + +from soprano.tts import SopranoTTS + + +# Set up logging +logging.basicConfig(level=logging.INFO) +logger = logging.getLogger(__name__) + + +class TTSWebSocketManager: + """ + Manager for WebSocket TTS streaming functionality. 
+ """ + _instance = None + + def __new__(cls): + if cls._instance is None: + cls._instance = super().__new__(cls) + return cls._instance + + def __init__(self): + if not hasattr(self, 'initialized'): + self.initialized = True + self.tts: SopranoTTS = None + self._lock = asyncio.Lock() # Move lock to instance level + # Track active streaming tasks for graceful shutdown + self.active_tasks = set() + # Prioritize CUDA, fallback to CPU only if CUDA is not available + if torch.cuda.is_available(): + self.device = 'cuda' + logger.info("CUDA is available, using GPU for TTS processing") + else: + self.device = 'cpu' + logger.info("CUDA is not available, falling back to CPU for TTS processing") + logger.info(f"Initializing TTS on device: {self.device}") + + async def initialize_model(self): + """ + Initialize the TTS model asynchronously to avoid blocking the event loop. + """ + async with self._lock: + if self.tts is None: + logger.info("Loading Soprano TTS model for WebSocket streaming...") + try: + # Run model initialization in a thread pool to avoid blocking + loop = asyncio.get_running_loop() # Use get_running_loop instead of get_event_loop + + def load_model(): + return SopranoTTS( + cache_size_mb=100, + device=self.device + ) + + self.tts = await loop.run_in_executor(None, load_model) + logger.info("Soprano TTS model loaded successfully for WebSocket streaming") + except Exception as e: + logger.error(f"Failed to load Soprano TTS model: {e}", exc_info=True) + raise RuntimeError(f"Failed to initialize TTS model: {str(e)}") from e + + def get_model(self) -> SopranoTTS: + """ + Get the initialized TTS model instance. + """ + if self.tts is None: + raise RuntimeError("TTS model not initialized. Call initialize_model() first.") + return self.tts + + async def stream_audio_with_backpressure(self, websocket: WebSocket, text: str, min_text_length: int = 30): + """ + Stream audio in real-time from the TTS model with backpressure handling. 
+ Uses a queue to decouple TTS generation from WebSocket sending. + """ + # Create a queue to decouple generation from sending + audio_queue = Queue(maxsize=10) # Limit queue size to prevent memory buildup + + async def producer(): + """Generate audio chunks and put them in the queue.""" + try: + # Get the TTS model + tts = self.get_model() + + # Use the streaming inference method from the TTS model + logger.info(f"Starting streaming TTS for text: '{text[:50]}{'...' if len(text) > 50 else ''}'") + + # Use the infer_stream method which is designed for streaming + for audio_chunk in tts.infer_stream( + text=text, + chunk_size=1, + top_p=1.0, # Using the default value we set + temperature=0.3, # Using the default value we set + repetition_penalty=1.2, # Using the default value we set + min_text_length=min_text_length # Use the passed value instead of hardcoded 1000 + ): + # Convert tensor to numpy array + audio_np = audio_chunk.cpu().numpy() + + # Ensure values are in the range [-1, 1] and convert to int16 + audio_np = np.clip(audio_np, -1.0, 1.0) + audio_np = (audio_np * 32767).astype(np.int16) + + # Convert to bytes + audio_bytes = audio_np.tobytes() + + # Put audio chunk in queue, with timeout to handle slow consumers + try: + await asyncio.wait_for(audio_queue.put(audio_bytes), timeout=5.0) + except asyncio.TimeoutError: + logger.warning("Audio queue timeout - client may be slow") + break + + # Put None to signal end of stream + await audio_queue.put(None) + except Exception as e: + logger.error(f"Error in audio producer: {str(e)}", exc_info=True) + try: + await audio_queue.put(None) # Signal error to consumer + except: + pass + + async def consumer(): + """Take audio chunks from the queue and send them via WebSocket.""" + try: + while True: + # Get audio chunk from queue with timeout + try: + audio_bytes = await asyncio.wait_for(audio_queue.get(), timeout=10.0) + except asyncio.TimeoutError: + logger.warning("Timeout waiting for audio data") + break + + # If 
None, it means the producer is done + if audio_bytes is None: + break + + # Send the audio chunk as binary data + await websocket.send_bytes(audio_bytes) + except Exception as e: + logger.error(f"Error in audio consumer: {str(e)}", exc_info=True) + # Don't re-raise here as we want to ensure cleanup happens + + # Create tasks for producer and consumer + producer_task = asyncio.create_task(producer()) + consumer_task = asyncio.create_task(consumer()) + + # Add tasks to active tasks set for graceful shutdown + self.active_tasks.add(producer_task) + self.active_tasks.add(consumer_task) + + try: + # Wait for both tasks to complete + await asyncio.gather(producer_task, consumer_task, return_exceptions=True) + finally: + # Remove tasks from active tasks set + self.active_tasks.discard(producer_task) + self.active_tasks.discard(consumer_task) + + # Cancel tasks if they're still running + if not producer_task.done(): + producer_task.cancel() + if not consumer_task.done(): + consumer_task.cancel() + + async def stream_audio(self, websocket: WebSocket, text: str, min_text_length: int = 30): + """ + Stream audio in real-time from the TTS model. 
+ """ + try: + # Send metadata at the start + metadata = { + "type": "metadata", + "sample_rate": 32000, + "channels": 1, + "format": "int16", + "model_info": "soprano" + } + await websocket.send_text(json.dumps(metadata)) + + # Use the backpressure-aware streaming method + await self.stream_audio_with_backpressure(websocket, text, min_text_length) + + # Send end signal + await websocket.send_text(json.dumps({"type": "end"})) + logger.info("Streaming completed successfully") + + except Exception as e: + logger.error(f"Error during streaming: {str(e)}", exc_info=True) + try: + await websocket.send_text(json.dumps({"type": "error", "message": str(e)})) + except: + pass # If we can't send the error, just continue + + +@asynccontextmanager +async def lifespan(app: FastAPI) -> AsyncGenerator[None, None]: + """ + Lifespan event handler for startup and shutdown events. + """ + logger.info("Starting up Soprano TTS WebSocket server...") + tts_manager = TTSWebSocketManager() + + try: + await tts_manager.initialize_model() + logger.info("Soprano TTS WebSocket server started successfully") + yield + except Exception as e: + logger.error(f"Failed to start Soprano TTS WebSocket server: {e}", exc_info=True) + raise + finally: + logger.info("Shutting down Soprano TTS WebSocket server...") + # Cancel any active streaming tasks for graceful shutdown + if hasattr(tts_manager, 'active_tasks'): + for task in tts_manager.active_tasks.copy(): # Use copy to avoid modification during iteration + if not task.done(): + logger.info("Cancelling active streaming task...") + task.cancel() + try: + await task + except asyncio.CancelledError: + pass # Expected when cancelling tasks + logger.info("Soprano TTS WebSocket server shut down completed") + + +# Create FastAPI app with WebSocket support +app = FastAPI( + title="Soprano TTS WebSocket API", + description="Real-time streaming Text-to-Speech via WebSocket", + version="1.0.0", + lifespan=lifespan +) + + +@app.websocket("/ws/tts") +async def 
websocket_endpoint(websocket: WebSocket): + """ + WebSocket endpoint for real-time TTS streaming. + Supports multiple synthesize requests over the same connection. + """ + await websocket.accept() + logger.info("WebSocket connection accepted") + + try: + tts_manager = TTSWebSocketManager() + + # Keep the connection open to handle multiple requests + while True: + # Wait for a message + data = await websocket.receive_text() + message = json.loads(data) + + if message.get("type") == "synthesize": + text = message.get("text", "") + stream = message.get("stream", True) + # Allow client to specify min_text_length, default to 30 + min_text_length = message.get("min_text_length", 30) + + if not text or not text.strip(): + await websocket.send_text(json.dumps({ + "type": "error", + "message": "Text cannot be empty" + })) + continue # Continue to listen for more messages + + if stream: + # Start streaming audio + await tts_manager.stream_audio(websocket, text, min_text_length) + else: + # For non-streaming, we could implement a regular synthesis + # but the requirement is for streaming, so we'll focus on that + await websocket.send_text(json.dumps({ + "type": "error", + "message": "Only streaming is supported in this endpoint" + })) + elif message.get("type") == "ping": + # Simple ping/pong for connection health + await websocket.send_text(json.dumps({"type": "pong"})) + else: + await websocket.send_text(json.dumps({ + "type": "error", + "message": "Invalid message type. Use 'synthesize' or 'ping'." 
+ })) + + except WebSocketDisconnect: + logger.info("WebSocket disconnected") + except json.JSONDecodeError: + logger.error("Invalid JSON received") + try: + await websocket.send_text(json.dumps({ + "type": "error", + "message": "Invalid JSON format" + })) + except: + pass + except ConnectionClosedOK: + logger.info("WebSocket connection closed normally") + except ConnectionClosedError: + logger.info("WebSocket connection closed with error") + except Exception as e: + logger.error(f"Unexpected error in WebSocket: {str(e)}", exc_info=True) + try: + await websocket.send_text(json.dumps({ + "type": "error", + "message": f"Server error: {str(e)}" + })) + except: + pass + finally: + try: + if hasattr(websocket, 'client_state') and websocket.client_state.name != 'DISCONNECTED': + await websocket.close() + except: + pass + + +if __name__ == "__main__": + import uvicorn + import torch + + print("Starting Soprano TTS WebSocket Server...") + print(f"Available device: {'CUDA (GPU)' if torch.cuda.is_available() else 'CPU'}") + + # Start the server + print("WebSocket server starting on ws://localhost:8001/ws/tts") + uvicorn.run( + "soprano.server.websocket:app", + host="localhost", + port=8001, # Using port 8001 to avoid conflict with the regular API + reload=False + ) \ No newline at end of file diff --git a/soprano/tts.py b/soprano/tts.py index b7a308c..d21f35c 100644 --- a/soprano/tts.py +++ b/soprano/tts.py @@ -95,11 +95,13 @@ def infer(self, out_path=None, top_p=0.95, temperature=0.3, - repetition_penalty=1.2): + repetition_penalty=1.2, + min_text_length=30): results = self.infer_batch([text], top_p=top_p, temperature=temperature, repetition_penalty=repetition_penalty, + min_text_length=min_text_length, out_dir=None)[0] if out_path: wavfile.write(out_path, 32000, results.cpu().numpy()) @@ -110,8 +112,9 @@ def infer_batch(self, out_dir=None, top_p=0.95, temperature=0.3, - repetition_penalty=1.2): - sentence_data = self._preprocess_text(texts) + repetition_penalty=1.2, + 
min_text_length=30): + sentence_data = self._preprocess_text(texts, min_length=min_text_length) prompts = list(map(lambda x: x[0], sentence_data)) responses = self.pipeline.infer(prompts, top_p=top_p, @@ -143,13 +146,13 @@ def infer_batch(self, batch_hidden_states = torch.cat(batch_hidden_states) with torch.no_grad(): audio = self.decoder(batch_hidden_states) - + for i in range(N): text_id = sentence_data[idx+i][1] sentence_id = sentence_data[idx+i][2] audio_concat[text_id][sentence_id] = audio[i].squeeze()[-(lengths[i]*self.TOKEN_SIZE-self.TOKEN_SIZE):] audio_concat = [torch.cat(x).cpu() for x in audio_concat] - + if out_dir: os.makedirs(out_dir, exist_ok=True) for i in range(len(audio_concat)): @@ -161,9 +164,10 @@ def infer_stream(self, chunk_size=1, top_p=0.95, temperature=0.3, - repetition_penalty=1.2): + repetition_penalty=1.2, + min_text_length=30): start_time = time.time() - sentence_data = self._preprocess_text([text]) + sentence_data = self._preprocess_text([text], min_length=min_text_length) first_chunk = True for sentence, _, _ in sentence_data: From 3902e91c87f916dc3f5ef42d11f8c73be8825645 Mon Sep 17 00:00:00 2001 From: biswas445 Date: Sat, 10 Jan 2026 15:56:10 +0545 Subject: [PATCH 07/27] Initial commit: Complete Soprano TTS implementation with API and WebSocket streaming --- CHANGES_LOG.md | 17 +++++++++++++++++ 1 file changed, 17 insertions(+) create mode 100644 CHANGES_LOG.md diff --git a/CHANGES_LOG.md b/CHANGES_LOG.md new file mode 100644 index 0000000..a4099e2 --- /dev/null +++ b/CHANGES_LOG.md @@ -0,0 +1,17 @@ +# Soprano TTS - Complete Implementation + +This repository contains a complete implementation of the Soprano TTS system with: + +- REST API server with OpenAI-compatible endpoints +- WebSocket server for real-time audio streaming +- Comprehensive test clients for both interfaces +- Proper documentation and launch scripts +- Production-ready architecture with error handling + +## Features + +- **API Server**: OpenAI-compatible 
`/v1/audio/speech` endpoint +- **WebSocket Server**: Real-time streaming TTS with backpressure handling +- **Configurable Parameters**: temperature, top_p, repetition_penalty, min_text_length +- **Robust Architecture**: Circuit breakers, retry mechanisms, graceful shutdown +- **Workflow Integration**: Ready for n8n, Zapier, and other automation platforms \ No newline at end of file From e8e20539e244a0dc399f19cd990e9b7712f0c38c Mon Sep 17 00:00:00 2001 From: biswas445 Date: Sat, 10 Jan 2026 17:54:02 +0545 Subject: [PATCH 08/27] pre-merge save --- CHANGES_LOG.md | 17 ----- CLI/start_cli.bat | 1 + Soprano.bat | 108 ------------------------------- setup.bat | 65 +++++++++++++++++++ soprano/server/README.md | 30 --------- soprano/server/Run_API.bat | 17 ----- soprano/server/Run_WebSocket.bat | 29 --------- 7 files changed, 66 insertions(+), 201 deletions(-) delete mode 100644 CHANGES_LOG.md create mode 100644 CLI/start_cli.bat delete mode 100644 Soprano.bat create mode 100644 setup.bat delete mode 100644 soprano/server/README.md delete mode 100644 soprano/server/Run_API.bat delete mode 100644 soprano/server/Run_WebSocket.bat diff --git a/CHANGES_LOG.md b/CHANGES_LOG.md deleted file mode 100644 index a4099e2..0000000 --- a/CHANGES_LOG.md +++ /dev/null @@ -1,17 +0,0 @@ -# Soprano TTS - Complete Implementation - -This repository contains a complete implementation of the Soprano TTS system with: - -- REST API server with OpenAI-compatible endpoints -- WebSocket server for real-time audio streaming -- Comprehensive test clients for both interfaces -- Proper documentation and launch scripts -- Production-ready architecture with error handling - -## Features - -- **API Server**: OpenAI-compatible `/v1/audio/speech` endpoint -- **WebSocket Server**: Real-time streaming TTS with backpressure handling -- **Configurable Parameters**: temperature, top_p, repetition_penalty, min_text_length -- **Robust Architecture**: Circuit breakers, retry mechanisms, graceful shutdown -- 
**Workflow Integration**: Ready for n8n, Zapier, and other automation platforms \ No newline at end of file diff --git a/CLI/start_cli.bat b/CLI/start_cli.bat new file mode 100644 index 0000000..c7f26fe --- /dev/null +++ b/CLI/start_cli.bat @@ -0,0 +1 @@ +python soprano_cli.py \ No newline at end of file diff --git a/Soprano.bat b/Soprano.bat deleted file mode 100644 index d04ca0d..0000000 --- a/Soprano.bat +++ /dev/null @@ -1,108 +0,0 @@ -@echo off -title Soprano TTS Launcher -color 0A - -:menu -cls -echo ================================================ -echo SOPRANO TTS LAUNCHER -echo ================================================ -echo. -echo Select an option: -echo. -echo 1. Launch API Server -echo 2. Launch CLI Interface -echo 3. Launch API Test Client -echo 4. Launch WebSocket Server -echo 5. Launch WebSocket Test Client -echo 6. Exit -echo. -set /p choice="Enter your choice (1-6): " - -if "%choice%"=="1" goto api_server -if "%choice%"=="2" goto cli -if "%choice%"=="3" goto api_test -if "%choice%"=="4" goto websocket_server -if "%choice%"=="5" goto websocket_test -if "%choice%"=="6" goto exit -goto invalid_choice - -:invalid_choice -echo. -echo Invalid choice. Please enter 1, 2, 3, or 4. -timeout /t 2 /nobreak >nul -goto menu - -:api_server -echo. -echo Starting Soprano TTS API Server... -echo. -if exist venv\Scripts\activate.bat ( - call venv\Scripts\activate.bat -) else if exist env\Scripts\activate.bat ( - call env\Scripts\activate.bat -) -cd soprano\server -call Run_API.bat -cd ..\.. -goto end - -:cli -echo. -echo Starting Soprano TTS CLI Interface... -echo. -if exist venv\Scripts\activate.bat ( - call venv\Scripts\activate.bat -) else if exist env\Scripts\activate.bat ( - call env\Scripts\activate.bat -) -python -m CLI.soprano_cli -goto end - -:api_test -echo. -echo Starting Soprano TTS API Test Client... -echo. 
-if exist venv\Scripts\activate.bat ( - call venv\Scripts\activate.bat -) else if exist env\Scripts\activate.bat ( - call env\Scripts\activate.bat -) -cd soprano\server -call Run_API_Test.bat "Hello, this is a test of the Soprano TTS API. The system is working properly." -cd ..\.. -goto end - -:websocket_server -echo. -echo Starting Soprano TTS WebSocket Server... -echo. -if exist venv\Scripts\activate.bat ( - call venv\Scripts\activate.bat -) else if exist env\Scripts\activate.bat ( - call env\Scripts\activate.bat -) -cd soprano\server -call Run_WebSocket.bat -cd ..\.. -goto end - -:websocket_test -echo. -echo Starting Soprano TTS WebSocket Test Client... -echo. -if exist venv\Scripts\activate.bat ( - call venv\Scripts\activate.bat -) else if exist env\Scripts\activate.bat ( - call env\Scripts\activate.bat -) -cd soprano\server -call Run_WebSocket_Test.bat "Hello, this is a test of the WebSocket TTS system. Audio is streaming in real-time." -cd ..\.. -goto end - -:exit -exit /b - -:end -pause \ No newline at end of file diff --git a/setup.bat b/setup.bat new file mode 100644 index 0000000..f4aed6d --- /dev/null +++ b/setup.bat @@ -0,0 +1,65 @@ +@echo off +title Soprano TTS Setup +color 0A + +echo ================================================ +echo SOPRANO TTS SETUP +echo ================================================ +echo. +echo This script will: +echo 1. Install the Soprano TTS package +echo 2. Install/fix PyTorch with CUDA support +echo 3. Verify the installation +echo. +echo Press any key to continue or Ctrl+C to cancel... +pause >nul + +echo. +echo Installing Soprano TTS package... +echo. + +REM Install the package in editable mode +pip install -e . + +if %errorlevel% neq 0 ( + echo Error occurred during installation. Attempting to fix... + goto fix_pytorch +) + +echo. +echo Installing PyTorch with CUDA support... +echo. 
+ +:fix_pytorch +REM Uninstall current PyTorch +pip uninstall -y torch torchvision torchaudio + +REM Install PyTorch with CUDA 12.6 support +pip install torch==2.8.0 torchvision==0.23.0 torchaudio==2.8.0 --index-url https://download.pytorch.org/whl/cu126 + +if %errorlevel% neq 0 ( + echo Warning: PyTorch CUDA installation failed. Installing CPU version... + pip install torch==2.8.0 torchvision==0.23.0 torchaudio==2.8.0 --index-url https://download.pytorch.org/whl/cpu +) + +echo. +echo Verifying installation... +echo. + +REM Test the installation +python -c "import soprano; from soprano import SopranoTTS; print('Soprano TTS imported successfully'); print('Installation verified successfully!')" + +if %errorlevel% neq 0 ( + echo Warning: Verification failed, but installation may still be OK. +) + +echo. +echo ================================================ +echo Setup completed! +echo. +echo To use Soprano TTS: +echo - Run Soprano.bat to access the main menu +echo - Or run individual components as needed +echo ================================================ + +pause \ No newline at end of file diff --git a/soprano/server/README.md b/soprano/server/README.md deleted file mode 100644 index 4c1a9a5..0000000 --- a/soprano/server/README.md +++ /dev/null @@ -1,30 +0,0 @@ -# Soprano TTS Server - -This directory contains the server components for the Soprano TTS system, including the API server and test utilities. 
- -## Components - -- **API Server** (`api.py`): OpenAI-compatible text-to-speech API server -- **WebSocket Server** (`websocket.py`): Real-time streaming TTS via WebSocket -- **API Test Client** (`test_api.py`): Test client for the API server -- **WebSocket Test Client** (`test_websocket.py`): Test client for the WebSocket server -- **Documentation**: README files explaining usage and integration - -## API Compatibility - -The API server implements OpenAI-compatible endpoints, making it easy to integrate with existing applications and services that expect OpenAI's speech API format. - -## Quick Start - -1. Start the API server: `python -m soprano.server.api` -2. Start the WebSocket server: `python -m soprano.server.websocket` -3. Test with the clients: `python -m soprano.server.test_api` or `python -m soprano.server.test_websocket` -4. Or use directly with HTTP requests to `http://localhost:8000/v1/audio/speech` - -## Integration Ready - -The server is designed for seamless integration with: -- Workflow automation tools (like n8n) -- Web and mobile applications -- Voice-enabled systems -- Any system capable of making HTTP requests or WebSocket connections \ No newline at end of file diff --git a/soprano/server/Run_API.bat b/soprano/server/Run_API.bat deleted file mode 100644 index 2e7a969..0000000 --- a/soprano/server/Run_API.bat +++ /dev/null @@ -1,17 +0,0 @@ -@echo off -title Soprano TTS API Server -echo Starting Soprano TTS API Server... -echo. 
- -REM Activate virtual environment if present -if exist venv\Scripts\activate.bat ( - call venv\Scripts\activate.bat -) else if exist env\Scripts\activate.bat ( - call env\Scripts\activate.bat -) - -REM Start the API server with performance optimizations -echo Starting server on http://localhost:8000 -python -c "import uvicorn; import torch; print('Starting Soprano TTS API Server...'); device = 'CUDA (GPU)' if torch.cuda.is_available() else 'CPU'; print(f'Available device: {device}'); uvicorn.run('soprano.server.api:app', host='localhost', port=8000, workers=1, log_level='info')" - -pause \ No newline at end of file diff --git a/soprano/server/Run_WebSocket.bat b/soprano/server/Run_WebSocket.bat deleted file mode 100644 index b8aa1ea..0000000 --- a/soprano/server/Run_WebSocket.bat +++ /dev/null @@ -1,29 +0,0 @@ -@echo off -title Soprano TTS WebSocket Server -echo Starting Soprano TTS WebSocket Server... -echo. - -REM Activate virtual environment if present -if exist venv\Scripts\activate.bat ( - call venv\Scripts\activate.bat -) else if exist env\Scripts\activate.bat ( - call env\Scripts\activate.bat -) - -REM Start the WebSocket server -echo Starting WebSocket server on ws://localhost:8001/ws/tts -python -c " -import uvicorn -import torch -print('Starting Soprano TTS WebSocket Server...') -device = 'CUDA (GPU)' if torch.cuda.is_available() else 'CPU' -print(f'Available device: {device}') -uvicorn.run( - 'soprano.server.websocket:app', - host='localhost', - port=8001, - log_level='info' -) -" - -pause \ No newline at end of file From 83d251edab40c6534e5a29b50a1d481d49ebdcbf Mon Sep 17 00:00:00 2001 From: biswas445 Date: Sun, 11 Jan 2026 15:43:22 +0545 Subject: [PATCH 09/27] updated version --- CLI/__init__.py | 1 - CLI/soprano_cli.py | 184 ------------- CLI/start_cli.bat | 1 - pyproject.toml | 7 +- setup.bat | 22 +- soprano/__init__.py | 13 +- soprano/backends/transformers.py | 43 ++- soprano/server.py | 139 ++++++++++ soprano/server/api.py | 21 +- 
soprano/server/websocket.py | 58 +++- soprano/soprano_cli.py | 190 +++++++++++-- soprano/tts.py | 5 +- soprano/webui.py | 445 +++++++++++++++++++++++++------ start_soprano.bat | 3 + 14 files changed, 828 insertions(+), 304 deletions(-) delete mode 100644 CLI/__init__.py delete mode 100644 CLI/soprano_cli.py delete mode 100644 CLI/start_cli.bat create mode 100644 soprano/server.py create mode 100644 start_soprano.bat diff --git a/CLI/__init__.py b/CLI/__init__.py deleted file mode 100644 index e0e4938..0000000 --- a/CLI/__init__.py +++ /dev/null @@ -1 +0,0 @@ -"""CLI package for Soprano.""" \ No newline at end of file diff --git a/CLI/soprano_cli.py b/CLI/soprano_cli.py deleted file mode 100644 index 83007a0..0000000 --- a/CLI/soprano_cli.py +++ /dev/null @@ -1,184 +0,0 @@ - -""" -Soprano TTS Command Line Interface -""" -import argparse -import sys -import torch -from soprano import SopranoTTS - -try: - import sounddevice as sd - SOUNDDEVICE_AVAILABLE = True -except ImportError: - SOUNDDEVICE_AVAILABLE = False - -def get_device(): - """Determine the best available device (CUDA if available, otherwise CPU)""" - return 'cuda' if torch.cuda.is_available() else 'cpu' - -def play_audio(audio_tensor): - """Play audio tensor using sounddevice""" - if not SOUNDDEVICE_AVAILABLE: - print("Error: sounddevice library not available. 
Install it with 'pip install sounddevice'") - return - - import numpy as np - audio_np = audio_tensor.cpu().numpy() if isinstance(audio_tensor, torch.Tensor) else audio_tensor - - duration = len(audio_np) / 32000 - print(f"Playing audio ({duration:.2f}s)...") - - sample_rate = 32000 - sd.play(audio_np, samplerate=sample_rate) - - import time - time.sleep(duration + 0.5) - - try: - if sd.get_status().playing: - sd.wait() - except: - time.sleep(0.5) - -def validate_text(text): - """Validate input text""" - stripped_text = text.strip() if text else "" - if not stripped_text: - print("Error: Text cannot be empty.") - return False - if len(stripped_text) > 1000: - print("Error: Text is too long (max 1000 characters).") - return False - return True - -def get_validated_input(prompt, validator_func, error_msg=None): - """Get validated input from user""" - while True: - user_input = input(prompt).strip() - if validator_func(user_input): - return user_input - else: - if error_msg: - print(error_msg) - else: - print("Invalid input, please try again.") - -def get_next_filename(base_name="output_audio", ext=".wav"): - """Generate next available filename with incremental numbering""" - import os - - output_dir = "audio_output" - if not os.path.exists(output_dir): - os.makedirs(output_dir) - - counter = 0 - while True: - if counter == 0: - filename = f"{base_name}{ext}" - else: - filename = f"{base_name}{counter}{ext}" - - full_path = os.path.join(output_dir, filename) - if not os.path.exists(full_path): - return full_path - counter += 1 - -def main(): - parser = argparse.ArgumentParser(description='Soprano Text-to-Speech CLI') - parser.add_argument('--model-path', '-m', help='Path to local model directory (optional)') - parser.add_argument('--backend', '-b', default='auto', - choices=['auto', 'transformers', 'lmdeploy'], - help='Backend to use for inference') - parser.add_argument('--cache-size', '-c', type=int, default=10, - help='Cache size in MB (for lmdeploy backend)') - - 
args = parser.parse_args() - - device = get_device() - - try: - import io - import contextlib - - with contextlib.redirect_stdout(io.StringIO()), contextlib.redirect_stderr(io.StringIO()): - tts = SopranoTTS( - backend=args.backend, - device=device, - cache_size_mb=args.cache_size, - model_path=args.model_path - ) - except Exception as e: - print(f"Error initializing model: {e}") - sys.exit(1) - - print("Soprano TTS is ready. Starting interactive menu...") - - while True: - print("\n" + "="*50) - print(" SOPRANO TTS MENU") - print("="*50) - print("1. Input text for synthesis (with file saving)") - print("2. Real-time audio playback (no file saving)") - print("3. View saved audio files") - print("4. Exit") - print("="*50) - - choice = input("Enter your choice (1-4): ").strip() - - if choice == '1': - text = get_validated_input( - "Enter text to synthesize: ", - validate_text, - "Text must not be empty and must be under 1000 characters." - ) - - output_path = get_next_filename() - print(f"Using output path: {output_path}") - - print(f"Generating speech for: '{text[:50]}{'...' if len(text) > 50 else ''}'") - try: - tts.infer(text, out_path=output_path) - print(f"✓ Audio saved to: {output_path}") - except Exception as e: - print(f"✗ Error generating audio: {e}") - - elif choice == '2': - text = get_validated_input( - "Enter text for real-time playback: ", - validate_text, - "Text must not be empty and must be under 1000 characters." - ) - - print(f"Generating real-time audio for: '{text[:50]}{'...' 
if len(text) > 50 else ''}'") - try: - audio_tensor = tts.infer(text) - print("Playing audio...") - play_audio(audio_tensor) - print("✓ Playback finished.") - except Exception as e: - print(f"✗ Error during playback: {e}") - - elif choice == '3': - import os - output_dir = "audio_output" - if os.path.exists(output_dir): - files = [f for f in os.listdir(output_dir) if f.lower().endswith('.wav')] - if files: - print(f"Found {len(files)} audio file(s) in {output_dir}/:") - for i, file in enumerate(sorted(files), 1): - print(f" {i}. {file}") - else: - print(f"No audio files found in {output_dir}/") - else: - print(f"No {output_dir}/ directory exists yet.") - - elif choice == '4': - print("Thank you for using Soprano TTS. Goodbye!") - break - - else: - print("✗ Invalid choice. Please enter 1, 2, 3, or 4.") - -if __name__ == "__main__": - main() \ No newline at end of file diff --git a/CLI/start_cli.bat b/CLI/start_cli.bat deleted file mode 100644 index c7f26fe..0000000 --- a/CLI/start_cli.bat +++ /dev/null @@ -1 +0,0 @@ -python soprano_cli.py \ No newline at end of file diff --git a/pyproject.toml b/pyproject.toml index 7cfa334..0a9b483 100644 --- a/pyproject.toml +++ b/pyproject.toml @@ -26,7 +26,8 @@ dependencies = [ "inflect", "sounddevice", "uvicorn", - "gradio" + "gradio", + "pyaudio" ] license = {text = "MIT"} @@ -35,9 +36,9 @@ Homepage = "https://github.com/ekwek1/soprano" Issues = "https://github.com/ekwek1/soprano/issues" [project.scripts] -soprano = "CLI.soprano_cli:main" +soprano = "soprano.soprano_cli:main" soprano-webui = "soprano.webui:main" [tool.setuptools.packages.find] where = ["."] -include = ["soprano*", "CLI*"] +include = ["soprano*"] \ No newline at end of file diff --git a/setup.bat b/setup.bat index f4aed6d..6dd4c07 100644 --- a/setup.bat +++ b/setup.bat @@ -14,6 +14,17 @@ echo. echo Press any key to continue or Ctrl+C to cancel... pause >nul +echo. +echo Installing required dependencies... +echo. 
+ +REM Install all required packages +pip install fastapi huggingface_hub lmdeploy numpy scipy unidecode inflect sounddevice uvicorn gradio pyaudio + +if %errorlevel% neq 0 ( + echo Error occurred during dependency installation. Attempting to continue... +) + echo. echo Installing Soprano TTS package... echo. @@ -34,8 +45,13 @@ echo. REM Uninstall current PyTorch pip uninstall -y torch torchvision torchaudio -REM Install PyTorch with CUDA 12.6 support -pip install torch==2.8.0 torchvision==0.23.0 torchaudio==2.8.0 --index-url https://download.pytorch.org/whl/cu126 +REM Install PyTorch with CUDA 12.8 support +pip install torch==2.8.0 torchvision==0.23.0 torchaudio==2.8.0 --index-url https://download.pytorch.org/whl/cu128 + +if %errorlevel% neq 0 ( + echo Warning: PyTorch CUDA 12.8 installation failed. Installing CUDA 12.6 version... + pip install torch==2.8.0 torchvision==0.23.0 torchaudio==2.8.0 --index-url https://download.pytorch.org/whl/cu126 +) if %errorlevel% neq 0 ( echo Warning: PyTorch CUDA installation failed. Installing CPU version... @@ -58,7 +74,7 @@ echo ================================================ echo Setup completed! echo. echo To use Soprano TTS: -echo - Run Soprano.bat to access the main menu +echo - Run start_soprano.bat to access the main menu echo - Or run individual components as needed echo ================================================ diff --git a/soprano/__init__.py b/soprano/__init__.py index feadb53..01940e8 100644 --- a/soprano/__init__.py +++ b/soprano/__init__.py @@ -1 +1,12 @@ -from .tts import SopranoTTS \ No newline at end of file +""" +Soprano TTS - Ultra-realistic Text-to-Speech System + +This package provides high-quality text-to-speech functionality with both +REST API and WebSocket streaming capabilities. 
+""" + +from .tts import SopranoTTS + +__version__ = "0.0.2" +__author__ = "ekwek1" +__all__ = ["SopranoTTS"] \ No newline at end of file diff --git a/soprano/backends/transformers.py b/soprano/backends/transformers.py index b85c49e..ec42341 100644 --- a/soprano/backends/transformers.py +++ b/soprano/backends/transformers.py @@ -69,4 +69,45 @@ def stream_infer(self, top_p=0.95, temperature=0.3, repetition_penalty=1.2): - raise NotImplementedError("transformers backend does not currently support streaming, please consider using lmdeploy backend instead.") + # For transformers backend, simulate streaming by returning all results at once + # This is a workaround to provide basic streaming functionality + inputs = self.tokenizer( + [prompt], + return_tensors='pt', + padding=True, + truncation=True, + max_length=512, + ).to(self.device) + + with torch.no_grad(): + outputs = self.model.generate( + input_ids=inputs['input_ids'], + attention_mask=inputs['attention_mask'], + max_new_tokens=512, + do_sample=True, + top_p=top_p, + temperature=temperature, + repetition_penalty=repetition_penalty, + pad_token_id=self.tokenizer.pad_token_id, + return_dict_in_generate=True, + output_hidden_states=True, + ) + + eos_token_id = self.model.config.eos_token_id + seq = outputs.sequences[0] + hidden_states = [] + num_output_tokens = len(outputs.hidden_states) + + for j in range(num_output_tokens): + token = seq[j + seq.size(0) - num_output_tokens] + if token != eos_token_id: + hidden_state = outputs.hidden_states[j][-1][0, -1, :] + hidden_states.append(hidden_state) + + # Yield all hidden states as a single "stream" + for i, hidden_state in enumerate(hidden_states): + finish_reason = 'stop' if i == len(hidden_states) - 1 and seq[-1].item() == eos_token_id else None + yield { + 'finish_reason': finish_reason, + 'hidden_state': hidden_state + } diff --git a/soprano/server.py b/soprano/server.py new file mode 100644 index 0000000..035ac9d --- /dev/null +++ b/soprano/server.py @@ -0,0 
+1,139 @@ +""" +Soprano TTS Server +Menu-driven interface to launch different server options. +""" +import sys +import os +import subprocess + + +def display_menu() -> None: + """Display the main menu options""" + print("\n" + "="*60) + print(" SOPRANO TTS SERVER MENU") + print("="*60) + print("Select an option:") + print() + print("1. Start API Server") + print(" OpenAI-compatible API for workflow integration") + print(" Accessible at http://localhost:8000/v1/audio/speech") + print() + print("2. Test API Server") + print(" Test client for the OpenAI-compatible API") + print(" Requires API server to be running") + print() + print("3. Start WebSocket Server") + print(" Real-time audio streaming for interactive applications") + print(" Available at ws://localhost:8001/ws/tts") + print() + print("4. Test WebSocket Server") + print(" Test client for real-time audio streaming") + print(" Requires WebSocket server to be running") + print() + print("5. Start WebUI") + print(" Gradio web interface for Soprano TTS") + print(" Opens browser with interactive UI") + print() + print("6. Start CLI") + print(" Command-line interface for Soprano TTS") + print(" Interactive menu for text synthesis") + print() + print("7. Exit") + print("="*60) + + +def get_user_choice() -> str: + """Get and validate user choice from the menu""" + while True: + try: + choice = input("Enter your choice (1-7): ").strip() + if choice in ['1', '2', '3', '4', '5', '6', '7']: + return choice + else: + print("Invalid choice. Please enter 1, 2, 3, 4, 5, 6, or 7.") + except (KeyboardInterrupt, EOFError): + print("\n\nSoprano TTS Server menu interrupted. 
Goodbye!") + sys.exit(0) + + +def main_menu() -> None: + """Display the main menu and handle user selection""" + # Get the root directory (where this script is located) + root_dir = os.path.dirname(os.path.dirname(os.path.abspath(__file__))) + + while True: + display_menu() + + choice = get_user_choice() + + if choice == '1': + # Open new cmdline in root, cd to soprano/server, run python api.py + cmd = f'start cmd /k "cd /d {root_dir} && cd soprano && cd server && python api.py"' + subprocess.run(cmd, shell=True) + print("API server started in new terminal. This menu will now close.") + sys.exit(0) + elif choice == '2': + # Open new cmdline in root, cd to soprano/server, run python api.py and test_api.py in separate terminals + cmd1 = f'start cmd /k "cd /d {root_dir} && cd soprano && cd server && python api.py"' + cmd2 = f'start cmd /k "cd /d {root_dir} && cd soprano && cd server && ping 127.0.0.1 -n 11 > nul && python test_api.py"' + subprocess.run(cmd1, shell=True) + subprocess.run(cmd2, shell=True) + print("API server and test client started in new terminals. This menu will now close.") + sys.exit(0) + elif choice == '3': + # Open new cmdline in root, cd to soprano/server, run python websocket.py + cmd = f'start cmd /k "cd /d {root_dir} && cd soprano && cd server && python websocket.py"' + subprocess.run(cmd, shell=True) + print("WebSocket server started in new terminal. This menu will now close.") + sys.exit(0) + elif choice == '4': + # Open new cmdline in root, cd to soprano/server, run python websocket.py and test_websocket.py in separate terminals + cmd1 = f'start cmd /k "cd /d {root_dir} && cd soprano && cd server && python websocket.py"' + cmd2 = f'start cmd /k "cd /d {root_dir} && cd soprano && cd server && ping 127.0.0.1 -n 11 > nul && python test_websocket.py"' + subprocess.run(cmd1, shell=True) + subprocess.run(cmd2, shell=True) + print("WebSocket server and test client started in new terminals. 
This menu will now close.") + sys.exit(0) + elif choice == '5': + # Open new cmdline in root, cd to soprano, run python webui.py + cmd = f'start cmd /k "cd /d {root_dir} && cd soprano && python webui.py"' + subprocess.run(cmd, shell=True) + print("WebUI started in new terminal. This menu will now close.") + sys.exit(0) + elif choice == '6': + # Open new cmdline in root, cd to soprano, run python soprano_cli.py + cmd = f'start cmd /k "cd /d {root_dir} && cd soprano && python soprano_cli.py"' + subprocess.run(cmd, shell=True) + print("CLI started in new terminal. This menu will now close.") + sys.exit(0) + elif choice == '7': + print("Thank you for using Soprano TTS. Goodbye!") + sys.exit(0) + + +def main() -> None: + """ + Main entry point for the server module. + Initializes device detection and starts the main menu. + """ + try: + # Check available device + try: + import torch + device_info = f"Available device: {'CUDA (GPU)' if torch.cuda.is_available() else 'CPU'}" + print(device_info) + except ImportError: + print("Available device: CPU (PyTorch not available)") + + # Start the main menu + main_menu() + except KeyboardInterrupt: + print("\n\nSoprano TTS Server interrupted. 
Goodbye!") + sys.exit(0) + except Exception as e: + print(f"An unexpected error occurred in main: {e}") + sys.exit(1) + + +if __name__ == "__main__": + main() \ No newline at end of file diff --git a/soprano/server/api.py b/soprano/server/api.py index 909bb1a..c0c6df8 100644 --- a/soprano/server/api.py +++ b/soprano/server/api.py @@ -12,7 +12,15 @@ import torch from contextlib import asynccontextmanager -from soprano.tts import SopranoTTS +# Handle import when running from within the server directory +try: + from soprano.tts import SopranoTTS +except ImportError: + import sys + import os + # Add the parent directory to the Python path to resolve import issues + sys.path.insert(0, os.path.dirname(os.path.dirname(os.path.abspath(__file__)))) + from soprano.tts import SopranoTTS # Set up logging @@ -138,6 +146,15 @@ async def initialize_model(self): # Use retry mechanism for model loading def load_model(): + # Import here in case it's needed in the executor + try: + from soprano.tts import SopranoTTS + except ImportError: + import sys + import os + # Add the parent directory to the Python path to resolve import issues + sys.path.insert(0, os.path.dirname(os.path.dirname(os.path.dirname(os.path.abspath(__file__))))) + from soprano.tts import SopranoTTS return SopranoTTS( cache_size_mb=100, device=self.device @@ -351,7 +368,7 @@ async def health_check(): # Start the server print("Server starting on http://localhost:8000") uvicorn.run( - "soprano.server.api:app", + app, host="localhost", port=8000, reload=False diff --git a/soprano/server/websocket.py b/soprano/server/websocket.py index 5bb0723..45883e1 100644 --- a/soprano/server/websocket.py +++ b/soprano/server/websocket.py @@ -9,7 +9,15 @@ from websockets.exceptions import ConnectionClosedOK, ConnectionClosedError from asyncio import Queue, QueueEmpty -from soprano.tts import SopranoTTS +# Handle import when running from within the server directory +try: + from soprano.tts import SopranoTTS +except ImportError: + 
import sys + import os + # Add the parent directory to the Python path to resolve import issues + sys.path.insert(0, os.path.dirname(os.path.dirname(os.path.abspath(__file__)))) + from soprano.tts import SopranoTTS # Set up logging @@ -56,6 +64,15 @@ async def initialize_model(self): loop = asyncio.get_running_loop() # Use get_running_loop instead of get_event_loop def load_model(): + # Import here in case it's needed in the executor + try: + from soprano.tts import SopranoTTS + except ImportError: + import sys + import os + # Add the parent directory to the Python path to resolve import issues + sys.path.insert(0, os.path.dirname(os.path.dirname(os.path.dirname(os.path.abspath(__file__))))) + from soprano.tts import SopranoTTS return SopranoTTS( cache_size_mb=100, device=self.device @@ -80,6 +97,23 @@ async def stream_audio_with_backpressure(self, websocket: WebSocket, text: str, Stream audio in real-time from the TTS model with backpressure handling. Uses a queue to decouple TTS generation from WebSocket sending. """ + # Check if streaming is supported (only available on GPU with LMDeploy backend) + try: + tts = self.get_model() + # Check if we're using transformers backend which doesn't support streaming + from soprano.backends.transformers import TransformersModel + if isinstance(tts.pipeline, TransformersModel): + # Send error message to client + await websocket.send_text(json.dumps({ + "type": "error", + "message": "Real-time streaming is not supported on CPU. Only generate speech is supported for CPU." 
+ })) + logger.warning("Streaming requested but not supported on CPU") + return + except Exception as e: + logger.warning(f"Could not determine backend type: {e}") + # Continue with original logic if there's an issue checking the backend + # Create a queue to decouple generation from sending audio_queue = Queue(maxsize=10) # Limit queue size to prevent memory buildup @@ -120,6 +154,20 @@ async def producer(): # Put None to signal end of stream await audio_queue.put(None) + except NotImplementedError as e: + logger.error(f"Streaming not supported: {str(e)}") + # Send error message to client + try: + await websocket.send_text(json.dumps({ + "type": "error", + "message": "Real-time streaming is not supported on CPU. Only generate speech is supported for CPU." + })) + except: + pass + try: + await audio_queue.put(None) # Signal error to consumer + except: + pass except Exception as e: logger.error(f"Error in audio producer: {str(e)}", exc_info=True) try: @@ -186,11 +234,11 @@ async def stream_audio(self, websocket: WebSocket, text: str, min_text_length: i await websocket.send_text(json.dumps(metadata)) # Use the backpressure-aware streaming method + # This will return early if streaming is not supported await self.stream_audio_with_backpressure(websocket, text, min_text_length) - # Send end signal - await websocket.send_text(json.dumps({"type": "end"})) - logger.info("Streaming completed successfully") + # Only send end signal if streaming was not terminated early due to unsupported backend + # The function will return before reaching here if streaming is not supported except Exception as e: logger.error(f"Error during streaming: {str(e)}", exc_info=True) @@ -331,7 +379,7 @@ async def websocket_endpoint(websocket: WebSocket): # Start the server print("WebSocket server starting on ws://localhost:8001/ws/tts") uvicorn.run( - "soprano.server.websocket:app", + app, host="localhost", port=8001, # Using port 8001 to avoid conflict with the regular API reload=False diff --git 
a/soprano/soprano_cli.py b/soprano/soprano_cli.py index 208c87d..38a8f64 100644 --- a/soprano/soprano_cli.py +++ b/soprano/soprano_cli.py @@ -1,38 +1,186 @@ -#!/usr/bin/env python3 """ Soprano TTS Command Line Interface """ import argparse +import sys import os +import torch +# Add the parent directory to the Python path to resolve import issues when running directly +sys.path.insert(0, os.path.dirname(os.path.dirname(os.path.abspath(__file__)))) from soprano import SopranoTTS +try: + import sounddevice as sd + SOUNDDEVICE_AVAILABLE = True +except ImportError: + SOUNDDEVICE_AVAILABLE = False + +def get_device(): + """Determine the best available device (CUDA if available, otherwise CPU)""" + return 'cuda' if torch.cuda.is_available() else 'cpu' + +def play_audio(audio_tensor): + """Play audio tensor using sounddevice""" + if not SOUNDDEVICE_AVAILABLE: + print("Error: sounddevice library not available. Install it with 'pip install sounddevice'") + return + + import numpy as np + audio_np = audio_tensor.cpu().numpy() if isinstance(audio_tensor, torch.Tensor) else audio_tensor + + duration = len(audio_np) / 32000 + print(f"Playing audio ({duration:.2f}s)...") + + sample_rate = 32000 + sd.play(audio_np, samplerate=sample_rate) + + import time + time.sleep(duration + 0.5) + + try: + if sd.get_status().playing: + sd.wait() + except: + time.sleep(0.5) + +def validate_text(text): + """Validate input text""" + stripped_text = text.strip() if text else "" + if not stripped_text: + print("Error: Text cannot be empty.") + return False + if len(stripped_text) > 1000: + print("Error: Text is too long (max 1000 characters).") + return False + return True + +def get_validated_input(prompt, validator_func, error_msg=None): + """Get validated input from user""" + while True: + user_input = input(prompt).strip() + if validator_func(user_input): + return user_input + else: + if error_msg: + print(error_msg) + else: + print("Invalid input, please try again.") + +def 
get_next_filename(base_name="output_audio", ext=".wav"): + """Generate next available filename with incremental numbering""" + import os + + output_dir = "audio_output" + if not os.path.exists(output_dir): + os.makedirs(output_dir) + + counter = 0 + while True: + if counter == 0: + filename = f"{base_name}{ext}" + else: + filename = f"{base_name}{counter}{ext}" + + full_path = os.path.join(output_dir, filename) + if not os.path.exists(full_path): + return full_path + counter += 1 + def main(): parser = argparse.ArgumentParser(description='Soprano Text-to-Speech CLI') - parser.add_argument('text', help='Text to synthesize') - parser.add_argument('--output', '-o', default='output.wav', help='Output audio file path') parser.add_argument('--model-path', '-m', help='Path to local model directory (optional)') - parser.add_argument('--device', '-d', default='cpu', choices=['cuda', 'cpu'], - help='Device to use for inference') - parser.add_argument('--backend', '-b', default='auto', + parser.add_argument('--backend', '-b', default='auto', choices=['auto', 'transformers', 'lmdeploy'], help='Backend to use for inference') - parser.add_argument('--cache-size', '-c', type=int, default=100, + parser.add_argument('--cache-size', '-c', type=int, default=10, help='Cache size in MB (for lmdeploy backend)') - + args = parser.parse_args() - - # Initialize TTS - tts = SopranoTTS( - backend=args.backend, - device=args.device, - cache_size_mb=args.cache_size, - model_path=args.model_path - ) - - # Generate speech - print(f"Generating speech for: '{args.text}'") - tts.infer(args.text, out_path=args.output) - print(f"Audio saved to: {args.output}") + + device = get_device() + + try: + import io + import contextlib + + with contextlib.redirect_stdout(io.StringIO()), contextlib.redirect_stderr(io.StringIO()): + tts = SopranoTTS( + backend=args.backend, + device=device, + cache_size_mb=args.cache_size, + model_path=args.model_path + ) + except Exception as e: + print(f"Error initializing 
model: {e}") + sys.exit(1) + + print("Soprano TTS is ready. Starting interactive menu...") + + while True: + print("\n" + "="*50) + print(" SOPRANO TTS MENU") + print("="*50) + print("1. Input text for synthesis (with file saving)") + print("2. Real-time audio playback (no file saving)") + print("3. View saved audio files") + print("4. Exit") + print("="*50) + + choice = input("Enter your choice (1-4): ").strip() + + if choice == '1': + text = get_validated_input( + "Enter text to synthesize: ", + validate_text, + "Text must not be empty and must be under 1000 characters." + ) + + output_path = get_next_filename() + print(f"Using output path: {output_path}") + + print(f"Generating speech for: '{text[:50]}{'...' if len(text) > 50 else ''}'") + try: + tts.infer(text, out_path=output_path) + print(f"✓ Audio saved to: {output_path}") + except Exception as e: + print(f"✗ Error generating audio: {e}") + + elif choice == '2': + text = get_validated_input( + "Enter text for real-time playback: ", + validate_text, + "Text must not be empty and must be under 1000 characters." + ) + + print(f"Generating real-time audio for: '{text[:50]}{'...' if len(text) > 50 else ''}'") + try: + audio_tensor = tts.infer(text) + print("Playing audio...") + play_audio(audio_tensor) + print("✓ Playback finished.") + except Exception as e: + print(f"✗ Error during playback: {e}") + + elif choice == '3': + import os + output_dir = "audio_output" + if os.path.exists(output_dir): + files = [f for f in os.listdir(output_dir) if f.lower().endswith('.wav')] + if files: + print(f"Found {len(files)} audio file(s) in {output_dir}/:") + for i, file in enumerate(sorted(files), 1): + print(f" {i}. {file}") + else: + print(f"No audio files found in {output_dir}/") + else: + print(f"No {output_dir}/ directory exists yet.") + + elif choice == '4': + print("Thank you for using Soprano TTS. Goodbye!") + break + + else: + print("✗ Invalid choice. 
Please enter 1, 2, 3, or 4.") if __name__ == "__main__": main() \ No newline at end of file diff --git a/soprano/tts.py b/soprano/tts.py index 39b7c30..8d15011 100644 --- a/soprano/tts.py +++ b/soprano/tts.py @@ -42,11 +42,14 @@ def __init__(self, self.decoder = SopranoDecoder() if device == 'cuda': self.decoder = self.decoder.cuda() + map_location = 'cuda' + else: + map_location = 'cpu' if model_path: decoder_path = os.path.join(model_path, 'decoder.pth') else: decoder_path = hf_hub_download(repo_id='ekwek/Soprano-80M', filename='decoder.pth') - self.decoder.load_state_dict(torch.load(decoder_path)) + self.decoder.load_state_dict(torch.load(decoder_path, map_location=map_location)) self.decoder_batch_size=decoder_batch_size self.RECEPTIVE_FIELD = 4 # Decoder receptive field self.TOKEN_SIZE = 2048 # Number of samples per audio token diff --git a/soprano/webui.py b/soprano/webui.py index ab7d5eb..5364ec8 100644 --- a/soprano/webui.py +++ b/soprano/webui.py @@ -1,99 +1,382 @@ #!/usr/bin/env python3 """ -Soprano TTS Web UI +Gradio Web Interface for Soprano TTS """ + import gradio as gr import torch +import sys +import os +# Add the parent directory to the Python path to resolve import issues when running directly +sys.path.insert(0, os.path.dirname(os.path.dirname(os.path.abspath(__file__)))) from soprano import SopranoTTS +import numpy as np +import socket +import time +import threading + +# Try to import pyaudio, handle if not available +try: + import pyaudio + PYAUDIO_AVAILABLE = True +except ImportError: + PYAUDIO_AVAILABLE = False + print("PyAudio not found. 
Install it with 'pip install pyaudio' for real-time audio streaming.") + +# Global variables for PyAudio management +current_stream = None +current_pyaudio_instance = None +stream_lock = threading.Lock() + +# Detect device +DEVICE = "cuda" if torch.cuda.is_available() else "cpu" + +# Initialize model +print("Loading Soprano TTS model...") +model = SopranoTTS( + backend="auto", + device=DEVICE, + cache_size_mb=100, + decoder_batch_size=1, +) +print("Model loaded successfully!") +SAMPLE_RATE = 32000 -def create_app(): - # Initialize the TTS model - device = 'cuda' if torch.cuda.is_available() else 'cpu' - tts = SopranoTTS(device=device, cache_size_mb=100) - - def synthesize(text, temperature, top_p, repetition_penalty): - if not text.strip(): - return "Error: Text cannot be empty", None - - # Generate audio - audio_tensor = tts.infer( - text=text, + +async def generate_speech( + text: str, + temperature: float, + top_p: float, + repetition_penalty: float, +) -> tuple: + if not text.strip(): + return None, "Please enter some text to generate speech." 
+ + try: + start_time = time.perf_counter() + + audio = model.infer( + text, temperature=temperature, top_p=top_p, - repetition_penalty=repetition_penalty - ) - - # Convert tensor to numpy for Gradio - audio_np = audio_tensor.cpu().numpy() - - # Return the audio as a tuple (sample_rate, audio_data) - return "Audio generated successfully!", (32000, audio_np) - - # Create Gradio interface - with gr.Blocks(title="Soprano TTS Web UI") as demo: - gr.Markdown("# 🎵 Soprano TTS Web UI") - gr.Markdown("Convert text to realistic speech using Soprano TTS") - - with gr.Row(): - with gr.Column(): - text_input = gr.TextArea(label="Input Text", placeholder="Enter text to synthesize...", elem_id="text_input") - - with gr.Group(): - temperature = gr.Slider( - minimum=0.1, - maximum=1.5, - value=0.3, - step=0.05, - label="Temperature", - ) - - top_p = gr.Slider( - minimum=0.5, - maximum=1.0, - value=0.95, - step=0.05, - label="Top P", - ) - - repetition_penalty = gr.Slider( - minimum=1.0, - maximum=2.0, - value=1.2, - step=0.1, - label="Repetition Penalty", - ) - - generate_btn = gr.Button("Generate Speech", variant="primary") - - with gr.Column(): - status_output = gr.Textbox(label="Status", interactive=False) - audio_output = gr.Audio(label="Generated Speech", type="numpy") - - generate_btn.click( - fn=synthesize, - inputs=[text_input, temperature, top_p, repetition_penalty], - outputs=[status_output, audio_output] + repetition_penalty=repetition_penalty, ) - - gr.Examples( - examples=[ - ["Hello, welcome to Soprano TTS. 
This is a demonstration of the web interface."], - ["The quick brown fox jumps over the lazy dog."], - ["Soprano is an extremely lightweight text to speech model that can achieve high quality audio synthesis."], - ], - inputs=[text_input], + + gen_time = time.perf_counter() - start_time + + audio_np = audio.cpu().numpy() + audio_int16 = (audio_np * 32767).astype(np.int16) + + audio_seconds = len(audio_np) / SAMPLE_RATE + rtf = audio_seconds / gen_time if gen_time > 0 else float("inf") + + status = ( + f"✓ Generated {audio_seconds:.2f} s audio | " + f"Generation time: {gen_time:.3f} s " + f"({rtf:.2f}x realtime)" ) - - return demo + return (SAMPLE_RATE, audio_int16), status -def main(): - app = create_app() - print("Starting Soprano TTS Web UI...") - print("Visit http://localhost:7860 to access the interface") - app.launch(server_name="localhost", server_port=7860) + except Exception as e: + return None, f"✗ Error: {str(e)}" + + +async def speak_realtime( + text: str, + temperature: float, + top_p: float, + repetition_penalty: float, +) -> str: + if not text.strip(): + return "Please enter some text to speak." + + if not PYAUDIO_AVAILABLE: + return "PyAudio is not available. Install it with 'pip install pyaudio' for real-time audio streaming." 
+ + # Use the lock to prevent concurrent access to the audio stream + with stream_lock: + global current_stream, current_pyaudio_instance + + # Check if there's already an active stream + if current_stream is not None: + try: + current_stream.stop_stream() + current_stream.close() + except: + pass # Stream might already be closed + + if current_pyaudio_instance is not None: + try: + current_pyaudio_instance.terminate() + except: + pass # Instance might already be terminated + + try: + # Initialize PyAudio + p = pyaudio.PyAudio() + current_pyaudio_instance = p + + # Open stream + stream = p.open( + format=pyaudio.paInt16, + channels=1, + rate=SAMPLE_RATE, + output=True + ) + current_stream = stream + + # Start streaming inference + start_time = time.perf_counter() + + # Use the streaming inference method from the model + stream_gen = model.infer_stream( + text, + chunk_size=1, + temperature=temperature, + top_p=top_p, + repetition_penalty=repetition_penalty, + ) + + total_samples = 0 + + # Process audio chunks in real-time + for audio_chunk in stream_gen: + # Check if stream is still active + if current_stream is None or not current_stream.is_active(): + break + + # Convert tensor to numpy array + audio_np = audio_chunk.cpu().numpy() + + # Ensure values are in the range [-1, 1] and convert to int16 + audio_np = np.clip(audio_np, -1.0, 1.0) + audio_int16 = (audio_np * 32767).astype(np.int16) + + # Play the audio chunk directly + stream.write(audio_int16.tobytes()) + total_samples += len(audio_int16) + + # Close stream and terminate PyAudio + stream.stop_stream() + stream.close() + p.terminate() + + # Reset globals after successful playback + current_stream = None + current_pyaudio_instance = None + + gen_time = time.perf_counter() - start_time + audio_seconds = total_samples / SAMPLE_RATE + rtf = audio_seconds / gen_time if gen_time > 0 else float("inf") + + status = ( + f"✓ Finished speaking {audio_seconds:.2f} s audio | " + f"Playback time: {gen_time:.3f} s " + 
f"({rtf:.2f}x realtime)" + ) + + return status + + except Exception as e: + # Ensure cleanup in case of error + if current_stream: + try: + current_stream.stop_stream() + current_stream.close() + except: + pass + current_stream = None + if current_pyaudio_instance: + try: + current_pyaudio_instance.terminate() + except: + pass + current_pyaudio_instance = None + + return f"✗ Error during real-time playback: {str(e)}" + + +# Create Gradio interface +with gr.Blocks(title="Soprano TTS") as demo: + + # State variable to track which function is active + active_function = gr.State(value=None) # Can be "generate", "speak", or None + + gr.Markdown( + f""" +# 🎵 Soprano TTS + +**Running on: {DEVICE.upper()}** + +Soprano is an ultra-lightweight, open-source text-to-speech (TTS) model designed for real-time, +high-fidelity speech synthesis at unprecedented speed. Soprano can achieve **<15 ms streaming latency** +and up to **2000x real-time generation**, all while being easy to deploy at **<1 GB VRAM usage**. + +
+ + +""" + ) + + with gr.Row(): + with gr.Column(scale=2): + text_input = gr.Textbox( + label="Text to Synthesize", + placeholder="Enter text here...", + value="Soprano is an extremely lightweight text to speech model designed to produce highly realistic speech at unprecedented speed.", + lines=5, + max_lines=10, + ) + + with gr.Accordion("Advanced Settings", open=False): + temperature = gr.Slider( + minimum=0.1, + maximum=1.5, + value=0.3, + step=0.05, + label="Temperature", + ) + + top_p = gr.Slider( + minimum=0.5, + maximum=1.0, + value=0.95, + step=0.05, + label="Top P", + ) + + repetition_penalty = gr.Slider( + minimum=1.0, + maximum=2.0, + value=1.2, + step=0.1, + label="Repetition Penalty", + ) + + with gr.Row(): + generate_btn = gr.Button("Generate Speech", variant="primary", size="lg") + speak_btn = gr.Button("Speak", variant="primary", size="lg") + clear_btn = gr.Button("Clear", variant="secondary", size="lg") + + with gr.Column(scale=1): + audio_output = gr.Audio( + label="Generated Speech", + type="numpy", + autoplay=True, + streaming=True + ) + + status_output = gr.Textbox( + label="Status", + interactive=False, + lines=3, + max_lines=10 + ) + + gr.Examples( + examples=[ + ["Soprano is an extremely lightweight text to speech model designed to produce highly realistic speech at unprecedented speed."], + ["Hello! Welcome to Soprano text to speech. This is a short example."], + ["The quick brown fox jumps over the lazy dog. This sentence contains all letters of the alphabet."], + ["Artificial intelligence is transforming the world in ways we never imagined. It's revolutionizing industries and changing how we interact with technology."], + ["In a distant future, humanity has colonized the stars. Advanced AI systems govern interstellar travel, ensuring safety and efficiency across vast cosmic distances. 
Explorers venture into uncharted territories, seeking new worlds and civilizations."], + ["To be, or not to be, that is the question: Whether 'tis nobler in the mind to suffer The slings and arrows of outrageous fortune, Or to take arms against a sea of troubles And by opposing end them. To die—to sleep, no more; and by a sleep to say we end The heart-ache and the thousand natural shocks That flesh is heir to: 'tis a consummation Devoutly to be wish'd. To die, to sleep; To sleep, perchance to dream—ay, there's the rub: For in that sleep of death what dreams may come, When we have shuffled off this mortal coil, Must give us pause—there's the respect That makes calamity of so long life."], + ], + inputs=[text_input], + label="Examples", + ) + + async def check_and_set_active_generate(active_func, *args): + if active_func is not None: + return None, f"Error: Please press Clear first. Current operation: {active_func}", active_func + # Call the actual generate function + result = await generate_speech(args[0], args[1], args[2], args[3]) + return result[0], result[1], "generate" + + async def check_and_set_active_speak(active_func, *args): + if active_func is not None: + return f"Error: Please press Clear first. 
Current operation: {active_func}", active_func + # Call the actual speak function + result = await speak_realtime(args[0], args[1], args[2], args[3]) + return result, "speak" + + def clear_active_state(): + return None + + generate_btn.click( + fn=check_and_set_active_generate, + inputs=[active_function, text_input, temperature, top_p, repetition_penalty], + outputs=[audio_output, status_output, active_function] + ) + + speak_btn.click( + fn=check_and_set_active_speak, + inputs=[active_function, text_input, temperature, top_p, repetition_penalty], + outputs=[status_output, active_function] + ) + + def clear_inputs(active_func): + # Reset the active function state + return "", None, "Ready for input...", None + + clear_btn.click( + fn=clear_inputs, + inputs=[active_function], + outputs=[text_input, audio_output, status_output, active_function] + ) + + gr.Markdown( + """ + +
+ +### Usage tips: + +- Soprano works best when each sentence is between 2 and 15 seconds long. +- Although Soprano recognizes numbers and some special characters, it occasionally mispronounces them. + Best results can be achieved by converting these into their phonetic form. + (1+1 -> one plus one, etc) +- If Soprano produces unsatisfactory results, you can easily regenerate it for a new, potentially better generation. + You may also change the sampling settings for more varied results. +- Avoid improper grammar such as not using contractions, multiple spaces, etc. +""" + ) + + +def find_free_port(start_port=7860, max_tries=100): + for port in range(start_port, start_port + max_tries): + try: + with socket.socket(socket.AF_INET, socket.SOCK_STREAM) as s: + s.bind(("", port)) + return port + except OSError: + continue + raise OSError("Could not find a free port") + +def main(): + port = find_free_port(7860) + print(f"Starting Gradio interface on port {port}") + demo.queue(max_size=20).launch( + server_name="127.0.0.1", + server_port=port, + share=False, + theme=gr.themes.Soft(primary_hue="green"), + css=""" +a { + color: var(--primary-600); +} +a:hover { + color: var(--primary-700); +} +""" + ) if __name__ == "__main__": main() \ No newline at end of file diff --git a/start_soprano.bat b/start_soprano.bat new file mode 100644 index 0000000..e471e75 --- /dev/null +++ b/start_soprano.bat @@ -0,0 +1,3 @@ +@echo off +cd soprano +python server.py \ No newline at end of file From fe728d334e6dbabd8bf47fafbd87cf54054571f2 Mon Sep 17 00:00:00 2001 From: Biswas Poudel <142727455+biswas445@users.noreply.github.com> Date: Sun, 11 Jan 2026 18:54:24 +0545 Subject: [PATCH 10/27] Refactor Soprano TTS web UI with asyncio and error handling Refactor web UI for Soprano TTS with asyncio support and improved error handling. Enhance audio streaming and Gradio interface. 
--- soprano/webui.py | 555 +++++++++++++++++++++++++++++------------------ 1 file changed, 346 insertions(+), 209 deletions(-) diff --git a/soprano/webui.py b/soprano/webui.py index 5364ec8..29fa896 100644 --- a/soprano/webui.py +++ b/soprano/webui.py @@ -7,15 +7,43 @@ import torch import sys import os -# Add the parent directory to the Python path to resolve import issues when running directly -sys.path.insert(0, os.path.dirname(os.path.dirname(os.path.abspath(__file__)))) -from soprano import SopranoTTS +import asyncio +import logging import numpy as np import socket import time import threading +import traceback + +sys.path.insert(0, os.path.dirname(os.path.dirname(os.path.abspath(__file__)))) + +from soprano import SopranoTTS + +logging.getLogger('asyncio').setLevel(logging.CRITICAL) + +def custom_exception_handler(loop, context): + exception = context.get('exception') + + if exception and not (isinstance(exception, ConnectionResetError) and "forcibly closed" in str(exception)): + print(f"AsyncIO Exception: {context.get('message')}") + exc = context.get('exception') + if exc: + traceback.print_exception(type(exc), exc, exc.__traceback__) + + if exception and isinstance(exception, ConnectionResetError) and "forcibly closed" in str(exception): + pass + else: + loop.default_exception_handler(context) + +if sys.platform.startswith("win"): + try: + loop = asyncio.get_running_loop() + loop.set_exception_handler(custom_exception_handler) + except RuntimeError: + loop = asyncio.new_event_loop() + loop.set_exception_handler(custom_exception_handler) + asyncio.set_event_loop(loop) -# Try to import pyaudio, handle if not available try: import pyaudio PYAUDIO_AVAILABLE = True @@ -23,15 +51,13 @@ PYAUDIO_AVAILABLE = False print("PyAudio not found. 
Install it with 'pip install pyaudio' for real-time audio streaming.") -# Global variables for PyAudio management current_stream = None current_pyaudio_instance = None stream_lock = threading.Lock() -# Detect device DEVICE = "cuda" if torch.cuda.is_available() else "cpu" +SAMPLE_RATE = 32000 -# Initialize model print("Loading Soprano TTS model...") model = SopranoTTS( backend="auto", @@ -41,8 +67,6 @@ ) print("Model loaded successfully!") -SAMPLE_RATE = 32000 - async def generate_speech( text: str, @@ -66,6 +90,14 @@ async def generate_speech( gen_time = time.perf_counter() - start_time audio_np = audio.cpu().numpy() + + if audio_np.size == 0: + return None, "✗ Error: Generated audio is empty." + + max_val = np.max(np.abs(audio_np)) + if max_val > 1.0: + audio_np = audio_np / max_val + audio_int16 = (audio_np * 32767).astype(np.int16) audio_seconds = len(audio_np) / SAMPLE_RATE @@ -79,58 +111,108 @@ async def generate_speech( return (SAMPLE_RATE, audio_int16), status + except ConnectionResetError: + return None, "✗ Connection error during generation. Please try again." 
except Exception as e: return None, f"✗ Error: {str(e)}" +class AudioStreamer: + + def __init__(self): + self.stream = None + self.pyaudio_instance = None + self.is_playing = False + + def cleanup(self): + if self.stream: + try: + if hasattr(self.stream, 'is_active') and self.stream.is_active(): + self.stream.stop_stream() + self.stream.close() + except Exception: + pass + self.stream = None + + if self.pyaudio_instance: + try: + self.pyaudio_instance.terminate() + except Exception: + pass + self.pyaudio_instance = None + + self.is_playing = False + + def play_audio_chunk(self, audio_chunk): + if self.stream and self.is_playing: + audio_np = audio_chunk.cpu().numpy() + + if len(audio_np) > 1: + if len(audio_np) == 2: + smoothed = np.array([audio_np[0], audio_np[1]]) + elif len(audio_np) == 3: + smoothed = np.array([ + audio_np[0], + (audio_np[0] + audio_np[1] + audio_np[2]) / 3, + audio_np[2] + ]) + else: + smoothed = np.zeros_like(audio_np) + smoothed[0] = audio_np[0] + smoothed[-1] = audio_np[-1] + if len(audio_np) > 2: + smoothed[1:-1] = (audio_np[:-2] + audio_np[1:-1] + audio_np[2:]) / 3 + audio_np = smoothed + + audio_np = np.clip(audio_np, -1.0, 1.0) + + audio_int16 = (np.tanh(audio_np) * 32767).astype(np.int16) + + self.stream.write(audio_int16.tobytes()) + return len(audio_int16) + + return 0 + + async def speak_realtime( text: str, temperature: float, top_p: float, repetition_penalty: float, ) -> str: + global current_stream, current_pyaudio_instance + if not text.strip(): return "Please enter some text to speak." if not PYAUDIO_AVAILABLE: return "PyAudio is not available. Install it with 'pip install pyaudio' for real-time audio streaming." 
- # Use the lock to prevent concurrent access to the audio stream with stream_lock: - global current_stream, current_pyaudio_instance - - # Check if there's already an active stream - if current_stream is not None: - try: - current_stream.stop_stream() - current_stream.close() - except: - pass # Stream might already be closed + audio_streamer = AudioStreamer() - if current_pyaudio_instance is not None: - try: - current_pyaudio_instance.terminate() - except: - pass # Instance might already be terminated + current_stream = None + current_pyaudio_instance = None try: - # Initialize PyAudio - p = pyaudio.PyAudio() - current_pyaudio_instance = p + audio_streamer.pyaudio_instance = pyaudio.PyAudio() - # Open stream - stream = p.open( + audio_streamer.stream = audio_streamer.pyaudio_instance.open( format=pyaudio.paInt16, channels=1, rate=SAMPLE_RATE, - output=True + output=True, + frames_per_buffer=2048 # Smaller buffer for more responsive streaming ) - current_stream = stream + audio_streamer.is_playing = True + + current_stream = audio_streamer.stream + current_pyaudio_instance = audio_streamer.pyaudio_instance - # Start streaming inference start_time = time.perf_counter() - # Use the streaming inference method from the model + # Real-time streaming: immediately feed text into synthesis engine + # Begin generating audio in small chunks without waiting for full text processing stream_gen = model.infer_stream( text, chunk_size=1, @@ -141,29 +223,20 @@ async def speak_realtime( total_samples = 0 - # Process audio chunks in real-time + # As each chunk of audio is produced, stream and play it back immediately + # while the rest of the text is still being converted for audio_chunk in stream_gen: - # Check if stream is still active - if current_stream is None or not current_stream.is_active(): + # Check if stream is still active to maintain continuous, live speech output + if (not audio_streamer.is_playing or + not (hasattr(audio_streamer.stream, 'is_active') and 
audio_streamer.stream.is_active())): break - # Convert tensor to numpy array - audio_np = audio_chunk.cpu().numpy() - - # Ensure values are in the range [-1, 1] and convert to int16 - audio_np = np.clip(audio_np, -1.0, 1.0) - audio_int16 = (audio_np * 32767).astype(np.int16) + # Stream and play audio chunks as they become available + samples_written = audio_streamer.play_audio_chunk(audio_chunk) + total_samples += samples_written - # Play the audio chunk directly - stream.write(audio_int16.tobytes()) - total_samples += len(audio_int16) + audio_streamer.cleanup() - # Close stream and terminate PyAudio - stream.stop_stream() - stream.close() - p.terminate() - - # Reset globals after successful playback current_stream = None current_pyaudio_instance = None @@ -179,175 +252,184 @@ async def speak_realtime( return status + except ConnectionResetError: + audio_streamer.cleanup() + + current_stream = None + current_pyaudio_instance = None + + return "✗ Connection error during real-time playback. Please try again." 
except Exception as e: - # Ensure cleanup in case of error - if current_stream: - try: - current_stream.stop_stream() - current_stream.close() - except: - pass - current_stream = None + audio_streamer.cleanup() - if current_pyaudio_instance: - try: - current_pyaudio_instance.terminate() - except: - pass - current_pyaudio_instance = None + current_stream = None + current_pyaudio_instance = None return f"✗ Error during real-time playback: {str(e)}" -# Create Gradio interface -with gr.Blocks(title="Soprano TTS") as demo: +def create_gradio_interface(): + with gr.Blocks(title="Soprano TTS") as demo: + active_function = gr.State(value="ready") - # State variable to track which function is active - active_function = gr.State(value=None) # Can be "generate", "speak", or None + gr.Markdown( + f""" + # 🎵 Soprano TTS - gr.Markdown( - f""" -# 🎵 Soprano TTS + **Running on: {DEVICE.upper()}** -**Running on: {DEVICE.upper()}** + Soprano is an ultra-lightweight, open-source text-to-speech (TTS) model designed for real-time, + high-fidelity speech synthesis at unprecedented speed. Soprano can achieve **<15 ms streaming latency** + and up to **2000x real-time generation**, all while being easy to deploy at **<1 GB VRAM usage**. -Soprano is an ultra-lightweight, open-source text-to-speech (TTS) model designed for real-time, -high-fidelity speech synthesis at unprecedented speed. Soprano can achieve **<15 ms streaming latency** -and up to **2000x real-time generation**, all while being easy to deploy at **<1 GB VRAM usage**. +
-
- - -""" - ) - - with gr.Row(): - with gr.Column(scale=2): - text_input = gr.Textbox( - label="Text to Synthesize", - placeholder="Enter text here...", - value="Soprano is an extremely lightweight text to speech model designed to produce highly realistic speech at unprecedented speed.", - lines=5, - max_lines=10, - ) + + """ + ) - with gr.Accordion("Advanced Settings", open=False): - temperature = gr.Slider( - minimum=0.1, - maximum=1.5, - value=0.3, - step=0.05, - label="Temperature", + with gr.Row(): + with gr.Column(scale=2): + text_input = gr.Textbox( + label="Text to Synthesize", + placeholder="Enter text here...", + value="Soprano is an extremely lightweight text to speech model designed to produce highly realistic speech at unprecedented speed.", + lines=5, + max_lines=10, ) - top_p = gr.Slider( - minimum=0.5, - maximum=1.0, - value=0.95, - step=0.05, - label="Top P", + with gr.Accordion("Advanced Settings", open=False): + temperature = gr.Slider( + minimum=0.1, + maximum=1.5, + value=0.3, + step=0.05, + label="Temperature", + ) + + top_p = gr.Slider( + minimum=0.5, + maximum=1.0, + value=0.95, + step=0.05, + label="Top P", + ) + + repetition_penalty = gr.Slider( + minimum=1.0, + maximum=2.0, + value=1.2, + step=0.1, + label="Repetition Penalty", + ) + + with gr.Row(): + generate_btn = gr.Button("Generate Speech", variant="primary", size="lg") + speak_btn = gr.Button("Speak", variant="primary", size="lg") + clear_btn = gr.Button("Clear", variant="secondary", size="lg") + + with gr.Column(scale=1): + audio_output = gr.Audio( + label="Generated Speech", + type="numpy", + autoplay=True, + streaming=True ) - repetition_penalty = gr.Slider( - minimum=1.0, - maximum=2.0, - value=1.2, - step=0.1, - label="Repetition Penalty", + status_output = gr.Textbox( + label="Status", + interactive=False, + lines=3, + max_lines=10 ) - with gr.Row(): - generate_btn = gr.Button("Generate Speech", variant="primary", size="lg") - speak_btn = gr.Button("Speak", variant="primary", 
size="lg") - clear_btn = gr.Button("Clear", variant="secondary", size="lg") - - with gr.Column(scale=1): - audio_output = gr.Audio( - label="Generated Speech", - type="numpy", - autoplay=True, - streaming=True - ) + gr.Examples( + examples=[ + ["Soprano is an extremely lightweight text to speech model designed to produce highly realistic speech at unprecedented speed."], + ["Hello! Welcome to Soprano text to speech. This is a short example."], + ["The quick brown fox jumps over the lazy dog. This sentence contains all letters of the alphabet."], + ["Artificial intelligence is transforming the world in ways we never imagined. It's revolutionizing industries and changing how we interact with technology."], + ["In a distant future, humanity has colonized the stars. Advanced AI systems govern interstellar travel, ensuring safety and efficiency across vast cosmic distances. Explorers venture into uncharted territories, seeking new worlds and civilizations."], + ["To be, or not to be, that is the question: Whether 'tis nobler in the mind to suffer The slings and arrows of outrageous fortune, Or to take arms against a sea of troubles And by opposing end them. To die—to sleep, no more; and by a sleep to say we end The heart-ache and the thousand natural shocks That flesh is heir to: 'tis a consummation Devoutly to be wish'd. To die, to sleep; To sleep, perchance to dream—ay, there's the rub: For in that sleep of death what dreams may come, When we have shuffled off this mortal coil, Must give us pause—there's the respect That makes calamity of so long life."], + ], + inputs=[text_input], + label="Examples", + ) - status_output = gr.Textbox( - label="Status", - interactive=False, - lines=3, - max_lines=10 - ) + async def check_and_set_active_generate(active_func, *args): + if active_func is not None and active_func != "ready": + return None, f"Error: Please press Clear first. 
Current operation: {active_func}", active_func + result = await generate_speech(args[0], args[1], args[2], args[3]) + return result[0], result[1], "generate" + + async def check_and_set_active_speak(active_func, *args): + if active_func is not None and active_func != "ready": + return f"Error: Please press Clear first. Current operation: {active_func}", active_func + result = await speak_realtime(args[0], args[1], args[2], args[3]) + return result, "speak" + + def clear_active_state(): + return "ready" + + def clear_inputs(active_func): + with stream_lock: + global current_stream, current_pyaudio_instance + if current_stream: + try: + if hasattr(current_stream, 'is_active') and current_stream.is_active(): + current_stream.stop_stream() + current_stream.close() + except Exception: + pass + current_stream = None + if current_pyaudio_instance: + try: + current_pyaudio_instance.terminate() + except Exception: + pass + current_pyaudio_instance = None + return "", None, "Ready for input...", "ready" + + generate_btn.click( + fn=check_and_set_active_generate, + inputs=[active_function, text_input, temperature, top_p, repetition_penalty], + outputs=[audio_output, status_output, active_function] + ) - gr.Examples( - examples=[ - ["Soprano is an extremely lightweight text to speech model designed to produce highly realistic speech at unprecedented speed."], - ["Hello! Welcome to Soprano text to speech. This is a short example."], - ["The quick brown fox jumps over the lazy dog. This sentence contains all letters of the alphabet."], - ["Artificial intelligence is transforming the world in ways we never imagined. It's revolutionizing industries and changing how we interact with technology."], - ["In a distant future, humanity has colonized the stars. Advanced AI systems govern interstellar travel, ensuring safety and efficiency across vast cosmic distances. 
Explorers venture into uncharted territories, seeking new worlds and civilizations."], - ["To be, or not to be, that is the question: Whether 'tis nobler in the mind to suffer The slings and arrows of outrageous fortune, Or to take arms against a sea of troubles And by opposing end them. To die—to sleep, no more; and by a sleep to say we end The heart-ache and the thousand natural shocks That flesh is heir to: 'tis a consummation Devoutly to be wish'd. To die, to sleep; To sleep, perchance to dream—ay, there's the rub: For in that sleep of death what dreams may come, When we have shuffled off this mortal coil, Must give us pause—there's the respect That makes calamity of so long life."], - ], - inputs=[text_input], - label="Examples", - ) - - async def check_and_set_active_generate(active_func, *args): - if active_func is not None: - return None, f"Error: Please press Clear first. Current operation: {active_func}", active_func - # Call the actual generate function - result = await generate_speech(args[0], args[1], args[2], args[3]) - return result[0], result[1], "generate" - - async def check_and_set_active_speak(active_func, *args): - if active_func is not None: - return f"Error: Please press Clear first. 
Current operation: {active_func}", active_func - # Call the actual speak function - result = await speak_realtime(args[0], args[1], args[2], args[3]) - return result, "speak" - - def clear_active_state(): - return None - - generate_btn.click( - fn=check_and_set_active_generate, - inputs=[active_function, text_input, temperature, top_p, repetition_penalty], - outputs=[audio_output, status_output, active_function] - ) - - speak_btn.click( - fn=check_and_set_active_speak, - inputs=[active_function, text_input, temperature, top_p, repetition_penalty], - outputs=[status_output, active_function] - ) - - def clear_inputs(active_func): - # Reset the active function state - return "", None, "Ready for input...", None - - clear_btn.click( - fn=clear_inputs, - inputs=[active_function], - outputs=[text_input, audio_output, status_output, active_function] - ) - - gr.Markdown( - """ - -
- -### Usage tips: - -- Soprano works best when each sentence is between 2 and 15 seconds long. -- Although Soprano recognizes numbers and some special characters, it occasionally mispronounces them. - Best results can be achieved by converting these into their phonetic form. - (1+1 -> one plus one, etc) -- If Soprano produces unsatisfactory results, you can easily regenerate it for a new, potentially better generation. - You may also change the sampling settings for more varied results. -- Avoid improper grammar such as not using contractions, multiple spaces, etc. -""" - ) + speak_btn.click( + fn=check_and_set_active_speak, + inputs=[active_function, text_input, temperature, top_p, repetition_penalty], + outputs=[status_output, active_function] + ) + + clear_btn.click( + fn=clear_inputs, + inputs=[active_function], + outputs=[text_input, audio_output, status_output, active_function] + ) + + gr.Markdown( + """ + +
+ + ### Usage tips: + + - Soprano works best when each sentence is between 2 and 15 seconds long. + - Although Soprano recognizes numbers and some special characters, it occasionally mispronounces them. + Best results can be achieved by converting these into their phonetic form. + (1+1 -> one plus one, etc) + - If Soprano produces unsatisfactory results, you can easily regenerate it for a new, potentially better generation. + You may also change the sampling settings for more varied results. + - Avoid improper grammar such as not using contractions, multiple spaces, etc. + """ + ) + + return demo def find_free_port(start_port=7860, max_tries=100): @@ -360,23 +442,78 @@ def find_free_port(start_port=7860, max_tries=100): continue raise OSError("Could not find a free port") + def main(): + global current_stream, current_pyaudio_instance + port = find_free_port(7860) print(f"Starting Gradio interface on port {port}") - demo.queue(max_size=20).launch( - server_name="127.0.0.1", - server_port=port, - share=False, - theme=gr.themes.Soft(primary_hue="green"), - css=""" + + if sys.platform.startswith("win"): + try: + asyncio.set_event_loop_policy(asyncio.WindowsProactorEventLoopPolicy()) + print("Set Windows Proactor event loop policy") + except Exception as e: + print(f"Could not set Windows Proactor event loop policy: {e}") + + demo = create_gradio_interface() + + try: + demo.queue(max_size=20).launch( + server_name="127.0.0.1", + server_port=port, + share=False, + theme=gr.themes.Soft(primary_hue="green"), + prevent_thread_lock=False, + show_error=True, + quiet=False, + favicon_path=None, + ssl_verify=False, + max_threads=40, + css=""" a { color: var(--primary-600); } a:hover { color: var(--primary-700); } -""" - ) +""", + root_path="" + ) + print("Gradio interface launched successfully") + except KeyboardInterrupt: + print("\nShutting down gracefully...") + with stream_lock: + if current_stream: + try: + if hasattr(current_stream, 'is_active') and 
current_stream.is_active(): + current_stream.stop_stream() + current_stream.close() + except Exception: + pass + if current_pyaudio_instance: + try: + current_pyaudio_instance.terminate() + except Exception: + pass + sys.exit(0) + except Exception as e: + print(f"Error starting Gradio interface: {e}") + traceback.print_exc() + with stream_lock: + if current_stream: + try: + if hasattr(current_stream, 'is_active') and current_stream.is_active(): + current_stream.stop_stream() + current_stream.close() + except Exception: + pass + if current_pyaudio_instance: + try: + current_pyaudio_instance.terminate() + except Exception: + pass + sys.exit(1) if __name__ == "__main__": - main() \ No newline at end of file + main() From 7d780e789348688f0b775e0401d765c054b93629 Mon Sep 17 00:00:00 2001 From: Biswas Poudel <142727455+biswas445@users.noreply.github.com> Date: Sun, 11 Jan 2026 19:14:36 +0545 Subject: [PATCH 11/27] Revise README for clarity and updated instructions Updated README.md to remove emojis, add new installation/startup instructions, and reorganize menu options for clarity. --- README.md | 221 ++++++++++++++++-------------------------------------- 1 file changed, 65 insertions(+), 156 deletions(-) diff --git a/README.md b/README.md index 986b682..399f7dc 100644 --- a/README.md +++ b/README.md @@ -1,191 +1,100 @@ -# Soprano TTS - -Soprano is an ultra-realistic Text-to-Speech system that provides both REST API and WebSocket streaming capabilities. - -> **Note**: Soprano uses **LMDeploy** to accelerate inference by default. If LMDeploy cannot be installed in your environment, Soprano can fall back to the HuggingFace **transformers** backend (with slower performance). To enable this, pass `backend='transformers'` when creating the TTS model. 
- ---- - -## Features - -- **High Quality Audio**: Generates ultra-realistic speech using advanced TTS models -- **Multiple Interfaces**: REST API and WebSocket streaming options -- **OpenAI Compatible**: Follows OpenAI's speech endpoint format -- **Real-time Streaming**: WebSocket support for real-time audio streaming -- **Configurable Parameters**: Supports temperature, top_p, repetition_penalty, and min_text_length controls - -## Components - -- **API Server**: RESTful API with OpenAI-compatible endpoints -- **WebSocket Server**: Real-time audio streaming via WebSocket -- **CLI Interface**: Interactive command-line interface -- **Test Clients**: Dedicated test clients for both API and WebSocket - -## Usage - -```python -from soprano import SopranoTTS - -model = SopranoTTS(backend='auto', device='cuda', cache_size_mb=100, decoder_batch_size=1) -``` - -> **Tip**: You can increase cache_size_mb and decoder_batch_size to increase inference speed at the cost of higher memory usage. +Here is the updated, professional `README.md` file. I have removed all emojis, incorporated the new installation/startup instructions, and organized the menu options as requested. -### Basic inference +*** -```python -out = model.infer("Soprano is an extremely lightweight text to speech model.") # can achieve 2000x real-time with sufficiently long input! -``` - -### Save output to a file - -```python -out = model.infer("Soprano is an extremely lightweight text to speech model.", "out.wav") -``` - -### Custom sampling parameters - -```python -out = model.infer( - "Soprano is an extremely lightweight text to speech model.", - temperature=0.3, - top_p=0.95, - repetition_penalty=1.2, -) -``` - -### Batched inference - -```python -out = model.infer_batch(["Soprano is an extremely lightweight text to speech model."] * 10) # can achieve 2000x real-time with sufficiently large input size! 
-``` - -#### Save batch outputs to a directory - -```python -out = model.infer_batch(["Soprano is an extremely lightweight text to speech model."] * 10, "/dir") -``` - -### Streaming inference - -```python -import torch +# Soprano TTS -stream = model.infer_stream("Soprano is an extremely lightweight text to speech model.", chunk_size=1) +Soprano is an ultra-realistic Text-to-Speech system that provides REST API, WebSocket streaming capabilities, and a user-friendly Web UI. It is designed to be lightweight yet high-fidelity, offering OpenAI-compatible endpoints for seamless integration into existing workflows. -# Audio chunks can be accessed via an iterator -chunks = [] -for chunk in stream: - chunks.append(chunk) # first chunk arrives in <15 ms! +> **Note:** Soprano uses **LMDeploy** to accelerate inference by default. If LMDeploy cannot be installed in your environment, Soprano can fall back to the HuggingFace **transformers** backend (with slower performance). To enable this, pass `backend='transformers'` when creating the TTS model. -out = torch.cat(chunks) -``` +## Features -### Serve endpoint +- **High Quality Audio:** Generates ultra-realistic speech at 32 kHz using advanced TTS models. +- **Multiple Interfaces:** Includes REST API, WebSocket streaming, Web UI, and CLI. +- **OpenAI Compatible:** Follows OpenAI's speech endpoint format for drop-in replacement. +- **Real-time Streaming:** WebSocket support for real-time audio streaming with <15 ms latency. +- **Configurable Parameters:** Supports temperature, top_p, repetition_penalty, and min_text_length controls. +- **Interactive Launcher:** Easy-to-use batch script for managing services. + +## Installation and Setup + +### Prerequisites +Ensure you have Git and Python installed on your system. + +### Steps +1. Clone the repository: + ```bash + git clone https://github.com/biswas445/soprano.git + ``` +2. Navigate to the project directory: + ```bash + cd soprano + ``` +3. 
Run the setup script and follow the prompts: + ```bat + setup.bat + ``` -``` -uvicorn soprano.server:app --host 0.0.0.0 --port 8000 -``` +## Quick Start -Compatible with OpenAI speech API. Use the endpoint like this: +To start the application, run the `start.bat` file located in the root directory: -```bash -curl http://localhost:8000/v1/audio/speech \ - -H "Content-Type: application/json" \ - -d '{ - "input": "The quick brown fox jumped over the lazy dog." - }' \ - --output speech.wav +```bat +start.bat ``` -## Usage tips: - -* Soprano works best when each sentence is between 2 and 15 seconds long. -* Although Soprano recognizes numbers and some special characters, it occasionally mispronounces them. Best results can be achieved by converting these into their phonetic form. (1+1 -> one plus one, etc) -* If Soprano produces unsatisfactory results, you can easily regenerate it for a new, potentially better generation. You may also change the sampling settings for more varied results. -* Avoid improper grammar such as not using contractions, multiple spaces, etc. - ---- +This will launch the interactive menu where you can choose the desired component: -## Key Features +1. **API Server:** Starts the RESTful API server. +2. **Test API:** Launches the API server and automatically runs the API test client to verify functionality. +3. **Real-time Assistant:** Launches a voice-to-voice AI assistant demo featuring real-time audio streaming. +4. **WebSocket Test:** Launches the WebSocket server and the corresponding test client. +5. *(Reserved)* +6. **Web UI:** Starts the browser-based interface for standard users. +7. **CLI:** Starts the interactive Command Line Interface for testing purposes. -### 1. High‑fidelity 32 kHz audio -Soprano synthesizes speech at **32 kHz**, delivering quality that is perceptually indistinguishable from 44.1/48 kHz audio and significantly sharper and clearer than the 24 kHz output used by many existing TTS models. 
+## Technical Architecture -### 2. Vocoder‑based neural decoder +### 1. High-fidelity 32 kHz Audio +Soprano synthesizes speech at **32 kHz**, delivering quality that is perceptually indistinguishable from 44.1/48 kHz audio and significantly sharper than the 24 kHz output used by many existing TTS models. -Instead of slow diffusion decoders, Soprano uses a **vocoder‑based decoder** with a Vocos architecture, enabling **orders‑of‑magnitude faster** waveform generation while maintaining comparable perceptual quality. +### 2. Vocoder-based Neural Decoder +Instead of slow diffusion decoders, Soprano uses a **vocoder-based decoder** with a Vocos architecture. This enables **orders-of-magnitude faster** waveform generation while maintaining comparable perceptual quality. ### 3. Seamless Streaming +Soprano leverages the decoder's finite receptive field to losslessly stream audio with ultra-low latency. The streamed output is acoustically identical to offline synthesis, and streaming can begin after generating just 5 audio tokens, enabling **<15 ms latency**. -Soprano leverages the decoder’s finite receptive field to losslessly stream audio with ultra‑low latency. The streamed output is acoustically identical to offline synthesis, and streaming can begin after generating just 5 audio tokens, enabling **<15 ms latency**. - -### 4. State‑of‑the‑art neural audio codec - +### 4. State-of-the-art Neural Audio Codec Speech is represented using a **neural codec** that compresses audio to **~15 tokens/sec** at just **0.2 kbps**, allowing extremely fast generation and efficient memory usage without sacrificing quality. -### 5. Sentence‑level streaming for infinite context - -Each sentence is generated independently, enabling **effectively infinite generation length** while maintaining stability and real‑time performance for long‑form generation. +### 5. 
Sentence-level Streaming +Each sentence is generated independently, enabling **effectively infinite generation length** while maintaining stability and real-time performance for long-form generation. ---- +## Project Status -## Quick Start - -### Using the Launcher -Run `Soprano.bat` to access the main menu with options to launch any component. - -### API Server -Start the API server and send requests to `http://localhost:8000/v1/audio/speech` - -### WebSocket Server -Start the WebSocket server and connect to `ws://localhost:8001/ws/tts` - -## Endpoints - -### API -- `POST /v1/audio/speech` - Generate speech from text -- `GET /health` - Health check endpoint -- `GET /` - Root endpoint with API information - -### WebSocket -- `ws://localhost:8001/ws/tts` - Real-time TTS streaming +The core infrastructure, including the OpenAI-compatible API and various interfaces, is complete. -## Integration - -The API is designed for easy integration with workflow automation platforms like n8n, Zapier, and other systems that can make HTTP requests. +**Current Focus Areas:** +1. **Backend Strengthening:** Improving the robustness of the inference engine. +2. **Text Normalization:** Enhancing the handling of numbers, abbreviations, and special characters to improve pronunciation accuracy. ## Limitations -I’m a second-year undergrad who’s just started working on TTS models, so I wanted to start small. Soprano was only pretrained on 1000 hours of audio (~100x less than other TTS models), so its stability and quality will improve tremendously as I train it on more data. Also, I optimized Soprano purely for speed, which is why it lacks bells and whistles like voice cloning, style control, and multilingual support. Now that I have experience creating TTS models, I have a lot of ideas for how to make Soprano even better in the future, so stay tuned for those! 
- ---- - -## Roadmap - -* [x] Add model and inference code -* [x] Seamless streaming -* [x] Batched inference -* [x] Command-line interface (CLI) -* [x] CPU support -* [x] Server / API inference -* [ ] Additional LLM backends -* [ ] Voice cloning -* [ ] Multilingual support - ---- +Soprano was optimized purely for speed and was pretrained on approximately 1000 hours of audio. Consequently: +* Numbers and special characters may occasionally be mispronounced (phonetic conversion is recommended). +* Voice cloning and style controls are currently not implemented. +* Stability and quality are expected to improve with future training on larger datasets. ## Acknowledgements Soprano uses and/or is inspired by the following projects: -* [Vocos](https://github.com/gemelo-ai/vocos) -* [XTTS](https://github.com/coqui-ai/TTS) -* [LMDeploy](https://github.com/InternLM/lmdeploy) - ---- +* [Vocos](https://github.com/gemelo-ai/vocos) +* [XTTS](https://github.com/coqui-ai/TTS) +* [LMDeploy](https://github.com/InternLM/lmdeploy) ## License -Licensed under the Apache License 2.0 - see the [LICENSE](LICENSE) file for details. \ No newline at end of file +Licensed under the Apache License 2.0 - see the [LICENSE](LICENSE) file for details. From c0b4e99f523a75ed55397dc8379581857d7ac3f2 Mon Sep 17 00:00:00 2001 From: Biswas Poudel <142727455+biswas445@users.noreply.github.com> Date: Mon, 12 Jan 2026 13:57:55 +0545 Subject: [PATCH 12/27] Revise README.md for professionalism and clarity Updated README.md to remove emojis and include new installation/startup instructions. --- README.md | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/README.md b/README.md index 399f7dc..706db2e 100644 --- a/README.md +++ b/README.md @@ -1,4 +1,4 @@ -Here is the updated, professional `README.md` file. I have removed all emojis, incorporated the new installation/startup instructions, and organized the menu options as requested. +Here is the updated, professional `README.md` file. 
I have removed all emojis, incorporated the new installation/startup instructions, and organized the menu options as requested. *** From 10a80d5025c8a9c1edda3e1f2081a0034172afdd Mon Sep 17 00:00:00 2001 From: Biswas Poudel <142727455+biswas445@users.noreply.github.com> Date: Mon, 12 Jan 2026 14:13:04 +0545 Subject: [PATCH 13/27] Fix formatting of note in README.md --- README.md | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/README.md b/README.md index 706db2e..7e4803f 100644 --- a/README.md +++ b/README.md @@ -6,7 +6,7 @@ Here is the updated, professional `README.md` file. I have removed all emojis, i Soprano is an ultra-realistic Text-to-Speech system that provides REST API, WebSocket streaming capabilities, and a user-friendly Web UI. It is designed to be lightweight yet high-fidelity, offering OpenAI-compatible endpoints for seamless integration into existing workflows. -> **Note:** Soprano uses **LMDeploy** to accelerate inference by default. If LMDeploy cannot be installed in your environment, Soprano can fall back to the HuggingFace **transformers** backend (with slower performance). To enable this, pass `backend='transformers'` when creating the TTS model. +> **Note:** Soprano uses **LMDeploy** to accelerate inference by default. If LMDeploy cannot be installed in your environment, Soprano can fall back to the HuggingFace **transformers** backend (with slower performance). 
To enable this, pass `backend='transformers'` when creating the TTS model ## Features From 07094c13bb97d581f436128ea1d7b070ecf407e4 Mon Sep 17 00:00:00 2001 From: Biswas Poudel <142727455+biswas445@users.noreply.github.com> Date: Mon, 12 Jan 2026 15:03:31 +0545 Subject: [PATCH 14/27] Delete soprano/backends directory --- soprano/backends/base.py | 20 ------ soprano/backends/lmdeploy.py | 59 ---------------- soprano/backends/transformers.py | 113 ------------------------------- 3 files changed, 192 deletions(-) delete mode 100644 soprano/backends/base.py delete mode 100644 soprano/backends/lmdeploy.py delete mode 100644 soprano/backends/transformers.py diff --git a/soprano/backends/base.py b/soprano/backends/base.py deleted file mode 100644 index a58274d..0000000 --- a/soprano/backends/base.py +++ /dev/null @@ -1,20 +0,0 @@ -class BaseModel: - def infer(self, - prompts, - top_p=0.95, - temperature=0.3, - repetition_penalty=1.2): - ''' - Takes a list of prompts and returns the output hidden states - ''' - pass - - def stream_infer(self, - prompt, - top_p=0.95, - temperature=0.3, - repetition_penalty=1.2): - ''' - Takes a prompt and returns an iterator of the output hidden states - ''' - pass diff --git a/soprano/backends/lmdeploy.py b/soprano/backends/lmdeploy.py deleted file mode 100644 index 1d7f45c..0000000 --- a/soprano/backends/lmdeploy.py +++ /dev/null @@ -1,59 +0,0 @@ -import torch -from lmdeploy import pipeline, TurbomindEngineConfig, GenerationConfig -from .base import BaseModel - - -class LMDeployModel(BaseModel): - def __init__(self, - device='cuda', - cache_size_mb=100, - model_path=None, - **kwargs): - assert device == 'cuda', "lmdeploy only supports cuda devices, consider changing device or using a different backend instead." 
- cache_size_ratio = cache_size_mb * 1024**2 / torch.cuda.get_device_properties('cuda').total_memory - backend_config = TurbomindEngineConfig(cache_max_entry_count=cache_size_ratio) - - # Use local model if path provided, otherwise use HuggingFace - model_name_or_path = model_path if model_path else 'ekwek/Soprano-80M' - - self.pipeline = pipeline(model_name_or_path, - log_level='ERROR', - backend_config=backend_config) - - def infer(self, - prompts, - top_p=0.95, - temperature=0.3, - repetition_penalty=1.2): - gen_config=GenerationConfig(output_last_hidden_state='generation', - do_sample=True, - top_p=top_p, - temperature=temperature, - repetition_penalty=repetition_penalty, - max_new_tokens=512) - responses = self.pipeline(prompts, gen_config=gen_config) - res = [] - for response in responses: - res.append({ - 'finish_reason': response.finish_reason, - 'hidden_state': response.last_hidden_state - }) - return res - - def stream_infer(self, - prompt, - top_p=0.95, - temperature=0.3, - repetition_penalty=1.2): - gen_config=GenerationConfig(output_last_hidden_state='generation', - do_sample=True, - top_p=top_p, - temperature=temperature, - repetition_penalty=repetition_penalty, - max_new_tokens=512) - responses = self.pipeline.stream_infer([prompt], gen_config=gen_config) - for response in responses: - yield { - 'finish_reason': response.finish_reason, - 'hidden_state': response.last_hidden_state - } diff --git a/soprano/backends/transformers.py b/soprano/backends/transformers.py deleted file mode 100644 index ec42341..0000000 --- a/soprano/backends/transformers.py +++ /dev/null @@ -1,113 +0,0 @@ -import torch -from transformers import AutoModelForCausalLM, AutoTokenizer -from .base import BaseModel - - -class TransformersModel(BaseModel): - def __init__(self, - device='cuda', - model_path=None, - **kwargs): - self.device = device - - # Use local model if path provided, otherwise use HuggingFace - model_name_or_path = model_path if model_path else 'ekwek/Soprano-80M' 
- - self.model = AutoModelForCausalLM.from_pretrained( - model_name_or_path, - dtype=torch.bfloat16 if device == 'cuda' else torch.float32, - device_map=device - ) - self.tokenizer = AutoTokenizer.from_pretrained(model_name_or_path) - self.model.eval() - - def infer(self, - prompts, - top_p=0.95, - temperature=0.3, - repetition_penalty=1.2): - inputs = self.tokenizer( - prompts, - return_tensors='pt', - padding=True, - truncation=True, - max_length=512, - ).to(self.device) - - with torch.no_grad(): - outputs = self.model.generate( - input_ids=inputs['input_ids'], - attention_mask=inputs['attention_mask'], - max_new_tokens=512, - do_sample=True, - top_p=top_p, - temperature=temperature, - repetition_penalty=repetition_penalty, - pad_token_id=self.tokenizer.pad_token_id, - return_dict_in_generate=True, - output_hidden_states=True, - ) - res = [] - eos_token_id = self.model.config.eos_token_id - for i in range(len(prompts)): - seq = outputs.sequences[i] - hidden_states = [] - num_output_tokens = len(outputs.hidden_states) - for j in range(num_output_tokens): - token = seq[j + seq.size(0) - num_output_tokens] - if token != eos_token_id: hidden_states.append(outputs.hidden_states[j][-1][i, -1, :]) - last_hidden_state = torch.stack(hidden_states).squeeze() - finish_reason = 'stop' if seq[-1].item() == eos_token_id else 'length' - res.append({ - 'finish_reason': finish_reason, - 'hidden_state': last_hidden_state - }) - return res - - def stream_infer(self, - prompt, - top_p=0.95, - temperature=0.3, - repetition_penalty=1.2): - # For transformers backend, simulate streaming by returning all results at once - # This is a workaround to provide basic streaming functionality - inputs = self.tokenizer( - [prompt], - return_tensors='pt', - padding=True, - truncation=True, - max_length=512, - ).to(self.device) - - with torch.no_grad(): - outputs = self.model.generate( - input_ids=inputs['input_ids'], - attention_mask=inputs['attention_mask'], - max_new_tokens=512, - 
do_sample=True, - top_p=top_p, - temperature=temperature, - repetition_penalty=repetition_penalty, - pad_token_id=self.tokenizer.pad_token_id, - return_dict_in_generate=True, - output_hidden_states=True, - ) - - eos_token_id = self.model.config.eos_token_id - seq = outputs.sequences[0] - hidden_states = [] - num_output_tokens = len(outputs.hidden_states) - - for j in range(num_output_tokens): - token = seq[j + seq.size(0) - num_output_tokens] - if token != eos_token_id: - hidden_state = outputs.hidden_states[j][-1][0, -1, :] - hidden_states.append(hidden_state) - - # Yield all hidden states as a single "stream" - for i, hidden_state in enumerate(hidden_states): - finish_reason = 'stop' if i == len(hidden_states) - 1 and seq[-1].item() == eos_token_id else None - yield { - 'finish_reason': finish_reason, - 'hidden_state': hidden_state - } From dfa61ccb66073c39f8726ea6428cb0bebade6839 Mon Sep 17 00:00:00 2001 From: Biswas Poudel <142727455+biswas445@users.noreply.github.com> Date: Mon, 12 Jan 2026 15:03:39 +0545 Subject: [PATCH 15/27] Delete soprano/server directory --- soprano/server/api.py | 375 ----------------------- soprano/server/docs/api_readme.md | 70 ----- soprano/server/docs/test_readme.md | 42 --- soprano/server/docs/websocket_readme.md | 100 ------ soprano/server/test_api.py | 138 --------- soprano/server/test_websocket.py | 81 ----- soprano/server/websocket.py | 386 ------------------------ 7 files changed, 1192 deletions(-) delete mode 100644 soprano/server/api.py delete mode 100644 soprano/server/docs/api_readme.md delete mode 100644 soprano/server/docs/test_readme.md delete mode 100644 soprano/server/docs/websocket_readme.md delete mode 100644 soprano/server/test_api.py delete mode 100644 soprano/server/test_websocket.py delete mode 100644 soprano/server/websocket.py diff --git a/soprano/server/api.py b/soprano/server/api.py deleted file mode 100644 index c0c6df8..0000000 --- a/soprano/server/api.py +++ /dev/null @@ -1,375 +0,0 @@ -import asyncio 
-import io -import logging -import time -from typing import Optional, Dict, Any, AsyncGenerator -import numpy as np -from fastapi import FastAPI, HTTPException, Depends -from fastapi.responses import Response -from pydantic import BaseModel, Field -from scipy.io.wavfile import write -from torch import Tensor -import torch -from contextlib import asynccontextmanager - -# Handle import when running from within the server directory -try: - from soprano.tts import SopranoTTS -except ImportError: - import sys - import os - # Add the parent directory to the Python path to resolve import issues - sys.path.insert(0, os.path.dirname(os.path.dirname(os.path.abspath(__file__)))) - from soprano.tts import SopranoTTS - - -# Set up logging -logging.basicConfig(level=logging.INFO) -logger = logging.getLogger(__name__) - - -class CircuitBreaker: - """ - Circuit breaker implementation to handle external dependency failures. - """ - def __init__(self, failure_threshold=5, recovery_timeout=60): - self.failure_threshold = failure_threshold - self.recovery_timeout = recovery_timeout - self.failure_count = 0 - self.last_failure_time = None - self.state = "CLOSED" # CLOSED, OPEN, HALF_OPEN - - def call(self, func, *args, **kwargs): - if self.state == "OPEN": - if time.time() - self.last_failure_time > self.recovery_timeout: - self.state = "HALF_OPEN" - else: - raise Exception("Circuit breaker is in OPEN state and not accepting requests") - - if self.state == "HALF_OPEN": - try: - result = func(*args, **kwargs) - self._success() - return result - except Exception as e: - self._failure() - raise e - - try: - result = func(*args, **kwargs) - return result - except Exception as e: - self._failure() - raise e - - def _failure(self): - self.failure_count += 1 - self.last_failure_time = time.time() - if self.failure_count >= self.failure_threshold: - self.state = "OPEN" - - def _success(self): - self.failure_count = 0 - self.state = "CLOSED" - - -def retry(func, retries=3, delay=1, backoff=2): 
- """ - Retry decorator with exponential backoff for transient failures. - """ - def wrapper(*args, **kwargs): - current_delay = delay - for attempt in range(retries): - try: - return func(*args, **kwargs) - except Exception as e: - if attempt == retries - 1: # Last attempt - raise e - logger.warning(f"Attempt {attempt + 1} failed: {e}. Retrying in {current_delay} seconds...") - time.sleep(current_delay) - current_delay *= backoff - return None - return wrapper - - -class SpeechRequest(BaseModel): - """ - Request model for text-to-speech conversion following OpenAI API format. - """ - input: str = Field(..., min_length=1, max_length=1000, description="Text to synthesize") - model: Optional[str] = Field(None, description="Model to use (ignored, using default model)") - voice: Optional[str] = Field(None, description="Voice to use (ignored, using default voice)") - response_format: Optional[str] = Field("wav", description="Response format (only wav supported)") - speed: Optional[float] = Field(None, ge=0.1, le=2.0, description="Speech speed (not implemented yet)") - temperature: Optional[float] = Field(0.3, ge=0.0, le=1.0, description="Generation temperature") - top_p: Optional[float] = Field(1.0, ge=0.0, le=1.0, description="Top-p sampling parameter") - repetition_penalty: Optional[float] = Field(1.2, ge=0.1, le=2.0, description="Repetition penalty") - min_text_length: Optional[int] = Field(30, ge=1, le=1000, description="Minimum text length for processing (default 30)") - - -class TTSManager: - """ - Singleton manager for TTS model lifecycle and inference. 
- """ - _instance = None - _lock = asyncio.Lock() - - def __new__(cls): - if cls._instance is None: - cls._instance = super().__new__(cls) - return cls._instance - - def __init__(self): - if not hasattr(self, 'initialized'): - self.initialized = True - self.tts: Optional[SopranoTTS] = None - # Prioritize CUDA, fallback to CPU only if CUDA is not available - if torch.cuda.is_available(): - self.device = 'cuda' - logger.info("CUDA is available, using GPU for TTS processing") - else: - self.device = 'cpu' - logger.info("CUDA is not available, falling back to CPU for TTS processing") - self.circuit_breaker = CircuitBreaker(failure_threshold=3, recovery_timeout=30) - logger.info(f"Initializing TTS on device: {self.device}") - - async def initialize_model(self): - """ - Initialize the TTS model asynchronously to avoid blocking the event loop. - """ - async with self._lock: - if self.tts is None: - logger.info("Loading Soprano TTS model...") - try: - # Run model initialization in a thread pool to avoid blocking - loop = asyncio.get_running_loop() - - # Use retry mechanism for model loading - def load_model(): - # Import here in case it's needed in the executor - try: - from soprano.tts import SopranoTTS - except ImportError: - import sys - import os - # Add the parent directory to the Python path to resolve import issues - sys.path.insert(0, os.path.dirname(os.path.dirname(os.path.dirname(os.path.abspath(__file__))))) - from soprano.tts import SopranoTTS - return SopranoTTS( - cache_size_mb=100, - device=self.device - ) - - self.tts = await loop.run_in_executor( - None, - retry(load_model, retries=3, delay=2, backoff=2) - ) - logger.info("Soprano TTS model loaded successfully") - except Exception as e: - logger.error(f"Failed to load Soprano TTS model: {e}", exc_info=True) - raise RuntimeError(f"Failed to initialize TTS model: {str(e)}") from e - - def get_model(self) -> SopranoTTS: - """ - Get the initialized TTS model instance. 
- """ - if self.tts is None: - raise RuntimeError("TTS model not initialized. Call initialize_model() first.") - return self.tts - - def generate_audio(self, text: str, top_p: float, temperature: float, repetition_penalty: float, min_text_length: int = 30): - """ - Generate audio with circuit breaker protection and retry mechanism. - """ - def _generate(): - return self.tts.infer( - text=text, - top_p=top_p, - temperature=temperature, - repetition_penalty=repetition_penalty, - min_text_length=min_text_length - ) - - # Use circuit breaker to protect against repeated failures - return self.circuit_breaker.call( - retry(_generate, retries=2, delay=1, backoff=2) - ) - - -@asynccontextmanager -async def lifespan(app: FastAPI) -> AsyncGenerator[None, None]: - """ - Lifespan event handler for startup and shutdown events. - """ - logger.info("Starting up Soprano TTS API server...") - try: - tts_manager = TTSManager() - await tts_manager.initialize_model() - - logger.info("Soprano TTS API server started successfully") - yield - except Exception as e: - logger.error(f"Failed to start Soprano TTS API server: {e}", exc_info=True) - raise - finally: - logger.info("Shutting down Soprano TTS API server...") - - -# Create FastAPI app with performance optimizations -app = FastAPI( - title="Soprano TTS API", - description="Ultra-realistic Text-to-Speech API based on Soprano model", - version="1.0.0", - contact={ - "name": "Soprano TTS", - "url": "https://github.com/ekwek1/soprano", - }, - lifespan=lifespan, - # Performance optimizations - timeout=60, # Increase timeout for longer texts -) - - -def _tensor_to_wav_bytes(tensor: Tensor) -> bytes: - """ - Convert a 1D fp32 torch tensor to a WAV byte stream efficiently. 
- """ - # Convert to numpy array - audio_np = tensor.cpu().numpy() - - # Ensure values are in the range [-1, 1] and convert to int16 in one step - audio_np = np.clip(audio_np, -1.0, 1.0) - audio_np = (audio_np * 32767).astype(np.int16) - - # Create in-memory WAV file directly without intermediate buffer - wav_io = io.BytesIO() - write(wav_io, 32000, audio_np) # 32kHz sample rate - return wav_io.getvalue() # Use getvalue() instead of seek() + read() - - - - - - -@app.post("/v1/audio/speech", - response_class=Response, - summary="Generate speech from text", - description="Convert input text to audio using Soprano TTS model") -async def create_speech(request: SpeechRequest): - """ - Generate speech from input text following OpenAI's Speech endpoint format. - """ - try: - # Validate input text - if not request.input or not request.input.strip(): - raise HTTPException( - status_code=400, - detail="`input` field must be a non-empty string." - ) - - # Check text length - if len(request.input) > 1000: - raise HTTPException( - status_code=400, - detail="Input text exceeds maximum length of 1000 characters." - ) - - # Get TTS manager and generate audio using circuit breaker and retry - tts_manager = TTSManager() - - logger.info(f"Processing TTS request for text: '{request.input[:50]}{'...' 
if len(request.input) > 50 else ''}'") - - try: - # Generate audio with circuit breaker and retry mechanism - audio_tensor = tts_manager.generate_audio( - text=request.input, - top_p=request.top_p, - temperature=request.temperature, - repetition_penalty=request.repetition_penalty, - min_text_length=request.min_text_length - ) - except Exception as e: - logger.error(f"Circuit breaker or retry mechanism failed: {str(e)}", exc_info=True) - raise HTTPException( - status_code=503, - detail=f"Service temporarily unavailable due to TTS processing error: {str(e)}" - ) - - # Convert tensor to WAV bytes - wav_bytes = _tensor_to_wav_bytes(audio_tensor) - - logger.info(f"TTS generation completed successfully.") - - # Generate a generic filename for the response - filename = "speech_output.wav" - - # Return WAV response directly to client without saving on server - return Response( - content=wav_bytes, - media_type="audio/wav", - headers={ - "Content-Disposition": f'attachment; filename="{filename}"', - "Content-Length": str(len(wav_bytes)) - } - ) - - except HTTPException: - # Re-raise HTTP exceptions as-is - raise - except Exception as e: - logger.error(f"Error during TTS generation: {str(e)}", exc_info=True) - raise HTTPException( - status_code=500, - detail=f"Internal server error during TTS generation: {str(e)}" - ) - - -@app.get("/", - summary="Root endpoint", - description="Provides information about the Soprano TTS API") -async def root(): - """ - Root endpoint to provide API information. - """ - return { - "message": "Soprano TTS API", - "version": "1.0.0", - "description": "Ultra-realistic Text-to-Speech API based on Soprano model", - "endpoints": { - "tts": "/v1/audio/speech", - "health": "/health" - } - } - - -@app.get("/health", - summary="Health check endpoint", - description="Check if the server and TTS model are running properly") -async def health_check(): - """ - Health check endpoint to verify the server and model are operational. 
- """ - try: - tts_manager = TTSManager() - tts = tts_manager.get_model() - return {"status": "healthy", "device": tts.device} - except Exception as e: - logger.error(f"Health check failed: {str(e)}") - raise HTTPException(status_code=503, detail="Service unavailable") - - -if __name__ == "__main__": - import uvicorn - import torch - - print("Starting Soprano TTS API Server...") - print(f"Available device: {'CUDA (GPU)' if torch.cuda.is_available() else 'CPU'}") - - # Start the server - print("Server starting on http://localhost:8000") - uvicorn.run( - app, - host="localhost", - port=8000, - reload=False - ) diff --git a/soprano/server/docs/api_readme.md b/soprano/server/docs/api_readme.md deleted file mode 100644 index 4bbfa02..0000000 --- a/soprano/server/docs/api_readme.md +++ /dev/null @@ -1,70 +0,0 @@ -# Soprano TTS API - -The Soprano TTS API provides a high-quality, ultra-realistic text-to-speech service with OpenAI-compatible endpoints. This API allows you to convert text to natural-sounding speech using the Soprano model. 
- -## Features - -- **OpenAI Compatible**: Follows OpenAI's speech endpoint format for easy integration -- **High Quality Audio**: Generates ultra-realistic speech using advanced TTS models -- **Configurable Parameters**: Supports temperature, top_p, repetition_penalty, and min_text_length controls -- **Fast Processing**: Model loaded once at startup for optimal performance -- **Production Ready**: Includes health checks and error handling - -## Endpoints - -### Generate Speech -- **URL**: `POST /v1/audio/speech` -- **Description**: Convert text to speech -- **Request Body**: - ```json - { - "input": "Text to synthesize (required, 1-1000 chars)", - "model": "Model to use (optional, ignored)", - "voice": "Voice to use (optional, ignored)", - "response_format": "Response format (optional, default: 'wav')", - "speed": "Speech speed (optional, not implemented)", - "temperature": "Generation temperature (optional, default: 0.3, range: 0.0-1.0)", - "top_p": "Top-p sampling parameter (optional, default: 1.0, range: 0.0-1.0)", - "repetition_penalty": "Repetition penalty (optional, default: 1.2, range: 0.1-2.0)", - "min_text_length": "Minimum text length for processing (optional, default: 30, range: 1-1000)" - } - ``` -- **Response**: WAV audio file as binary data - -### Health Check -- **URL**: `GET /health` -- **Description**: Check if the server and TTS model are running properly -- **Response**: Status and device information - -### Root Endpoint -- **URL**: `GET /` -- **Description**: API information and available endpoints - -## Integration - -This API is designed for easy integration with various systems including: -- Automation platforms (like n8n) -- Web applications -- Mobile applications -- Voice assistants -- Any system that can make HTTP requests - -## Performance - -- Model is loaded once at startup for optimal performance -- Efficient audio processing with minimal overhead -- Designed for concurrent requests with proper error handling - -## Usage Example - 
-```bash -curl -X POST http://localhost:8000/v1/audio/speech \ - -H "Content-Type: application/json" \ - -d '{ - "input": "Hello, this is a test.", - "temperature": 0.3, - "top_p": 1.0, - "repetition_penalty": 1.2 - }' \ - --output output.wav -``` \ No newline at end of file diff --git a/soprano/server/docs/test_readme.md b/soprano/server/docs/test_readme.md deleted file mode 100644 index eb54fe5..0000000 --- a/soprano/server/docs/test_readme.md +++ /dev/null @@ -1,42 +0,0 @@ -# Soprano TTS Test Clients - -This directory contains test clients for both the API and WebSocket servers. - -## API Test Client - -The API test client allows you to test the REST API server functionality. - -### Usage -```bash -python -m soprano.server.test_api "Your text here" -``` - -### Features -- Tests the main TTS endpoint -- Includes health check functionality -- Saves received audio to audio_output directory -- Handles connection errors gracefully -- Uses aiohttp for async HTTP requests - -## WebSocket Test Client - -The WebSocket test client allows you to test the WebSocket streaming server functionality. - -### Usage -```bash -python -m soprano.server.test_websocket "Your text here" -``` - -### Features -- Tests WebSocket connection and streaming -- Real-time audio playback using PyAudio -- Connection testing with ping/pong -- Proper audio stream management -- Comprehensive error handling - -## Prerequisites - -- For API tests: `pip install aiohttp` -- For WebSocket tests: `pip install websockets pyaudio` -- Running API server on http://localhost:8000 -- Running WebSocket server on ws://localhost:8001/ws/tts \ No newline at end of file diff --git a/soprano/server/docs/websocket_readme.md b/soprano/server/docs/websocket_readme.md deleted file mode 100644 index 057631a..0000000 --- a/soprano/server/docs/websocket_readme.md +++ /dev/null @@ -1,100 +0,0 @@ -# Soprano TTS WebSocket - -The Soprano TTS WebSocket provides real-time streaming text-to-speech functionality. 
This WebSocket server allows you to generate audio in real-time and stream it to clients as it's produced. - -## Features - -- **Real-time Streaming**: Generate and stream audio in real-time -- **Raw PCM Frames**: Outputs raw PCM frames suitable for playback via PyAudio -- **Metadata Support**: Sends audio format metadata at the start -- **Small Chunks**: Streams audio in small chunks (~1024 samples) for low latency -- **End Signal**: Sends "end" signal when synthesis finishes - -## Connection - -- **Endpoint**: `ws://localhost:8001/ws/tts` -- **Protocol**: WebSocket with JSON control messages and binary audio frames - -## Message Format - -### Client to Server -```json -{ - "type": "synthesize", - "text": "Your text here", - "stream": true, - "min_text_length": 30 -} -``` - -### Server to Client -- **Metadata** (JSON): -```json -{ - "type": "metadata", - "sample_rate": 32000, - "channels": 1, - "format": "int16" -} -``` - -- **Audio Data** (Binary): Raw PCM audio frames -- **End Signal** (JSON): -```json -{ - "type": "end" -} -``` - -- **Error** (JSON): -```json -{ - "type": "error", - "message": "Error description" -} -``` - -## Integration - -The WebSocket server is ideal for: -- Real-time voice assistants -- Interactive applications -- Live broadcasting systems -- Gaming applications -- Any system requiring immediate audio feedback - -## Usage Example - -```javascript -const ws = new WebSocket('ws://localhost:8001/ws/tts'); - -ws.onopen = () => { - ws.send(JSON.stringify({ - type: "synthesize", - text: "Hello, this is a real-time audio stream", - stream: true, - min_text_length: 30 - })); -}; - -ws.onmessage = (event) => { - if (typeof event.data === 'string') { - const message = JSON.parse(event.data); - if (message.type === 'metadata') { - // Handle audio format info - } else if (message.type === 'end') { - // Streaming finished - } - } else { - // Binary audio data - play with audio API - playAudioChunk(event.data); - } -}; -``` - -## Performance - -- 
Implements backpressure handling to manage slow clients -- Uses asyncio queues to decouple audio generation from network transmission -- Supports graceful shutdown with proper task cancellation -- Optimized for real-time performance with minimal latency \ No newline at end of file diff --git a/soprano/server/test_api.py b/soprano/server/test_api.py deleted file mode 100644 index 886b91b..0000000 --- a/soprano/server/test_api.py +++ /dev/null @@ -1,138 +0,0 @@ -import asyncio -import aiohttp -import json -import sys -import os -from pathlib import Path - -async def test_api(text="Hello, this is a test of the Soprano TTS API."): - """ - Test the Soprano TTS API endpoint - """ - base_url = "http://localhost:8000" - endpoint = "/v1/audio/speech" - - payload = { - "input": text, - "temperature": 0.3, - "top_p": 1.0, - "repetition_penalty": 1.2, - "min_text_length": 30 - } - - print(f"Testing API at {base_url}{endpoint}") - print(f"Sending text: '{text[:50]}{'...' if len(text) > 50 else ''}'") - - try: - async with aiohttp.ClientSession() as session: - async with session.post(f"{base_url}{endpoint}", json=payload) as response: - status = response.status - print(f"Response status: {status}") - - if status == 200: - # Read the audio content - audio_content = await response.read() - print(f"Received audio data: {len(audio_content)} bytes") - - # Save the audio to a file - output_dir = "audio_output" - os.makedirs(output_dir, exist_ok=True) - - # Generate unique filename - file_counter = 1 - while True: - filename = f"api_test_output_{file_counter}.wav" - filepath = os.path.join(output_dir, filename) - if not os.path.exists(filepath): - break - file_counter += 1 - - with open(filepath, 'wb') as f: - f.write(audio_content) - - print(f"Audio saved to: {filepath}") - return True - else: - error_text = await response.text() - print(f"Request failed with status {status}") - print(f"Error: {error_text}") - return False - except aiohttp.ClientConnectorError: - print("Error: Could 
not connect to API server. Make sure it's running on http://localhost:8000") - return False - except Exception as e: - print(f"Request failed with error: {e}") - return False - - -async def test_health(): - """ - Test the health check endpoint - """ - base_url = "http://localhost:8000" - endpoint = "/health" - - print(f"Testing health endpoint at {base_url}{endpoint}") - - try: - async with aiohttp.ClientSession() as session: - async with session.get(f"{base_url}{endpoint}") as response: - status = response.status - print(f"Health check status: {status}") - - if status == 200: - health_data = await response.json() - print(f"Health check result: {health_data}") - return True - else: - error_text = await response.text() - print(f"Health check failed with status {status}") - print(f"Error: {error_text}") - return False - except Exception as e: - print(f"Health check failed with error: {e}") - return False - - -async def main(): - print("Soprano TTS API Test Client") - print("Make sure the API server is running on http://localhost:8000 before executing this.") - print() - - # Test health endpoint first - print("Testing health endpoint...") - health_ok = await test_health() - if not health_ok: - print("Health check failed. Exiting.") - return - - print() - - # Get text from command line arguments or use default - if len(sys.argv) > 1: - text = " ".join(sys.argv[1:]) - else: - text = "Hello, this is a test of the Soprano TTS API. The system is working properly." - - if not text.strip(): - print("Text cannot be empty. Please enter some text.") - return - - print("Testing TTS API...") - success = await test_api(text) - - if success: - print("API test completed successfully!") - else: - print("API test failed!") - - -if __name__ == "__main__": - # Check if required packages are available - try: - import aiohttp - except ImportError: - print("Error: aiohttp is not installed. 
Please install it with: pip install aiohttp") - exit(1) - - asyncio.run(main()) \ No newline at end of file diff --git a/soprano/server/test_websocket.py b/soprano/server/test_websocket.py deleted file mode 100644 index b8c304e..0000000 --- a/soprano/server/test_websocket.py +++ /dev/null @@ -1,81 +0,0 @@ -import asyncio -import websockets -import json -import pyaudio -import sys - -class SopranoWSClient: - def __init__(self): - self.p = pyaudio.PyAudio() - self.stream = None - # Default settings (will be updated by metadata from server) - self.rate = 32000 - self.channels = 1 - - def open_stream(self, rate, channels): - """Opens the audio device for live playback.""" - self.stream = self.p.open( - format=pyaudio.paInt16, - channels=channels, - rate=rate, - output=True - ) - - async def start_test(self, text): - uri = "ws://localhost:8001/ws/tts" - - try: - async with websockets.connect(uri) as ws: - # 1. Send the synthesis request - payload = { - "type": "synthesize", - "text": text, - "stream": True, - "min_text_length": 30 - } - await ws.send(json.dumps(payload)) - print(f">>> Sent text to server. Waiting for audio...") - - # 2. 
Listen for chunks - while True: - message = await ws.recv() - - # Handle Audio Bytes - if isinstance(message, bytes): - if self.stream: - self.stream.write(message) - print(".", end="", flush=True) - - # Handle JSON Messages - else: - data = json.loads(message) - if data["type"] == "metadata": - print(f"\n[Metadata] Rate: {data['sample_rate']}Hz") - self.open_stream(data['sample_rate'], data['channels']) - - elif data["type"] == "end": - print("\n[Finished] Server signaled end of stream.") - break - - elif data["type"] == "error": - print(f"\n[Error] {data['message']}") - break - - except Exception as e: - print(f"Connection Error: {e}") - finally: - self.cleanup() - - def cleanup(self): - if self.stream: - self.stream.stop_stream() - self.stream.close() - self.p.terminate() - -if __name__ == "__main__": - input_text = "Testing the live websocket stream. I should hear this almost immediately." - if len(sys.argv) > 1: - input_text = " ".join(sys.argv[1:]) - - client = SopranoWSClient() - asyncio.run(client.start_test(input_text)) \ No newline at end of file diff --git a/soprano/server/websocket.py b/soprano/server/websocket.py deleted file mode 100644 index 45883e1..0000000 --- a/soprano/server/websocket.py +++ /dev/null @@ -1,386 +0,0 @@ -import asyncio -import json -import logging -from typing import AsyncGenerator -import numpy as np -from fastapi import FastAPI, WebSocket, WebSocketDisconnect -import torch -from contextlib import asynccontextmanager -from websockets.exceptions import ConnectionClosedOK, ConnectionClosedError -from asyncio import Queue, QueueEmpty - -# Handle import when running from within the server directory -try: - from soprano.tts import SopranoTTS -except ImportError: - import sys - import os - # Add the parent directory to the Python path to resolve import issues - sys.path.insert(0, os.path.dirname(os.path.dirname(os.path.abspath(__file__)))) - from soprano.tts import SopranoTTS - - -# Set up logging 
-logging.basicConfig(level=logging.INFO) -logger = logging.getLogger(__name__) - - -class TTSWebSocketManager: - """ - Manager for WebSocket TTS streaming functionality. - """ - _instance = None - - def __new__(cls): - if cls._instance is None: - cls._instance = super().__new__(cls) - return cls._instance - - def __init__(self): - if not hasattr(self, 'initialized'): - self.initialized = True - self.tts: SopranoTTS = None - self._lock = asyncio.Lock() # Move lock to instance level - # Track active streaming tasks for graceful shutdown - self.active_tasks = set() - # Prioritize CUDA, fallback to CPU only if CUDA is not available - if torch.cuda.is_available(): - self.device = 'cuda' - logger.info("CUDA is available, using GPU for TTS processing") - else: - self.device = 'cpu' - logger.info("CUDA is not available, falling back to CPU for TTS processing") - logger.info(f"Initializing TTS on device: {self.device}") - - async def initialize_model(self): - """ - Initialize the TTS model asynchronously to avoid blocking the event loop. 
- """ - async with self._lock: - if self.tts is None: - logger.info("Loading Soprano TTS model for WebSocket streaming...") - try: - # Run model initialization in a thread pool to avoid blocking - loop = asyncio.get_running_loop() # Use get_running_loop instead of get_event_loop - - def load_model(): - # Import here in case it's needed in the executor - try: - from soprano.tts import SopranoTTS - except ImportError: - import sys - import os - # Add the parent directory to the Python path to resolve import issues - sys.path.insert(0, os.path.dirname(os.path.dirname(os.path.dirname(os.path.abspath(__file__))))) - from soprano.tts import SopranoTTS - return SopranoTTS( - cache_size_mb=100, - device=self.device - ) - - self.tts = await loop.run_in_executor(None, load_model) - logger.info("Soprano TTS model loaded successfully for WebSocket streaming") - except Exception as e: - logger.error(f"Failed to load Soprano TTS model: {e}", exc_info=True) - raise RuntimeError(f"Failed to initialize TTS model: {str(e)}") from e - - def get_model(self) -> SopranoTTS: - """ - Get the initialized TTS model instance. - """ - if self.tts is None: - raise RuntimeError("TTS model not initialized. Call initialize_model() first.") - return self.tts - - async def stream_audio_with_backpressure(self, websocket: WebSocket, text: str, min_text_length: int = 30): - """ - Stream audio in real-time from the TTS model with backpressure handling. - Uses a queue to decouple TTS generation from WebSocket sending. - """ - # Check if streaming is supported (only available on GPU with LMDeploy backend) - try: - tts = self.get_model() - # Check if we're using transformers backend which doesn't support streaming - from soprano.backends.transformers import TransformersModel - if isinstance(tts.pipeline, TransformersModel): - # Send error message to client - await websocket.send_text(json.dumps({ - "type": "error", - "message": "Real-time streaming is not supported on CPU. 
Only generate speech is supported for CPU." - })) - logger.warning("Streaming requested but not supported on CPU") - return - except Exception as e: - logger.warning(f"Could not determine backend type: {e}") - # Continue with original logic if there's an issue checking the backend - - # Create a queue to decouple generation from sending - audio_queue = Queue(maxsize=10) # Limit queue size to prevent memory buildup - - async def producer(): - """Generate audio chunks and put them in the queue.""" - try: - # Get the TTS model - tts = self.get_model() - - # Use the streaming inference method from the TTS model - logger.info(f"Starting streaming TTS for text: '{text[:50]}{'...' if len(text) > 50 else ''}'") - - # Use the infer_stream method which is designed for streaming - for audio_chunk in tts.infer_stream( - text=text, - chunk_size=1, - top_p=1.0, # Using the default value we set - temperature=0.3, # Using the default value we set - repetition_penalty=1.2, # Using the default value we set - min_text_length=min_text_length # Use the passed value instead of hardcoded 1000 - ): - # Convert tensor to numpy array - audio_np = audio_chunk.cpu().numpy() - - # Ensure values are in the range [-1, 1] and convert to int16 - audio_np = np.clip(audio_np, -1.0, 1.0) - audio_np = (audio_np * 32767).astype(np.int16) - - # Convert to bytes - audio_bytes = audio_np.tobytes() - - # Put audio chunk in queue, with timeout to handle slow consumers - try: - await asyncio.wait_for(audio_queue.put(audio_bytes), timeout=5.0) - except asyncio.TimeoutError: - logger.warning("Audio queue timeout - client may be slow") - break - - # Put None to signal end of stream - await audio_queue.put(None) - except NotImplementedError as e: - logger.error(f"Streaming not supported: {str(e)}") - # Send error message to client - try: - await websocket.send_text(json.dumps({ - "type": "error", - "message": "Real-time streaming is not supported on CPU. Only generate speech is supported for CPU." 
- })) - except: - pass - try: - await audio_queue.put(None) # Signal error to consumer - except: - pass - except Exception as e: - logger.error(f"Error in audio producer: {str(e)}", exc_info=True) - try: - await audio_queue.put(None) # Signal error to consumer - except: - pass - - async def consumer(): - """Take audio chunks from the queue and send them via WebSocket.""" - try: - while True: - # Get audio chunk from queue with timeout - try: - audio_bytes = await asyncio.wait_for(audio_queue.get(), timeout=10.0) - except asyncio.TimeoutError: - logger.warning("Timeout waiting for audio data") - break - - # If None, it means the producer is done - if audio_bytes is None: - break - - # Send the audio chunk as binary data - await websocket.send_bytes(audio_bytes) - except Exception as e: - logger.error(f"Error in audio consumer: {str(e)}", exc_info=True) - # Don't re-raise here as we want to ensure cleanup happens - - # Create tasks for producer and consumer - producer_task = asyncio.create_task(producer()) - consumer_task = asyncio.create_task(consumer()) - - # Add tasks to active tasks set for graceful shutdown - self.active_tasks.add(producer_task) - self.active_tasks.add(consumer_task) - - try: - # Wait for both tasks to complete - await asyncio.gather(producer_task, consumer_task, return_exceptions=True) - finally: - # Remove tasks from active tasks set - self.active_tasks.discard(producer_task) - self.active_tasks.discard(consumer_task) - - # Cancel tasks if they're still running - if not producer_task.done(): - producer_task.cancel() - if not consumer_task.done(): - consumer_task.cancel() - - async def stream_audio(self, websocket: WebSocket, text: str, min_text_length: int = 30): - """ - Stream audio in real-time from the TTS model. 
- """ - try: - # Send metadata at the start - metadata = { - "type": "metadata", - "sample_rate": 32000, - "channels": 1, - "format": "int16", - "model_info": "soprano" - } - await websocket.send_text(json.dumps(metadata)) - - # Use the backpressure-aware streaming method - # This will return early if streaming is not supported - await self.stream_audio_with_backpressure(websocket, text, min_text_length) - - # Only send end signal if streaming was not terminated early due to unsupported backend - # The function will return before reaching here if streaming is not supported - - except Exception as e: - logger.error(f"Error during streaming: {str(e)}", exc_info=True) - try: - await websocket.send_text(json.dumps({"type": "error", "message": str(e)})) - except: - pass # If we can't send the error, just continue - - -@asynccontextmanager -async def lifespan(app: FastAPI) -> AsyncGenerator[None, None]: - """ - Lifespan event handler for startup and shutdown events. - """ - logger.info("Starting up Soprano TTS WebSocket server...") - tts_manager = TTSWebSocketManager() - - try: - await tts_manager.initialize_model() - logger.info("Soprano TTS WebSocket server started successfully") - yield - except Exception as e: - logger.error(f"Failed to start Soprano TTS WebSocket server: {e}", exc_info=True) - raise - finally: - logger.info("Shutting down Soprano TTS WebSocket server...") - # Cancel any active streaming tasks for graceful shutdown - if hasattr(tts_manager, 'active_tasks'): - for task in tts_manager.active_tasks.copy(): # Use copy to avoid modification during iteration - if not task.done(): - logger.info("Cancelling active streaming task...") - task.cancel() - try: - await task - except asyncio.CancelledError: - pass # Expected when cancelling tasks - logger.info("Soprano TTS WebSocket server shut down completed") - - -# Create FastAPI app with WebSocket support -app = FastAPI( - title="Soprano TTS WebSocket API", - description="Real-time streaming Text-to-Speech via 
WebSocket", - version="1.0.0", - lifespan=lifespan -) - - -@app.websocket("/ws/tts") -async def websocket_endpoint(websocket: WebSocket): - """ - WebSocket endpoint for real-time TTS streaming. - Supports multiple synthesize requests over the same connection. - """ - await websocket.accept() - logger.info("WebSocket connection accepted") - - try: - tts_manager = TTSWebSocketManager() - - # Keep the connection open to handle multiple requests - while True: - # Wait for a message - data = await websocket.receive_text() - message = json.loads(data) - - if message.get("type") == "synthesize": - text = message.get("text", "") - stream = message.get("stream", True) - # Allow client to specify min_text_length, default to 30 - min_text_length = message.get("min_text_length", 30) - - if not text or not text.strip(): - await websocket.send_text(json.dumps({ - "type": "error", - "message": "Text cannot be empty" - })) - continue # Continue to listen for more messages - - if stream: - # Start streaming audio - await tts_manager.stream_audio(websocket, text, min_text_length) - else: - # For non-streaming, we could implement a regular synthesis - # but the requirement is for streaming, so we'll focus on that - await websocket.send_text(json.dumps({ - "type": "error", - "message": "Only streaming is supported in this endpoint" - })) - elif message.get("type") == "ping": - # Simple ping/pong for connection health - await websocket.send_text(json.dumps({"type": "pong"})) - else: - await websocket.send_text(json.dumps({ - "type": "error", - "message": "Invalid message type. Use 'synthesize' or 'ping'." 
- })) - - except WebSocketDisconnect: - logger.info("WebSocket disconnected") - except json.JSONDecodeError: - logger.error("Invalid JSON received") - try: - await websocket.send_text(json.dumps({ - "type": "error", - "message": "Invalid JSON format" - })) - except: - pass - except ConnectionClosedOK: - logger.info("WebSocket connection closed normally") - except ConnectionClosedError: - logger.info("WebSocket connection closed with error") - except Exception as e: - logger.error(f"Unexpected error in WebSocket: {str(e)}", exc_info=True) - try: - await websocket.send_text(json.dumps({ - "type": "error", - "message": f"Server error: {str(e)}" - })) - except: - pass - finally: - try: - if hasattr(websocket, 'client_state') and websocket.client_state.name != 'DISCONNECTED': - await websocket.close() - except: - pass - - -if __name__ == "__main__": - import uvicorn - import torch - - print("Starting Soprano TTS WebSocket Server...") - print(f"Available device: {'CUDA (GPU)' if torch.cuda.is_available() else 'CPU'}") - - # Start the server - print("WebSocket server starting on ws://localhost:8001/ws/tts") - uvicorn.run( - app, - host="localhost", - port=8001, # Using port 8001 to avoid conflict with the regular API - reload=False - ) \ No newline at end of file From 742899929b3a66bfa00f8d76b94152ce65e8727d Mon Sep 17 00:00:00 2001 From: Biswas Poudel <142727455+biswas445@users.noreply.github.com> Date: Mon, 12 Jan 2026 15:03:47 +0545 Subject: [PATCH 16/27] Delete soprano/utils directory --- soprano/utils/text.py | 401 ------------------------------------------ 1 file changed, 401 deletions(-) delete mode 100644 soprano/utils/text.py diff --git a/soprano/utils/text.py b/soprano/utils/text.py deleted file mode 100644 index 2295448..0000000 --- a/soprano/utils/text.py +++ /dev/null @@ -1,401 +0,0 @@ -""" -Normalize input text to a format that Soprano recognizes. 
-Adapted from https://github.com/neonbjb/tortoise-tts/blob/main/tortoise/utils/tokenizer.py -""" -import re - -import inflect -from unidecode import unidecode - - -_inflect = inflect.engine() - -#################################################################################################### -# Abbreviations - -_abbreviations = [(re.compile('\\b%s\\.' % x[0], re.IGNORECASE), x[1]) for x in [ - ('mrs', 'misess'), - ('ms', 'miss'), - ('mr', 'mister'), - ('dr', 'doctor'), - ('st', 'saint'), - ('co', 'company'), - ('jr', 'junior'), - ('maj', 'major'), - ('gen', 'general'), - ('drs', 'doctors'), - ('rev', 'reverend'), - ('lt', 'lieutenant'), - ('hon', 'honorable'), - ('sgt', 'sergeant'), - ('capt', 'captain'), - ('esq', 'esquire'), - ('ltd', 'limited'), - ('col', 'colonel'), - ('ft', 'fort'), -]] -_cased_abbreviations = [(re.compile('\\b%s\\b' % x[0]), x[1]) for x in [ - ('TTS', 'text to speech'), - ('Hz', 'hertz'), - ('kHz', 'kilohertz'), - ('KBs', 'kilobytes'), - ('KB', 'kilobyte'), - ('MBs', 'megabytes'), - ('MB', 'megabyte'), - ('GBs', 'gigabytes'), - ('GB', 'gigabyte'), - ('TBs', 'terabytes'), - ('TB', 'terabyte'), - ('APIs', 'a p i\'s'), - ('API', 'a p i'), - ('CLIs', 'c l i\'s'), - ('CLI', 'c l i'), - ('CPUs', 'c p u\'s'), - ('CPU', 'c p u'), - ('GPUs', 'g p u\'s'), - ('GPU', 'g p u'), - ('Ave', 'avenue'), - ('etc', 'et cetera'), - ('Mon', 'monday'), - ('Tues', 'tuesday'), - ('Wed', 'wednesday'), - ('Thurs', 'thursday'), - ('Fri', 'friday'), - ('Sat', 'saturday'), - ('Sun', 'sunday'), - ('and/or', 'and or'), -]] - -def expand_abbreviations(text): - for regex, replacement in _abbreviations + _cased_abbreviations: - text = re.sub(regex, replacement, text) - return text - -#################################################################################################### -# Numbers - -_num_prefix_re = re.compile(r'#\d') -_num_suffix_re = re.compile(r'\b\d+(K|M|B|T)\b', re.IGNORECASE) -_num_letter_split_re = re.compile(r'(\d[a-z]|[a-z]\d)', re.IGNORECASE) - 
-_comma_number_re = re.compile(r'(\d[\d\,]+\d)') -_date_re = re.compile(r'(^|[^/])(\d\d?[/-]\d\d?[/-]\d\d(?:\d\d)?)($|[^/])') -_phone_number_re = re.compile(r'(\(?\d{3}\)?[-.\s]\d{3}[-.\s]?\d{4})') -_time_re = re.compile(r'(\d\d?:\d\d(?::\d\d)?)') -_pounds_re = re.compile(r'£([\d\,]*\d+)') -_dollars_re = re.compile(r'\$([\d\.\,]*\d+)') -_decimal_number_re = re.compile(r'(\d+(?:\.\d+)+)') -_multiply_re = re.compile(r'(\d\s?\*\s?\d)') -_divide_re = re.compile(r'(\d\s?/\s?\d)') -_add_re = re.compile(r'(\d\s?\+\s?\d)') -_subtract_re = re.compile(r'(\d?\s?-\s?\d)') # also does negative numbers -_fraction_re = re.compile(r'(\d+(?:/\d+)+)') -_ordinal_re = re.compile(r'\d+(st|nd|rd|th)') -_number_re = re.compile(r'\d+') - -def _expand_num_prefix(m): - match = m.group(0) - return f"number {match[1]}" - -def _expand_num_suffix(m): - match = m.group(0) - if match[1].upper() == 'K': return f"{match[0]} thousand" - elif match[1].upper() == 'M': return f"{match[0]} million" - elif match[1].upper() == 'B': return f"{match[0]} billion" - elif match[1].upper() == 'T': return f"{match[0]} trillion" - return match # unexpected format - -def _split_alphanumeric(m): - match = m.group(1) - return f"{match[0]} {match[1]}" - -def _remove_commas(m): - return m.group(1).replace(',', '') - -def _expand_date(m): - match = m.group(2) - match = re.split('[./-]', match) - return m.group(1) + ' dash '.join(match) + m.group(3) - -def _expand_phone_number(m): - match = m.group(1) - match = re.sub(r'\D', '', match) - assert len(match) == 10 - match = f"{' '.join(list(match[:3]))}, {' '.join(list(match[3:6]))}, {' '.join(list(match[6:]))}" - return match - -def _expand_time(m): - match = m.group(1) - match = match.split(':') - if len(match) == 2: - hours, minutes = match - if minutes == '00': - if int(hours) == 0: - return '0' - elif int(hours) > 12: return f"{hours} minutes" - return f"{hours} o'clock" - elif minutes.startswith('0'): - minutes = f'oh {minutes[1:]}' - return f"{hours} {minutes}" - 
else: - hours, minutes, seconds = match - if int(hours) != 0: - return f"{hours} {'oh oh' if minutes == '00' else f'oh {minutes}' if minutes.startswith('0') else {minutes}} {'' if seconds == '00' else f'oh {seconds}' if seconds.startswith('0') else seconds}" - elif minutes != '00': - return f"{minutes} {'oh oh' if seconds == '00' else f'oh {seconds}' if seconds.startswith('0') else seconds}" - else: - return seconds - -def _expand_dollars(m): - match = m.group(1) - parts = match.split('.') - if len(parts) > 2: - return match + ' dollars' # Unexpected format - dollars = int(parts[0]) if parts[0] else 0 - cents = int(parts[1]) if len(parts) > 1 and parts[1] else 0 - if dollars and cents: - dollar_unit = 'dollar' if dollars == 1 else 'dollars' - cent_unit = 'cent' if cents == 1 else 'cents' - return '%s %s, %s %s' % (dollars, dollar_unit, cents, cent_unit) - elif dollars: - dollar_unit = 'dollar' if dollars == 1 else 'dollars' - return '%s %s' % (dollars, dollar_unit) - elif cents: - cent_unit = 'cent' if cents == 1 else 'cents' - return '%s %s' % (cents, cent_unit) - else: - return 'zero dollars' - -def _expand_decimal_point(m): - match = m.group(1) - match = match.split('.') - return match[0] + ' point ' + ' point '.join(' '.join(list(match[i])) for i in range(1, len(match))) - -def _expand_fraction(m): - match = m.group(1) - match = match.split('/') - return ' over '.join(match) if len(match)==2 else ' slash '.join(match) - -def _expand_multiply(m): - return ' times '.join(m.group(1).split('*')) - -def _expand_divide(m): - return ' over '.join(m.group(1).split('/')) - -def _expand_add(m): - return ' plus '.join(m.group(1).split('+')) - -def _expand_subtract(m): - return ' minus '.join(m.group(1).split('-')) - -def _expand_ordinal(m): - return _inflect.number_to_words(m.group(0), andword='') - -def _expand_number(m): - num = int(m.group(0)) - if num > 1000 and num < 3000: - if num == 2000: - return 'two thousand' - elif num > 2000 and num < 2010: - return 'two 
thousand ' + _inflect.number_to_words(num % 100) - elif num % 100 == 0: - return _inflect.number_to_words(num // 100) + ' hundred' - else: - return _inflect.number_to_words(num, andword='', zero='oh', group=2).replace(', ', ' ') - else: - return _inflect.number_to_words(num, andword='') - -def normalize_numbers(text): - text = re.sub(_num_prefix_re, _expand_num_prefix, text) - text = re.sub(_num_suffix_re, _expand_num_suffix, text) - text = re.sub(_comma_number_re, _remove_commas, text) - text = re.sub(_date_re, _expand_date, text) - text = re.sub(_phone_number_re, _expand_phone_number, text) - text = re.sub(_time_re, _expand_time, text) - text = re.sub(_pounds_re, r'\1 pounds', text) - text = re.sub(_dollars_re, _expand_dollars, text) - text = re.sub(_decimal_number_re, _expand_decimal_point, text) - text = re.sub(_multiply_re, _expand_multiply, text) - text = re.sub(_divide_re, _expand_divide, text) - text = re.sub(_add_re, _expand_add, text) - text = re.sub(_subtract_re, _expand_subtract, text) - - text = re.sub(_fraction_re, _expand_fraction, text) - text = re.sub(_ordinal_re, _expand_ordinal, text) - for _ in range(2): # need to do this twice to find all matches - text = re.sub(_num_letter_split_re, _split_alphanumeric, text) - text = re.sub(_number_re, _expand_number, text) - return text - -#################################################################################################### -# Special characters & other patterns - -_special_characters = [(re.compile(x[0]), x[1]) for x in [ - ('@', ' at '), - ('&', ' and '), - ('%', ' percent '), - (':', '.'), - (';', ','), - (r'\+', ' plus '), - (r'\\', ' backslash '), - ('~', ' about '), - ('(^| )<3', ' heart '), - ('<=', ' less than or equal to '), - ('>=', ' greater than or equal to '), - ('<', ' less than '), - ('>', ' greater than '), - ('=', ' equals '), - ('/', ' slash '), - ('_', ' '), - (r'\*', ' '), -]] -_link_header_re = re.compile(r'(https?://)') -_dash_re = re.compile(r'(. 
- .)') -_dot_re = re.compile(r'([A-Z]\.[A-Z])', re.IGNORECASE) -_parentheses_re = re.compile(r'[\(\[\{].*[\)\]\}](.|$)') - -def expand_special_characters(text): - for regex, replacement in _special_characters: - text = re.sub(regex, replacement, text) - return text - -def _expand_link_header(m): - return 'h t t p s colon slash slash ' - -def _expand_dash(m): - match = m.group(0) - return f"{match[0]}, {match[4]}" - -def _expand_dot(m): - match = m.group(0) - return f"{match[0]} dot {match[2]}" - -def _expand_parantheses(m): - match = m.group(0) - match = re.sub(r'[\(\[\{]', ', ', match) - match = re.sub(r'[\)\]\}][^$.!?,]', ', ', match) - match = re.sub(r'[\)\]\}]', '', match) - return match - -def normalize_special(text): - text = re.sub(_link_header_re, _expand_link_header, text) - text = re.sub(_dash_re, _expand_dash, text) - text = re.sub(_dot_re, _expand_dot, text) - text = re.sub(_parentheses_re, _expand_parantheses, text) - return text - -#################################################################################################### -# Misc - -def lowercase(text): - return text.lower() - -def convert_to_ascii(text): - return unidecode(text) - -def normalize_newlines(text): - text = text.split('\n') - for i in range(len(text)): - text[i] = text[i].strip() - if not text[i]: continue - if text[i][-1] not in '.!?': - text[i] = f"{text[i]}." 
- return ' '.join(text) - -def remove_unknown_characters(text): - text = re.sub(r"[^A-Za-z !\$%&'\*\+,-./0123456789<>\?_]", "", text) - text = re.sub(r"[<>/_+]", "", text) - return text - -def collapse_whitespace(text): - text = re.sub(r'\s+', ' ', text) - text = re.sub(r' [.\?!,]', lambda m: m.group(0)[1], text) - return text.strip() - -def dedup_punctuation(text): - text = re.sub(r"\.\.\.+", "[ELLIPSIS]", text) - text = re.sub(r",+", ",", text) - text = re.sub(r"[\.,]*\.[\.,]*", ".", text) - text = re.sub(r"[\.,!]*![\.,!]*", "!", text) - text = re.sub(r"[\.,!\?]*\?[\.,!\?]*", "?", text) - text = re.sub(r"\[ELLIPSIS\]", "...", text) - return text - -def clean_text(text): - text = convert_to_ascii(text) - text = normalize_newlines(text) - text = normalize_numbers(text) - text = normalize_special(text) - text = expand_abbreviations(text) - text = expand_special_characters(text) - text = lowercase(text) - text = remove_unknown_characters(text) - text = collapse_whitespace(text) - text = dedup_punctuation(text) - return text - - -if __name__ == '__main__': - print(clean_text('1,2,3,456,176')) - print(clean_text('123,456,789')) - print(clean_text('123,456,789th')) - print(clean_text('123-456-7890')) - print(clean_text('111-111-1111')) - print(clean_text('(111) 111-1111')) - print(clean_text('A(111) 111-1111')) - print(clean_text('A (111) 111-1111')) - print(clean_text('$2.47')) - print(clean_text('$247')) - print(clean_text('$0.27')) - print(clean_text('$1.00')) - print(clean_text('£20')) - for i in range(1990, 2030): - print(clean_text(str(i))) - print(clean_text('2656')) - print(clean_text('1024')) - print(clean_text('2.47023')) - print(clean_text('20.47023')) - print(clean_text('1.17.1.1')) - print(clean_text('111.111.1111')) - print(clean_text('1/1/2025')) - print(clean_text('1-1-2025')) - print(clean_text('1-1-25')) - print(clean_text('A 1/1/11 A')) - print(clean_text('A 1/1 A')) - print(clean_text('1/1')) - print(clean_text('1/10')) - print(clean_text('1/1/10')) 
- print(clean_text('11/1/1/10')) - - print(clean_text('0:00')) - print(clean_text('12:00')) - print(clean_text('13:00')) - print(clean_text('8:00')) - print(clean_text('8:05')) - print(clean_text('8:15')) - print(clean_text('0:00:00')) - print(clean_text('00:01:10')) - print(clean_text('00:10:01')) - print(clean_text('01:01:01')) - print(clean_text('00:01:00')) - print(clean_text('01:00:00')) - - print(clean_text('-1 + 2 * 3 - 4 / 5')) - print(clean_text('-1+2*3-5/4/25')) - - print(clean_text('100x1')) - print(clean_text('100k')) - print(clean_text('100m')) - print(clean_text('100b')) - print(clean_text('100t')) - - print(clean_text('#1')) - - print(clean_text('12:00')) - print(clean_text('11:59')) - print(clean_text('01:00')) - print(clean_text('0100')) - - print(clean_text('1st 2nd 3rd 4th')) - print(clean_text('1K 1M 1B 1T 1K1M1B1T')) - print(clean_text('and/or')) From 1083b1a2b08671c59efc61c0b3fe8f0f508f1b5c Mon Sep 17 00:00:00 2001 From: Biswas Poudel <142727455+biswas445@users.noreply.github.com> Date: Mon, 12 Jan 2026 15:03:54 +0545 Subject: [PATCH 17/27] Delete soprano/vocos directory --- soprano/vocos/decoder.py | 45 --------------------- soprano/vocos/heads.py | 50 ----------------------- soprano/vocos/models.py | 61 ----------------------------- soprano/vocos/modules.py | 47 ---------------------- soprano/vocos/spectral_ops.py | 74 ----------------------------------- 5 files changed, 277 deletions(-) delete mode 100644 soprano/vocos/decoder.py delete mode 100644 soprano/vocos/heads.py delete mode 100644 soprano/vocos/models.py delete mode 100644 soprano/vocos/modules.py delete mode 100644 soprano/vocos/spectral_ops.py diff --git a/soprano/vocos/decoder.py b/soprano/vocos/decoder.py deleted file mode 100644 index 75d506a..0000000 --- a/soprano/vocos/decoder.py +++ /dev/null @@ -1,45 +0,0 @@ -import torch -from torch import nn - -from .models import VocosBackbone -from .heads import ISTFTHead - - -class SopranoDecoder(nn.Module): - def __init__(self, - 
num_input_channels=512, - decoder_num_layers=8, - decoder_dim=512, - decoder_intermediate_dim=None, - hop_length=512, - n_fft=2048, - upscale=4, - dw_kernel=3, - ): - super().__init__() - self.decoder_initial_channels = num_input_channels - self.num_layers = decoder_num_layers - self.dim = decoder_dim - self.intermediate_dim = decoder_intermediate_dim if decoder_intermediate_dim else decoder_dim*3 - self.hop_length = hop_length - self.n_fft = n_fft - self.upscale = upscale - self.dw_kernel = dw_kernel - - self.decoder = VocosBackbone(input_channels=self.decoder_initial_channels, - dim=self.dim, - intermediate_dim=self.intermediate_dim, - num_layers=self.num_layers, - input_kernel_size=dw_kernel, - dw_kernel_size=dw_kernel, - ) - self.head = ISTFTHead(dim=self.dim, - n_fft=self.n_fft, - hop_length=self.hop_length) - - def forward(self, x): - T = x.size(2) - x = torch.nn.functional.interpolate(x, size=self.upscale*(T-1)+1, mode='linear', align_corners=True) - x = self.decoder(x) - reconstructed = self.head(x) - return reconstructed diff --git a/soprano/vocos/heads.py b/soprano/vocos/heads.py deleted file mode 100644 index 5b9e15c..0000000 --- a/soprano/vocos/heads.py +++ /dev/null @@ -1,50 +0,0 @@ -import torch -from torch import nn -from .spectral_ops import ISTFT - - -class ISTFTHead(nn.Module): - """ - ISTFT Head module for predicting STFT complex coefficients. - - Args: - dim (int): Hidden dimension of the model. - n_fft (int): Size of Fourier transform. - hop_length (int): The distance between neighboring sliding window frames, which should align with - the resolution of the input features. - padding (str, optional): Type of padding. Options are "center" or "same". Defaults to "same". 
- """ - - def __init__(self, dim: int, n_fft: int, hop_length: int, padding: str = "center"): - super().__init__() - out_dim = n_fft + 2 - self.out = torch.nn.Linear(dim, out_dim) - self.istft = ISTFT(n_fft=n_fft, hop_length=hop_length, win_length=n_fft, padding=padding) - - @torch.compiler.disable - def forward(self, x: torch.Tensor) -> torch.Tensor: - """ - Forward pass of the ISTFTHead module. - - Args: - x (Tensor): Input tensor of shape (B, L, H), where B is the batch size, - L is the sequence length, and H denotes the model dimension. - - Returns: - Tensor: Reconstructed time-domain audio signal of shape (B, T), where T is the length of the output signal. - """ - x = self.out(x.transpose(1,2)).transpose(1, 2) - mag, p = x.chunk(2, dim=1) - mag = torch.exp(mag) - mag = torch.clip(mag, max=1e2) # safeguard to prevent excessively large magnitudes - # wrapping happens here. These two lines produce real and imaginary value - x = torch.cos(p) - y = torch.sin(p) - # recalculating phase here does not produce anything new - # only costs time - # phase = torch.atan2(y, x) - # S = mag * torch.exp(phase * 1j) - # better directly produce the complex value - S = mag * (x + 1j * y) - audio = self.istft(S) - return audio diff --git a/soprano/vocos/models.py b/soprano/vocos/models.py deleted file mode 100644 index 458d815..0000000 --- a/soprano/vocos/models.py +++ /dev/null @@ -1,61 +0,0 @@ -from typing import Optional - -import torch -from torch import nn - -from .modules import ConvNeXtBlock - -class VocosBackbone(nn.Module): - """ - Vocos backbone module built with ConvNeXt blocks. Supports additional conditioning with Adaptive Layer Normalization - - Args: - input_channels (int): Number of input features channels. - dim (int): Hidden dimension of the model. - intermediate_dim (int): Intermediate dimension used in ConvNeXtBlock. - num_layers (int): Number of ConvNeXtBlock layers. - layer_scale_init_value (float, optional): Initial value for layer scaling. 
Defaults to `1 / num_layers`. - """ - - def __init__( - self, - input_channels: int, - dim: int, - intermediate_dim: int, - num_layers: int, - input_kernel_size: int = 9, - dw_kernel_size: int = 9, - layer_scale_init_value: Optional[float] = None, - pad: str = 'zeros', - ): - super().__init__() - self.embed = nn.Conv1d(input_channels, dim, kernel_size=input_kernel_size, padding=input_kernel_size//2, padding_mode=pad) - self.norm = nn.LayerNorm(dim, eps=1e-6) - self.convnext = nn.ModuleList( - [ - ConvNeXtBlock( - dim=dim, - intermediate_dim=intermediate_dim, - dw_kernel_size=dw_kernel_size, - layer_scale_init_value=layer_scale_init_value or 1 / num_layers**0.5, - ) - for _ in range(num_layers) - ] - ) - self.final_layer_norm = nn.LayerNorm(dim, eps=1e-6) - self.apply(self._init_weights) - - def _init_weights(self, m): - if isinstance(m, (nn.Conv1d, nn.Linear)): - nn.init.trunc_normal_(m.weight, std=0.02) - if m.bias is not None: nn.init.constant_(m.bias, 0) - - def forward(self, x: torch.Tensor) -> torch.Tensor: - x = self.embed(x) # (B, C, L) - x = self.norm(x.transpose(1, 2)) - x = x.transpose(1, 2) - for conv_block in self.convnext: - x = conv_block(x) - x = self.final_layer_norm(x.transpose(1, 2)) - x = x.transpose(1, 2) - return x diff --git a/soprano/vocos/modules.py b/soprano/vocos/modules.py deleted file mode 100644 index f969d4f..0000000 --- a/soprano/vocos/modules.py +++ /dev/null @@ -1,47 +0,0 @@ -import torch -from torch import nn - - -class ConvNeXtBlock(nn.Module): - """ConvNeXt Block adapted from https://github.com/facebookresearch/ConvNeXt to 1D audio signal. - - Args: - dim (int): Number of input channels. - intermediate_dim (int): Dimensionality of the intermediate layer. - layer_scale_init_value (float, optional): Initial value for the layer scale. None means no scaling. - Defaults to None. 
- """ - - def __init__( - self, - dim: int, - intermediate_dim: int, - layer_scale_init_value: float, - dw_kernel_size: int = 9, - ): - super().__init__() - self.dwconv = nn.Conv1d(dim, dim, kernel_size=dw_kernel_size, padding=dw_kernel_size//2, groups=dim) # depthwise conv - self.norm = nn.LayerNorm(dim, eps=1e-6) - self.pwconv1 = nn.Linear(dim, intermediate_dim) # pointwise/1x1 convs, implemented with linear layers - self.act = nn.GELU() - self.pwconv2 = nn.Linear(intermediate_dim, dim) - self.gamma = ( - nn.Parameter(layer_scale_init_value * torch.ones(dim), requires_grad=True) - if layer_scale_init_value > 0 - else None - ) - - def forward(self, x: torch.Tensor) -> torch.Tensor: - residual = x - x = self.dwconv(x) - x = x.transpose(1, 2) # (B, C, T) -> (B, T, C) - x = self.norm(x) - x = self.pwconv1(x) - x = self.act(x) - x = self.pwconv2(x) - if self.gamma is not None: - x = self.gamma * x - x = x.transpose(1, 2) # (B, T, C) -> (B, C, T) - - x = residual + x - return x diff --git a/soprano/vocos/spectral_ops.py b/soprano/vocos/spectral_ops.py deleted file mode 100644 index 8a38cb8..0000000 --- a/soprano/vocos/spectral_ops.py +++ /dev/null @@ -1,74 +0,0 @@ -import torch -from torch import nn - -class ISTFT(nn.Module): - """ - Custom implementation of ISTFT since torch.istft doesn't allow custom padding (other than `center=True`) with - windowing. This is because the NOLA (Nonzero Overlap Add) check fails at the edges. - See issue: https://github.com/pytorch/pytorch/issues/62323 - Specifically, in the context of neural vocoding we are interested in "same" padding analogous to CNNs. - The NOLA constraint is met as we trim padded samples anyway. - - Args: - n_fft (int): Size of Fourier transform. - hop_length (int): The distance between neighboring sliding window frames. - win_length (int): The size of window frame and STFT filter. - padding (str, optional): Type of padding. Options are "center" or "same". Defaults to "same". 
- """ - - def __init__(self, n_fft: int, hop_length: int, win_length: int, padding: str = "same"): - super().__init__() - if padding not in ["center", "same"]: - raise ValueError("Padding must be 'center' or 'same'.") - self.padding = padding - self.n_fft = n_fft - self.hop_length = hop_length - self.win_length = win_length - window = torch.hann_window(win_length) - self.register_buffer("window", window) - - def forward(self, spec: torch.Tensor) -> torch.Tensor: - """ - Compute the Inverse Short Time Fourier Transform (ISTFT) of a complex spectrogram. - - Args: - spec (Tensor): Input complex spectrogram of shape (B, N, T), where B is the batch size, - N is the number of frequency bins, and T is the number of time frames. - - Returns: - Tensor: Reconstructed time-domain signal of shape (B, L), where L is the length of the output signal. - """ - if self.padding == "center": - spec[:,0] = 0 # fixes some strange bug where first/last freqs don't matter when bs<16 which causes exploding gradients - spec[:,-1] = 0 - # Fallback to pytorch native implementation - return torch.istft(spec, self.n_fft, self.hop_length, self.win_length, self.window, center=True) - elif self.padding == "same": - pad = (self.win_length - self.hop_length) // 2 - else: - raise ValueError("Padding must be 'center' or 'same'.") - - assert spec.dim() == 3, "Expected a 3D tensor as input" - B, N, T = spec.shape - - # Inverse FFT - ifft = torch.fft.irfft(spec, self.n_fft, dim=1, norm="backward") - ifft = ifft * self.window[None, :, None] - - # Overlap and Add - output_size = (T - 1) * self.hop_length + self.win_length - y = torch.nn.functional.fold( - ifft, output_size=(1, output_size), kernel_size=(1, self.win_length), stride=(1, self.hop_length), - )[:, 0, 0, pad:-pad] - - # Window envelope - window_sq = self.window.square().expand(1, T, -1).transpose(1, 2) - window_envelope = torch.nn.functional.fold( - window_sq, output_size=(1, output_size), kernel_size=(1, self.win_length), stride=(1, 
self.hop_length), - ).squeeze()[pad:-pad] - - # Normalize - assert (window_envelope > 1e-11).all() - y = y / window_envelope - - return y From 161b87148a20ef9508907396b2e6134654a7b380 Mon Sep 17 00:00:00 2001 From: Biswas Poudel <142727455+biswas445@users.noreply.github.com> Date: Mon, 12 Jan 2026 15:04:04 +0545 Subject: [PATCH 18/27] Delete soprano/__init__.py --- soprano/__init__.py | 12 ------------ 1 file changed, 12 deletions(-) delete mode 100644 soprano/__init__.py diff --git a/soprano/__init__.py b/soprano/__init__.py deleted file mode 100644 index 01940e8..0000000 --- a/soprano/__init__.py +++ /dev/null @@ -1,12 +0,0 @@ -""" -Soprano TTS - Ultra-realistic Text-to-Speech System - -This package provides high-quality text-to-speech functionality with both -REST API and WebSocket streaming capabilities. -""" - -from .tts import SopranoTTS - -__version__ = "0.0.2" -__author__ = "ekwek1" -__all__ = ["SopranoTTS"] \ No newline at end of file From 14cac5fe2cefcae5e6b068e5bf3e8bb19544b751 Mon Sep 17 00:00:00 2001 From: Biswas Poudel <142727455+biswas445@users.noreply.github.com> Date: Mon, 12 Jan 2026 15:04:11 +0545 Subject: [PATCH 19/27] Delete soprano/server.py --- soprano/server.py | 139 ---------------------------------------------- 1 file changed, 139 deletions(-) delete mode 100644 soprano/server.py diff --git a/soprano/server.py b/soprano/server.py deleted file mode 100644 index 035ac9d..0000000 --- a/soprano/server.py +++ /dev/null @@ -1,139 +0,0 @@ -""" -Soprano TTS Server -Menu-driven interface to launch different server options. -""" -import sys -import os -import subprocess - - -def display_menu() -> None: - """Display the main menu options""" - print("\n" + "="*60) - print(" SOPRANO TTS SERVER MENU") - print("="*60) - print("Select an option:") - print() - print("1. Start API Server") - print(" OpenAI-compatible API for workflow integration") - print(" Accessible at http://localhost:8000/v1/audio/speech") - print() - print("2. 
Test API Server") - print(" Test client for the OpenAI-compatible API") - print(" Requires API server to be running") - print() - print("3. Start WebSocket Server") - print(" Real-time audio streaming for interactive applications") - print(" Available at ws://localhost:8001/ws/tts") - print() - print("4. Test WebSocket Server") - print(" Test client for real-time audio streaming") - print(" Requires WebSocket server to be running") - print() - print("5. Start WebUI") - print(" Gradio web interface for Soprano TTS") - print(" Opens browser with interactive UI") - print() - print("6. Start CLI") - print(" Command-line interface for Soprano TTS") - print(" Interactive menu for text synthesis") - print() - print("7. Exit") - print("="*60) - - -def get_user_choice() -> str: - """Get and validate user choice from the menu""" - while True: - try: - choice = input("Enter your choice (1-7): ").strip() - if choice in ['1', '2', '3', '4', '5', '6', '7']: - return choice - else: - print("Invalid choice. Please enter 1, 2, 3, 4, 5, 6, or 7.") - except (KeyboardInterrupt, EOFError): - print("\n\nSoprano TTS Server menu interrupted. Goodbye!") - sys.exit(0) - - -def main_menu() -> None: - """Display the main menu and handle user selection""" - # Get the root directory (where this script is located) - root_dir = os.path.dirname(os.path.dirname(os.path.abspath(__file__))) - - while True: - display_menu() - - choice = get_user_choice() - - if choice == '1': - # Open new cmdline in root, cd to soprano/server, run python api.py - cmd = f'start cmd /k "cd /d {root_dir} && cd soprano && cd server && python api.py"' - subprocess.run(cmd, shell=True) - print("API server started in new terminal. 
This menu will now close.") - sys.exit(0) - elif choice == '2': - # Open new cmdline in root, cd to soprano/server, run python api.py and test_api.py in separate terminals - cmd1 = f'start cmd /k "cd /d {root_dir} && cd soprano && cd server && python api.py"' - cmd2 = f'start cmd /k "cd /d {root_dir} && cd soprano && cd server && ping 127.0.0.1 -n 11 > nul && python test_api.py"' - subprocess.run(cmd1, shell=True) - subprocess.run(cmd2, shell=True) - print("API server and test client started in new terminals. This menu will now close.") - sys.exit(0) - elif choice == '3': - # Open new cmdline in root, cd to soprano/server, run python websocket.py - cmd = f'start cmd /k "cd /d {root_dir} && cd soprano && cd server && python websocket.py"' - subprocess.run(cmd, shell=True) - print("WebSocket server started in new terminal. This menu will now close.") - sys.exit(0) - elif choice == '4': - # Open new cmdline in root, cd to soprano/server, run python websocket.py and test_websocket.py in separate terminals - cmd1 = f'start cmd /k "cd /d {root_dir} && cd soprano && cd server && python websocket.py"' - cmd2 = f'start cmd /k "cd /d {root_dir} && cd soprano && cd server && ping 127.0.0.1 -n 11 > nul && python test_websocket.py"' - subprocess.run(cmd1, shell=True) - subprocess.run(cmd2, shell=True) - print("WebSocket server and test client started in new terminals. This menu will now close.") - sys.exit(0) - elif choice == '5': - # Open new cmdline in root, cd to soprano, run python webui.py - cmd = f'start cmd /k "cd /d {root_dir} && cd soprano && python webui.py"' - subprocess.run(cmd, shell=True) - print("WebUI started in new terminal. This menu will now close.") - sys.exit(0) - elif choice == '6': - # Open new cmdline in root, cd to soprano, run python soprano_cli.py - cmd = f'start cmd /k "cd /d {root_dir} && cd soprano && python soprano_cli.py"' - subprocess.run(cmd, shell=True) - print("CLI started in new terminal. 
This menu will now close.") - sys.exit(0) - elif choice == '7': - print("Thank you for using Soprano TTS. Goodbye!") - sys.exit(0) - - -def main() -> None: - """ - Main entry point for the server module. - Initializes device detection and starts the main menu. - """ - try: - # Check available device - try: - import torch - device_info = f"Available device: {'CUDA (GPU)' if torch.cuda.is_available() else 'CPU'}" - print(device_info) - except ImportError: - print("Available device: CPU (PyTorch not available)") - - # Start the main menu - main_menu() - except KeyboardInterrupt: - print("\n\nSoprano TTS Server interrupted. Goodbye!") - sys.exit(0) - except Exception as e: - print(f"An unexpected error occurred in main: {e}") - sys.exit(1) - - -if __name__ == "__main__": - main() \ No newline at end of file From 218ad5fc7ea3af437e0562984157fdf58f4980f0 Mon Sep 17 00:00:00 2001 From: Biswas Poudel <142727455+biswas445@users.noreply.github.com> Date: Mon, 12 Jan 2026 15:04:19 +0545 Subject: [PATCH 20/27] Delete soprano/soprano_cli.py --- soprano/soprano_cli.py | 186 ----------------------------------------- 1 file changed, 186 deletions(-) delete mode 100644 soprano/soprano_cli.py diff --git a/soprano/soprano_cli.py b/soprano/soprano_cli.py deleted file mode 100644 index 38a8f64..0000000 --- a/soprano/soprano_cli.py +++ /dev/null @@ -1,186 +0,0 @@ -""" -Soprano TTS Command Line Interface -""" -import argparse -import sys -import os -import torch -# Add the parent directory to the Python path to resolve import issues when running directly -sys.path.insert(0, os.path.dirname(os.path.dirname(os.path.abspath(__file__)))) -from soprano import SopranoTTS - -try: - import sounddevice as sd - SOUNDDEVICE_AVAILABLE = True -except ImportError: - SOUNDDEVICE_AVAILABLE = False - -def get_device(): - """Determine the best available device (CUDA if available, otherwise CPU)""" - return 'cuda' if torch.cuda.is_available() else 'cpu' - -def play_audio(audio_tensor): - """Play audio 
tensor using sounddevice""" - if not SOUNDDEVICE_AVAILABLE: - print("Error: sounddevice library not available. Install it with 'pip install sounddevice'") - return - - import numpy as np - audio_np = audio_tensor.cpu().numpy() if isinstance(audio_tensor, torch.Tensor) else audio_tensor - - duration = len(audio_np) / 32000 - print(f"Playing audio ({duration:.2f}s)...") - - sample_rate = 32000 - sd.play(audio_np, samplerate=sample_rate) - - import time - time.sleep(duration + 0.5) - - try: - if sd.get_status().playing: - sd.wait() - except: - time.sleep(0.5) - -def validate_text(text): - """Validate input text""" - stripped_text = text.strip() if text else "" - if not stripped_text: - print("Error: Text cannot be empty.") - return False - if len(stripped_text) > 1000: - print("Error: Text is too long (max 1000 characters).") - return False - return True - -def get_validated_input(prompt, validator_func, error_msg=None): - """Get validated input from user""" - while True: - user_input = input(prompt).strip() - if validator_func(user_input): - return user_input - else: - if error_msg: - print(error_msg) - else: - print("Invalid input, please try again.") - -def get_next_filename(base_name="output_audio", ext=".wav"): - """Generate next available filename with incremental numbering""" - import os - - output_dir = "audio_output" - if not os.path.exists(output_dir): - os.makedirs(output_dir) - - counter = 0 - while True: - if counter == 0: - filename = f"{base_name}{ext}" - else: - filename = f"{base_name}{counter}{ext}" - - full_path = os.path.join(output_dir, filename) - if not os.path.exists(full_path): - return full_path - counter += 1 - -def main(): - parser = argparse.ArgumentParser(description='Soprano Text-to-Speech CLI') - parser.add_argument('--model-path', '-m', help='Path to local model directory (optional)') - parser.add_argument('--backend', '-b', default='auto', - choices=['auto', 'transformers', 'lmdeploy'], - help='Backend to use for inference') - 
parser.add_argument('--cache-size', '-c', type=int, default=10, - help='Cache size in MB (for lmdeploy backend)') - - args = parser.parse_args() - - device = get_device() - - try: - import io - import contextlib - - with contextlib.redirect_stdout(io.StringIO()), contextlib.redirect_stderr(io.StringIO()): - tts = SopranoTTS( - backend=args.backend, - device=device, - cache_size_mb=args.cache_size, - model_path=args.model_path - ) - except Exception as e: - print(f"Error initializing model: {e}") - sys.exit(1) - - print("Soprano TTS is ready. Starting interactive menu...") - - while True: - print("\n" + "="*50) - print(" SOPRANO TTS MENU") - print("="*50) - print("1. Input text for synthesis (with file saving)") - print("2. Real-time audio playback (no file saving)") - print("3. View saved audio files") - print("4. Exit") - print("="*50) - - choice = input("Enter your choice (1-4): ").strip() - - if choice == '1': - text = get_validated_input( - "Enter text to synthesize: ", - validate_text, - "Text must not be empty and must be under 1000 characters." - ) - - output_path = get_next_filename() - print(f"Using output path: {output_path}") - - print(f"Generating speech for: '{text[:50]}{'...' if len(text) > 50 else ''}'") - try: - tts.infer(text, out_path=output_path) - print(f"✓ Audio saved to: {output_path}") - except Exception as e: - print(f"✗ Error generating audio: {e}") - - elif choice == '2': - text = get_validated_input( - "Enter text for real-time playback: ", - validate_text, - "Text must not be empty and must be under 1000 characters." - ) - - print(f"Generating real-time audio for: '{text[:50]}{'...' 
if len(text) > 50 else ''}'") - try: - audio_tensor = tts.infer(text) - print("Playing audio...") - play_audio(audio_tensor) - print("✓ Playback finished.") - except Exception as e: - print(f"✗ Error during playback: {e}") - - elif choice == '3': - import os - output_dir = "audio_output" - if os.path.exists(output_dir): - files = [f for f in os.listdir(output_dir) if f.lower().endswith('.wav')] - if files: - print(f"Found {len(files)} audio file(s) in {output_dir}/:") - for i, file in enumerate(sorted(files), 1): - print(f" {i}. {file}") - else: - print(f"No audio files found in {output_dir}/") - else: - print(f"No {output_dir}/ directory exists yet.") - - elif choice == '4': - print("Thank you for using Soprano TTS. Goodbye!") - break - - else: - print("✗ Invalid choice. Please enter 1, 2, 3, or 4.") - -if __name__ == "__main__": - main() \ No newline at end of file From 0e984a3ca1e8a035c5f91aa49f44cb9ab9e2dd5f Mon Sep 17 00:00:00 2001 From: Biswas Poudel <142727455+biswas445@users.noreply.github.com> Date: Mon, 12 Jan 2026 15:04:27 +0545 Subject: [PATCH 21/27] Delete soprano/tts.py --- soprano/tts.py | 202 ------------------------------------------------- 1 file changed, 202 deletions(-) delete mode 100644 soprano/tts.py diff --git a/soprano/tts.py b/soprano/tts.py deleted file mode 100644 index 8d15011..0000000 --- a/soprano/tts.py +++ /dev/null @@ -1,202 +0,0 @@ -from .vocos.decoder import SopranoDecoder -from .utils.text import clean_text -import torch -import re -from unidecode import unidecode -from scipy.io import wavfile -from huggingface_hub import hf_hub_download -import os -import time - - -class SopranoTTS: - def __init__(self, - backend='auto', - device='cuda', - cache_size_mb=100, - decoder_batch_size=1, - model_path=None): - RECOGNIZED_DEVICES = ['cuda', 'cpu'] - RECOGNIZED_BACKENDS = ['auto', 'lmdeploy', 'transformers'] - assert device in RECOGNIZED_DEVICES, f"unrecognized device {device}, device must be in {RECOGNIZED_DEVICES}" - if backend == 
'auto': - if device == 'cpu': - backend = 'transformers' - else: - try: - import lmdeploy - backend = 'lmdeploy' - except ImportError: - backend='transformers' - print(f"Using backend {backend}.") - assert backend in RECOGNIZED_BACKENDS, f"unrecognized backend {backend}, backend must be in {RECOGNIZED_BACKENDS}" - - if backend == 'lmdeploy': - from .backends.lmdeploy import LMDeployModel - self.pipeline = LMDeployModel(device=device, cache_size_mb=cache_size_mb, model_path=model_path) - elif backend == 'transformers': - from .backends.transformers import TransformersModel - self.pipeline = TransformersModel(device=device, model_path=model_path) - - self.device = device - self.decoder = SopranoDecoder() - if device == 'cuda': - self.decoder = self.decoder.cuda() - map_location = 'cuda' - else: - map_location = 'cpu' - if model_path: - decoder_path = os.path.join(model_path, 'decoder.pth') - else: - decoder_path = hf_hub_download(repo_id='ekwek/Soprano-80M', filename='decoder.pth') - self.decoder.load_state_dict(torch.load(decoder_path, map_location=map_location)) - self.decoder_batch_size=decoder_batch_size - self.RECEPTIVE_FIELD = 4 # Decoder receptive field - self.TOKEN_SIZE = 2048 # Number of samples per audio token - - self.infer("Hello world!") # warmup - - def _preprocess_text(self, texts, min_length=30): - ''' - adds prompt format and sentence/part index - Enforces a minimum sentence length by merging short sentences. 
- ''' - res = [] - for text_idx, text in enumerate(texts): - text = text.strip() - cleaned_text = clean_text(text) - sentences = re.split(r"(?<=[.!?])\s+", cleaned_text) - processed = [] - for sentence in sentences: - processed.append({ - "text": sentence, - "text_idx": text_idx, - }) - - if min_length > 0 and len(processed) > 1: - merged = [] - i = 0 - while i < len(processed): - cur = processed[i] - if len(cur["text"]) < min_length: - if merged: merged[-1]["text"] = (merged[-1]["text"] + " " + cur["text"]).strip() - else: - if i + 1 < len(processed): processed[i + 1]["text"] = (cur["text"] + " " + processed[i + 1]["text"]).strip() - else: merged.append(cur) - else: merged.append(cur) - i += 1 - processed = merged - sentence_idxes = {} - for item in processed: - if item['text_idx'] not in sentence_idxes: sentence_idxes[item['text_idx']] = 0 - res.append((f'[STOP][TEXT]{item["text"]}[START]', item["text_idx"], sentence_idxes[item['text_idx']])) - sentence_idxes[item['text_idx']] += 1 - return res - - def infer(self, - text, - out_path=None, - top_p=0.95, - temperature=0.3, - repetition_penalty=1.2, - min_text_length=30): - results = self.infer_batch([text], - top_p=top_p, - temperature=temperature, - repetition_penalty=repetition_penalty, - min_text_length=min_text_length, - out_dir=None)[0] - if out_path: - wavfile.write(out_path, 32000, results.cpu().numpy()) - return results - - def infer_batch(self, - texts, - out_dir=None, - top_p=0.95, - temperature=0.3, - repetition_penalty=1.2, - min_text_length=30): - sentence_data = self._preprocess_text(texts, min_length=min_text_length) - prompts = list(map(lambda x: x[0], sentence_data)) - responses = self.pipeline.infer(prompts, - top_p=top_p, - temperature=temperature, - repetition_penalty=repetition_penalty) - hidden_states = [] - for i, response in enumerate(responses): - if response['finish_reason'] != 'stop': - print(f"Warning: some sentences did not complete generation, likely due to hallucination.") - 
hidden_state = response['hidden_state'] - hidden_states.append(hidden_state) - combined = list(zip(hidden_states, sentence_data)) - combined.sort(key=lambda x: -x[0].size(0)) - hidden_states, sentence_data = zip(*combined) - - num_texts = len(texts) - audio_concat = [[] for _ in range(num_texts)] - for sentence in sentence_data: - audio_concat[sentence[1]].append(None) - for idx in range(0, len(hidden_states), self.decoder_batch_size): - batch_hidden_states = [] - lengths = list(map(lambda x: x.size(0), hidden_states[idx:idx+self.decoder_batch_size])) - N = len(lengths) - for i in range(N): - batch_hidden_states.append(torch.cat([ - torch.zeros((1, 512, lengths[0]-lengths[i]), device=self.device), - hidden_states[idx+i].unsqueeze(0).transpose(1,2).to(self.device).to(torch.float32), - ], dim=2)) - batch_hidden_states = torch.cat(batch_hidden_states) - with torch.no_grad(): - audio = self.decoder(batch_hidden_states) - - for i in range(N): - text_id = sentence_data[idx+i][1] - sentence_id = sentence_data[idx+i][2] - audio_concat[text_id][sentence_id] = audio[i].squeeze()[-(lengths[i]*self.TOKEN_SIZE-self.TOKEN_SIZE):] - audio_concat = [torch.cat(x).cpu() for x in audio_concat] - - if out_dir: - os.makedirs(out_dir, exist_ok=True) - for i in range(len(audio_concat)): - wavfile.write(f"{out_dir}/{i}.wav", 32000, audio_concat[i].cpu().numpy()) - return audio_concat - - def infer_stream(self, - text, - chunk_size=1, - top_p=0.95, - temperature=0.3, - repetition_penalty=1.2, - min_text_length=30): - start_time = time.time() - sentence_data = self._preprocess_text([text], min_length=min_text_length) - - first_chunk = True - for sentence, _, _ in sentence_data: - responses = self.pipeline.stream_infer(sentence, - top_p=top_p, - temperature=temperature, - repetition_penalty=repetition_penalty) - hidden_states_buffer = [] - chunk_counter = chunk_size - for token in responses: - finished = token['finish_reason'] is not None - if not finished: 
hidden_states_buffer.append(token['hidden_state'][-1]) - hidden_states_buffer = hidden_states_buffer[-(2*self.RECEPTIVE_FIELD+chunk_size):] - if finished or len(hidden_states_buffer) >= self.RECEPTIVE_FIELD + chunk_size: - if finished or chunk_counter == chunk_size: - batch_hidden_states = torch.stack(hidden_states_buffer) - inp = batch_hidden_states.unsqueeze(0).transpose(1, 2).to(self.device).to(torch.float32) - with torch.no_grad(): - audio = self.decoder(inp)[0] - if finished: - audio_chunk = audio[-((self.RECEPTIVE_FIELD+chunk_counter-1)*self.TOKEN_SIZE-self.TOKEN_SIZE):] - else: - audio_chunk = audio[-((self.RECEPTIVE_FIELD+chunk_size)*self.TOKEN_SIZE-self.TOKEN_SIZE):-(self.RECEPTIVE_FIELD*self.TOKEN_SIZE-self.TOKEN_SIZE)] - chunk_counter = 0 - if first_chunk: - print(f"Streaming latency: {1000*(time.time()-start_time):.2f} ms") - first_chunk = False - yield audio_chunk.cpu() - chunk_counter += 1 From 588b0db8502ff61447504ddd979e84889abacb7e Mon Sep 17 00:00:00 2001 From: Biswas Poudel <142727455+biswas445@users.noreply.github.com> Date: Mon, 12 Jan 2026 15:04:36 +0545 Subject: [PATCH 22/27] Delete .gitignore --- .gitignore | 6 ------ 1 file changed, 6 deletions(-) delete mode 100644 .gitignore diff --git a/.gitignore b/.gitignore deleted file mode 100644 index f68d7f8..0000000 --- a/.gitignore +++ /dev/null @@ -1,6 +0,0 @@ -__pycache__/ -test.py -*.wav -dist/ -*.egg-info/ -.venv/ \ No newline at end of file From b04c0a41c7404e9a59adbd4ba3e447aba0ca342d Mon Sep 17 00:00:00 2001 From: Biswas Poudel <142727455+biswas445@users.noreply.github.com> Date: Mon, 12 Jan 2026 15:04:44 +0545 Subject: [PATCH 23/27] Delete LICENSE --- LICENSE | 201 -------------------------------------------------------- 1 file changed, 201 deletions(-) delete mode 100644 LICENSE diff --git a/LICENSE b/LICENSE deleted file mode 100644 index 261eeb9..0000000 --- a/LICENSE +++ /dev/null @@ -1,201 +0,0 @@ - Apache License - Version 2.0, January 2004 - http://www.apache.org/licenses/ - - 
TERMS AND CONDITIONS FOR USE, REPRODUCTION, AND DISTRIBUTION - - 1. Definitions. - - "License" shall mean the terms and conditions for use, reproduction, - and distribution as defined by Sections 1 through 9 of this document. - - "Licensor" shall mean the copyright owner or entity authorized by - the copyright owner that is granting the License. - - "Legal Entity" shall mean the union of the acting entity and all - other entities that control, are controlled by, or are under common - control with that entity. For the purposes of this definition, - "control" means (i) the power, direct or indirect, to cause the - direction or management of such entity, whether by contract or - otherwise, or (ii) ownership of fifty percent (50%) or more of the - outstanding shares, or (iii) beneficial ownership of such entity. - - "You" (or "Your") shall mean an individual or Legal Entity - exercising permissions granted by this License. - - "Source" form shall mean the preferred form for making modifications, - including but not limited to software source code, documentation - source, and configuration files. - - "Object" form shall mean any form resulting from mechanical - transformation or translation of a Source form, including but - not limited to compiled object code, generated documentation, - and conversions to other media types. - - "Work" shall mean the work of authorship, whether in Source or - Object form, made available under the License, as indicated by a - copyright notice that is included in or attached to the work - (an example is provided in the Appendix below). - - "Derivative Works" shall mean any work, whether in Source or Object - form, that is based on (or derived from) the Work and for which the - editorial revisions, annotations, elaborations, or other modifications - represent, as a whole, an original work of authorship. 
For the purposes - of this License, Derivative Works shall not include works that remain - separable from, or merely link (or bind by name) to the interfaces of, - the Work and Derivative Works thereof. - - "Contribution" shall mean any work of authorship, including - the original version of the Work and any modifications or additions - to that Work or Derivative Works thereof, that is intentionally - submitted to Licensor for inclusion in the Work by the copyright owner - or by an individual or Legal Entity authorized to submit on behalf of - the copyright owner. For the purposes of this definition, "submitted" - means any form of electronic, verbal, or written communication sent - to the Licensor or its representatives, including but not limited to - communication on electronic mailing lists, source code control systems, - and issue tracking systems that are managed by, or on behalf of, the - Licensor for the purpose of discussing and improving the Work, but - excluding communication that is conspicuously marked or otherwise - designated in writing by the copyright owner as "Not a Contribution." - - "Contributor" shall mean Licensor and any individual or Legal Entity - on behalf of whom a Contribution has been received by Licensor and - subsequently incorporated within the Work. - - 2. Grant of Copyright License. Subject to the terms and conditions of - this License, each Contributor hereby grants to You a perpetual, - worldwide, non-exclusive, no-charge, royalty-free, irrevocable - copyright license to reproduce, prepare Derivative Works of, - publicly display, publicly perform, sublicense, and distribute the - Work and such Derivative Works in Source or Object form. - - 3. Grant of Patent License. 
Subject to the terms and conditions of - this License, each Contributor hereby grants to You a perpetual, - worldwide, non-exclusive, no-charge, royalty-free, irrevocable - (except as stated in this section) patent license to make, have made, - use, offer to sell, sell, import, and otherwise transfer the Work, - where such license applies only to those patent claims licensable - by such Contributor that are necessarily infringed by their - Contribution(s) alone or by combination of their Contribution(s) - with the Work to which such Contribution(s) was submitted. If You - institute patent litigation against any entity (including a - cross-claim or counterclaim in a lawsuit) alleging that the Work - or a Contribution incorporated within the Work constitutes direct - or contributory patent infringement, then any patent licenses - granted to You under this License for that Work shall terminate - as of the date such litigation is filed. - - 4. Redistribution. You may reproduce and distribute copies of the - Work or Derivative Works thereof in any medium, with or without - modifications, and in Source or Object form, provided that You - meet the following conditions: - - (a) You must give any other recipients of the Work or - Derivative Works a copy of this License; and - - (b) You must cause any modified files to carry prominent notices - stating that You changed the files; and - - (c) You must retain, in the Source form of any Derivative Works - that You distribute, all copyright, patent, trademark, and - attribution notices from the Source form of the Work, - excluding those notices that do not pertain to any part of - the Derivative Works; and - - (d) If the Work includes a "NOTICE" text file as part of its - distribution, then any Derivative Works that You distribute must - include a readable copy of the attribution notices contained - within such NOTICE file, excluding those notices that do not - pertain to any part of the Derivative Works, in at least one - of 
the following places: within a NOTICE text file distributed - as part of the Derivative Works; within the Source form or - documentation, if provided along with the Derivative Works; or, - within a display generated by the Derivative Works, if and - wherever such third-party notices normally appear. The contents - of the NOTICE file are for informational purposes only and - do not modify the License. You may add Your own attribution - notices within Derivative Works that You distribute, alongside - or as an addendum to the NOTICE text from the Work, provided - that such additional attribution notices cannot be construed - as modifying the License. - - You may add Your own copyright statement to Your modifications and - may provide additional or different license terms and conditions - for use, reproduction, or distribution of Your modifications, or - for any such Derivative Works as a whole, provided Your use, - reproduction, and distribution of the Work otherwise complies with - the conditions stated in this License. - - 5. Submission of Contributions. Unless You explicitly state otherwise, - any Contribution intentionally submitted for inclusion in the Work - by You to the Licensor shall be under the terms and conditions of - this License, without any additional terms or conditions. - Notwithstanding the above, nothing herein shall supersede or modify - the terms of any separate license agreement you may have executed - with Licensor regarding such Contributions. - - 6. Trademarks. This License does not grant permission to use the trade - names, trademarks, service marks, or product names of the Licensor, - except as required for reasonable and customary use in describing the - origin of the Work and reproducing the content of the NOTICE file. - - 7. Disclaimer of Warranty. 
Unless required by applicable law or - agreed to in writing, Licensor provides the Work (and each - Contributor provides its Contributions) on an "AS IS" BASIS, - WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or - implied, including, without limitation, any warranties or conditions - of TITLE, NON-INFRINGEMENT, MERCHANTABILITY, or FITNESS FOR A - PARTICULAR PURPOSE. You are solely responsible for determining the - appropriateness of using or redistributing the Work and assume any - risks associated with Your exercise of permissions under this License. - - 8. Limitation of Liability. In no event and under no legal theory, - whether in tort (including negligence), contract, or otherwise, - unless required by applicable law (such as deliberate and grossly - negligent acts) or agreed to in writing, shall any Contributor be - liable to You for damages, including any direct, indirect, special, - incidental, or consequential damages of any character arising as a - result of this License or out of the use or inability to use the - Work (including but not limited to damages for loss of goodwill, - work stoppage, computer failure or malfunction, or any and all - other commercial damages or losses), even if such Contributor - has been advised of the possibility of such damages. - - 9. Accepting Warranty or Additional Liability. While redistributing - the Work or Derivative Works thereof, You may choose to offer, - and charge a fee for, acceptance of support, warranty, indemnity, - or other liability obligations and/or rights consistent with this - License. However, in accepting such obligations, You may act only - on Your own behalf and on Your sole responsibility, not on behalf - of any other Contributor, and only if You agree to indemnify, - defend, and hold each Contributor harmless for any liability - incurred by, or claims asserted against, such Contributor by reason - of your accepting any such warranty or additional liability. 
- - END OF TERMS AND CONDITIONS - - APPENDIX: How to apply the Apache License to your work. - - To apply the Apache License to your work, attach the following - boilerplate notice, with the fields enclosed by brackets "[]" - replaced with your own identifying information. (Don't include - the brackets!) The text should be enclosed in the appropriate - comment syntax for the file format. We also recommend that a - file or class name and description of purpose be included on the - same "printed page" as the copyright notice for easier - identification within third-party archives. - - Copyright [yyyy] [name of copyright owner] - - Licensed under the Apache License, Version 2.0 (the "License"); - you may not use this file except in compliance with the License. - You may obtain a copy of the License at - - http://www.apache.org/licenses/LICENSE-2.0 - - Unless required by applicable law or agreed to in writing, software - distributed under the License is distributed on an "AS IS" BASIS, - WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. - See the License for the specific language governing permissions and - limitations under the License. From c504576521e26ed51a158ca4c8ec455fa8504727 Mon Sep 17 00:00:00 2001 From: Biswas Poudel <142727455+biswas445@users.noreply.github.com> Date: Mon, 12 Jan 2026 15:04:52 +0545 Subject: [PATCH 24/27] Delete setup.bat --- setup.bat | 81 ------------------------------------------------------- 1 file changed, 81 deletions(-) delete mode 100644 setup.bat diff --git a/setup.bat b/setup.bat deleted file mode 100644 index 6dd4c07..0000000 --- a/setup.bat +++ /dev/null @@ -1,81 +0,0 @@ -@echo off -title Soprano TTS Setup -color 0A - -echo ================================================ -echo SOPRANO TTS SETUP -echo ================================================ -echo. -echo This script will: -echo 1. Install the Soprano TTS package -echo 2. Install/fix PyTorch with CUDA support -echo 3. Verify the installation -echo. 
-echo Press any key to continue or Ctrl+C to cancel... -pause >nul - -echo. -echo Installing required dependencies... -echo. - -REM Install all required packages -pip install fastapi huggingface_hub lmdeploy numpy scipy unidecode inflect sounddevice uvicorn gradio pyaudio - -if %errorlevel% neq 0 ( - echo Error occurred during dependency installation. Attempting to continue... -) - -echo. -echo Installing Soprano TTS package... -echo. - -REM Install the package in editable mode -pip install -e . - -if %errorlevel% neq 0 ( - echo Error occurred during installation. Attempting to fix... - goto fix_pytorch -) - -echo. -echo Installing PyTorch with CUDA support... -echo. - -:fix_pytorch -REM Uninstall current PyTorch -pip uninstall -y torch torchvision torchaudio - -REM Install PyTorch with CUDA 12.8 support -pip install torch==2.8.0 torchvision==0.23.0 torchaudio==2.8.0 --index-url https://download.pytorch.org/whl/cu128 - -if %errorlevel% neq 0 ( - echo Warning: PyTorch CUDA 12.8 installation failed. Installing CUDA 12.6 version... - pip install torch==2.8.0 torchvision==0.23.0 torchaudio==2.8.0 --index-url https://download.pytorch.org/whl/cu126 -) - -if %errorlevel% neq 0 ( - echo Warning: PyTorch CUDA installation failed. Installing CPU version... - pip install torch==2.8.0 torchvision==0.23.0 torchaudio==2.8.0 --index-url https://download.pytorch.org/whl/cpu -) - -echo. -echo Verifying installation... -echo. - -REM Test the installation -python -c "import soprano; from soprano import SopranoTTS; print('Soprano TTS imported successfully'); print('Installation verified successfully!')" - -if %errorlevel% neq 0 ( - echo Warning: Verification failed, but installation may still be OK. -) - -echo. -echo ================================================ -echo Setup completed! -echo. 
-echo To use Soprano TTS: -echo - Run start_soprano.bat to access the main menu -echo - Or run individual components as needed -echo ================================================ - -pause \ No newline at end of file From 8e0cc765573bb351f4aa54ff5255e987bfbdced7 Mon Sep 17 00:00:00 2001 From: Biswas Poudel <142727455+biswas445@users.noreply.github.com> Date: Mon, 12 Jan 2026 15:04:59 +0545 Subject: [PATCH 25/27] Delete pyproject.toml --- pyproject.toml | 44 -------------------------------------------- 1 file changed, 44 deletions(-) delete mode 100644 pyproject.toml diff --git a/pyproject.toml b/pyproject.toml deleted file mode 100644 index 0a9b483..0000000 --- a/pyproject.toml +++ /dev/null @@ -1,44 +0,0 @@ -[build-system] -requires = ["setuptools>=61.0"] -build-backend = "setuptools.build_meta" - -[project] -name = "soprano-tts" -version = "0.0.2" -authors = [ - { name="ekwek1", email="eugene.kwek.1@gmail.com" }, -] -description = "Soprano: Instant, Ultra‑Realistic Text‑to‑Speech" -readme = "README.md" -requires-python = ">=3.10" -classifiers = [ - "Programming Language :: Python :: 3", - "Operating System :: OS Independent", -] -dependencies = [ - "fastapi", - "huggingface_hub", - "lmdeploy", - "numpy", - "scipy", - "torch", - "unidecode", - "inflect", - "sounddevice", - "uvicorn", - "gradio", - "pyaudio" -] -license = {text = "MIT"} - -[project.urls] -Homepage = "https://github.com/ekwek1/soprano" -Issues = "https://github.com/ekwek1/soprano/issues" - -[project.scripts] -soprano = "soprano.soprano_cli:main" -soprano-webui = "soprano.webui:main" - -[tool.setuptools.packages.find] -where = ["."] -include = ["soprano*"] \ No newline at end of file From 30e3389c717280d4b5bd527ecf1a65de430faa4e Mon Sep 17 00:00:00 2001 From: Biswas Poudel <142727455+biswas445@users.noreply.github.com> Date: Mon, 12 Jan 2026 15:05:06 +0545 Subject: [PATCH 26/27] Delete start_soprano.bat --- start_soprano.bat | 3 --- 1 file changed, 3 deletions(-) delete mode 100644 
start_soprano.bat diff --git a/start_soprano.bat b/start_soprano.bat deleted file mode 100644 index e471e75..0000000 --- a/start_soprano.bat +++ /dev/null @@ -1,3 +0,0 @@ -@echo off -cd soprano -python server.py \ No newline at end of file From ff5b0dff224c19a3c40a82d9910174de8f894e57 Mon Sep 17 00:00:00 2001 From: Biswas Poudel <142727455+biswas445@users.noreply.github.com> Date: Mon, 12 Jan 2026 15:05:59 +0545 Subject: [PATCH 27/27] Add note about web UI audio streaming and file generation --- soprano/readme.md | 1 + 1 file changed, 1 insertion(+) create mode 100644 soprano/readme.md diff --git a/soprano/readme.md b/soprano/readme.md new file mode 100644 index 0000000..0751306 --- /dev/null +++ b/soprano/readme.md @@ -0,0 +1 @@ +The web UI now supports both real-time audio streaming and audio file generation.