diff --git a/.gitignore b/.gitignore deleted file mode 100644 index f68d7f8..0000000 --- a/.gitignore +++ /dev/null @@ -1,6 +0,0 @@ -__pycache__/ -test.py -*.wav -dist/ -*.egg-info/ -.venv/ \ No newline at end of file diff --git a/LICENSE b/LICENSE deleted file mode 100644 index 261eeb9..0000000 --- a/LICENSE +++ /dev/null @@ -1,201 +0,0 @@ - Apache License - Version 2.0, January 2004 - http://www.apache.org/licenses/ - - TERMS AND CONDITIONS FOR USE, REPRODUCTION, AND DISTRIBUTION - - 1. Definitions. - - "License" shall mean the terms and conditions for use, reproduction, - and distribution as defined by Sections 1 through 9 of this document. - - "Licensor" shall mean the copyright owner or entity authorized by - the copyright owner that is granting the License. - - "Legal Entity" shall mean the union of the acting entity and all - other entities that control, are controlled by, or are under common - control with that entity. For the purposes of this definition, - "control" means (i) the power, direct or indirect, to cause the - direction or management of such entity, whether by contract or - otherwise, or (ii) ownership of fifty percent (50%) or more of the - outstanding shares, or (iii) beneficial ownership of such entity. - - "You" (or "Your") shall mean an individual or Legal Entity - exercising permissions granted by this License. - - "Source" form shall mean the preferred form for making modifications, - including but not limited to software source code, documentation - source, and configuration files. - - "Object" form shall mean any form resulting from mechanical - transformation or translation of a Source form, including but - not limited to compiled object code, generated documentation, - and conversions to other media types. - - "Work" shall mean the work of authorship, whether in Source or - Object form, made available under the License, as indicated by a - copyright notice that is included in or attached to the work - (an example is provided in the Appendix below). - - "Derivative Works" shall mean any work, whether in Source or Object - form, that is based on (or derived from) the Work and for which the - editorial revisions, annotations, elaborations, or other modifications - represent, as a whole, an original work of authorship. For the purposes - of this License, Derivative Works shall not include works that remain - separable from, or merely link (or bind by name) to the interfaces of, - the Work and Derivative Works thereof. - - "Contribution" shall mean any work of authorship, including - the original version of the Work and any modifications or additions - to that Work or Derivative Works thereof, that is intentionally - submitted to Licensor for inclusion in the Work by the copyright owner - or by an individual or Legal Entity authorized to submit on behalf of - the copyright owner. For the purposes of this definition, "submitted" - means any form of electronic, verbal, or written communication sent - to the Licensor or its representatives, including but not limited to - communication on electronic mailing lists, source code control systems, - and issue tracking systems that are managed by, or on behalf of, the - Licensor for the purpose of discussing and improving the Work, but - excluding communication that is conspicuously marked or otherwise - designated in writing by the copyright owner as "Not a Contribution." 
- - "Contributor" shall mean Licensor and any individual or Legal Entity - on behalf of whom a Contribution has been received by Licensor and - subsequently incorporated within the Work. - - 2. Grant of Copyright License. Subject to the terms and conditions of - this License, each Contributor hereby grants to You a perpetual, - worldwide, non-exclusive, no-charge, royalty-free, irrevocable - copyright license to reproduce, prepare Derivative Works of, - publicly display, publicly perform, sublicense, and distribute the - Work and such Derivative Works in Source or Object form. - - 3. Grant of Patent License. Subject to the terms and conditions of - this License, each Contributor hereby grants to You a perpetual, - worldwide, non-exclusive, no-charge, royalty-free, irrevocable - (except as stated in this section) patent license to make, have made, - use, offer to sell, sell, import, and otherwise transfer the Work, - where such license applies only to those patent claims licensable - by such Contributor that are necessarily infringed by their - Contribution(s) alone or by combination of their Contribution(s) - with the Work to which such Contribution(s) was submitted. If You - institute patent litigation against any entity (including a - cross-claim or counterclaim in a lawsuit) alleging that the Work - or a Contribution incorporated within the Work constitutes direct - or contributory patent infringement, then any patent licenses - granted to You under this License for that Work shall terminate - as of the date such litigation is filed. - - 4. Redistribution. You may reproduce and distribute copies of the - Work or Derivative Works thereof in any medium, with or without - modifications, and in Source or Object form, provided that You - meet the following conditions: - - (a) You must give any other recipients of the Work or - Derivative Works a copy of this License; and - - (b) You must cause any modified files to carry prominent notices - stating that You changed the files; and - - (c) You must retain, in the Source form of any Derivative Works - that You distribute, all copyright, patent, trademark, and - attribution notices from the Source form of the Work, - excluding those notices that do not pertain to any part of - the Derivative Works; and - - (d) If the Work includes a "NOTICE" text file as part of its - distribution, then any Derivative Works that You distribute must - include a readable copy of the attribution notices contained - within such NOTICE file, excluding those notices that do not - pertain to any part of the Derivative Works, in at least one - of the following places: within a NOTICE text file distributed - as part of the Derivative Works; within the Source form or - documentation, if provided along with the Derivative Works; or, - within a display generated by the Derivative Works, if and - wherever such third-party notices normally appear. The contents - of the NOTICE file are for informational purposes only and - do not modify the License. You may add Your own attribution - notices within Derivative Works that You distribute, alongside - or as an addendum to the NOTICE text from the Work, provided - that such additional attribution notices cannot be construed - as modifying the License. 
- - You may add Your own copyright statement to Your modifications and - may provide additional or different license terms and conditions - for use, reproduction, or distribution of Your modifications, or - for any such Derivative Works as a whole, provided Your use, - reproduction, and distribution of the Work otherwise complies with - the conditions stated in this License. - - 5. Submission of Contributions. Unless You explicitly state otherwise, - any Contribution intentionally submitted for inclusion in the Work - by You to the Licensor shall be under the terms and conditions of - this License, without any additional terms or conditions. - Notwithstanding the above, nothing herein shall supersede or modify - the terms of any separate license agreement you may have executed - with Licensor regarding such Contributions. - - 6. Trademarks. This License does not grant permission to use the trade - names, trademarks, service marks, or product names of the Licensor, - except as required for reasonable and customary use in describing the - origin of the Work and reproducing the content of the NOTICE file. - - 7. Disclaimer of Warranty. Unless required by applicable law or - agreed to in writing, Licensor provides the Work (and each - Contributor provides its Contributions) on an "AS IS" BASIS, - WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or - implied, including, without limitation, any warranties or conditions - of TITLE, NON-INFRINGEMENT, MERCHANTABILITY, or FITNESS FOR A - PARTICULAR PURPOSE. You are solely responsible for determining the - appropriateness of using or redistributing the Work and assume any - risks associated with Your exercise of permissions under this License. - - 8. Limitation of Liability. In no event and under no legal theory, - whether in tort (including negligence), contract, or otherwise, - unless required by applicable law (such as deliberate and grossly - negligent acts) or agreed to in writing, shall any Contributor be - liable to You for damages, including any direct, indirect, special, - incidental, or consequential damages of any character arising as a - result of this License or out of the use or inability to use the - Work (including but not limited to damages for loss of goodwill, - work stoppage, computer failure or malfunction, or any and all - other commercial damages or losses), even if such Contributor - has been advised of the possibility of such damages. - - 9. Accepting Warranty or Additional Liability. While redistributing - the Work or Derivative Works thereof, You may choose to offer, - and charge a fee for, acceptance of support, warranty, indemnity, - or other liability obligations and/or rights consistent with this - License. However, in accepting such obligations, You may act only - on Your own behalf and on Your sole responsibility, not on behalf - of any other Contributor, and only if You agree to indemnify, - defend, and hold each Contributor harmless for any liability - incurred by, or claims asserted against, such Contributor by reason - of your accepting any such warranty or additional liability. - - END OF TERMS AND CONDITIONS - - APPENDIX: How to apply the Apache License to your work. - - To apply the Apache License to your work, attach the following - boilerplate notice, with the fields enclosed by brackets "[]" - replaced with your own identifying information. (Don't include - the brackets!) The text should be enclosed in the appropriate - comment syntax for the file format. 
We also recommend that a - file or class name and description of purpose be included on the - same "printed page" as the copyright notice for easier - identification within third-party archives. - - Copyright [yyyy] [name of copyright owner] - - Licensed under the Apache License, Version 2.0 (the "License"); - you may not use this file except in compliance with the License. - You may obtain a copy of the License at - - http://www.apache.org/licenses/LICENSE-2.0 - - Unless required by applicable law or agreed to in writing, software - distributed under the License is distributed on an "AS IS" BASIS, - WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. - See the License for the specific language governing permissions and - limitations under the License. diff --git a/README.md b/README.md index a48613f..7e4803f 100644 --- a/README.md +++ b/README.md @@ -1,190 +1,100 @@ - -
- - # Soprano: Instant, Ultra‑Realistic Text‑to‑Speech - [![Alt Text](https://img.shields.io/badge/HuggingFace-Model-orange?logo=huggingface)](https://huggingface.co/ekwek/Soprano-80M) - [![Alt Text](https://img.shields.io/badge/HuggingFace-Demo-yellow?logo=huggingface)](https://huggingface.co/spaces/ekwek/Soprano-TTS) -
+*** -https://github.com/user-attachments/assets/525cf529-e79e-4368-809f-6be620852826 +# Soprano TTS ---- +Soprano is an ultra-realistic Text-to-Speech system that provides REST API, WebSocket streaming capabilities, and a user-friendly Web UI. It is designed to be lightweight yet high-fidelity, offering OpenAI-compatible endpoints for seamless integration into existing workflows. -## Overview +> **Note:** Soprano uses **LMDeploy** to accelerate inference by default. If LMDeploy cannot be installed in your environment, Soprano can fall back to the HuggingFace **transformers** backend (with slower performance). To enable this, pass `backend='transformers'` when creating the TTS model -**Soprano** is an ultra‑lightweight, open‑source text‑to‑speech (TTS) model designed for real‑time, high‑fidelity speech synthesis at unprecedented speed, all while remaining compact and easy to deploy at **under 1 GB VRAM usage**. +## Features -With only **80M parameters**, Soprano achieves a real‑time factor (RTF) of **~2000×**, capable of generating **10 hours of audio in under 20 seconds**. Soprano uses a **seamless streaming** technique that enables true real‑time synthesis in **<15 ms**, multiple orders of magnitude faster than existing TTS pipelines. +- **High Quality Audio:** Generates ultra-realistic speech at 32 kHz using advanced TTS models. +- **Multiple Interfaces:** Includes REST API, WebSocket streaming, Web UI, and CLI. +- **OpenAI Compatible:** Follows OpenAI's speech endpoint format for drop-in replacement. +- **Real-time Streaming:** WebSocket support for real-time audio streaming with <15 ms latency. +- **Configurable Parameters:** Supports temperature, top_p, repetition_penalty, and min_text_length controls. +- **Interactive Launcher:** Easy-to-use batch script for managing services. ---- +## Installation and Setup -## Installation +### Prerequisites +Ensure you have Git and Python installed on your system. -**Requirements**: Linux or Windows, CUDA‑enabled GPU required (CPU support coming soon!). +### Steps +1. Clone the repository: + ```bash + git clone https://github.com/biswas445/soprano.git + ``` +2. Navigate to the project directory: + ```bash + cd soprano + ``` +3. Run the setup script and follow the prompts: + ```bat + setup.bat + ``` -### Install with wheel +## Quick Start -```bash -pip install soprano-tts -pip uninstall -y torch -pip install torch==2.8.0 --index-url https://download.pytorch.org/whl/cu126 -``` - -### Install from source - -```bash -git clone https://github.com/ekwek1/soprano.git -cd soprano -pip install -e . -pip uninstall -y torch -pip install torch==2.8.0 --index-url https://download.pytorch.org/whl/cu126 -``` - -> **Note**: Soprano uses **LMDeploy** to accelerate inference by default. If LMDeploy cannot be installed in your environment, Soprano can fall back to the HuggingFace **transformers** backend (with slower performance). To enable this, pass `backend='transformers'` when creating the TTS model. - ---- - -## Usage - -```python -from soprano import SopranoTTS - -model = SopranoTTS(backend='auto', device='cuda', cache_size_mb=100, decoder_batch_size=1) -``` - -> **Tip**: You can increase cache_size_mb and decoder_batch_size to increase inference speed at the cost of higher memory usage. - -### Basic inference +To start the application, run the `start.bat` file located in the root directory: -```python -out = model.infer("Soprano is an extremely lightweight text to speech model.") # can achieve 2000x real-time with sufficiently long input! 
+```bat +start.bat ``` -### Save output to a file +This will launch the interactive menu where you can choose the desired component: -```python -out = model.infer("Soprano is an extremely lightweight text to speech model.", "out.wav") -``` - -### Custom sampling parameters - -```python -out = model.infer( - "Soprano is an extremely lightweight text to speech model.", - temperature=0.3, - top_p=0.95, - repetition_penalty=1.2, -) -``` - -### Batched inference - -```python -out = model.infer_batch(["Soprano is an extremely lightweight text to speech model."] * 10) # can achieve 2000x real-time with sufficiently large input size! -``` - -#### Save batch outputs to a directory - -```python -out = model.infer_batch(["Soprano is an extremely lightweight text to speech model."] * 10, "/dir") -``` - -### Streaming inference +1. **API Server:** Starts the RESTful API server. +2. **Test API:** Launches the API server and automatically runs the API test client to verify functionality. +3. **Real-time Assistant:** Launches a voice-to-voice AI assistant demo featuring real-time audio streaming. +4. **WebSocket Test:** Launches the WebSocket server and the corresponding test client. +5. *(Reserved)* +6. **Web UI:** Starts the browser-based interface for standard users. +7. **CLI:** Starts the interactive Command Line Interface for testing purposes. -```python -import torch -stream = model.infer_stream("Soprano is an extremely lightweight text to speech model.", chunk_size=1) +## Technical Architecture -# Audio chunks can be accessed via an iterator -chunks = [] -for chunk in stream: - chunks.append(chunk) # first chunk arrives in <15 ms! +### 1. High-fidelity 32 kHz Audio +Soprano synthesizes speech at **32 kHz**, delivering quality that is perceptually indistinguishable from 44.1/48 kHz audio and significantly sharper than the 24 kHz output used by many existing TTS models. -out = torch.cat(chunks) -``` - -### Serve endpoint - -``` -uvicorn soprano.server:app --host 0.0.0.0 --port 8000 -``` - -Compatible with OpenAI speech API. Use the endpoint like this: - -```bash -curl http://localhost:8000/v1/audio/speech \ - -H "Content-Type: application/json" \ - -d '{ - "input": "The quick brown fox jumped over the lazy dog." - }' \ - --output speech.wav -``` - -## Usage tips: - -* Soprano works best when each sentence is between 2 and 15 seconds long. -* Although Soprano recognizes numbers and some special characters, it occasionally mispronounces them. Best results can be achieved by converting these into their phonetic form. (1+1 -> one plus one, etc) -* If Soprano produces unsatisfactory results, you can easily regenerate it for a new, potentially better generation. You may also change the sampling settings for more varied results. -* Avoid improper grammar such as not using contractions, multiple spaces, etc. - ---- - -## Key Features - -### 1. High‑fidelity 32 kHz audio - -Soprano synthesizes speech at **32 kHz**, delivering quality that is perceptually indistinguishable from 44.1/48 kHz audio and significantly sharper and clearer than the 24 kHz output used by many existing TTS models. - -### 2. Vocoder‑based neural decoder - -Instead of slow diffusion decoders, Soprano uses a **vocoder‑based decoder** with a Vocos architecture, enabling **orders‑of‑magnitude faster** waveform generation while maintaining comparable perceptual quality. +### 2. Vocoder-based Neural Decoder +Instead of slow diffusion decoders, Soprano uses a **vocoder-based decoder** with a Vocos architecture. 
This enables **orders-of-magnitude faster** waveform generation while maintaining comparable perceptual quality. ### 3. Seamless Streaming +Soprano leverages the decoder's finite receptive field to losslessly stream audio with ultra-low latency. The streamed output is acoustically identical to offline synthesis, and streaming can begin after generating just 5 audio tokens, enabling **<15 ms latency**. -Soprano leverages the decoder’s finite receptive field to losslessly stream audio with ultra‑low latency. The streamed output is acoustically identical to offline synthesis, and streaming can begin after generating just 5 audio tokens, enabling **<15 ms latency**. - -### 4. State‑of‑the‑art neural audio codec - +### 4. State-of-the-art Neural Audio Codec Speech is represented using a **neural codec** that compresses audio to **~15 tokens/sec** at just **0.2 kbps**, allowing extremely fast generation and efficient memory usage without sacrificing quality. -### 5. Sentence‑level streaming for infinite context +### 5. Sentence-level Streaming +Each sentence is generated independently, enabling **effectively infinite generation length** while maintaining stability and real-time performance for long-form generation. -Each sentence is generated independently, enabling **effectively infinite generation length** while maintaining stability and real‑time performance for long‑form generation. +## Project Status ---- +The core infrastructure, including the OpenAI-compatible API and various interfaces, is complete. -## Limitations +**Current Focus Areas:** +1. **Backend Strengthening:** Improving the robustness of the inference engine. +2. **Text Normalization:** Enhancing the handling of numbers, abbreviations, and special characters to improve pronunciation accuracy. -I’m a second-year undergrad who’s just started working on TTS models, so I wanted to start small. Soprano was only pretrained on 1000 hours of audio (~100x less than other TTS models), so its stability and quality will improve tremendously as I train it on more data. Also, I optimized Soprano purely for speed, which is why it lacks bells and whistles like voice cloning, style control, and multilingual support. Now that I have experience creating TTS models, I have a lot of ideas for how to make Soprano even better in the future, so stay tuned for those! - ---- - -## Roadmap - -* [x] Add model and inference code -* [x] Seamless streaming -* [x] Batched inference -* [x] Command-line interface (CLI) -* [x] CPU support -* [x] Server / API inference -* [ ] Additional LLM backends -* [ ] Voice cloning -* [ ] Multilingual support +## Limitations ---- +Soprano was optimized purely for speed and was pretrained on approximately 1000 hours of audio. Consequently: +* Numbers and special characters may occasionally be mispronounced (phonetic conversion is recommended). +* Voice cloning and style controls are currently not implemented. +* Stability and quality are expected to improve with future training on larger datasets. ## Acknowledgements Soprano uses and/or is inspired by the following projects: -* [Vocos](https://github.com/gemelo-ai/vocos) -* [XTTS](https://github.com/coqui-ai/TTS) -* [LMDeploy](https://github.com/InternLM/lmdeploy) - ---- +* [Vocos](https://github.com/gemelo-ai/vocos) +* [XTTS](https://github.com/coqui-ai/TTS) +* [LMDeploy](https://github.com/InternLM/lmdeploy) ## License -This project is licensed under the **Apache-2.0** license. See `LICENSE` for details. 
+Licensed under the Apache License 2.0 - see the [LICENSE](LICENSE) file for details. diff --git a/pyproject.toml b/pyproject.toml deleted file mode 100644 index ec49194..0000000 --- a/pyproject.toml +++ /dev/null @@ -1,38 +0,0 @@ -[build-system] -requires = ["setuptools>=61.0"] -build-backend = "setuptools.build_meta" - -[project] -name = "soprano-tts" -version = "0.0.2" -authors = [ - { name="ekwek1", email="eugene.kwek.1@gmail.com" }, -] -description = "Soprano: Instant, Ultra‑Realistic Text‑to‑Speech" -readme = "README.md" -requires-python = ">=3.10" -classifiers = [ - "Programming Language :: Python :: 3", - "Operating System :: OS Independent", -] -dependencies = [ - "fastapi", - "gradio", - "huggingface_hub", - "lmdeploy", - "numpy", - "scipy", - "torch", - "unidecode", - "uvicorn", - "inflect" -] -license = {file = "LICENSE"} - -[project.urls] -Homepage = "https://github.com/ekwek1/soprano" -Issues = "https://github.com/ekwek1/soprano/issues" - -[project.scripts] -soprano = "soprano.soprano_cli:main" -soprano-webui = "soprano.webui:main" diff --git a/setup.bat b/setup.bat new file mode 100644 index 0000000..6dd4c07 --- /dev/null +++ b/setup.bat @@ -0,0 +1,81 @@ +@echo off +title Soprano TTS Setup +color 0A + +echo ================================================ +echo SOPRANO TTS SETUP +echo ================================================ +echo. +echo This script will: +echo 1. Install the Soprano TTS package +echo 2. Install/fix PyTorch with CUDA support +echo 3. Verify the installation +echo. +echo Press any key to continue or Ctrl+C to cancel... +pause >nul + +echo. +echo Installing required dependencies... +echo. + +REM Install all required packages +pip install fastapi huggingface_hub lmdeploy numpy scipy unidecode inflect sounddevice uvicorn gradio pyaudio + +if %errorlevel% neq 0 ( + echo Error occurred during dependency installation. Attempting to continue... +) + +echo. +echo Installing Soprano TTS package... +echo. + +REM Install the package in editable mode +pip install -e . + +if %errorlevel% neq 0 ( + echo Error occurred during installation. Attempting to fix... + goto fix_pytorch +) + +echo. +echo Installing PyTorch with CUDA support... +echo. + +:fix_pytorch +REM Uninstall current PyTorch +pip uninstall -y torch torchvision torchaudio + +REM Install PyTorch with CUDA 12.8 support +pip install torch==2.8.0 torchvision==0.23.0 torchaudio==2.8.0 --index-url https://download.pytorch.org/whl/cu128 + +if %errorlevel% neq 0 ( + echo Warning: PyTorch CUDA 12.8 installation failed. Installing CUDA 12.6 version... + pip install torch==2.8.0 torchvision==0.23.0 torchaudio==2.8.0 --index-url https://download.pytorch.org/whl/cu126 +) + +if %errorlevel% neq 0 ( + echo Warning: PyTorch CUDA installation failed. Installing CPU version... + pip install torch==2.8.0 torchvision==0.23.0 torchaudio==2.8.0 --index-url https://download.pytorch.org/whl/cpu +) + +echo. +echo Verifying installation... +echo. + +REM Test the installation +python -c "import soprano; from soprano import SopranoTTS; print('Soprano TTS imported successfully'); print('Installation verified successfully!')" + +if %errorlevel% neq 0 ( + echo Warning: Verification failed, but installation may still be OK. +) + +echo. +echo ================================================ +echo Setup completed! +echo. 
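+REM Optional hint: the PyTorch step above may have fallen back to the CPU-only
+REM build, so show the user how to check which build ended up active (assumes
+REM python and torch are importable, as the verification step already requires).
+echo Tip: confirm whether CUDA-enabled PyTorch was installed by running:
+echo    python -c "import torch; print(torch.cuda.is_available())"
+echo.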
+echo To use Soprano TTS: +echo - Run start_soprano.bat to access the main menu +echo - Or run individual components as needed +echo ================================================ + +pause \ No newline at end of file diff --git a/soprano/__init__.py b/soprano/__init__.py deleted file mode 100644 index feadb53..0000000 --- a/soprano/__init__.py +++ /dev/null @@ -1 +0,0 @@ -from .tts import SopranoTTS \ No newline at end of file diff --git a/soprano/backends/base.py b/soprano/backends/base.py deleted file mode 100644 index a58274d..0000000 --- a/soprano/backends/base.py +++ /dev/null @@ -1,20 +0,0 @@ -class BaseModel: - def infer(self, - prompts, - top_p=0.95, - temperature=0.3, - repetition_penalty=1.2): - ''' - Takes a list of prompts and returns the output hidden states - ''' - pass - - def stream_infer(self, - prompt, - top_p=0.95, - temperature=0.3, - repetition_penalty=1.2): - ''' - Takes a prompt and returns an iterator of the output hidden states - ''' - pass diff --git a/soprano/backends/lmdeploy.py b/soprano/backends/lmdeploy.py deleted file mode 100644 index 1d7f45c..0000000 --- a/soprano/backends/lmdeploy.py +++ /dev/null @@ -1,59 +0,0 @@ -import torch -from lmdeploy import pipeline, TurbomindEngineConfig, GenerationConfig -from .base import BaseModel - - -class LMDeployModel(BaseModel): - def __init__(self, - device='cuda', - cache_size_mb=100, - model_path=None, - **kwargs): - assert device == 'cuda', "lmdeploy only supports cuda devices, consider changing device or using a different backend instead." - cache_size_ratio = cache_size_mb * 1024**2 / torch.cuda.get_device_properties('cuda').total_memory - backend_config = TurbomindEngineConfig(cache_max_entry_count=cache_size_ratio) - - # Use local model if path provided, otherwise use HuggingFace - model_name_or_path = model_path if model_path else 'ekwek/Soprano-80M' - - self.pipeline = pipeline(model_name_or_path, - log_level='ERROR', - backend_config=backend_config) - - def infer(self, - prompts, - top_p=0.95, - temperature=0.3, - repetition_penalty=1.2): - gen_config=GenerationConfig(output_last_hidden_state='generation', - do_sample=True, - top_p=top_p, - temperature=temperature, - repetition_penalty=repetition_penalty, - max_new_tokens=512) - responses = self.pipeline(prompts, gen_config=gen_config) - res = [] - for response in responses: - res.append({ - 'finish_reason': response.finish_reason, - 'hidden_state': response.last_hidden_state - }) - return res - - def stream_infer(self, - prompt, - top_p=0.95, - temperature=0.3, - repetition_penalty=1.2): - gen_config=GenerationConfig(output_last_hidden_state='generation', - do_sample=True, - top_p=top_p, - temperature=temperature, - repetition_penalty=repetition_penalty, - max_new_tokens=512) - responses = self.pipeline.stream_infer([prompt], gen_config=gen_config) - for response in responses: - yield { - 'finish_reason': response.finish_reason, - 'hidden_state': response.last_hidden_state - } diff --git a/soprano/backends/transformers.py b/soprano/backends/transformers.py deleted file mode 100644 index b85c49e..0000000 --- a/soprano/backends/transformers.py +++ /dev/null @@ -1,72 +0,0 @@ -import torch -from transformers import AutoModelForCausalLM, AutoTokenizer -from .base import BaseModel - - -class TransformersModel(BaseModel): - def __init__(self, - device='cuda', - model_path=None, - **kwargs): - self.device = device - - # Use local model if path provided, otherwise use HuggingFace - model_name_or_path = model_path if model_path else 'ekwek/Soprano-80M' - - self.model 
= AutoModelForCausalLM.from_pretrained( - model_name_or_path, - dtype=torch.bfloat16 if device == 'cuda' else torch.float32, - device_map=device - ) - self.tokenizer = AutoTokenizer.from_pretrained(model_name_or_path) - self.model.eval() - - def infer(self, - prompts, - top_p=0.95, - temperature=0.3, - repetition_penalty=1.2): - inputs = self.tokenizer( - prompts, - return_tensors='pt', - padding=True, - truncation=True, - max_length=512, - ).to(self.device) - - with torch.no_grad(): - outputs = self.model.generate( - input_ids=inputs['input_ids'], - attention_mask=inputs['attention_mask'], - max_new_tokens=512, - do_sample=True, - top_p=top_p, - temperature=temperature, - repetition_penalty=repetition_penalty, - pad_token_id=self.tokenizer.pad_token_id, - return_dict_in_generate=True, - output_hidden_states=True, - ) - res = [] - eos_token_id = self.model.config.eos_token_id - for i in range(len(prompts)): - seq = outputs.sequences[i] - hidden_states = [] - num_output_tokens = len(outputs.hidden_states) - for j in range(num_output_tokens): - token = seq[j + seq.size(0) - num_output_tokens] - if token != eos_token_id: hidden_states.append(outputs.hidden_states[j][-1][i, -1, :]) - last_hidden_state = torch.stack(hidden_states).squeeze() - finish_reason = 'stop' if seq[-1].item() == eos_token_id else 'length' - res.append({ - 'finish_reason': finish_reason, - 'hidden_state': last_hidden_state - }) - return res - - def stream_infer(self, - prompt, - top_p=0.95, - temperature=0.3, - repetition_penalty=1.2): - raise NotImplementedError("transformers backend does not currently support streaming, please consider using lmdeploy backend instead.") diff --git a/soprano/server.py b/soprano/server.py deleted file mode 100644 index 937c89a..0000000 --- a/soprano/server.py +++ /dev/null @@ -1,47 +0,0 @@ -import base64 -import io -import json -from typing import Generator - -import numpy as np -from fastapi import FastAPI, HTTPException -from fastapi.responses import Response -from scipy.io.wavfile import write -from torch import Tensor - -from soprano.tts import SopranoTTS - -# Load model at startup -tts = SopranoTTS(cache_size_mb = 100) - -app = FastAPI(title="Soprano TTS API") - -def _tensor_to_wav_bytes(tensor: Tensor) -> bytes: - """ - Convert a 1D fp32 torch tensor to a WAV byte stream. - """ - # convert to int16 - audio_int16 = (np.clip(tensor.numpy(), -1.0, 1.0) * 32767).astype(np.int16) - - wav_io = io.BytesIO() - write(wav_io, 32000, audio_int16) # 32kHz sample rate - wav_io.seek(0) - return wav_io.read() - - -@app.post("/v1/audio/speech") -async def create_speech(payload: dict): - """ - Minimal implementation of OpenAI's Speech endpoint. - Fields: - - input: string - text to synthesize - - model, voice, etc. are accepted but ignored. - - response_format: str - ignored, only support wav. 
- """ - text = payload.get("input") - if not isinstance(text, str) or not text.strip(): - raise HTTPException(status_code=400, detail="`input` field must be a non-empty string.") - - audio_tensor = tts.infer(text) - wav_bytes = _tensor_to_wav_bytes(audio_tensor) - return Response(content=wav_bytes, media_type="audio/wav", headers={"Content-Disposition": 'attachment; filename="speech.wav"'}) diff --git a/soprano/soprano_cli.py b/soprano/soprano_cli.py deleted file mode 100644 index 208c87d..0000000 --- a/soprano/soprano_cli.py +++ /dev/null @@ -1,38 +0,0 @@ -#!/usr/bin/env python3 -""" -Soprano TTS Command Line Interface -""" -import argparse -import os -from soprano import SopranoTTS - -def main(): - parser = argparse.ArgumentParser(description='Soprano Text-to-Speech CLI') - parser.add_argument('text', help='Text to synthesize') - parser.add_argument('--output', '-o', default='output.wav', help='Output audio file path') - parser.add_argument('--model-path', '-m', help='Path to local model directory (optional)') - parser.add_argument('--device', '-d', default='cpu', choices=['cuda', 'cpu'], - help='Device to use for inference') - parser.add_argument('--backend', '-b', default='auto', - choices=['auto', 'transformers', 'lmdeploy'], - help='Backend to use for inference') - parser.add_argument('--cache-size', '-c', type=int, default=100, - help='Cache size in MB (for lmdeploy backend)') - - args = parser.parse_args() - - # Initialize TTS - tts = SopranoTTS( - backend=args.backend, - device=args.device, - cache_size_mb=args.cache_size, - model_path=args.model_path - ) - - # Generate speech - print(f"Generating speech for: '{args.text}'") - tts.infer(args.text, out_path=args.output) - print(f"Audio saved to: {args.output}") - -if __name__ == "__main__": - main() \ No newline at end of file diff --git a/soprano/tts.py b/soprano/tts.py deleted file mode 100644 index d03c457..0000000 --- a/soprano/tts.py +++ /dev/null @@ -1,195 +0,0 @@ -from .vocos.decoder import SopranoDecoder -from .utils.text import clean_text -import torch -import re -from unidecode import unidecode -from scipy.io import wavfile -from huggingface_hub import hf_hub_download -import os -import time - - -class SopranoTTS: - def __init__(self, - backend='auto', - device='cuda', - cache_size_mb=100, - decoder_batch_size=1, - model_path=None): - RECOGNIZED_DEVICES = ['cuda', 'cpu'] - RECOGNIZED_BACKENDS = ['auto', 'lmdeploy', 'transformers'] - assert device in RECOGNIZED_DEVICES, f"unrecognized device {device}, device must be in {RECOGNIZED_DEVICES}" - if backend == 'auto': - if device == 'cpu': - backend = 'transformers' - else: - try: - import lmdeploy - backend = 'lmdeploy' - except ImportError: - backend='transformers' - print(f"Using backend {backend}.") - assert backend in RECOGNIZED_BACKENDS, f"unrecognized backend {backend}, backend must be in {RECOGNIZED_BACKENDS}" - - if backend == 'lmdeploy': - from .backends.lmdeploy import LMDeployModel - self.pipeline = LMDeployModel(device=device, cache_size_mb=cache_size_mb, model_path=model_path) - elif backend == 'transformers': - from .backends.transformers import TransformersModel - self.pipeline = TransformersModel(device=device, model_path=model_path) - - self.device = device - self.decoder = SopranoDecoder() - if device == 'cuda': - self.decoder = self.decoder.cuda() - if model_path: - decoder_path = os.path.join(model_path, 'decoder.pth') - else: - decoder_path = hf_hub_download(repo_id='ekwek/Soprano-80M', filename='decoder.pth') - 
self.decoder.load_state_dict(torch.load(decoder_path)) - self.decoder_batch_size=decoder_batch_size - self.RECEPTIVE_FIELD = 4 # Decoder receptive field - self.TOKEN_SIZE = 2048 # Number of samples per audio token - - self.infer("Hello world!") # warmup - - def _preprocess_text(self, texts, min_length=30): - ''' - adds prompt format and sentence/part index - Enforces a minimum sentence length by merging short sentences. - ''' - res = [] - for text_idx, text in enumerate(texts): - text = text.strip() - cleaned_text = clean_text(text) - sentences = re.split(r"(?<=[.!?])\s+", cleaned_text) - processed = [] - for sentence in sentences: - processed.append({ - "text": sentence, - "text_idx": text_idx, - }) - - if min_length > 0 and len(processed) > 1: - merged = [] - i = 0 - while i < len(processed): - cur = processed[i] - if len(cur["text"]) < min_length: - if merged: merged[-1]["text"] = (merged[-1]["text"] + " " + cur["text"]).strip() - else: - if i + 1 < len(processed): processed[i + 1]["text"] = (cur["text"] + " " + processed[i + 1]["text"]).strip() - else: merged.append(cur) - else: merged.append(cur) - i += 1 - processed = merged - sentence_idxes = {} - for item in processed: - if item['text_idx'] not in sentence_idxes: sentence_idxes[item['text_idx']] = 0 - res.append((f'[STOP][TEXT]{item["text"]}[START]', item["text_idx"], sentence_idxes[item['text_idx']])) - sentence_idxes[item['text_idx']] += 1 - return res - - def infer(self, - text, - out_path=None, - top_p=0.95, - temperature=0.3, - repetition_penalty=1.2): - results = self.infer_batch([text], - top_p=top_p, - temperature=temperature, - repetition_penalty=repetition_penalty, - out_dir=None)[0] - if out_path: - wavfile.write(out_path, 32000, results.cpu().numpy()) - return results - - def infer_batch(self, - texts, - out_dir=None, - top_p=0.95, - temperature=0.3, - repetition_penalty=1.2): - sentence_data = self._preprocess_text(texts) - prompts = list(map(lambda x: x[0], sentence_data)) - responses = self.pipeline.infer(prompts, - top_p=top_p, - temperature=temperature, - repetition_penalty=repetition_penalty) - hidden_states = [] - for i, response in enumerate(responses): - if response['finish_reason'] != 'stop': - print(f"Warning: some sentences did not complete generation, likely due to hallucination.") - hidden_state = response['hidden_state'] - hidden_states.append(hidden_state) - combined = list(zip(hidden_states, sentence_data)) - combined.sort(key=lambda x: -x[0].size(0)) - hidden_states, sentence_data = zip(*combined) - - num_texts = len(texts) - audio_concat = [[] for _ in range(num_texts)] - for sentence in sentence_data: - audio_concat[sentence[1]].append(None) - for idx in range(0, len(hidden_states), self.decoder_batch_size): - batch_hidden_states = [] - lengths = list(map(lambda x: x.size(0), hidden_states[idx:idx+self.decoder_batch_size])) - N = len(lengths) - for i in range(N): - batch_hidden_states.append(torch.cat([ - torch.zeros((1, 512, lengths[0]-lengths[i]), device=self.device), - hidden_states[idx+i].unsqueeze(0).transpose(1,2).to(self.device).to(torch.float32), - ], dim=2)) - batch_hidden_states = torch.cat(batch_hidden_states) - with torch.no_grad(): - audio = self.decoder(batch_hidden_states) - - for i in range(N): - text_id = sentence_data[idx+i][1] - sentence_id = sentence_data[idx+i][2] - audio_concat[text_id][sentence_id] = audio[i].squeeze()[-(lengths[i]*self.TOKEN_SIZE-self.TOKEN_SIZE):] - audio_concat = [torch.cat(x).cpu() for x in audio_concat] - - if out_dir: - os.makedirs(out_dir, exist_ok=True) 
- for i in range(len(audio_concat)): - wavfile.write(f"{out_dir}/{i}.wav", 32000, audio_concat[i].cpu().numpy()) - return audio_concat - - def infer_stream(self, - text, - chunk_size=1, - top_p=0.95, - temperature=0.3, - repetition_penalty=1.2): - start_time = time.time() - sentence_data = self._preprocess_text([text]) - - first_chunk = True - for sentence, _, _ in sentence_data: - responses = self.pipeline.stream_infer(sentence, - top_p=top_p, - temperature=temperature, - repetition_penalty=repetition_penalty) - hidden_states_buffer = [] - chunk_counter = chunk_size - for token in responses: - finished = token['finish_reason'] is not None - if not finished: hidden_states_buffer.append(token['hidden_state'][-1]) - hidden_states_buffer = hidden_states_buffer[-(2*self.RECEPTIVE_FIELD+chunk_size):] - if finished or len(hidden_states_buffer) >= self.RECEPTIVE_FIELD + chunk_size: - if finished or chunk_counter == chunk_size: - batch_hidden_states = torch.stack(hidden_states_buffer) - inp = batch_hidden_states.unsqueeze(0).transpose(1, 2).to(self.device).to(torch.float32) - with torch.no_grad(): - audio = self.decoder(inp)[0] - if finished: - audio_chunk = audio[-((self.RECEPTIVE_FIELD+chunk_counter-1)*self.TOKEN_SIZE-self.TOKEN_SIZE):] - else: - audio_chunk = audio[-((self.RECEPTIVE_FIELD+chunk_size)*self.TOKEN_SIZE-self.TOKEN_SIZE):-(self.RECEPTIVE_FIELD*self.TOKEN_SIZE-self.TOKEN_SIZE)] - chunk_counter = 0 - if first_chunk: - print(f"Streaming latency: {1000*(time.time()-start_time):.2f} ms") - first_chunk = False - yield audio_chunk.cpu() - chunk_counter += 1 diff --git a/soprano/utils/text.py b/soprano/utils/text.py deleted file mode 100644 index 2295448..0000000 --- a/soprano/utils/text.py +++ /dev/null @@ -1,401 +0,0 @@ -""" -Normalize input text to a format that Soprano recognizes. -Adapted from https://github.com/neonbjb/tortoise-tts/blob/main/tortoise/utils/tokenizer.py -""" -import re - -import inflect -from unidecode import unidecode - - -_inflect = inflect.engine() - -#################################################################################################### -# Abbreviations - -_abbreviations = [(re.compile('\\b%s\\.' 
% x[0], re.IGNORECASE), x[1]) for x in [ - ('mrs', 'misess'), - ('ms', 'miss'), - ('mr', 'mister'), - ('dr', 'doctor'), - ('st', 'saint'), - ('co', 'company'), - ('jr', 'junior'), - ('maj', 'major'), - ('gen', 'general'), - ('drs', 'doctors'), - ('rev', 'reverend'), - ('lt', 'lieutenant'), - ('hon', 'honorable'), - ('sgt', 'sergeant'), - ('capt', 'captain'), - ('esq', 'esquire'), - ('ltd', 'limited'), - ('col', 'colonel'), - ('ft', 'fort'), -]] -_cased_abbreviations = [(re.compile('\\b%s\\b' % x[0]), x[1]) for x in [ - ('TTS', 'text to speech'), - ('Hz', 'hertz'), - ('kHz', 'kilohertz'), - ('KBs', 'kilobytes'), - ('KB', 'kilobyte'), - ('MBs', 'megabytes'), - ('MB', 'megabyte'), - ('GBs', 'gigabytes'), - ('GB', 'gigabyte'), - ('TBs', 'terabytes'), - ('TB', 'terabyte'), - ('APIs', 'a p i\'s'), - ('API', 'a p i'), - ('CLIs', 'c l i\'s'), - ('CLI', 'c l i'), - ('CPUs', 'c p u\'s'), - ('CPU', 'c p u'), - ('GPUs', 'g p u\'s'), - ('GPU', 'g p u'), - ('Ave', 'avenue'), - ('etc', 'et cetera'), - ('Mon', 'monday'), - ('Tues', 'tuesday'), - ('Wed', 'wednesday'), - ('Thurs', 'thursday'), - ('Fri', 'friday'), - ('Sat', 'saturday'), - ('Sun', 'sunday'), - ('and/or', 'and or'), -]] - -def expand_abbreviations(text): - for regex, replacement in _abbreviations + _cased_abbreviations: - text = re.sub(regex, replacement, text) - return text - -#################################################################################################### -# Numbers - -_num_prefix_re = re.compile(r'#\d') -_num_suffix_re = re.compile(r'\b\d+(K|M|B|T)\b', re.IGNORECASE) -_num_letter_split_re = re.compile(r'(\d[a-z]|[a-z]\d)', re.IGNORECASE) - -_comma_number_re = re.compile(r'(\d[\d\,]+\d)') -_date_re = re.compile(r'(^|[^/])(\d\d?[/-]\d\d?[/-]\d\d(?:\d\d)?)($|[^/])') -_phone_number_re = re.compile(r'(\(?\d{3}\)?[-.\s]\d{3}[-.\s]?\d{4})') -_time_re = re.compile(r'(\d\d?:\d\d(?::\d\d)?)') -_pounds_re = re.compile(r'£([\d\,]*\d+)') -_dollars_re = re.compile(r'\$([\d\.\,]*\d+)') -_decimal_number_re = re.compile(r'(\d+(?:\.\d+)+)') -_multiply_re = re.compile(r'(\d\s?\*\s?\d)') -_divide_re = re.compile(r'(\d\s?/\s?\d)') -_add_re = re.compile(r'(\d\s?\+\s?\d)') -_subtract_re = re.compile(r'(\d?\s?-\s?\d)') # also does negative numbers -_fraction_re = re.compile(r'(\d+(?:/\d+)+)') -_ordinal_re = re.compile(r'\d+(st|nd|rd|th)') -_number_re = re.compile(r'\d+') - -def _expand_num_prefix(m): - match = m.group(0) - return f"number {match[1]}" - -def _expand_num_suffix(m): - match = m.group(0) - if match[1].upper() == 'K': return f"{match[0]} thousand" - elif match[1].upper() == 'M': return f"{match[0]} million" - elif match[1].upper() == 'B': return f"{match[0]} billion" - elif match[1].upper() == 'T': return f"{match[0]} trillion" - return match # unexpected format - -def _split_alphanumeric(m): - match = m.group(1) - return f"{match[0]} {match[1]}" - -def _remove_commas(m): - return m.group(1).replace(',', '') - -def _expand_date(m): - match = m.group(2) - match = re.split('[./-]', match) - return m.group(1) + ' dash '.join(match) + m.group(3) - -def _expand_phone_number(m): - match = m.group(1) - match = re.sub(r'\D', '', match) - assert len(match) == 10 - match = f"{' '.join(list(match[:3]))}, {' '.join(list(match[3:6]))}, {' '.join(list(match[6:]))}" - return match - -def _expand_time(m): - match = m.group(1) - match = match.split(':') - if len(match) == 2: - hours, minutes = match - if minutes == '00': - if int(hours) == 0: - return '0' - elif int(hours) > 12: return f"{hours} minutes" - return f"{hours} o'clock" - elif 
minutes.startswith('0'): - minutes = f'oh {minutes[1:]}' - return f"{hours} {minutes}" - else: - hours, minutes, seconds = match - if int(hours) != 0: - return f"{hours} {'oh oh' if minutes == '00' else f'oh {minutes}' if minutes.startswith('0') else {minutes}} {'' if seconds == '00' else f'oh {seconds}' if seconds.startswith('0') else seconds}" - elif minutes != '00': - return f"{minutes} {'oh oh' if seconds == '00' else f'oh {seconds}' if seconds.startswith('0') else seconds}" - else: - return seconds - -def _expand_dollars(m): - match = m.group(1) - parts = match.split('.') - if len(parts) > 2: - return match + ' dollars' # Unexpected format - dollars = int(parts[0]) if parts[0] else 0 - cents = int(parts[1]) if len(parts) > 1 and parts[1] else 0 - if dollars and cents: - dollar_unit = 'dollar' if dollars == 1 else 'dollars' - cent_unit = 'cent' if cents == 1 else 'cents' - return '%s %s, %s %s' % (dollars, dollar_unit, cents, cent_unit) - elif dollars: - dollar_unit = 'dollar' if dollars == 1 else 'dollars' - return '%s %s' % (dollars, dollar_unit) - elif cents: - cent_unit = 'cent' if cents == 1 else 'cents' - return '%s %s' % (cents, cent_unit) - else: - return 'zero dollars' - -def _expand_decimal_point(m): - match = m.group(1) - match = match.split('.') - return match[0] + ' point ' + ' point '.join(' '.join(list(match[i])) for i in range(1, len(match))) - -def _expand_fraction(m): - match = m.group(1) - match = match.split('/') - return ' over '.join(match) if len(match)==2 else ' slash '.join(match) - -def _expand_multiply(m): - return ' times '.join(m.group(1).split('*')) - -def _expand_divide(m): - return ' over '.join(m.group(1).split('/')) - -def _expand_add(m): - return ' plus '.join(m.group(1).split('+')) - -def _expand_subtract(m): - return ' minus '.join(m.group(1).split('-')) - -def _expand_ordinal(m): - return _inflect.number_to_words(m.group(0), andword='') - -def _expand_number(m): - num = int(m.group(0)) - if num > 1000 and num < 3000: - if num == 2000: - return 'two thousand' - elif num > 2000 and num < 2010: - return 'two thousand ' + _inflect.number_to_words(num % 100) - elif num % 100 == 0: - return _inflect.number_to_words(num // 100) + ' hundred' - else: - return _inflect.number_to_words(num, andword='', zero='oh', group=2).replace(', ', ' ') - else: - return _inflect.number_to_words(num, andword='') - -def normalize_numbers(text): - text = re.sub(_num_prefix_re, _expand_num_prefix, text) - text = re.sub(_num_suffix_re, _expand_num_suffix, text) - text = re.sub(_comma_number_re, _remove_commas, text) - text = re.sub(_date_re, _expand_date, text) - text = re.sub(_phone_number_re, _expand_phone_number, text) - text = re.sub(_time_re, _expand_time, text) - text = re.sub(_pounds_re, r'\1 pounds', text) - text = re.sub(_dollars_re, _expand_dollars, text) - text = re.sub(_decimal_number_re, _expand_decimal_point, text) - text = re.sub(_multiply_re, _expand_multiply, text) - text = re.sub(_divide_re, _expand_divide, text) - text = re.sub(_add_re, _expand_add, text) - text = re.sub(_subtract_re, _expand_subtract, text) - - text = re.sub(_fraction_re, _expand_fraction, text) - text = re.sub(_ordinal_re, _expand_ordinal, text) - for _ in range(2): # need to do this twice to find all matches - text = re.sub(_num_letter_split_re, _split_alphanumeric, text) - text = re.sub(_number_re, _expand_number, text) - return text - -#################################################################################################### -# Special characters & other patterns - 
-_special_characters = [(re.compile(x[0]), x[1]) for x in [ - ('@', ' at '), - ('&', ' and '), - ('%', ' percent '), - (':', '.'), - (';', ','), - (r'\+', ' plus '), - (r'\\', ' backslash '), - ('~', ' about '), - ('(^| )<3', ' heart '), - ('<=', ' less than or equal to '), - ('>=', ' greater than or equal to '), - ('<', ' less than '), - ('>', ' greater than '), - ('=', ' equals '), - ('/', ' slash '), - ('_', ' '), - (r'\*', ' '), -]] -_link_header_re = re.compile(r'(https?://)') -_dash_re = re.compile(r'(. - .)') -_dot_re = re.compile(r'([A-Z]\.[A-Z])', re.IGNORECASE) -_parentheses_re = re.compile(r'[\(\[\{].*[\)\]\}](.|$)') - -def expand_special_characters(text): - for regex, replacement in _special_characters: - text = re.sub(regex, replacement, text) - return text - -def _expand_link_header(m): - return 'h t t p s colon slash slash ' - -def _expand_dash(m): - match = m.group(0) - return f"{match[0]}, {match[4]}" - -def _expand_dot(m): - match = m.group(0) - return f"{match[0]} dot {match[2]}" - -def _expand_parantheses(m): - match = m.group(0) - match = re.sub(r'[\(\[\{]', ', ', match) - match = re.sub(r'[\)\]\}][^$.!?,]', ', ', match) - match = re.sub(r'[\)\]\}]', '', match) - return match - -def normalize_special(text): - text = re.sub(_link_header_re, _expand_link_header, text) - text = re.sub(_dash_re, _expand_dash, text) - text = re.sub(_dot_re, _expand_dot, text) - text = re.sub(_parentheses_re, _expand_parantheses, text) - return text - -#################################################################################################### -# Misc - -def lowercase(text): - return text.lower() - -def convert_to_ascii(text): - return unidecode(text) - -def normalize_newlines(text): - text = text.split('\n') - for i in range(len(text)): - text[i] = text[i].strip() - if not text[i]: continue - if text[i][-1] not in '.!?': - text[i] = f"{text[i]}." 
- return ' '.join(text) - -def remove_unknown_characters(text): - text = re.sub(r"[^A-Za-z !\$%&'\*\+,-./0123456789<>\?_]", "", text) - text = re.sub(r"[<>/_+]", "", text) - return text - -def collapse_whitespace(text): - text = re.sub(r'\s+', ' ', text) - text = re.sub(r' [.\?!,]', lambda m: m.group(0)[1], text) - return text.strip() - -def dedup_punctuation(text): - text = re.sub(r"\.\.\.+", "[ELLIPSIS]", text) - text = re.sub(r",+", ",", text) - text = re.sub(r"[\.,]*\.[\.,]*", ".", text) - text = re.sub(r"[\.,!]*![\.,!]*", "!", text) - text = re.sub(r"[\.,!\?]*\?[\.,!\?]*", "?", text) - text = re.sub(r"\[ELLIPSIS\]", "...", text) - return text - -def clean_text(text): - text = convert_to_ascii(text) - text = normalize_newlines(text) - text = normalize_numbers(text) - text = normalize_special(text) - text = expand_abbreviations(text) - text = expand_special_characters(text) - text = lowercase(text) - text = remove_unknown_characters(text) - text = collapse_whitespace(text) - text = dedup_punctuation(text) - return text - - -if __name__ == '__main__': - print(clean_text('1,2,3,456,176')) - print(clean_text('123,456,789')) - print(clean_text('123,456,789th')) - print(clean_text('123-456-7890')) - print(clean_text('111-111-1111')) - print(clean_text('(111) 111-1111')) - print(clean_text('A(111) 111-1111')) - print(clean_text('A (111) 111-1111')) - print(clean_text('$2.47')) - print(clean_text('$247')) - print(clean_text('$0.27')) - print(clean_text('$1.00')) - print(clean_text('£20')) - for i in range(1990, 2030): - print(clean_text(str(i))) - print(clean_text('2656')) - print(clean_text('1024')) - print(clean_text('2.47023')) - print(clean_text('20.47023')) - print(clean_text('1.17.1.1')) - print(clean_text('111.111.1111')) - print(clean_text('1/1/2025')) - print(clean_text('1-1-2025')) - print(clean_text('1-1-25')) - print(clean_text('A 1/1/11 A')) - print(clean_text('A 1/1 A')) - print(clean_text('1/1')) - print(clean_text('1/10')) - print(clean_text('1/1/10')) - print(clean_text('11/1/1/10')) - - print(clean_text('0:00')) - print(clean_text('12:00')) - print(clean_text('13:00')) - print(clean_text('8:00')) - print(clean_text('8:05')) - print(clean_text('8:15')) - print(clean_text('0:00:00')) - print(clean_text('00:01:10')) - print(clean_text('00:10:01')) - print(clean_text('01:01:01')) - print(clean_text('00:01:00')) - print(clean_text('01:00:00')) - - print(clean_text('-1 + 2 * 3 - 4 / 5')) - print(clean_text('-1+2*3-5/4/25')) - - print(clean_text('100x1')) - print(clean_text('100k')) - print(clean_text('100m')) - print(clean_text('100b')) - print(clean_text('100t')) - - print(clean_text('#1')) - - print(clean_text('12:00')) - print(clean_text('11:59')) - print(clean_text('01:00')) - print(clean_text('0100')) - - print(clean_text('1st 2nd 3rd 4th')) - print(clean_text('1K 1M 1B 1T 1K1M1B1T')) - print(clean_text('and/or')) diff --git a/soprano/vocos/decoder.py b/soprano/vocos/decoder.py deleted file mode 100644 index 75d506a..0000000 --- a/soprano/vocos/decoder.py +++ /dev/null @@ -1,45 +0,0 @@ -import torch -from torch import nn - -from .models import VocosBackbone -from .heads import ISTFTHead - - -class SopranoDecoder(nn.Module): - def __init__(self, - num_input_channels=512, - decoder_num_layers=8, - decoder_dim=512, - decoder_intermediate_dim=None, - hop_length=512, - n_fft=2048, - upscale=4, - dw_kernel=3, - ): - super().__init__() - self.decoder_initial_channels = num_input_channels - self.num_layers = decoder_num_layers - self.dim = decoder_dim - self.intermediate_dim = 
decoder_intermediate_dim if decoder_intermediate_dim else decoder_dim*3 - self.hop_length = hop_length - self.n_fft = n_fft - self.upscale = upscale - self.dw_kernel = dw_kernel - - self.decoder = VocosBackbone(input_channels=self.decoder_initial_channels, - dim=self.dim, - intermediate_dim=self.intermediate_dim, - num_layers=self.num_layers, - input_kernel_size=dw_kernel, - dw_kernel_size=dw_kernel, - ) - self.head = ISTFTHead(dim=self.dim, - n_fft=self.n_fft, - hop_length=self.hop_length) - - def forward(self, x): - T = x.size(2) - x = torch.nn.functional.interpolate(x, size=self.upscale*(T-1)+1, mode='linear', align_corners=True) - x = self.decoder(x) - reconstructed = self.head(x) - return reconstructed diff --git a/soprano/vocos/heads.py b/soprano/vocos/heads.py deleted file mode 100644 index 5b9e15c..0000000 --- a/soprano/vocos/heads.py +++ /dev/null @@ -1,50 +0,0 @@ -import torch -from torch import nn -from .spectral_ops import ISTFT - - -class ISTFTHead(nn.Module): - """ - ISTFT Head module for predicting STFT complex coefficients. - - Args: - dim (int): Hidden dimension of the model. - n_fft (int): Size of Fourier transform. - hop_length (int): The distance between neighboring sliding window frames, which should align with - the resolution of the input features. - padding (str, optional): Type of padding. Options are "center" or "same". Defaults to "same". - """ - - def __init__(self, dim: int, n_fft: int, hop_length: int, padding: str = "center"): - super().__init__() - out_dim = n_fft + 2 - self.out = torch.nn.Linear(dim, out_dim) - self.istft = ISTFT(n_fft=n_fft, hop_length=hop_length, win_length=n_fft, padding=padding) - - @torch.compiler.disable - def forward(self, x: torch.Tensor) -> torch.Tensor: - """ - Forward pass of the ISTFTHead module. - - Args: - x (Tensor): Input tensor of shape (B, L, H), where B is the batch size, - L is the sequence length, and H denotes the model dimension. - - Returns: - Tensor: Reconstructed time-domain audio signal of shape (B, T), where T is the length of the output signal. - """ - x = self.out(x.transpose(1,2)).transpose(1, 2) - mag, p = x.chunk(2, dim=1) - mag = torch.exp(mag) - mag = torch.clip(mag, max=1e2) # safeguard to prevent excessively large magnitudes - # wrapping happens here. These two lines produce real and imaginary value - x = torch.cos(p) - y = torch.sin(p) - # recalculating phase here does not produce anything new - # only costs time - # phase = torch.atan2(y, x) - # S = mag * torch.exp(phase * 1j) - # better directly produce the complex value - S = mag * (x + 1j * y) - audio = self.istft(S) - return audio diff --git a/soprano/vocos/models.py b/soprano/vocos/models.py deleted file mode 100644 index 458d815..0000000 --- a/soprano/vocos/models.py +++ /dev/null @@ -1,61 +0,0 @@ -from typing import Optional - -import torch -from torch import nn - -from .modules import ConvNeXtBlock - -class VocosBackbone(nn.Module): - """ - Vocos backbone module built with ConvNeXt blocks. Supports additional conditioning with Adaptive Layer Normalization - - Args: - input_channels (int): Number of input features channels. - dim (int): Hidden dimension of the model. - intermediate_dim (int): Intermediate dimension used in ConvNeXtBlock. - num_layers (int): Number of ConvNeXtBlock layers. - layer_scale_init_value (float, optional): Initial value for layer scaling. Defaults to `1 / num_layers`. 
- """ - - def __init__( - self, - input_channels: int, - dim: int, - intermediate_dim: int, - num_layers: int, - input_kernel_size: int = 9, - dw_kernel_size: int = 9, - layer_scale_init_value: Optional[float] = None, - pad: str = 'zeros', - ): - super().__init__() - self.embed = nn.Conv1d(input_channels, dim, kernel_size=input_kernel_size, padding=input_kernel_size//2, padding_mode=pad) - self.norm = nn.LayerNorm(dim, eps=1e-6) - self.convnext = nn.ModuleList( - [ - ConvNeXtBlock( - dim=dim, - intermediate_dim=intermediate_dim, - dw_kernel_size=dw_kernel_size, - layer_scale_init_value=layer_scale_init_value or 1 / num_layers**0.5, - ) - for _ in range(num_layers) - ] - ) - self.final_layer_norm = nn.LayerNorm(dim, eps=1e-6) - self.apply(self._init_weights) - - def _init_weights(self, m): - if isinstance(m, (nn.Conv1d, nn.Linear)): - nn.init.trunc_normal_(m.weight, std=0.02) - if m.bias is not None: nn.init.constant_(m.bias, 0) - - def forward(self, x: torch.Tensor) -> torch.Tensor: - x = self.embed(x) # (B, C, L) - x = self.norm(x.transpose(1, 2)) - x = x.transpose(1, 2) - for conv_block in self.convnext: - x = conv_block(x) - x = self.final_layer_norm(x.transpose(1, 2)) - x = x.transpose(1, 2) - return x diff --git a/soprano/vocos/modules.py b/soprano/vocos/modules.py deleted file mode 100644 index f969d4f..0000000 --- a/soprano/vocos/modules.py +++ /dev/null @@ -1,47 +0,0 @@ -import torch -from torch import nn - - -class ConvNeXtBlock(nn.Module): - """ConvNeXt Block adapted from https://github.com/facebookresearch/ConvNeXt to 1D audio signal. - - Args: - dim (int): Number of input channels. - intermediate_dim (int): Dimensionality of the intermediate layer. - layer_scale_init_value (float, optional): Initial value for the layer scale. None means no scaling. - Defaults to None. - """ - - def __init__( - self, - dim: int, - intermediate_dim: int, - layer_scale_init_value: float, - dw_kernel_size: int = 9, - ): - super().__init__() - self.dwconv = nn.Conv1d(dim, dim, kernel_size=dw_kernel_size, padding=dw_kernel_size//2, groups=dim) # depthwise conv - self.norm = nn.LayerNorm(dim, eps=1e-6) - self.pwconv1 = nn.Linear(dim, intermediate_dim) # pointwise/1x1 convs, implemented with linear layers - self.act = nn.GELU() - self.pwconv2 = nn.Linear(intermediate_dim, dim) - self.gamma = ( - nn.Parameter(layer_scale_init_value * torch.ones(dim), requires_grad=True) - if layer_scale_init_value > 0 - else None - ) - - def forward(self, x: torch.Tensor) -> torch.Tensor: - residual = x - x = self.dwconv(x) - x = x.transpose(1, 2) # (B, C, T) -> (B, T, C) - x = self.norm(x) - x = self.pwconv1(x) - x = self.act(x) - x = self.pwconv2(x) - if self.gamma is not None: - x = self.gamma * x - x = x.transpose(1, 2) # (B, T, C) -> (B, C, T) - - x = residual + x - return x diff --git a/soprano/vocos/spectral_ops.py b/soprano/vocos/spectral_ops.py deleted file mode 100644 index 8a38cb8..0000000 --- a/soprano/vocos/spectral_ops.py +++ /dev/null @@ -1,74 +0,0 @@ -import torch -from torch import nn - -class ISTFT(nn.Module): - """ - Custom implementation of ISTFT since torch.istft doesn't allow custom padding (other than `center=True`) with - windowing. This is because the NOLA (Nonzero Overlap Add) check fails at the edges. - See issue: https://github.com/pytorch/pytorch/issues/62323 - Specifically, in the context of neural vocoding we are interested in "same" padding analogous to CNNs. - The NOLA constraint is met as we trim padded samples anyway. - - Args: - n_fft (int): Size of Fourier transform. 
- hop_length (int): The distance between neighboring sliding window frames. - win_length (int): The size of window frame and STFT filter. - padding (str, optional): Type of padding. Options are "center" or "same". Defaults to "same". - """ - - def __init__(self, n_fft: int, hop_length: int, win_length: int, padding: str = "same"): - super().__init__() - if padding not in ["center", "same"]: - raise ValueError("Padding must be 'center' or 'same'.") - self.padding = padding - self.n_fft = n_fft - self.hop_length = hop_length - self.win_length = win_length - window = torch.hann_window(win_length) - self.register_buffer("window", window) - - def forward(self, spec: torch.Tensor) -> torch.Tensor: - """ - Compute the Inverse Short Time Fourier Transform (ISTFT) of a complex spectrogram. - - Args: - spec (Tensor): Input complex spectrogram of shape (B, N, T), where B is the batch size, - N is the number of frequency bins, and T is the number of time frames. - - Returns: - Tensor: Reconstructed time-domain signal of shape (B, L), where L is the length of the output signal. - """ - if self.padding == "center": - spec[:,0] = 0 # fixes some strange bug where first/last freqs don't matter when bs<16 which causes exploding gradients - spec[:,-1] = 0 - # Fallback to pytorch native implementation - return torch.istft(spec, self.n_fft, self.hop_length, self.win_length, self.window, center=True) - elif self.padding == "same": - pad = (self.win_length - self.hop_length) // 2 - else: - raise ValueError("Padding must be 'center' or 'same'.") - - assert spec.dim() == 3, "Expected a 3D tensor as input" - B, N, T = spec.shape - - # Inverse FFT - ifft = torch.fft.irfft(spec, self.n_fft, dim=1, norm="backward") - ifft = ifft * self.window[None, :, None] - - # Overlap and Add - output_size = (T - 1) * self.hop_length + self.win_length - y = torch.nn.functional.fold( - ifft, output_size=(1, output_size), kernel_size=(1, self.win_length), stride=(1, self.hop_length), - )[:, 0, 0, pad:-pad] - - # Window envelope - window_sq = self.window.square().expand(1, T, -1).transpose(1, 2) - window_envelope = torch.nn.functional.fold( - window_sq, output_size=(1, output_size), kernel_size=(1, self.win_length), stride=(1, self.hop_length), - ).squeeze()[pad:-pad] - - # Normalize - assert (window_envelope > 1e-11).all() - y = y / window_envelope - - return y diff --git a/soprano/webui.py b/soprano/webui.py deleted file mode 100644 index 4c4480b..0000000 --- a/soprano/webui.py +++ /dev/null @@ -1,198 +0,0 @@ -#!/usr/bin/env python3 -""" -Gradio Web Interface for Soprano TTS -""" - -import gradio as gr -import torch -from soprano import SopranoTTS -import numpy as np -import socket -import time - -# Detect device -DEVICE = "cuda" if torch.cuda.is_available() else "cpu" - -# Initialize model -print("Loading Soprano TTS model...") -model = SopranoTTS( - backend="auto", - device=DEVICE, - cache_size_mb=100, - decoder_batch_size=1, -) -print("Model loaded successfully!") - -SAMPLE_RATE = 32000 - - -def generate_speech( - text: str, - temperature: float, - top_p: float, - repetition_penalty: float, -) -> tuple: - if not text.strip(): - return None, "Please enter some text to generate speech." 
- - try: - start_time = time.perf_counter() - - audio = model.infer( - text, - temperature=temperature, - top_p=top_p, - repetition_penalty=repetition_penalty, - ) - - gen_time = time.perf_counter() - start_time - - audio_np = audio.cpu().numpy() - audio_int16 = (audio_np * 32767).astype(np.int16) - - audio_seconds = len(audio_np) / SAMPLE_RATE - rtf = audio_seconds / gen_time if gen_time > 0 else float("inf") - - status = ( - f"✓ Generated {audio_seconds:.2f} s audio | " - f"Generation time: {gen_time:.3f} s " - f"({rtf:.2f}x realtime)" - ) - - return (SAMPLE_RATE, audio_int16), status - - except Exception as e: - return None, f"✗ Error: {str(e)}" - - -# Create Gradio interface -with gr.Blocks(title="Soprano TTS") as demo: - - gr.Markdown( - f""" -# 🎵 Soprano TTS - -**Running on: {DEVICE.upper()}** - -Soprano is an ultra-lightweight, open-source text-to-speech (TTS) model designed for real-time, -high-fidelity speech synthesis at unprecedented speed. Soprano can achieve **<15 ms streaming latency** -and up to **2000x real-time generation**, all while being easy to deploy at **<1 GB VRAM usage**. - -**GitHub:** https://github.com/ekwek1/soprano -**Model Demo:** https://huggingface.co/spaces/ekwek/Soprano-TTS -**Model Weights:** https://huggingface.co/ekwek/Soprano-80M -""" - ) - - with gr.Row(): - with gr.Column(scale=2): - text_input = gr.Textbox( - label="Text to Synthesize", - placeholder="Enter text here...", - value="Soprano is an extremely lightweight text to speech model designed to produce highly realistic speech at unprecedented speed.", - lines=5, - max_lines=10, - ) - - with gr.Accordion("Advanced Settings", open=False): - temperature = gr.Slider( - minimum=0.1, - maximum=1.5, - value=0.3, - step=0.05, - label="Temperature", - ) - - top_p = gr.Slider( - minimum=0.5, - maximum=1.0, - value=0.95, - step=0.05, - label="Top P", - ) - - repetition_penalty = gr.Slider( - minimum=1.0, - maximum=2.0, - value=1.2, - step=0.1, - label="Repetition Penalty", - ) - - generate_btn = gr.Button("Generate Speech", variant="primary", size="lg") - - with gr.Column(scale=1): - audio_output = gr.Audio( - label="Generated Speech", - type="numpy", - autoplay=True, - ) - - status_output = gr.Textbox( - label="Status", - interactive=False, - lines=3, - max_lines=10 - ) - - gr.Examples( - examples=[ - ["Soprano is an extremely lightweight text to speech model.", 0.3, 0.95, 1.2], - ["Hello! Welcome to Soprano text to speech.", 0.3, 0.95, 1.2], - ["The quick brown fox jumps over the lazy dog.", 0.3, 0.95, 1.2], - ["Artificial intelligence is transforming the world.", 0.5, 0.90, 1.2], - ], - inputs=[text_input, temperature, top_p, repetition_penalty], - label="Example Prompts", - ) - - generate_btn.click( - fn=generate_speech, - inputs=[text_input, temperature, top_p, repetition_penalty], - outputs=[audio_output, status_output], - ) - gr.Markdown( - f""" -### Usage tips: - -- Soprano works best when each sentence is between 2 and 15 seconds long. -- Although Soprano recognizes numbers and some special characters, it occasionally mispronounces them. - Best results can be achieved by converting these into their phonetic form. - (1+1 -> one plus one, etc) -- If Soprano produces unsatisfactory results, you can easily regenerate it for a new, potentially better generation. - You may also change the sampling settings for more varied results. -- Avoid improper grammar such as not using contractions, multiple spaces, etc. 
-"""
-    )
-
-
-def find_free_port(start_port=7860, max_tries=100):
-    for port in range(start_port, start_port + max_tries):
-        try:
-            with socket.socket(socket.AF_INET, socket.SOCK_STREAM) as s:
-                s.bind(("", port))
-                return port
-        except OSError:
-            continue
-    raise OSError("Could not find a free port")
-
-def main():
-    port = find_free_port(7860)
-    print(f"Starting Gradio interface on port {port}")
-    demo.launch(
-        server_name="127.0.0.1",
-        server_port=port,
-        share=False,
-        # `theme` and `css` are gr.Blocks() constructor arguments, not
-        # launch() arguments; launch() rejects them with a TypeError. They
-        # belong on the gr.Blocks(title="Soprano TTS", ...) call above:
-        # theme=gr.themes.Soft(primary_hue="green"),
-        # css="a { color: var(--primary-600); } a:hover { color: var(--primary-700); }",
-    )
-
-if __name__ == "__main__":
-    main()
\ No newline at end of file
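
For reference, a minimal sketch of how the vocoder stack deleted above fit together, assuming the constructor defaults shown in soprano/vocos/decoder.py and the (B, C, T) latent layout that its forward() interpolates; the asserted output length follows from torch.istft's behavior under the head's default "center" padding:

import torch
from soprano.vocos.decoder import SopranoDecoder  # module path as deleted above

decoder = SopranoDecoder()          # defaults: 512 input channels, n_fft=2048, hop_length=512, upscale=4
latents = torch.randn(1, 512, 17)   # (B, C, T) acoustic latents
audio = decoder(latents)            # (B, L) time-domain waveform

# forward() interpolates T=17 frames to upscale*(T-1)+1 = 65 frames; the
# backbone preserves the frame count, and the ISTFT head (default
# padding="center") returns (frames - 1) * hop_length samples:
assert audio.shape == (1, 64 * 512)  # 32768 samples, about 1.02 s at 32 kHz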
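
The two normalization helpers whose full bodies appear above can be exercised in isolation. The import path is hypothetical (the hunk header naming the cleaner module falls outside this excerpt), but the expected outputs follow directly from the regexes shown:

from soprano.cleaner import collapse_whitespace, dedup_punctuation  # hypothetical path

# collapse_whitespace squeezes whitespace runs and drops the stray space
# before . ? ! and , :
assert collapse_whitespace("Hello ,   world .") == "Hello, world."

# dedup_punctuation shields "..." behind an [ELLIPSIS] placeholder, collapses
# runs of mixed punctuation (",," -> ",", "?!." -> "?"), then restores "...":
assert dedup_punctuation("Wait... really??!") == "Wait... really?"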
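
Similarly, the length arithmetic in the custom ISTFT's "same" branch can be checked directly: overlap-add produces (T - 1) * hop_length + win_length samples, and trimming pad = (win_length - hop_length) // 2 from each side leaves exactly hop_length samples per frame:

import torch
from soprano.vocos.spectral_ops import ISTFT  # module path as deleted above

istft = ISTFT(n_fft=2048, hop_length=512, win_length=2048, padding="same")
spec = torch.randn(2, 1025, 40, dtype=torch.complex64)  # (B, n_fft//2 + 1, T)
audio = istft(spec)

# (40 - 1) * 512 + 2048 = 22016 samples before trimming; minus 768 per side
# leaves 40 * 512 = 20480, i.e. hop_length samples per input frame:
assert audio.shape == (2, 20480)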