diff --git a/.gitignore b/.gitignore
deleted file mode 100644
index f68d7f8..0000000
--- a/.gitignore
+++ /dev/null
@@ -1,6 +0,0 @@
-__pycache__/
-test.py
-*.wav
-dist/
-*.egg-info/
-.venv/
\ No newline at end of file
diff --git a/LICENSE b/LICENSE
deleted file mode 100644
index 261eeb9..0000000
--- a/LICENSE
+++ /dev/null
@@ -1,201 +0,0 @@
- Apache License
- Version 2.0, January 2004
- http://www.apache.org/licenses/
-
- TERMS AND CONDITIONS FOR USE, REPRODUCTION, AND DISTRIBUTION
-
- 1. Definitions.
-
- "License" shall mean the terms and conditions for use, reproduction,
- and distribution as defined by Sections 1 through 9 of this document.
-
- "Licensor" shall mean the copyright owner or entity authorized by
- the copyright owner that is granting the License.
-
- "Legal Entity" shall mean the union of the acting entity and all
- other entities that control, are controlled by, or are under common
- control with that entity. For the purposes of this definition,
- "control" means (i) the power, direct or indirect, to cause the
- direction or management of such entity, whether by contract or
- otherwise, or (ii) ownership of fifty percent (50%) or more of the
- outstanding shares, or (iii) beneficial ownership of such entity.
-
- "You" (or "Your") shall mean an individual or Legal Entity
- exercising permissions granted by this License.
-
- "Source" form shall mean the preferred form for making modifications,
- including but not limited to software source code, documentation
- source, and configuration files.
-
- "Object" form shall mean any form resulting from mechanical
- transformation or translation of a Source form, including but
- not limited to compiled object code, generated documentation,
- and conversions to other media types.
-
- "Work" shall mean the work of authorship, whether in Source or
- Object form, made available under the License, as indicated by a
- copyright notice that is included in or attached to the work
- (an example is provided in the Appendix below).
-
- "Derivative Works" shall mean any work, whether in Source or Object
- form, that is based on (or derived from) the Work and for which the
- editorial revisions, annotations, elaborations, or other modifications
- represent, as a whole, an original work of authorship. For the purposes
- of this License, Derivative Works shall not include works that remain
- separable from, or merely link (or bind by name) to the interfaces of,
- the Work and Derivative Works thereof.
-
- "Contribution" shall mean any work of authorship, including
- the original version of the Work and any modifications or additions
- to that Work or Derivative Works thereof, that is intentionally
- submitted to Licensor for inclusion in the Work by the copyright owner
- or by an individual or Legal Entity authorized to submit on behalf of
- the copyright owner. For the purposes of this definition, "submitted"
- means any form of electronic, verbal, or written communication sent
- to the Licensor or its representatives, including but not limited to
- communication on electronic mailing lists, source code control systems,
- and issue tracking systems that are managed by, or on behalf of, the
- Licensor for the purpose of discussing and improving the Work, but
- excluding communication that is conspicuously marked or otherwise
- designated in writing by the copyright owner as "Not a Contribution."
-
- "Contributor" shall mean Licensor and any individual or Legal Entity
- on behalf of whom a Contribution has been received by Licensor and
- subsequently incorporated within the Work.
-
- 2. Grant of Copyright License. Subject to the terms and conditions of
- this License, each Contributor hereby grants to You a perpetual,
- worldwide, non-exclusive, no-charge, royalty-free, irrevocable
- copyright license to reproduce, prepare Derivative Works of,
- publicly display, publicly perform, sublicense, and distribute the
- Work and such Derivative Works in Source or Object form.
-
- 3. Grant of Patent License. Subject to the terms and conditions of
- this License, each Contributor hereby grants to You a perpetual,
- worldwide, non-exclusive, no-charge, royalty-free, irrevocable
- (except as stated in this section) patent license to make, have made,
- use, offer to sell, sell, import, and otherwise transfer the Work,
- where such license applies only to those patent claims licensable
- by such Contributor that are necessarily infringed by their
- Contribution(s) alone or by combination of their Contribution(s)
- with the Work to which such Contribution(s) was submitted. If You
- institute patent litigation against any entity (including a
- cross-claim or counterclaim in a lawsuit) alleging that the Work
- or a Contribution incorporated within the Work constitutes direct
- or contributory patent infringement, then any patent licenses
- granted to You under this License for that Work shall terminate
- as of the date such litigation is filed.
-
- 4. Redistribution. You may reproduce and distribute copies of the
- Work or Derivative Works thereof in any medium, with or without
- modifications, and in Source or Object form, provided that You
- meet the following conditions:
-
- (a) You must give any other recipients of the Work or
- Derivative Works a copy of this License; and
-
- (b) You must cause any modified files to carry prominent notices
- stating that You changed the files; and
-
- (c) You must retain, in the Source form of any Derivative Works
- that You distribute, all copyright, patent, trademark, and
- attribution notices from the Source form of the Work,
- excluding those notices that do not pertain to any part of
- the Derivative Works; and
-
- (d) If the Work includes a "NOTICE" text file as part of its
- distribution, then any Derivative Works that You distribute must
- include a readable copy of the attribution notices contained
- within such NOTICE file, excluding those notices that do not
- pertain to any part of the Derivative Works, in at least one
- of the following places: within a NOTICE text file distributed
- as part of the Derivative Works; within the Source form or
- documentation, if provided along with the Derivative Works; or,
- within a display generated by the Derivative Works, if and
- wherever such third-party notices normally appear. The contents
- of the NOTICE file are for informational purposes only and
- do not modify the License. You may add Your own attribution
- notices within Derivative Works that You distribute, alongside
- or as an addendum to the NOTICE text from the Work, provided
- that such additional attribution notices cannot be construed
- as modifying the License.
-
- You may add Your own copyright statement to Your modifications and
- may provide additional or different license terms and conditions
- for use, reproduction, or distribution of Your modifications, or
- for any such Derivative Works as a whole, provided Your use,
- reproduction, and distribution of the Work otherwise complies with
- the conditions stated in this License.
-
- 5. Submission of Contributions. Unless You explicitly state otherwise,
- any Contribution intentionally submitted for inclusion in the Work
- by You to the Licensor shall be under the terms and conditions of
- this License, without any additional terms or conditions.
- Notwithstanding the above, nothing herein shall supersede or modify
- the terms of any separate license agreement you may have executed
- with Licensor regarding such Contributions.
-
- 6. Trademarks. This License does not grant permission to use the trade
- names, trademarks, service marks, or product names of the Licensor,
- except as required for reasonable and customary use in describing the
- origin of the Work and reproducing the content of the NOTICE file.
-
- 7. Disclaimer of Warranty. Unless required by applicable law or
- agreed to in writing, Licensor provides the Work (and each
- Contributor provides its Contributions) on an "AS IS" BASIS,
- WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or
- implied, including, without limitation, any warranties or conditions
- of TITLE, NON-INFRINGEMENT, MERCHANTABILITY, or FITNESS FOR A
- PARTICULAR PURPOSE. You are solely responsible for determining the
- appropriateness of using or redistributing the Work and assume any
- risks associated with Your exercise of permissions under this License.
-
- 8. Limitation of Liability. In no event and under no legal theory,
- whether in tort (including negligence), contract, or otherwise,
- unless required by applicable law (such as deliberate and grossly
- negligent acts) or agreed to in writing, shall any Contributor be
- liable to You for damages, including any direct, indirect, special,
- incidental, or consequential damages of any character arising as a
- result of this License or out of the use or inability to use the
- Work (including but not limited to damages for loss of goodwill,
- work stoppage, computer failure or malfunction, or any and all
- other commercial damages or losses), even if such Contributor
- has been advised of the possibility of such damages.
-
- 9. Accepting Warranty or Additional Liability. While redistributing
- the Work or Derivative Works thereof, You may choose to offer,
- and charge a fee for, acceptance of support, warranty, indemnity,
- or other liability obligations and/or rights consistent with this
- License. However, in accepting such obligations, You may act only
- on Your own behalf and on Your sole responsibility, not on behalf
- of any other Contributor, and only if You agree to indemnify,
- defend, and hold each Contributor harmless for any liability
- incurred by, or claims asserted against, such Contributor by reason
- of your accepting any such warranty or additional liability.
-
- END OF TERMS AND CONDITIONS
-
- APPENDIX: How to apply the Apache License to your work.
-
- To apply the Apache License to your work, attach the following
- boilerplate notice, with the fields enclosed by brackets "[]"
- replaced with your own identifying information. (Don't include
- the brackets!) The text should be enclosed in the appropriate
- comment syntax for the file format. We also recommend that a
- file or class name and description of purpose be included on the
- same "printed page" as the copyright notice for easier
- identification within third-party archives.
-
- Copyright [yyyy] [name of copyright owner]
-
- Licensed under the Apache License, Version 2.0 (the "License");
- you may not use this file except in compliance with the License.
- You may obtain a copy of the License at
-
- http://www.apache.org/licenses/LICENSE-2.0
-
- Unless required by applicable law or agreed to in writing, software
- distributed under the License is distributed on an "AS IS" BASIS,
- WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
- See the License for the specific language governing permissions and
- limitations under the License.
diff --git a/README.md b/README.md
index a48613f..7e4803f 100644
--- a/README.md
+++ b/README.md
@@ -1,190 +1,100 @@
-
-
-
- # Soprano: Instant, Ultra‑Realistic Text‑to‑Speech
- [](https://huggingface.co/ekwek/Soprano-80M)
- [](https://huggingface.co/spaces/ekwek/Soprano-TTS)
-
-https://github.com/user-attachments/assets/525cf529-e79e-4368-809f-6be620852826
+# Soprano TTS
----
+Soprano is an ultra-realistic Text-to-Speech system that provides a REST API, WebSocket streaming, and a user-friendly Web UI. It is designed to be lightweight yet high-fidelity, offering OpenAI-compatible endpoints for seamless integration into existing workflows.
-## Overview
+> **Note:** Soprano uses **LMDeploy** to accelerate inference by default. If LMDeploy cannot be installed in your environment, Soprano can fall back to the HuggingFace **transformers** backend (with slower performance). To enable this, pass `backend='transformers'` when creating the TTS model.
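+
+A minimal sketch of the fallback, assuming the upstream `SopranoTTS` Python interface:
+
+```python
+from soprano import SopranoTTS
+
+# Explicitly select the transformers backend when LMDeploy is unavailable;
+# backend='auto' picks LMDeploy whenever it imports successfully.
+model = SopranoTTS(backend='transformers', device='cuda')
+out = model.infer("Soprano is an extremely lightweight text to speech model.", "out.wav")
+```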
-**Soprano** is an ultra‑lightweight, open‑source text‑to‑speech (TTS) model designed for real‑time, high‑fidelity speech synthesis at unprecedented speed, all while remaining compact and easy to deploy at **under 1 GB VRAM usage**.
+## Features
-With only **80M parameters**, Soprano achieves a real‑time factor (RTF) of **~2000×**, capable of generating **10 hours of audio in under 20 seconds**. Soprano uses a **seamless streaming** technique that enables true real‑time synthesis in **<15 ms**, multiple orders of magnitude faster than existing TTS pipelines.
+- **High-Quality Audio:** Generates ultra-realistic speech at 32 kHz using advanced TTS models.
+- **Multiple Interfaces:** Includes REST API, WebSocket streaming, Web UI, and CLI.
+- **OpenAI Compatible:** Follows OpenAI's speech endpoint format for drop-in replacement (see the example after this list).
+- **Real-time Streaming:** WebSocket support for real-time audio streaming with <15 ms latency.
+- **Configurable Parameters:** Supports temperature, top_p, repetition_penalty, and min_text_length controls.
+- **Interactive Launcher:** Easy-to-use batch script for managing services.
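+
+For illustration, a minimal request against the OpenAI-style speech endpoint, assuming the API server is running locally on port 8000:
+
+```python
+import json
+import urllib.request
+
+# POST text to /v1/audio/speech and save the returned WAV bytes.
+req = urllib.request.Request(
+    "http://localhost:8000/v1/audio/speech",
+    data=json.dumps({"input": "The quick brown fox jumped over the lazy dog."}).encode(),
+    headers={"Content-Type": "application/json"},
+)
+with urllib.request.urlopen(req) as resp:
+    open("speech.wav", "wb").write(resp.read())
+```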
----
+## Installation and Setup
-## Installation
+### Prerequisites
+Ensure you have Git and Python installed on your system.
-**Requirements**: Linux or Windows, CUDA‑enabled GPU required (CPU support coming soon!).
+### Steps
+1. Clone the repository:
+ ```bash
+ git clone https://github.com/biswas445/soprano.git
+ ```
+2. Navigate to the project directory:
+ ```bash
+ cd soprano
+ ```
+3. Run the setup script and follow the prompts:
+ ```bat
+ setup.bat
+ ```
-### Install with wheel
+## Quick Start
-```bash
-pip install soprano-tts
-pip uninstall -y torch
-pip install torch==2.8.0 --index-url https://download.pytorch.org/whl/cu126
-```
-
-### Install from source
-
-```bash
-git clone https://github.com/ekwek1/soprano.git
-cd soprano
-pip install -e .
-pip uninstall -y torch
-pip install torch==2.8.0 --index-url https://download.pytorch.org/whl/cu126
-```
-
-> **Note**: Soprano uses **LMDeploy** to accelerate inference by default. If LMDeploy cannot be installed in your environment, Soprano can fall back to the HuggingFace **transformers** backend (with slower performance). To enable this, pass `backend='transformers'` when creating the TTS model.
-
----
-
-## Usage
-
-```python
-from soprano import SopranoTTS
-
-model = SopranoTTS(backend='auto', device='cuda', cache_size_mb=100, decoder_batch_size=1)
-```
-
-> **Tip**: You can increase cache_size_mb and decoder_batch_size to increase inference speed at the cost of higher memory usage.
-
-### Basic inference
+To start the application, run the `start.bat` file located in the root directory:
-```python
-out = model.infer("Soprano is an extremely lightweight text to speech model.") # can achieve 2000x real-time with sufficiently long input!
+```bat
+start.bat
```
-### Save output to a file
+This will launch the interactive menu where you can choose the desired component:
-```python
-out = model.infer("Soprano is an extremely lightweight text to speech model.", "out.wav")
-```
-
-### Custom sampling parameters
-
-```python
-out = model.infer(
- "Soprano is an extremely lightweight text to speech model.",
- temperature=0.3,
- top_p=0.95,
- repetition_penalty=1.2,
-)
-```
-
-### Batched inference
-
-```python
-out = model.infer_batch(["Soprano is an extremely lightweight text to speech model."] * 10) # can achieve 2000x real-time with sufficiently large input size!
-```
-
-#### Save batch outputs to a directory
-
-```python
-out = model.infer_batch(["Soprano is an extremely lightweight text to speech model."] * 10, "/dir")
-```
-
-### Streaming inference
+1. **API Server:** Starts the RESTful API server.
+2. **Test API:** Launches the API server and automatically runs the API test client to verify functionality.
+3. **Real-time Assistant:** Launches a voice-to-voice AI assistant demo featuring real-time audio streaming.
+4. **WebSocket Test:** Launches the WebSocket server and the corresponding test client.
+5. *(Reserved)*
+6. **Web UI:** Starts the browser-based interface for standard users.
+7. **CLI:** Starts the interactive Command Line Interface for testing purposes.
-```python
-import torch
-stream = model.infer_stream("Soprano is an extremely lightweight text to speech model.", chunk_size=1)
+## Technical Architecture
-# Audio chunks can be accessed via an iterator
-chunks = []
-for chunk in stream:
- chunks.append(chunk) # first chunk arrives in <15 ms!
+### 1. High-fidelity 32 kHz Audio
+Soprano synthesizes speech at **32 kHz**, delivering quality that is perceptually indistinguishable from 44.1/48 kHz audio and significantly sharper than the 24 kHz output used by many existing TTS models.
-out = torch.cat(chunks)
-```
-
-### Serve endpoint
-
-```
-uvicorn soprano.server:app --host 0.0.0.0 --port 8000
-```
-
-Compatible with OpenAI speech API. Use the endpoint like this:
-
-```bash
-curl http://localhost:8000/v1/audio/speech \
- -H "Content-Type: application/json" \
- -d '{
- "input": "The quick brown fox jumped over the lazy dog."
- }' \
- --output speech.wav
-```
-
-## Usage tips:
-
-* Soprano works best when each sentence is between 2 and 15 seconds long.
-* Although Soprano recognizes numbers and some special characters, it occasionally mispronounces them. Best results can be achieved by converting these into their phonetic form. (1+1 -> one plus one, etc)
-* If Soprano produces unsatisfactory results, you can easily regenerate it for a new, potentially better generation. You may also change the sampling settings for more varied results.
-* Avoid improper grammar such as not using contractions, multiple spaces, etc.
-
----
-
-## Key Features
-
-### 1. High‑fidelity 32 kHz audio
-
-Soprano synthesizes speech at **32 kHz**, delivering quality that is perceptually indistinguishable from 44.1/48 kHz audio and significantly sharper and clearer than the 24 kHz output used by many existing TTS models.
-
-### 2. Vocoder‑based neural decoder
-
-Instead of slow diffusion decoders, Soprano uses a **vocoder‑based decoder** with a Vocos architecture, enabling **orders‑of‑magnitude faster** waveform generation while maintaining comparable perceptual quality.
+### 2. Vocoder-based Neural Decoder
+Instead of slow diffusion decoders, Soprano uses a **vocoder-based decoder** with a Vocos architecture. This enables **orders-of-magnitude faster** waveform generation while maintaining comparable perceptual quality.
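+
+A shape-level sketch of the decoder call, assuming the upstream `SopranoDecoder` module (random weights here, for illustration only):
+
+```python
+import torch
+from soprano.vocos.decoder import SopranoDecoder
+
+# LM hidden states, shaped (batch, 512 channels, tokens), are decoded to a
+# waveform in one forward pass rather than through iterative diffusion steps.
+decoder = SopranoDecoder().eval()
+hidden = torch.randn(1, 512, 16)  # 16 audio tokens, roughly one second at 32 kHz
+with torch.no_grad():
+    audio = decoder(hidden)  # (1, samples) waveform tensor
+```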
### 3. Seamless Streaming
+Soprano leverages the decoder's finite receptive field to losslessly stream audio with ultra-low latency. The streamed output is acoustically identical to offline synthesis, and streaming can begin after generating just 5 audio tokens, enabling **<15 ms latency**.
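+
+A consumption sketch, assuming the upstream `infer_stream` iterator:
+
+```python
+import torch
+from soprano import SopranoTTS
+
+model = SopranoTTS(backend='auto', device='cuda')
+chunks = []
+for chunk in model.infer_stream("Streaming can begin after just five audio tokens.", chunk_size=1):
+    chunks.append(chunk)  # the first chunk arrives in under 15 ms
+audio = torch.cat(chunks)  # acoustically identical to offline synthesis
+```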
-Soprano leverages the decoder’s finite receptive field to losslessly stream audio with ultra‑low latency. The streamed output is acoustically identical to offline synthesis, and streaming can begin after generating just 5 audio tokens, enabling **<15 ms latency**.
-
-### 4. State‑of‑the‑art neural audio codec
-
+### 4. State-of-the-art Neural Audio Codec
Speech is represented using a **neural codec** that compresses audio to **~15 tokens/sec** at just **0.2 kbps**, allowing extremely fast generation and efficient memory usage without sacrificing quality.
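+
+The token rate follows from the samples-per-token constant in the inference code; a back-of-the-envelope check, assuming 2048 samples per token at 32 kHz:
+
+```python
+SAMPLE_RATE = 32_000  # output sample rate in Hz
+TOKEN_SIZE = 2_048    # audio samples represented by one codec token
+
+tokens_per_sec = SAMPLE_RATE / TOKEN_SIZE     # 15.625, i.e. "~15 tokens/sec"
+bits_per_token = 0.2 * 1000 / tokens_per_sec  # 12.8 bits of codec payload per token
+print(tokens_per_sec, bits_per_token)
+```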
-### 5. Sentence‑level streaming for infinite context
+### 5. Sentence-level Streaming
+Each sentence is generated independently, enabling **effectively infinite generation length** while maintaining stability and real-time performance for long-form generation.
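+
+A sketch of the sentence split that drives this, using the regex from the upstream preprocessing code:
+
+```python
+import re
+
+text = "Each sentence is generated on its own. That keeps long-form synthesis stable!"
+# Split on sentence-final punctuation; each piece becomes an independent prompt.
+sentences = re.split(r"(?<=[.!?])\s+", text)
+print(sentences)
+# ['Each sentence is generated on its own.', 'That keeps long-form synthesis stable!']
+```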
-Each sentence is generated independently, enabling **effectively infinite generation length** while maintaining stability and real‑time performance for long‑form generation.
+## Project Status
----
+The core infrastructure, including the OpenAI-compatible API and various interfaces, is complete.
-## Limitations
+**Current Focus Areas:**
+1. **Backend Strengthening:** Improving the robustness of the inference engine.
+2. **Text Normalization:** Enhancing the handling of numbers, abbreviations, and special characters to improve pronunciation accuracy (see the sketch below).
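+
+A quick illustration of the current normalizer, assuming the upstream `clean_text` helper:
+
+```python
+from soprano.utils.text import clean_text
+
+# Abbreviations, currency, and times are rewritten into their spoken form,
+# e.g. "Dr." -> "doctor" and "$2.47" -> dollars and cents.
+print(clean_text("Dr. Smith paid $2.47 at 8:05."))
+```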
-I’m a second-year undergrad who’s just started working on TTS models, so I wanted to start small. Soprano was only pretrained on 1000 hours of audio (~100x less than other TTS models), so its stability and quality will improve tremendously as I train it on more data. Also, I optimized Soprano purely for speed, which is why it lacks bells and whistles like voice cloning, style control, and multilingual support. Now that I have experience creating TTS models, I have a lot of ideas for how to make Soprano even better in the future, so stay tuned for those!
-
----
-
-## Roadmap
-
-* [x] Add model and inference code
-* [x] Seamless streaming
-* [x] Batched inference
-* [x] Command-line interface (CLI)
-* [x] CPU support
-* [x] Server / API inference
-* [ ] Additional LLM backends
-* [ ] Voice cloning
-* [ ] Multilingual support
+## Limitations
----
+Soprano was optimized purely for speed and was pretrained on approximately 1000 hours of audio. Consequently:
+* Numbers and special characters may occasionally be mispronounced (phonetic conversion is recommended).
+* Voice cloning and style controls are currently not implemented.
+* Stability and quality are expected to improve with future training on larger datasets.
## Acknowledgements
Soprano uses and/or is inspired by the following projects:
-* [Vocos](https://github.com/gemelo-ai/vocos)
-* [XTTS](https://github.com/coqui-ai/TTS)
-* [LMDeploy](https://github.com/InternLM/lmdeploy)
-
----
+* [Vocos](https://github.com/gemelo-ai/vocos)
+* [XTTS](https://github.com/coqui-ai/TTS)
+* [LMDeploy](https://github.com/InternLM/lmdeploy)
## License
-This project is licensed under the **Apache-2.0** license. See `LICENSE` for details.
+Licensed under the Apache License 2.0 - see the [LICENSE](LICENSE) file for details.
diff --git a/pyproject.toml b/pyproject.toml
deleted file mode 100644
index ec49194..0000000
--- a/pyproject.toml
+++ /dev/null
@@ -1,38 +0,0 @@
-[build-system]
-requires = ["setuptools>=61.0"]
-build-backend = "setuptools.build_meta"
-
-[project]
-name = "soprano-tts"
-version = "0.0.2"
-authors = [
- { name="ekwek1", email="eugene.kwek.1@gmail.com" },
-]
-description = "Soprano: Instant, Ultra‑Realistic Text‑to‑Speech"
-readme = "README.md"
-requires-python = ">=3.10"
-classifiers = [
- "Programming Language :: Python :: 3",
- "Operating System :: OS Independent",
-]
-dependencies = [
- "fastapi",
- "gradio",
- "huggingface_hub",
- "lmdeploy",
- "numpy",
- "scipy",
- "torch",
- "unidecode",
- "uvicorn",
- "inflect"
-]
-license = {file = "LICENSE"}
-
-[project.urls]
-Homepage = "https://github.com/ekwek1/soprano"
-Issues = "https://github.com/ekwek1/soprano/issues"
-
-[project.scripts]
-soprano = "soprano.soprano_cli:main"
-soprano-webui = "soprano.webui:main"
diff --git a/soprano/__init__.py b/soprano/__init__.py
deleted file mode 100644
index feadb53..0000000
--- a/soprano/__init__.py
+++ /dev/null
@@ -1 +0,0 @@
-from .tts import SopranoTTS
\ No newline at end of file
diff --git a/soprano/backends/base.py b/soprano/backends/base.py
deleted file mode 100644
index a58274d..0000000
--- a/soprano/backends/base.py
+++ /dev/null
@@ -1,20 +0,0 @@
-class BaseModel:
- def infer(self,
- prompts,
- top_p=0.95,
- temperature=0.3,
- repetition_penalty=1.2):
- '''
- Takes a list of prompts and returns the output hidden states
- '''
- pass
-
- def stream_infer(self,
- prompt,
- top_p=0.95,
- temperature=0.3,
- repetition_penalty=1.2):
- '''
- Takes a prompt and returns an iterator of the output hidden states
- '''
- pass
diff --git a/soprano/backends/lmdeploy.py b/soprano/backends/lmdeploy.py
deleted file mode 100644
index 1d7f45c..0000000
--- a/soprano/backends/lmdeploy.py
+++ /dev/null
@@ -1,59 +0,0 @@
-import torch
-from lmdeploy import pipeline, TurbomindEngineConfig, GenerationConfig
-from .base import BaseModel
-
-
-class LMDeployModel(BaseModel):
- def __init__(self,
- device='cuda',
- cache_size_mb=100,
- model_path=None,
- **kwargs):
- assert device == 'cuda', "lmdeploy only supports cuda devices, consider changing device or using a different backend instead."
- cache_size_ratio = cache_size_mb * 1024**2 / torch.cuda.get_device_properties('cuda').total_memory
- backend_config = TurbomindEngineConfig(cache_max_entry_count=cache_size_ratio)
-
- # Use local model if path provided, otherwise use HuggingFace
- model_name_or_path = model_path if model_path else 'ekwek/Soprano-80M'
-
- self.pipeline = pipeline(model_name_or_path,
- log_level='ERROR',
- backend_config=backend_config)
-
- def infer(self,
- prompts,
- top_p=0.95,
- temperature=0.3,
- repetition_penalty=1.2):
- gen_config=GenerationConfig(output_last_hidden_state='generation',
- do_sample=True,
- top_p=top_p,
- temperature=temperature,
- repetition_penalty=repetition_penalty,
- max_new_tokens=512)
- responses = self.pipeline(prompts, gen_config=gen_config)
- res = []
- for response in responses:
- res.append({
- 'finish_reason': response.finish_reason,
- 'hidden_state': response.last_hidden_state
- })
- return res
-
- def stream_infer(self,
- prompt,
- top_p=0.95,
- temperature=0.3,
- repetition_penalty=1.2):
- gen_config=GenerationConfig(output_last_hidden_state='generation',
- do_sample=True,
- top_p=top_p,
- temperature=temperature,
- repetition_penalty=repetition_penalty,
- max_new_tokens=512)
- responses = self.pipeline.stream_infer([prompt], gen_config=gen_config)
- for response in responses:
- yield {
- 'finish_reason': response.finish_reason,
- 'hidden_state': response.last_hidden_state
- }
diff --git a/soprano/backends/transformers.py b/soprano/backends/transformers.py
deleted file mode 100644
index b85c49e..0000000
--- a/soprano/backends/transformers.py
+++ /dev/null
@@ -1,72 +0,0 @@
-import torch
-from transformers import AutoModelForCausalLM, AutoTokenizer
-from .base import BaseModel
-
-
-class TransformersModel(BaseModel):
- def __init__(self,
- device='cuda',
- model_path=None,
- **kwargs):
- self.device = device
-
- # Use local model if path provided, otherwise use HuggingFace
- model_name_or_path = model_path if model_path else 'ekwek/Soprano-80M'
-
- self.model = AutoModelForCausalLM.from_pretrained(
- model_name_or_path,
- dtype=torch.bfloat16 if device == 'cuda' else torch.float32,
- device_map=device
- )
- self.tokenizer = AutoTokenizer.from_pretrained(model_name_or_path)
- self.model.eval()
-
- def infer(self,
- prompts,
- top_p=0.95,
- temperature=0.3,
- repetition_penalty=1.2):
- inputs = self.tokenizer(
- prompts,
- return_tensors='pt',
- padding=True,
- truncation=True,
- max_length=512,
- ).to(self.device)
-
- with torch.no_grad():
- outputs = self.model.generate(
- input_ids=inputs['input_ids'],
- attention_mask=inputs['attention_mask'],
- max_new_tokens=512,
- do_sample=True,
- top_p=top_p,
- temperature=temperature,
- repetition_penalty=repetition_penalty,
- pad_token_id=self.tokenizer.pad_token_id,
- return_dict_in_generate=True,
- output_hidden_states=True,
- )
- res = []
- eos_token_id = self.model.config.eos_token_id
- for i in range(len(prompts)):
- seq = outputs.sequences[i]
- hidden_states = []
- num_output_tokens = len(outputs.hidden_states)
- for j in range(num_output_tokens):
- token = seq[j + seq.size(0) - num_output_tokens]
- if token != eos_token_id: hidden_states.append(outputs.hidden_states[j][-1][i, -1, :])
- last_hidden_state = torch.stack(hidden_states).squeeze()
- finish_reason = 'stop' if seq[-1].item() == eos_token_id else 'length'
- res.append({
- 'finish_reason': finish_reason,
- 'hidden_state': last_hidden_state
- })
- return res
-
- def stream_infer(self,
- prompt,
- top_p=0.95,
- temperature=0.3,
- repetition_penalty=1.2):
- raise NotImplementedError("transformers backend does not currently support streaming, please consider using lmdeploy backend instead.")
diff --git a/soprano/readme.md b/soprano/readme.md
new file mode 100644
index 0000000..0751306
--- /dev/null
+++ b/soprano/readme.md
@@ -0,0 +1 @@
+The web UI now supports both real-time audio streaming and audio file generation.
diff --git a/soprano/server.py b/soprano/server.py
deleted file mode 100644
index 937c89a..0000000
--- a/soprano/server.py
+++ /dev/null
@@ -1,47 +0,0 @@
-import base64
-import io
-import json
-from typing import Generator
-
-import numpy as np
-from fastapi import FastAPI, HTTPException
-from fastapi.responses import Response
-from scipy.io.wavfile import write
-from torch import Tensor
-
-from soprano.tts import SopranoTTS
-
-# Load model at startup
-tts = SopranoTTS(cache_size_mb = 100)
-
-app = FastAPI(title="Soprano TTS API")
-
-def _tensor_to_wav_bytes(tensor: Tensor) -> bytes:
- """
- Convert a 1D fp32 torch tensor to a WAV byte stream.
- """
- # convert to int16
- audio_int16 = (np.clip(tensor.numpy(), -1.0, 1.0) * 32767).astype(np.int16)
-
- wav_io = io.BytesIO()
- write(wav_io, 32000, audio_int16) # 32kHz sample rate
- wav_io.seek(0)
- return wav_io.read()
-
-
-@app.post("/v1/audio/speech")
-async def create_speech(payload: dict):
- """
- Minimal implementation of OpenAI's Speech endpoint.
- Fields:
- - input: string - text to synthesize
- - model, voice, etc. are accepted but ignored.
- - response_format: str - ignored, only support wav.
- """
- text = payload.get("input")
- if not isinstance(text, str) or not text.strip():
- raise HTTPException(status_code=400, detail="`input` field must be a non-empty string.")
-
- audio_tensor = tts.infer(text)
- wav_bytes = _tensor_to_wav_bytes(audio_tensor)
- return Response(content=wav_bytes, media_type="audio/wav", headers={"Content-Disposition": 'attachment; filename="speech.wav"'})
diff --git a/soprano/soprano_cli.py b/soprano/soprano_cli.py
deleted file mode 100644
index 208c87d..0000000
--- a/soprano/soprano_cli.py
+++ /dev/null
@@ -1,38 +0,0 @@
-#!/usr/bin/env python3
-"""
-Soprano TTS Command Line Interface
-"""
-import argparse
-import os
-from soprano import SopranoTTS
-
-def main():
- parser = argparse.ArgumentParser(description='Soprano Text-to-Speech CLI')
- parser.add_argument('text', help='Text to synthesize')
- parser.add_argument('--output', '-o', default='output.wav', help='Output audio file path')
- parser.add_argument('--model-path', '-m', help='Path to local model directory (optional)')
- parser.add_argument('--device', '-d', default='cpu', choices=['cuda', 'cpu'],
- help='Device to use for inference')
- parser.add_argument('--backend', '-b', default='auto',
- choices=['auto', 'transformers', 'lmdeploy'],
- help='Backend to use for inference')
- parser.add_argument('--cache-size', '-c', type=int, default=100,
- help='Cache size in MB (for lmdeploy backend)')
-
- args = parser.parse_args()
-
- # Initialize TTS
- tts = SopranoTTS(
- backend=args.backend,
- device=args.device,
- cache_size_mb=args.cache_size,
- model_path=args.model_path
- )
-
- # Generate speech
- print(f"Generating speech for: '{args.text}'")
- tts.infer(args.text, out_path=args.output)
- print(f"Audio saved to: {args.output}")
-
-if __name__ == "__main__":
- main()
\ No newline at end of file
diff --git a/soprano/tts.py b/soprano/tts.py
deleted file mode 100644
index d03c457..0000000
--- a/soprano/tts.py
+++ /dev/null
@@ -1,195 +0,0 @@
-from .vocos.decoder import SopranoDecoder
-from .utils.text import clean_text
-import torch
-import re
-from unidecode import unidecode
-from scipy.io import wavfile
-from huggingface_hub import hf_hub_download
-import os
-import time
-
-
-class SopranoTTS:
- def __init__(self,
- backend='auto',
- device='cuda',
- cache_size_mb=100,
- decoder_batch_size=1,
- model_path=None):
- RECOGNIZED_DEVICES = ['cuda', 'cpu']
- RECOGNIZED_BACKENDS = ['auto', 'lmdeploy', 'transformers']
- assert device in RECOGNIZED_DEVICES, f"unrecognized device {device}, device must be in {RECOGNIZED_DEVICES}"
- if backend == 'auto':
- if device == 'cpu':
- backend = 'transformers'
- else:
- try:
- import lmdeploy
- backend = 'lmdeploy'
- except ImportError:
- backend='transformers'
- print(f"Using backend {backend}.")
- assert backend in RECOGNIZED_BACKENDS, f"unrecognized backend {backend}, backend must be in {RECOGNIZED_BACKENDS}"
-
- if backend == 'lmdeploy':
- from .backends.lmdeploy import LMDeployModel
- self.pipeline = LMDeployModel(device=device, cache_size_mb=cache_size_mb, model_path=model_path)
- elif backend == 'transformers':
- from .backends.transformers import TransformersModel
- self.pipeline = TransformersModel(device=device, model_path=model_path)
-
- self.device = device
- self.decoder = SopranoDecoder()
- if device == 'cuda':
- self.decoder = self.decoder.cuda()
- if model_path:
- decoder_path = os.path.join(model_path, 'decoder.pth')
- else:
- decoder_path = hf_hub_download(repo_id='ekwek/Soprano-80M', filename='decoder.pth')
- self.decoder.load_state_dict(torch.load(decoder_path))
- self.decoder_batch_size=decoder_batch_size
- self.RECEPTIVE_FIELD = 4 # Decoder receptive field
- self.TOKEN_SIZE = 2048 # Number of samples per audio token
-
- self.infer("Hello world!") # warmup
-
- def _preprocess_text(self, texts, min_length=30):
- '''
- adds prompt format and sentence/part index
- Enforces a minimum sentence length by merging short sentences.
- '''
- res = []
- for text_idx, text in enumerate(texts):
- text = text.strip()
- cleaned_text = clean_text(text)
- sentences = re.split(r"(?<=[.!?])\s+", cleaned_text)
- processed = []
- for sentence in sentences:
- processed.append({
- "text": sentence,
- "text_idx": text_idx,
- })
-
- if min_length > 0 and len(processed) > 1:
- merged = []
- i = 0
- while i < len(processed):
- cur = processed[i]
- if len(cur["text"]) < min_length:
- if merged: merged[-1]["text"] = (merged[-1]["text"] + " " + cur["text"]).strip()
- else:
- if i + 1 < len(processed): processed[i + 1]["text"] = (cur["text"] + " " + processed[i + 1]["text"]).strip()
- else: merged.append(cur)
- else: merged.append(cur)
- i += 1
- processed = merged
- sentence_idxes = {}
- for item in processed:
- if item['text_idx'] not in sentence_idxes: sentence_idxes[item['text_idx']] = 0
- res.append((f'[STOP][TEXT]{item["text"]}[START]', item["text_idx"], sentence_idxes[item['text_idx']]))
- sentence_idxes[item['text_idx']] += 1
- return res
-
- def infer(self,
- text,
- out_path=None,
- top_p=0.95,
- temperature=0.3,
- repetition_penalty=1.2):
- results = self.infer_batch([text],
- top_p=top_p,
- temperature=temperature,
- repetition_penalty=repetition_penalty,
- out_dir=None)[0]
- if out_path:
- wavfile.write(out_path, 32000, results.cpu().numpy())
- return results
-
- def infer_batch(self,
- texts,
- out_dir=None,
- top_p=0.95,
- temperature=0.3,
- repetition_penalty=1.2):
- sentence_data = self._preprocess_text(texts)
- prompts = list(map(lambda x: x[0], sentence_data))
- responses = self.pipeline.infer(prompts,
- top_p=top_p,
- temperature=temperature,
- repetition_penalty=repetition_penalty)
- hidden_states = []
- for i, response in enumerate(responses):
- if response['finish_reason'] != 'stop':
- print(f"Warning: some sentences did not complete generation, likely due to hallucination.")
- hidden_state = response['hidden_state']
- hidden_states.append(hidden_state)
- combined = list(zip(hidden_states, sentence_data))
- combined.sort(key=lambda x: -x[0].size(0))
- hidden_states, sentence_data = zip(*combined)
-
- num_texts = len(texts)
- audio_concat = [[] for _ in range(num_texts)]
- for sentence in sentence_data:
- audio_concat[sentence[1]].append(None)
- for idx in range(0, len(hidden_states), self.decoder_batch_size):
- batch_hidden_states = []
- lengths = list(map(lambda x: x.size(0), hidden_states[idx:idx+self.decoder_batch_size]))
- N = len(lengths)
- for i in range(N):
- batch_hidden_states.append(torch.cat([
- torch.zeros((1, 512, lengths[0]-lengths[i]), device=self.device),
- hidden_states[idx+i].unsqueeze(0).transpose(1,2).to(self.device).to(torch.float32),
- ], dim=2))
- batch_hidden_states = torch.cat(batch_hidden_states)
- with torch.no_grad():
- audio = self.decoder(batch_hidden_states)
-
- for i in range(N):
- text_id = sentence_data[idx+i][1]
- sentence_id = sentence_data[idx+i][2]
- audio_concat[text_id][sentence_id] = audio[i].squeeze()[-(lengths[i]*self.TOKEN_SIZE-self.TOKEN_SIZE):]
- audio_concat = [torch.cat(x).cpu() for x in audio_concat]
-
- if out_dir:
- os.makedirs(out_dir, exist_ok=True)
- for i in range(len(audio_concat)):
- wavfile.write(f"{out_dir}/{i}.wav", 32000, audio_concat[i].cpu().numpy())
- return audio_concat
-
- def infer_stream(self,
- text,
- chunk_size=1,
- top_p=0.95,
- temperature=0.3,
- repetition_penalty=1.2):
- start_time = time.time()
- sentence_data = self._preprocess_text([text])
-
- first_chunk = True
- for sentence, _, _ in sentence_data:
- responses = self.pipeline.stream_infer(sentence,
- top_p=top_p,
- temperature=temperature,
- repetition_penalty=repetition_penalty)
- hidden_states_buffer = []
- chunk_counter = chunk_size
- for token in responses:
- finished = token['finish_reason'] is not None
- if not finished: hidden_states_buffer.append(token['hidden_state'][-1])
- hidden_states_buffer = hidden_states_buffer[-(2*self.RECEPTIVE_FIELD+chunk_size):]
- if finished or len(hidden_states_buffer) >= self.RECEPTIVE_FIELD + chunk_size:
- if finished or chunk_counter == chunk_size:
- batch_hidden_states = torch.stack(hidden_states_buffer)
- inp = batch_hidden_states.unsqueeze(0).transpose(1, 2).to(self.device).to(torch.float32)
- with torch.no_grad():
- audio = self.decoder(inp)[0]
- if finished:
- audio_chunk = audio[-((self.RECEPTIVE_FIELD+chunk_counter-1)*self.TOKEN_SIZE-self.TOKEN_SIZE):]
- else:
- audio_chunk = audio[-((self.RECEPTIVE_FIELD+chunk_size)*self.TOKEN_SIZE-self.TOKEN_SIZE):-(self.RECEPTIVE_FIELD*self.TOKEN_SIZE-self.TOKEN_SIZE)]
- chunk_counter = 0
- if first_chunk:
- print(f"Streaming latency: {1000*(time.time()-start_time):.2f} ms")
- first_chunk = False
- yield audio_chunk.cpu()
- chunk_counter += 1
diff --git a/soprano/utils/text.py b/soprano/utils/text.py
deleted file mode 100644
index 2295448..0000000
--- a/soprano/utils/text.py
+++ /dev/null
@@ -1,401 +0,0 @@
-"""
-Normalize input text to a format that Soprano recognizes.
-Adapted from https://github.com/neonbjb/tortoise-tts/blob/main/tortoise/utils/tokenizer.py
-"""
-import re
-
-import inflect
-from unidecode import unidecode
-
-
-_inflect = inflect.engine()
-
-####################################################################################################
-# Abbreviations
-
-_abbreviations = [(re.compile('\\b%s\\.' % x[0], re.IGNORECASE), x[1]) for x in [
- ('mrs', 'misess'),
- ('ms', 'miss'),
- ('mr', 'mister'),
- ('dr', 'doctor'),
- ('st', 'saint'),
- ('co', 'company'),
- ('jr', 'junior'),
- ('maj', 'major'),
- ('gen', 'general'),
- ('drs', 'doctors'),
- ('rev', 'reverend'),
- ('lt', 'lieutenant'),
- ('hon', 'honorable'),
- ('sgt', 'sergeant'),
- ('capt', 'captain'),
- ('esq', 'esquire'),
- ('ltd', 'limited'),
- ('col', 'colonel'),
- ('ft', 'fort'),
-]]
-_cased_abbreviations = [(re.compile('\\b%s\\b' % x[0]), x[1]) for x in [
- ('TTS', 'text to speech'),
- ('Hz', 'hertz'),
- ('kHz', 'kilohertz'),
- ('KBs', 'kilobytes'),
- ('KB', 'kilobyte'),
- ('MBs', 'megabytes'),
- ('MB', 'megabyte'),
- ('GBs', 'gigabytes'),
- ('GB', 'gigabyte'),
- ('TBs', 'terabytes'),
- ('TB', 'terabyte'),
- ('APIs', 'a p i\'s'),
- ('API', 'a p i'),
- ('CLIs', 'c l i\'s'),
- ('CLI', 'c l i'),
- ('CPUs', 'c p u\'s'),
- ('CPU', 'c p u'),
- ('GPUs', 'g p u\'s'),
- ('GPU', 'g p u'),
- ('Ave', 'avenue'),
- ('etc', 'et cetera'),
- ('Mon', 'monday'),
- ('Tues', 'tuesday'),
- ('Wed', 'wednesday'),
- ('Thurs', 'thursday'),
- ('Fri', 'friday'),
- ('Sat', 'saturday'),
- ('Sun', 'sunday'),
- ('and/or', 'and or'),
-]]
-
-def expand_abbreviations(text):
- for regex, replacement in _abbreviations + _cased_abbreviations:
- text = re.sub(regex, replacement, text)
- return text
-
-####################################################################################################
-# Numbers
-
-_num_prefix_re = re.compile(r'#\d')
-_num_suffix_re = re.compile(r'\b\d+(K|M|B|T)\b', re.IGNORECASE)
-_num_letter_split_re = re.compile(r'(\d[a-z]|[a-z]\d)', re.IGNORECASE)
-
-_comma_number_re = re.compile(r'(\d[\d\,]+\d)')
-_date_re = re.compile(r'(^|[^/])(\d\d?[/-]\d\d?[/-]\d\d(?:\d\d)?)($|[^/])')
-_phone_number_re = re.compile(r'(\(?\d{3}\)?[-.\s]\d{3}[-.\s]?\d{4})')
-_time_re = re.compile(r'(\d\d?:\d\d(?::\d\d)?)')
-_pounds_re = re.compile(r'£([\d\,]*\d+)')
-_dollars_re = re.compile(r'\$([\d\.\,]*\d+)')
-_decimal_number_re = re.compile(r'(\d+(?:\.\d+)+)')
-_multiply_re = re.compile(r'(\d\s?\*\s?\d)')
-_divide_re = re.compile(r'(\d\s?/\s?\d)')
-_add_re = re.compile(r'(\d\s?\+\s?\d)')
-_subtract_re = re.compile(r'(\d?\s?-\s?\d)') # also does negative numbers
-_fraction_re = re.compile(r'(\d+(?:/\d+)+)')
-_ordinal_re = re.compile(r'\d+(st|nd|rd|th)')
-_number_re = re.compile(r'\d+')
-
-def _expand_num_prefix(m):
- match = m.group(0)
- return f"number {match[1]}"
-
-def _expand_num_suffix(m):
- match = m.group(0)
- if match[1].upper() == 'K': return f"{match[0]} thousand"
- elif match[1].upper() == 'M': return f"{match[0]} million"
- elif match[1].upper() == 'B': return f"{match[0]} billion"
- elif match[1].upper() == 'T': return f"{match[0]} trillion"
- return match # unexpected format
-
-def _split_alphanumeric(m):
- match = m.group(1)
- return f"{match[0]} {match[1]}"
-
-def _remove_commas(m):
- return m.group(1).replace(',', '')
-
-def _expand_date(m):
- match = m.group(2)
- match = re.split('[./-]', match)
- return m.group(1) + ' dash '.join(match) + m.group(3)
-
-def _expand_phone_number(m):
- match = m.group(1)
- match = re.sub(r'\D', '', match)
- assert len(match) == 10
- match = f"{' '.join(list(match[:3]))}, {' '.join(list(match[3:6]))}, {' '.join(list(match[6:]))}"
- return match
-
-def _expand_time(m):
- match = m.group(1)
- match = match.split(':')
- if len(match) == 2:
- hours, minutes = match
- if minutes == '00':
- if int(hours) == 0:
- return '0'
- elif int(hours) > 12: return f"{hours} minutes"
- return f"{hours} o'clock"
- elif minutes.startswith('0'):
- minutes = f'oh {minutes[1:]}'
- return f"{hours} {minutes}"
- else:
- hours, minutes, seconds = match
- if int(hours) != 0:
- return f"{hours} {'oh oh' if minutes == '00' else f'oh {minutes}' if minutes.startswith('0') else {minutes}} {'' if seconds == '00' else f'oh {seconds}' if seconds.startswith('0') else seconds}"
- elif minutes != '00':
- return f"{minutes} {'oh oh' if seconds == '00' else f'oh {seconds}' if seconds.startswith('0') else seconds}"
- else:
- return seconds
-
-def _expand_dollars(m):
- match = m.group(1)
- parts = match.split('.')
- if len(parts) > 2:
- return match + ' dollars' # Unexpected format
- dollars = int(parts[0]) if parts[0] else 0
- cents = int(parts[1]) if len(parts) > 1 and parts[1] else 0
- if dollars and cents:
- dollar_unit = 'dollar' if dollars == 1 else 'dollars'
- cent_unit = 'cent' if cents == 1 else 'cents'
- return '%s %s, %s %s' % (dollars, dollar_unit, cents, cent_unit)
- elif dollars:
- dollar_unit = 'dollar' if dollars == 1 else 'dollars'
- return '%s %s' % (dollars, dollar_unit)
- elif cents:
- cent_unit = 'cent' if cents == 1 else 'cents'
- return '%s %s' % (cents, cent_unit)
- else:
- return 'zero dollars'
-
-def _expand_decimal_point(m):
- match = m.group(1)
- match = match.split('.')
- return match[0] + ' point ' + ' point '.join(' '.join(list(match[i])) for i in range(1, len(match)))
-
-def _expand_fraction(m):
- match = m.group(1)
- match = match.split('/')
- return ' over '.join(match) if len(match)==2 else ' slash '.join(match)
-
-def _expand_multiply(m):
- return ' times '.join(m.group(1).split('*'))
-
-def _expand_divide(m):
- return ' over '.join(m.group(1).split('/'))
-
-def _expand_add(m):
- return ' plus '.join(m.group(1).split('+'))
-
-def _expand_subtract(m):
- return ' minus '.join(m.group(1).split('-'))
-
-def _expand_ordinal(m):
- return _inflect.number_to_words(m.group(0), andword='')
-
-def _expand_number(m):
- num = int(m.group(0))
- if num > 1000 and num < 3000:
- if num == 2000:
- return 'two thousand'
- elif num > 2000 and num < 2010:
- return 'two thousand ' + _inflect.number_to_words(num % 100)
- elif num % 100 == 0:
- return _inflect.number_to_words(num // 100) + ' hundred'
- else:
- return _inflect.number_to_words(num, andword='', zero='oh', group=2).replace(', ', ' ')
- else:
- return _inflect.number_to_words(num, andword='')
-
-def normalize_numbers(text):
- text = re.sub(_num_prefix_re, _expand_num_prefix, text)
- text = re.sub(_num_suffix_re, _expand_num_suffix, text)
- text = re.sub(_comma_number_re, _remove_commas, text)
- text = re.sub(_date_re, _expand_date, text)
- text = re.sub(_phone_number_re, _expand_phone_number, text)
- text = re.sub(_time_re, _expand_time, text)
- text = re.sub(_pounds_re, r'\1 pounds', text)
- text = re.sub(_dollars_re, _expand_dollars, text)
- text = re.sub(_decimal_number_re, _expand_decimal_point, text)
- text = re.sub(_multiply_re, _expand_multiply, text)
- text = re.sub(_divide_re, _expand_divide, text)
- text = re.sub(_add_re, _expand_add, text)
- text = re.sub(_subtract_re, _expand_subtract, text)
-
- text = re.sub(_fraction_re, _expand_fraction, text)
- text = re.sub(_ordinal_re, _expand_ordinal, text)
- for _ in range(2): # need to do this twice to find all matches
- text = re.sub(_num_letter_split_re, _split_alphanumeric, text)
- text = re.sub(_number_re, _expand_number, text)
- return text
-
-####################################################################################################
-# Special characters & other patterns
-
-_special_characters = [(re.compile(x[0]), x[1]) for x in [
- ('@', ' at '),
- ('&', ' and '),
- ('%', ' percent '),
- (':', '.'),
- (';', ','),
- (r'\+', ' plus '),
- (r'\\', ' backslash '),
- ('~', ' about '),
- ('(^| )<3', ' heart '),
- ('<=', ' less than or equal to '),
- ('>=', ' greater than or equal to '),
- ('<', ' less than '),
- ('>', ' greater than '),
- ('=', ' equals '),
- ('/', ' slash '),
- ('_', ' '),
- (r'\*', ' '),
-]]
-_link_header_re = re.compile(r'(https?://)')
-_dash_re = re.compile(r'(. - .)')
-_dot_re = re.compile(r'([A-Z]\.[A-Z])', re.IGNORECASE)
-_parentheses_re = re.compile(r'[\(\[\{].*[\)\]\}](.|$)')
-
-def expand_special_characters(text):
- for regex, replacement in _special_characters:
- text = re.sub(regex, replacement, text)
- return text
-
-def _expand_link_header(m):
- return 'h t t p s colon slash slash '
-
-def _expand_dash(m):
- match = m.group(0)
- return f"{match[0]}, {match[4]}"
-
-def _expand_dot(m):
- match = m.group(0)
- return f"{match[0]} dot {match[2]}"
-
-def _expand_parantheses(m):
- match = m.group(0)
- match = re.sub(r'[\(\[\{]', ', ', match)
- match = re.sub(r'[\)\]\}][^$.!?,]', ', ', match)
- match = re.sub(r'[\)\]\}]', '', match)
- return match
-
-def normalize_special(text):
- text = re.sub(_link_header_re, _expand_link_header, text)
- text = re.sub(_dash_re, _expand_dash, text)
- text = re.sub(_dot_re, _expand_dot, text)
- text = re.sub(_parentheses_re, _expand_parantheses, text)
- return text
-
-####################################################################################################
-# Misc
-
-def lowercase(text):
- return text.lower()
-
-def convert_to_ascii(text):
- return unidecode(text)
-
-def normalize_newlines(text):
- text = text.split('\n')
- for i in range(len(text)):
- text[i] = text[i].strip()
- if not text[i]: continue
- if text[i][-1] not in '.!?':
- text[i] = f"{text[i]}."
- return ' '.join(text)
-
-def remove_unknown_characters(text):
- text = re.sub(r"[^A-Za-z !\$%&'\*\+,-./0123456789<>\?_]", "", text)
- text = re.sub(r"[<>/_+]", "", text)
- return text
-
-def collapse_whitespace(text):
- text = re.sub(r'\s+', ' ', text)
- text = re.sub(r' [.\?!,]', lambda m: m.group(0)[1], text)
- return text.strip()
-
-def dedup_punctuation(text):
- text = re.sub(r"\.\.\.+", "[ELLIPSIS]", text)
- text = re.sub(r",+", ",", text)
- text = re.sub(r"[\.,]*\.[\.,]*", ".", text)
- text = re.sub(r"[\.,!]*![\.,!]*", "!", text)
- text = re.sub(r"[\.,!\?]*\?[\.,!\?]*", "?", text)
- text = re.sub(r"\[ELLIPSIS\]", "...", text)
- return text
-
-def clean_text(text):
- text = convert_to_ascii(text)
- text = normalize_newlines(text)
- text = normalize_numbers(text)
- text = normalize_special(text)
- text = expand_abbreviations(text)
- text = expand_special_characters(text)
- text = lowercase(text)
- text = remove_unknown_characters(text)
- text = collapse_whitespace(text)
- text = dedup_punctuation(text)
- return text
-
-
-if __name__ == '__main__':
- print(clean_text('1,2,3,456,176'))
- print(clean_text('123,456,789'))
- print(clean_text('123,456,789th'))
- print(clean_text('123-456-7890'))
- print(clean_text('111-111-1111'))
- print(clean_text('(111) 111-1111'))
- print(clean_text('A(111) 111-1111'))
- print(clean_text('A (111) 111-1111'))
- print(clean_text('$2.47'))
- print(clean_text('$247'))
- print(clean_text('$0.27'))
- print(clean_text('$1.00'))
- print(clean_text('£20'))
- for i in range(1990, 2030):
- print(clean_text(str(i)))
- print(clean_text('2656'))
- print(clean_text('1024'))
- print(clean_text('2.47023'))
- print(clean_text('20.47023'))
- print(clean_text('1.17.1.1'))
- print(clean_text('111.111.1111'))
- print(clean_text('1/1/2025'))
- print(clean_text('1-1-2025'))
- print(clean_text('1-1-25'))
- print(clean_text('A 1/1/11 A'))
- print(clean_text('A 1/1 A'))
- print(clean_text('1/1'))
- print(clean_text('1/10'))
- print(clean_text('1/1/10'))
- print(clean_text('11/1/1/10'))
-
- print(clean_text('0:00'))
- print(clean_text('12:00'))
- print(clean_text('13:00'))
- print(clean_text('8:00'))
- print(clean_text('8:05'))
- print(clean_text('8:15'))
- print(clean_text('0:00:00'))
- print(clean_text('00:01:10'))
- print(clean_text('00:10:01'))
- print(clean_text('01:01:01'))
- print(clean_text('00:01:00'))
- print(clean_text('01:00:00'))
-
- print(clean_text('-1 + 2 * 3 - 4 / 5'))
- print(clean_text('-1+2*3-5/4/25'))
-
- print(clean_text('100x1'))
- print(clean_text('100k'))
- print(clean_text('100m'))
- print(clean_text('100b'))
- print(clean_text('100t'))
-
- print(clean_text('#1'))
-
- print(clean_text('12:00'))
- print(clean_text('11:59'))
- print(clean_text('01:00'))
- print(clean_text('0100'))
-
- print(clean_text('1st 2nd 3rd 4th'))
- print(clean_text('1K 1M 1B 1T 1K1M1B1T'))
- print(clean_text('and/or'))
diff --git a/soprano/vocos/decoder.py b/soprano/vocos/decoder.py
deleted file mode 100644
index 75d506a..0000000
--- a/soprano/vocos/decoder.py
+++ /dev/null
@@ -1,45 +0,0 @@
-import torch
-from torch import nn
-
-from .models import VocosBackbone
-from .heads import ISTFTHead
-
-
-class SopranoDecoder(nn.Module):
- def __init__(self,
- num_input_channels=512,
- decoder_num_layers=8,
- decoder_dim=512,
- decoder_intermediate_dim=None,
- hop_length=512,
- n_fft=2048,
- upscale=4,
- dw_kernel=3,
- ):
- super().__init__()
- self.decoder_initial_channels = num_input_channels
- self.num_layers = decoder_num_layers
- self.dim = decoder_dim
- self.intermediate_dim = decoder_intermediate_dim if decoder_intermediate_dim else decoder_dim*3
- self.hop_length = hop_length
- self.n_fft = n_fft
- self.upscale = upscale
- self.dw_kernel = dw_kernel
-
- self.decoder = VocosBackbone(input_channels=self.decoder_initial_channels,
- dim=self.dim,
- intermediate_dim=self.intermediate_dim,
- num_layers=self.num_layers,
- input_kernel_size=dw_kernel,
- dw_kernel_size=dw_kernel,
- )
- self.head = ISTFTHead(dim=self.dim,
- n_fft=self.n_fft,
- hop_length=self.hop_length)
-
- def forward(self, x):
- T = x.size(2)
- x = torch.nn.functional.interpolate(x, size=self.upscale*(T-1)+1, mode='linear', align_corners=True)
- x = self.decoder(x)
- reconstructed = self.head(x)
- return reconstructed
diff --git a/soprano/vocos/heads.py b/soprano/vocos/heads.py
deleted file mode 100644
index 5b9e15c..0000000
--- a/soprano/vocos/heads.py
+++ /dev/null
@@ -1,50 +0,0 @@
-import torch
-from torch import nn
-from .spectral_ops import ISTFT
-
-
-class ISTFTHead(nn.Module):
- """
- ISTFT Head module for predicting STFT complex coefficients.
-
- Args:
- dim (int): Hidden dimension of the model.
- n_fft (int): Size of Fourier transform.
- hop_length (int): The distance between neighboring sliding window frames, which should align with
- the resolution of the input features.
- padding (str, optional): Type of padding. Options are "center" or "same". Defaults to "same".
- """
-
- def __init__(self, dim: int, n_fft: int, hop_length: int, padding: str = "center"):
- super().__init__()
- out_dim = n_fft + 2
- self.out = torch.nn.Linear(dim, out_dim)
- self.istft = ISTFT(n_fft=n_fft, hop_length=hop_length, win_length=n_fft, padding=padding)
-
- @torch.compiler.disable
- def forward(self, x: torch.Tensor) -> torch.Tensor:
- """
- Forward pass of the ISTFTHead module.
-
- Args:
- x (Tensor): Input tensor of shape (B, L, H), where B is the batch size,
- L is the sequence length, and H denotes the model dimension.
-
- Returns:
- Tensor: Reconstructed time-domain audio signal of shape (B, T), where T is the length of the output signal.
- """
- x = self.out(x.transpose(1,2)).transpose(1, 2)
- mag, p = x.chunk(2, dim=1)
- mag = torch.exp(mag)
- mag = torch.clip(mag, max=1e2) # safeguard to prevent excessively large magnitudes
- # wrapping happens here. These two lines produce real and imaginary value
- x = torch.cos(p)
- y = torch.sin(p)
- # recalculating phase here does not produce anything new
- # only costs time
- # phase = torch.atan2(y, x)
- # S = mag * torch.exp(phase * 1j)
- # better directly produce the complex value
- S = mag * (x + 1j * y)
- audio = self.istft(S)
- return audio
diff --git a/soprano/vocos/models.py b/soprano/vocos/models.py
deleted file mode 100644
index 458d815..0000000
--- a/soprano/vocos/models.py
+++ /dev/null
@@ -1,61 +0,0 @@
-from typing import Optional
-
-import torch
-from torch import nn
-
-from .modules import ConvNeXtBlock
-
-class VocosBackbone(nn.Module):
- """
- Vocos backbone module built with ConvNeXt blocks. Supports additional conditioning with Adaptive Layer Normalization
-
- Args:
- input_channels (int): Number of input features channels.
- dim (int): Hidden dimension of the model.
- intermediate_dim (int): Intermediate dimension used in ConvNeXtBlock.
- num_layers (int): Number of ConvNeXtBlock layers.
- layer_scale_init_value (float, optional): Initial value for layer scaling. Defaults to `1 / num_layers`.
- """
-
- def __init__(
- self,
- input_channels: int,
- dim: int,
- intermediate_dim: int,
- num_layers: int,
- input_kernel_size: int = 9,
- dw_kernel_size: int = 9,
- layer_scale_init_value: Optional[float] = None,
- pad: str = 'zeros',
- ):
- super().__init__()
- self.embed = nn.Conv1d(input_channels, dim, kernel_size=input_kernel_size, padding=input_kernel_size//2, padding_mode=pad)
- self.norm = nn.LayerNorm(dim, eps=1e-6)
- self.convnext = nn.ModuleList(
- [
- ConvNeXtBlock(
- dim=dim,
- intermediate_dim=intermediate_dim,
- dw_kernel_size=dw_kernel_size,
- layer_scale_init_value=layer_scale_init_value or 1 / num_layers**0.5,
- )
- for _ in range(num_layers)
- ]
- )
- self.final_layer_norm = nn.LayerNorm(dim, eps=1e-6)
- self.apply(self._init_weights)
-
- def _init_weights(self, m):
- if isinstance(m, (nn.Conv1d, nn.Linear)):
- nn.init.trunc_normal_(m.weight, std=0.02)
- if m.bias is not None: nn.init.constant_(m.bias, 0)
-
- def forward(self, x: torch.Tensor) -> torch.Tensor:
- x = self.embed(x) # (B, C, L)
- x = self.norm(x.transpose(1, 2))
- x = x.transpose(1, 2)
- for conv_block in self.convnext:
- x = conv_block(x)
- x = self.final_layer_norm(x.transpose(1, 2))
- x = x.transpose(1, 2)
- return x
diff --git a/soprano/vocos/modules.py b/soprano/vocos/modules.py
deleted file mode 100644
index f969d4f..0000000
--- a/soprano/vocos/modules.py
+++ /dev/null
@@ -1,47 +0,0 @@
-import torch
-from torch import nn
-
-
-class ConvNeXtBlock(nn.Module):
- """ConvNeXt Block adapted from https://github.com/facebookresearch/ConvNeXt to 1D audio signal.
-
- Args:
- dim (int): Number of input channels.
- intermediate_dim (int): Dimensionality of the intermediate layer.
- layer_scale_init_value (float, optional): Initial value for the layer scale. None means no scaling.
- Defaults to None.
- """
-
- def __init__(
- self,
- dim: int,
- intermediate_dim: int,
- layer_scale_init_value: float,
- dw_kernel_size: int = 9,
- ):
- super().__init__()
- self.dwconv = nn.Conv1d(dim, dim, kernel_size=dw_kernel_size, padding=dw_kernel_size//2, groups=dim) # depthwise conv
- self.norm = nn.LayerNorm(dim, eps=1e-6)
- self.pwconv1 = nn.Linear(dim, intermediate_dim) # pointwise/1x1 convs, implemented with linear layers
- self.act = nn.GELU()
- self.pwconv2 = nn.Linear(intermediate_dim, dim)
- self.gamma = (
- nn.Parameter(layer_scale_init_value * torch.ones(dim), requires_grad=True)
- if layer_scale_init_value > 0
- else None
- )
-
- def forward(self, x: torch.Tensor) -> torch.Tensor:
- residual = x
- x = self.dwconv(x)
- x = x.transpose(1, 2) # (B, C, T) -> (B, T, C)
- x = self.norm(x)
- x = self.pwconv1(x)
- x = self.act(x)
- x = self.pwconv2(x)
- if self.gamma is not None:
- x = self.gamma * x
- x = x.transpose(1, 2) # (B, T, C) -> (B, C, T)
-
- x = residual + x
- return x
diff --git a/soprano/vocos/spectral_ops.py b/soprano/vocos/spectral_ops.py
deleted file mode 100644
index 8a38cb8..0000000
--- a/soprano/vocos/spectral_ops.py
+++ /dev/null
@@ -1,74 +0,0 @@
-import torch
-from torch import nn
-
-class ISTFT(nn.Module):
- """
- Custom implementation of ISTFT since torch.istft doesn't allow custom padding (other than `center=True`) with
- windowing. This is because the NOLA (Nonzero Overlap Add) check fails at the edges.
- See issue: https://github.com/pytorch/pytorch/issues/62323
- Specifically, in the context of neural vocoding we are interested in "same" padding analogous to CNNs.
- The NOLA constraint is met as we trim padded samples anyway.
-
- Args:
- n_fft (int): Size of Fourier transform.
- hop_length (int): The distance between neighboring sliding window frames.
- win_length (int): The size of window frame and STFT filter.
- padding (str, optional): Type of padding. Options are "center" or "same". Defaults to "same".
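-
-    Example (illustrative; with "same" padding the output length is T * hop_length):
-        >>> istft = ISTFT(n_fft=1024, hop_length=256, win_length=1024)
-        >>> spec = torch.randn(2, 513, 100, dtype=torch.complex64)
-        >>> audio = istft(spec)  # shape (2, 25600), i.e. (B, T * hop_length)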
- """
-
- def __init__(self, n_fft: int, hop_length: int, win_length: int, padding: str = "same"):
- super().__init__()
- if padding not in ["center", "same"]:
- raise ValueError("Padding must be 'center' or 'same'.")
- self.padding = padding
- self.n_fft = n_fft
- self.hop_length = hop_length
- self.win_length = win_length
- window = torch.hann_window(win_length)
- self.register_buffer("window", window)
-
- def forward(self, spec: torch.Tensor) -> torch.Tensor:
- """
- Compute the Inverse Short Time Fourier Transform (ISTFT) of a complex spectrogram.
-
- Args:
- spec (Tensor): Input complex spectrogram of shape (B, N, T), where B is the batch size,
- N is the number of frequency bins, and T is the number of time frames.
-
- Returns:
- Tensor: Reconstructed time-domain signal of shape (B, L), where L is the length of the output signal.
- """
- if self.padding == "center":
-            # Zero the first and last frequency bins: works around an odd bug where
-            # these bins have no effect at batch sizes < 16, causing exploding gradients.
-            spec[:, 0] = 0
-            spec[:, -1] = 0
- # Fallback to pytorch native implementation
- return torch.istft(spec, self.n_fft, self.hop_length, self.win_length, self.window, center=True)
- elif self.padding == "same":
- pad = (self.win_length - self.hop_length) // 2
- else:
- raise ValueError("Padding must be 'center' or 'same'.")
-
- assert spec.dim() == 3, "Expected a 3D tensor as input"
- B, N, T = spec.shape
-
- # Inverse FFT
- ifft = torch.fft.irfft(spec, self.n_fft, dim=1, norm="backward")
- ifft = ifft * self.window[None, :, None]
-
- # Overlap and Add
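-        # fold() treats each windowed frame as a (1, win_length) patch placed at
-        # stride hop_length and sums the overlaps, i.e. classic overlap-add.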
- output_size = (T - 1) * self.hop_length + self.win_length
- y = torch.nn.functional.fold(
- ifft, output_size=(1, output_size), kernel_size=(1, self.win_length), stride=(1, self.hop_length),
- )[:, 0, 0, pad:-pad]
-
- # Window envelope
- window_sq = self.window.square().expand(1, T, -1).transpose(1, 2)
- window_envelope = torch.nn.functional.fold(
- window_sq, output_size=(1, output_size), kernel_size=(1, self.win_length), stride=(1, self.hop_length),
- ).squeeze()[pad:-pad]
-
- # Normalize
- assert (window_envelope > 1e-11).all()
- y = y / window_envelope
-
- return y
diff --git a/soprano/webui.py b/soprano/webui.py
index 4c4480b..29fa896 100644
--- a/soprano/webui.py
+++ b/soprano/webui.py
@@ -1,198 +1,519 @@
-#!/usr/bin/env python3
-"""
-Gradio Web Interface for Soprano TTS
-"""
-
-import gradio as gr
-import torch
-from soprano import SopranoTTS
-import numpy as np
-import socket
-import time
-
-# Detect device
-DEVICE = "cuda" if torch.cuda.is_available() else "cpu"
-
-# Initialize model
-print("Loading Soprano TTS model...")
-model = SopranoTTS(
- backend="auto",
- device=DEVICE,
- cache_size_mb=100,
- decoder_batch_size=1,
-)
-print("Model loaded successfully!")
-
-SAMPLE_RATE = 32000
-
-
-def generate_speech(
- text: str,
- temperature: float,
- top_p: float,
- repetition_penalty: float,
-) -> tuple:
- if not text.strip():
- return None, "Please enter some text to generate speech."
-
- try:
- start_time = time.perf_counter()
-
- audio = model.infer(
- text,
- temperature=temperature,
- top_p=top_p,
- repetition_penalty=repetition_penalty,
- )
-
- gen_time = time.perf_counter() - start_time
-
- audio_np = audio.cpu().numpy()
- audio_int16 = (audio_np * 32767).astype(np.int16)
-
- audio_seconds = len(audio_np) / SAMPLE_RATE
- rtf = audio_seconds / gen_time if gen_time > 0 else float("inf")
-
- status = (
- f"✓ Generated {audio_seconds:.2f} s audio | "
- f"Generation time: {gen_time:.3f} s "
- f"({rtf:.2f}x realtime)"
- )
-
- return (SAMPLE_RATE, audio_int16), status
-
- except Exception as e:
- return None, f"✗ Error: {str(e)}"
-
-
-# Create Gradio interface
-with gr.Blocks(title="Soprano TTS") as demo:
-
- gr.Markdown(
- f"""
-# 🎵 Soprano TTS
-
-**Running on: {DEVICE.upper()}**
-
-Soprano is an ultra-lightweight, open-source text-to-speech (TTS) model designed for real-time,
-high-fidelity speech synthesis at unprecedented speed. Soprano can achieve **<15 ms streaming latency**
-and up to **2000x real-time generation**, all while being easy to deploy at **<1 GB VRAM usage**.
-
-**GitHub:** https://github.com/ekwek1/soprano
-**Model Demo:** https://huggingface.co/spaces/ekwek/Soprano-TTS
-**Model Weights:** https://huggingface.co/ekwek/Soprano-80M
-"""
- )
-
- with gr.Row():
- with gr.Column(scale=2):
- text_input = gr.Textbox(
- label="Text to Synthesize",
- placeholder="Enter text here...",
- value="Soprano is an extremely lightweight text to speech model designed to produce highly realistic speech at unprecedented speed.",
- lines=5,
- max_lines=10,
- )
-
- with gr.Accordion("Advanced Settings", open=False):
- temperature = gr.Slider(
- minimum=0.1,
- maximum=1.5,
- value=0.3,
- step=0.05,
- label="Temperature",
- )
-
- top_p = gr.Slider(
- minimum=0.5,
- maximum=1.0,
- value=0.95,
- step=0.05,
- label="Top P",
- )
-
- repetition_penalty = gr.Slider(
- minimum=1.0,
- maximum=2.0,
- value=1.2,
- step=0.1,
- label="Repetition Penalty",
- )
-
- generate_btn = gr.Button("Generate Speech", variant="primary", size="lg")
-
- with gr.Column(scale=1):
- audio_output = gr.Audio(
- label="Generated Speech",
- type="numpy",
- autoplay=True,
- )
-
- status_output = gr.Textbox(
- label="Status",
- interactive=False,
- lines=3,
- max_lines=10
- )
-
- gr.Examples(
- examples=[
- ["Soprano is an extremely lightweight text to speech model.", 0.3, 0.95, 1.2],
- ["Hello! Welcome to Soprano text to speech.", 0.3, 0.95, 1.2],
- ["The quick brown fox jumps over the lazy dog.", 0.3, 0.95, 1.2],
- ["Artificial intelligence is transforming the world.", 0.5, 0.90, 1.2],
- ],
- inputs=[text_input, temperature, top_p, repetition_penalty],
- label="Example Prompts",
- )
-
- generate_btn.click(
- fn=generate_speech,
- inputs=[text_input, temperature, top_p, repetition_penalty],
- outputs=[audio_output, status_output],
- )
- gr.Markdown(
- f"""
-### Usage tips:
-
-- Soprano works best when each sentence is between 2 and 15 seconds long.
-- Although Soprano recognizes numbers and some special characters, it occasionally mispronounces them.
- Best results can be achieved by converting these into their phonetic form.
- (1+1 -> one plus one, etc)
-- If Soprano produces unsatisfactory results, you can easily regenerate it for a new, potentially better generation.
- You may also change the sampling settings for more varied results.
-- Avoid improper grammar such as not using contractions, multiple spaces, etc.
-"""
- )
-
-
-def find_free_port(start_port=7860, max_tries=100):
- for port in range(start_port, start_port + max_tries):
- try:
- with socket.socket(socket.AF_INET, socket.SOCK_STREAM) as s:
- s.bind(("", port))
- return port
- except OSError:
- continue
- raise OSError("Could not find a free port")
-
-def main():
- port = find_free_port(7860)
- print(f"Starting Gradio interface on port {port}")
- demo.launch(
- server_name="127.0.0.1",
- server_port=port,
- share=False,
- theme=gr.themes.Soft(primary_hue="green"),
- css="""
-a {
- color: var(--primary-600);
-}
-a:hover {
- color: var(--primary-700);
-}
-"""
- )
-
-if __name__ == "__main__":
- main()
\ No newline at end of file
+#!/usr/bin/env python3
+"""
+Gradio Web Interface for Soprano TTS
+"""
+
+import gradio as gr
+import torch
+import sys
+import os
+import asyncio
+import logging
+import numpy as np
+import socket
+import time
+import threading
+import traceback
+
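+# Make `soprano` importable when this file is run directly from a source checkout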
+sys.path.insert(0, os.path.dirname(os.path.dirname(os.path.abspath(__file__))))
+
+from soprano import SopranoTTS
+
+logging.getLogger('asyncio').setLevel(logging.CRITICAL)
+
+def custom_exception_handler(loop, context):
+    exception = context.get('exception')
+
+    # Silently drop the ConnectionResetError Windows raises when a client
+    # disconnects mid-response; report everything else as usual.
+    if isinstance(exception, ConnectionResetError) and "forcibly closed" in str(exception):
+        return
+
+    if exception:
+        print(f"AsyncIO Exception: {context.get('message')}")
+        traceback.print_exception(type(exception), exception, exception.__traceback__)
+
+    loop.default_exception_handler(context)
+
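+# On Windows, install the handler above so that browser tabs disconnecting
+# mid-response do not spam the console with ConnectionResetError tracebacks.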
+if sys.platform.startswith("win"):
+ try:
+ loop = asyncio.get_running_loop()
+ loop.set_exception_handler(custom_exception_handler)
+ except RuntimeError:
+ loop = asyncio.new_event_loop()
+ loop.set_exception_handler(custom_exception_handler)
+ asyncio.set_event_loop(loop)
+
+try:
+ import pyaudio
+ PYAUDIO_AVAILABLE = True
+except ImportError:
+ PYAUDIO_AVAILABLE = False
+ print("PyAudio not found. Install it with 'pip install pyaudio' for real-time audio streaming.")
+
+current_stream = None
+current_pyaudio_instance = None
+stream_lock = threading.Lock()
+
+DEVICE = "cuda" if torch.cuda.is_available() else "cpu"
+SAMPLE_RATE = 32000
+
+print("Loading Soprano TTS model...")
+model = SopranoTTS(
+ backend="auto",
+ device=DEVICE,
+ cache_size_mb=100,
+ decoder_batch_size=1,
+)
+print("Model loaded successfully!")
+
+
+async def generate_speech(
+ text: str,
+ temperature: float,
+ top_p: float,
+ repetition_penalty: float,
+) -> tuple:
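+    """Synthesize `text` and return ((SAMPLE_RATE, int16 audio), status string)."""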
+ if not text.strip():
+ return None, "Please enter some text to generate speech."
+
+ try:
+ start_time = time.perf_counter()
+
+ audio = model.infer(
+ text,
+ temperature=temperature,
+ top_p=top_p,
+ repetition_penalty=repetition_penalty,
+ )
+
+ gen_time = time.perf_counter() - start_time
+
+ audio_np = audio.cpu().numpy()
+
+ if audio_np.size == 0:
+ return None, "✗ Error: Generated audio is empty."
+
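+        # Peak-normalize only when the waveform exceeds full scale so the
+        # int16 conversion below cannot overflow and wrap around.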
+ max_val = np.max(np.abs(audio_np))
+ if max_val > 1.0:
+ audio_np = audio_np / max_val
+
+ audio_int16 = (audio_np * 32767).astype(np.int16)
+
+ audio_seconds = len(audio_np) / SAMPLE_RATE
+ rtf = audio_seconds / gen_time if gen_time > 0 else float("inf")
+
+ status = (
+ f"✓ Generated {audio_seconds:.2f} s audio | "
+ f"Generation time: {gen_time:.3f} s "
+ f"({rtf:.2f}x realtime)"
+ )
+
+ return (SAMPLE_RATE, audio_int16), status
+
+ except ConnectionResetError:
+ return None, "✗ Connection error during generation. Please try again."
+ except Exception as e:
+ return None, f"✗ Error: {str(e)}"
+
+
+class AudioStreamer:
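+    """Wraps a PyAudio output stream and plays mono int16 chunks as they arrive."""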
+
+ def __init__(self):
+ self.stream = None
+ self.pyaudio_instance = None
+ self.is_playing = False
+
+ def cleanup(self):
+ if self.stream:
+ try:
+ if hasattr(self.stream, 'is_active') and self.stream.is_active():
+ self.stream.stop_stream()
+ self.stream.close()
+ except Exception:
+ pass
+ self.stream = None
+
+ if self.pyaudio_instance:
+ try:
+ self.pyaudio_instance.terminate()
+ except Exception:
+ pass
+ self.pyaudio_instance = None
+
+ self.is_playing = False
+
+ def play_audio_chunk(self, audio_chunk):
+ if self.stream and self.is_playing:
+ audio_np = audio_chunk.cpu().numpy()
+
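+            # 3-point moving average over the chunk interior (edge samples are
+            # kept) to soften clicks at the boundaries between streamed chunks.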
+ if len(audio_np) > 1:
+ if len(audio_np) == 2:
+ smoothed = np.array([audio_np[0], audio_np[1]])
+ elif len(audio_np) == 3:
+ smoothed = np.array([
+ audio_np[0],
+ (audio_np[0] + audio_np[1] + audio_np[2]) / 3,
+ audio_np[2]
+ ])
+ else:
+ smoothed = np.zeros_like(audio_np)
+ smoothed[0] = audio_np[0]
+ smoothed[-1] = audio_np[-1]
+ if len(audio_np) > 2:
+ smoothed[1:-1] = (audio_np[:-2] + audio_np[1:-1] + audio_np[2:]) / 3
+ audio_np = smoothed
+
+ audio_np = np.clip(audio_np, -1.0, 1.0)
+
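+            # tanh acts as a soft limiter on the already-clipped samples before
+            # the int16 conversion, at the cost of a little headroom.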
+ audio_int16 = (np.tanh(audio_np) * 32767).astype(np.int16)
+
+ self.stream.write(audio_int16.tobytes())
+ return len(audio_int16)
+
+ return 0
+
+
+async def speak_realtime(
+ text: str,
+ temperature: float,
+ top_p: float,
+ repetition_penalty: float,
+) -> str:
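+    """Stream synthesized speech for `text` straight to the sound card; returns a status string."""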
+ global current_stream, current_pyaudio_instance
+
+ if not text.strip():
+ return "Please enter some text to speak."
+
+ if not PYAUDIO_AVAILABLE:
+ return "PyAudio is not available. Install it with 'pip install pyaudio' for real-time audio streaming."
+
+ with stream_lock:
+ audio_streamer = AudioStreamer()
+
+ current_stream = None
+ current_pyaudio_instance = None
+
+ try:
+ audio_streamer.pyaudio_instance = pyaudio.PyAudio()
+
+ audio_streamer.stream = audio_streamer.pyaudio_instance.open(
+ format=pyaudio.paInt16,
+ channels=1,
+ rate=SAMPLE_RATE,
+ output=True,
+ frames_per_buffer=2048 # Smaller buffer for more responsive streaming
+ )
+ audio_streamer.is_playing = True
+
+ current_stream = audio_streamer.stream
+ current_pyaudio_instance = audio_streamer.pyaudio_instance
+
+ start_time = time.perf_counter()
+
+        # Real-time streaming: infer_stream yields audio in small chunks,
+        # so playback can begin before the full text has been processed
+ stream_gen = model.infer_stream(
+ text,
+ chunk_size=1,
+ temperature=temperature,
+ top_p=top_p,
+ repetition_penalty=repetition_penalty,
+ )
+
+ total_samples = 0
+
+        # Play each chunk back as soon as it is produced, while the rest of
+        # the text is still being synthesized
+ for audio_chunk in stream_gen:
+            # Stop early if playback was cancelled (e.g. Clear closed the stream)
+ if (not audio_streamer.is_playing or
+ not (hasattr(audio_streamer.stream, 'is_active') and audio_streamer.stream.is_active())):
+ break
+
+ # Stream and play audio chunks as they become available
+ samples_written = audio_streamer.play_audio_chunk(audio_chunk)
+ total_samples += samples_written
+
+ audio_streamer.cleanup()
+
+ current_stream = None
+ current_pyaudio_instance = None
+
+ gen_time = time.perf_counter() - start_time
+ audio_seconds = total_samples / SAMPLE_RATE
+ rtf = audio_seconds / gen_time if gen_time > 0 else float("inf")
+
+ status = (
+ f"✓ Finished speaking {audio_seconds:.2f} s audio | "
+ f"Playback time: {gen_time:.3f} s "
+ f"({rtf:.2f}x realtime)"
+ )
+
+ return status
+
+ except ConnectionResetError:
+ audio_streamer.cleanup()
+
+ current_stream = None
+ current_pyaudio_instance = None
+
+ return "✗ Connection error during real-time playback. Please try again."
+ except Exception as e:
+ audio_streamer.cleanup()
+
+ current_stream = None
+ current_pyaudio_instance = None
+
+ return f"✗ Error during real-time playback: {str(e)}"
+
+
+def create_gradio_interface():
+    # theme and css belong on gr.Blocks itself; launch() does not accept them
+    with gr.Blocks(
+        title="Soprano TTS",
+        theme=gr.themes.Soft(primary_hue="green"),
+        css="a { color: var(--primary-600); } a:hover { color: var(--primary-700); }",
+    ) as demo:
+ active_function = gr.State(value="ready")
+
+ gr.Markdown(
+ f"""
+ # 🎵 Soprano TTS
+
+ **Running on: {DEVICE.upper()}**
+
+ Soprano is an ultra-lightweight, open-source text-to-speech (TTS) model designed for real-time,
+ high-fidelity speech synthesis at unprecedented speed. Soprano can achieve **<15 ms streaming latency**
+ and up to **2000x real-time generation**, all while being easy to deploy at **<1 GB VRAM usage**.
+ """
+ )
+
+ with gr.Row():
+ with gr.Column(scale=2):
+ text_input = gr.Textbox(
+ label="Text to Synthesize",
+ placeholder="Enter text here...",
+ value="Soprano is an extremely lightweight text to speech model designed to produce highly realistic speech at unprecedented speed.",
+ lines=5,
+ max_lines=10,
+ )
+
+ with gr.Accordion("Advanced Settings", open=False):
+ temperature = gr.Slider(
+ minimum=0.1,
+ maximum=1.5,
+ value=0.3,
+ step=0.05,
+ label="Temperature",
+ )
+
+ top_p = gr.Slider(
+ minimum=0.5,
+ maximum=1.0,
+ value=0.95,
+ step=0.05,
+ label="Top P",
+ )
+
+ repetition_penalty = gr.Slider(
+ minimum=1.0,
+ maximum=2.0,
+ value=1.2,
+ step=0.1,
+ label="Repetition Penalty",
+ )
+
+ with gr.Row():
+ generate_btn = gr.Button("Generate Speech", variant="primary", size="lg")
+ speak_btn = gr.Button("Speak", variant="primary", size="lg")
+ clear_btn = gr.Button("Clear", variant="secondary", size="lg")
+
+ with gr.Column(scale=1):
+ audio_output = gr.Audio(
+ label="Generated Speech",
+ type="numpy",
+ autoplay=True,
+ streaming=True
+ )
+
+ status_output = gr.Textbox(
+ label="Status",
+ interactive=False,
+ lines=3,
+ max_lines=10
+ )
+
+ gr.Examples(
+ examples=[
+ ["Soprano is an extremely lightweight text to speech model designed to produce highly realistic speech at unprecedented speed."],
+ ["Hello! Welcome to Soprano text to speech. This is a short example."],
+ ["The quick brown fox jumps over the lazy dog. This sentence contains all letters of the alphabet."],
+ ["Artificial intelligence is transforming the world in ways we never imagined. It's revolutionizing industries and changing how we interact with technology."],
+ ["In a distant future, humanity has colonized the stars. Advanced AI systems govern interstellar travel, ensuring safety and efficiency across vast cosmic distances. Explorers venture into uncharted territories, seeking new worlds and civilizations."],
+ ["To be, or not to be, that is the question: Whether 'tis nobler in the mind to suffer The slings and arrows of outrageous fortune, Or to take arms against a sea of troubles And by opposing end them. To die—to sleep, no more; and by a sleep to say we end The heart-ache and the thousand natural shocks That flesh is heir to: 'tis a consummation Devoutly to be wish'd. To die, to sleep; To sleep, perchance to dream—ay, there's the rub: For in that sleep of death what dreams may come, When we have shuffled off this mortal coil, Must give us pause—there's the respect That makes calamity of so long life."],
+ ],
+ inputs=[text_input],
+ label="Examples",
+ )
+
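+        # A Gradio State gates the buttons: after Generate/Speak completes, the
+        # state holds that operation's name and Clear must be pressed to rearm.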
+ async def check_and_set_active_generate(active_func, *args):
+ if active_func is not None and active_func != "ready":
+ return None, f"Error: Please press Clear first. Current operation: {active_func}", active_func
+ result = await generate_speech(args[0], args[1], args[2], args[3])
+ return result[0], result[1], "generate"
+
+ async def check_and_set_active_speak(active_func, *args):
+ if active_func is not None and active_func != "ready":
+ return f"Error: Please press Clear first. Current operation: {active_func}", active_func
+ result = await speak_realtime(args[0], args[1], args[2], args[3])
+ return result, "speak"
+
+ def clear_active_state():
+ return "ready"
+
+        def clear_inputs(active_func):
+            global current_stream, current_pyaudio_instance
+            with stream_lock:
+ if current_stream:
+ try:
+ if hasattr(current_stream, 'is_active') and current_stream.is_active():
+ current_stream.stop_stream()
+ current_stream.close()
+ except Exception:
+ pass
+ current_stream = None
+ if current_pyaudio_instance:
+ try:
+ current_pyaudio_instance.terminate()
+ except Exception:
+ pass
+ current_pyaudio_instance = None
+ return "", None, "Ready for input...", "ready"
+
+ generate_btn.click(
+ fn=check_and_set_active_generate,
+ inputs=[active_function, text_input, temperature, top_p, repetition_penalty],
+ outputs=[audio_output, status_output, active_function]
+ )
+
+ speak_btn.click(
+ fn=check_and_set_active_speak,
+ inputs=[active_function, text_input, temperature, top_p, repetition_penalty],
+ outputs=[status_output, active_function]
+ )
+
+ clear_btn.click(
+ fn=clear_inputs,
+ inputs=[active_function],
+ outputs=[text_input, audio_output, status_output, active_function]
+ )
+
+ gr.Markdown(
+ """
+ ### Usage tips:
+
+ - Soprano works best when each sentence is between 2 and 15 seconds long.
+ - Although Soprano recognizes numbers and some special characters, it occasionally mispronounces them.
+ Best results can be achieved by converting these into their phonetic form.
+ (1+1 -> one plus one, etc)
+    - If Soprano produces unsatisfactory results, simply regenerate for a new, potentially better take.
+      You may also change the sampling settings for more varied results.
+    - Avoid improper grammar, such as omitting contractions or using multiple spaces.
+ """
+ )
+
+ return demo
+
+
+def find_free_port(start_port=7860, max_tries=100):
+ for port in range(start_port, start_port + max_tries):
+ try:
+ with socket.socket(socket.AF_INET, socket.SOCK_STREAM) as s:
+ s.bind(("", port))
+ return port
+ except OSError:
+ continue
+ raise OSError("Could not find a free port")
+
+
+def main():
+ global current_stream, current_pyaudio_instance
+
+ port = find_free_port(7860)
+ print(f"Starting Gradio interface on port {port}")
+
+ if sys.platform.startswith("win"):
+ try:
+ asyncio.set_event_loop_policy(asyncio.WindowsProactorEventLoopPolicy())
+ print("Set Windows Proactor event loop policy")
+ except Exception as e:
+ print(f"Could not set Windows Proactor event loop policy: {e}")
+
+ demo = create_gradio_interface()
+
+ try:
+        demo.queue(max_size=20).launch(
+            server_name="127.0.0.1",
+            server_port=port,
+            share=False,
+            prevent_thread_lock=False,
+            show_error=True,
+            quiet=False,
+            favicon_path=None,
+            ssl_verify=False,
+            max_threads=40,
+            root_path="",
+        )
+ print("Gradio interface launched successfully")
+ except KeyboardInterrupt:
+ print("\nShutting down gracefully...")
+ with stream_lock:
+ if current_stream:
+ try:
+ if hasattr(current_stream, 'is_active') and current_stream.is_active():
+ current_stream.stop_stream()
+ current_stream.close()
+ except Exception:
+ pass
+ if current_pyaudio_instance:
+ try:
+ current_pyaudio_instance.terminate()
+ except Exception:
+ pass
+ sys.exit(0)
+ except Exception as e:
+ print(f"Error starting Gradio interface: {e}")
+ traceback.print_exc()
+ with stream_lock:
+ if current_stream:
+ try:
+ if hasattr(current_stream, 'is_active') and current_stream.is_active():
+ current_stream.stop_stream()
+ current_stream.close()
+ except Exception:
+ pass
+ if current_pyaudio_instance:
+ try:
+ current_pyaudio_instance.terminate()
+ except Exception:
+ pass
+ sys.exit(1)
+
+if __name__ == "__main__":
+ main()