Skip to content
Open
Show file tree
Hide file tree
Changes from all commits
Commits
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
2 changes: 1 addition & 1 deletion kittentts/__init__.py
Original file line number Diff line number Diff line change
@@ -1,6 +1,6 @@
from kittentts.get_model import get_model, KittenTTS

# Package metadata. Only the current release number is kept; the stale
# "0.1.0" assignment that preceded it was dead (immediately overwritten).
__version__ = "0.8.1"
__author__ = "KittenML"
__description__ = "Ultra-lightweight text-to-speech model with just 15 million parameters"

Expand Down
11 changes: 6 additions & 5 deletions kittentts/get_model.py
Original file line number Diff line number Diff line change
Expand Up @@ -23,29 +23,29 @@ def __init__(self, model_name="KittenML/kitten-tts-nano-0.8", cache_dir=None, ba

self.model = download_from_huggingface(repo_id=repo_id, cache_dir=cache_dir, backend=backend)

def generate(self, text, voice="expr-voice-5-m", speed=1.0, clean_text=True):
    """Generate audio from text.

    Thin wrapper: every argument is forwarded unchanged to
    ``self.model.generate``.

    Args:
        text: Input text to synthesize.
        voice: Voice to use for synthesis.
        speed: Speech speed (1.0 = normal).
        clean_text: If True, preprocess text (expand numbers, etc.).

    Returns:
        Audio data as numpy array (whatever ``self.model.generate`` returns).
    """
    # Echo the input text to stdout before synthesis starts.
    print(f"Generating audio for text: {text}")
    return self.model.generate(text, voice=voice, speed=speed, clean_text=clean_text)

def generate_stream(self, text, voice="expr-voice-5-m", speed=1.0, clean_text=True):
    """Generate audio as a stream of chunks.

    Delegates to ``self.model.generate_stream`` and re-yields each item it
    produces, so nothing runs until the returned generator is iterated.

    Args:
        text: Input text to synthesize.
        voice: Voice to use for synthesis.
        speed: Speech speed (1.0 = normal).
        clean_text: If True, preprocess text (expand numbers, etc.).

    Yields:
        numpy.ndarray: Audio data for each text chunk.
    """
    yield from self.model.generate_stream(text, voice=voice, speed=speed, clean_text=clean_text)

def generate_to_file(self, text, output_path, voice="expr-voice-5-m", speed=1.0, sample_rate=24000, clean_text=True):
    """Generate audio from text and save to file.

    Thin wrapper: every argument is forwarded unchanged to
    ``self.model.generate_to_file``.

    Args:
        text: Input text to synthesize.
        output_path: Destination for the audio file, passed through to the
            model as-is.
        voice: Voice to use for synthesis.
        speed: Speech speed (1.0 = normal).
        sample_rate: Audio sample rate.
        clean_text: If True, preprocess text (expand numbers, etc.).

    Returns:
        Whatever ``self.model.generate_to_file`` returns.
    """
    # clean_text must be forwarded too; a call without it silently ignores
    # the caller's choice.
    return self.model.generate_to_file(
        text,
        output_path,
        voice=voice,
        speed=speed,
        sample_rate=sample_rate,
        clean_text=clean_text,
    )

@property
def available_voices(self):
Expand Down
24 changes: 17 additions & 7 deletions kittentts/onnx_model.py
Original file line number Diff line number Diff line change
Expand Up @@ -26,19 +26,29 @@ def ensure_punctuation(text):


def chunk_text(text, max_len=400):
"""Split text into chunks for processing long texts."""
"""Split text into chunks for processing long texts.

Splits on sentence boundaries while preserving the original punctuation
(periods, exclamation marks, question marks, etc.) so the TTS model can
use them for correct prosody and intonation.
"""
import re

sentences = re.split(r'[.!?]+', text)

# Split into sentences while keeping the delimiter attached to the
# preceding text. The whitespace between sentences is consumed by the
# split. e.g. "Hello world. How are you?" →
# ["Hello world.", "How are you?"]
sentences = re.split(r'(?<=[.!?])\s+', text)
chunks = []

for sentence in sentences:
sentence = sentence.strip()
if not sentence:
continue


sentence = ensure_punctuation(sentence)

if len(sentence) <= max_len:
chunks.append(ensure_punctuation(sentence))
chunks.append(sentence)
else:
# Split long sentences by words
words = sentence.split()
Expand All @@ -52,7 +62,7 @@ def chunk_text(text, max_len=400):
temp_chunk = word
if temp_chunk:
chunks.append(ensure_punctuation(temp_chunk.strip()))

return chunks


Expand Down
Loading