KittenML · voidborne-d · Apr 21, 2026
diff --git a/kittentts/__init__.py b/kittentts/__init__.py
@@ -1,6 +1,6 @@
 from kittentts.get_model import get_model, KittenTTS
 
-__version__ = "0.1.0"
+__version__ = "0.8.1"
 __author__ = "KittenML"
 __description__ = "Ultra-lightweight text-to-speech model with just 15 million parameters"
 

diff --git a/kittentts/get_model.py b/kittentts/get_model.py
@@ -23,29 +23,29 @@ def __init__(self, model_name="KittenML/kitten-tts-nano-0.8", cache_dir=None, ba
 
         self.model = download_from_huggingface(repo_id=repo_id, cache_dir=cache_dir, backend=backend)
 
-    def generate(self, text, voice="expr-voice-5-m", speed=1.0, clean_text=False):
+    def generate(self, text, voice="expr-voice-5-m", speed=1.0, clean_text=True):
         """Generate audio from text.
 
         Args:
             text: Input text to synthesize
             voice: Voice to use for synthesis
             speed: Speech speed (1.0 = normal)
+            clean_text: If True, preprocess text (expand numbers, etc.)
 
         Returns:
             Audio data as numpy array
         """
-        print(f"Generating audio for text: {text}")
         return self.model.generate(text, voice=voice, speed=speed, clean_text=clean_text)
 
-    def generate_stream(self, text, voice="expr-voice-5-m", speed=1.0, clean_text=False):
+    def generate_stream(self, text, voice="expr-voice-5-m", speed=1.0, clean_text=True):
         """Generate audio as a stream of chunks.
 
         Yields:
             numpy.ndarray: Audio data for each text chunk.
         """
         yield from self.model.generate_stream(text, voice=voice, speed=speed, clean_text=clean_text)
 
-    def generate_to_file(self, text, output_path, voice="expr-voice-5-m", speed=1.0, sample_rate=24000):
+    def generate_to_file(self, text, output_path, voice="expr-voice-5-m", speed=1.0, sample_rate=24000, clean_text=True):
         """Generate audio from text and save to file.
 
         Args:
@@ -54,8 +54,9 @@ def generate_to_file(self, text, output_path, voice="expr-voice-5-m", speed=1.0,
             voice: Voice to use for synthesis
             speed: Speech speed (1.0 = normal)
             sample_rate: Audio sample rate
+            clean_text: If True, preprocess text (expand numbers, etc.)
         """
-        return self.model.generate_to_file(text, output_path, voice=voice, speed=speed, sample_rate=sample_rate)
+        return self.model.generate_to_file(text, output_path, voice=voice, speed=speed, sample_rate=sample_rate, clean_text=clean_text)
 
     @property
     def available_voices(self):

diff --git a/kittentts/onnx_model.py b/kittentts/onnx_model.py
@@ -26,19 +26,29 @@ def ensure_punctuation(text):
 
 
 def chunk_text(text, max_len=400):
-    """Split text into chunks for processing long texts."""
+    """Split text into chunks for processing long texts.
+
+    Splits after sentence-ending punctuation (".", "!" or "?") that is
+    followed by whitespace, keeping that punctuation attached to its
+    sentence so the TTS model can use it for prosody and intonation.
+    """
     import re
-
-    sentences = re.split(r'[.!?]+', text)
+
+    # Split into sentences while keeping the delimiter attached to the
+    # preceding text; the whitespace between sentences is consumed.
+    # e.g. "Hello world. How are you?" → ["Hello world.", "How are you?"]
+    sentences = re.split(r'(?<=[.!?])\s+', text)
     chunks = []
-    
+
     for sentence in sentences:
         sentence = sentence.strip()
         if not sentence:
             continue
-
+
+        sentence = ensure_punctuation(sentence)
+
         if len(sentence) <= max_len:
-            chunks.append(ensure_punctuation(sentence))
+            chunks.append(sentence)
         else:
             # Split long sentences by words
             words = sentence.split()
@@ -52,7 +62,7 @@ def chunk_text(text, max_len=400):
                     temp_chunk = word
             if temp_chunk:
                 chunks.append(ensure_punctuation(temp_chunk.strip()))
-    
+
     return chunks