Skip to content
Merged
Show file tree
Hide file tree
Changes from all commits
Commits
File filter

Filter by extension

Filter by extension


Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
1 change: 1 addition & 0 deletions .gitignore
Original file line number Diff line number Diff line change
Expand Up @@ -110,3 +110,4 @@ cython_debug/
/recorded_audio
LOGI POSZLO.txt
test.txt
*.txt
5 changes: 5 additions & 0 deletions core/config.py
Original file line number Diff line number Diff line change
Expand Up @@ -38,6 +38,11 @@ def __init__(self) -> None:
"GPT_EVALUATION_MODEL",
"gpt-4o-mini",
)
self.openai_tts_char_limit: Final[int] = int(
os.getenv(
"OPENAI_TTS_CHAR_LIMIT",
"4096",
))

self.microphone_name: Final[
Optional[str]] = self.__resolve_microphone_name()
Expand Down
4 changes: 3 additions & 1 deletion requirements.txt
Original file line number Diff line number Diff line change
Expand Up @@ -5,4 +5,6 @@ sounddevice~=0.5.1
openai~=1.71.0
pydantic~=2.11.2
numpy~=2.2.4
soundfile~=0.13.1
soundfile~=0.13.1
audioop-lts~=0.2.1
pydub~=0.25.1
152 changes: 131 additions & 21 deletions services/tts_service.py
Original file line number Diff line number Diff line change
@@ -1,20 +1,31 @@
"""Text-to-speech service using OpenAI TTS API."""
"""Text-to-speech service using the OpenAI TTS API.

This module provides a service to convert long texts into audio files
by chunking the text, generating audio for each chunk, and concatenating
the results.
"""
import hashlib
import logging
import shutil
import tempfile
from pathlib import Path
from typing import Final
from typing import Final, List

from openai import OpenAI
from pydub import AudioSegment

from core.config import config


class TTSService:
"""Service for text-to-speech conversion using OpenAI TTS API."""
"""Manages text-to-speech conversion, handling API limits gracefully."""

def __init__(self) -> None:
"""Initialize the TTS service."""
"""Initializes the TTS service and the OpenAI client.

Raises:
ValueError: If the OPENAI_API_KEY is not set in the environment.
"""
self.__logger = logging.getLogger(self.__class__.__name__)
if not config.openai_api_key:
raise ValueError("OPENAI_API_KEY is not set in the environment.")
Expand All @@ -23,40 +34,139 @@ def __init__(self) -> None:
self.__model: Final[str] = config.openai_tts_model
self.__voice: Final[str] = config.openai_tts_voice
self.__format: Final[str] = config.openai_tts_output_format
self.__api_char_limit: Final[int] = config.openai_tts_char_limit

def __chunk_text(self, text: str) -> List[str]:
"""Splits a long text into chunks that respect the API character limit.

The method splits text primarily by sentences, then falls back to new
lines or spaces to ensure no chunk exceeds the limit.

Args:
text (str): The input text to be split.

Returns:
List[str]: A list of text chunks, each smaller than the API limit.
"""
chunks = []
current_chunk = ""
sentences = text.replace("!", "!.").replace("?", "?. ").split(". ")

for sentence in sentences:
if not sentence:
continue

if len(current_chunk) + len(sentence) + 1 > self.__api_char_limit:
if current_chunk:
chunks.append(current_chunk.strip())
current_chunk = sentence + ". "
else:
current_chunk += sentence + ". "

if current_chunk:
chunks.append(current_chunk.strip())

final_chunks = []
for chunk in chunks:
if len(chunk) > self.__api_char_limit:
while len(chunk) > self.__api_char_limit:
split_pos = chunk.rfind(" ", 0, self.__api_char_limit)
if split_pos == -1:
split_pos = self.__api_char_limit
final_chunks.append(chunk[:split_pos])
chunk = chunk[split_pos:]
final_chunks.append(chunk)

return [c for c in final_chunks if c]

def __generate_chunk_audio(self, text_chunk: str, file_path: Path) -> None:
"""Generates an audio file for a single text chunk via OpenAI API.

Args:
text_chunk (str): The text chunk to convert to speech.
file_path (Path): The path to save the generated audio file.

Raises:
Exception: Propagates exceptions from the OpenAI API client.
"""
with self.__client.audio.speech.with_streaming_response.create(
model=self.__model,
voice=self.__voice,
input=text_chunk,
response_format=self.__format,
) as response:
response.stream_to_file(file_path)

def __process_chunks(self, text_chunks: List[str]) -> AudioSegment:
"""Generates and concatenates audio for a list of text chunks.

This method iterates through text chunks, generating audio for each
in a temporary directory, and then combines them into a single
AudioSegment.

Args:
text_chunks (List[str]): A list of text chunks to process.

Returns:
AudioSegment: A pydub AudioSegment with the combined audio.

Raises:
RuntimeError: If audio generation results in no processable
segments.
"""
audio_segments = []
with tempfile.TemporaryDirectory() as temp_dir:
temp_path = Path(temp_dir)
for i, chunk in enumerate(text_chunks):
chunk_file_path = temp_path / f"chunk_{i}.{self.__format}"
log_msg = f"Generating audio for chunk {i+1}/{len(text_chunks)}"
self.__logger.info(log_msg)

self.__generate_chunk_audio(chunk, chunk_file_path)
segment = AudioSegment.from_file(chunk_file_path,
format=self.__format)
audio_segments.append(segment)

if not audio_segments:
raise RuntimeError("Audio generation resulted in no segments.")

self.__logger.info("Concatenating audio chunks...")
return sum(audio_segments)

async def generate_audio(self, text: str, output_path: Path) -> bool:
"""Generate audio from text using OpenAI TTS API.
"""Generates an audio file from text, using cache if available.

This is the main public method. It checks for a cached version of the
audio first. If not found, it chunks the text, generates audio
for each part, combines them, saves the final file, and caches it.

Args:
text: Text to convert to speech.
output_path: Path where to save the generated audio.
text (str): The full text to be converted to speech.
output_path (Path): The path to save the final audio file.

Returns:
True if cached audio was used, False if new audio was generated.
bool: True if a cached audio file was used, False otherwise.

Raises:
Exception: If audio generation fails.
Exception: If any part of the audio generation or file handling
fails.
"""
output_path.parent.mkdir(parents=True, exist_ok=True)
# noinspection PyTypeChecker
prompt_hash: str = hashlib.sha256(text.encode("utf-8")).hexdigest()
cache_file: Path = (output_path.parent /
f"jailbreak_prompt_{prompt_hash}.{self.__format}")
cache_file_name = f"jailbreak_prompt_{prompt_hash}.{self.__format}"
cache_file: Path = output_path.parent / cache_file_name

if cache_file.exists():
shutil.copy(cache_file, output_path)
return True

text_chunks = self.__chunk_text(text)
try:
with self.__client.audio.speech.with_streaming_response.create(
model=self.__model,
voice=self.__voice,
input=text,
response_format=self.__format,
) as response:
response.stream_to_file(output_path)
shutil.copy(output_path, cache_file)
return False
combined_audio = self.__process_chunks(text_chunks)
combined_audio.export(output_path, format=self.__format)
except Exception:
self.__logger.exception("Failed to generate audio with OpenAI TTS.")
raise

shutil.copy(output_path, cache_file)
return False