Skip to content
Open
Show file tree
Hide file tree
Changes from all commits
Commits
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
2 changes: 2 additions & 0 deletions app/src/components/Generation/EngineModelSelector.tsx
Original file line number Diff line number Diff line change
Expand Up @@ -25,6 +25,7 @@ const ENGINE_OPTIONS = [
{ value: 'tada:1B', label: 'TADA 1B', engine: 'tada' },
{ value: 'tada:3B', label: 'TADA 3B Multilingual', engine: 'tada' },
{ value: 'kokoro', label: 'Kokoro 82M', engine: 'kokoro' },
{ value: 'minimax', label: 'MiniMax Cloud TTS', engine: 'minimax' },
] as const;

const ENGINE_DESCRIPTIONS: Record<string, string> = {
Expand All @@ -34,6 +35,7 @@ const ENGINE_DESCRIPTIONS: Record<string, string> = {
chatterbox_turbo: 'English, [laugh] [cough] tags',
tada: 'HumeAI, 700s+ coherent audio',
kokoro: '82M params, CPU realtime, 8 langs',
minimax: 'Cloud TTS, no download needed',
};

/** Engines that only support English and should force language to 'en' on select. */
Expand Down
4 changes: 3 additions & 1 deletion app/src/components/VoiceProfiles/ProfileForm.tsx
Original file line number Diff line number Diff line change
Expand Up @@ -60,14 +60,15 @@ import { AudioSampleUpload } from './AudioSampleUpload';
import { SampleList } from './SampleList';

const MAX_AUDIO_DURATION_SECONDS = 30;
const PRESET_ONLY_ENGINES = new Set(['kokoro']);
const PRESET_ONLY_ENGINES = new Set(['kokoro', 'minimax']);
/** Engine choices as value/label pairs; `as const` keeps each value a literal type. */
const DEFAULT_ENGINE_OPTIONS = [
{ value: 'qwen', label: 'Qwen3-TTS' },
{ value: 'luxtts', label: 'LuxTTS' },
{ value: 'chatterbox', label: 'Chatterbox' },
{ value: 'chatterbox_turbo', label: 'Chatterbox Turbo' },
{ value: 'tada', label: 'TADA' },
{ value: 'kokoro', label: 'Kokoro 82M' },
{ value: 'minimax', label: 'MiniMax Cloud TTS' },
] as const;

const baseProfileSchema = z.object({
Expand Down Expand Up @@ -849,6 +850,7 @@ export function ProfileForm() {
</FormControl>
<SelectContent>
<SelectItem value="kokoro">Kokoro 82M</SelectItem>
<SelectItem value="minimax">MiniMax Cloud TTS</SelectItem>
</SelectContent>
</Select>
</FormItem>
Expand Down
2 changes: 1 addition & 1 deletion app/src/lib/api/types.ts
Original file line number Diff line number Diff line change
Expand Up @@ -62,7 +62,7 @@ export interface GenerationRequest {
language: LanguageCode;
seed?: number;
model_size?: '1.7B' | '0.6B' | '1B' | '3B';
engine?: 'qwen' | 'luxtts' | 'chatterbox' | 'chatterbox_turbo' | 'tada' | 'kokoro';
engine?: 'qwen' | 'luxtts' | 'chatterbox' | 'chatterbox_turbo' | 'tada' | 'kokoro' | 'minimax';
instruct?: string;
max_chunk_chars?: number;
crossfade_ms?: number;
Expand Down
1 change: 1 addition & 0 deletions app/src/lib/constants/languages.ts
Original file line number Diff line number Diff line change
Expand Up @@ -69,6 +69,7 @@ export const ENGINE_LANGUAGES: Record<string, readonly LanguageCode[]> = {
chatterbox_turbo: ['en'],
tada: ['en', 'ar', 'zh', 'de', 'es', 'fr', 'it', 'ja', 'pl', 'pt'],
kokoro: ['en', 'es', 'fr', 'hi', 'it', 'pt', 'ja', 'zh'],
minimax: ['en', 'zh', 'ja', 'ko', 'de', 'fr', 'ru', 'pt', 'es', 'it'],
} as const;

/** Helper: get language options for a given engine. */
Expand Down
14 changes: 9 additions & 5 deletions app/src/lib/hooks/useGenerationForm.ts
Original file line number Diff line number Diff line change
Expand Up @@ -17,7 +17,7 @@ const generationSchema = z.object({
seed: z.number().int().optional(),
modelSize: z.enum(['1.7B', '0.6B', '1B', '3B']).optional(),
instruct: z.string().max(500).optional(),
engine: z.enum(['qwen', 'luxtts', 'chatterbox', 'chatterbox_turbo', 'tada', 'kokoro']).optional(),
engine: z.enum(['qwen', 'luxtts', 'chatterbox', 'chatterbox_turbo', 'tada', 'kokoro', 'minimax']).optional(),
});

export type GenerationFormValues = z.infer<typeof generationSchema>;
Expand Down Expand Up @@ -85,7 +85,9 @@ export function useGenerationForm(options: UseGenerationFormOptions = {}) {
: 'tada-1b'
: engine === 'kokoro'
? 'kokoro'
: `qwen-tts-${data.modelSize}`;
: engine === 'minimax'
? 'minimax-cloud-tts'
: `qwen-tts-${data.modelSize}`;
const displayName =
engine === 'luxtts'
? 'LuxTTS'
Expand All @@ -99,9 +101,11 @@ export function useGenerationForm(options: UseGenerationFormOptions = {}) {
: 'TADA 1B'
: engine === 'kokoro'
? 'Kokoro 82M'
: data.modelSize === '1.7B'
? 'Qwen TTS 1.7B'
: 'Qwen TTS 0.6B';
: engine === 'minimax'
? 'MiniMax Cloud TTS'
: data.modelSize === '1.7B'
? 'Qwen TTS 1.7B'
: 'Qwen TTS 0.6B';

// Check if model needs downloading
try {
Expand Down
5 changes: 5 additions & 0 deletions backend/backends/__init__.py
Original file line number Diff line number Diff line change
Expand Up @@ -168,6 +168,7 @@ def is_loaded(self) -> bool:
"chatterbox_turbo": "Chatterbox Turbo",
"tada": "TADA",
"kokoro": "Kokoro",
"minimax": "MiniMax Cloud TTS",
}


Expand Down Expand Up @@ -528,6 +529,10 @@ def get_tts_backend_for_engine(engine: str) -> TTSBackend:
from .kokoro_backend import KokoroTTSBackend

backend = KokoroTTSBackend()
elif engine == "minimax":
from .minimax_backend import MiniMaxTTSBackend

backend = MiniMaxTTSBackend()
else:
raise ValueError(f"Unknown TTS engine: {engine}. Supported: {list(TTS_ENGINES.keys())}")

Expand Down
210 changes: 210 additions & 0 deletions backend/backends/minimax_backend.py
Original file line number Diff line number Diff line change
@@ -0,0 +1,210 @@
"""
MiniMax Cloud TTS backend implementation.

Wraps MiniMax's Text-to-Speech API for cloud-based voice synthesis.
Two model variants:
- speech-2.8-hd: High-quality, maximized timbre similarity (default)
- speech-2.8-turbo: Faster, more affordable version

Unlike local backends, this requires a MINIMAX_API_KEY environment variable
and makes HTTP requests to the MiniMax API. No local model downloads needed.

24kHz output, PCM audio format.
"""

import asyncio
import json
import logging
import os
import urllib.request
import urllib.error
from typing import List, Optional, Tuple

import numpy as np

logger = logging.getLogger(__name__)

# Base URL of the MiniMax HTTP API and the endpoint that TTS requests are POSTed to.
MINIMAX_API_BASE = "https://api.minimax.io/v1"
MINIMAX_TTS_ENDPOINT = f"{MINIMAX_API_BASE}/t2a_v2"

# Model name sent in the request payload, and the sample rate (Hz) that is both
# requested from the API and returned alongside the generated audio.
MINIMAX_DEFAULT_MODEL = "speech-2.8-hd"
MINIMAX_SAMPLE_RATE = 24000

# Available preset voices, one tuple per voice:
# (voice_id, display name, gender, language code).
MINIMAX_VOICES = [
("English_Graceful_Lady", "Graceful Lady", "female", "en"),
("English_Insightful_Speaker", "Insightful Speaker", "male", "en"),
("English_radiant_girl", "Radiant Girl", "female", "en"),
("English_Persuasive_Man", "Persuasive Man", "male", "en"),
("English_Lucky_Robot", "Lucky Robot", "male", "en"),
("Wise_Woman", "Wise Woman", "female", "en"),
("cute_boy", "Cute Boy", "male", "en"),
("lovely_girl", "Lovely Girl", "female", "en"),
("Friendly_Person", "Friendly Person", "male", "en"),
("Inspirational_girl", "Inspirational Girl", "female", "en"),
("Deep_Voice_Man", "Deep Voice Man", "male", "en"),
("sweet_girl", "Sweet Girl", "female", "en"),
]

# Voice used when a voice prompt does not supply its own "preset_voice_id".
DEFAULT_VOICE_ID = "English_Graceful_Lady"


class MiniMaxTTSBackend:
    """MiniMax Cloud TTS backend for cloud-based voice synthesis.

    Speech is produced by POSTing to the MiniMax TTS endpoint; nothing is
    downloaded or held in memory locally. "Loading" the backend only means
    reading the ``MINIMAX_API_KEY`` environment variable. Every failure while
    talking to the API (HTTP error, transport error, malformed payload) is
    reported as ``RuntimeError`` with the underlying exception chained.
    """

    def __init__(self):
        self._api_key: Optional[str] = None
        self._model: str = MINIMAX_DEFAULT_MODEL
        self._ready = False

    def is_loaded(self) -> bool:
        """Return True once ``load_model`` has picked up an API key."""
        return self._ready

    def _get_model_path(self, model_size: str = "default") -> str:
        """Return the default model name; ``model_size`` is ignored."""
        return MINIMAX_DEFAULT_MODEL

    def _is_model_cached(self, model_size: str = "default") -> bool:
        """Always True: a cloud backend has nothing to download."""
        return True

    async def load_model(self, model_size: str = "default") -> None:
        """Validate API key availability. No model download needed.

        Raises:
            RuntimeError: if ``MINIMAX_API_KEY`` is unset or empty.
        """
        if self._ready:
            return

        api_key = os.environ.get("MINIMAX_API_KEY")
        if not api_key:
            raise RuntimeError(
                "MINIMAX_API_KEY environment variable is required for MiniMax TTS. "
                "Get your API key from https://platform.minimax.io"
            )
        self._api_key = api_key
        self._ready = True
        logger.info("MiniMax Cloud TTS ready (model: %s)", self._model)

    def unload_model(self) -> None:
        """Clear the API key reference and mark the backend as not loaded."""
        self._api_key = None
        self._ready = False
        logger.info("MiniMax Cloud TTS unloaded")

    async def create_voice_prompt(
        self,
        audio_path: str,
        reference_text: str,
        use_cache: bool = True,
    ) -> Tuple[dict, bool]:
        """
        MiniMax TTS uses preset voice IDs, not reference audio cloning.

        Returns a preset voice prompt using the default voice ID together
        with ``False`` as the second tuple element. The reference audio,
        reference text and ``use_cache`` are all ignored.
        """
        return {
            "voice_type": "preset",
            "preset_engine": "minimax",
            "preset_voice_id": DEFAULT_VOICE_ID,
        }, False

    async def combine_voice_prompts(
        self,
        audio_paths: List[str],
        reference_texts: List[str],
    ) -> Tuple[np.ndarray, str]:
        """Not supported — MiniMax uses preset voices, not audio cloning."""
        raise NotImplementedError(
            "MiniMax Cloud TTS uses preset voice IDs and does not support "
            "voice cloning from reference audio."
        )

    @staticmethod
    def _decode_pcm_hex(audio_hex: str) -> np.ndarray:
        """Decode hex-encoded 16-bit PCM into float32 samples scaled by 1/32768.

        Args:
            audio_hex: Hexadecimal string of raw PCM bytes.

        Returns:
            1-D float32 array (empty when fewer than two bytes are supplied).

        Raises:
            RuntimeError: if ``audio_hex`` is not a valid hexadecimal string.
        """
        try:
            audio_bytes = bytes.fromhex(audio_hex)
        except (TypeError, ValueError) as e:
            raise RuntimeError("MiniMax TTS API returned malformed audio data") from e

        # np.frombuffer rejects buffers that are not a whole number of samples,
        # so a dangling trailing byte is dropped instead of failing the request.
        usable = len(audio_bytes) - (len(audio_bytes) % 2)

        # Explicit little-endian so decoding does not depend on the host byte
        # order. NOTE(review): assumes the API emits s16le PCM — confirm.
        samples = np.frombuffer(audio_bytes[:usable], dtype="<i2")
        return samples.astype(np.float32) / 32768.0

    async def generate(
        self,
        text: str,
        voice_prompt: dict,
        language: str = "en",
        seed: Optional[int] = None,
        instruct: Optional[str] = None,
    ) -> Tuple[np.ndarray, int]:
        """
        Generate audio via MiniMax TTS API.

        Args:
            text: Text to synthesize
            voice_prompt: Dict with voice_type and preset_voice_id; a missing
                or empty preset_voice_id falls back to the default voice
            language: Language code (only logged; not sent to the API)
            seed: Not sent to the API (ignored)
            instruct: Not sent to the API (ignored)

        Returns:
            Tuple of (float32 audio_array, sample_rate=MINIMAX_SAMPLE_RATE)

        Raises:
            RuntimeError: missing API key, HTTP or transport failure, an
                API-level error status, or an unusable response payload.
        """
        await self.load_model()

        voice_id = DEFAULT_VOICE_ID
        if isinstance(voice_prompt, dict):
            # `or` also covers a key that is present but None / empty, which
            # would otherwise be forwarded to the API as the voice id.
            voice_id = voice_prompt.get("preset_voice_id") or DEFAULT_VOICE_ID

        def _generate_sync():
            payload = {
                "model": self._model,
                "text": text,
                "stream": False,
                "voice_setting": {
                    "voice_id": voice_id,
                    "speed": 1.0,
                    "vol": 1.0,
                    "pitch": 0,
                },
                "audio_setting": {
                    "format": "pcm",
                    "sample_rate": MINIMAX_SAMPLE_RATE,
                },
            }

            req = urllib.request.Request(
                MINIMAX_TTS_ENDPOINT,
                data=json.dumps(payload).encode("utf-8"),
                headers={
                    "Content-Type": "application/json",
                    "Authorization": f"Bearer {self._api_key}",
                },
                method="POST",
            )

            logger.info(
                "[MiniMax TTS] Generating (%s), voice: %s, text length: %d",
                language,
                voice_id,
                len(text),
            )

            try:
                with urllib.request.urlopen(req, timeout=120) as resp:
                    raw = resp.read()
            except urllib.error.HTTPError as e:
                error_body = e.read().decode("utf-8", errors="replace")
                raise RuntimeError(
                    f"MiniMax TTS API error ({e.code}): {error_body}"
                ) from e
            except OSError as e:
                # URLError, connection resets and socket timeouts are all
                # OSError subclasses; HTTPError is handled above because it
                # is one too and carries a response body worth reporting.
                raise RuntimeError(f"MiniMax TTS request failed: {e}") from e

            try:
                body = json.loads(raw.decode("utf-8"))
            except ValueError as e:
                # Covers both UnicodeDecodeError and json.JSONDecodeError.
                raise RuntimeError(
                    "MiniMax TTS API returned a non-JSON response"
                ) from e
            if not isinstance(body, dict):
                raise RuntimeError("MiniMax TTS API returned an unexpected response")

            # Check for API-level errors. `or {}` guards against an explicit
            # JSON null, which dict.get's default argument does not replace.
            base_resp = body.get("base_resp") or {}
            if base_resp.get("status_code", 0) != 0:
                raise RuntimeError(
                    f"MiniMax TTS API error: {base_resp.get('status_msg', 'unknown')}"
                )

            # Extract hex-encoded audio
            audio_hex = (body.get("data") or {}).get("audio", "")
            if not audio_hex:
                raise RuntimeError("MiniMax TTS API returned empty audio data")

            # Decode hex → raw PCM bytes → float32 numpy array
            audio = self._decode_pcm_hex(audio_hex)
            if audio.size == 0:
                raise RuntimeError("MiniMax TTS API returned empty audio data")

            return audio, MINIMAX_SAMPLE_RATE

        # The blocking HTTP call runs in a worker thread so the event loop
        # stays responsive for the duration of the request.
        return await asyncio.to_thread(_generate_sync)
2 changes: 1 addition & 1 deletion backend/models.py
Original file line number Diff line number Diff line change
Expand Up @@ -78,7 +78,7 @@ class GenerationRequest(BaseModel):
seed: Optional[int] = Field(None, ge=0)
model_size: Optional[str] = Field(default="1.7B", pattern="^(1\\.7B|0\\.6B|1B|3B)$")
instruct: Optional[str] = Field(None, max_length=500)
engine: Optional[str] = Field(default="qwen", pattern="^(qwen|luxtts|chatterbox|chatterbox_turbo|tada|kokoro)$")
engine: Optional[str] = Field(default="qwen", pattern="^(qwen|luxtts|chatterbox|chatterbox_turbo|tada|kokoro|minimax)$")
max_chunk_chars: int = Field(
default=800, ge=100, le=5000, description="Max characters per chunk for long text splitting"
)
Expand Down
15 changes: 15 additions & 0 deletions backend/routes/profiles.py
Original file line number Diff line number Diff line change
Expand Up @@ -90,6 +90,21 @@ async def list_preset_voices(engine: str):
for vid, name, gender, lang in KOKORO_VOICES
],
}
if engine == "minimax":
from ..backends.minimax_backend import MINIMAX_VOICES

return {
"engine": engine,
"voices": [
{
"voice_id": vid,
"name": name,
"gender": gender,
"language": lang,
}
for vid, name, gender, lang in MINIMAX_VOICES
],
}
return {"engine": engine, "voices": []}


Expand Down
Loading