jamiepine · octo-patch · Mar 20, 2026
diff --git a/app/src/components/Generation/EngineModelSelector.tsx b/app/src/components/Generation/EngineModelSelector.tsx
@@ -25,6 +25,7 @@ const ENGINE_OPTIONS = [
   { value: 'tada:1B', label: 'TADA 1B', engine: 'tada' },
   { value: 'tada:3B', label: 'TADA 3B Multilingual', engine: 'tada' },
   { value: 'kokoro', label: 'Kokoro 82M', engine: 'kokoro' },
+  { value: 'minimax', label: 'MiniMax Cloud TTS', engine: 'minimax' },
 ] as const;
 
 const ENGINE_DESCRIPTIONS: Record<string, string> = {
@@ -34,6 +35,7 @@ const ENGINE_DESCRIPTIONS: Record<string, string> = {
   chatterbox_turbo: 'English, [laugh] [cough] tags',
   tada: 'HumeAI, 700s+ coherent audio',
   kokoro: '82M params, CPU realtime, 8 langs',
+  minimax: 'Cloud TTS, no download needed',
 };
 
 /** Engines that only support English and should force language to 'en' on select. */

diff --git a/app/src/components/VoiceProfiles/ProfileForm.tsx b/app/src/components/VoiceProfiles/ProfileForm.tsx
@@ -60,14 +60,15 @@ import { AudioSampleUpload } from './AudioSampleUpload';
 import { SampleList } from './SampleList';
 
 const MAX_AUDIO_DURATION_SECONDS = 30;
-const PRESET_ONLY_ENGINES = new Set(['kokoro']);
+const PRESET_ONLY_ENGINES = new Set(['kokoro', 'minimax']);
 const DEFAULT_ENGINE_OPTIONS = [
   { value: 'qwen', label: 'Qwen3-TTS' },
   { value: 'luxtts', label: 'LuxTTS' },
   { value: 'chatterbox', label: 'Chatterbox' },
   { value: 'chatterbox_turbo', label: 'Chatterbox Turbo' },
   { value: 'tada', label: 'TADA' },
   { value: 'kokoro', label: 'Kokoro 82M' },
+  { value: 'minimax', label: 'MiniMax Cloud TTS' },
 ] as const;
 
 const baseProfileSchema = z.object({
@@ -849,6 +850,7 @@ export function ProfileForm() {
                               </FormControl>
                               <SelectContent>
                                 <SelectItem value="kokoro">Kokoro 82M</SelectItem>
+                                <SelectItem value="minimax">MiniMax Cloud TTS</SelectItem>
                               </SelectContent>
                             </Select>
                           </FormItem>

diff --git a/app/src/lib/api/types.ts b/app/src/lib/api/types.ts
@@ -62,7 +62,7 @@ export interface GenerationRequest {
   language: LanguageCode;
   seed?: number;
   model_size?: '1.7B' | '0.6B' | '1B' | '3B';
-  engine?: 'qwen' | 'luxtts' | 'chatterbox' | 'chatterbox_turbo' | 'tada' | 'kokoro';
+  engine?: 'qwen' | 'luxtts' | 'chatterbox' | 'chatterbox_turbo' | 'tada' | 'kokoro' | 'minimax';
   instruct?: string;
   max_chunk_chars?: number;
   crossfade_ms?: number;

diff --git a/app/src/lib/constants/languages.ts b/app/src/lib/constants/languages.ts
@@ -69,6 +69,7 @@ export const ENGINE_LANGUAGES: Record<string, readonly LanguageCode[]> = {
   chatterbox_turbo: ['en'],
   tada: ['en', 'ar', 'zh', 'de', 'es', 'fr', 'it', 'ja', 'pl', 'pt'],
   kokoro: ['en', 'es', 'fr', 'hi', 'it', 'pt', 'ja', 'zh'],
+  minimax: ['en', 'zh', 'ja', 'ko', 'de', 'fr', 'ru', 'pt', 'es', 'it'],
 } as const;
 
 /** Helper: get language options for a given engine. */

diff --git a/app/src/lib/hooks/useGenerationForm.ts b/app/src/lib/hooks/useGenerationForm.ts
@@ -17,7 +17,7 @@ const generationSchema = z.object({
   seed: z.number().int().optional(),
   modelSize: z.enum(['1.7B', '0.6B', '1B', '3B']).optional(),
   instruct: z.string().max(500).optional(),
-  engine: z.enum(['qwen', 'luxtts', 'chatterbox', 'chatterbox_turbo', 'tada', 'kokoro']).optional(),
+  engine: z.enum(['qwen', 'luxtts', 'chatterbox', 'chatterbox_turbo', 'tada', 'kokoro', 'minimax']).optional(),
 });
 
 export type GenerationFormValues = z.infer<typeof generationSchema>;
@@ -85,7 +85,9 @@ export function useGenerationForm(options: UseGenerationFormOptions = {}) {
                   : 'tada-1b'
                 : engine === 'kokoro'
                   ? 'kokoro'
-                  : `qwen-tts-${data.modelSize}`;
+                  : engine === 'minimax'
+                    ? 'minimax-cloud-tts'
+                    : `qwen-tts-${data.modelSize}`;
       const displayName =
         engine === 'luxtts'
           ? 'LuxTTS'
@@ -99,9 +101,11 @@ export function useGenerationForm(options: UseGenerationFormOptions = {}) {
                   : 'TADA 1B'
                 : engine === 'kokoro'
                   ? 'Kokoro 82M'
-                  : data.modelSize === '1.7B'
-                    ? 'Qwen TTS 1.7B'
-                    : 'Qwen TTS 0.6B';
+                  : engine === 'minimax'
+                    ? 'MiniMax Cloud TTS'
+                    : data.modelSize === '1.7B'
+                      ? 'Qwen TTS 1.7B'
+                      : 'Qwen TTS 0.6B';
 
       // Check if model needs downloading
       try {

diff --git a/backend/backends/__init__.py b/backend/backends/__init__.py
@@ -168,6 +168,7 @@ def is_loaded(self) -> bool:
     "chatterbox_turbo": "Chatterbox Turbo",
     "tada": "TADA",
     "kokoro": "Kokoro",
+    "minimax": "MiniMax Cloud TTS",
 }
 
 
@@ -528,6 +529,10 @@ def get_tts_backend_for_engine(engine: str) -> TTSBackend:
             from .kokoro_backend import KokoroTTSBackend
 
             backend = KokoroTTSBackend()
+        elif engine == "minimax":
+            from .minimax_backend import MiniMaxTTSBackend
+
+            backend = MiniMaxTTSBackend()
         else:
             raise ValueError(f"Unknown TTS engine: {engine}. Supported: {list(TTS_ENGINES.keys())}")
 

diff --git a/backend/backends/minimax_backend.py b/backend/backends/minimax_backend.py
@@ -0,0 +1,255 @@
+"""
+MiniMax Cloud TTS backend implementation.
+
+Wraps MiniMax's Text-to-Speech API for cloud-based voice synthesis.
+MiniMax offers two model variants; this backend always requests the default:
+  - speech-2.8-hd: High-quality, maximized timbre similarity (default)
+  - speech-2.8-turbo: Faster, more affordable version
+
+Unlike local backends, this requires a MINIMAX_API_KEY environment variable
+and makes HTTP requests to the MiniMax API. No local model downloads needed.
+
+24kHz output, PCM audio format.
+"""
+
+import asyncio
+import json
+import logging
+import os
+import urllib.request
+import urllib.error
+from typing import List, Optional, Tuple
+
+import numpy as np
+
+logger = logging.getLogger(__name__)
+
+MINIMAX_API_BASE = "https://api.minimax.io/v1"
+MINIMAX_TTS_ENDPOINT = f"{MINIMAX_API_BASE}/t2a_v2"
+
+MINIMAX_DEFAULT_MODEL = "speech-2.8-hd"
+MINIMAX_SAMPLE_RATE = 24000
+
+# Preset voices as (voice_id, display name, gender, language code) tuples.
+MINIMAX_VOICES = [
+    ("English_Graceful_Lady", "Graceful Lady", "female", "en"),
+    ("English_Insightful_Speaker", "Insightful Speaker", "male", "en"),
+    ("English_radiant_girl", "Radiant Girl", "female", "en"),
+    ("English_Persuasive_Man", "Persuasive Man", "male", "en"),
+    ("English_Lucky_Robot", "Lucky Robot", "male", "en"),
+    ("Wise_Woman", "Wise Woman", "female", "en"),
+    ("cute_boy", "Cute Boy", "male", "en"),
+    ("lovely_girl", "Lovely Girl", "female", "en"),
+    ("Friendly_Person", "Friendly Person", "male", "en"),
+    ("Inspirational_girl", "Inspirational Girl", "female", "en"),
+    ("Deep_Voice_Man", "Deep Voice Man", "male", "en"),
+    ("sweet_girl", "Sweet Girl", "female", "en"),
+]
+
+DEFAULT_VOICE_ID = "English_Graceful_Lady"
+
+
+class MiniMaxTTSBackend:
+    """
+    Cloud TTS backend that calls MiniMax's HTTP API with preset voices.
+
+    Nothing is downloaded: "loading" only reads MINIMAX_API_KEY from the
+    environment, and each generate() call performs one blocking HTTP POST
+    on a worker thread.
+    """
+
+    def __init__(self):
+        self._api_key: Optional[str] = None
+        self._model: str = MINIMAX_DEFAULT_MODEL
+        self._ready = False
+
+    def is_loaded(self) -> bool:
+        """Return True once load_model() has stored an API key."""
+        return self._ready
+
+    def _get_model_path(self, model_size: str = "default") -> str:
+        """Return the remote model name; model_size is ignored."""
+        return MINIMAX_DEFAULT_MODEL
+
+    def _is_model_cached(self, model_size: str = "default") -> bool:
+        # Cloud backend — always "cached" (no download needed)
+        return True
+
+    async def load_model(self, model_size: str = "default") -> None:
+        """
+        Read MINIMAX_API_KEY from the environment. No model download needed.
+
+        Raises:
+            RuntimeError: If the variable is unset, empty, or only whitespace.
+        """
+        if self._ready:
+            return
+
+        # Strip so a trailing newline from a pasted key cannot end up in the
+        # Authorization header, and so a blank value counts as missing.
+        api_key = os.environ.get("MINIMAX_API_KEY", "").strip()
+        if not api_key:
+            raise RuntimeError(
+                "MINIMAX_API_KEY environment variable is required for MiniMax TTS. "
+                "Get your API key from https://platform.minimax.io"
+            )
+        self._api_key = api_key
+        self._ready = True
+        logger.info("MiniMax Cloud TTS ready (model: %s)", self._model)
+
+    def unload_model(self) -> None:
+        """Clear API key reference."""
+        self._api_key = None
+        self._ready = False
+        logger.info("MiniMax Cloud TTS unloaded")
+
+    async def create_voice_prompt(
+        self,
+        audio_path: str,
+        reference_text: str,
+        use_cache: bool = True,
+    ) -> Tuple[dict, bool]:
+        """
+        MiniMax TTS uses preset voice IDs, not reference audio cloning.
+
+        Returns a preset voice prompt using the default voice ID.
+        The reference audio is ignored.
+        """
+        return {
+            "voice_type": "preset",
+            "preset_engine": "minimax",
+            "preset_voice_id": DEFAULT_VOICE_ID,
+        }, False
+
+    async def combine_voice_prompts(
+        self,
+        audio_paths: List[str],
+        reference_texts: List[str],
+    ) -> Tuple[np.ndarray, str]:
+        """Not supported — MiniMax uses preset voices, not audio cloning."""
+        raise NotImplementedError(
+            "MiniMax Cloud TTS uses preset voice IDs and does not support "
+            "voice cloning from reference audio."
+        )
+
+    @staticmethod
+    def _decode_pcm_hex(audio_hex: str) -> np.ndarray:
+        """
+        Decode the API's hex-encoded 16-bit PCM into float32 samples.
+
+        Raises:
+            RuntimeError: If the string is not valid hex or does not contain a
+                whole number of 16-bit samples.
+        """
+        try:
+            pcm = np.frombuffer(bytes.fromhex(audio_hex), dtype=np.int16)
+        except ValueError as e:
+            raise RuntimeError(
+                f"MiniMax TTS API returned undecodable audio data: {e}"
+            ) from e
+        # Scale int16 [-32768, 32767] to float32 [-1.0, 1.0).
+        return pcm.astype(np.float32) / 32768.0
+
+    async def generate(
+        self,
+        text: str,
+        voice_prompt: dict,
+        language: str = "en",
+        seed: Optional[int] = None,
+        instruct: Optional[str] = None,
+    ) -> Tuple[np.ndarray, int]:
+        """
+        Generate audio via MiniMax TTS API.
+
+        Args:
+            text: Text to synthesize (max 10,000 chars)
+            voice_prompt: Dict with voice_type and preset_voice_id
+            language: Language code; only logged, not sent in the request
+            seed: Not supported by MiniMax TTS (ignored)
+            instruct: Not supported by MiniMax TTS (ignored)
+
+        Returns:
+            Tuple of (audio_array, sample_rate=24000)
+
+        Raises:
+            RuntimeError: Missing API key, HTTP or network failure, an
+                API-level error status, or empty/undecodable audio.
+        """
+        await self.load_model()
+
+        # Fall back to the default when the prompt is not a dict or its voice
+        # ID is missing, None, or empty; `.get(key, default)` alone would
+        # pass an explicit None through to the API.
+        voice_id = DEFAULT_VOICE_ID
+        if isinstance(voice_prompt, dict):
+            voice_id = voice_prompt.get("preset_voice_id") or DEFAULT_VOICE_ID
+
+        def _generate_sync():
+            payload = {
+                "model": self._model,
+                "text": text,
+                "stream": False,
+                "voice_setting": {
+                    "voice_id": voice_id,
+                    "speed": 1.0,
+                    "vol": 1.0,
+                    "pitch": 0,
+                },
+                "audio_setting": {
+                    "format": "pcm",
+                    "sample_rate": MINIMAX_SAMPLE_RATE,
+                },
+            }
+
+            req = urllib.request.Request(
+                MINIMAX_TTS_ENDPOINT,
+                data=json.dumps(payload).encode("utf-8"),
+                headers={
+                    "Content-Type": "application/json",
+                    "Authorization": f"Bearer {self._api_key}",
+                },
+                method="POST",
+            )
+
+            logger.info(
+                "[MiniMax TTS] Generating (%s), voice: %s, text length: %d",
+                language,
+                voice_id,
+                len(text),
+            )
+
+            try:
+                with urllib.request.urlopen(req, timeout=120) as resp:
+                    body = json.loads(resp.read().decode("utf-8"))
+            except urllib.error.HTTPError as e:
+                error_body = e.read().decode("utf-8", errors="replace")
+                raise RuntimeError(
+                    f"MiniMax TTS API error ({e.code}): {error_body}"
+                ) from e
+            except OSError as e:
+                # URLError, timeouts and connection resets all derive from
+                # OSError; HTTPError is caught first because it is more specific.
+                raise RuntimeError(f"MiniMax TTS request failed: {e}") from e
+            except ValueError as e:
+                # json.JSONDecodeError and UnicodeDecodeError are ValueErrors.
+                raise RuntimeError(
+                    f"MiniMax TTS API returned a malformed response: {e}"
+                ) from e
+            if not isinstance(body, dict):
+                raise RuntimeError("MiniMax TTS API returned a malformed response")
+
+            # Check for API-level errors
+            base_resp = body.get("base_resp") or {}
+            if base_resp.get("status_code", 0) != 0:
+                raise RuntimeError(
+                    f"MiniMax TTS API error: {base_resp.get('status_msg', 'unknown')}"
+                )
+
+            # Extract hex-encoded audio
+            audio_hex = (body.get("data") or {}).get("audio", "")
+            if not audio_hex:
+                raise RuntimeError("MiniMax TTS API returned empty audio data")
+
+            return self._decode_pcm_hex(audio_hex), MINIMAX_SAMPLE_RATE
+
+        return await asyncio.to_thread(_generate_sync)
diff --git a/backend/models.py b/backend/models.py
@@ -78,7 +78,7 @@ class GenerationRequest(BaseModel):
     seed: Optional[int] = Field(None, ge=0)
     model_size: Optional[str] = Field(default="1.7B", pattern="^(1\\.7B|0\\.6B|1B|3B)$")
     instruct: Optional[str] = Field(None, max_length=500)
-    engine: Optional[str] = Field(default="qwen", pattern="^(qwen|luxtts|chatterbox|chatterbox_turbo|tada|kokoro)$")
+    engine: Optional[str] = Field(default="qwen", pattern="^(qwen|luxtts|chatterbox|chatterbox_turbo|tada|kokoro|minimax)$")
     max_chunk_chars: int = Field(
         default=800, ge=100, le=5000, description="Max characters per chunk for long text splitting"
     )

diff --git a/backend/routes/profiles.py b/backend/routes/profiles.py
@@ -90,6 +90,21 @@ async def list_preset_voices(engine: str):
                 for vid, name, gender, lang in KOKORO_VOICES
             ],
         }
+    if engine == "minimax":
+        from ..backends.minimax_backend import MINIMAX_VOICES
+
+        return {
+            "engine": engine,
+            "voices": [
+                {
+                    "voice_id": vid,
+                    "name": name,
+                    "gender": gender,
+                    "language": lang,
+                }
+                for vid, name, gender, lang in MINIMAX_VOICES
+            ],
+        }
     return {"engine": engine, "voices": []}