From 4f9e45a9e96c7273ade0a1a876cd5ab9a38e3514 Mon Sep 17 00:00:00 2001
From: Ettore Di Giacinto
Date: Tue, 9 Dec 2025 18:52:29 +0100
Subject: [PATCH 1/5] feat(vibevoice): add backend

Add a Python gRPC backend under backend/python/vibevoice (server, example
config, install/run/test scripts and per-accelerator requirements files).
Wire it into the top-level Makefile: test-extra, backends/vibevoice and the
docker-build/docker-save targets.

Signed-off-by: Ettore Di Giacinto
---
 Makefile | 13 +-
 backend/python/vibevoice/Makefile | 23 +
 backend/python/vibevoice/backend.py | 485 ++++++++++++++++++
 backend/python/vibevoice/example-config.yaml | 101 ++++
 backend/python/vibevoice/install.sh | 28 +
 backend/python/vibevoice/requirements-cpu.txt | 22 +
 .../vibevoice/requirements-cublas11.txt | 22 +
 .../vibevoice/requirements-cublas12.txt | 22 +
 .../vibevoice/requirements-cublas13.txt | 22 +
 .../python/vibevoice/requirements-hipblas.txt | 22 +
 .../python/vibevoice/requirements-intel.txt | 26 +
 .../python/vibevoice/requirements-l4t12.txt | 22 +
 .../python/vibevoice/requirements-l4t13.txt | 22 +
 backend/python/vibevoice/requirements-mps.txt | 21 +
 backend/python/vibevoice/requirements.txt | 4 +
 backend/python/vibevoice/run.sh | 9 +
 backend/python/vibevoice/test.py | 82 +++
 backend/python/vibevoice/test.sh | 11 +
 18 files changed, 956 insertions(+), 1 deletion(-)
 create mode 100644 backend/python/vibevoice/Makefile
 create mode 100644 backend/python/vibevoice/backend.py
 create mode 100644 backend/python/vibevoice/example-config.yaml
 create mode 100755 backend/python/vibevoice/install.sh
 create mode 100644 backend/python/vibevoice/requirements-cpu.txt
 create mode 100644 backend/python/vibevoice/requirements-cublas11.txt
 create mode 100644 backend/python/vibevoice/requirements-cublas12.txt
 create mode 100644 backend/python/vibevoice/requirements-cublas13.txt
 create mode 100644 backend/python/vibevoice/requirements-hipblas.txt
 create mode 100644 backend/python/vibevoice/requirements-intel.txt
 create mode 100644 backend/python/vibevoice/requirements-l4t12.txt
 create mode 100644 backend/python/vibevoice/requirements-l4t13.txt
 create mode 100644 backend/python/vibevoice/requirements-mps.txt
 create mode 100644 backend/python/vibevoice/requirements.txt
 create mode 100755 backend/python/vibevoice/run.sh
 create mode 100644 backend/python/vibevoice/test.py
 create mode 100755 backend/python/vibevoice/test.sh

diff --git a/Makefile b/Makefile
index 1f855b02a673..b22196d64675 100644
--- a/Makefile
+++ b/Makefile
@@ -287,12 +287,14 @@ prepare-test-extra: protogen-python
 	$(MAKE) -C backend/python/diffusers
 	$(MAKE) -C backend/python/chatterbox
 	$(MAKE) -C backend/python/vllm
+	$(MAKE) -C backend/python/vibevoice
 
 test-extra: prepare-test-extra
 	$(MAKE) -C backend/python/transformers test
 	$(MAKE) -C backend/python/diffusers test
 	$(MAKE) -C backend/python/chatterbox test
 	$(MAKE) -C backend/python/vllm test
+	$(MAKE) -C backend/python/vibevoice test
 
 DOCKER_IMAGE?=local-ai
 DOCKER_AIO_IMAGE?=local-ai-aio
@@ -389,6 +391,9 @@ backends/neutts: docker-build-neutts docker-save-neutts build
 backends/vllm: docker-build-vllm docker-save-vllm build
 	./local-ai backends install "ocifile://$(abspath ./backend-images/vllm.tar)"
 
+backends/vibevoice: docker-build-vibevoice docker-save-vibevoice build
+	./local-ai backends install "ocifile://$(abspath ./backend-images/vibevoice.tar)"
+
 build-darwin-python-backend: build
 	bash ./scripts/build/python-darwin.sh
 
@@ -445,6 +450,9 @@ docker-save-kitten-tts: backend-images
 docker-save-chatterbox: backend-images
 	docker save local-ai-backend:chatterbox -o backend-images/chatterbox.tar
 
+docker-save-vibevoice: backend-images
+	docker save local-ai-backend:vibevoice -o backend-images/vibevoice.tar
+
 docker-build-neutts:
 	docker build --build-arg BUILD_TYPE=$(BUILD_TYPE) --build-arg BASE_IMAGE=$(BASE_IMAGE) -t local-ai-backend:neutts -f backend/Dockerfile.python --build-arg BACKEND=neutts ./backend
 
@@ -523,10 +531,13 @@ docker-build-bark:
 docker-build-chatterbox:
 	docker build --build-arg BUILD_TYPE=$(BUILD_TYPE) --build-arg BASE_IMAGE=$(BASE_IMAGE) -t local-ai-backend:chatterbox -f backend/Dockerfile.python --build-arg BACKEND=chatterbox ./backend
 
+docker-build-vibevoice:
+	docker build --build-arg BUILD_TYPE=$(BUILD_TYPE) --build-arg BASE_IMAGE=$(BASE_IMAGE) -t local-ai-backend:vibevoice -f backend/Dockerfile.python --build-arg BACKEND=vibevoice ./backend
+
 docker-build-exllama2:
 	docker build --build-arg BUILD_TYPE=$(BUILD_TYPE) --build-arg BASE_IMAGE=$(BASE_IMAGE) -t local-ai-backend:exllama2 -f backend/Dockerfile.python --build-arg BACKEND=exllama2 .
 
-docker-build-backends: docker-build-llama-cpp docker-build-rerankers docker-build-vllm docker-build-transformers docker-build-diffusers docker-build-kokoro docker-build-faster-whisper docker-build-coqui docker-build-bark docker-build-chatterbox docker-build-exllama2
+docker-build-backends: docker-build-llama-cpp docker-build-rerankers docker-build-vllm docker-build-transformers docker-build-diffusers docker-build-kokoro docker-build-faster-whisper docker-build-coqui docker-build-bark docker-build-chatterbox docker-build-vibevoice docker-build-exllama2
 
 ########################################################
 ### END Backends
diff --git a/backend/python/vibevoice/Makefile b/backend/python/vibevoice/Makefile
new file mode 100644
index 000000000000..2fd2297be202
--- /dev/null
+++ b/backend/python/vibevoice/Makefile
@@ -0,0 +1,23 @@
+.PHONY: vibevoice
+vibevoice:
+	bash install.sh
+
+.PHONY: run
+run: vibevoice
+	@echo "Running vibevoice..."
+	bash run.sh
+	@echo "vibevoice run."
+
+.PHONY: test
+test: vibevoice
+	@echo "Testing vibevoice..."
+	bash test.sh
+	@echo "vibevoice tested."
+
+.PHONY: protogen-clean
+protogen-clean:
+	$(RM) backend_pb2_grpc.py backend_pb2.py
+
+.PHONY: clean
+clean: protogen-clean
+	rm -rf venv __pycache__ VibeVoice
\ No newline at end of file
diff --git a/backend/python/vibevoice/backend.py b/backend/python/vibevoice/backend.py
new file mode 100644
index 000000000000..418940bcb817
--- /dev/null
+++ b/backend/python/vibevoice/backend.py
@@ -0,0 +1,485 @@
+#!/usr/bin/env python3
+"""
+This is an extra gRPC server of LocalAI for VibeVoice
+"""
+from concurrent import futures
+import time
+import argparse
+import signal
+import sys
+import os
+import copy
+import traceback
+from pathlib import Path
+import backend_pb2
+import backend_pb2_grpc
+import torch
+from vibevoice.modular.modeling_vibevoice_streaming_inference import VibeVoiceStreamingForConditionalGenerationInference
+from vibevoice.processor.vibevoice_streaming_processor import VibeVoiceStreamingProcessor
+
+import grpc
+
+def is_float(s):
+    """Check if a string can be converted to float."""
+    try:
+        float(s)
+        return True
+    except ValueError:
+        return False
+def is_int(s):
+    """Check if a string can be converted to int."""
+    try:
+        int(s)
+        return True
+    except ValueError:
+        return False
+
+_ONE_DAY_IN_SECONDS = 60 * 60 * 24
+
+# If MAX_WORKERS are specified in the environment use it, otherwise default to 1
+MAX_WORKERS = int(os.environ.get('PYTHON_GRPC_MAX_WORKERS', '1'))
+
+# Implement the BackendServicer class with the service methods
+class BackendServicer(backend_pb2_grpc.BackendServicer):
+    """
+    BackendServicer is the class that implements the gRPC service
+    """
+    def Health(self, request, context):
+        return backend_pb2.Reply(message=bytes("OK", 'utf-8'))
+
+    def LoadModel(self, request, context):
+        # Get device
+        if torch.cuda.is_available():
+            print("CUDA is available", file=sys.stderr)
+            device = "cuda"
+        else:
+            print("CUDA is not available", file=sys.stderr)
+            device = "cpu"
+        mps_available = hasattr(torch.backends, "mps") and torch.backends.mps.is_available()
+        if mps_available:
+            device = "mps"
+        if not torch.cuda.is_available() and request.CUDA:
+            return backend_pb2.Result(success=False, message="CUDA is not available")
+
+        # Normalize potential 'mpx' typo to 'mps'
+        if device == "mpx":
+            print("Note: device 'mpx' detected, treating it as 'mps'.", file=sys.stderr)
+            device = "mps"
+
+        # Validate mps availability if requested
+        if device == "mps" and not torch.backends.mps.is_available():
+            print("Warning: MPS not available. Falling back to CPU.", file=sys.stderr)
+            device = "cpu"
+
+        self.device = device
+        self._torch_device = torch.device(device)
+
+        options = request.Options
+
+        # empty dict
+        self.options = {}
+
+        # The options are a list of strings in this form optname:optvalue
+        # We are storing all the options in a dict so we can use it later when
+        # generating the audio
+        for opt in options:
+            if ":" not in opt:
+                continue
+            key, value = opt.split(":", 1)  # Split only on first colon
+            # Numeric values: try int first, because every int literal also parses as a float
+            if is_int(value):
+                value = int(value)
+            elif is_float(value):
+                value = float(value)
+            elif value.lower() in ["true", "false"]:
+                value = value.lower() == "true"
+            self.options[key] = value
+
+        # Get model path from request
+        model_path = request.Model
+        if not model_path:
+            model_path = "microsoft/VibeVoice-Realtime-0.5B"
+
+        # Get inference steps from options, default to 5
+        self.inference_steps = self.options.get("inference_steps", 5)
+        if not isinstance(self.inference_steps, int) or self.inference_steps <= 0:
+            self.inference_steps = 5
+
+        # Get cfg_scale from options, default to 1.5
+        self.cfg_scale = self.options.get("cfg_scale", 1.5)
+        if not isinstance(self.cfg_scale, (int, float)) or self.cfg_scale <= 0:
+            self.cfg_scale = 1.5
+
+        # Determine voices directory
+        # Priority order:
+        # 1. voices_dir option (explicitly set by user - highest priority)
+        # 2. Relative to ModelFile if provided
+        # 3. Relative to ModelPath (models directory) if provided
+        # 4. Backend directory
+        # 5. Absolute path from AudioPath if provided
+        voices_dir = None
+
+        # First check if voices_dir is explicitly set in options
+        if "voices_dir" in self.options:
+            voices_dir_option = self.options["voices_dir"]
+            if isinstance(voices_dir_option, str) and voices_dir_option.strip():
+                voices_dir = voices_dir_option.strip()
+                # If relative path, try to resolve it relative to ModelPath or ModelFile
+                if not os.path.isabs(voices_dir):
+                    if hasattr(request, 'ModelPath') and request.ModelPath:
+                        voices_dir = os.path.join(request.ModelPath, voices_dir)
+                    elif request.ModelFile:
+                        model_file_base = os.path.dirname(request.ModelFile)
+                        voices_dir = os.path.join(model_file_base, voices_dir)
+                    # If still relative, make it absolute from current working directory
+                    if not os.path.isabs(voices_dir):
+                        voices_dir = os.path.abspath(voices_dir)
+                # Check if the directory exists
+                if not os.path.exists(voices_dir):
+                    print(f"Warning: voices_dir option specified but directory does not exist: {voices_dir}", file=sys.stderr)
+                    voices_dir = None
+
+        # If not set via option, try relative to ModelFile if provided
+        if not voices_dir and request.ModelFile:
+            model_file_base = os.path.dirname(request.ModelFile)
+            voices_dir = os.path.join(model_file_base, "voices", "streaming_model")
+            if not os.path.exists(voices_dir):
+                voices_dir = None
+
+        # If not found, try relative to ModelPath (models directory)
+        if not voices_dir and hasattr(request, 'ModelPath') and request.ModelPath:
+            voices_dir = os.path.join(request.ModelPath, "voices", "streaming_model")
+            if not os.path.exists(voices_dir):
+                voices_dir = None
+
+        # If not found, try relative to backend directory
+        if not voices_dir:
+            backend_dir = os.path.dirname(os.path.dirname(os.path.abspath(__file__)))
+            voices_dir = os.path.join(backend_dir, "vibevoice", "voices", "streaming_model")
+            if not os.path.exists(voices_dir):
+                # Try absolute path from AudioPath if provided
+                if request.AudioPath and os.path.isabs(request.AudioPath):
+                    voices_dir = os.path.dirname(request.AudioPath)
+                else:
+                    voices_dir = None
+
+        self.voices_dir = voices_dir
+        self.voice_presets = {}
+        self._voice_cache = {}
+        self.default_voice_key = None
+
+        # Load voice presets if directory exists
+        if self.voices_dir and os.path.exists(self.voices_dir):
+            self._load_voice_presets()
+        else:
+            print(f"Warning: Voices directory not found. Voice presets will not be available.", file=sys.stderr)
+
+        try:
+            print(f"Loading processor & model from {model_path}", file=sys.stderr)
+            self.processor = VibeVoiceStreamingProcessor.from_pretrained(model_path)
+
+            # Decide dtype & attention implementation
+            if self.device == "mps":
+                load_dtype = torch.float32  # MPS requires float32
+                device_map = None
+                attn_impl_primary = "sdpa"  # flash_attention_2 not supported on MPS
+            elif self.device == "cuda":
+                load_dtype = torch.bfloat16
+                device_map = "cuda"
+                attn_impl_primary = "flash_attention_2"
+            else:  # cpu
+                load_dtype = torch.float32
+                device_map = "cpu"
+                attn_impl_primary = "sdpa"
+
+            print(f"Using device: {self.device}, torch_dtype: {load_dtype}, attn_implementation: {attn_impl_primary}", file=sys.stderr)
+
+            # Load model with device-specific logic
+            try:
+                if self.device == "mps":
+                    self.model = VibeVoiceStreamingForConditionalGenerationInference.from_pretrained(
+                        model_path,
+                        torch_dtype=load_dtype,
+                        attn_implementation=attn_impl_primary,
+                        device_map=None,  # load then move
+                    )
+                    self.model.to("mps")
+                elif self.device == "cuda":
+                    self.model = VibeVoiceStreamingForConditionalGenerationInference.from_pretrained(
+                        model_path,
+                        torch_dtype=load_dtype,
+                        device_map="cuda",
+                        attn_implementation=attn_impl_primary,
+                    )
+                else:  # cpu
+                    self.model = VibeVoiceStreamingForConditionalGenerationInference.from_pretrained(
+                        model_path,
+                        torch_dtype=load_dtype,
+                        device_map="cpu",
+                        attn_implementation=attn_impl_primary,
+                    )
+            except Exception as e:
+                if attn_impl_primary == 'flash_attention_2':
+                    print(f"[ERROR] : {type(e).__name__}: {e}", file=sys.stderr)
+                    print(traceback.format_exc(), file=sys.stderr)
+                    print("Error loading the model. Trying to use SDPA. However, note that only flash_attention_2 has been fully tested, and using SDPA may result in lower audio quality.", file=sys.stderr)
+                    self.model = VibeVoiceStreamingForConditionalGenerationInference.from_pretrained(
+                        model_path,
+                        torch_dtype=load_dtype,
+                        device_map=(self.device if self.device in ("cuda", "cpu") else None),
+                        attn_implementation='sdpa'
+                    )
+                    if self.device == "mps":
+                        self.model.to("mps")
+                else:
+                    raise e
+
+            self.model.eval()
+            self.model.set_ddpm_inference_steps(num_steps=self.inference_steps)
+
+            # Set default voice key
+            if self.voice_presets:
+                # Try to get default from environment or use first available
+                preset_name = os.environ.get("VOICE_PRESET")
+                self.default_voice_key = self._determine_voice_key(preset_name)
+                print(f"Default voice preset: {self.default_voice_key}", file=sys.stderr)
+            else:
+                print("Warning: No voice presets available. Voice selection will not work.", file=sys.stderr)
+
+        except Exception as err:
+            return backend_pb2.Result(success=False, message=f"Unexpected {err=}, {type(err)=}")
+
+        return backend_pb2.Result(message="Model loaded successfully", success=True)
+
+    def _load_voice_presets(self):
+        """Load voice presets from the voices directory."""
+        if not self.voices_dir or not os.path.exists(self.voices_dir):
+            self.voice_presets = {}
+            return
+
+        self.voice_presets = {}
+
+        # Get all .pt files in the voices directory
+        pt_files = [f for f in os.listdir(self.voices_dir)
+                    if f.lower().endswith('.pt') and os.path.isfile(os.path.join(self.voices_dir, f))]
+
+        # Create dictionary with filename (without extension) as key
+        for pt_file in pt_files:
+            # Remove .pt extension to get the name
+            name = os.path.splitext(pt_file)[0]
+            # Create full path
+            full_path = os.path.join(self.voices_dir, pt_file)
+            self.voice_presets[name] = full_path
+
+        # Sort the voice presets alphabetically by name
+        self.voice_presets = dict(sorted(self.voice_presets.items()))
+
+        print(f"Found {len(self.voice_presets)} voice files in {self.voices_dir}", file=sys.stderr)
+        if self.voice_presets:
+            print(f"Available voices: {', '.join(self.voice_presets.keys())}", file=sys.stderr)
+
+    def _determine_voice_key(self, name):
+        """Determine voice key from name or use default."""
+        if name and name in self.voice_presets:
+            return name
+
+        # Try default key
+        default_key = "en-WHTest_man"
+        if default_key in self.voice_presets:
+            return default_key
+
+        # Use first available
+        if self.voice_presets:
+            first_key = next(iter(self.voice_presets))
+            print(f"Using fallback voice preset: {first_key}", file=sys.stderr)
+            return first_key
+
+        return None
+
+    def _get_voice_path(self, speaker_name):
+        """Get voice file path for a given speaker name."""
+        if not self.voice_presets:
+            return None
+
+        # First try exact match
+        if speaker_name and speaker_name in self.voice_presets:
+            return self.voice_presets[speaker_name]
+
+        # Try partial matching (case insensitive)
+        if speaker_name:
+            speaker_lower = speaker_name.lower()
+            for preset_name, path in self.voice_presets.items():
+                if preset_name.lower() in speaker_lower or speaker_lower in preset_name.lower():
+                    return path
+
+        # Default to first voice if no match found
+        if self.default_voice_key and self.default_voice_key in self.voice_presets:
+            return self.voice_presets[self.default_voice_key]
+        elif self.voice_presets:
+            default_voice = list(self.voice_presets.values())[0]
+            print(f"Warning: No voice preset found for '{speaker_name}', using default voice: {default_voice}", file=sys.stderr)
+            return default_voice
+
+        return None
+
+    def _ensure_voice_cached(self, voice_path):
+        """Load and cache voice preset."""
+        if not voice_path or not os.path.exists(voice_path):
+            return None
+
+        # Use path as cache key
+        if voice_path not in self._voice_cache:
+            print(f"Loading prefilled prompt from {voice_path}", file=sys.stderr)
+            prefilled_outputs = torch.load(
+                voice_path,
+                map_location=self._torch_device,
+                weights_only=False,  # NOTE: full unpickle - only load preset files from a trusted source
+            )
+            self._voice_cache[voice_path] = prefilled_outputs
+
+        return self._voice_cache[voice_path]
+
+    def TTS(self, request, context):
+        try:
+            # Get voice selection
+            # Priority: request.voice > AudioPath > default
+            voice_path = None
+            voice_key = None
+
+            if request.voice:
+                # Try to get voice by name
+                voice_path = self._get_voice_path(request.voice)
+                if voice_path:
+                    voice_key = request.voice
+            elif request.AudioPath:
+                # Use AudioPath as voice file
+                if os.path.isabs(request.AudioPath):
+                    voice_path = request.AudioPath
+                elif request.ModelFile:
+                    model_file_base = os.path.dirname(request.ModelFile)
+                    voice_path = os.path.join(model_file_base, request.AudioPath)
+                elif hasattr(request, 'ModelPath') and request.ModelPath:
+                    voice_path = os.path.join(request.ModelPath, request.AudioPath)
+                else:
+                    voice_path = request.AudioPath
+            elif self.default_voice_key:
+                voice_path = self._get_voice_path(self.default_voice_key)
+                voice_key = self.default_voice_key
+
+            if not voice_path or not os.path.exists(voice_path):
+                return backend_pb2.Result(
+                    success=False,
+                    message=f"Voice file not found: {voice_path}. Please provide a valid voice preset or AudioPath."
+                )
+
+            # Load voice preset
+            prefilled_outputs = self._ensure_voice_cached(voice_path)
+            if prefilled_outputs is None:
+                return backend_pb2.Result(
+                    success=False,
+                    message=f"Failed to load voice preset from {voice_path}"
+                )
+
+            # Get generation parameters from options
+            cfg_scale = self.options.get("cfg_scale", self.cfg_scale)
+            inference_steps = self.options.get("inference_steps", self.inference_steps)
+            do_sample = self.options.get("do_sample", False)
+            temperature = self.options.get("temperature", 0.9)
+            top_p = self.options.get("top_p", 0.9)
+
+            # Update inference steps if needed
+            if inference_steps != self.inference_steps:
+                self.model.set_ddpm_inference_steps(num_steps=inference_steps)
+                self.inference_steps = inference_steps
+
+            # Prepare text: map typographic quotes to ASCII (written as escapes so the literals stay unambiguous)
+            text = request.text.strip().replace("\u2019", "'").replace("\u201c", '"').replace("\u201d", '"')
+
+            # Prepare inputs
+            inputs = self.processor.process_input_with_cached_prompt(
+                text=text,
+                cached_prompt=prefilled_outputs,
+                padding=True,
+                return_tensors="pt",
+                return_attention_mask=True,
+            )
+
+            # Move tensors to target device
+            target_device = self._torch_device
+            for k, v in inputs.items():
+                if torch.is_tensor(v):
+                    inputs[k] = v.to(target_device)
+
+            print(f"Generating audio with cfg_scale: {cfg_scale}, inference_steps: {inference_steps}", file=sys.stderr)
+
+            # Generate audio
+            outputs = self.model.generate(
+                **inputs,
+                max_new_tokens=None,
+                cfg_scale=cfg_scale,
+                tokenizer=self.processor.tokenizer,
+                generation_config={
+                    'do_sample': do_sample,
+                    'temperature': temperature if do_sample else 1.0,
+                    'top_p': top_p if do_sample else 1.0,
+                },
+                verbose=False,
+                all_prefilled_outputs=copy.deepcopy(prefilled_outputs) if prefilled_outputs is not None else None,
+            )
+
+            # Save output
+            if outputs.speech_outputs and outputs.speech_outputs[0] is not None:
+                self.processor.save_audio(
+                    outputs.speech_outputs[0],  # First (and only) batch item
+                    output_path=request.dst,
+                )
+                print(f"Saved output to {request.dst}", file=sys.stderr)
+            else:
+                return backend_pb2.Result(
+                    success=False,
+                    message="No audio output generated"
+                )
+
+        except Exception as err:
+            print(f"Error in TTS: {err}", file=sys.stderr)
+            print(traceback.format_exc(), file=sys.stderr)
+            return backend_pb2.Result(success=False, message=f"Unexpected {err=}, {type(err)=}")
+
+        return backend_pb2.Result(success=True)
+
+def serve(address):
+    server = grpc.server(futures.ThreadPoolExecutor(max_workers=MAX_WORKERS),
+        options=[
+            ('grpc.max_message_length', 50 * 1024 * 1024),  # 50MB
+            ('grpc.max_send_message_length', 50 * 1024 * 1024),  # 50MB
+            ('grpc.max_receive_message_length', 50 * 1024 * 1024),  # 50MB
+        ])
+    backend_pb2_grpc.add_BackendServicer_to_server(BackendServicer(), server)
+    server.add_insecure_port(address)
+    server.start()
+    print("Server started. Listening on: " + address, file=sys.stderr)
+
+    # Define the signal handler function
+    def signal_handler(sig, frame):
+        print("Received termination signal. Shutting down...")
+        server.stop(0)
+        sys.exit(0)
+
+    # Set the signal handlers for SIGINT and SIGTERM
+    signal.signal(signal.SIGINT, signal_handler)
+    signal.signal(signal.SIGTERM, signal_handler)
+
+    try:
+        while True:
+            time.sleep(_ONE_DAY_IN_SECONDS)
+    except KeyboardInterrupt:
+        server.stop(0)
+
+if __name__ == "__main__":
+    parser = argparse.ArgumentParser(description="Run the gRPC server.")
+    parser.add_argument(
+        "--addr", default="localhost:50051", help="The address to bind the server to."
+    )
+    args = parser.parse_args()
+
+    serve(args.addr)
diff --git a/backend/python/vibevoice/example-config.yaml b/backend/python/vibevoice/example-config.yaml
new file mode 100644
index 000000000000..33680407ed1e
--- /dev/null
+++ b/backend/python/vibevoice/example-config.yaml
@@ -0,0 +1,101 @@
+name: vibevoice-realtime
+backend: vibevoice
+description: |
+  VibeVoice-Realtime is a real-time text-to-speech model that generates natural-sounding speech.
+  This model supports voice cloning through voice preset files (.pt files).
+
+# Model path (HuggingFace model ID or local path)
+parameters:
+  model: microsoft/VibeVoice-Realtime-0.5B
+
+# TTS configuration
+tts:
+  # Voice selection - can be:
+  # 1. Voice preset name (e.g., "Frank", "en-Frank_man", "Grace") - looks for .pt files in voices/streaming_model/
+  # 2. Path to a voice preset .pt file (relative to model directory or absolute)
+  # Available English voices: Carter, Davis, Emma, Frank, Grace, Mike
+  voice: "Frank"
+  # Alternative: use audio_path to specify a voice file directly
+  # audio_path: "voices/streaming_model/en-Frank_man.pt"
+
+known_usecases:
+  - tts
+
+# Backend-specific options
+# These are passed as "key:value" strings to the backend
+options:
+  # CFG (Classifier-Free Guidance) scale for generation (default: 1.5)
+  # Higher values can improve quality but may slow generation
+  - "cfg_scale:1.5"
+
+  # Number of inference steps for the diffusion process (default: 5)
+  # More steps = better quality but slower. Typical range: 3-10
+  - "inference_steps:5"
+
+  # Enable sampling (default: false)
+  # When true, uses temperature and top_p for sampling
+  - "do_sample:false"
+
+  # Temperature for sampling (only used if do_sample=true, default: 0.9)
+  - "temperature:0.9"
+
+  # Top-p (nucleus) sampling (only used if do_sample=true, default: 0.9)
+  - "top_p:0.9"
+
+  # Voices directory path
+  # This explicitly sets where to look for voice preset files (.pt files)
+  # Since we're downloading voices to voices/streaming_model/, we set it here
+  #
+  # Examples:
+  # - Relative path (relative to models directory): "voices/streaming_model"
+  # - Absolute path: "/custom/path/to/voices/streaming_model"
+  # - Custom relative path: "my_custom_voices/streaming_model"
+  #
+  # If not specified, the backend will auto-detect from common locations:
+  # 1. {ModelFile directory}/voices/streaming_model/
+  # 2. {models_dir}/voices/streaming_model/
+  # 3. Backend directory
+  - "voices_dir:voices/streaming_model"
+
+# Download voice preset files
+# Voice presets are downloaded to: {models_dir}/voices/streaming_model/
+# The voices_dir option above tells the backend to look in this location
+download_files:
+  # English voices
+  - filename: voices/streaming_model/en-Frank_man.pt
+    uri: https://raw.githubusercontent.com/microsoft/VibeVoice/main/demo/voices/streaming_model/en-Frank_man.pt
+  - filename: voices/streaming_model/en-Grace_woman.pt
+    uri: https://raw.githubusercontent.com/microsoft/VibeVoice/main/demo/voices/streaming_model/en-Grace_woman.pt
+  - filename: voices/streaming_model/en-Mike_man.pt
+    uri: https://raw.githubusercontent.com/microsoft/VibeVoice/main/demo/voices/streaming_model/en-Mike_man.pt
+  - filename: voices/streaming_model/en-Emma_woman.pt
+    uri: https://raw.githubusercontent.com/microsoft/VibeVoice/main/demo/voices/streaming_model/en-Emma_woman.pt
+  - filename: voices/streaming_model/en-Carter_man.pt
+    uri: https://raw.githubusercontent.com/microsoft/VibeVoice/main/demo/voices/streaming_model/en-Carter_man.pt
+  - filename: voices/streaming_model/en-Davis_man.pt
+    uri: https://raw.githubusercontent.com/microsoft/VibeVoice/main/demo/voices/streaming_model/en-Davis_man.pt
+  # Uncomment to add more languages:
+  # - filename: voices/streaming_model/fr-Spk0_man.pt
+  #   uri: https://raw.githubusercontent.com/microsoft/VibeVoice/main/demo/voices/streaming_model/fr-Spk0_man.pt
+  # - filename: voices/streaming_model/de-Spk0_man.pt
+  #   uri: https://raw.githubusercontent.com/microsoft/VibeVoice/main/demo/voices/streaming_model/de-Spk0_man.pt
+
+# Usage example
+usage: |
+  To test this model, you can use the following curl command:
+
+  curl http://localhost:8080/v1/audio/speech -H "Content-Type: application/json" -d '{
+    "model": "vibevoice-realtime",
+    "input": "Hello, this is a test of the VibeVoice text-to-speech system.",
+    "voice": "Frank"
+  }'
+
+  Or using a different voice:
+
+  curl http://localhost:8080/v1/audio/speech -H "Content-Type: application/json" -d '{
+    "model": "vibevoice-realtime",
+    "input": "Hello, this is a test.",
+    "voice": "Grace"
+  }'
+
+  Available voices: Frank, Grace, Mike, Emma, Carter, Davis
diff --git a/backend/python/vibevoice/install.sh b/backend/python/vibevoice/install.sh
new file mode 100755
index 000000000000..85c5f70cf35a
--- /dev/null
+++ b/backend/python/vibevoice/install.sh
@@ -0,0 +1,28 @@
+#!/bin/bash
+set -e
+
+backend_dir=$(dirname $0)
+if [ -d $backend_dir/common ]; then
+    source $backend_dir/common/libbackend.sh
+else
+    source $backend_dir/../common/libbackend.sh
+fi
+
+# This is here because the Intel pip index is broken and returns 200 status codes for every package name, it just doesn't return any package links.
+# This makes uv think that the package exists in the Intel pip index, and by default it stops looking at other pip indexes once it finds a match.
+# We need uv to continue falling through to the pypi default index to find optimum[openvino] in the pypi index
+# the --upgrade actually allows us to *downgrade* torch to the version provided in the Intel pip index
+if [ "x${BUILD_PROFILE}" == "xintel" ]; then
+    EXTRA_PIP_INSTALL_FLAGS+=" --upgrade --index-strategy=unsafe-first-match"
+fi
+
+installRequirements
+
+[ -d VibeVoice/.git ] || git clone https://github.com/microsoft/VibeVoice.git
+cd VibeVoice/
+
+if [ "x${USE_PIP}" == "xtrue" ]; then
+    pip install ${EXTRA_PIP_INSTALL_FLAGS:-} .
+else
+    uv pip install ${EXTRA_PIP_INSTALL_FLAGS:-} .
+fi
\ No newline at end of file
diff --git a/backend/python/vibevoice/requirements-cpu.txt b/backend/python/vibevoice/requirements-cpu.txt
new file mode 100644
index 000000000000..607db4ae3ffe
--- /dev/null
+++ b/backend/python/vibevoice/requirements-cpu.txt
@@ -0,0 +1,22 @@
+--extra-index-url https://download.pytorch.org/whl/cpu
+git+https://github.com/huggingface/diffusers
+opencv-python
+transformers==4.51.3
+torchvision==0.22.1
+accelerate
+compel
+peft
+sentencepiece
+torch==2.7.1
+optimum-quanto
+ftfy
+llvmlite>=0.40.0
+numba>=0.57.0
+tqdm
+numpy
+scipy
+librosa
+ml-collections
+absl-py
+gradio
+av
\ No newline at end of file
diff --git a/backend/python/vibevoice/requirements-cublas11.txt b/backend/python/vibevoice/requirements-cublas11.txt
new file mode 100644
index 000000000000..547b198aa870
--- /dev/null
+++ b/backend/python/vibevoice/requirements-cublas11.txt
@@ -0,0 +1,22 @@
+--extra-index-url https://download.pytorch.org/whl/cu118
+git+https://github.com/huggingface/diffusers
+opencv-python
+transformers==4.51.3
+torchvision==0.22.1
+accelerate
+compel
+peft
+sentencepiece
+torch==2.7.1
+optimum-quanto
+ftfy
+llvmlite>=0.40.0
+numba>=0.57.0
+tqdm
+numpy
+scipy
+librosa
+ml-collections
+absl-py
+gradio
+av
\ No newline at end of file
diff --git a/backend/python/vibevoice/requirements-cublas12.txt b/backend/python/vibevoice/requirements-cublas12.txt
new file mode 100644
index 000000000000..267a0313e407
--- /dev/null
+++ b/backend/python/vibevoice/requirements-cublas12.txt
@@ -0,0 +1,22 @@
+--extra-index-url https://download.pytorch.org/whl/cu121
+git+https://github.com/huggingface/diffusers
+opencv-python
+transformers==4.51.3
+torchvision
+accelerate
+compel
+peft
+sentencepiece
+torch
+ftfy
+optimum-quanto
+llvmlite>=0.40.0
+numba>=0.57.0
+tqdm
+numpy
+scipy
+librosa
+ml-collections
+absl-py
+gradio
+av
\ No newline at end of file
diff --git a/backend/python/vibevoice/requirements-cublas13.txt b/backend/python/vibevoice/requirements-cublas13.txt
new file mode 100644
index 000000000000..372be740b24b
--- /dev/null
+++ b/backend/python/vibevoice/requirements-cublas13.txt
@@ -0,0 +1,22 @@
+--extra-index-url https://download.pytorch.org/whl/cu130
+git+https://github.com/huggingface/diffusers
+opencv-python
+transformers==4.51.3
+torchvision
+accelerate
+compel
+peft
+sentencepiece
+torch
+ftfy
+optimum-quanto
+llvmlite>=0.40.0
+numba>=0.57.0
+tqdm
+numpy
+scipy
+librosa
+ml-collections
+absl-py
+gradio
+av
\ No newline at end of file
diff --git a/backend/python/vibevoice/requirements-hipblas.txt b/backend/python/vibevoice/requirements-hipblas.txt
new file mode 100644
index 000000000000..291096c3f755
--- /dev/null
+++ b/backend/python/vibevoice/requirements-hipblas.txt
@@ -0,0 +1,22 @@
+--extra-index-url https://download.pytorch.org/whl/rocm6.3
+torch==2.7.1+rocm6.3
+torchvision==0.22.1+rocm6.3
+git+https://github.com/huggingface/diffusers
+opencv-python
+transformers==4.51.3
+accelerate
+compel
+peft
+sentencepiece
+optimum-quanto
+ftfy
+llvmlite>=0.40.0
+numba>=0.57.0
+tqdm
+numpy
+scipy
+librosa
+ml-collections
+absl-py
+gradio
+av
\ No newline at end of file
diff --git a/backend/python/vibevoice/requirements-intel.txt b/backend/python/vibevoice/requirements-intel.txt
new file mode 100644
index 000000000000..e040ef6b56aa
--- /dev/null
+++ b/backend/python/vibevoice/requirements-intel.txt
@@ -0,0 +1,26 @@
+--extra-index-url https://pytorch-extension.intel.com/release-whl/stable/xpu/us/
+intel-extension-for-pytorch==2.3.110+xpu
+torch==2.5.1+cxx11.abi
+torchvision==0.20.1+cxx11.abi
+oneccl_bind_pt==2.8.0+xpu
+optimum[openvino]
+setuptools
+git+https://github.com/huggingface/diffusers
+opencv-python
+transformers==4.51.3
+accelerate
+compel
+peft
+sentencepiece
+optimum-quanto
+ftfy
+llvmlite>=0.40.0
+numba>=0.57.0
+tqdm
+numpy
+scipy
+librosa
+ml-collections
+absl-py
+gradio
+av
\ No newline at end of file
diff --git a/backend/python/vibevoice/requirements-l4t12.txt b/backend/python/vibevoice/requirements-l4t12.txt
new file mode 100644
index 000000000000..4e033c0f6cb9
--- /dev/null
+++ b/backend/python/vibevoice/requirements-l4t12.txt
@@ -0,0 +1,22 @@
+--extra-index-url https://pypi.jetson-ai-lab.io/jp6/cu129/
+torch
+git+https://github.com/huggingface/diffusers
+transformers==4.51.3
+accelerate
+compel
+peft
+optimum-quanto
+numpy<2
+sentencepiece
+torchvision
+ftfy
+llvmlite>=0.40.0
+numba>=0.57.0
+tqdm
+numpy
+scipy
+librosa
+ml-collections
+absl-py
+gradio
+av
\ No newline at end of file
diff --git a/backend/python/vibevoice/requirements-l4t13.txt b/backend/python/vibevoice/requirements-l4t13.txt
new file mode 100644
index 000000000000..16612683ad7a
--- /dev/null
+++ b/backend/python/vibevoice/requirements-l4t13.txt
@@ -0,0 +1,22 @@
+--extra-index-url https://pypi.jetson-ai-lab.io/sbsa/cu130
+torch
+git+https://github.com/huggingface/diffusers
+transformers==4.51.3
+accelerate
+compel
+peft
+optimum-quanto
+numpy<2
+sentencepiece
+torchvision
+ftfy
+llvmlite>=0.40.0
+numba>=0.57.0
+tqdm
+numpy
+scipy
+librosa
+ml-collections
+absl-py
+gradio
+av
\ No newline at end of file
diff --git a/backend/python/vibevoice/requirements-mps.txt b/backend/python/vibevoice/requirements-mps.txt
new file mode 100644
index 000000000000..11757190ecf5
--- /dev/null
+++ b/backend/python/vibevoice/requirements-mps.txt
@@ -0,0 +1,21 @@
+torch==2.7.1
+torchvision==0.22.1
+git+https://github.com/huggingface/diffusers
+opencv-python
+transformers==4.51.3
+accelerate
+compel
+peft
+sentencepiece
+optimum-quanto
+ftfy
+llvmlite>=0.40.0
+numba>=0.57.0
+tqdm
+numpy
+scipy
+librosa
+ml-collections
+absl-py
+gradio
+av
\ No newline at end of file
diff --git a/backend/python/vibevoice/requirements.txt b/backend/python/vibevoice/requirements.txt
new file mode 100644
index 000000000000..9e532186b2c8
--- /dev/null
+++ b/backend/python/vibevoice/requirements.txt
@@ -0,0 +1,4 @@
+grpcio==1.71.0
+protobuf
+certifi
+packaging==24.1
diff --git a/backend/python/vibevoice/run.sh b/backend/python/vibevoice/run.sh
new file mode 100755
index 000000000000..82b7b09ecc7d
--- /dev/null
+++ b/backend/python/vibevoice/run.sh
@@ -0,0 +1,9 @@
+#!/bin/bash
+backend_dir=$(dirname "$0")
+if [ -d "$backend_dir/common" ]; then
+    source "$backend_dir/common/libbackend.sh"
+else
+    source "$backend_dir/../common/libbackend.sh"
+fi
+
+startBackend "$@"
\ No newline at end of file
diff --git a/backend/python/vibevoice/test.py b/backend/python/vibevoice/test.py
new file mode 100644
index 000000000000..e0b1a0bdd124
--- /dev/null
+++ b/backend/python/vibevoice/test.py
@@ -0,0 +1,78 @@
+"""
+A test script to test the gRPC service
+"""
+import unittest
+import subprocess
+import time
+import backend_pb2
+import backend_pb2_grpc
+
+import grpc
+
+
+class TestBackendServicer(unittest.TestCase):
+    """
+    TestBackendServicer is the class that tests the gRPC service.
+
+    unittest invokes setUp() before and tearDown() after every test method,
+    so the individual tests must not call them again.
+    """
+    def setUp(self):
+        """
+        This method sets up the gRPC service by starting the server
+        """
+        self.service = subprocess.Popen(["python3", "backend.py", "--addr", "localhost:50051"])
+        # Fixed delay to let the backend start before a test connects to it
+        time.sleep(30)
+
+    def tearDown(self) -> None:
+        """
+        This method tears down the gRPC service by terminating the server
+        """
+        self.service.terminate()
+        self.service.wait()
+
+    def test_server_startup(self):
+        """
+        This method tests if the server starts up successfully
+        """
+        try:
+            with grpc.insecure_channel("localhost:50051") as channel:
+                stub = backend_pb2_grpc.BackendStub(channel)
+                response = stub.Health(backend_pb2.HealthMessage())
+                self.assertEqual(response.message, b'OK')
+        except Exception as err:
+            print(err)
+            self.fail("Server failed to start")
+
+    def test_load_model(self):
+        """
+        This method tests if the model is loaded successfully
+        """
+        try:
+            with grpc.insecure_channel("localhost:50051") as channel:
+                stub = backend_pb2_grpc.BackendStub(channel)
+                # NOTE(review): this model id looks inherited from another TTS backend's test - confirm vibevoice can load it
+                response = stub.LoadModel(backend_pb2.ModelOptions(Model="tts_models/en/vctk/vits"))
+                print(response)
+                self.assertTrue(response.success)
+                self.assertEqual(response.message, "Model loaded successfully")
+        except Exception as err:
+            print(err)
+            self.fail("LoadModel service failed")
+
+    def test_tts(self):
+        """
+        This method tests if a TTS request is served successfully
+        """
+        try:
+            with grpc.insecure_channel("localhost:50051") as channel:
+                stub = backend_pb2_grpc.BackendStub(channel)
+                response = stub.LoadModel(backend_pb2.ModelOptions(Model="tts_models/en/vctk/vits"))
+                self.assertTrue(response.success)
+                tts_request = backend_pb2.TTSRequest(text="80s TV news production music hit for tonight's biggest story")
+                tts_response = stub.TTS(tts_request)
+                self.assertIsNotNone(tts_response)
+        except Exception as err:
+            print(err)
+            self.fail("TTS service failed")
\ No newline at end of file
diff --git a/backend/python/vibevoice/test.sh b/backend/python/vibevoice/test.sh
new file mode 100755
index 000000000000..eb59f2aaf3f3
--- /dev/null
+++ b/backend/python/vibevoice/test.sh
@@ -0,0 +1,11 @@
+#!/bin/bash
+set -e
+
+backend_dir=$(dirname "$0")
+if [ -d "$backend_dir/common" ]; then
+    source "$backend_dir/common/libbackend.sh"
+else
+    source "$backend_dir/../common/libbackend.sh"
+fi
+
+runUnittests

From b9474c0736c71d0291fbe13a93c7635dbc2b80d4 Mon Sep 17 00:00:00 2001
From: Ettore Di Giacinto Date: Tue, 9 Dec 2025 20:29:24 +0000 Subject: [PATCH 2/5] chore: add workflow and backend index Signed-off-by: Ettore Di Giacinto --- .github/workflows/backend.yml | 91 +++++++++++++++++++++++++++++ backend/index.yaml | 105 ++++++++++++++++++++++++++++++++++ 2 files changed, 196 insertions(+) diff --git a/.github/workflows/backend.yml b/.github/workflows/backend.yml index a18068c754f5..d2af2b320b17 100644 --- a/.github/workflows/backend.yml +++ b/.github/workflows/backend.yml @@ -198,6 +198,19 @@ jobs: context: "./backend" ubuntu-version: '2204' # CUDA 12 builds + - build-type: 'cublas' + cuda-major-version: "12" + cuda-minor-version: "0" + platforms: 'linux/amd64' + tag-latest: 'auto' + tag-suffix: '-gpu-nvidia-cuda-12-vibevoice' + runs-on: 'ubuntu-latest' + base-image: "ubuntu:22.04" + skip-drivers: 'false' + backend: "vibevoice" + dockerfile: "./backend/Dockerfile.python" + context: "./backend" + ubuntu-version: '2204' - build-type: 'cublas' cuda-major-version: "12" cuda-minor-version: "0" @@ -407,6 +420,19 @@ jobs: dockerfile: "./backend/Dockerfile.python" context: "./backend" ubuntu-version: '2204' + - build-type: 'cublas' + cuda-major-version: "13" + cuda-minor-version: "0" + platforms: 'linux/amd64' + tag-latest: 'auto' + tag-suffix: '-gpu-nvidia-cuda-13-vibevoice' + runs-on: 'ubuntu-latest' + base-image: "ubuntu:22.04" + skip-drivers: 'false' + backend: "vibevoice" + dockerfile: "./backend/Dockerfile.python" + context: "./backend" + ubuntu-version: '2204' - build-type: 'cublas' cuda-major-version: "13" cuda-minor-version: "0" @@ -459,6 +485,19 @@ jobs: dockerfile: "./backend/Dockerfile.python" context: "./backend" ubuntu-version: '2204' + - build-type: 'l4t' + cuda-major-version: "13" + cuda-minor-version: "0" + platforms: 'linux/arm64' + tag-latest: 'auto' + tag-suffix: '-nvidia-l4t-cuda-13-arm64-vibevoice' + runs-on: 'ubuntu-24.04-arm' + base-image: "ubuntu:24.04" + skip-drivers: 'false' + ubuntu-version: '2404' + backend: 
"vibevoice" + dockerfile: "./backend/Dockerfile.python" + context: "./backend" - build-type: 'l4t' cuda-major-version: "13" cuda-minor-version: "0" @@ -669,6 +708,19 @@ jobs: dockerfile: "./backend/Dockerfile.python" context: "./backend" ubuntu-version: '2204' + - build-type: 'hipblas' + cuda-major-version: "" + cuda-minor-version: "" + platforms: 'linux/amd64' + tag-latest: 'auto' + tag-suffix: '-gpu-rocm-hipblas-vibevoice' + runs-on: 'arc-runner-set' + base-image: "rocm/dev-ubuntu-22.04:6.4.3" + skip-drivers: 'false' + backend: "vibevoice" + dockerfile: "./backend/Dockerfile.python" + context: "./backend" + ubuntu-version: '2204' - build-type: 'hipblas' cuda-major-version: "" cuda-minor-version: "" @@ -787,6 +839,19 @@ jobs: dockerfile: "./backend/Dockerfile.python" context: "./backend" ubuntu-version: '2204' + - build-type: 'l4t' + cuda-major-version: "12" + cuda-minor-version: "0" + platforms: 'linux/arm64' + tag-latest: 'auto' + tag-suffix: '-nvidia-l4t-vibevoice' + runs-on: 'ubuntu-24.04-arm' + base-image: "nvcr.io/nvidia/l4t-jetpack:r36.4.0" + skip-drivers: 'true' + backend: "vibevoice" + dockerfile: "./backend/Dockerfile.python" + context: "./backend" + ubuntu-version: '2204' - build-type: 'l4t' cuda-major-version: "12" cuda-minor-version: "0" @@ -827,6 +892,19 @@ jobs: dockerfile: "./backend/Dockerfile.python" context: "./backend" ubuntu-version: '2204' + - build-type: 'intel' + cuda-major-version: "" + cuda-minor-version: "" + platforms: 'linux/amd64' + tag-latest: 'auto' + tag-suffix: '-gpu-intel-vibevoice' + runs-on: 'ubuntu-latest' + base-image: "quay.io/go-skynet/intel-oneapi-base:latest" + skip-drivers: 'false' + backend: "vibevoice" + dockerfile: "./backend/Dockerfile.python" + context: "./backend" + ubuntu-version: '2204' - build-type: 'intel' cuda-major-version: "" cuda-minor-version: "" @@ -1319,6 +1397,19 @@ jobs: dockerfile: "./backend/Dockerfile.python" context: "./backend" ubuntu-version: '2204' + - build-type: '' + cuda-major-version: "" + 
cuda-minor-version: "" + platforms: 'linux/amd64,linux/arm64' + tag-latest: 'auto' + tag-suffix: '-cpu-vibevoice' + runs-on: 'ubuntu-latest' + base-image: "ubuntu:22.04" + skip-drivers: 'false' + backend: "vibevoice" + dockerfile: "./backend/Dockerfile.python" + context: "./backend" + ubuntu-version: '2204' backend-jobs-darwin: uses: ./.github/workflows/backend_build_darwin.yml strategy: diff --git a/backend/index.yaml b/backend/index.yaml index 9b89716809b9..8b7219c189ce 100644 --- a/backend/index.yaml +++ b/backend/index.yaml @@ -390,6 +390,28 @@ nvidia-cuda-12: "cuda12-chatterbox" nvidia-l4t-cuda-12: "nvidia-l4t-arm64-chatterbox" nvidia-l4t-cuda-13: "cuda13-nvidia-l4t-arm64-chatterbox" +- &vibevoice + urls: + - https://github.com/microsoft/VibeVoice + description: | + VibeVoice-Realtime is a real-time text-to-speech model that generates natural-sounding speech. + tags: + - text-to-speech + - TTS + license: mit + name: "vibevoice" + alias: "vibevoice" + capabilities: + nvidia: "cuda12-vibevoice" + intel: "intel-vibevoice" + amd: "rocm-vibevoice" + nvidia-l4t: "nvidia-l4t-vibevoice" + default: "cpu-vibevoice" + nvidia-cuda-13: "cuda13-vibevoice" + nvidia-cuda-12: "cuda12-vibevoice" + nvidia-l4t-cuda-12: "nvidia-l4t-vibevoice" + nvidia-l4t-cuda-13: "cuda13-nvidia-l4t-arm64-vibevoice" + icon: https://avatars.githubusercontent.com/u/6154722?s=200&v=4 - &piper name: "piper" uri: "quay.io/go-skynet/local-ai-backends:latest-piper" @@ -1571,3 +1593,86 @@ uri: "quay.io/go-skynet/local-ai-backends:master-nvidia-l4t-cuda-13-arm64-chatterbox" mirrors: - localai/localai-backends:master-nvidia-l4t-cuda-13-arm64-chatterbox +## vibevoice +- !!merge <<: *vibevoice + name: "vibevoice-development" + capabilities: + nvidia: "cuda12-vibevoice-development" + intel: "intel-vibevoice-development" + amd: "rocm-vibevoice-development" + nvidia-l4t: "nvidia-l4t-vibevoice-development" + default: "cpu-vibevoice-development" + nvidia-cuda-13: "cuda13-vibevoice-development" + nvidia-cuda-12: 
"cuda12-vibevoice-development" + nvidia-l4t-cuda-12: "nvidia-l4t-vibevoice-development" + nvidia-l4t-cuda-13: "cuda13-nvidia-l4t-arm64-vibevoice-development" +- !!merge <<: *vibevoice + name: "cpu-vibevoice" + uri: "quay.io/go-skynet/local-ai-backends:latest-cpu-vibevoice" + mirrors: + - localai/localai-backends:latest-cpu-vibevoice +- !!merge <<: *vibevoice + name: "cpu-vibevoice-development" + uri: "quay.io/go-skynet/local-ai-backends:master-cpu-vibevoice" + mirrors: + - localai/localai-backends:master-cpu-vibevoice +- !!merge <<: *vibevoice + name: "cuda12-vibevoice" + uri: "quay.io/go-skynet/local-ai-backends:latest-gpu-nvidia-cuda-12-vibevoice" + mirrors: + - localai/localai-backends:latest-gpu-nvidia-cuda-12-vibevoice +- !!merge <<: *vibevoice + name: "cuda12-vibevoice-development" + uri: "quay.io/go-skynet/local-ai-backends:master-gpu-nvidia-cuda-12-vibevoice" + mirrors: + - localai/localai-backends:master-gpu-nvidia-cuda-12-vibevoice +- !!merge <<: *vibevoice + name: "cuda13-vibevoice" + uri: "quay.io/go-skynet/local-ai-backends:latest-gpu-nvidia-cuda-13-vibevoice" + mirrors: + - localai/localai-backends:latest-gpu-nvidia-cuda-13-vibevoice +- !!merge <<: *vibevoice + name: "cuda13-vibevoice-development" + uri: "quay.io/go-skynet/local-ai-backends:master-gpu-nvidia-cuda-13-vibevoice" + mirrors: + - localai/localai-backends:master-gpu-nvidia-cuda-13-vibevoice +- !!merge <<: *vibevoice + name: "intel-vibevoice" + uri: "quay.io/go-skynet/local-ai-backends:latest-gpu-intel-vibevoice" + mirrors: + - localai/localai-backends:latest-gpu-intel-vibevoice +- !!merge <<: *vibevoice + name: "intel-vibevoice-development" + uri: "quay.io/go-skynet/local-ai-backends:master-gpu-intel-vibevoice" + mirrors: + - localai/localai-backends:master-gpu-intel-vibevoice +- !!merge <<: *vibevoice + name: "rocm-vibevoice" + uri: "quay.io/go-skynet/local-ai-backends:latest-gpu-rocm-hipblas-vibevoice" + mirrors: + - localai/localai-backends:latest-gpu-rocm-hipblas-vibevoice +- !!merge 
<<: *vibevoice + name: "rocm-vibevoice-development" + uri: "quay.io/go-skynet/local-ai-backends:master-gpu-rocm-hipblas-vibevoice" + mirrors: + - localai/localai-backends:master-gpu-rocm-hipblas-vibevoice +- !!merge <<: *vibevoice + name: "nvidia-l4t-vibevoice" + uri: "quay.io/go-skynet/local-ai-backends:latest-nvidia-l4t-vibevoice" + mirrors: + - localai/localai-backends:latest-nvidia-l4t-vibevoice +- !!merge <<: *vibevoice + name: "nvidia-l4t-vibevoice-development" + uri: "quay.io/go-skynet/local-ai-backends:master-nvidia-l4t-vibevoice" + mirrors: + - localai/localai-backends:master-nvidia-l4t-vibevoice +- !!merge <<: *vibevoice + name: "cuda13-nvidia-l4t-arm64-vibevoice" + uri: "quay.io/go-skynet/local-ai-backends:latest-nvidia-l4t-cuda-13-arm64-vibevoice" + mirrors: + - localai/localai-backends:latest-nvidia-l4t-cuda-13-arm64-vibevoice +- !!merge <<: *vibevoice + name: "cuda13-nvidia-l4t-arm64-vibevoice-development" + uri: "quay.io/go-skynet/local-ai-backends:master-nvidia-l4t-cuda-13-arm64-vibevoice" + mirrors: + - localai/localai-backends:master-nvidia-l4t-cuda-13-arm64-vibevoice From 715302c21a88c301709205038b101dec9638ffa9 Mon Sep 17 00:00:00 2001 From: Ettore Di Giacinto Date: Tue, 9 Dec 2025 20:41:43 +0000 Subject: [PATCH 3/5] chore(gallery): add vibevoice Signed-off-by: Ettore Di Giacinto --- backend/python/vibevoice/example-config.yaml | 101 ------------------- gallery/index.yaml | 28 +++++ gallery/vibevoice.yaml | 78 ++++++++++++++ 3 files changed, 106 insertions(+), 101 deletions(-) delete mode 100644 backend/python/vibevoice/example-config.yaml create mode 100644 gallery/vibevoice.yaml diff --git a/backend/python/vibevoice/example-config.yaml b/backend/python/vibevoice/example-config.yaml deleted file mode 100644 index 33680407ed1e..000000000000 --- a/backend/python/vibevoice/example-config.yaml +++ /dev/null @@ -1,101 +0,0 @@ -name: vibevoice-realtime -backend: vibevoice -description: | - VibeVoice-Realtime is a real-time text-to-speech model that 
generates natural-sounding speech. - This model supports voice cloning through voice preset files (.pt files). - -# Model path (HuggingFace model ID or local path) -parameters: - model: microsoft/VibeVoice-Realtime-0.5B - -# TTS configuration -tts: - # Voice selection - can be: - # 1. Voice preset name (e.g., "Frank", "en-Frank_man", "Grace") - looks for .pt files in voices/streaming_model/ - # 2. Path to a voice preset .pt file (relative to model directory or absolute) - # Available English voices: Carter, Davis, Emma, Frank, Grace, Mike - voice: "Frank" - # Alternative: use audio_path to specify a voice file directly - # audio_path: "voices/streaming_model/en-Frank_man.pt" - -known_usecases: - - tts - -# Backend-specific options -# These are passed as "key:value" strings to the backend -options: - # CFG (Classifier-Free Guidance) scale for generation (default: 1.5) - # Higher values can improve quality but may slow generation - - "cfg_scale:1.5" - - # Number of inference steps for the diffusion process (default: 5) - # More steps = better quality but slower. Typical range: 3-10 - - "inference_steps:5" - - # Enable sampling (default: false) - # When true, uses temperature and top_p for sampling - - "do_sample:false" - - # Temperature for sampling (only used if do_sample=true, default: 0.9) - - "temperature:0.9" - - # Top-p (nucleus) sampling (only used if do_sample=true, default: 0.9) - - "top_p:0.9" - - # Voices directory path - # This explicitly sets where to look for voice preset files (.pt files) - # Since we're downloading voices to voices/streaming_model/, we set it here - # - # Examples: - # - Relative path (relative to models directory): "voices/streaming_model" - # - Absolute path: "/custom/path/to/voices/streaming_model" - # - Custom relative path: "my_custom_voices/streaming_model" - # - # If not specified, the backend will auto-detect from common locations: - # 1. {ModelFile directory}/voices/streaming_model/ - # 2. 
{models_dir}/voices/streaming_model/ - # 3. Backend directory - - "voices_dir:voices/streaming_model" - -# Download voice preset files -# Voice presets are downloaded to: {models_dir}/voices/streaming_model/ -# The voices_dir option above tells the backend to look in this location -download_files: - # English voices - - filename: voices/streaming_model/en-Frank_man.pt - uri: https://raw.githubusercontent.com/microsoft/VibeVoice/main/demo/voices/streaming_model/en-Frank_man.pt - - filename: voices/streaming_model/en-Grace_woman.pt - uri: https://raw.githubusercontent.com/microsoft/VibeVoice/main/demo/voices/streaming_model/en-Grace_woman.pt - - filename: voices/streaming_model/en-Mike_man.pt - uri: https://raw.githubusercontent.com/microsoft/VibeVoice/main/demo/voices/streaming_model/en-Mike_man.pt - - filename: voices/streaming_model/en-Emma_woman.pt - uri: https://raw.githubusercontent.com/microsoft/VibeVoice/main/demo/voices/streaming_model/en-Emma_woman.pt - - filename: voices/streaming_model/en-Carter_man.pt - uri: https://raw.githubusercontent.com/microsoft/VibeVoice/main/demo/voices/streaming_model/en-Carter_man.pt - - filename: voices/streaming_model/en-Davis_man.pt - uri: https://raw.githubusercontent.com/microsoft/VibeVoice/main/demo/voices/streaming_model/en-Davis_man.pt - # Uncomment to add more languages: - # - filename: voices/streaming_model/fr-Spk0_man.pt - # uri: https://raw.githubusercontent.com/microsoft/VibeVoice/main/demo/voices/streaming_model/fr-Spk0_man.pt - # - filename: voices/streaming_model/de-Spk0_man.pt - # uri: https://raw.githubusercontent.com/microsoft/VibeVoice/main/demo/voices/streaming_model/de-Spk0_man.pt - -# Usage example -usage: | - To test this model, you can use the following curl command: - - curl http://localhost:8080/v1/audio/speech -H "Content-Type: application/json" -d '{ - "model": "vibevoice-realtime", - "input": "Hello, this is a test of the VibeVoice text-to-speech system.", - "voice": "Frank" - }' - - Or using a 
different voice: - - curl http://localhost:8080/v1/audio/speech -H "Content-Type: application/json" -d '{ - "model": "vibevoice-realtime", - "input": "Hello, this is a test.", - "voice": "Grace" - }' - - Available voices: Frank, Grace, Mike, Emma, Carter, Davis diff --git a/gallery/index.yaml b/gallery/index.yaml index faab804c3a76..c6172de18975 100644 --- a/gallery/index.yaml +++ b/gallery/index.yaml @@ -1,4 +1,32 @@ --- +- &vibevoice + url: "github:mudler/LocalAI/gallery/vibevoice.yaml@master" + icon: https://github.com/microsoft/VibeVoice/raw/main/Figures/VibeVoice_logo_white.png + license: mit + tags: + - text-to-speech + - TTS + name: "vibevoice" + urls: + - https://github.com/microsoft/VibeVoice + + # Download voice preset files + # Voice presets are downloaded to: {models_dir}/voices/streaming_model/ + # The voices_dir option above tells the backend to look in this location + files: + # English voices + - filename: voices/streaming_model/en-Frank_man.pt + uri: https://raw.githubusercontent.com/microsoft/VibeVoice/main/demo/voices/streaming_model/en-Frank_man.pt + - filename: voices/streaming_model/en-Grace_woman.pt + uri: https://raw.githubusercontent.com/microsoft/VibeVoice/main/demo/voices/streaming_model/en-Grace_woman.pt + - filename: voices/streaming_model/en-Mike_man.pt + uri: https://raw.githubusercontent.com/microsoft/VibeVoice/main/demo/voices/streaming_model/en-Mike_man.pt + - filename: voices/streaming_model/en-Emma_woman.pt + uri: https://raw.githubusercontent.com/microsoft/VibeVoice/main/demo/voices/streaming_model/en-Emma_woman.pt + - filename: voices/streaming_model/en-Carter_man.pt + uri: https://raw.githubusercontent.com/microsoft/VibeVoice/main/demo/voices/streaming_model/en-Carter_man.pt + - filename: voices/streaming_model/en-Davis_man.pt + uri: https://raw.githubusercontent.com/microsoft/VibeVoice/main/demo/voices/streaming_model/en-Davis_man.pt - &qwen3vl url: "github:mudler/LocalAI/gallery/qwen3.yaml@master" icon: 
https://cdn-avatars.huggingface.co/v1/production/uploads/620760a26e3b7210c2ff1943/-s1gyJfvbE1RgO5iBeNOi.png diff --git a/gallery/vibevoice.yaml b/gallery/vibevoice.yaml new file mode 100644 index 000000000000..f97456972259 --- /dev/null +++ b/gallery/vibevoice.yaml @@ -0,0 +1,78 @@ +--- +name: localai + +config_file: |- + name: vibevoice + backend: vibevoice + description: | + VibeVoice-Realtime is a real-time text-to-speech model that generates natural-sounding speech. + This model supports voice cloning through voice preset files (.pt files). + + parameters: + model: microsoft/VibeVoice-Realtime-0.5B + + # TTS configuration + tts: + # Voice selection - can be: + # 1. Voice preset name (e.g., "Frank", "en-Frank_man", "Grace") - looks for .pt files in voices/streaming_model/ + # 2. Path to a voice preset .pt file (relative to model directory or absolute) + # Available English voices: Carter, Davis, Emma, Frank, Grace, Mike + voice: "Frank" + # Alternative: use audio_path to specify a voice file directly + # audio_path: "voices/streaming_model/en-Frank_man.pt" + + known_usecases: + - tts + + # Backend-specific options + # These are passed as "key:value" strings to the backend + options: + # CFG (Classifier-Free Guidance) scale for generation (default: 1.5) + # Higher values can improve quality but may slow generation + - "cfg_scale:1.5" + # Number of inference steps for the diffusion process (default: 5) + # More steps = better quality but slower. 
Typical range: 3-10 + - "inference_steps:5" + # Enable sampling (default: false) + # When true, uses temperature and top_p for sampling + - "do_sample:false" + # Temperature for sampling (only used if do_sample=true, default: 0.9) + - "temperature:0.9" + # Top-p (nucleus) sampling (only used if do_sample=true, default: 0.9) + - "top_p:0.9" + # Voices directory path + # This explicitly sets where to look for voice preset files (.pt files) + # Since we're downloading voices to voices/streaming_model/, we set it here + # + # Examples: + # - Relative path (relative to models directory): "voices/streaming_model" + # - Absolute path: "/custom/path/to/voices/streaming_model" + # - Custom relative path: "my_custom_voices/streaming_model" + # + # If not specified, the backend will auto-detect from common locations: + # 1. {ModelFile directory}/voices/streaming_model/ + # 2. {models_dir}/voices/streaming_model/ + # 3. Backend directory + - "voices_dir:voices/streaming_model" + # # Download voice preset files + # # Voice presets are downloaded to: {models_dir}/voices/streaming_model/ + # # The voices_dir option above tells the backend to look in this location + # download_files: + # # English voices + # - filename: voices/streaming_model/en-Frank_man.pt + # uri: https://raw.githubusercontent.com/microsoft/VibeVoice/main/demo/voices/streaming_model/en-Frank_man.pt + # - filename: voices/streaming_model/en-Grace_woman.pt + # uri: https://raw.githubusercontent.com/microsoft/VibeVoice/main/demo/voices/streaming_model/en-Grace_woman.pt + # - filename: voices/streaming_model/en-Mike_man.pt + # uri: https://raw.githubusercontent.com/microsoft/VibeVoice/main/demo/voices/streaming_model/en-Mike_man.pt + # - filename: voices/streaming_model/en-Emma_woman.pt + # uri: https://raw.githubusercontent.com/microsoft/VibeVoice/main/demo/voices/streaming_model/en-Emma_woman.pt + # - filename: voices/streaming_model/en-Carter_man.pt + # uri: 
https://raw.githubusercontent.com/microsoft/VibeVoice/main/demo/voices/streaming_model/en-Carter_man.pt + # - filename: voices/streaming_model/en-Davis_man.pt + # uri: https://raw.githubusercontent.com/microsoft/VibeVoice/main/demo/voices/streaming_model/en-Davis_man.pt + # # Uncomment to add more languages: + # # - filename: voices/streaming_model/fr-Spk0_man.pt + # # uri: https://raw.githubusercontent.com/microsoft/VibeVoice/main/demo/voices/streaming_model/fr-Spk0_man.pt + # # - filename: voices/streaming_model/de-Spk0_man.pt + # # uri: https://raw.githubusercontent.com/microsoft/VibeVoice/main/demo/voices/streaming_model/de-Spk0_man.pt From 2b5402ac0fcdb0433cd2f7954b5f40d12e786066 Mon Sep 17 00:00:00 2001 From: Ettore Di Giacinto Date: Wed, 10 Dec 2025 18:18:36 +0100 Subject: [PATCH 4/5] Use self-hosted for intel builds Signed-off-by: Ettore Di Giacinto --- .github/workflows/backend.yml | 2 +- gallery/vibevoice.yaml | 2 +- 2 files changed, 2 insertions(+), 2 deletions(-) diff --git a/.github/workflows/backend.yml b/.github/workflows/backend.yml index d2af2b320b17..f12775aeae8b 100644 --- a/.github/workflows/backend.yml +++ b/.github/workflows/backend.yml @@ -898,7 +898,7 @@ jobs: platforms: 'linux/amd64' tag-latest: 'auto' tag-suffix: '-gpu-intel-vibevoice' - runs-on: 'ubuntu-latest' + runs-on: 'arc-runner-set' base-image: "quay.io/go-skynet/intel-oneapi-base:latest" skip-drivers: 'false' backend: "vibevoice" diff --git a/gallery/vibevoice.yaml b/gallery/vibevoice.yaml index f97456972259..a9611efd8f26 100644 --- a/gallery/vibevoice.yaml +++ b/gallery/vibevoice.yaml @@ -43,7 +43,7 @@ config_file: |- # Voices directory path # This explicitly sets where to look for voice preset files (.pt files) # Since we're downloading voices to voices/streaming_model/, we set it here - # + # # Examples: # - Relative path (relative to models directory): "voices/streaming_model" # - Absolute path: "/custom/path/to/voices/streaming_model" From 
d1a4ccb62caf3795cb7b5e67cb260239419f8ffa Mon Sep 17 00:00:00 2001 From: Ettore Di Giacinto Date: Wed, 10 Dec 2025 18:31:51 +0100 Subject: [PATCH 5/5] Pin python version for l4t Signed-off-by: Ettore Di Giacinto --- backend/python/vibevoice/install.sh | 7 +++++++ 1 file changed, 7 insertions(+) diff --git a/backend/python/vibevoice/install.sh b/backend/python/vibevoice/install.sh index 85c5f70cf35a..a7603aebce11 100755 --- a/backend/python/vibevoice/install.sh +++ b/backend/python/vibevoice/install.sh @@ -16,6 +16,13 @@ if [ "x${BUILD_PROFILE}" == "xintel" ]; then EXTRA_PIP_INSTALL_FLAGS+=" --upgrade --index-strategy=unsafe-first-match" fi +# Use python 3.12 for l4t +if [ "x${BUILD_PROFILE}" == "xl4t12" ] || [ "x${BUILD_PROFILE}" == "xl4t13" ]; then + PYTHON_VERSION="3.12" + PYTHON_PATCH="12" + PY_STANDALONE_TAG="20251120" +fi + installRequirements git clone https://github.com/microsoft/VibeVoice.git