From 97a34302cb7856df93c21046e3a35af0624e4cf1 Mon Sep 17 00:00:00 2001 From: mpaepper <4135790+mpaepper@users.noreply.github.com> Date: Thu, 5 Feb 2026 09:04:02 +0000 Subject: [PATCH 1/2] Add support for Mistral Voxtral Realtime model - Add `TRANSCRIPTION_BACKEND` env var to switch between 'whisper' (default) and 'voxtral'. - Implement Voxtral transcription using OpenAI-compatible API. - Update `requirements.txt` and `pyproject.toml` with `openai` and audio libs. - Update README with setup instructions. --- README.md | 39 ++++++++++++++++++++++++++ pyproject.toml | 8 +++++- requirements.txt | 4 +++ src/vibevoice/cli.py | 65 +++++++++++++++++++++++++++++++++++--------- 4 files changed, 102 insertions(+), 14 deletions(-) diff --git a/README.md b/README.md index fb292b6..d644767 100644 --- a/README.md +++ b/README.md @@ -128,6 +128,14 @@ To use the screenshot functionality: sudo apt install gnome-screenshot ``` +#### Transcription Backend +- `TRANSCRIPTION_BACKEND`: Choose between `whisper` (default) and `voxtral`. + ```bash + export TRANSCRIPTION_BACKEND="voxtral" + ``` +- `VOXTRAL_URL`: Set the Voxtral API URL (default: "http://localhost:8000/v1") +- `VOXTRAL_MODEL`: Set the Voxtral model name (default: "mistralai/Voxtral-Mini-4B-Realtime-2602") + ## Usage Modes 💡 VibeVoice supports two modes: @@ -144,6 +152,37 @@ VibeVoice supports two modes: 3. Release the key 4. The AI will analyze your request (and current screen if enabled) and type a response +## Voxtral Realtime Support 🎙️ + +You can use the new Mistral Voxtral real-time model instead of the local Whisper model. + +### Setup Instructions + +1. **Install vLLM (nightly version):** + ```bash + uv pip install -U vllm --torch-backend=auto --extra-index-url https://wheels.vllm.ai/nightly + ``` + Ensure `mistral_common >= 1.9.0` is installed. + +2. **Start the model via vLLM:** + ```bash + vllm serve mistralai/Voxtral-Mini-4B-Realtime-2602 --host 0.0.0.0 --port 8000 + ``` + *Note: The model requires a GPU with at least 16 GB VRAM.* + +3. **Run VibeVoice with Voxtral backend:** + ```bash + export TRANSCRIPTION_BACKEND="voxtral" + python src/vibevoice/cli.py + ``` + +### Optional Optimizations +Install additional audio libraries for better performance: +```bash +pip install soxr librosa soundfile +``` +(Note: `soundfile`, `librosa`, and `soxr` are already included in `requirements.txt`) + ## Credits 🙏 - Original inspiration: [whisper-keyboard](https://github.com/vlad-ds/whisper-keyboard) by Vlad diff --git a/pyproject.toml b/pyproject.toml index c6617a7..2a563b7 100644 --- a/pyproject.toml +++ b/pyproject.toml @@ -24,7 +24,13 @@ dependencies = [ "numpy>=1.26.0", "requests==2.32.3", "pynput==1.7.8", - "scipy==1.16.1" + "scipy==1.16.1", + "pyautogui==0.9.54", + "Pillow==11.1.0", + "openai>=1.0.0", + "soundfile", + "librosa", + "soxr" ] [project.scripts] diff --git a/requirements.txt b/requirements.txt index 781c02f..760c60f 100644 --- a/requirements.txt +++ b/requirements.txt @@ -14,3 +14,7 @@ pynput==1.7.8 scipy==1.16.1 pyautogui==0.9.54 Pillow==11.1.0 +openai>=1.0.0 +soundfile +librosa +soxr diff --git a/src/vibevoice/cli.py b/src/vibevoice/cli.py index f8524df..e6a325c 100644 --- a/src/vibevoice/cli.py +++ b/src/vibevoice/cli.py @@ -10,6 +10,11 @@ import sys import base64 +try: + from openai import OpenAI +except ImportError: + print("OpenAI library not found. Voxtral backend will not work. Install with: pip install openai") + SCREENSHOT_AVAILABLE = False try: import pyautogui @@ -144,6 +149,25 @@ def _process_llm_cmd(keyboard_controller, transcript): finally: loading_indicator.hide() +def transcribe_with_voxtral(file_path): + """Transcribe audio using Voxtral Realtime model via OpenAI-compatible API.""" + try: + base_url = os.getenv('VOXTRAL_URL', 'http://localhost:8000/v1') + model = os.getenv('VOXTRAL_MODEL', 'mistralai/Voxtral-Mini-4B-Realtime-2602') + api_key = os.getenv('VOXTRAL_API_KEY', 'sk-no-key-required') + + client = OpenAI(base_url=base_url, api_key=api_key) + + with open(file_path, "rb") as audio_file: + response = client.audio.transcriptions.create( + model=model, + file=audio_file + ) + return response.text + except Exception as e: + print(f"Error transcribing with Voxtral: {e}") + return None + def main(): load_dotenv() key_label = os.environ.get("VOICEKEY", "ctrl_r") @@ -152,6 +176,8 @@ def main(): CMD_KEY = Key[cmd_label] # CMD_KEY = KeyCode(vk=65027) # This is how you can use non-standard keys, this is AltGr for me + transcription_backend = os.getenv('TRANSCRIPTION_BACKEND', 'whisper').lower() + recording = False audio_data = [] sample_rate = 16000 @@ -181,10 +207,16 @@ def on_release(key): wavfile.write(recording_path, sample_rate, audio_data_int16) try: - response = requests.post('http://localhost:4242/transcribe/', - json={'file_path': recording_path}) - response.raise_for_status() - transcript = response.json()['text'] + transcript = None + + if transcription_backend == 'voxtral': + transcript = transcribe_with_voxtral(recording_path) + else: + # Default to local whisper server + response = requests.post('http://localhost:4242/transcribe/', + json={'file_path': recording_path}) + response.raise_for_status() + transcript = response.json()['text'] if transcript and key == RECORD_KEY: processed_transcript = transcript + " " @@ -203,23 +235,30 @@ def callback(indata, frames, time, status): if recording: audio_data.append(indata.copy()) - server_process = start_whisper_server() - + server_process = None + if transcription_backend == 'whisper': + server_process = start_whisper_server() + try: + print(f"Waiting for the server to be ready...") + wait_for_server() + except TimeoutError as e: + print(f"Error: {e}") + if server_process: + server_process.terminate() + sys.exit(1) + elif transcription_backend == 'voxtral': + print("Using Voxtral backend. Make sure the Voxtral server is running (e.g. vLLM).") + try: - print(f"Waiting for the server to be ready...") - wait_for_server() print(f"vibevoice is active. Hold down {key_label} to start dictating.") with Listener(on_press=on_press, on_release=on_release) as listener: with sd.InputStream(callback=callback, channels=1, samplerate=sample_rate): listener.join() - except TimeoutError as e: - print(f"Error: {e}") - server_process.terminate() - sys.exit(1) except KeyboardInterrupt: print("\nStopping...") finally: - server_process.terminate() + if server_process: + server_process.terminate() if __name__ == "__main__": main() From 887877634e164f5b6ec5c77853dfc5d28c88ca1f Mon Sep 17 00:00:00 2001 From: mpaepper <4135790+mpaepper@users.noreply.github.com> Date: Thu, 5 Feb 2026 09:18:28 +0000 Subject: [PATCH 2/2] Add real-time streaming support for Voxtral backend - Update `cli.py` to support `TRANSCRIPTION_BACKEND` env var. - Implement background thread for periodic transcription (simulated streaming). - Implement diff-based text updating (backspacing) for real-time feedback. - Add `text_lock` for thread safety. - Update `requirements.txt` and `pyproject.toml`. - Update README with Voxtral setup instructions. --- src/vibevoice/cli.py | 92 +++++++++++++++++++++++++++++++++++++++++--- 1 file changed, 87 insertions(+), 5 deletions(-) diff --git a/src/vibevoice/cli.py b/src/vibevoice/cli.py index e6a325c..b7894f5 100644 --- a/src/vibevoice/cli.py +++ b/src/vibevoice/cli.py @@ -9,6 +9,8 @@ import requests import sys import base64 +import threading +import tempfile try: from openai import OpenAI @@ -180,18 +182,93 @@ def main(): recording = False audio_data = [] + last_typed_text = "" sample_rate = 16000 keyboard_controller = KeyboardController() + text_lock = threading.Lock() - def on_press(key): + def update_text(new_text, final=False): + nonlocal last_typed_text + if new_text is None: + return + + with text_lock: + # If we are not doing a final update and recording has stopped, + # abort to avoid overwriting the final result with a partial one. + if not final and not recording: + return + + # Calculate common prefix + common_len = 0 + min_len = min(len(last_typed_text), len(new_text)) + for i in range(min_len): + if last_typed_text[i] == new_text[i]: + common_len += 1 + else: + break + + backspaces = len(last_typed_text) - common_len + to_type = new_text[common_len:] + + if backspaces > 0: + for _ in range(backspaces): + keyboard_controller.press(Key.backspace) + keyboard_controller.release(Key.backspace) + + if to_type: + keyboard_controller.type(to_type) + + last_typed_text = new_text + + def streaming_worker(): nonlocal recording, audio_data + + while recording: + time.sleep(0.8) + if not recording: + break + + try: + # We need at least some audio to transcribe + if not audio_data: + continue + + # Use slice copy to be thread-safe + current_audio = np.concatenate(audio_data[:], axis=0) + + # Skip if audio is too short (less than 0.5s) + if len(current_audio) < sample_rate * 0.5: + continue + + # Save temp file + with tempfile.NamedTemporaryFile(suffix=".wav", delete=False) as temp_wav: + temp_wav_path = temp_wav.name + + audio_data_int16 = (current_audio * np.iinfo(np.int16).max).astype(np.int16) + wavfile.write(temp_wav_path, sample_rate, audio_data_int16) + + transcript = transcribe_with_voxtral(temp_wav_path) + os.remove(temp_wav_path) + + if transcript: + update_text(transcript, final=False) + + except Exception as e: + print(f"Error in streaming worker: {e}") + + def on_press(key): + nonlocal recording, audio_data, last_typed_text if key == RECORD_KEY or key == CMD_KEY and not recording: recording = True audio_data = [] + last_typed_text = "" print("Listening...") + if transcription_backend == 'voxtral' and key == RECORD_KEY: + threading.Thread(target=streaming_worker, daemon=True).start() + def on_release(key): - nonlocal recording, audio_data + nonlocal recording, audio_data, last_typed_text if key == RECORD_KEY or key == CMD_KEY: recording = False print("Transcribing...") @@ -219,9 +296,14 @@ def on_release(key): transcript = response.json()['text'] if transcript and key == RECORD_KEY: - processed_transcript = transcript + " " - print(processed_transcript) - keyboard_controller.type(processed_transcript) + if transcription_backend == 'voxtral': + # For Voxtral, we might have already typed some text, so we update it + # Adding a trailing space as per original behavior + update_text(transcript + " ", final=True) + else: + processed_transcript = transcript + " " + print(processed_transcript) + keyboard_controller.type(processed_transcript) elif transcript and key == CMD_KEY: _process_llm_cmd(keyboard_controller, transcript) except requests.exceptions.RequestException as e: