From 97a34302cb7856df93c21046e3a35af0624e4cf1 Mon Sep 17 00:00:00 2001
From: mpaepper <4135790+mpaepper@users.noreply.github.com>
Date: Thu, 5 Feb 2026 09:04:02 +0000
Subject: [PATCH 1/2] Add support for Mistral Voxtral Realtime model

- Add `TRANSCRIPTION_BACKEND` env var to switch between 'whisper' (default) and 'voxtral'.
- Implement Voxtral transcription using OpenAI-compatible API.
- Update `requirements.txt` and `pyproject.toml` with `openai` and audio libs (`soundfile`, `librosa`, `soxr`); also add `pyautogui` and `Pillow` to `pyproject.toml`.
- Update README with setup instructions.
---
 README.md            | 39 ++++++++++++++++++++++++++
 pyproject.toml       |  8 +++++-
 requirements.txt     |  4 +++
 src/vibevoice/cli.py | 65 +++++++++++++++++++++++++++++++++++---------
 4 files changed, 102 insertions(+), 14 deletions(-)

diff --git a/README.md b/README.md
index fb292b6..d644767 100644
--- a/README.md
+++ b/README.md
@@ -128,6 +128,14 @@ To use the screenshot functionality:
 sudo apt install gnome-screenshot
 ```
 
+#### Transcription Backend
+- `TRANSCRIPTION_BACKEND`: Choose between `whisper` (default) and `voxtral`.
+  ```bash
+  export TRANSCRIPTION_BACKEND="voxtral"
+  ```
+- `VOXTRAL_URL`: Set the Voxtral API URL (default: "http://localhost:8000/v1")
+- `VOXTRAL_MODEL`: Set the Voxtral model name (default: "mistralai/Voxtral-Mini-4B-Realtime-2602"); `VOXTRAL_API_KEY` optionally sets the API key sent to the server (default: "sk-no-key-required")
+
 ## Usage Modes 💡
 
 VibeVoice supports two modes:
@@ -144,6 +152,37 @@ VibeVoice supports two modes:
 3. Release the key
 4. The AI will analyze your request (and current screen if enabled) and type a response
 
+## Voxtral Realtime Support 🎙️
+
+You can use the new Mistral Voxtral real-time model instead of the local Whisper model.
+
+### Setup Instructions
+
+1.  **Install vLLM (nightly version):**
+    ```bash
+    uv pip install -U vllm --torch-backend=auto --extra-index-url https://wheels.vllm.ai/nightly
+    ```
+    Ensure `mistral_common >= 1.9.0` is installed.
+
+2.  **Start the model via vLLM:**
+    ```bash
+    vllm serve mistralai/Voxtral-Mini-4B-Realtime-2602 --host 0.0.0.0 --port 8000
+    ```
+    *Note: The model requires a GPU with at least 16 GB VRAM.*
+
+3.  **Run VibeVoice with Voxtral backend:**
+    ```bash
+    export TRANSCRIPTION_BACKEND="voxtral"
+    python src/vibevoice/cli.py
+    ```
+
+### Optional Optimizations
+Install additional audio libraries for better performance:
+```bash
+pip install soxr librosa soundfile
+```
+(Note: `soundfile`, `librosa`, and `soxr` are already listed in `requirements.txt` and `pyproject.toml`, so this step is only needed if you installed the dependencies another way.)
+
 ## Credits 🙏
 
 - Original inspiration: [whisper-keyboard](https://github.com/vlad-ds/whisper-keyboard) by Vlad
diff --git a/pyproject.toml b/pyproject.toml
index c6617a7..2a563b7 100644
--- a/pyproject.toml
+++ b/pyproject.toml
@@ -24,7 +24,13 @@ dependencies = [
     "numpy>=1.26.0",
     "requests==2.32.3",
     "pynput==1.7.8",
-    "scipy==1.16.1"
+    "scipy==1.16.1",
+    "pyautogui==0.9.54",
+    "Pillow==11.1.0",
+    "openai>=1.0.0",
+    "soundfile",
+    "librosa",
+    "soxr"
 ]
 
 [project.scripts]
diff --git a/requirements.txt b/requirements.txt
index 781c02f..760c60f 100644
--- a/requirements.txt
+++ b/requirements.txt
@@ -14,3 +14,7 @@ pynput==1.7.8
 scipy==1.16.1
 pyautogui==0.9.54
 Pillow==11.1.0
+openai>=1.0.0
+soundfile
+librosa
+soxr
diff --git a/src/vibevoice/cli.py b/src/vibevoice/cli.py
index f8524df..e6a325c 100644
--- a/src/vibevoice/cli.py
+++ b/src/vibevoice/cli.py
@@ -10,6 +10,11 @@
 import sys
 import base64
 
+try:
+    from openai import OpenAI
+except ImportError:
+    print("OpenAI library not found. Voxtral backend will not work. Install with: pip install openai")
+
 SCREENSHOT_AVAILABLE = False
 try:
     import pyautogui
@@ -144,6 +149,25 @@ def _process_llm_cmd(keyboard_controller, transcript):
     finally:
         loading_indicator.hide()
 
+def transcribe_with_voxtral(file_path):
+    """Transcribe audio using Voxtral Realtime model via OpenAI-compatible API."""
+    try:
+        base_url = os.getenv('VOXTRAL_URL', 'http://localhost:8000/v1')
+        model = os.getenv('VOXTRAL_MODEL', 'mistralai/Voxtral-Mini-4B-Realtime-2602')
+        api_key = os.getenv('VOXTRAL_API_KEY', 'sk-no-key-required')
+
+        client = OpenAI(base_url=base_url, api_key=api_key)
+
+        with open(file_path, "rb") as audio_file:
+            response = client.audio.transcriptions.create(
+                model=model,
+                file=audio_file
+            )
+        return response.text
+    except Exception as e:
+        print(f"Error transcribing with Voxtral: {e}")
+        return None
+
 def main():
     load_dotenv()
     key_label = os.environ.get("VOICEKEY", "ctrl_r")
@@ -152,6 +176,8 @@ def main():
     CMD_KEY = Key[cmd_label]
 #    CMD_KEY = KeyCode(vk=65027)  # This is how you can use non-standard keys, this is AltGr for me
 
+    transcription_backend = os.getenv('TRANSCRIPTION_BACKEND', 'whisper').lower()
+
     recording = False
     audio_data = []
     sample_rate = 16000
@@ -181,10 +207,16 @@ def on_release(key):
             wavfile.write(recording_path, sample_rate, audio_data_int16)
 
             try:
-                response = requests.post('http://localhost:4242/transcribe/', 
-                                      json={'file_path': recording_path})
-                response.raise_for_status()
-                transcript = response.json()['text']
+                transcript = None
+
+                if transcription_backend == 'voxtral':
+                    transcript = transcribe_with_voxtral(recording_path)
+                else:
+                    # Default to local whisper server
+                    response = requests.post('http://localhost:4242/transcribe/',
+                                          json={'file_path': recording_path})
+                    response.raise_for_status()
+                    transcript = response.json()['text']
                 
                 if transcript and key == RECORD_KEY:
                     processed_transcript = transcript + " "
@@ -203,23 +235,30 @@ def callback(indata, frames, time, status):
         if recording:
             audio_data.append(indata.copy())
 
-    server_process = start_whisper_server()
-    
+    server_process = None
+    if transcription_backend == 'whisper':
+        server_process = start_whisper_server()
+        try:
+            print(f"Waiting for the server to be ready...")
+            wait_for_server()
+        except TimeoutError as e:
+            print(f"Error: {e}")
+            if server_process:
+                server_process.terminate()
+            sys.exit(1)
+    elif transcription_backend == 'voxtral':
+        print("Using Voxtral backend. Make sure the Voxtral server is running (e.g. vLLM).")
+
     try:
-        print(f"Waiting for the server to be ready...")
-        wait_for_server()
         print(f"vibevoice is active. Hold down {key_label} to start dictating.")
         with Listener(on_press=on_press, on_release=on_release) as listener:
             with sd.InputStream(callback=callback, channels=1, samplerate=sample_rate):
                 listener.join()
-    except TimeoutError as e:
-        print(f"Error: {e}")
-        server_process.terminate()
-        sys.exit(1)
     except KeyboardInterrupt:
         print("\nStopping...")
     finally:
-        server_process.terminate()
+        if server_process:
+            server_process.terminate()
 
 if __name__ == "__main__":
     main()

From 887877634e164f5b6ec5c77853dfc5d28c88ca1f Mon Sep 17 00:00:00 2001
From: mpaepper <4135790+mpaepper@users.noreply.github.com>
Date: Thu, 5 Feb 2026 09:18:28 +0000
Subject: [PATCH 2/2] Add real-time streaming support for Voxtral backend

- Extend `cli.py` so that the `voxtral` value of `TRANSCRIPTION_BACKEND` (introduced in patch 1/2) types partial transcripts while the record key is held.
- Implement background thread for periodic transcription (simulated streaming).
- Implement diff-based text updating (backspacing) for real-time feedback.
- Add `text_lock` for thread safety.
- No changes to `requirements.txt`, `pyproject.toml`, or the README in this commit; those were updated in patch 1/2.
  This commit only touches `src/vibevoice/cli.py`.
---
 src/vibevoice/cli.py | 92 +++++++++++++++++++++++++++++++++++++++++---
 1 file changed, 87 insertions(+), 5 deletions(-)

diff --git a/src/vibevoice/cli.py b/src/vibevoice/cli.py
index e6a325c..b7894f5 100644
--- a/src/vibevoice/cli.py
+++ b/src/vibevoice/cli.py
@@ -9,6 +9,8 @@
 import requests
 import sys
 import base64
+import threading
+import tempfile
 
 try:
     from openai import OpenAI
@@ -180,18 +182,93 @@ def main():
 
     recording = False
     audio_data = []
+    last_typed_text = ""
     sample_rate = 16000
     keyboard_controller = KeyboardController()
+    text_lock = threading.Lock()
 
-    def on_press(key):
+    def update_text(new_text, final=False):
+        nonlocal last_typed_text
+        if new_text is None:
+            return
+
+        with text_lock:
+            # If we are not doing a final update and recording has stopped,
+            # abort to avoid overwriting the final result with a partial one.
+            if not final and not recording:
+                return
+
+            # Calculate common prefix
+            common_len = 0
+            min_len = min(len(last_typed_text), len(new_text))
+            for i in range(min_len):
+                if last_typed_text[i] == new_text[i]:
+                    common_len += 1
+                else:
+                    break
+
+            backspaces = len(last_typed_text) - common_len
+            to_type = new_text[common_len:]
+
+            if backspaces > 0:
+                for _ in range(backspaces):
+                    keyboard_controller.press(Key.backspace)
+                    keyboard_controller.release(Key.backspace)
+
+            if to_type:
+                keyboard_controller.type(to_type)
+
+            last_typed_text = new_text
+
+    def streaming_worker():
         nonlocal recording, audio_data
+
+        while recording:
+            time.sleep(0.8)
+            if not recording:
+                break
+
+            try:
+                # We need at least some audio to transcribe
+                if not audio_data:
+                    continue
+
+                # Use slice copy to be thread-safe
+                current_audio = np.concatenate(audio_data[:], axis=0)
+
+                # Skip if audio is too short (less than 0.5s)
+                if len(current_audio) < sample_rate * 0.5:
+                    continue
+
+                # Save temp file
+                with tempfile.NamedTemporaryFile(suffix=".wav", delete=False) as temp_wav:
+                    temp_wav_path = temp_wav.name
+
+                audio_data_int16 = (current_audio * np.iinfo(np.int16).max).astype(np.int16)
+                wavfile.write(temp_wav_path, sample_rate, audio_data_int16)
+
+                transcript = transcribe_with_voxtral(temp_wav_path)
+                os.remove(temp_wav_path)
+
+                if transcript:
+                    update_text(transcript, final=False)
+
+            except Exception as e:
+                print(f"Error in streaming worker: {e}")
+
+    def on_press(key):
+        nonlocal recording, audio_data, last_typed_text
         if key == RECORD_KEY or key == CMD_KEY and not recording:
             recording = True
             audio_data = []
+            last_typed_text = ""
             print("Listening...")
 
+            if transcription_backend == 'voxtral' and key == RECORD_KEY:
+                threading.Thread(target=streaming_worker, daemon=True).start()
+
     def on_release(key):
-        nonlocal recording, audio_data
+        nonlocal recording, audio_data, last_typed_text
         if key == RECORD_KEY or key == CMD_KEY:
             recording = False
             print("Transcribing...")
@@ -219,9 +296,14 @@ def on_release(key):
                     transcript = response.json()['text']
                 
                 if transcript and key == RECORD_KEY:
-                    processed_transcript = transcript + " "
-                    print(processed_transcript)
-                    keyboard_controller.type(processed_transcript)
+                    if transcription_backend == 'voxtral':
+                        # For Voxtral, we might have already typed some text, so we update it
+                        # Adding a trailing space as per original behavior
+                        update_text(transcript + " ", final=True)
+                    else:
+                        processed_transcript = transcript + " "
+                        print(processed_transcript)
+                        keyboard_controller.type(processed_transcript)
                 elif transcript and key == CMD_KEY:
                     _process_llm_cmd(keyboard_controller, transcript)
             except requests.exceptions.RequestException as e: