Skip to content
Open
Show file tree
Hide file tree
Changes from all commits
Commits
File filter

Filter by extension

Filter by extension


Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
39 changes: 39 additions & 0 deletions README.md
Original file line number Diff line number Diff line change
Expand Up @@ -128,6 +128,14 @@ To use the screenshot functionality:
sudo apt install gnome-screenshot
```

#### Transcription Backend
- `TRANSCRIPTION_BACKEND`: Choose between `whisper` (default) and `voxtral`.
```bash
export TRANSCRIPTION_BACKEND="voxtral"
```
- `VOXTRAL_URL`: Set the Voxtral API URL (default: "http://localhost:8000/v1")
- `VOXTRAL_MODEL`: Set the Voxtral model name (default: "mistralai/Voxtral-Mini-4B-Realtime-2602")

## Usage Modes 💡

VibeVoice supports two modes:
Expand All @@ -144,6 +152,37 @@ VibeVoice supports two modes:
3. Release the key
4. The AI will analyze your request (and current screen if enabled) and type a response

## Voxtral Realtime Support 🎙️

You can use the new Mistral Voxtral real-time model instead of the local Whisper model.

### Setup Instructions

1. **Install vLLM (nightly version):**
```bash
uv pip install -U vllm --torch-backend=auto --extra-index-url https://wheels.vllm.ai/nightly
```
Ensure `mistral_common >= 1.9.0` is installed.

2. **Start the model via vLLM:**
```bash
vllm serve mistralai/Voxtral-Mini-4B-Realtime-2602 --host 0.0.0.0 --port 8000
```
*Note: The model requires a GPU with at least 16 GB VRAM.*

3. **Run VibeVoice with Voxtral backend:**
```bash
export TRANSCRIPTION_BACKEND="voxtral"
python src/vibevoice/cli.py
```

### Optional Optimizations
Audio resampling performance improves when the optional audio libraries are available. They are already listed in `requirements.txt`, so a standard install includes them; if you installed dependencies another way, add them manually:
```bash
pip install soxr librosa soundfile
```

## Credits 🙏

- Original inspiration: [whisper-keyboard](https://github.com/vlad-ds/whisper-keyboard) by Vlad
Expand Down
8 changes: 7 additions & 1 deletion pyproject.toml
Original file line number Diff line number Diff line change
Expand Up @@ -24,7 +24,13 @@ dependencies = [
"numpy>=1.26.0",
"requests==2.32.3",
"pynput==1.7.8",
"scipy==1.16.1"
"scipy==1.16.1",
"pyautogui==0.9.54",
"Pillow==11.1.0",
"openai>=1.0.0",
"soundfile",
"librosa",
"soxr"
]

[project.scripts]
Expand Down
4 changes: 4 additions & 0 deletions requirements.txt
Original file line number Diff line number Diff line change
Expand Up @@ -14,3 +14,7 @@ pynput==1.7.8
scipy==1.16.1
pyautogui==0.9.54
Pillow==11.1.0
openai>=1.0.0
soundfile
librosa
soxr
157 changes: 139 additions & 18 deletions src/vibevoice/cli.py
Original file line number Diff line number Diff line change
Expand Up @@ -9,6 +9,13 @@
import requests
import sys
import base64
import threading
import tempfile

try:
from openai import OpenAI
except ImportError:
print("OpenAI library not found. Voxtral backend will not work. Install with: pip install openai")

SCREENSHOT_AVAILABLE = False
try:
Expand Down Expand Up @@ -144,6 +151,25 @@ def _process_llm_cmd(keyboard_controller, transcript):
finally:
loading_indicator.hide()

def transcribe_with_voxtral(file_path):
    """Send an audio file to a Voxtral (OpenAI-compatible) endpoint and return the transcript.

    Connection settings come from the VOXTRAL_URL, VOXTRAL_MODEL and
    VOXTRAL_API_KEY environment variables (with local-server defaults).
    Returns the transcribed text, or None if anything goes wrong
    (server unreachable, unreadable file, OpenAI client not installed).
    """
    endpoint = os.getenv('VOXTRAL_URL', 'http://localhost:8000/v1')
    model_name = os.getenv('VOXTRAL_MODEL', 'mistralai/Voxtral-Mini-4B-Realtime-2602')
    key = os.getenv('VOXTRAL_API_KEY', 'sk-no-key-required')

    try:
        voxtral_client = OpenAI(base_url=endpoint, api_key=key)
        with open(file_path, "rb") as audio_file:
            result = voxtral_client.audio.transcriptions.create(
                model=model_name,
                file=audio_file,
            )
        return result.text
    except Exception as e:
        # Best-effort: the caller treats None as "no transcript available".
        print(f"Error transcribing with Voxtral: {e}")
        return None

def main():
load_dotenv()
key_label = os.environ.get("VOICEKEY", "ctrl_r")
Expand All @@ -152,20 +178,97 @@ def main():
CMD_KEY = Key[cmd_label]
# CMD_KEY = KeyCode(vk=65027) # This is how you can use non-standard keys, this is AltGr for me

transcription_backend = os.getenv('TRANSCRIPTION_BACKEND', 'whisper').lower()

recording = False
audio_data = []
last_typed_text = ""
sample_rate = 16000
keyboard_controller = KeyboardController()
text_lock = threading.Lock()

def on_press(key):
def update_text(new_text, final=False):
nonlocal last_typed_text
if new_text is None:
return

with text_lock:
# If we are not doing a final update and recording has stopped,
# abort to avoid overwriting the final result with a partial one.
if not final and not recording:
return

# Calculate common prefix
common_len = 0
min_len = min(len(last_typed_text), len(new_text))
for i in range(min_len):
if last_typed_text[i] == new_text[i]:
common_len += 1
else:
break

backspaces = len(last_typed_text) - common_len
to_type = new_text[common_len:]

if backspaces > 0:
for _ in range(backspaces):
keyboard_controller.press(Key.backspace)
keyboard_controller.release(Key.backspace)

if to_type:
keyboard_controller.type(to_type)

last_typed_text = new_text

def streaming_worker():
nonlocal recording, audio_data

while recording:
time.sleep(0.8)
if not recording:
break

try:
# We need at least some audio to transcribe
if not audio_data:
continue

# Use slice copy to be thread-safe
current_audio = np.concatenate(audio_data[:], axis=0)

# Skip if audio is too short (less than 0.5s)
if len(current_audio) < sample_rate * 0.5:
continue

# Save temp file
with tempfile.NamedTemporaryFile(suffix=".wav", delete=False) as temp_wav:
temp_wav_path = temp_wav.name

audio_data_int16 = (current_audio * np.iinfo(np.int16).max).astype(np.int16)
wavfile.write(temp_wav_path, sample_rate, audio_data_int16)

transcript = transcribe_with_voxtral(temp_wav_path)
os.remove(temp_wav_path)

if transcript:
update_text(transcript, final=False)

except Exception as e:
print(f"Error in streaming worker: {e}")

def on_press(key):
nonlocal recording, audio_data, last_typed_text
if key == RECORD_KEY or key == CMD_KEY and not recording:
recording = True
audio_data = []
last_typed_text = ""
print("Listening...")

if transcription_backend == 'voxtral' and key == RECORD_KEY:
threading.Thread(target=streaming_worker, daemon=True).start()

def on_release(key):
nonlocal recording, audio_data
nonlocal recording, audio_data, last_typed_text
if key == RECORD_KEY or key == CMD_KEY:
recording = False
print("Transcribing...")
Expand All @@ -181,15 +284,26 @@ def on_release(key):
wavfile.write(recording_path, sample_rate, audio_data_int16)

try:
response = requests.post('http://localhost:4242/transcribe/',
json={'file_path': recording_path})
response.raise_for_status()
transcript = response.json()['text']
transcript = None

if transcription_backend == 'voxtral':
transcript = transcribe_with_voxtral(recording_path)
else:
# Default to local whisper server
response = requests.post('http://localhost:4242/transcribe/',
json={'file_path': recording_path})
response.raise_for_status()
transcript = response.json()['text']

if transcript and key == RECORD_KEY:
processed_transcript = transcript + " "
print(processed_transcript)
keyboard_controller.type(processed_transcript)
if transcription_backend == 'voxtral':
# For Voxtral, we might have already typed some text, so we update it
# Adding a trailing space as per original behavior
update_text(transcript + " ", final=True)
else:
processed_transcript = transcript + " "
print(processed_transcript)
keyboard_controller.type(processed_transcript)
elif transcript and key == CMD_KEY:
_process_llm_cmd(keyboard_controller, transcript)
except requests.exceptions.RequestException as e:
Expand All @@ -203,23 +317,30 @@ def callback(indata, frames, time, status):
if recording:
audio_data.append(indata.copy())

server_process = start_whisper_server()

server_process = None
if transcription_backend == 'whisper':
server_process = start_whisper_server()
try:
print(f"Waiting for the server to be ready...")
wait_for_server()
except TimeoutError as e:
print(f"Error: {e}")
if server_process:
server_process.terminate()
sys.exit(1)
elif transcription_backend == 'voxtral':
print("Using Voxtral backend. Make sure the Voxtral server is running (e.g. vLLM).")

try:
print(f"Waiting for the server to be ready...")
wait_for_server()
print(f"vibevoice is active. Hold down {key_label} to start dictating.")
with Listener(on_press=on_press, on_release=on_release) as listener:
with sd.InputStream(callback=callback, channels=1, samplerate=sample_rate):
listener.join()
except TimeoutError as e:
print(f"Error: {e}")
server_process.terminate()
sys.exit(1)
except KeyboardInterrupt:
print("\nStopping...")
finally:
server_process.terminate()
if server_process:
server_process.terminate()

if __name__ == "__main__":
main()