Skip to content
Open
Show file tree
Hide file tree
Changes from all commits
Commits
File filter

Filter by extension

Filter by extension


Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
1,115 changes: 496 additions & 619 deletions Cargo.lock

Large diffs are not rendered by default.

6 changes: 6 additions & 0 deletions Cargo.toml
Original file line number Diff line number Diff line change
Expand Up @@ -137,11 +137,17 @@ tempfile = "3"

# Prometheus metrics (optional, behind "metrics" feature)
prometheus = { version = "0.13", optional = true }
whisper-rs = { version = "0.15", optional = true, features = ["vulkan"] }
hf-hub = { version = "0.5", optional = true }
symphonia = { version = "0.5", features = ["mp3", "aac", "flac", "ogg", "wav", "isomp4"], optional = true }
ogg = { version = "0.9", optional = true }
opus = { version = "0.3", optional = true }
pdf-extract = "0.10.0"
open = "5.3.3"
urlencoding = "2.1.3"

[features]
stt-whisper = ["dep:whisper-rs", "dep:hf-hub", "dep:symphonia", "dep:ogg", "dep:opus"]
metrics = ["dep:prometheus"]

[lints.clippy]
Expand Down
24 changes: 24 additions & 0 deletions README.md
Original file line number Diff line number Diff line change
Expand Up @@ -193,6 +193,30 @@ channel = "my-provider/my-model"

Additional built-in providers include **NVIDIA**, **MiniMax**, **Moonshot AI (Kimi)**, and **Z.AI Coding Plan** — configure with `nvidia_key`, `minimax_key`, `moonshot_key`, or `zai_coding_plan_key` in `[llm]`.

### Voice Transcription

Audio attachments (voice messages, audio files) are transcribed before being passed to the channel. Set `routing.voice` to choose the backend:

**Provider-based** — route through any configured LLM provider that supports audio input:

```toml
[defaults.routing]
voice = "openai/whisper-1"
```

**Local Whisper** (requires building with `--features stt-whisper`) — run inference locally via [whisper-rs](https://codeberg.org/tazz4843/whisper-rs); no API call needed:

```toml
[defaults.routing]
voice = "whisper-local://small"
```

The model is downloaded automatically from [`ggerganov/whisper.cpp`](https://huggingface.co/ggerganov/whisper.cpp) on first use and cached in `~/.cache/huggingface/hub`. Supported size names: `tiny`, `tiny.en`, `base`, `base.en`, `small`, `small.en`, `medium`, `medium.en`, `large`, `large-v1`, `large-v2`, `large-v3`. An absolute path to a GGML model file also works.

GPU acceleration via Vulkan is enabled automatically when a compatible device is detected. The loaded model is cached for the process lifetime — restart to switch models.

Ogg/Opus audio (the format Telegram uses for voice messages) is decoded with a built-in decoder; all other supported formats are decoded via symphonia.

### Skills

Extensible skill system integrated with [skills.sh](https://skills.sh):
Expand Down
1 change: 1 addition & 0 deletions prompts/en/tools/transcribe_audio_description.md.j2
Original file line number Diff line number Diff line change
@@ -0,0 +1 @@
Transcribe an audio file to text using local speech-to-text. Provide the path to the audio file. Supports ogg, opus, mp3, flac, wav, and m4a formats. Use this instead of external whisper CLI tools.
201 changes: 20 additions & 181 deletions src/agent/channel.rs
Original file line number Diff line number Diff line change
Expand Up @@ -4,7 +4,7 @@ use crate::agent::branch::Branch;
use crate::agent::compactor::Compactor;
use crate::agent::status::StatusBlock;
use crate::agent::worker::Worker;
use crate::config::ApiType;

use crate::conversation::{ChannelStore, ConversationLogger, ProcessRunLogger};
use crate::error::{AgentError, Result};
use crate::hooks::SpacebotHook;
Expand Down Expand Up @@ -1956,193 +1956,32 @@ async fn transcribe_audio_attachment(
);

let routing = deps.runtime_config.routing.load();
let voice_model = routing.voice.trim();
if voice_model.is_empty() {
return UserContent::text(format!(
let voice_model = routing.voice.clone();

match crate::stt::transcribe_bytes(&voice_model, &bytes, &attachment.mime_type, &deps.llm_manager, http).await {
Ok(transcript) => UserContent::text(format!(
"<voice_transcript name=\"{}\" mime=\"{}\">\n{}\n</voice_transcript>",
attachment.filename, attachment.mime_type, transcript
)),
Err(crate::stt::SttError::NotConfigured) => UserContent::text(format!(
"[Audio attachment received but no voice model is configured in routing.voice: {}]",
attachment.filename
));
}

let (provider_id, model_name) = match deps.llm_manager.resolve_model(voice_model) {
Ok(parts) => parts,
Err(error) => {
tracing::warn!(%error, model = %voice_model, "invalid voice model route");
return UserContent::text(format!(
"[Audio transcription failed for {}: invalid voice model '{}']",
attachment.filename, voice_model
));
}
};

let provider = match deps.llm_manager.get_provider(&provider_id) {
Ok(provider) => provider,
Err(error) => {
tracing::warn!(%error, provider = %provider_id, "voice provider not configured");
return UserContent::text(format!(
"[Audio transcription failed for {}: provider '{}' is not configured]",
attachment.filename, provider_id
));
}
};

if provider.api_type == ApiType::Anthropic {
return UserContent::text(format!(
"[Audio transcription failed for {}: provider '{}' does not support input_audio on this endpoint]",
attachment.filename, provider_id
));
}

let format = audio_format_for_attachment(attachment);
use base64::Engine as _;
let base64_audio = base64::engine::general_purpose::STANDARD.encode(&bytes);

let endpoint = format!(
"{}/v1/chat/completions",
provider.base_url.trim_end_matches('/')
);
let body = serde_json::json!({
"model": model_name,
"messages": [{
"role": "user",
"content": [
{
"type": "text",
"text": "Transcribe this audio verbatim. Return only the transcription text."
},
{
"type": "input_audio",
"input_audio": {
"data": base64_audio,
"format": format,
}
}
]
}],
"temperature": 0
});

let response = match http
.post(&endpoint)
.header("authorization", format!("Bearer {}", provider.api_key))
.header("content-type", "application/json")
.json(&body)
.send()
.await
{
Ok(response) => response,
Err(error) => {
tracing::warn!(%error, model = %voice_model, "voice transcription request failed");
return UserContent::text(format!(
"[Audio transcription failed for {}]",
)),
Err(crate::stt::SttError::EmptyResult) => {
tracing::warn!(filename = %attachment.filename, "transcription returned empty text");
UserContent::text(format!(
"[Audio transcription returned empty text for {}]",
attachment.filename
));
))
}
};

let status = response.status();
let response_body = match response.json::<serde_json::Value>().await {
Ok(body) => body,
Err(error) => {
tracing::warn!(%error, model = %voice_model, "invalid transcription response");
return UserContent::text(format!(
"[Audio transcription failed for {}]",
attachment.filename
));
tracing::warn!(%error, filename = %attachment.filename, "audio transcription failed");
UserContent::text(format!(
"[Audio transcription failed for {}: {}]",
attachment.filename, error
))
}
};

if !status.is_success() {
let message = response_body["error"]["message"]
.as_str()
.unwrap_or("unknown error");
tracing::warn!(
status = %status,
model = %voice_model,
error = %message,
"voice transcription provider returned error"
);
return UserContent::text(format!(
"[Audio transcription failed for {}: {}]",
attachment.filename, message
));
}

let transcript = extract_transcript_text(&response_body);
if transcript.is_empty() {
tracing::warn!(model = %voice_model, "empty transcription returned");
return UserContent::text(format!(
"[Audio transcription returned empty text for {}]",
attachment.filename
));
}

UserContent::text(format!(
"<voice_transcript name=\"{}\" mime=\"{}\">\n{}\n</voice_transcript>",
attachment.filename, attachment.mime_type, transcript
))
}

/// Map an audio attachment to the short format label expected by the
/// provider's `input_audio` payload (e.g. "mp3", "wav", "ogg").
///
/// The MIME type is consulted first; if it yields no match, the filename
/// extension is used. Unrecognized inputs fall back to "ogg".
fn audio_format_for_attachment(attachment: &crate::Attachment) -> &'static str {
    let mime = attachment.mime_type.to_lowercase();

    // Ordered substring hints checked against the MIME type. Order matters:
    // it mirrors the precedence of the original checks.
    let mime_hints: &[(&str, &'static str)] = &[
        ("mpeg", "mp3"),
        ("mp3", "mp3"),
        ("wav", "wav"),
        ("flac", "flac"),
        ("aac", "aac"),
        ("ogg", "ogg"),
        ("mp4", "m4a"),
        ("m4a", "m4a"),
    ];
    if let Some(&(_, format)) = mime_hints.iter().find(|(hint, _)| mime.contains(hint)) {
        return format;
    }

    // Fall back to the filename extension (text after the last '.').
    let extension = attachment
        .filename
        .rsplit('.')
        .next()
        .unwrap_or_default()
        .to_lowercase();
    match extension.as_str() {
        "mp3" => "mp3",
        "wav" => "wav",
        "flac" => "flac",
        "aac" => "aac",
        "m4a" | "mp4" => "m4a",
        "oga" | "ogg" => "ogg",
        // Unknown extension: default to ogg — presumably chosen because
        // voice-note sources commonly omit a recognizable extension.
        _ => "ogg",
    }
}

/// Pull the transcript text out of a chat-completions response body.
///
/// Handles both response shapes seen from providers: a plain string
/// `choices[0].message.content`, and a multi-part array of
/// `{"type": "text", "text": ...}` segments (joined with newlines).
/// Returns an empty string when neither shape is present.
fn extract_transcript_text(body: &serde_json::Value) -> String {
    let content = &body["choices"][0]["message"]["content"];

    // Simple case: content is a single string.
    if let Some(text) = content.as_str() {
        return text.trim().to_string();
    }

    // Multi-part case: collect the non-empty "text" segments in order.
    match content.as_array() {
        None => String::new(),
        Some(parts) => {
            let mut segments: Vec<&str> = Vec::new();
            for part in parts {
                if part["type"].as_str() != Some("text") {
                    continue;
                }
                if let Some(text) = part["text"].as_str() {
                    let trimmed = text.trim();
                    if !trimmed.is_empty() {
                        segments.push(trimmed);
                    }
                }
            }
            segments.join("\n")
        }
    }
}

/// Download a text attachment and inline its content for the LLM.
Expand Down
10 changes: 8 additions & 2 deletions src/agent/worker.rs
Original file line number Diff line number Diff line change
@@ -1,5 +1,7 @@
//! Worker: Independent task execution process.

use std::sync::Arc;

use crate::agent::compactor::estimate_history_tokens;
use crate::config::BrowserConfig;
use crate::error::Result;
Expand Down Expand Up @@ -193,6 +195,9 @@ impl Worker {
let mcp_tools = self.deps.mcp_manager.get_tools().await;

// Create per-worker ToolServer with task tools
let routing = self.deps.runtime_config.routing.load();
let voice_model = routing.voice.clone();

let worker_tool_server = crate::tools::create_worker_tool_server(
self.deps.agent_id.clone(),
self.id,
Expand All @@ -204,9 +209,10 @@ impl Worker {
self.deps.runtime_config.workspace_dir.clone(),
self.deps.runtime_config.instance_dir.clone(),
mcp_tools,
voice_model,
Arc::clone(&self.deps.llm_manager),
self.deps.llm_manager.http_client().clone(),
);

let routing = self.deps.runtime_config.routing.load();
let model_name = routing.resolve(ProcessType::Worker, None).to_string();
let model = SpacebotModel::make(&self.deps.llm_manager, &model_name)
.with_context(&*self.deps.agent_id, "worker")
Expand Down
1 change: 1 addition & 0 deletions src/lib.rs
Original file line number Diff line number Diff line change
Expand Up @@ -20,6 +20,7 @@ pub mod prompts;
pub mod secrets;
pub mod settings;
pub mod skills;
pub mod stt;
#[cfg(feature = "metrics")]
pub mod telemetry;
pub mod tools;
Expand Down
3 changes: 3 additions & 0 deletions src/prompts/text.rs
Original file line number Diff line number Diff line change
Expand Up @@ -158,6 +158,9 @@ fn lookup(lang: &str, key: &str) -> &'static str {
("en", "tools/send_message_to_another_channel") => {
include_str!("../../prompts/en/tools/send_message_description.md.j2")
}
("en", "tools/transcribe_audio") => {
include_str!("../../prompts/en/tools/transcribe_audio_description.md.j2")
}

// Fallback: unknown language or key -> try English
(lang, key) if lang != "en" => {
Expand Down
Loading