Skip to content
Open
Show file tree
Hide file tree
Changes from all commits
Commits
File filter

Filter by extension

Filter by extension


Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
1,115 changes: 496 additions & 619 deletions Cargo.lock

Large diffs are not rendered by default.

6 changes: 6 additions & 0 deletions Cargo.toml
Original file line number Diff line number Diff line change
Expand Up @@ -137,11 +137,17 @@ tempfile = "3"

# Prometheus metrics (optional, behind "metrics" feature)
prometheus = { version = "0.13", optional = true }
whisper-rs = { version = "0.15", optional = true, features = ["vulkan"] }
hf-hub = { version = "0.5", optional = true }
symphonia = { version = "0.5", features = ["mp3", "aac", "flac", "ogg", "wav", "isomp4"], optional = true }
ogg = { version = "0.9", optional = true }
opus = { version = "0.3", optional = true }
pdf-extract = "0.10.0"
open = "5.3.3"
urlencoding = "2.1.3"

[features]
stt-whisper = ["dep:whisper-rs", "dep:hf-hub", "dep:symphonia", "dep:ogg", "dep:opus"]
metrics = ["dep:prometheus"]

[lints.clippy]
Expand Down
24 changes: 24 additions & 0 deletions README.md
Original file line number Diff line number Diff line change
Expand Up @@ -193,6 +193,30 @@ channel = "my-provider/my-model"

Additional built-in providers include **NVIDIA**, **MiniMax**, **Moonshot AI (Kimi)**, and **Z.AI Coding Plan** — configure with `nvidia_key`, `minimax_key`, `moonshot_key`, or `zai_coding_plan_key` in `[llm]`.

### Voice Transcription

Audio attachments (voice messages, audio files) are transcribed before being passed to the channel. Set `routing.voice` to choose the backend:

**Provider-based** — route through any configured LLM provider that supports audio input:

```toml
[defaults.routing]
voice = "openai/whisper-1"
```

**Local Whisper** (requires building with `--features stt-whisper`) — run inference locally via [whisper-rs](https://codeberg.org/tazz4843/whisper-rs); no API call needed:

```toml
[defaults.routing]
voice = "whisper-local://small"
```

The model is downloaded automatically from [`ggerganov/whisper.cpp`](https://huggingface.co/ggerganov/whisper.cpp) on first use and cached in `~/.cache/huggingface/hub`. Supported size names: `tiny`, `tiny.en`, `base`, `base.en`, `small`, `small.en`, `medium`, `medium.en`, `large`, `large-v1`, `large-v2`, `large-v3`. An absolute path to a GGML model file also works.

GPU acceleration via Vulkan is enabled automatically when a compatible device is detected. The loaded model is cached for the process lifetime — restart to switch models.

Ogg/Opus audio (the format Telegram uses for voice messages) is decoded with a built-in decoder; all other supported formats are decoded via symphonia.

### Skills

Extensible skill system integrated with [skills.sh](https://skills.sh):
Expand Down
1 change: 1 addition & 0 deletions prompts/en/tools/transcribe_audio_description.md.j2
Original file line number Diff line number Diff line change
@@ -0,0 +1 @@
Transcribe an audio file to text using local speech-to-text. Provide the path to the audio file. Supports ogg, opus, mp3, flac, wav, and m4a formats. Use this instead of external whisper CLI tools.
201 changes: 20 additions & 181 deletions src/agent/channel.rs
Original file line number Diff line number Diff line change
Expand Up @@ -4,7 +4,7 @@ use crate::agent::branch::Branch;
use crate::agent::compactor::Compactor;
use crate::agent::status::StatusBlock;
use crate::agent::worker::Worker;
use crate::config::ApiType;

use crate::conversation::{ChannelStore, ConversationLogger, ProcessRunLogger};
use crate::error::{AgentError, Result};
use crate::hooks::SpacebotHook;
Expand Down Expand Up @@ -1956,193 +1956,32 @@ async fn transcribe_audio_attachment(
);

let routing = deps.runtime_config.routing.load();
let voice_model = routing.voice.trim();
if voice_model.is_empty() {
return UserContent::text(format!(
let voice_model = routing.voice.clone();

match crate::stt::transcribe_bytes(&voice_model, &bytes, &attachment.mime_type, &deps.llm_manager, http).await {
Ok(transcript) => UserContent::text(format!(
"<voice_transcript name=\"{}\" mime=\"{}\">\n{}\n</voice_transcript>",
attachment.filename, attachment.mime_type, transcript
)),
Err(crate::stt::SttError::NotConfigured) => UserContent::text(format!(
"[Audio attachment received but no voice model is configured in routing.voice: {}]",
attachment.filename
));
}

let (provider_id, model_name) = match deps.llm_manager.resolve_model(voice_model) {
Ok(parts) => parts,
Err(error) => {
tracing::warn!(%error, model = %voice_model, "invalid voice model route");
return UserContent::text(format!(
"[Audio transcription failed for {}: invalid voice model '{}']",
attachment.filename, voice_model
));
}
};

let provider = match deps.llm_manager.get_provider(&provider_id) {
Ok(provider) => provider,
Err(error) => {
tracing::warn!(%error, provider = %provider_id, "voice provider not configured");
return UserContent::text(format!(
"[Audio transcription failed for {}: provider '{}' is not configured]",
attachment.filename, provider_id
));
}
};

if provider.api_type == ApiType::Anthropic {
return UserContent::text(format!(
"[Audio transcription failed for {}: provider '{}' does not support input_audio on this endpoint]",
attachment.filename, provider_id
));
}

let format = audio_format_for_attachment(attachment);
use base64::Engine as _;
let base64_audio = base64::engine::general_purpose::STANDARD.encode(&bytes);

let endpoint = format!(
"{}/v1/chat/completions",
provider.base_url.trim_end_matches('/')
);
let body = serde_json::json!({
"model": model_name,
"messages": [{
"role": "user",
"content": [
{
"type": "text",
"text": "Transcribe this audio verbatim. Return only the transcription text."
},
{
"type": "input_audio",
"input_audio": {
"data": base64_audio,
"format": format,
}
}
]
}],
"temperature": 0
});

let response = match http
.post(&endpoint)
.header("authorization", format!("Bearer {}", provider.api_key))
.header("content-type", "application/json")
.json(&body)
.send()
.await
{
Ok(response) => response,
Err(error) => {
tracing::warn!(%error, model = %voice_model, "voice transcription request failed");
return UserContent::text(format!(
"[Audio transcription failed for {}]",
)),
Err(crate::stt::SttError::EmptyResult) => {
tracing::warn!(filename = %attachment.filename, "transcription returned empty text");
UserContent::text(format!(
"[Audio transcription returned empty text for {}]",
attachment.filename
));
))
}
};

let status = response.status();
let response_body = match response.json::<serde_json::Value>().await {
Ok(body) => body,
Err(error) => {
tracing::warn!(%error, model = %voice_model, "invalid transcription response");
return UserContent::text(format!(
"[Audio transcription failed for {}]",
attachment.filename
));
tracing::warn!(%error, filename = %attachment.filename, "audio transcription failed");
UserContent::text(format!(
"[Audio transcription failed for {}: {}]",
attachment.filename, error
))
}
};

if !status.is_success() {
let message = response_body["error"]["message"]
.as_str()
.unwrap_or("unknown error");
tracing::warn!(
status = %status,
model = %voice_model,
error = %message,
"voice transcription provider returned error"
);
return UserContent::text(format!(
"[Audio transcription failed for {}: {}]",
attachment.filename, message
));
}

let transcript = extract_transcript_text(&response_body);
if transcript.is_empty() {
tracing::warn!(model = %voice_model, "empty transcription returned");
return UserContent::text(format!(
"[Audio transcription returned empty text for {}]",
attachment.filename
));
}

UserContent::text(format!(
"<voice_transcript name=\"{}\" mime=\"{}\">\n{}\n</voice_transcript>",
attachment.filename, attachment.mime_type, transcript
))
}

/// Map an audio attachment to the short format label expected by the
/// provider's `input_audio` payload (e.g. "mp3", "wav", "ogg").
///
/// The MIME type is consulted first; if it yields no match, the filename
/// extension is used. Unrecognized inputs fall back to "ogg".
fn audio_format_for_attachment(attachment: &crate::Attachment) -> &'static str {
    let mime = attachment.mime_type.to_lowercase();

    // Ordered substring hints checked against the MIME type. Order matters:
    // it mirrors the precedence of the original checks.
    let mime_hints: &[(&str, &'static str)] = &[
        ("mpeg", "mp3"),
        ("mp3", "mp3"),
        ("wav", "wav"),
        ("flac", "flac"),
        ("aac", "aac"),
        ("ogg", "ogg"),
        ("mp4", "m4a"),
        ("m4a", "m4a"),
    ];
    if let Some(&(_, format)) = mime_hints.iter().find(|(hint, _)| mime.contains(hint)) {
        return format;
    }

    // Fall back to the filename extension (text after the last '.').
    let extension = attachment
        .filename
        .rsplit('.')
        .next()
        .unwrap_or_default()
        .to_lowercase();
    match extension.as_str() {
        "mp3" => "mp3",
        "wav" => "wav",
        "flac" => "flac",
        "aac" => "aac",
        "m4a" | "mp4" => "m4a",
        "oga" | "ogg" => "ogg",
        // Unknown extension: default to ogg — presumably chosen because
        // voice-note sources commonly omit a recognizable extension.
        _ => "ogg",
    }
}

/// Pull the transcript text out of a chat-completions response body.
///
/// Handles both response shapes seen from providers: a plain string
/// `choices[0].message.content`, and a multi-part array of
/// `{"type": "text", "text": ...}` segments (joined with newlines).
/// Returns an empty string when neither shape is present.
fn extract_transcript_text(body: &serde_json::Value) -> String {
    let content = &body["choices"][0]["message"]["content"];

    // Simple case: content is a single string.
    if let Some(text) = content.as_str() {
        return text.trim().to_string();
    }

    // Multi-part case: collect the non-empty "text" segments in order.
    match content.as_array() {
        None => String::new(),
        Some(parts) => {
            let mut segments: Vec<&str> = Vec::new();
            for part in parts {
                if part["type"].as_str() != Some("text") {
                    continue;
                }
                if let Some(text) = part["text"].as_str() {
                    let trimmed = text.trim();
                    if !trimmed.is_empty() {
                        segments.push(trimmed);
                    }
                }
            }
            segments.join("\n")
        }
    }
}

/// Download a text attachment and inline its content for the LLM.
Expand Down
10 changes: 8 additions & 2 deletions src/agent/worker.rs
Original file line number Diff line number Diff line change
@@ -1,5 +1,7 @@
//! Worker: Independent task execution process.

use std::sync::Arc;

use crate::agent::compactor::estimate_history_tokens;
use crate::config::BrowserConfig;
use crate::error::Result;
Expand Down Expand Up @@ -193,6 +195,9 @@ impl Worker {
let mcp_tools = self.deps.mcp_manager.get_tools().await;

// Create per-worker ToolServer with task tools
let routing = self.deps.runtime_config.routing.load();
let voice_model = routing.voice.clone();

let worker_tool_server = crate::tools::create_worker_tool_server(
self.deps.agent_id.clone(),
self.id,
Expand All @@ -204,9 +209,10 @@ impl Worker {
self.deps.runtime_config.workspace_dir.clone(),
self.deps.runtime_config.instance_dir.clone(),
mcp_tools,
voice_model,
Arc::clone(&self.deps.llm_manager),
self.deps.llm_manager.http_client().clone(),
);

let routing = self.deps.runtime_config.routing.load();
let model_name = routing.resolve(ProcessType::Worker, None).to_string();
let model = SpacebotModel::make(&self.deps.llm_manager, &model_name)
.with_context(&*self.deps.agent_id, "worker")
Expand Down
1 change: 1 addition & 0 deletions src/lib.rs
Original file line number Diff line number Diff line change
Expand Up @@ -20,6 +20,7 @@ pub mod prompts;
pub mod secrets;
pub mod settings;
pub mod skills;
pub mod stt;
#[cfg(feature = "metrics")]
pub mod telemetry;
pub mod tools;
Expand Down
3 changes: 3 additions & 0 deletions src/prompts/text.rs
Original file line number Diff line number Diff line change
Expand Up @@ -158,6 +158,9 @@ fn lookup(lang: &str, key: &str) -> &'static str {
("en", "tools/send_message_to_another_channel") => {
include_str!("../../prompts/en/tools/send_message_description.md.j2")
}
("en", "tools/transcribe_audio") => {
include_str!("../../prompts/en/tools/transcribe_audio_description.md.j2")
}

// Fallback: unknown language or key -> try English
(lang, key) if lang != "en" => {
Expand Down
Loading