diff --git a/Cargo.lock b/Cargo.lock index d123d5e6..7c98b754 100644 --- a/Cargo.lock +++ b/Cargo.lock @@ -49,6 +49,12 @@ version = "1.1.2" source = "registry+https://github.com/rust-lang/crates.io-index" checksum = "1505bd5d3d116872e7271a6d4e16d81d0c8570876c8de68093a09ac269d8aac0" +[[package]] +name = "autocfg" +version = "1.5.0" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "c08606f8c3cbf4ce6ec8e28fb0014a2c086708fe954eaa885384a6165172e7e8" + [[package]] name = "base64" version = "0.22.1" @@ -76,12 +82,24 @@ version = "3.20.2" source = "registry+https://github.com/rust-lang/crates.io-index" checksum = "5d20789868f4b01b2f2caec9f5c4e0213b41e3e5702a50157d699ae31ced2fcb" +[[package]] +name = "bytemuck" +version = "1.25.0" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "c8efb64bd706a16a1bdde310ae86b351e4d21550d98d056f22f8a7f7a2183fec" + [[package]] name = "byteorder" version = "1.5.0" source = "registry+https://github.com/rust-lang/crates.io-index" checksum = "1fd0f2584146f6f2ef48085050886acf353beff7305ebd1ae69500e27c67f64b" +[[package]] +name = "byteorder-lite" +version = "0.1.0" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "8f1fe948ff07f4bd06c30984e69f5b4899c516a3ef74f34df92a2df2ab535495" + [[package]] name = "bytes" version = "1.11.1" @@ -110,6 +128,12 @@ version = "0.2.1" source = "registry+https://github.com/rust-lang/crates.io-index" checksum = "613afe47fcd5fac7ccf1db93babcb082c5994d996f20b8b159f2ad1658eb5724" +[[package]] +name = "color_quant" +version = "1.1.0" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "3d7b894f5411737b7867f4827955924d7c254fc9f4d91a6aad6b097804b1018b" + [[package]] name = "cpufeatures" version = "0.2.17" @@ -205,6 +229,15 @@ dependencies = [ "windows-sys 0.61.2", ] +[[package]] +name = "fdeflate" +version = "0.3.7" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = 
"1e6853b52649d4ac5c0bd02320cddc5ba956bdb407c4b75a2c6b75bf51500f8c" +dependencies = [ + "simd-adler32", +] + [[package]] name = "find-msvc-tools" version = "0.1.9" @@ -362,6 +395,16 @@ dependencies = [ "wasip3", ] +[[package]] +name = "gif" +version = "0.14.2" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "ee8cfcc411d9adbbaba82fb72661cc1bcca13e8bba98b364e62b2dba8f960159" +dependencies = [ + "color_quant", + "weezl", +] + [[package]] name = "hashbrown" version = "0.14.5" @@ -597,6 +640,34 @@ dependencies = [ "icu_properties", ] +[[package]] +name = "image" +version = "0.25.10" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "85ab80394333c02fe689eaf900ab500fbd0c2213da414687ebf995a65d5a6104" +dependencies = [ + "bytemuck", + "byteorder-lite", + "color_quant", + "gif", + "image-webp", + "moxcms", + "num-traits", + "png", + "zune-core", + "zune-jpeg", +] + +[[package]] +name = "image-webp" +version = "0.2.4" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "525e9ff3e1a4be2fbea1fdf0e98686a6d98b4d8f937e1bf7402245af1909e8c3" +dependencies = [ + "byteorder-lite", + "quick-error", +] + [[package]] name = "indexmap" version = "2.14.0" @@ -740,6 +811,16 @@ dependencies = [ "windows-sys 0.61.2", ] +[[package]] +name = "moxcms" +version = "0.8.1" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "bb85c154ba489f01b25c0d36ae69a87e4a1c73a72631fc6c0eb6dde34a73e44b" +dependencies = [ + "num-traits", + "pxfm", +] + [[package]] name = "nu-ansi-term" version = "0.50.3" @@ -755,6 +836,15 @@ version = "0.2.1" source = "registry+https://github.com/rust-lang/crates.io-index" checksum = "c6673768db2d862beb9b39a78fdcb1a69439615d5794a1be50caa9bc92c81967" +[[package]] +name = "num-traits" +version = "0.2.19" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "071dfc062690e90b734c0b2273ce72ad0ffa95f0c74596bc250dcfd960262841" +dependencies = [ + 
"autocfg", +] + [[package]] name = "once_cell" version = "1.21.4" @@ -763,10 +853,11 @@ checksum = "9f7c3e4beb33f85d45ae3e3a1792185706c8e16d043238c593331cc7cd313b50" [[package]] name = "openab" -version = "0.6.4" +version = "0.6.6" dependencies = [ "anyhow", "base64", + "image", "rand 0.8.5", "regex", "reqwest", @@ -815,6 +906,19 @@ version = "0.2.17" source = "registry+https://github.com/rust-lang/crates.io-index" checksum = "a89322df9ebe1c1578d689c92318e070967d1042b512afbe49518723f4e6d5cd" +[[package]] +name = "png" +version = "0.18.1" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "60769b8b31b2a9f263dae2776c37b1b28ae246943cf719eb6946a1db05128a61" +dependencies = [ + "bitflags", + "crc32fast", + "fdeflate", + "flate2", + "miniz_oxide", +] + [[package]] name = "potential_utf" version = "0.1.5" @@ -858,6 +962,18 @@ dependencies = [ "unicode-ident", ] +[[package]] +name = "pxfm" +version = "0.1.28" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "b5a041e753da8b807c9255f28de81879c78c876392ff2469cde94799b2896b9d" + +[[package]] +name = "quick-error" +version = "2.0.1" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "a993555f31e5a609f617c12db6250dedcac1b0a85076912c436e6fc9b2c8e6a3" + [[package]] name = "quinn" version = "0.11.9" @@ -2026,6 +2142,12 @@ dependencies = [ "rustls-pki-types", ] +[[package]] +name = "weezl" +version = "0.1.12" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "a28ac98ddc8b9274cb41bb4d9d4d5c425b6020c50c46f25559911905610b4a88" + [[package]] name = "windows-link" version = "0.2.1" @@ -2399,3 +2521,18 @@ name = "zmij" version = "1.0.21" source = "registry+https://github.com/rust-lang/crates.io-index" checksum = "b8848ee67ecc8aedbaf3e4122217aff892639231befc6a1b58d29fff4c2cabaa" + +[[package]] +name = "zune-core" +version = "0.5.1" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = 
"cb8a0807f7c01457d0379ba880ba6322660448ddebc890ce29bb64da71fb40f9" + +[[package]] +name = "zune-jpeg" +version = "0.5.15" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "27bc9d5b815bc103f142aa054f561d9187d191692ec7c2d1e2b4737f8dbd7296" +dependencies = [ + "zune-core", +] diff --git a/Cargo.toml b/Cargo.toml index cf504eb0..7ea46dbe 100644 --- a/Cargo.toml +++ b/Cargo.toml @@ -15,6 +15,6 @@ uuid = { version = "1", features = ["v4"] } regex = "1" anyhow = "1" rand = "0.8" -reqwest = { version = "0.12", default-features = false, features = ["rustls-tls"] } +reqwest = { version = "0.12", default-features = false, features = ["rustls-tls", "multipart", "json"] } base64 = "0.22" image = { version = "0.25", default-features = false, features = ["jpeg", "png", "gif", "webp"] } diff --git a/docs/stt.md b/docs/stt.md new file mode 100644 index 00000000..1eea3978 --- /dev/null +++ b/docs/stt.md @@ -0,0 +1,144 @@ +# Speech-to-Text (STT) for Voice Messages + +openab can automatically transcribe Discord voice message attachments and forward the transcript to your ACP agent as text. + +## Quick Start + +Add an `[stt]` section to your `config.toml`: + +```toml +[stt] +enabled = true +``` + +If `GROQ_API_KEY` is set in your environment, that's all you need — openab will auto-detect it and use Groq's free tier. You can also set the key explicitly: + +```toml +[stt] +enabled = true +api_key = "${GROQ_API_KEY}" +``` + +## How It Works + +``` +Discord voice message (.ogg) + │ + ▼ + openab downloads the audio file + │ + ▼ + POST /audio/transcriptions → STT provider + │ + ▼ + transcript injected as: + "[Voice message transcript]: " + │ + ▼ + ACP agent receives plain text +``` + +The transcript is prepended to the prompt as a `ContentBlock::Text`, so the downstream agent (Kiro CLI, Claude Code, etc.) sees it as regular text input. 
+ +## Configuration Reference + +```toml +[stt] +enabled = true # default: false +api_key = "${GROQ_API_KEY}" # required for cloud providers +model = "whisper-large-v3-turbo" # default +base_url = "https://api.groq.com/openai/v1" # default +``` + +| Field | Required | Default | Description | +|---|---|---|---| +| `enabled` | no | `false` | Enable/disable STT. When disabled, audio attachments are silently skipped. | +| `api_key` | no* | — | API key for the STT provider. *Auto-detected from `GROQ_API_KEY` env var if not set. For local servers, use any non-empty string (e.g. `"not-needed"`). | +| `model` | no | `whisper-large-v3-turbo` | Whisper model name. Varies by provider. | +| `base_url` | no | `https://api.groq.com/openai/v1` | OpenAI-compatible API base URL. | + +## Deployment Options + +openab uses the standard OpenAI-compatible `/audio/transcriptions` endpoint. Any provider that implements this API works — just change `base_url`. + +### Option 1: Groq Cloud (recommended, free tier) + +```toml +[stt] +enabled = true +api_key = "${GROQ_API_KEY}" +``` + +- Free tier with rate limits +- Model: `whisper-large-v3-turbo` (default) +- Sign up at https://console.groq.com + +### Option 2: OpenAI + +```toml +[stt] +enabled = true +api_key = "${OPENAI_API_KEY}" +model = "whisper-1" +base_url = "https://api.openai.com/v1" +``` + +- ~$0.006 per minute of audio +- Model: `whisper-1` + +### Option 3: Local Whisper Server + +For users running openab on a Mac Mini, home lab, or any machine with a local whisper server: + +```toml +[stt] +enabled = true +api_key = "not-needed" +model = "large-v3-turbo" +base_url = "http://localhost:8080/v1" +``` + +- Audio stays local — never leaves your machine +- No API key or cloud account needed +- Apple Silicon users get hardware acceleration + +Compatible local whisper servers: + +| Server | Install | Apple Silicon | +|---|---|---| +| [faster-whisper-server](https://github.com/fedirz/faster-whisper-server) | `pip install 
faster-whisper-server` | ✅ CoreML | +| [whisper.cpp server](https://github.com/ggerganov/whisper.cpp) | `brew install whisper-cpp` | ✅ Metal | +| [LocalAI](https://github.com/mudler/LocalAI) | Docker or binary | ✅ | + +### Option 4: LAN / Sidecar Server + +Point to a whisper server running on another machine in your network: + +```toml +[stt] +enabled = true +api_key = "not-needed" +base_url = "http://192.168.1.100:8080/v1" +``` + +### Not Supported + +- **Ollama** — does not expose an `/audio/transcriptions` endpoint. + +## Disabling STT + +Omit the `[stt]` section entirely, or set: + +```toml +[stt] +enabled = false +``` + +When disabled, audio attachments are silently skipped with no impact on existing functionality. + +## Technical Notes + +- openab sends `response_format=json` in the transcription request to ensure the response is always parseable JSON. Some local whisper servers default to plain text output without this parameter. +- The actual MIME type from the Discord attachment is passed through to the STT API (e.g. `audio/ogg`, `audio/mp4`, `audio/wav`). +- Environment variables in config values are expanded via `${VAR}` syntax (e.g. `api_key = "${GROQ_API_KEY}"`). +- The `api_key` field is auto-detected from the `GROQ_API_KEY` environment variable when using the default Groq endpoint. If you set a custom `base_url` (e.g. local server), auto-detect is disabled to avoid leaking the Groq key to unrelated endpoints — you must set `api_key` explicitly. 
diff --git a/src/config.rs b/src/config.rs
index 6d341e27..c4ed3d30 100644
--- a/src/config.rs
+++ b/src/config.rs
@@ -11,8 +11,60 @@ pub struct Config {
     pub pool: PoolConfig,
     #[serde(default)]
     pub reactions: ReactionsConfig,
+    #[serde(default)]
+    pub stt: SttConfig,
+}
+
+/// Speech-to-text settings read from the `[stt]` table of the config file.
+///
+/// Every field carries a serde default, so a partial or empty `[stt]` table is valid.
+/// `Debug` is implemented by hand so that the API key is never printed.
+#[derive(Clone, Deserialize)]
+pub struct SttConfig {
+    /// Whether audio attachments are transcribed.
+    #[serde(default)]
+    pub enabled: bool,
+    /// Bearer token for the STT provider; empty when not configured.
+    #[serde(default)]
+    pub api_key: String,
+    /// Model name sent to the provider.
+    #[serde(default = "default_stt_model")]
+    pub model: String,
+    /// Base URL of the OpenAI-compatible API.
+    #[serde(default = "default_stt_base_url")]
+    pub base_url: String,
 }
 
+impl std::fmt::Debug for SttConfig {
+    fn fmt(&self, f: &mut std::fmt::Formatter<'_>) -> std::fmt::Result {
+        // Report only whether a key is present; the secret itself stays out of logs.
+        let api_key = if self.api_key.is_empty() { "<unset>" } else { "<redacted>" };
+        f.debug_struct("SttConfig")
+            .field("enabled", &self.enabled)
+            .field("api_key", &api_key)
+            .field("model", &self.model)
+            .field("base_url", &self.base_url)
+            .finish()
+    }
+}
+
+impl Default for SttConfig {
+    fn default() -> Self {
+        Self {
+            enabled: false,
+            api_key: String::new(),
+            model: default_stt_model(),
+            base_url: default_stt_base_url(),
+        }
+    }
+}
+
+/// Default transcription model.
+fn default_stt_model() -> String { "whisper-large-v3-turbo".into() }
+/// Default API base URL (Groq's OpenAI-compatible endpoint).
+/// Crate-visible so startup code can recognise the default endpoint.
+pub(crate) fn default_stt_base_url() -> String { "https://api.groq.com/openai/v1".into() }
+
 #[derive(Debug, Deserialize)]
 pub struct DiscordConfig {
     pub bot_token: String,
diff --git a/src/discord.rs b/src/discord.rs
index d3a3f820..e267064e 100644
--- a/src/discord.rs
+++ b/src/discord.rs
@@ -1,5 +1,5 @@
 use crate::acp::{classify_notification, AcpEvent, ContentBlock, SessionPool};
-use crate::config::ReactionsConfig;
+use crate::config::{ReactionsConfig, SttConfig};
 use crate::error_display::{format_coded_error, format_user_error};
 use crate::format;
 use crate::reactions::StatusReactionController;
@@ -32,6 +32,7 @@ pub struct Handler {
     pub allowed_channels: HashSet<u64>,
     pub allowed_users: HashSet<u64>,
     pub reactions_config: ReactionsConfig,
+    pub stt_config: SttConfig,
 }
 
 #[async_trait]
@@ -126,18 +127,23 @@ impl EventHandler for Handler {
             text: prompt_with_sender.clone(),
         });
 
-        // Add image attachments
+        // Process attachments: route by content type (audio → STT, image → encode)
         if !msg.attachments.is_empty() {
             for attachment in &msg.attachments {
-                if let Some(content_block) = download_and_encode_image(attachment).await {
+                if is_audio_attachment(attachment) {
+                    if self.stt_config.enabled {
+                        if let Some(transcript) = download_and_transcribe(attachment, &self.stt_config).await {
+                            debug!(filename = %attachment.filename, chars = transcript.len(), "voice transcript injected");
+                            content_blocks.insert(0, ContentBlock::Text {
+                                text: format!("[Voice message transcript]: {transcript}"),
+                            });
+                        }
+                    } else {
+                        debug!(filename = %attachment.filename, "skipping audio attachment (STT disabled)");
+                    }
+                } else if let Some(content_block) = download_and_encode_image(attachment).await {
                     debug!(url = %attachment.url, filename = %attachment.filename, "adding image attachment");
                     content_blocks.push(content_block);
-                } else {
-                    error!(
-                        url = %attachment.url,
-                        filename = %attachment.filename,
-                        "failed to download image attachment"
-                    );
                 }
             }
         }
@@ -235,6 +241,61 @@ impl EventHandler for Handler {
     }
 }
 
+/// Check if an attachment is an audio file (voice messages are typically audio/ogg).
+///
+/// Only the declared `content_type` is inspected; an attachment without one is treated
+/// as non-audio. The prefix match ignores ASCII case because MIME type names are
+/// case-insensitive.
+fn is_audio_attachment(attachment: &serenity::model::channel::Attachment) -> bool {
+    const AUDIO_PREFIX: &str = "audio/";
+    attachment
+        .content_type
+        .as_deref()
+        .and_then(|mime| mime.get(..AUDIO_PREFIX.len()))
+        .map_or(false, |head| head.eq_ignore_ascii_case(AUDIO_PREFIX))
+}
+
+/// Download an audio attachment and transcribe it via the configured STT provider.
+///
+/// Returns `None` when the attachment exceeds 25 MB, the download fails, or
+/// `crate::stt::transcribe` returns `None`. Download failures are logged here.
+async fn download_and_transcribe(
+    attachment: &serenity::model::channel::Attachment,
+    stt_config: &SttConfig,
+) -> Option<String> {
+    const MAX_SIZE: u64 = 25 * 1024 * 1024; // 25 MB (Whisper API limit)
+
+    if u64::from(attachment.size) > MAX_SIZE {
+        error!(filename = %attachment.filename, size = attachment.size, "audio exceeds 25MB limit");
+        return None;
+    }
+
+    let resp = match HTTP_CLIENT.get(&attachment.url).send().await {
+        Ok(response) => response,
+        Err(e) => {
+            error!(url = %attachment.url, error = %e, "audio download request failed");
+            return None;
+        }
+    };
+    if !resp.status().is_success() {
+        error!(url = %attachment.url, status = %resp.status(), "audio download failed");
+        return None;
+    }
+    let bytes = match resp.bytes().await {
+        Ok(body) => body.to_vec(),
+        Err(e) => {
+            error!(url = %attachment.url, error = %e, "failed to read audio response body");
+            return None;
+        }
+    };
+
+    // Drop any parameters (e.g. `; codecs=opus`) so only the bare MIME type is forwarded.
+    let mime_type = attachment.content_type.as_deref().unwrap_or("audio/ogg");
+    let mime_type = mime_type.split(';').next().unwrap_or(mime_type).trim();
+
+    crate::stt::transcribe(&HTTP_CLIENT, stt_config, bytes, attachment.filename.clone(), mime_type).await
+}
+
 /// Maximum dimension (width or height) for resized images.
 /// Matches OpenClaw's DEFAULT_IMAGE_MAX_DIMENSION_PX.
 const IMAGE_MAX_DIMENSION_PX: u32 = 1200;
diff --git a/src/main.rs b/src/main.rs
index 39817342..225bf236 100644
--- a/src/main.rs
+++ b/src/main.rs
@@ -4,6 +4,7 @@ mod discord;
 mod error_display;
 mod format;
 mod reactions;
+mod stt;
 
 use serenity::prelude::*;
 use std::collections::HashSet;
@@ -25,7 +26,7 @@ async fn main() -> anyhow::Result<()> {
         .map(PathBuf::from)
         .unwrap_or_else(|| PathBuf::from("config.toml"));
 
-    let cfg = config::load_config(&config_path)?;
+    let mut cfg = config::load_config(&config_path)?;
     info!(
         agent_cmd = %cfg.agent.command,
         pool_max = cfg.pool.max_sessions,
@@ -42,11 +43,32 @@ async fn main() -> anyhow::Result<()> {
     let allowed_users = parse_id_set(&cfg.discord.allowed_users, "allowed_users")?;
     info!(channels = allowed_channels.len(), users = allowed_users.len(), "parsed allowlists");
 
+    // Resolve STT config before constructing handler (auto-detect mutates cfg.stt)
+    if cfg.stt.enabled {
+        // Only auto-detect for the built-in Groq endpoint. A substring test such as
+        // `contains("groq.com")` would also match unrelated hosts and leak the key.
+        let is_default_endpoint =
+            cfg.stt.base_url.trim_end_matches('/') == config::default_stt_base_url();
+        if cfg.stt.api_key.is_empty() && is_default_endpoint {
+            if let Ok(key) = std::env::var("GROQ_API_KEY") {
+                if !key.is_empty() {
+                    info!("stt.api_key not set, using GROQ_API_KEY from environment");
+                    cfg.stt.api_key = key;
+                }
+            }
+        }
+        if cfg.stt.api_key.is_empty() {
+            anyhow::bail!("stt.enabled = true but no API key found — set stt.api_key in config or export GROQ_API_KEY");
+        }
+        info!(model = %cfg.stt.model, base_url = %cfg.stt.base_url, "STT enabled");
+    }
+
     let handler = discord::Handler {
         pool: pool.clone(),
         allowed_channels,
         allowed_users,
         reactions_config: cfg.reactions,
+        stt_config: cfg.stt.clone(),
     };
 
     let intents = GatewayIntents::GUILD_MESSAGES
diff --git a/src/stt.rs b/src/stt.rs
new file mode 100644
index 00000000..122db9b6
--- /dev/null
+++ b/src/stt.rs
@@ -0,0 +1,82 @@
+use crate::config::SttConfig;
+use reqwest::multipart;
+use tracing::{debug, error};
+
+/// Transcribe audio bytes via an OpenAI-compatible `/audio/transcriptions` endpoint.
+///
+/// Sends a multipart form (`file`, `model`, `response_format=json`) with `cfg.api_key`
+/// as a bearer token and returns the trimmed `text` field of the JSON reply.
+///
+/// Each of these outcomes is logged and yields `None`: an unparseable MIME type, a
+/// transport error, a non-success status, a body that is not JSON, a missing or
+/// non-string `text` field, or an empty transcript.
+pub async fn transcribe(
+    client: &reqwest::Client,
+    cfg: &SttConfig,
+    audio_bytes: Vec<u8>,
+    filename: String,
+    mime_type: &str,
+) -> Option<String> {
+    // Tolerate a trailing slash in the configured base URL.
+    let url = format!("{}/audio/transcriptions", cfg.base_url.trim_end_matches('/'));
+
+    let file_part = match multipart::Part::bytes(audio_bytes)
+        .file_name(filename)
+        .mime_str(mime_type)
+    {
+        Ok(part) => part,
+        Err(e) => {
+            error!(error = %e, mime_type = %mime_type, "invalid MIME type for STT upload");
+            return None;
+        }
+    };
+
+    let form = multipart::Form::new()
+        .part("file", file_part)
+        .text("model", cfg.model.clone())
+        .text("response_format", "json");
+
+    let resp = match client
+        .post(&url)
+        .bearer_auth(&cfg.api_key)
+        .multipart(form)
+        .send()
+        .await
+    {
+        Ok(r) => r,
+        Err(e) => {
+            error!(error = %e, "STT request failed");
+            return None;
+        }
+    };
+
+    if !resp.status().is_success() {
+        let status = resp.status();
+        let body = resp.text().await.unwrap_or_default();
+        error!(status = %status, body = %body, "STT API error");
+        return None;
+    }
+
+    let json: serde_json::Value = match resp.json().await {
+        Ok(v) => v,
+        Err(e) => {
+            error!(error = %e, "STT response parse failed");
+            return None;
+        }
+    };
+
+    let text = match json.get("text").and_then(serde_json::Value::as_str) {
+        Some(raw) => raw.trim().to_string(),
+        None => {
+            error!("STT response has no string `text` field");
+            return None;
+        }
+    };
+    if text.is_empty() {
+        debug!("STT returned an empty transcript");
+        return None;
+    }
+
+    debug!(chars = text.len(), "STT transcription complete");
+    Some(text)
+}