From e07e6aa3ca0694b1d3192f7c26265248f334f02d Mon Sep 17 00:00:00 2001 From: Marenz Date: Sat, 21 Feb 2026 11:54:15 +0100 Subject: [PATCH 1/4] Add local Whisper STT backend via whisper-rs MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit When routing.voice = "whisper-local://MODEL", audio attachments are transcribed locally instead of via the LLM provider HTTP path. MODEL is either: - A known size name (tiny/base/small/medium/large) — fetched from ggerganov/whisper.cpp on HuggingFace via hf-hub, using the existing HF cache if already present - An absolute path to a GGML model file The WhisperContext is loaded once and cached in a OnceLock for the process lifetime. Audio decoding (ogg, opus, mp3, flac, wav, m4a) is handled by symphonia with linear resampling to 16 kHz mono f32. All three deps (whisper-rs, hf-hub, symphonia) are optional behind the stt-whisper feature flag. --- Cargo.lock | 1083 ++++++++++++++++++------------------------ Cargo.toml | 4 + src/agent/channel.rs | 26 + src/lib.rs | 2 + src/stt.rs | 275 +++++++++++ 5 files changed, 771 insertions(+), 619 deletions(-) create mode 100644 src/stt.rs diff --git a/Cargo.lock b/Cargo.lock index ffcc64c5e..a41956207 100644 --- a/Cargo.lock +++ b/Cargo.lock @@ -640,7 +640,7 @@ dependencies = [ "log", "num-rational", "num-traits", - "pastey 0.1.1", + "pastey", "rayon", "thiserror 2.0.18", "v_frame", @@ -702,10 +702,10 @@ dependencies = [ "bytes", "form_urlencoded", "futures-util", - "http 1.4.0", - "http-body 1.0.1", + "http", + "http-body", "http-body-util", - "hyper 1.8.1", + "hyper", "hyper-util", "itoa", "matchit", @@ -718,7 +718,7 @@ dependencies = [ "serde_json", "serde_path_to_error", "serde_urlencoded", - "sync_wrapper 1.0.2", + "sync_wrapper", "tokio", "tower", "tower-layer", @@ -734,12 +734,12 @@ checksum = "08c78f31d7b1291f7ee735c1c6780ccde7785daae9a9206026862dab7d8792d1" dependencies = [ "bytes", "futures-core", - "http 1.4.0", - "http-body 1.0.1", + "http", + "http-body", 
"http-body-util", "mime", "pin-project-lite", - "sync_wrapper 1.0.2", + "sync_wrapper", "tower-layer", "tower-service", "tracing", @@ -751,12 +751,6 @@ version = "0.13.1" source = "registry+https://github.com/rust-lang/crates.io-index" checksum = "9e1b586273c5702936fe7b7d6896644d8be71e6314cfe09d3167c95f712589e8" -[[package]] -name = "base64" -version = "0.21.7" -source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "9d297deb1925b89f2ccc13d7635fa0714f12c87adce1c75356b39ca9b7178567" - [[package]] name = "base64" version = "0.22.1" @@ -782,6 +776,26 @@ dependencies = [ "num-traits", ] +[[package]] +name = "bindgen" +version = "0.71.1" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "5f58bf3d7db68cfbac37cfc485a8d711e87e064c3d0fe0435b92f7a407f9d6b3" +dependencies = [ + "bitflags 2.10.0", + "cexpr", + "clang-sys", + "itertools 0.13.0", + "log", + "prettyplease", + "proc-macro2", + "quote", + "regex", + "rustc-hash", + "shlex", + "syn 2.0.114", +] + [[package]] name = "bit_field" version = "0.10.3" @@ -895,9 +909,9 @@ dependencies = [ "futures-core", "futures-util", "hex", - "http 1.4.0", + "http", "http-body-util", - "hyper 1.8.1", + "hyper", "hyper-named-pipe", "hyper-util", "hyperlocal", @@ -1112,10 +1126,13 @@ source = "registry+https://github.com/rust-lang/crates.io-index" checksum = "4f4c707c6a209cbe82d10abd08e1ea8995e9ea937d2550646e02798948992be0" [[package]] -name = "cesu8" -version = "1.1.0" +name = "cexpr" +version = "0.6.0" source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "6d43a04d8753f35258c91f8ec639f792891f748a1edbd759cf1dcea3382ad83c" +checksum = "6fac387a98bb7c37292057cffc56d62ecb629900026402633ae9160df93a8766" +dependencies = [ + "nom 7.1.3", +] [[package]] name = "cff-parser" @@ -1151,7 +1168,7 @@ dependencies = [ "futures", "futures-timer", "pin-project-lite", - "reqwest 0.12.28", + "reqwest", "serde", "serde_json", "thiserror 1.0.69", @@ -1235,6 +1252,17 @@ dependencies = [ 
"inout", ] +[[package]] +name = "clang-sys" +version = "1.8.1" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "0b023947811758c97c59bf9d1c188fd619ad4718dcaa767947df1cadb14f39f4" +dependencies = [ + "glob", + "libc", + "libloading", +] + [[package]] name = "clap" version = "4.5.58" @@ -1305,16 +1333,6 @@ dependencies = [ "windows-sys 0.61.2", ] -[[package]] -name = "combine" -version = "4.6.7" -source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "ba5a308b75df32fe02788e748662718f03fde005016435c444eea572398219fd" -dependencies = [ - "bytes", - "memchr", -] - [[package]] name = "comfy-table" version = "7.2.2" @@ -1399,6 +1417,19 @@ dependencies = [ "windows-sys 0.59.0", ] +[[package]] +name = "console" +version = "0.16.2" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "03e45a4a8926227e4197636ba97a9fc9b00477e9f4bd711395687c5f0734bec4" +dependencies = [ + "encode_unicode", + "libc", + "once_cell", + "unicode-width", + "windows-sys 0.61.2", +] + [[package]] name = "const-oid" version = "0.9.6" @@ -1455,6 +1486,35 @@ dependencies = [ "unicode-segmentation", ] +[[package]] +name = "cookie" +version = "0.18.1" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "4ddef33a339a91ea89fb53151bd0a4689cfce27055c291dfa69945475d22c747" +dependencies = [ + "percent-encoding", + "time", + "version_check", +] + +[[package]] +name = "cookie_store" +version = "0.22.1" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "15b2c103cf610ec6cae3da84a766285b42fd16aad564758459e6ecf128c75206" +dependencies = [ + "cookie", + "document-features", + "idna", + "indexmap 2.13.0", + "log", + "serde", + "serde_derive", + "serde_json", + "time", + "url", +] + [[package]] name = "core-foundation" version = "0.9.4" @@ -1620,7 +1680,7 @@ source = "registry+https://github.com/rust-lang/crates.io-index" checksum = 
"73736a89c4aff73035ba2ed2e565061954da00d4970fc9ac25dcc85a2a20d790" dependencies = [ "dispatch2", - "nix 0.30.1", + "nix", "windows-sys 0.61.2", ] @@ -1653,16 +1713,6 @@ dependencies = [ "darling_macro 0.21.3", ] -[[package]] -name = "darling" -version = "0.23.0" -source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "25ae13da2f202d56bd7f91c25fba009e7717a1e4a1cc98a76d844b65ae912e9d" -dependencies = [ - "darling_core 0.23.0", - "darling_macro 0.23.0", -] - [[package]] name = "darling_core" version = "0.20.11" @@ -1691,19 +1741,6 @@ dependencies = [ "syn 2.0.114", ] -[[package]] -name = "darling_core" -version = "0.23.0" -source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "9865a50f7c335f53564bb694ef660825eb8610e0a53d3e11bf1b0d3df31e03b0" -dependencies = [ - "ident_case", - "proc-macro2", - "quote", - "strsim 0.11.1", - "syn 2.0.114", -] - [[package]] name = "darling_macro" version = "0.20.11" @@ -1726,17 +1763,6 @@ dependencies = [ "syn 2.0.114", ] -[[package]] -name = "darling_macro" -version = "0.23.0" -source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "ac3984ec7bd6cfa798e62b4a642426a5be0e68f9401cfc2a01e3fa9ea2fcdb8d" -dependencies = [ - "darling_core 0.23.0", - "quote", - "syn 2.0.114", -] - [[package]] name = "dary_heap" version = "0.3.8" @@ -2537,7 +2563,7 @@ version = "0.11.0" source = "registry+https://github.com/rust-lang/crates.io-index" checksum = "658bce805d770f407bc62102fca7c2c64ceef2fbcb2b8bd19d2765ce093980de" dependencies = [ - "console", + "console 0.15.11", "shell-words", "tempfile", "thiserror 1.0.69", @@ -2609,6 +2635,15 @@ dependencies = [ "const-random", ] +[[package]] +name = "document-features" +version = "0.2.12" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "d4b8a88685455ed29a21542a33abd9cb6510b6b129abadabdcef0f4c55bc8f61" +dependencies = [ + "litrs", +] + [[package]] name = "dotenvy" version = "0.15.7" @@ -2849,6 +2884,12 @@ dependencies 
= [ "zune-inflate", ] +[[package]] +name = "extended" +version = "0.1.0" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "af9673d8203fcb076b19dfd17e38b3d4ae9f44959416ea532ce72415a6020365" + [[package]] name = "fast-float2" version = "0.2.3" @@ -2868,7 +2909,7 @@ source = "registry+https://github.com/rust-lang/crates.io-index" checksum = "04c269a76bfc6cea69553b7d040acb16c793119cebd97c756d21e08d0f075ff8" dependencies = [ "anyhow", - "hf-hub", + "hf-hub 0.4.3", "image", "ndarray", "ort", @@ -3402,25 +3443,6 @@ dependencies = [ "regex-syntax", ] -[[package]] -name = "h2" -version = "0.3.27" -source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "0beca50380b1fc32983fc1cb4587bfa4bb9e78fc259aad4a0032d2080309222d" -dependencies = [ - "bytes", - "fnv", - "futures-core", - "futures-sink", - "futures-util", - "http 0.2.12", - "indexmap 2.13.0", - "slab", - "tokio", - "tokio-util", - "tracing", -] - [[package]] name = "h2" version = "0.4.13" @@ -3432,7 +3454,7 @@ dependencies = [ "fnv", "futures-core", "futures-sink", - "http 1.4.0", + "http", "indexmap 2.13.0", "slab", "tokio", @@ -3544,20 +3566,44 @@ source = "registry+https://github.com/rust-lang/crates.io-index" checksum = "629d8f3bbeda9d148036d6b0de0a3ab947abd08ce90626327fc3547a49d59d97" dependencies = [ "dirs", - "http 1.4.0", - "indicatif", + "http", + "indicatif 0.17.11", "libc", "log", "native-tls", "rand 0.9.2", - "reqwest 0.12.28", + "reqwest", "serde", "serde_json", "thiserror 2.0.18", - "ureq", + "ureq 2.12.1", "windows-sys 0.60.2", ] +[[package]] +name = "hf-hub" +version = "0.5.0" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "aef3982638978efa195ff11b305f51f1f22f4f0a6cabee7af79b383ebee6a213" +dependencies = [ + "dirs", + "futures", + "http", + "indicatif 0.18.4", + "libc", + "log", + "native-tls", + "num_cpus", + "rand 0.9.2", + "reqwest", + "serde", + "serde_json", + "thiserror 2.0.18", + "tokio", + "ureq 3.2.0", + "windows-sys 
0.61.2", +] + [[package]] name = "hkdf" version = "0.12.4" @@ -3591,17 +3637,6 @@ version = "0.3.1" source = "registry+https://github.com/rust-lang/crates.io-index" checksum = "e9025058dae765dee5070ec375f591e2ba14638c63feff74f13805a72e523163" -[[package]] -name = "http" -version = "0.2.12" -source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "601cbb57e577e2f5ef5be8e7b83f0f63994f25aa94d673e54a92d5c516d101f1" -dependencies = [ - "bytes", - "fnv", - "itoa", -] - [[package]] name = "http" version = "1.4.0" @@ -3612,17 +3647,6 @@ dependencies = [ "itoa", ] -[[package]] -name = "http-body" -version = "0.4.6" -source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "7ceab25649e9960c0311ea418d17bee82c0dcec1bd053b5f9a66e265a693bed2" -dependencies = [ - "bytes", - "http 0.2.12", - "pin-project-lite", -] - [[package]] name = "http-body" version = "1.0.1" @@ -3630,7 +3654,7 @@ source = "registry+https://github.com/rust-lang/crates.io-index" checksum = "1efedce1fb8e6913f23e0c92de8e62cd5b772a67e7b3946df930a62566c93184" dependencies = [ "bytes", - "http 1.4.0", + "http", ] [[package]] @@ -3641,8 +3665,8 @@ checksum = "b021d93e26becf5dc7e1b75b1bed1fd93124b374ceb73f43d4d4eafec896a64a" dependencies = [ "bytes", "futures-core", - "http 1.4.0", - "http-body 1.0.1", + "http", + "http-body", "pin-project-lite", ] @@ -3670,30 +3694,6 @@ version = "2.3.0" source = "registry+https://github.com/rust-lang/crates.io-index" checksum = "135b12329e5e3ce057a9f972339ea52bc954fe1e9358ef27f95e89716fbc5424" -[[package]] -name = "hyper" -version = "0.14.32" -source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "41dfc780fdec9373c01bae43289ea34c972e40ee3c9f6b3c8801a35f35586ce7" -dependencies = [ - "bytes", - "futures-channel", - "futures-core", - "futures-util", - "h2 0.3.27", - "http 0.2.12", - "http-body 0.4.6", - "httparse", - "httpdate", - "itoa", - "pin-project-lite", - "socket2 0.5.10", - "tokio", - "tower-service", - "tracing", 
- "want", -] - [[package]] name = "hyper" version = "1.8.1" @@ -3704,9 +3704,9 @@ dependencies = [ "bytes", "futures-channel", "futures-core", - "h2 0.4.13", - "http 1.4.0", - "http-body 1.0.1", + "h2", + "http", + "http-body", "httparse", "httpdate", "itoa", @@ -3724,7 +3724,7 @@ source = "registry+https://github.com/rust-lang/crates.io-index" checksum = "73b7d8abf35697b81a825e386fc151e0d503e8cb5fcb93cc8669c376dfd6f278" dependencies = [ "hex", - "hyper 1.8.1", + "hyper", "hyper-util", "pin-project-lite", "tokio", @@ -3732,28 +3732,14 @@ dependencies = [ "winapi", ] -[[package]] -name = "hyper-rustls" -version = "0.24.2" -source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "ec3efd23720e2049821a693cbc7e65ea87c72f1c58ff2f9522ff332b1491e590" -dependencies = [ - "futures-util", - "http 0.2.12", - "hyper 0.14.32", - "rustls 0.21.12", - "tokio", - "tokio-rustls 0.24.1", -] - [[package]] name = "hyper-rustls" version = "0.27.7" source = "registry+https://github.com/rust-lang/crates.io-index" checksum = "e3c93eb611681b207e1fe55d5a71ecf91572ec8a6705cdb6857f7d8d5242cf58" dependencies = [ - "http 1.4.0", - "hyper 1.8.1", + "http", + "hyper", "hyper-util", "log", "rustls 0.23.36", @@ -3773,7 +3759,7 @@ checksum = "70206fc6890eaca9fde8a0bf71caa2ddfc9fe045ac9e5c70df101a7dbde866e0" dependencies = [ "bytes", "http-body-util", - "hyper 1.8.1", + "hyper", "hyper-util", "native-tls", "tokio", @@ -3791,15 +3777,15 @@ dependencies = [ "bytes", "futures-channel", "futures-util", - "http 1.4.0", - "http-body 1.0.1", - "hyper 1.8.1", + "http", + "http-body", + "hyper", "ipnet", "libc", "percent-encoding", "pin-project-lite", - "socket2 0.6.2", - "system-configuration 0.7.0", + "socket2", + "system-configuration", "tokio", "tower-service", "tracing", @@ -3814,7 +3800,7 @@ checksum = "986c5ce3b994526b3cd75578e62554abd09f0899d6206de48b3e96ab34ccc8c7" dependencies = [ "hex", "http-body-util", - "hyper 1.8.1", + "hyper", "hyper-util", "pin-project-lite", "tokio", @@ 
-4121,13 +4107,26 @@ version = "0.17.11" source = "registry+https://github.com/rust-lang/crates.io-index" checksum = "183b3088984b400f4cfac3620d5e076c84da5364016b4f49473de574b2586235" dependencies = [ - "console", + "console 0.15.11", "number_prefix", "portable-atomic", "unicode-width", "web-time", ] +[[package]] +name = "indicatif" +version = "0.18.4" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "25470f23803092da7d239834776d653104d551bc4d7eacaf31e6837854b8e9eb" +dependencies = [ + "console 0.16.2", + "portable-atomic", + "unicode-width", + "unit-prefix", + "web-time", +] + [[package]] name = "indoc" version = "2.0.7" @@ -4311,28 +4310,6 @@ dependencies = [ "jiff-tzdb", ] -[[package]] -name = "jni" -version = "0.21.1" -source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "1a87aa2bb7d2af34197c04845522473242e1aa17c12f4935d5856491a7fb8c97" -dependencies = [ - "cesu8", - "cfg-if", - "combine", - "jni-sys", - "log", - "thiserror 1.0.69", - "walkdir", - "windows-sys 0.45.0", -] - -[[package]] -name = "jni-sys" -version = "0.3.0" -source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "8eaf4bc02d17cbdd7ff4c7438cafcdf7fb9a4613313ad11b4f8fefe7d3fa0130" - [[package]] name = "jobserver" version = "0.1.34" @@ -4853,7 +4830,7 @@ version = "0.4.5" source = "registry+https://github.com/rust-lang/crates.io-index" checksum = "a2acdba67f84190067532fce07b51a435dd390d7cdc1129a05003e5cb3274cf0" dependencies = [ - "reqwest 0.12.28", + "reqwest", "serde", "serde_json", "serde_repr", @@ -5058,9 +5035,9 @@ dependencies = [ [[package]] name = "libc" -version = "0.2.180" +version = "0.2.181" source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "bcc35a38544a891a5f7c865aca548a982ccb3b8650a5b06d0fd33a10283c56fc" +checksum = "459427e2af2b9c839b132acb702a1c654d95e10f8c326bfc2ad11310e458b1c5" [[package]] name = "libfuzzer-sys" @@ -5072,6 +5049,16 @@ dependencies = [ "cc", ] +[[package]] +name = 
"libloading" +version = "0.8.9" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "d7c4b02199fee7c5d21a5ae7d8cfa79a6ef5bb2fc834d6e9058e89c825efdc55" +dependencies = [ + "cfg-if", + "windows-link 0.2.1", +] + [[package]] name = "libm" version = "0.2.16" @@ -5118,6 +5105,12 @@ version = "0.8.1" source = "registry+https://github.com/rust-lang/crates.io-index" checksum = "6373607a59f0be73a39b6fe456b8192fcc3585f602af20751600e974dd455e77" +[[package]] +name = "litrs" +version = "1.0.0" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "11d3d7f243d5c5a8b9bb5d6dd2b1602c0cb0b9db1621bafc7ed66e35ff9fe092" + [[package]] name = "lock_api" version = "0.4.14" @@ -5476,7 +5469,7 @@ dependencies = [ "bytes", "encoding_rs", "futures-util", - "http 1.4.0", + "http", "httparse", "memchr", "mime", @@ -5555,18 +5548,6 @@ dependencies = [ "libc", ] -[[package]] -name = "nix" -version = "0.31.1" -source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "225e7cfe711e0ba79a68baeddb2982723e4235247aefce1482f2f16c27865b66" -dependencies = [ - "bitflags 2.10.0", - "cfg-if", - "cfg_aliases", - "libc", -] - [[package]] name = "nom" version = "7.1.3" @@ -5805,7 +5786,7 @@ dependencies = [ "bytes", "chrono", "futures", - "http 1.4.0", + "http", "humantime", "itertools 0.14.0", "parking_lot", @@ -5948,9 +5929,9 @@ checksum = "46d7ab32b827b5b495bd90fa95a6cb65ccc293555dcc3199ae2937d2d237c8ed" dependencies = [ "async-trait", "bytes", - "http 1.4.0", + "http", "opentelemetry", - "reqwest 0.12.28", + "reqwest", "tracing", ] @@ -5961,13 +5942,13 @@ source = "registry+https://github.com/rust-lang/crates.io-index" checksum = "d899720fe06916ccba71c01d04ecd77312734e2de3467fd30d9d580c8ce85656" dependencies = [ "futures-core", - "http 1.4.0", + "http", "opentelemetry", "opentelemetry-http", "opentelemetry-proto", "opentelemetry_sdk", "prost 0.13.5", - "reqwest 0.12.28", + "reqwest", "thiserror 2.0.18", ] @@ -6055,7 +6036,7 @@ 
dependencies = [ "pkg-config", "sha2", "tar", - "ureq", + "ureq 2.12.1", ] [[package]] @@ -6108,12 +6089,6 @@ version = "0.1.1" source = "registry+https://github.com/rust-lang/crates.io-index" checksum = "35fb2e5f958ec131621fdd531e9fc186ed768cbe395337403ae56c17a74c68ec" -[[package]] -name = "pastey" -version = "0.2.1" -source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "b867cad97c0791bbd3aaa6472142568c6c9e8f71937e98379f584cfb0cf35bec" - [[package]] name = "path_abs" version = "0.5.1" @@ -6447,20 +6422,6 @@ dependencies = [ "unicode-ident", ] -[[package]] -name = "process-wrap" -version = "9.0.3" -source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "ccd9713fe2c91c3c85ac388b31b89de339365d2c995146e630b5e0da9d06526a" -dependencies = [ - "futures", - "indexmap 2.13.0", - "nix 0.31.1", - "tokio", - "tracing", - "windows", -] - [[package]] name = "profiling" version = "1.0.17" @@ -6633,7 +6594,7 @@ dependencies = [ "quinn-udp", "rustc-hash", "rustls 0.23.36", - "socket2 0.6.2", + "socket2", "thiserror 2.0.18", "tokio", "tracing", @@ -6646,7 +6607,6 @@ version = "0.11.13" source = "registry+https://github.com/rust-lang/crates.io-index" checksum = "f1906b49b0c3bc04b5fe5d86a77925ae6524a19b816ae38ce1e426255f1d8a31" dependencies = [ - "aws-lc-rs", "bytes", "getrandom 0.3.4", "lru-slab", @@ -6671,7 +6631,7 @@ dependencies = [ "cfg_aliases", "libc", "once_cell", - "socket2 0.6.2", + "socket2", "tracing", "windows-sys 0.60.2", ] @@ -6995,63 +6955,22 @@ checksum = "ba39f3699c378cd8970968dcbff9c43159ea4cfbd88d43c00b22f2ef10a435d2" [[package]] name = "reqwest" -version = "0.11.27" +version = "0.12.28" source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "dd67538700a17451e7cba03ac727fb961abb7607553461627b97de0b89cf4a62" +checksum = "eddd3ca559203180a307f12d114c268abf583f59b03cb906fd0b3ff8646c1147" dependencies = [ - "base64 0.21.7", + "base64 0.22.1", "bytes", "encoding_rs", + "futures-channel", 
"futures-core", "futures-util", - "h2 0.3.27", - "http 0.2.12", - "http-body 0.4.6", - "hyper 0.14.32", - "hyper-rustls 0.24.2", - "ipnet", - "js-sys", - "log", - "mime", - "once_cell", - "percent-encoding", - "pin-project-lite", - "rustls 0.21.12", - "rustls-pemfile", - "serde", - "serde_json", - "serde_urlencoded", - "sync_wrapper 0.1.2", - "system-configuration 0.5.1", - "tokio", - "tokio-rustls 0.24.1", - "tower-service", - "url", - "wasm-bindgen", - "wasm-bindgen-futures", - "web-sys", - "webpki-roots 0.25.4", - "winreg", -] - -[[package]] -name = "reqwest" -version = "0.12.28" -source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "eddd3ca559203180a307f12d114c268abf583f59b03cb906fd0b3ff8646c1147" -dependencies = [ - "base64 0.22.1", - "bytes", - "encoding_rs", - "futures-channel", - "futures-core", - "futures-util", - "h2 0.4.13", - "http 1.4.0", - "http-body 1.0.1", + "h2", + "http", + "http-body", "http-body-util", - "hyper 1.8.1", - "hyper-rustls 0.27.7", + "hyper", + "hyper-rustls", "hyper-tls", "hyper-util", "js-sys", @@ -7068,7 +6987,7 @@ dependencies = [ "serde", "serde_json", "serde_urlencoded", - "sync_wrapper 1.0.2", + "sync_wrapper", "tokio", "tokio-native-tls", "tokio-rustls 0.26.4", @@ -7079,51 +6998,11 @@ dependencies = [ "url", "wasm-bindgen", "wasm-bindgen-futures", - "wasm-streams 0.4.2", + "wasm-streams", "web-sys", "webpki-roots 1.0.6", ] -[[package]] -name = "reqwest" -version = "0.13.2" -source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "ab3f43e3283ab1488b624b44b0e988d0acea0b3214e694730a055cb6b2efa801" -dependencies = [ - "base64 0.22.1", - "bytes", - "futures-core", - "futures-util", - "http 1.4.0", - "http-body 1.0.1", - "http-body-util", - "hyper 1.8.1", - "hyper-rustls 0.27.7", - "hyper-util", - "js-sys", - "log", - "percent-encoding", - "pin-project-lite", - "quinn", - "rustls 0.23.36", - "rustls-pki-types", - "rustls-platform-verifier", - "serde", - "serde_json", - "sync_wrapper 
1.0.2", - "tokio", - "tokio-rustls 0.26.4", - "tokio-util", - "tower", - "tower-http", - "tower-service", - "url", - "wasm-bindgen", - "wasm-bindgen-futures", - "wasm-streams 0.5.0", - "web-sys", -] - [[package]] name = "rgb" version = "0.8.52" @@ -7148,13 +7027,13 @@ dependencies = [ "futures", "futures-timer", "glob", - "http 1.4.0", + "http", "mime", "mime_guess", "nanoid", "ordered-float", "pin-project-lite", - "reqwest 0.12.28", + "reqwest", "rig-derive", "schemars 1.2.1", "serde", @@ -7195,46 +7074,6 @@ dependencies = [ "windows-sys 0.52.0", ] -[[package]] -name = "rmcp" -version = "0.16.0" -source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "cc4c9c94680f75470ee8083a0667988b5d7b5beb70b9f998a8e51de7c682ce60" -dependencies = [ - "async-trait", - "base64 0.22.1", - "chrono", - "futures", - "http 1.4.0", - "pastey 0.2.1", - "pin-project-lite", - "process-wrap", - "reqwest 0.13.2", - "rmcp-macros", - "schemars 1.2.1", - "serde", - "serde_json", - "sse-stream", - "thiserror 2.0.18", - "tokio", - "tokio-stream", - "tokio-util", - "tracing", -] - -[[package]] -name = "rmcp-macros" -version = "0.16.0" -source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "90c23c8f26cae4da838fbc3eadfaecf2d549d97c04b558e7bd90526a9c28b42a" -dependencies = [ - "darling 0.23.0", - "proc-macro2", - "quote", - "serde_json", - "syn 2.0.114", -] - [[package]] name = "roaring" version = "0.10.12" @@ -7486,15 +7325,6 @@ dependencies = [ "security-framework 3.5.1", ] -[[package]] -name = "rustls-pemfile" -version = "1.0.4" -source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "1c74cae0a4cf6ccbbf5f359f08efdf8ee7e1dc532573bf0db71968cb56b1448c" -dependencies = [ - "base64 0.21.7", -] - [[package]] name = "rustls-pki-types" version = "1.14.0" @@ -7505,33 +7335,6 @@ dependencies = [ "zeroize", ] -[[package]] -name = "rustls-platform-verifier" -version = "0.6.2" -source = "registry+https://github.com/rust-lang/crates.io-index" 
-checksum = "1d99feebc72bae7ab76ba994bb5e121b8d83d910ca40b36e0921f53becc41784" -dependencies = [ - "core-foundation 0.10.1", - "core-foundation-sys", - "jni", - "log", - "once_cell", - "rustls 0.23.36", - "rustls-native-certs", - "rustls-platform-verifier-android", - "rustls-webpki 0.103.9", - "security-framework 3.5.1", - "security-framework-sys", - "webpki-root-certs", - "windows-sys 0.52.0", -] - -[[package]] -name = "rustls-platform-verifier-android" -version = "0.1.1" -source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "f87165f0995f63a9fbeea62b64d10b4d9d8e78ec6d7d51fb2125fda7bb36788f" - [[package]] name = "rustls-webpki" version = "0.101.7" @@ -7645,7 +7448,6 @@ version = "1.2.1" source = "registry+https://github.com/rust-lang/crates.io-index" checksum = "a2b42f36aa1cd011945615b92222f6bf73c599a102a300334cd7f8dbeec726cc" dependencies = [ - "chrono", "dyn-clone", "ref-cast", "schemars_derive 1.2.1", @@ -7937,7 +7739,7 @@ dependencies = [ "mime_guess", "parking_lot", "percent-encoding", - "reqwest 0.12.28", + "reqwest", "rustc-hash", "secrecy", "serde", @@ -8121,10 +7923,10 @@ dependencies = [ "futures-util", "hex", "hmac", - "http 1.4.0", + "http", "http-body-util", - "hyper 1.8.1", - "hyper-rustls 0.27.7", + "hyper", + "hyper-rustls", "hyper-util", "lazy_static", "mime", @@ -8176,16 +7978,6 @@ dependencies = [ "syn 2.0.114", ] -[[package]] -name = "socket2" -version = "0.5.10" -source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "e22376abed350d73dd1cd119b57ffccad95b4e585a7cda43e286245ce23c0678" -dependencies = [ - "libc", - "windows-sys 0.52.0", -] - [[package]] name = "socket2" version = "0.6.2" @@ -8209,7 +8001,7 @@ dependencies = [ [[package]] name = "spacebot" -version = "0.1.14" +version = "0.1.12" dependencies = [ "aes-gcm", "anyhow", @@ -8224,7 +8016,6 @@ dependencies = [ "chromiumoxide", "chromiumoxide_cdp", "chrono", - "chrono-tz", "clap", "config", "daemonize", @@ -8232,7 +8023,7 @@ dependencies = [ 
"dirs", "fastembed", "futures", - "hex", + "hf-hub 0.5.0", "ignore", "indoc", "lance-index", @@ -8252,9 +8043,8 @@ dependencies = [ "rand 0.9.2", "redb", "regex", - "reqwest 0.12.28", + "reqwest", "rig-core", - "rmcp", "rust-embed", "rustls 0.23.36", "schemars 0.8.22", @@ -8265,6 +8055,7 @@ dependencies = [ "sha2", "slack-morphism", "sqlx", + "symphonia", "teloxide", "tempfile", "thiserror 2.0.18", @@ -8281,6 +8072,7 @@ dependencies = [ "twitch-irc", "urlencoding", "uuid", + "whisper-rs", "zip", ] @@ -8544,19 +8336,6 @@ dependencies = [ "uuid", ] -[[package]] -name = "sse-stream" -version = "0.2.1" -source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "eb4dc4d33c68ec1f27d386b5610a351922656e1fdf5c05bbaad930cd1519479a" -dependencies = [ - "bytes", - "futures-util", - "http-body 1.0.1", - "http-body-util", - "pin-project-lite", -] - [[package]] name = "stable_deref_trait" version = "1.2.1" @@ -8645,6 +8424,178 @@ version = "2.6.1" source = "registry+https://github.com/rust-lang/crates.io-index" checksum = "13c2bddecc57b384dee18652358fb23172facb8a2c51ccc10d74c157bdea3292" +[[package]] +name = "symphonia" +version = "0.5.5" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "5773a4c030a19d9bfaa090f49746ff35c75dfddfa700df7a5939d5e076a57039" +dependencies = [ + "lazy_static", + "symphonia-bundle-flac", + "symphonia-bundle-mp3", + "symphonia-codec-aac", + "symphonia-codec-adpcm", + "symphonia-codec-pcm", + "symphonia-codec-vorbis", + "symphonia-core", + "symphonia-format-isomp4", + "symphonia-format-mkv", + "symphonia-format-ogg", + "symphonia-format-riff", + "symphonia-metadata", +] + +[[package]] +name = "symphonia-bundle-flac" +version = "0.5.5" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "c91565e180aea25d9b80a910c546802526ffd0072d0b8974e3ebe59b686c9976" +dependencies = [ + "log", + "symphonia-core", + "symphonia-metadata", + "symphonia-utils-xiph", +] + +[[package]] +name = 
"symphonia-bundle-mp3" +version = "0.5.5" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "4872dd6bb56bf5eac799e3e957aa1981086c3e613b27e0ac23b176054f7c57ed" +dependencies = [ + "lazy_static", + "log", + "symphonia-core", + "symphonia-metadata", +] + +[[package]] +name = "symphonia-codec-aac" +version = "0.5.5" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "4c263845aa86881416849c1729a54c7f55164f8b96111dba59de46849e73a790" +dependencies = [ + "lazy_static", + "log", + "symphonia-core", +] + +[[package]] +name = "symphonia-codec-adpcm" +version = "0.5.5" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "2dddc50e2bbea4cfe027441eece77c46b9f319748605ab8f3443350129ddd07f" +dependencies = [ + "log", + "symphonia-core", +] + +[[package]] +name = "symphonia-codec-pcm" +version = "0.5.5" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "4e89d716c01541ad3ebe7c91ce4c8d38a7cf266a3f7b2f090b108fb0cb031d95" +dependencies = [ + "log", + "symphonia-core", +] + +[[package]] +name = "symphonia-codec-vorbis" +version = "0.5.5" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "f025837c309cd69ffef572750b4a2257b59552c5399a5e49707cc5b1b85d1c73" +dependencies = [ + "log", + "symphonia-core", + "symphonia-utils-xiph", +] + +[[package]] +name = "symphonia-core" +version = "0.5.5" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "ea00cc4f79b7f6bb7ff87eddc065a1066f3a43fe1875979056672c9ef948c2af" +dependencies = [ + "arrayvec", + "bitflags 1.3.2", + "bytemuck", + "lazy_static", + "log", +] + +[[package]] +name = "symphonia-format-isomp4" +version = "0.5.5" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "243739585d11f81daf8dac8d9f3d18cc7898f6c09a259675fc364b382c30e0a5" +dependencies = [ + "encoding_rs", + "log", + "symphonia-core", + "symphonia-metadata", + "symphonia-utils-xiph", +] + 
+[[package]] +name = "symphonia-format-mkv" +version = "0.5.5" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "122d786d2c43a49beb6f397551b4a050d8229eaa54c7ddf9ee4b98899b8742d0" +dependencies = [ + "lazy_static", + "log", + "symphonia-core", + "symphonia-metadata", + "symphonia-utils-xiph", +] + +[[package]] +name = "symphonia-format-ogg" +version = "0.5.5" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "2b4955c67c1ed3aa8ae8428d04ca8397fbef6a19b2b051e73b5da8b1435639cb" +dependencies = [ + "log", + "symphonia-core", + "symphonia-metadata", + "symphonia-utils-xiph", +] + +[[package]] +name = "symphonia-format-riff" +version = "0.5.5" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "c2d7c3df0e7d94efb68401d81906eae73c02b40d5ec1a141962c592d0f11a96f" +dependencies = [ + "extended", + "log", + "symphonia-core", + "symphonia-metadata", +] + +[[package]] +name = "symphonia-metadata" +version = "0.5.5" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "36306ff42b9ffe6e5afc99d49e121e0bd62fe79b9db7b9681d48e29fa19e6b16" +dependencies = [ + "encoding_rs", + "lazy_static", + "log", + "symphonia-core", +] + +[[package]] +name = "symphonia-utils-xiph" +version = "0.5.5" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "ee27c85ab799a338446b68eec77abf42e1a6f1bb490656e121c6e27bfbab9f16" +dependencies = [ + "symphonia-core", + "symphonia-metadata", +] + [[package]] name = "syn" version = "1.0.109" @@ -8667,12 +8618,6 @@ dependencies = [ "unicode-ident", ] -[[package]] -name = "sync_wrapper" -version = "0.1.2" -source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "2047c6ded9c721764247e62cd3b03c09ffc529b2ba5b10ec482ae507a4a70160" - [[package]] name = "sync_wrapper" version = "1.0.2" @@ -8693,17 +8638,6 @@ dependencies = [ "syn 2.0.114", ] -[[package]] -name = "system-configuration" -version = "0.5.1" -source = 
"registry+https://github.com/rust-lang/crates.io-index" -checksum = "ba3a3adc5c275d719af8cb4272ea1c4a6d668a777f37e115f6d11ddbc1c8e0e7" -dependencies = [ - "bitflags 1.3.2", - "core-foundation 0.9.4", - "system-configuration-sys 0.5.0", -] - [[package]] name = "system-configuration" version = "0.7.0" @@ -8712,17 +8646,7 @@ checksum = "a13f3d0daba03132c0aa9767f98351b3488edc2c100cda2d2ec2b04f3d8d3c8b" dependencies = [ "bitflags 2.10.0", "core-foundation 0.9.4", - "system-configuration-sys 0.6.0", -] - -[[package]] -name = "system-configuration-sys" -version = "0.5.0" -source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "a75fb188eb626b924683e3b95e3a48e63551fcfb51949de2f06a9d91dbee93c9" -dependencies = [ - "core-foundation-sys", - "libc", + "system-configuration-sys", ] [[package]] @@ -8958,7 +8882,7 @@ dependencies = [ "once_cell", "pin-project", "rc-box", - "reqwest 0.12.28", + "reqwest", "rgb", "serde", "serde_json", @@ -9168,7 +9092,7 @@ dependencies = [ "parking_lot", "pin-project-lite", "signal-hook-registry", - "socket2 0.6.2", + "socket2", "tokio-macros", "tracing", "windows-sys 0.61.2", @@ -9397,8 +9321,8 @@ dependencies = [ "async-trait", "base64 0.22.1", "bytes", - "http 1.4.0", - "http-body 1.0.1", + "http", + "http-body", "http-body-util", "percent-encoding", "pin-project", @@ -9418,7 +9342,7 @@ dependencies = [ "futures-core", "futures-util", "pin-project-lite", - "sync_wrapper 1.0.2", + "sync_wrapper", "tokio", "tower-layer", "tower-service", @@ -9436,8 +9360,8 @@ dependencies = [ "bytes", "futures-core", "futures-util", - "http 1.4.0", - "http-body 1.0.1", + "http", + "http-body", "http-body-util", "http-range-header", "httpdate", @@ -9597,7 +9521,7 @@ dependencies = [ "byteorder", "bytes", "data-encoding", - "http 1.4.0", + "http", "httparse", "log", "rand 0.8.5", @@ -9617,7 +9541,7 @@ checksum = "8628dcc84e5a09eb3d8423d6cb682965dea9133204e8fb3efee74c2a0c259442" dependencies = [ "bytes", "data-encoding", - "http 1.4.0", + 
"http", "httparse", "log", "rand 0.9.2", @@ -9640,8 +9564,6 @@ dependencies = [ "either", "enum_dispatch", "futures-util", - "reqwest 0.11.27", - "serde", "smallvec", "thiserror 1.0.69", "tokio", @@ -9789,6 +9711,12 @@ version = "0.1.1" source = "registry+https://github.com/rust-lang/crates.io-index" checksum = "39ec24b3121d976906ece63c9daad25b85969647682eee313cb5779fdd69e14e" +[[package]] +name = "unit-prefix" +version = "0.5.2" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "81e544489bf3d8ef66c953931f56617f423cd4b5494be343d9b9d3dda037b9a3" + [[package]] name = "universal-hash" version = "0.5.1" @@ -9825,6 +9753,42 @@ dependencies = [ "webpki-roots 0.26.11", ] +[[package]] +name = "ureq" +version = "3.2.0" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "fdc97a28575b85cfedf2a7e7d3cc64b3e11bd8ac766666318003abbacc7a21fc" +dependencies = [ + "base64 0.22.1", + "cookie_store", + "der", + "flate2", + "log", + "native-tls", + "percent-encoding", + "rustls 0.23.36", + "rustls-pki-types", + "serde", + "serde_json", + "socks", + "ureq-proto", + "utf-8", + "webpki-root-certs", + "webpki-roots 1.0.6", +] + +[[package]] +name = "ureq-proto" +version = "0.5.3" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "d81f9efa9df032be5934a46a068815a10a042b494b6a58cb0a1a97bb5467ed6f" +dependencies = [ + "base64 0.22.1", + "http", + "httparse", + "log", +] + [[package]] name = "url" version = "2.5.8" @@ -10052,19 +10016,6 @@ dependencies = [ "web-sys", ] -[[package]] -name = "wasm-streams" -version = "0.5.0" -source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "9d1ec4f6517c9e11ae630e200b2b65d193279042e28edd4a2cda233e46670bbb" -dependencies = [ - "futures-util", - "js-sys", - "wasm-bindgen", - "wasm-bindgen-futures", - "web-sys", -] - [[package]] name = "wasmparser" version = "0.244.0" @@ -10099,9 +10050,9 @@ dependencies = [ [[package]] name = "webpki-root-certs" -version = 
"1.0.5" +version = "1.0.6" source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "36a29fc0408b113f68cf32637857ab740edfafdf460c326cd2afaa2d84cc05dc" +checksum = "804f18a4ac2676ffb4e8b5b5fa9ae38af06df08162314f96a68d2a363e21a8ca" dependencies = [ "rustls-pki-types", ] @@ -10147,6 +10098,27 @@ dependencies = [ "winsafe", ] +[[package]] +name = "whisper-rs" +version = "0.15.1" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "71ea5d2401f30f51d08126a2d133fee4c1955136519d7ac6cf6f5ac0a91e6bc8" +dependencies = [ + "whisper-rs-sys", +] + +[[package]] +name = "whisper-rs-sys" +version = "0.14.1" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "b5e2a6e06e7ac7b8f53c53a5f50bb0bc823ba69b63ecd887339f807a5598bbd2" +dependencies = [ + "bindgen", + "cfg-if", + "cmake", + "fs_extra", +] + [[package]] name = "whoami" version = "1.6.1" @@ -10188,27 +10160,6 @@ version = "0.4.0" source = "registry+https://github.com/rust-lang/crates.io-index" checksum = "712e227841d057c1ee1cd2fb22fa7e5a5461ae8e48fa2ca79ec42cfc1931183f" -[[package]] -name = "windows" -version = "0.62.2" -source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "527fadee13e0c05939a6a05d5bd6eec6cd2e3dbd648b9f8e447c6518133d8580" -dependencies = [ - "windows-collections", - "windows-core", - "windows-future", - "windows-numerics", -] - -[[package]] -name = "windows-collections" -version = "0.3.2" -source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "23b2d95af1a8a14a3c7367e1ed4fc9c20e0a26e79551b1454d72583c97cc6610" -dependencies = [ - "windows-core", -] - [[package]] name = "windows-core" version = "0.62.2" @@ -10222,17 +10173,6 @@ dependencies = [ "windows-strings 0.5.1", ] -[[package]] -name = "windows-future" -version = "0.3.2" -source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "e1d6f90251fe18a279739e78025bd6ddc52a7e22f921070ccdc67dde84c605cb" -dependencies = [ - 
"windows-core", - "windows-link 0.2.1", - "windows-threading", -] - [[package]] name = "windows-implement" version = "0.60.2" @@ -10267,16 +10207,6 @@ version = "0.2.1" source = "registry+https://github.com/rust-lang/crates.io-index" checksum = "f0805222e57f7521d6a62e36fa9163bc891acd422f971defe97d64e70d0a4fe5" -[[package]] -name = "windows-numerics" -version = "0.3.1" -source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "6e2e40844ac143cdb44aead537bbf727de9b044e107a0f1220392177d15b0f26" -dependencies = [ - "windows-core", - "windows-link 0.2.1", -] - [[package]] name = "windows-registry" version = "0.5.3" @@ -10335,15 +10265,6 @@ dependencies = [ "windows-link 0.2.1", ] -[[package]] -name = "windows-sys" -version = "0.45.0" -source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "75283be5efb2831d37ea142365f009c02ec203cd29a3ebecbc093d52315b66d0" -dependencies = [ - "windows-targets 0.42.2", -] - [[package]] name = "windows-sys" version = "0.48.0" @@ -10389,21 +10310,6 @@ dependencies = [ "windows-link 0.2.1", ] -[[package]] -name = "windows-targets" -version = "0.42.2" -source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "8e5180c00cd44c9b1c88adb3693291f1cd93605ded80c250a75d472756b4d071" -dependencies = [ - "windows_aarch64_gnullvm 0.42.2", - "windows_aarch64_msvc 0.42.2", - "windows_i686_gnu 0.42.2", - "windows_i686_msvc 0.42.2", - "windows_x86_64_gnu 0.42.2", - "windows_x86_64_gnullvm 0.42.2", - "windows_x86_64_msvc 0.42.2", -] - [[package]] name = "windows-targets" version = "0.48.5" @@ -10452,21 +10358,6 @@ dependencies = [ "windows_x86_64_msvc 0.53.1", ] -[[package]] -name = "windows-threading" -version = "0.2.1" -source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "3949bd5b99cafdf1c7ca86b43ca564028dfe27d66958f2470940f73d86d75b37" -dependencies = [ - "windows-link 0.2.1", -] - -[[package]] -name = "windows_aarch64_gnullvm" -version = "0.42.2" -source = 
"registry+https://github.com/rust-lang/crates.io-index" -checksum = "597a5118570b68bc08d8d59125332c54f1ba9d9adeedeef5b99b02ba2b0698f8" - [[package]] name = "windows_aarch64_gnullvm" version = "0.48.5" @@ -10485,12 +10376,6 @@ version = "0.53.1" source = "registry+https://github.com/rust-lang/crates.io-index" checksum = "a9d8416fa8b42f5c947f8482c43e7d89e73a173cead56d044f6a56104a6d1b53" -[[package]] -name = "windows_aarch64_msvc" -version = "0.42.2" -source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "e08e8864a60f06ef0d0ff4ba04124db8b0fb3be5776a5cd47641e942e58c4d43" - [[package]] name = "windows_aarch64_msvc" version = "0.48.5" @@ -10509,12 +10394,6 @@ version = "0.53.1" source = "registry+https://github.com/rust-lang/crates.io-index" checksum = "b9d782e804c2f632e395708e99a94275910eb9100b2114651e04744e9b125006" -[[package]] -name = "windows_i686_gnu" -version = "0.42.2" -source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "c61d927d8da41da96a81f029489353e68739737d3beca43145c8afec9a31a84f" - [[package]] name = "windows_i686_gnu" version = "0.48.5" @@ -10545,12 +10424,6 @@ version = "0.53.1" source = "registry+https://github.com/rust-lang/crates.io-index" checksum = "fa7359d10048f68ab8b09fa71c3daccfb0e9b559aed648a8f95469c27057180c" -[[package]] -name = "windows_i686_msvc" -version = "0.42.2" -source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "44d840b6ec649f480a41c8d80f9c65108b92d89345dd94027bfe06ac444d1060" - [[package]] name = "windows_i686_msvc" version = "0.48.5" @@ -10569,12 +10442,6 @@ version = "0.53.1" source = "registry+https://github.com/rust-lang/crates.io-index" checksum = "1e7ac75179f18232fe9c285163565a57ef8d3c89254a30685b57d83a38d326c2" -[[package]] -name = "windows_x86_64_gnu" -version = "0.42.2" -source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "8de912b8b8feb55c064867cf047dda097f92d51efad5b491dfb98f6bbb70cb36" - [[package]] name = "windows_x86_64_gnu" 
version = "0.48.5" @@ -10593,12 +10460,6 @@ version = "0.53.1" source = "registry+https://github.com/rust-lang/crates.io-index" checksum = "9c3842cdd74a865a8066ab39c8a7a473c0778a3f29370b5fd6b4b9aa7df4a499" -[[package]] -name = "windows_x86_64_gnullvm" -version = "0.42.2" -source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "26d41b46a36d453748aedef1486d5c7a85db22e56aff34643984ea85514e94a3" - [[package]] name = "windows_x86_64_gnullvm" version = "0.48.5" @@ -10617,12 +10478,6 @@ version = "0.53.1" source = "registry+https://github.com/rust-lang/crates.io-index" checksum = "0ffa179e2d07eee8ad8f57493436566c7cc30ac536a3379fdf008f47f6bb7ae1" -[[package]] -name = "windows_x86_64_msvc" -version = "0.42.2" -source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "9aec5da331524158c6d1a4ac0ab1541149c0b9505fde06423b02f5ef0106b9f0" - [[package]] name = "windows_x86_64_msvc" version = "0.48.5" @@ -10659,16 +10514,6 @@ dependencies = [ "memchr", ] -[[package]] -name = "winreg" -version = "0.50.0" -source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "524e57b2c537c0f9b1e69f1965311ec12182b4122e45035b1508cd24d2adadb1" -dependencies = [ - "cfg-if", - "windows-sys 0.48.0", -] - [[package]] name = "winsafe" version = "0.0.19" diff --git a/Cargo.toml b/Cargo.toml index ba7a53a3c..608fa4ae7 100644 --- a/Cargo.toml +++ b/Cargo.toml @@ -137,11 +137,15 @@ tempfile = "3" # Prometheus metrics (optional, behind "metrics" feature) prometheus = { version = "0.13", optional = true } +whisper-rs = { version = "0.15", optional = true } +hf-hub = { version = "0.5", optional = true } +symphonia = { version = "0.5", features = ["mp3", "aac", "flac", "ogg", "wav", "isomp4"], optional = true } pdf-extract = "0.10.0" open = "5.3.3" urlencoding = "2.1.3" [features] +stt-whisper = ["dep:whisper-rs", "dep:hf-hub", "dep:symphonia"] metrics = ["dep:prometheus"] [lints.clippy] diff --git a/src/agent/channel.rs b/src/agent/channel.rs index 
fd1a83d71..a091f10d2 100644 --- a/src/agent/channel.rs +++ b/src/agent/channel.rs @@ -1964,6 +1964,32 @@ async fn transcribe_audio_attachment( )); } + // Local Whisper backend — bypass the LLM provider path entirely. + #[cfg(feature = "stt-whisper")] + if let Some(model_spec) = voice_model.strip_prefix("whisper-local://") { + let transcript = match crate::stt::transcribe(model_spec, &bytes).await { + Ok(text) if text.is_empty() => { + tracing::warn!(filename = %attachment.filename, "local Whisper returned empty transcript"); + return UserContent::text(format!( + "[Audio transcription returned empty text for {}]", + attachment.filename + )); + } + Ok(text) => text, + Err(error) => { + tracing::warn!(%error, filename = %attachment.filename, "local Whisper transcription failed"); + return UserContent::text(format!( + "[Audio transcription failed for {}: {}]", + attachment.filename, error + )); + } + }; + return UserContent::text(format!( + "\n{}\n", + attachment.filename, attachment.mime_type, transcript + )); + } + let (provider_id, model_name) = match deps.llm_manager.resolve_model(voice_model) { Ok(parts) => parts, Err(error) => { diff --git a/src/lib.rs b/src/lib.rs index 98b4eac3f..9abc71594 100644 --- a/src/lib.rs +++ b/src/lib.rs @@ -23,6 +23,8 @@ pub mod skills; #[cfg(feature = "metrics")] pub mod telemetry; pub mod tools; +#[cfg(feature = "stt-whisper")] +pub mod stt; pub mod update; pub use error::{Error, Result}; diff --git a/src/stt.rs b/src/stt.rs new file mode 100644 index 000000000..2e0e5a8cc --- /dev/null +++ b/src/stt.rs @@ -0,0 +1,275 @@ +//! Local Whisper speech-to-text via whisper-rs. +//! +//! Only compiled when the `stt-whisper` feature is enabled. +//! Exposed as a single async `transcribe` function that lazily loads and caches +//! the model context for the lifetime of the process. 
+ +#[cfg(feature = "stt-whisper")] +pub use local::transcribe; + +#[cfg(feature = "stt-whisper")] +mod local { + use std::sync::OnceLock; + + use hf_hub::api::sync::Api; + use whisper_rs::{FullParams, SamplingStrategy, WhisperContext, WhisperContextParameters}; + + /// Known model size names and their GGML filenames on `ggerganov/whisper.cpp`. + const KNOWN_SIZES: &[(&str, &str)] = &[ + ("tiny", "ggml-tiny.bin"), + ("tiny.en", "ggml-tiny.en.bin"), + ("base", "ggml-base.bin"), + ("base.en", "ggml-base.en.bin"), + ("small", "ggml-small.bin"), + ("small.en", "ggml-small.en.bin"), + ("medium", "ggml-medium.bin"), + ("medium.en", "ggml-medium.en.bin"), + ("large", "ggml-large-v3.bin"), + ("large-v1", "ggml-large-v1.bin"), + ("large-v2", "ggml-large-v2.bin"), + ("large-v3", "ggml-large-v3.bin"), + ]; + + /// Cached (model_spec, WhisperContext) — one per process. + /// + /// If the user changes `routing.voice` at runtime we just keep using the + /// already-loaded model; a restart is required to switch models. + static CONTEXT: OnceLock<(String, WhisperContext)> = OnceLock::new(); + + #[derive(Debug, thiserror::Error)] + pub enum WhisperError { + #[error("model not found and could not be downloaded: {0}")] + ModelNotFound(String), + #[error("hf-hub error: {0}")] + HfHub(String), + #[error("failed to load whisper model: {0}")] + Load(String), + #[error("failed to create whisper state: {0}")] + State(String), + #[error("transcription failed: {0}")] + Transcription(String), + #[error("audio decode error: {0}")] + Decode(String), + } + + /// Transcribe raw audio bytes using the local Whisper model. + /// + /// `model_spec` is the part after `whisper-local://`: + /// - A known size name (`small`, `medium`, `large`, …) — downloaded from HF + /// into the HF cache on first use. + /// - An absolute path (`/path/to/ggml-small.bin`) — loaded directly. 
+    pub async fn transcribe(model_spec: &str, audio: &[u8]) -> Result<String, WhisperError> {
+        // `spawn_blocking` takes a `'static` closure, so hand it owned copies of both inputs.
+        let model_spec = model_spec.to_owned();
+        let audio = audio.to_vec();
+
+        // Whisper inference is CPU-bound and blocking — run on a thread pool.
+        // `map_err` covers the join failure of the spawned task; the trailing `?` unwraps
+        // that layer and the inner transcription result becomes the return value.
+        tokio::task::spawn_blocking(move || transcribe_blocking(&model_spec, &audio))
+            .await
+            .map_err(|e| WhisperError::Transcription(e.to_string()))?
+    }
+
+    /// Synchronous transcription pipeline: decode the audio, run Whisper, join the segments.
+    ///
+    /// Returns the non-empty, trimmed segment texts joined by single spaces. The result is
+    /// an empty string when no segment yields text.
+    ///
+    /// # Errors
+    ///
+    /// Propagates decode and model-loading errors, and maps state-creation and inference
+    /// failures to `WhisperError::State` and `WhisperError::Transcription`.
+    fn transcribe_blocking(model_spec: &str, audio: &[u8]) -> Result<String, WhisperError> {
+        // Decode first: undecodable audio is rejected before any model is resolved or loaded.
+        let samples = decode_to_f32(audio)?;
+
+        let ctx = get_or_load_context(model_spec)?;
+        let mut state = ctx
+            .create_state()
+            .map_err(|e| WhisperError::State(e.to_string()))?;
+
+        let mut params = FullParams::new(SamplingStrategy::Greedy { best_of: 1 });
+        params.set_language(Some("auto"));
+        params.set_print_progress(false);
+        params.set_print_realtime(false);
+        params.set_print_timestamps(false);
+
+        state
+            .full(params, &samples)
+            .map_err(|e| WhisperError::Transcription(e.to_string()))?;
+
+        let n = state.full_n_segments();
+        // Checked conversion instead of `as`: a negative count must not wrap into a huge
+        // capacity request.
+        let mut parts = Vec::with_capacity(usize::try_from(n).unwrap_or(0));
+        for i in 0..n {
+            let Some(segment) = state.get_segment(i) else {
+                continue;
+            };
+            match segment.to_str() {
+                Ok(text) => {
+                    let trimmed = text.trim();
+                    if !trimmed.is_empty() {
+                        parts.push(trimmed.to_owned());
+                    }
+                }
+                // Keep going, but leave a trace instead of dropping the segment silently.
+                Err(error) => {
+                    tracing::warn!(
+                        segment = i,
+                        %error,
+                        "skipping Whisper segment with unreadable text"
+                    );
+                }
+            }
+        }
+
+        Ok(parts.join(" "))
+    }
+
+    /// Return the cached context, loading it first if necessary.
+    fn get_or_load_context(model_spec: &str) -> Result<&'static WhisperContext, WhisperError> {
+        if let Some((loaded_spec, ctx)) = CONTEXT.get() {
+            // Only one model is cached per process; make an ignored spec visible in the logs
+            // instead of silently transcribing with a different model.
+            if loaded_spec.as_str() != model_spec {
+                tracing::warn!(
+                    requested = %model_spec,
+                    loaded = %loaded_spec,
+                    "a different Whisper model is already loaded; restart to switch models"
+                );
+            }
+            return Ok(ctx);
+        }
+
+        let model_path = resolve_model_path(model_spec)?;
+
+        tracing::info!(model_path = %model_path, "loading local Whisper model");
+
+        let params = WhisperContextParameters::default();
+        let ctx = WhisperContext::new_with_params(&model_path, params)
+            .map_err(|e| WhisperError::Load(e.to_string()))?;
+
+        // If another thread stored a context while this one was loading, that one is kept
+        // and ours is dropped. Either way the reference returned is the cached value, so
+        // no `unwrap` is needed.
+        let (_, cached) = CONTEXT.get_or_init(|| (model_spec.to_owned(), ctx));
+
+        tracing::info!(model_path = %model_path, "Whisper model loaded and cached");
+
+        Ok(cached)
+    }
+
+    /// Resolve a model spec to a path on disk, downloading via hf-hub if needed.
+    ///
+    /// An absolute `spec` must name an existing file and is returned unchanged. Any other
+    /// value is looked up in `KNOWN_SIZES` and fetched from `ggerganov/whisper.cpp`.
+    ///
+    /// # Errors
+    ///
+    /// - `WhisperError::ModelNotFound` if the file is missing or the size name is unknown.
+    /// - `WhisperError::HfHub` if the hf-hub client cannot be created or the fetch fails.
+    fn resolve_model_path(spec: &str) -> Result<String, WhisperError> {
+        // Absolute path — use directly. `is_absolute` also recognises platform-specific
+        // absolute forms; the leading-slash check keeps the previous behaviour everywhere.
+        let candidate = std::path::Path::new(spec);
+        if candidate.is_absolute() || spec.starts_with('/') {
+            // Require a file, so a directory is reported here rather than as a load failure.
+            if candidate.is_file() {
+                return Ok(spec.to_owned());
+            }
+            return Err(WhisperError::ModelNotFound(format!(
+                "model file not found: {spec}"
+            )));
+        }
+
+        // Known size name — fetch via hf-hub (uses HF_HOME cache, downloads if missing).
+        let Some(filename) = KNOWN_SIZES
+            .iter()
+            .find_map(|(name, file)| (*name == spec).then_some(*file))
+        else {
+            let known: Vec<&str> = KNOWN_SIZES.iter().map(|(name, _)| *name).collect();
+            return Err(WhisperError::ModelNotFound(format!(
+                "unknown model size '{spec}'; use one of: {}",
+                known.join(", ")
+            )));
+        };
+
+        tracing::info!(model = %spec, filename = %filename, "fetching Whisper model via hf-hub");
+
+        let api = Api::new().map_err(|e| WhisperError::HfHub(e.to_string()))?;
+        let repo = api.model("ggerganov/whisper.cpp".to_owned());
+        let path = repo
+            .get(filename)
+            .map_err(|e| WhisperError::HfHub(e.to_string()))?;
+
+        // The return type is `String`, so non-UTF-8 path bytes are replaced lossily.
+        Ok(path.to_string_lossy().into_owned())
+    }
+
+    /// Decode arbitrary audio bytes to 16 kHz mono f32 samples for Whisper.
+    ///
+    /// Uses symphonia so ogg/opus, mp3, flac, wav, etc. all work without manual
+    /// format detection.
+ fn decode_to_f32(audio: &[u8]) -> Result, WhisperError> { + use symphonia::core::codecs::DecoderOptions; + use symphonia::core::formats::FormatOptions; + use symphonia::core::io::MediaSourceStream; + use symphonia::core::meta::MetadataOptions; + use symphonia::core::probe::Hint; + + let cursor = std::io::Cursor::new(audio.to_vec()); + let mss = MediaSourceStream::new(Box::new(cursor), Default::default()); + + let probed = symphonia::default::get_probe() + .format( + &Hint::new(), + mss, + &FormatOptions::default(), + &MetadataOptions::default(), + ) + .map_err(|e| WhisperError::Decode(e.to_string()))?; + + let mut format = probed.format; + let track = format + .tracks() + .iter() + .find(|t| { + t.codec_params.codec != symphonia::core::codecs::CODEC_TYPE_NULL + }) + .ok_or_else(|| WhisperError::Decode("no audio track found".into()))? + .clone(); + + let mut decoder = symphonia::default::get_codecs() + .make(&track.codec_params, &DecoderOptions::default()) + .map_err(|e| WhisperError::Decode(e.to_string()))?; + + let track_id = track.id; + let sample_rate = track.codec_params.sample_rate.unwrap_or(16000); + let channels = track + .codec_params + .channels + .map(|c| c.count()) + .unwrap_or(1); + + let mut raw_samples: Vec = Vec::new(); + + loop { + let packet = match format.next_packet() { + Ok(p) => p, + Err(symphonia::core::errors::Error::IoError(_)) => break, + Err(symphonia::core::errors::Error::ResetRequired) => break, + Err(e) => return Err(WhisperError::Decode(e.to_string())), + }; + + if packet.track_id() != track_id { + continue; + } + + let decoded = decoder + .decode(&packet) + .map_err(|e| WhisperError::Decode(e.to_string()))?; + + // Convert to f32 mono using a sample-converting audio buffer. + use symphonia::core::audio::{AudioBuffer, Signal as _}; + + let mut f32_buf: AudioBuffer = AudioBuffer::new( + decoded.capacity() as u64, + decoded.spec().clone(), + ); + decoded.convert(&mut f32_buf); + + // Mix down to mono. 
+ let frames = f32_buf.frames(); + for frame in 0..frames { + let mut sum = 0f32; + for ch in 0..channels { + sum += f32_buf.chan(ch)[frame]; + } + raw_samples.push(sum / channels as f32); + } + } + + // Resample to 16 kHz if needed. + if sample_rate != 16000 { + raw_samples = resample(raw_samples, sample_rate, 16000); + } + + Ok(raw_samples) + } + + /// Simple linear resampler (good enough for speech; not for music). + fn resample(samples: Vec, from_hz: u32, to_hz: u32) -> Vec { + if from_hz == to_hz { + return samples; + } + let ratio = from_hz as f64 / to_hz as f64; + let out_len = (samples.len() as f64 / ratio) as usize; + let mut out = Vec::with_capacity(out_len); + for i in 0..out_len { + let pos = i as f64 * ratio; + let idx = pos as usize; + let frac = (pos - idx as f64) as f32; + let a = samples.get(idx).copied().unwrap_or(0.0); + let b = samples.get(idx + 1).copied().unwrap_or(0.0); + out.push(a + frac * (b - a)); + } + out + } +} From c85c922d9707da1bb45f2722e3dcf7c0f385584d Mon Sep 17 00:00:00 2001 From: Marenz Date: Sat, 21 Feb 2026 13:20:17 +0100 Subject: [PATCH 2/4] Enable Vulkan GPU backend and Ogg/Opus decode for local Whisper STT --- Cargo.lock | 32 +++++++++++++++++++++++ Cargo.toml | 6 +++-- src/stt.rs | 77 ++++++++++++++++++++++++++++++++++++++++++++++++++++-- 3 files changed, 111 insertions(+), 4 deletions(-) diff --git a/Cargo.lock b/Cargo.lock index a41956207..1846f49df 100644 --- a/Cargo.lock +++ b/Cargo.lock @@ -621,6 +621,17 @@ version = "1.1.2" source = "registry+https://github.com/rust-lang/crates.io-index" checksum = "1505bd5d3d116872e7271a6d4e16d81d0c8570876c8de68093a09ac269d8aac0" +[[package]] +name = "audiopus_sys" +version = "0.2.2" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "62314a1546a2064e033665d658e88c620a62904be945f8147e6b16c3db9f8651" +dependencies = [ + "cmake", + "log", + "pkg-config", +] + [[package]] name = "autocfg" version = "1.5.0" @@ -5800,6 +5811,15 @@ dependencies = [ "web-time", ] 
+[[package]] +name = "ogg" +version = "0.9.2" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "fdab8dcd8d4052eaacaf8fb07a3ccd9a6e26efadb42878a413c68fc4af1dee2b" +dependencies = [ + "byteorder", +] + [[package]] name = "once_cell" version = "1.21.3" @@ -5996,6 +6016,15 @@ version = "0.2.0" source = "registry+https://github.com/rust-lang/crates.io-index" checksum = "04744f49eae99ab78e0d5c0b603ab218f515ea8cfe5a456d7629ad883a3b6e7d" +[[package]] +name = "opus" +version = "0.3.1" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "4d3809943dff6fbad5f0484449ea26bdb9cb7d8efdf26ed50d3c7f227f69eb5c" +dependencies = [ + "audiopus_sys", +] + [[package]] name = "ordered-float" version = "5.1.0" @@ -8032,11 +8061,13 @@ dependencies = [ "mime_guess", "minijinja", "notify", + "ogg", "open", "opentelemetry", "opentelemetry-otlp", "opentelemetry-semantic-conventions", "opentelemetry_sdk", + "opus", "pdf-extract", "pin-project", "prometheus", @@ -10104,6 +10135,7 @@ version = "0.15.1" source = "registry+https://github.com/rust-lang/crates.io-index" checksum = "71ea5d2401f30f51d08126a2d133fee4c1955136519d7ac6cf6f5ac0a91e6bc8" dependencies = [ + "libc", "whisper-rs-sys", ] diff --git a/Cargo.toml b/Cargo.toml index 608fa4ae7..5a2ef8480 100644 --- a/Cargo.toml +++ b/Cargo.toml @@ -137,15 +137,17 @@ tempfile = "3" # Prometheus metrics (optional, behind "metrics" feature) prometheus = { version = "0.13", optional = true } -whisper-rs = { version = "0.15", optional = true } +whisper-rs = { version = "0.15", optional = true, features = ["vulkan"] } hf-hub = { version = "0.5", optional = true } symphonia = { version = "0.5", features = ["mp3", "aac", "flac", "ogg", "wav", "isomp4"], optional = true } +ogg = { version = "0.9", optional = true } +opus = { version = "0.3", optional = true } pdf-extract = "0.10.0" open = "5.3.3" urlencoding = "2.1.3" [features] -stt-whisper = ["dep:whisper-rs", "dep:hf-hub", "dep:symphonia"] +stt-whisper 
= ["dep:whisper-rs", "dep:hf-hub", "dep:symphonia", "dep:ogg", "dep:opus"] metrics = ["dep:prometheus"] [lints.clippy] diff --git a/src/stt.rs b/src/stt.rs index 2e0e5a8cc..d07afa400 100644 --- a/src/stt.rs +++ b/src/stt.rs @@ -165,9 +165,13 @@ mod local { /// Decode arbitrary audio bytes to 16 kHz mono f32 samples for Whisper. /// - /// Uses symphonia so ogg/opus, mp3, flac, wav, etc. all work without manual - /// format detection. + /// Ogg/Opus (Telegram voice messages) is handled directly via the `ogg` + + /// `opus` crates. Everything else falls through to symphonia. fn decode_to_f32(audio: &[u8]) -> Result, WhisperError> { + if is_ogg_opus(audio) { + return decode_ogg_opus(audio); + } + use symphonia::core::codecs::DecoderOptions; use symphonia::core::formats::FormatOptions; use symphonia::core::io::MediaSourceStream; @@ -254,6 +258,75 @@ mod local { Ok(raw_samples) } + /// Check if the audio is an Ogg container with an Opus stream. + fn is_ogg_opus(audio: &[u8]) -> bool { + // OggS capture pattern at offset 0, and OpusHead magic at offset 28 + // (first packet of the first logical stream). + audio.starts_with(b"OggS") && audio.len() > 36 && &audio[28..36] == b"OpusHead" + } + + /// Decode Ogg/Opus audio to 16 kHz mono f32 samples. + fn decode_ogg_opus(audio: &[u8]) -> Result, WhisperError> { + use ogg::reading::PacketReader; + + let cursor = std::io::Cursor::new(audio); + let mut reader = PacketReader::new(cursor); + + // Skip the OpusHead and OpusTags header packets. + let mut header_packets = 0; + let mut decoder: Option = None; + let mut sample_rate = 48000u32; + let mut channels = 1usize; + let mut samples: Vec = Vec::new(); + + while let Ok(Some(packet)) = reader.read_packet() { + if header_packets < 2 { + if header_packets == 0 { + // Parse OpusHead to get channel count and pre-skip. + if packet.data.len() >= 11 && &packet.data[0..8] == b"OpusHead" { + channels = packet.data[9] as usize; + // Output sample rate is always 48000 for libopus. 
+ sample_rate = 48000; + } + decoder = Some( + opus::Decoder::new(sample_rate, if channels == 2 { + opus::Channels::Stereo + } else { + opus::Channels::Mono + }) + .map_err(|e| WhisperError::Decode(e.to_string()))?, + ); + } + header_packets += 1; + continue; + } + + let dec = decoder.as_mut().unwrap(); + // Max Opus frame: 120ms at 48kHz = 5760 samples per channel. + let max_samples = 5760 * channels; + let mut pcm = vec![0f32; max_samples]; + let n = dec + .decode_float(&packet.data, &mut pcm, false) + .map_err(|e| WhisperError::Decode(e.to_string()))?; + + // Mix down to mono. + if channels == 1 { + samples.extend_from_slice(&pcm[..n]); + } else { + for frame in 0..n { + let mut sum = 0f32; + for ch in 0..channels { + sum += pcm[frame * channels + ch]; + } + samples.push(sum / channels as f32); + } + } + } + + // Resample from 48 kHz to 16 kHz. + Ok(resample(samples, sample_rate, 16000)) + } + /// Simple linear resampler (good enough for speech; not for music). fn resample(samples: Vec, from_hz: u32, to_hz: u32) -> Vec { if from_hz == to_hz { From d9b87649ed82ab52b80feefbffd6b334cd9760c0 Mon Sep 17 00:00:00 2001 From: Marenz Date: Sat, 21 Feb 2026 13:36:15 +0100 Subject: [PATCH 3/4] docs: document local Whisper STT backend in README --- README.md | 24 ++++++++++++++++++++++++ src/lib.rs | 4 ++-- src/stt.rs | 29 ++++++++++++----------------- 3 files changed, 38 insertions(+), 19 deletions(-) diff --git a/README.md b/README.md index 8a7bafb1d..33894ae6d 100644 --- a/README.md +++ b/README.md @@ -193,6 +193,30 @@ channel = "my-provider/my-model" Additional built-in providers include **NVIDIA**, **MiniMax**, **Moonshot AI (Kimi)**, and **Z.AI Coding Plan** — configure with `nvidia_key`, `minimax_key`, `moonshot_key`, or `zai_coding_plan_key` in `[llm]`. +### Voice Transcription + +Audio attachments (voice messages, audio files) are transcribed before being passed to the channel. 
Set `routing.voice` to choose the backend: + +**Provider-based** — route through any configured LLM provider that supports audio input: + +```toml +[defaults.routing] +voice = "openai/whisper-1" +``` + +**Local Whisper** (`stt-whisper` feature, requires `--features stt-whisper` at build time) — run inference locally via [whisper-rs](https://codeberg.org/tazz4843/whisper-rs), no API call needed: + +```toml +[defaults.routing] +voice = "whisper-local://small" +``` + +The model is downloaded automatically from [`ggerganov/whisper.cpp`](https://huggingface.co/ggerganov/whisper.cpp) on first use and cached in `~/.cache/huggingface/hub`. Supported size names: `tiny`, `tiny.en`, `base`, `base.en`, `small`, `small.en`, `medium`, `medium.en`, `large`, `large-v1`, `large-v2`, `large-v3`. An absolute path to a GGML model file also works. + +GPU acceleration via Vulkan is enabled automatically when a compatible device is detected. The loaded model is cached for the process lifetime — restart to switch models. + +Ogg/Opus audio (Telegram voice messages) is decoded natively. All other formats are handled via symphonia. 
+ ### Skills Extensible skill system integrated with [skills.sh](https://skills.sh): diff --git a/src/lib.rs b/src/lib.rs index 9abc71594..021c0e80d 100644 --- a/src/lib.rs +++ b/src/lib.rs @@ -20,11 +20,11 @@ pub mod prompts; pub mod secrets; pub mod settings; pub mod skills; +#[cfg(feature = "stt-whisper")] +pub mod stt; #[cfg(feature = "metrics")] pub mod telemetry; pub mod tools; -#[cfg(feature = "stt-whisper")] -pub mod stt; pub mod update; pub use error::{Error, Result}; diff --git a/src/stt.rs b/src/stt.rs index d07afa400..14860f11e 100644 --- a/src/stt.rs +++ b/src/stt.rs @@ -194,9 +194,7 @@ mod local { let track = format .tracks() .iter() - .find(|t| { - t.codec_params.codec != symphonia::core::codecs::CODEC_TYPE_NULL - }) + .find(|t| t.codec_params.codec != symphonia::core::codecs::CODEC_TYPE_NULL) .ok_or_else(|| WhisperError::Decode("no audio track found".into()))? .clone(); @@ -206,11 +204,7 @@ mod local { let track_id = track.id; let sample_rate = track.codec_params.sample_rate.unwrap_or(16000); - let channels = track - .codec_params - .channels - .map(|c| c.count()) - .unwrap_or(1); + let channels = track.codec_params.channels.map(|c| c.count()).unwrap_or(1); let mut raw_samples: Vec = Vec::new(); @@ -233,10 +227,8 @@ mod local { // Convert to f32 mono using a sample-converting audio buffer. use symphonia::core::audio::{AudioBuffer, Signal as _}; - let mut f32_buf: AudioBuffer = AudioBuffer::new( - decoded.capacity() as u64, - decoded.spec().clone(), - ); + let mut f32_buf: AudioBuffer = + AudioBuffer::new(decoded.capacity() as u64, decoded.spec().clone()); decoded.convert(&mut f32_buf); // Mix down to mono. 
@@ -289,11 +281,14 @@ mod local { sample_rate = 48000; } decoder = Some( - opus::Decoder::new(sample_rate, if channels == 2 { - opus::Channels::Stereo - } else { - opus::Channels::Mono - }) + opus::Decoder::new( + sample_rate, + if channels == 2 { + opus::Channels::Stereo + } else { + opus::Channels::Mono + }, + ) .map_err(|e| WhisperError::Decode(e.to_string()))?, ); } From 37d9e0d93e4a8c8bacc97d583b8eb607166e659d Mon Sep 17 00:00:00 2001 From: Marenz Date: Mon, 23 Feb 2026 22:01:14 +0100 Subject: [PATCH 4/4] feat(stt): add transcribe_audio worker tool and unify STT dispatch MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit Workers can now call transcribe_audio(path) to transcribe local audio files. The tool uses whatever is configured in routing.voice — local Whisper (whisper-local://) or any OpenAI-compatible HTTP provider. The transcription logic is extracted from channel.rs into stt.rs as transcribe_bytes(), shared by both the channel attachment handler and the new tool. The stt module is now always compiled (not gated on stt-whisper) since it handles all provider paths. --- .../tools/transcribe_audio_description.md.j2 | 1 + src/agent/channel.rs | 225 ++---------------- src/agent/worker.rs | 10 +- src/lib.rs | 1 - src/prompts/text.rs | 3 + src/stt.rs | 188 ++++++++++++++- src/tools.rs | 16 ++ src/tools/transcribe_audio.rs | 117 +++++++++ 8 files changed, 348 insertions(+), 213 deletions(-) create mode 100644 prompts/en/tools/transcribe_audio_description.md.j2 create mode 100644 src/tools/transcribe_audio.rs diff --git a/prompts/en/tools/transcribe_audio_description.md.j2 b/prompts/en/tools/transcribe_audio_description.md.j2 new file mode 100644 index 000000000..b00515fba --- /dev/null +++ b/prompts/en/tools/transcribe_audio_description.md.j2 @@ -0,0 +1 @@ +Transcribe an audio file to text using local speech-to-text. Provide the path to the audio file. Supports ogg, opus, mp3, flac, wav, and m4a formats. 
Use this instead of external whisper CLI tools. diff --git a/src/agent/channel.rs b/src/agent/channel.rs index a091f10d2..9f4b617f7 100644 --- a/src/agent/channel.rs +++ b/src/agent/channel.rs @@ -4,7 +4,7 @@ use crate::agent::branch::Branch; use crate::agent::compactor::Compactor; use crate::agent::status::StatusBlock; use crate::agent::worker::Worker; -use crate::config::ApiType; + use crate::conversation::{ChannelStore, ConversationLogger, ProcessRunLogger}; use crate::error::{AgentError, Result}; use crate::hooks::SpacebotHook; @@ -1956,221 +1956,34 @@ async fn transcribe_audio_attachment( ); let routing = deps.runtime_config.routing.load(); - let voice_model = routing.voice.trim(); - if voice_model.is_empty() { - return UserContent::text(format!( - "[Audio attachment received but no voice model is configured in routing.voice: {}]", - attachment.filename - )); - } + let voice_model = routing.voice.clone(); - // Local Whisper backend — bypass the LLM provider path entirely. - #[cfg(feature = "stt-whisper")] - if let Some(model_spec) = voice_model.strip_prefix("whisper-local://") { - let transcript = match crate::stt::transcribe(model_spec, &bytes).await { - Ok(text) if text.is_empty() => { - tracing::warn!(filename = %attachment.filename, "local Whisper returned empty transcript"); - return UserContent::text(format!( - "[Audio transcription returned empty text for {}]", - attachment.filename - )); - } - Ok(text) => text, - Err(error) => { - tracing::warn!(%error, filename = %attachment.filename, "local Whisper transcription failed"); - return UserContent::text(format!( - "[Audio transcription failed for {}: {}]", - attachment.filename, error - )); - } - }; - return UserContent::text(format!( + match crate::stt::transcribe_bytes(&voice_model, &bytes, &attachment.mime_type, &deps.llm_manager, http).await { + Ok(transcript) => UserContent::text(format!( "\n{}\n", attachment.filename, attachment.mime_type, transcript - )); - } - - let (provider_id, model_name) = 
match deps.llm_manager.resolve_model(voice_model) { - Ok(parts) => parts, - Err(error) => { - tracing::warn!(%error, model = %voice_model, "invalid voice model route"); - return UserContent::text(format!( - "[Audio transcription failed for {}: invalid voice model '{}']", - attachment.filename, voice_model - )); - } - }; - - let provider = match deps.llm_manager.get_provider(&provider_id) { - Ok(provider) => provider, - Err(error) => { - tracing::warn!(%error, provider = %provider_id, "voice provider not configured"); - return UserContent::text(format!( - "[Audio transcription failed for {}: provider '{}' is not configured]", - attachment.filename, provider_id - )); - } - }; - - if provider.api_type == ApiType::Anthropic { - return UserContent::text(format!( - "[Audio transcription failed for {}: provider '{}' does not support input_audio on this endpoint]", - attachment.filename, provider_id - )); - } - - let format = audio_format_for_attachment(attachment); - use base64::Engine as _; - let base64_audio = base64::engine::general_purpose::STANDARD.encode(&bytes); - - let endpoint = format!( - "{}/v1/chat/completions", - provider.base_url.trim_end_matches('/') - ); - let body = serde_json::json!({ - "model": model_name, - "messages": [{ - "role": "user", - "content": [ - { - "type": "text", - "text": "Transcribe this audio verbatim. Return only the transcription text." 
- }, - { - "type": "input_audio", - "input_audio": { - "data": base64_audio, - "format": format, - } - } - ] - }], - "temperature": 0 - }); - - let response = match http - .post(&endpoint) - .header("authorization", format!("Bearer {}", provider.api_key)) - .header("content-type", "application/json") - .json(&body) - .send() - .await - { - Ok(response) => response, - Err(error) => { - tracing::warn!(%error, model = %voice_model, "voice transcription request failed"); - return UserContent::text(format!( - "[Audio transcription failed for {}]", + )), + Err(crate::stt::SttError::NotConfigured) => UserContent::text(format!( + "[Audio attachment received but no voice model is configured in routing.voice: {}]", + attachment.filename + )), + Err(crate::stt::SttError::EmptyResult) => { + tracing::warn!(filename = %attachment.filename, "transcription returned empty text"); + UserContent::text(format!( + "[Audio transcription returned empty text for {}]", attachment.filename - )); + )) } - }; - - let status = response.status(); - let response_body = match response.json::().await { - Ok(body) => body, Err(error) => { - tracing::warn!(%error, model = %voice_model, "invalid transcription response"); - return UserContent::text(format!( - "[Audio transcription failed for {}]", - attachment.filename - )); + tracing::warn!(%error, filename = %attachment.filename, "audio transcription failed"); + UserContent::text(format!( + "[Audio transcription failed for {}: {}]", + attachment.filename, error + )) } - }; - - if !status.is_success() { - let message = response_body["error"]["message"] - .as_str() - .unwrap_or("unknown error"); - tracing::warn!( - status = %status, - model = %voice_model, - error = %message, - "voice transcription provider returned error" - ); - return UserContent::text(format!( - "[Audio transcription failed for {}: {}]", - attachment.filename, message - )); - } - - let transcript = extract_transcript_text(&response_body); - if transcript.is_empty() { - 
tracing::warn!(model = %voice_model, "empty transcription returned"); - return UserContent::text(format!( - "[Audio transcription returned empty text for {}]", - attachment.filename - )); - } - - UserContent::text(format!( - "\n{}\n", - attachment.filename, attachment.mime_type, transcript - )) -} - -fn audio_format_for_attachment(attachment: &crate::Attachment) -> &'static str { - let mime = attachment.mime_type.to_lowercase(); - if mime.contains("mpeg") || mime.contains("mp3") { - return "mp3"; - } - if mime.contains("wav") { - return "wav"; - } - if mime.contains("flac") { - return "flac"; - } - if mime.contains("aac") { - return "aac"; - } - if mime.contains("ogg") { - return "ogg"; - } - if mime.contains("mp4") || mime.contains("m4a") { - return "m4a"; - } - - match attachment - .filename - .rsplit('.') - .next() - .unwrap_or_default() - .to_lowercase() - .as_str() - { - "mp3" => "mp3", - "wav" => "wav", - "flac" => "flac", - "aac" => "aac", - "m4a" | "mp4" => "m4a", - "oga" | "ogg" => "ogg", - _ => "ogg", } } -fn extract_transcript_text(body: &serde_json::Value) -> String { - if let Some(text) = body["choices"][0]["message"]["content"].as_str() { - return text.trim().to_string(); - } - - let Some(parts) = body["choices"][0]["message"]["content"].as_array() else { - return String::new(); - }; - - parts - .iter() - .filter_map(|part| { - if part["type"].as_str() == Some("text") { - part["text"].as_str().map(str::trim) - } else { - None - } - }) - .filter(|text| !text.is_empty()) - .collect::>() - .join("\n") -} - /// Download a text attachment and inline its content for the LLM. async fn download_text_attachment( http: &reqwest::Client, diff --git a/src/agent/worker.rs b/src/agent/worker.rs index ae369ad70..5b7d8fcc1 100644 --- a/src/agent/worker.rs +++ b/src/agent/worker.rs @@ -1,5 +1,7 @@ //! Worker: Independent task execution process. 
+use std::sync::Arc; + use crate::agent::compactor::estimate_history_tokens; use crate::config::BrowserConfig; use crate::error::Result; @@ -193,6 +195,9 @@ impl Worker { let mcp_tools = self.deps.mcp_manager.get_tools().await; // Create per-worker ToolServer with task tools + let routing = self.deps.runtime_config.routing.load(); + let voice_model = routing.voice.clone(); + let worker_tool_server = crate::tools::create_worker_tool_server( self.deps.agent_id.clone(), self.id, @@ -204,9 +209,10 @@ impl Worker { self.deps.runtime_config.workspace_dir.clone(), self.deps.runtime_config.instance_dir.clone(), mcp_tools, + voice_model, + Arc::clone(&self.deps.llm_manager), + self.deps.llm_manager.http_client().clone(), ); - - let routing = self.deps.runtime_config.routing.load(); let model_name = routing.resolve(ProcessType::Worker, None).to_string(); let model = SpacebotModel::make(&self.deps.llm_manager, &model_name) .with_context(&*self.deps.agent_id, "worker") diff --git a/src/lib.rs b/src/lib.rs index 021c0e80d..4844f0550 100644 --- a/src/lib.rs +++ b/src/lib.rs @@ -20,7 +20,6 @@ pub mod prompts; pub mod secrets; pub mod settings; pub mod skills; -#[cfg(feature = "stt-whisper")] pub mod stt; #[cfg(feature = "metrics")] pub mod telemetry; diff --git a/src/prompts/text.rs b/src/prompts/text.rs index d9614adae..7840882e1 100644 --- a/src/prompts/text.rs +++ b/src/prompts/text.rs @@ -158,6 +158,9 @@ fn lookup(lang: &str, key: &str) -> &'static str { ("en", "tools/send_message_to_another_channel") => { include_str!("../../prompts/en/tools/send_message_description.md.j2") } + ("en", "tools/transcribe_audio") => { + include_str!("../../prompts/en/tools/transcribe_audio_description.md.j2") + } // Fallback: unknown language or key -> try English (lang, key) if lang != "en" => { diff --git a/src/stt.rs b/src/stt.rs index 14860f11e..735fc5bdf 100644 --- a/src/stt.rs +++ b/src/stt.rs @@ -1,12 +1,192 @@ -//! Local Whisper speech-to-text via whisper-rs. +//! 
Speech-to-text transcription. //! -//! Only compiled when the `stt-whisper` feature is enabled. -//! Exposed as a single async `transcribe` function that lazily loads and caches -//! the model context for the lifetime of the process. +//! Provides a unified `transcribe_bytes` function that dispatches to either: +//! - The local Whisper backend (`whisper-local://`) when the `stt-whisper` +//! feature is enabled. +//! - An OpenAI-compatible HTTP provider (anything else) via `input_audio`. + +use crate::llm::manager::LlmManager; +use crate::config::ApiType; #[cfg(feature = "stt-whisper")] pub use local::transcribe; +/// Unified error type for all STT backends. +#[derive(Debug, thiserror::Error)] +pub enum SttError { + #[error("no voice model configured in routing.voice")] + NotConfigured, + #[error("local Whisper STT is not available in this build")] + WhisperNotBuilt, + #[error("whisper error: {0}")] + #[cfg(feature = "stt-whisper")] + Whisper(#[from] local::WhisperError), + #[error("provider '{0}' is not configured")] + ProviderNotConfigured(String), + #[error("provider '{0}' does not support audio transcription on this endpoint")] + ProviderUnsupported(String), + #[error("invalid voice model spec '{0}': {1}")] + InvalidModel(String, String), + #[error("transcription request failed: {0}")] + Http(String), + #[error("transcription returned empty result")] + EmptyResult, +} + +/// Transcribe raw audio bytes using the configured voice model. +/// +/// `voice_model` is the full value from `routing.voice`, e.g.: +/// - `"whisper-local://small"` — local Whisper +/// - `"openai/whisper-1"` — OpenAI-compatible HTTP provider +/// +/// `mime_type` is used to set the audio format hint for HTTP providers. 
+///
+/// # Errors
+///
+/// - [`SttError::NotConfigured`] when `voice_model` is empty after trimming.
+/// - [`SttError::WhisperNotBuilt`] when a `whisper-local://` spec is used in a
+///   build without the `stt-whisper` feature.
+/// - [`SttError::InvalidModel`], [`SttError::ProviderNotConfigured`] or
+///   [`SttError::ProviderUnsupported`] when the HTTP route cannot be used.
+/// - [`SttError::Http`] when the request fails or the provider reports an error.
+/// - [`SttError::EmptyResult`] when either backend yields no text.
+pub async fn transcribe_bytes(
+    voice_model: &str,
+    audio: &[u8],
+    mime_type: &str,
+    llm_manager: &LlmManager,
+    http: &reqwest::Client,
+) -> Result<String, SttError> {
+    let voice_model = voice_model.trim();
+    if voice_model.is_empty() {
+        return Err(SttError::NotConfigured);
+    }
+
+    // Local Whisper backend.
+    if let Some(model_spec) = voice_model.strip_prefix("whisper-local://") {
+        #[cfg(feature = "stt-whisper")]
+        {
+            let transcript = local::transcribe(model_spec, audio)
+                .await
+                .map_err(SttError::Whisper)?;
+            // Keep both backends consistent: an empty transcript is an error
+            // here too, so callers never receive `Ok("")` (the HTTP path below
+            // already reports `EmptyResult`).
+            if transcript.trim().is_empty() {
+                return Err(SttError::EmptyResult);
+            }
+            return Ok(transcript);
+        }
+        #[cfg(not(feature = "stt-whisper"))]
+        {
+            // Silence unused-variable warnings in builds without the feature.
+            let _ = (model_spec, audio);
+            return Err(SttError::WhisperNotBuilt);
+        }
+    }
+
+    // HTTP provider path.
+    let (provider_id, model_name) = llm_manager
+        .resolve_model(voice_model)
+        .map_err(|e| SttError::InvalidModel(voice_model.to_string(), e.to_string()))?;
+
+    let provider = llm_manager
+        .get_provider(&provider_id)
+        .map_err(|_| SttError::ProviderNotConfigured(provider_id.clone()))?;
+
+    // The request below uses the OpenAI-style `input_audio` content part,
+    // which is rejected up front for Anthropic-type providers.
+    if provider.api_type == ApiType::Anthropic {
+        return Err(SttError::ProviderUnsupported(provider_id));
+    }
+
+    let format = audio_format_for_mime(mime_type);
+    use base64::Engine as _;
+    let base64_audio = base64::engine::general_purpose::STANDARD.encode(audio);
+
+    let endpoint = format!(
+        "{}/v1/chat/completions",
+        provider.base_url.trim_end_matches('/')
+    );
+    let body = serde_json::json!({
+        "model": model_name,
+        "messages": [{
+            "role": "user",
+            "content": [
+                {
+                    "type": "text",
+                    "text": "Transcribe this audio verbatim. Return only the transcription text."
+                },
+                {
+                    "type": "input_audio",
+                    "input_audio": {
+                        "data": base64_audio,
+                        "format": format,
+                    }
+                }
+            ]
+        }],
+        "temperature": 0
+    });
+
+    let response = http
+        .post(&endpoint)
+        .header("authorization", format!("Bearer {}", provider.api_key))
+        .header("content-type", "application/json")
+        .json(&body)
+        .send()
+        .await
+        .map_err(|e| SttError::Http(e.to_string()))?;
+
+    let status = response.status();
+    let response_body = match response.json::<serde_json::Value>().await {
+        Ok(body) => body,
+        // An error response whose body is not JSON (e.g. a gateway error page)
+        // should surface the HTTP status, not a JSON decode error.
+        Err(_) if !status.is_success() => {
+            return Err(SttError::Http(format!("{status}: unknown error")));
+        }
+        Err(e) => return Err(SttError::Http(e.to_string())),
+    };
+
+    if !status.is_success() {
+        let message = response_body["error"]["message"]
+            .as_str()
+            .unwrap_or("unknown error");
+        return Err(SttError::Http(format!("{status}: {message}")));
+    }
+
+    let transcript = extract_transcript_text(&response_body);
+    if transcript.is_empty() {
+        return Err(SttError::EmptyResult);
+    }
+
+    Ok(transcript)
+}
+
+/// Infer the audio format string from a MIME type.
+///
+/// Matching is case-insensitive and substring-based, checked in the order
+/// below, so the first matching family wins. Anything unrecognised (including
+/// `audio/opus` and an empty string) falls back to `"ogg"`.
+pub fn audio_format_for_mime(mime_type: &str) -> &'static str {
+    let mime = mime_type.to_lowercase();
+    if mime.contains("mpeg") || mime.contains("mp3") {
+        return "mp3";
+    }
+    if mime.contains("wav") {
+        return "wav";
+    }
+    if mime.contains("flac") {
+        return "flac";
+    }
+    if mime.contains("aac") {
+        return "aac";
+    }
+    if mime.contains("ogg") {
+        return "ogg";
+    }
+    if mime.contains("mp4") || mime.contains("m4a") {
+        return "m4a";
+    }
+    "ogg"
+}
+
+/// Extract the transcript text from an OpenAI-compatible chat completion response.
+fn extract_transcript_text(body: &serde_json::Value) -> String { + if let Some(text) = body["choices"][0]["message"]["content"].as_str() { + return text.trim().to_string(); + } + + let Some(parts) = body["choices"][0]["message"]["content"].as_array() else { + return String::new(); + }; + + parts + .iter() + .filter_map(|part| { + if part["type"].as_str() == Some("text") { + part["text"].as_str().map(str::trim) + } else { + None + } + }) + .filter(|text| !text.is_empty()) + .collect::>() + .join("\n") +} + #[cfg(feature = "stt-whisper")] mod local { use std::sync::OnceLock; diff --git a/src/tools.rs b/src/tools.rs index d4caa2c69..3e3bb3dcb 100644 --- a/src/tools.rs +++ b/src/tools.rs @@ -41,6 +41,7 @@ pub mod set_status; pub mod shell; pub mod skip; pub mod spawn_worker; +pub mod transcribe_audio; pub mod web_search; pub use branch_tool::{BranchArgs, BranchError, BranchOutput, BranchTool}; @@ -76,10 +77,14 @@ pub use set_status::{SetStatusArgs, SetStatusError, SetStatusOutput, SetStatusTo pub use shell::{ShellArgs, ShellError, ShellOutput, ShellResult, ShellTool}; pub use skip::{SkipArgs, SkipError, SkipFlag, SkipOutput, SkipTool, new_skip_flag}; pub use spawn_worker::{SpawnWorkerArgs, SpawnWorkerError, SpawnWorkerOutput, SpawnWorkerTool}; +pub use transcribe_audio::{ + TranscribeAudioArgs, TranscribeAudioError, TranscribeAudioOutput, TranscribeAudioTool, +}; pub use web_search::{SearchResult, WebSearchArgs, WebSearchError, WebSearchOutput, WebSearchTool}; use crate::agent::channel::ChannelState; use crate::config::BrowserConfig; +use crate::llm::manager::LlmManager; use crate::memory::MemorySearch; use crate::{AgentId, ChannelId, OutboundResponse, ProcessEvent, WorkerId}; use rig::tool::Tool as _; @@ -272,6 +277,9 @@ pub fn create_worker_tool_server( workspace: PathBuf, instance_dir: PathBuf, mcp_tools: Vec, + voice_model: String, + llm_manager: Arc, + http: reqwest::Client, ) -> ToolServerHandle { let mut server = ToolServer::new() 
.tool(ShellTool::new(instance_dir.clone(), workspace.clone())) @@ -281,6 +289,14 @@ pub fn create_worker_tool_server( agent_id, worker_id, channel_id, event_tx, )); + if !voice_model.is_empty() { + server = server.tool(TranscribeAudioTool::new( + voice_model, + llm_manager, + http, + )); + } + if browser_config.enabled { server = server.tool(BrowserTool::new(browser_config, screenshot_dir)); } diff --git a/src/tools/transcribe_audio.rs b/src/tools/transcribe_audio.rs new file mode 100644 index 000000000..f9b1194a8 --- /dev/null +++ b/src/tools/transcribe_audio.rs @@ -0,0 +1,117 @@ +//! Transcribe audio tool for workers. +//! +//! Allows workers to transcribe audio files using whatever STT backend is +//! configured in `routing.voice` — local Whisper or an HTTP provider. + +use std::sync::Arc; + +use rig::completion::ToolDefinition; +use rig::tool::Tool; +use schemars::JsonSchema; +use serde::{Deserialize, Serialize}; + +use crate::llm::manager::LlmManager; + +/// Tool for transcribing audio files to text. +#[derive(Clone)] +pub struct TranscribeAudioTool { + /// The configured voice model spec (full `routing.voice` value). + voice_model: String, + llm_manager: Arc, + http: reqwest::Client, +} + +impl TranscribeAudioTool { + /// Create a new transcribe audio tool. + pub fn new( + voice_model: impl Into, + llm_manager: Arc, + http: reqwest::Client, + ) -> Self { + Self { + voice_model: voice_model.into(), + llm_manager, + http, + } + } +} + +/// Error type for transcribe audio tool. +#[derive(Debug, thiserror::Error)] +#[error("Audio transcription failed: {0}")] +pub struct TranscribeAudioError(String); + +/// Arguments for transcribe audio tool. +#[derive(Debug, Deserialize, JsonSchema)] +pub struct TranscribeAudioArgs { + /// Path to the audio file to transcribe (absolute or relative to the workspace). + /// Supports ogg, opus, mp3, flac, wav, m4a. + pub path: String, +} + +/// Output from transcribe audio tool. 
+#[derive(Debug, Serialize)]
+pub struct TranscribeAudioOutput {
+    /// The transcribed text.
+    pub transcript: String,
+}
+
+impl Tool for TranscribeAudioTool {
+    const NAME: &'static str = "transcribe_audio";
+
+    type Error = TranscribeAudioError;
+    type Args = TranscribeAudioArgs;
+    type Output = TranscribeAudioOutput;
+
+    /// Describe the tool: its name, the prompt text registered under
+    /// `tools/transcribe_audio`, and a schema with a single required `path`.
+    async fn definition(&self, _prompt: String) -> ToolDefinition {
+        ToolDefinition {
+            name: Self::NAME.to_string(),
+            description: crate::prompts::text::get("tools/transcribe_audio").to_string(),
+            parameters: serde_json::json!({
+                "type": "object",
+                "properties": {
+                    "path": {
+                        "type": "string",
+                        "description": "Path to the audio file to transcribe (absolute or relative to the workspace). Supports ogg, opus, mp3, flac, wav, m4a."
+                    }
+                },
+                "required": ["path"]
+            }),
+        }
+    }
+
+    /// Read the file at `args.path` and transcribe it with the configured
+    /// voice model.
+    ///
+    /// Fails with [`TranscribeAudioError`] when the file cannot be read or the
+    /// STT backend returns an error.
+    async fn call(&self, args: Self::Args) -> Result<Self::Output, Self::Error> {
+        // NOTE(review): the path is passed to the filesystem as-is, so a
+        // relative path resolves against the process working directory. The
+        // schema text says "relative to the workspace" — confirm the two agree.
+        let audio = tokio::fs::read(&args.path)
+            .await
+            .map_err(|e| TranscribeAudioError(format!("failed to read {}: {}", args.path, e)))?;
+
+        // Infer mime type from file extension for the HTTP provider path.
+        let mime_type = mime_from_path(&args.path);
+
+        let transcript =
+            crate::stt::transcribe_bytes(&self.voice_model, &audio, mime_type, &self.llm_manager, &self.http)
+                .await
+                .map_err(|e| TranscribeAudioError(e.to_string()))?;
+
+        Ok(TranscribeAudioOutput { transcript })
+    }
+}
+
+/// Infer a MIME type string from a file path extension.
+///
+/// Only the extension of the final path component is considered, compared
+/// case-insensitively. A missing or unrecognised extension (including `ogg`
+/// and `oga`) yields `"audio/ogg"`.
+fn mime_from_path(path: &str) -> &'static str {
+    // `Path::extension` ignores dots in directory names and returns `None`
+    // for names without an extension, unlike splitting the raw string on '.'.
+    let extension = std::path::Path::new(path)
+        .extension()
+        .and_then(|ext| ext.to_str())
+        .map(str::to_lowercase);
+
+    match extension.as_deref() {
+        Some("mp3") => "audio/mpeg",
+        Some("wav") => "audio/wav",
+        Some("flac") => "audio/flac",
+        Some("aac") => "audio/aac",
+        Some("m4a" | "mp4") => "audio/mp4",
+        Some("opus") => "audio/opus",
+        _ => "audio/ogg",
+    }
+}