diff --git a/src/discord.rs b/src/discord.rs index ea02b7c0..41f12953 100644 --- a/src/discord.rs +++ b/src/discord.rs @@ -164,7 +164,7 @@ impl EventHandler for Handler { .filter(|(mid, _)| **mid < msg.id) .map(|(_, m)| m.clone()) .collect(); - recent.sort_unstable_by(|a, b| b.id.cmp(&a.id)); + recent.sort_unstable_by_key(|m| std::cmp::Reverse(m.id)); recent.truncate(cap); recent }) @@ -263,8 +263,13 @@ impl EventHandler for Handler { is_bot: msg.author.bot, }; - // Build extra content blocks from attachments (images, audio) + // Build extra content blocks from attachments (audio → STT, text → inline, image → encode) let mut extra_blocks = Vec::new(); + let mut text_file_bytes: u64 = 0; + let mut text_file_count: u32 = 0; + const TEXT_TOTAL_CAP: u64 = 1024 * 1024; // 1 MB total for all text file attachments + const TEXT_FILE_COUNT_CAP: u32 = 5; + for attachment in &msg.attachments { let mime = attachment.content_type.as_deref().unwrap_or(""); if media::is_audio_mime(mime) { @@ -288,6 +293,28 @@ impl EventHandler for Handler { let msg_ref = discord_msg_ref(&msg); let _ = adapter.add_reaction(&msg_ref, "🎤").await; } + } else if media::is_text_file(&attachment.filename, attachment.content_type.as_deref()) { + if text_file_count >= TEXT_FILE_COUNT_CAP { + tracing::warn!(filename = %attachment.filename, count = text_file_count, "text file count cap reached, skipping"); + continue; + } + // Pre-check with Discord-reported size (fast path, avoids unnecessary download). + // Running total uses actual downloaded bytes for accurate accounting. 
+ if text_file_bytes + u64::from(attachment.size) > TEXT_TOTAL_CAP { + tracing::warn!(filename = %attachment.filename, total = text_file_bytes, "text attachments total exceeds 1MB cap, skipping remaining"); + continue; + } + if let Some((block, actual_bytes)) = media::download_and_read_text_file( + &attachment.url, + &attachment.filename, + u64::from(attachment.size), + None, + ).await { + text_file_bytes += actual_bytes; + text_file_count += 1; + debug!(filename = %attachment.filename, "adding text file attachment"); + extra_blocks.push(block); + } } else if let Some(block) = media::download_and_encode_image( &attachment.url, attachment.content_type.as_deref(), diff --git a/src/media.rs b/src/media.rs index 709f7885..5e0c057f 100644 --- a/src/media.rs +++ b/src/media.rs @@ -182,6 +182,114 @@ pub fn is_audio_mime(mime: &str) -> bool { mime.starts_with("audio/") } +/// Extensions recognised as text-based files that can be inlined into the prompt. +const TEXT_EXTENSIONS: &[&str] = &[ + "txt", "csv", "log", "md", "json", "jsonl", "yaml", "yml", "toml", "xml", + "rs", "py", "js", "ts", "jsx", "tsx", "go", "java", "c", "cpp", "h", "hpp", + "rb", "sh", "bash", "zsh", "fish", "ps1", "bat", "sql", "html", "css", + "scss", "less", "ini", "cfg", "conf", "env", +]; + +/// Exact filenames (no extension) recognised as text files. +const TEXT_FILENAMES: &[&str] = &[ + "dockerfile", "makefile", "justfile", "rakefile", "gemfile", + "procfile", "vagrantfile", ".gitignore", ".dockerignore", ".editorconfig", +]; + +/// MIME types recognised as text-based (beyond `text/*`). +const TEXT_MIME_TYPES: &[&str] = &[ + "application/json", + "application/xml", + "application/javascript", + "application/x-yaml", + "application/x-sh", + "application/toml", + "application/x-toml", +]; + +/// Check if a file is text-based and can be inlined into the prompt. 
+pub fn is_text_file(filename: &str, content_type: Option<&str>) -> bool { + let mime = content_type.unwrap_or(""); + let mime_base = mime.split(';').next().unwrap_or(mime).trim(); + if mime_base.starts_with("text/") || TEXT_MIME_TYPES.contains(&mime_base) { + return true; + } + // Check extension + if filename.contains('.') { + if let Some(ext) = filename.rsplit('.').next() { + if TEXT_EXTENSIONS.contains(&ext.to_lowercase().as_str()) { + return true; + } + } + } + // Check exact filename (Dockerfile, Makefile, etc.) + TEXT_FILENAMES.contains(&filename.to_lowercase().as_str()) +} + +/// Download a text-based file and return it as a ContentBlock::Text. +/// Files larger than 512 KB are skipped to avoid bloating the prompt. +/// +/// Pass `auth_token` for platforms that require authentication (e.g. Slack private files). +/// +/// Note: the caller already guards total size via a total cap; the per-file +/// MAX_SIZE check here is intentional defense-in-depth so this function remains +/// self-contained and safe when called from other contexts. 
///
/// Returns `Some((block, actual_bytes))` on success, where `block` wraps the
/// file content in a Markdown code fence headed by `[File: <filename>]` and
/// `actual_bytes` is the number of bytes actually downloaded (which may differ
/// from the advertised `size`). Returns `None` — after logging a warning —
/// when the file is over the limit, the request fails, the status is not a
/// success, or the body cannot be read.
pub async fn download_and_read_text_file(
    url: &str,
    filename: &str,
    size: u64,
    auth_token: Option<&str>,
) -> Option<(ContentBlock, u64)> {
    const MAX_SIZE: u64 = 512 * 1024; // 512 KB

    // Cheap rejection on the size the platform reported; nothing is fetched.
    if size > MAX_SIZE {
        tracing::warn!(filename, size, "text file exceeds 512KB limit, skipping");
        return None;
    }

    let mut req = HTTP_CLIENT.get(url);
    if let Some(token) = auth_token {
        req = req.header("Authorization", format!("Bearer {token}"));
    }

    let resp = match req.send().await {
        Ok(r) => r,
        Err(e) => {
            tracing::warn!(url, error = %e, "text file download failed");
            return None;
        }
    };
    if !resp.status().is_success() {
        tracing::warn!(url, status = %resp.status(), "text file download failed");
        return None;
    }

    // Reject an over-limit body before buffering it, when the length is known.
    // NOTE(review): assumes HTTP_CLIENT is a reqwest client, whose
    // `Response::content_length` returns `Option<u64>` — confirm.
    if let Some(declared) = resp.content_length() {
        if declared > MAX_SIZE {
            tracing::warn!(
                filename,
                size = declared,
                "text file Content-Length exceeds 512KB limit, skipping"
            );
            return None;
        }
    }

    // Log body-read failures like every other failure path instead of
    // silently returning `None`.
    let bytes = match resp.bytes().await {
        Ok(b) => b,
        Err(e) => {
            tracing::warn!(url, error = %e, "text file download failed");
            return None;
        }
    };
    let actual_size = bytes.len() as u64;

    // Defense-in-depth: the advertised sizes may be absent or wrong, so check
    // what actually arrived.
    if actual_size > MAX_SIZE {
        tracing::warn!(filename, size = actual_size, "downloaded text file exceeds 512KB limit, skipping");
        return None;
    }

    // Keep the `Cow`: valid UTF-8 is borrowed straight from `bytes` (no copy);
    // only invalid input allocates, with bad sequences replaced by U+FFFD.
    let text = String::from_utf8_lossy(&bytes);

    // Dynamic fence: grow the backtick run until it no longer occurs in the
    // content, so the file cannot close the fence early.
    let mut fence = "```".to_string();
    while text.contains(fence.as_str()) {
        fence.push('`');
    }

    debug!(filename, bytes = text.len(), "text file inlined");
    Some((
        ContentBlock::Text {
            text: format!("[File: {filename}]\n{fence}\n{text}\n{fence}"),
        },
        actual_size,
    ))
}

#[cfg(test)]
mod tests {
    use super::*;