Skip to content
31 changes: 29 additions & 2 deletions src/discord.rs
Original file line number Diff line number Diff line change
Expand Up @@ -164,7 +164,7 @@ impl EventHandler for Handler {
.filter(|(mid, _)| **mid < msg.id)
.map(|(_, m)| m.clone())
.collect();
recent.sort_unstable_by(|a, b| b.id.cmp(&a.id));
recent.sort_unstable_by_key(|m| std::cmp::Reverse(m.id));
recent.truncate(cap);
recent
})
Expand Down Expand Up @@ -263,8 +263,13 @@ impl EventHandler for Handler {
is_bot: msg.author.bot,
};

// Build extra content blocks from attachments (images, audio)
// Build extra content blocks from attachments (audio → STT, text → inline, image → encode)
let mut extra_blocks = Vec::new();
let mut text_file_bytes: u64 = 0;
let mut text_file_count: u32 = 0;
const TEXT_TOTAL_CAP: u64 = 1024 * 1024; // 1 MB total for all text file attachments
const TEXT_FILE_COUNT_CAP: u32 = 5;

for attachment in &msg.attachments {
let mime = attachment.content_type.as_deref().unwrap_or("");
if media::is_audio_mime(mime) {
Expand All @@ -288,6 +293,28 @@ impl EventHandler for Handler {
let msg_ref = discord_msg_ref(&msg);
let _ = adapter.add_reaction(&msg_ref, "🎤").await;
}
} else if media::is_text_file(&attachment.filename, attachment.content_type.as_deref()) {
if text_file_count >= TEXT_FILE_COUNT_CAP {
tracing::warn!(filename = %attachment.filename, count = text_file_count, "text file count cap reached, skipping");
continue;
}
// Pre-check with Discord-reported size (fast path, avoids unnecessary download).
// Running total uses actual downloaded bytes for accurate accounting.
if text_file_bytes + u64::from(attachment.size) > TEXT_TOTAL_CAP {
tracing::warn!(filename = %attachment.filename, total = text_file_bytes, "text attachments total exceeds 1MB cap, skipping remaining");
continue;
}
if let Some((block, actual_bytes)) = media::download_and_read_text_file(
&attachment.url,
&attachment.filename,
u64::from(attachment.size),
None,
).await {
text_file_bytes += actual_bytes;
text_file_count += 1;
debug!(filename = %attachment.filename, "adding text file attachment");
extra_blocks.push(block);
}
} else if let Some(block) = media::download_and_encode_image(
&attachment.url,
attachment.content_type.as_deref(),
Expand Down
108 changes: 108 additions & 0 deletions src/media.rs
Original file line number Diff line number Diff line change
Expand Up @@ -182,6 +182,114 @@ pub fn is_audio_mime(mime: &str) -> bool {
mime.starts_with("audio/")
}

/// Extensions recognised as text-based files that can be inlined into the prompt.
///
/// Entries are lowercase and written without the leading dot. `is_text_file`
/// compares them against the final `.`-separated segment of the filename, so
/// only the last extension of a multi-dot name is considered.
const TEXT_EXTENSIONS: &[&str] = &[
"txt", "csv", "log", "md", "json", "jsonl", "yaml", "yml", "toml", "xml",
"rs", "py", "js", "ts", "jsx", "tsx", "go", "java", "c", "cpp", "h", "hpp",
"rb", "sh", "bash", "zsh", "fish", "ps1", "bat", "sql", "html", "css",
"scss", "less", "ini", "cfg", "conf", "env",
];

/// Exact filenames recognised as text files: extensionless build/config files
/// (`Dockerfile`, `Makefile`, ...) and well-known dotfiles (`.gitignore`, ...).
///
/// Entries are lowercase; `is_text_file` matches the whole filename against
/// this list without regard to case.
const TEXT_FILENAMES: &[&str] = &[
"dockerfile", "makefile", "justfile", "rakefile", "gemfile",
"procfile", "vagrantfile", ".gitignore", ".dockerignore", ".editorconfig",
];

/// MIME types recognised as text-based (beyond `text/*`).
///
/// Entries are lowercase base types without parameters; `is_text_file` strips
/// any `;`-separated parameters (e.g. `charset`) from the reported content type
/// before looking it up here.
const TEXT_MIME_TYPES: &[&str] = &[
"application/json",
"application/xml",
"application/javascript",
"application/x-yaml",
"application/x-sh",
"application/toml",
"application/x-toml",
];

/// Check if a file is text-based and can be inlined into the prompt.
///
/// A file qualifies when any of the following holds, checked in this order:
///
/// 1. its content type (parameters such as `; charset=...` stripped) is
///    `text/*` or one of `TEXT_MIME_TYPES`;
/// 2. the final `.`-separated segment of `filename` is one of `TEXT_EXTENSIONS`;
/// 3. the whole `filename` is one of `TEXT_FILENAMES` (e.g. `Dockerfile`).
///
/// All comparisons ignore ASCII case. `content_type` may be `None` when the
/// platform did not report one; the filename checks still apply in that case.
pub fn is_text_file(filename: &str, content_type: Option<&str>) -> bool {
    // Media type and subtype names are case-insensitive (RFC 2045 §5.1), so
    // normalise before matching: "Text/Plain" and "Application/JSON" are valid.
    let mime_base = content_type
        .unwrap_or("")
        .split(';')
        .next()
        .unwrap_or("")
        .trim()
        .to_ascii_lowercase();
    if mime_base.starts_with("text/") || TEXT_MIME_TYPES.contains(&mime_base.as_str()) {
        return true;
    }

    // Extension check: only the segment after the last dot counts.
    if let Some((_, ext)) = filename.rsplit_once('.') {
        if TEXT_EXTENSIONS
            .iter()
            .any(|known| known.eq_ignore_ascii_case(ext))
        {
            return true;
        }
    }

    // Exact filename check (Dockerfile, Makefile, .gitignore, etc.).
    TEXT_FILENAMES
        .iter()
        .any(|known| known.eq_ignore_ascii_case(filename))
}

/// Download a text-based file and return it as a `ContentBlock::Text`.
///
/// The returned block contains a `[File: <filename>]` header followed by the
/// file content wrapped in a backtick fence. The fence is lengthened until it
/// does not occur in the content, so the file cannot close it early.
///
/// Files larger than 512 KB are skipped to avoid bloating the prompt. The limit
/// is enforced three times: on the caller-reported `size` (before any network
/// traffic), on the response's announced content length, and on the bytes
/// actually received while streaming, so a misreported size can never cause
/// more than 512 KB to be buffered.
///
/// Pass `auth_token` for platforms that require authentication (e.g. Slack
/// private files); it is sent as an `Authorization: Bearer` header.
///
/// Returns `Some((block, bytes))`, where `bytes` is the number of bytes actually
/// downloaded, or `None` (after logging a warning) when the file is too large,
/// the request fails, the status is not a success, or the body cannot be read.
/// Invalid UTF-8 sequences are replaced with U+FFFD rather than rejected.
///
/// Note: the caller already guards total size via a total cap; the per-file
/// MAX_SIZE check here is intentional defense-in-depth so this function remains
/// self-contained and safe when called from other contexts.
pub async fn download_and_read_text_file(
    url: &str,
    filename: &str,
    size: u64,
    auth_token: Option<&str>,
) -> Option<(ContentBlock, u64)> {
    const MAX_SIZE: u64 = 512 * 1024; // 512 KB

    if size > MAX_SIZE {
        tracing::warn!(filename, size, "text file exceeds 512KB limit, skipping");
        return None;
    }

    let mut req = HTTP_CLIENT.get(url);
    if let Some(token) = auth_token {
        req = req.header("Authorization", format!("Bearer {token}"));
    }

    let mut resp = match req.send().await {
        Ok(r) => r,
        Err(e) => {
            tracing::warn!(url, error = %e, "text file download failed");
            return None;
        }
    };
    if !resp.status().is_success() {
        tracing::warn!(url, status = %resp.status(), "text file download failed");
        return None;
    }

    // Cheap early rejection when the server announces an oversized body.
    if let Some(announced) = resp.content_length() {
        if announced > MAX_SIZE {
            tracing::warn!(
                filename,
                size = announced,
                "downloaded text file exceeds 512KB limit, skipping"
            );
            return None;
        }
    }

    // Defense-in-depth: stream the body and stop as soon as the limit is
    // crossed, instead of buffering an arbitrarily large response first.
    let mut body: Vec<u8> = Vec::new();
    loop {
        match resp.chunk().await {
            Ok(Some(chunk)) => {
                let received = (body.len() + chunk.len()) as u64;
                if received > MAX_SIZE {
                    tracing::warn!(
                        filename,
                        size = received,
                        "downloaded text file exceeds 512KB limit, skipping"
                    );
                    return None;
                }
                body.extend_from_slice(&chunk);
            }
            Ok(None) => break,
            Err(e) => {
                // Previously swallowed silently; log like the other failure paths.
                tracing::warn!(url, error = %e, "text file download failed");
                return None;
            }
        }
    }
    let actual_size = body.len() as u64;

    // Valid UTF-8 reuses the downloaded buffer without copying; anything else
    // is decoded lossily (invalid sequences become U+FFFD).
    let text = match String::from_utf8(body) {
        Ok(valid) => valid,
        Err(invalid) => String::from_utf8_lossy(invalid.as_bytes()).into_owned(),
    };

    // Dynamic fence: keep adding backticks until the fence doesn't appear in content
    let mut fence = "```".to_string();
    while text.contains(fence.as_str()) {
        fence.push('`');
    }

    debug!(filename, bytes = text.len(), "text file inlined");
    Some((
        ContentBlock::Text {
            text: format!("[File: {filename}]\n{fence}\n{text}\n{fence}"),
        },
        actual_size,
    ))
}

#[cfg(test)]
mod tests {
use super::*;
Expand Down
Loading