/// Repairs "mojibake": text whose UTF-8 bytes were each decoded as a separate
/// Latin-1 character (one `char` in the range U+0000..=U+00FF per original byte).
///
/// The repair maps every character back to its single byte and re-decodes the
/// resulting byte sequence as UTF-8.
///
/// The input is returned unchanged when:
/// - it is pure ASCII (nothing to repair),
/// - it contains any character above U+00FF, which cannot be a single
///   mis-decoded byte, so the text is not byte-per-char mojibake, or
/// - the recovered bytes are not valid UTF-8.
///
/// This function never panics and always returns an owned `String`.
pub fn fix_mojibake_encoding(s: &str) -> String {
    // Fast path: ASCII bytes decode identically as Latin-1 and as UTF-8.
    if s.is_ascii() {
        return s.to_owned();
    }

    // Map each char back to the byte it was decoded from. A plain `c as u8`
    // would silently truncate code points above U+00FF (e.g. U+041F -> 0x1F)
    // and turn correctly encoded text into garbage, so bail out with `None`
    // as soon as a char does not fit into one byte.
    let latin1_bytes: Option<Vec<u8>> = s
        .chars()
        .map(|c| {
            let code_point = u32::from(c);
            if code_point <= 0xFF {
                // Lossless: guarded by the range check above.
                Some(code_point as u8)
            } else {
                None
            }
        })
        .collect();

    // Only accept the repair when the recovered bytes form valid UTF-8;
    // otherwise keep the text exactly as it was given.
    match latin1_bytes.map(String::from_utf8) {
        Some(Ok(fixed)) => fixed,
        _ => s.to_owned(),
    }
}
+/// Parses a raw Instagram message into a `Message` (owned version). +/// +/// Takes ownership of the message to avoid allocations. Use this version +/// when you can consume the `InstagramRawMessage`. +/// +/// Returns `None` if the message has no content. +/// +/// If `fix_encoding` is true, applies Mojibake fix to sender and content. +pub fn parse_instagram_message_owned( + msg: InstagramRawMessage, + fix_encoding: bool, +) -> Option { + // Get content from various possible locations (move, no clone) + let content = msg + .content + .or_else(|| msg.share.and_then(|s| s.share_text)); + + // Apply encoding fix if needed, but skip for ASCII (no mojibake possible) + let content = content.map(|c| { + if fix_encoding && !c.is_ascii() { + fix_mojibake_encoding(&c) + } else { + c // move, no allocation + } + }); + + // Skip messages without content + let content = match content { + Some(c) if !c.trim().is_empty() => c, + _ => return None, + }; + + let timestamp = parse_ms_timestamp(msg.timestamp_ms); + + // Move sender_name or apply fix (avoids allocation when ASCII or fix_encoding=false) + let sender = if fix_encoding && !msg.sender_name.is_ascii() { + fix_mojibake_encoding(&msg.sender_name) + } else { + msg.sender_name // move, no allocation + }; + + Some(Message::with_metadata( + sender, content, timestamp, None, // Instagram doesn't have message IDs in export + None, // No reply references + None, // No edit timestamps + )) +} + +/// Parses a raw Instagram message into a `Message` (reference version). +/// +/// Use this version when you need to borrow the message (e.g., streaming). +/// For better performance when ownership is available, use [`parse_instagram_message_owned`]. /// /// Returns `None` if the message has no content. 
/// @@ -82,15 +140,15 @@ pub fn parse_instagram_message(msg: &InstagramRawMessage, fix_encoding: bool) -> // Get content from various possible locations let content = msg .content - .clone() - .or_else(|| msg.share.as_ref().and_then(|s| s.share_text.clone())); + .as_ref() + .or_else(|| msg.share.as_ref().and_then(|s| s.share_text.as_ref())); - // Apply encoding fix if needed + // Apply encoding fix if needed, but skip for ASCII (no mojibake possible) let content = content.map(|c| { - if fix_encoding { - fix_mojibake_encoding(&c) + if fix_encoding && !c.is_ascii() { + fix_mojibake_encoding(c) } else { - c + c.clone() } }); @@ -102,7 +160,8 @@ pub fn parse_instagram_message(msg: &InstagramRawMessage, fix_encoding: bool) -> let timestamp = parse_ms_timestamp(msg.timestamp_ms); - let sender = if fix_encoding { + // Apply encoding fix if needed, but skip for ASCII (no mojibake possible) + let sender = if fix_encoding && !msg.sender_name.is_ascii() { fix_mojibake_encoding(&msg.sender_name) } else { msg.sender_name.clone() diff --git a/src/parsing/mod.rs b/src/parsing/mod.rs index 799c0d4e..aff313f4 100644 --- a/src/parsing/mod.rs +++ b/src/parsing/mod.rs @@ -20,7 +20,10 @@ pub mod discord; pub use telegram::{TelegramRawMessage, extract_telegram_text, parse_telegram_message}; #[cfg(feature = "instagram")] -pub use instagram::{InstagramRawMessage, fix_mojibake_encoding, parse_instagram_message}; +pub use instagram::{ + InstagramRawMessage, fix_mojibake_encoding, parse_instagram_message, + parse_instagram_message_owned, +}; #[cfg(feature = "whatsapp")] pub use whatsapp::{