Skip to content
Merged
Show file tree
Hide file tree
Changes from all commits
Commits
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
7 changes: 4 additions & 3 deletions src/parsers/instagram.rs
Original file line number Diff line number Diff line change
Expand Up @@ -13,7 +13,7 @@ use crate::Message;
use crate::config::InstagramConfig;
use crate::error::ChatpackError;
use crate::parser::{Parser, Platform};
use crate::parsing::instagram::{InstagramExport, parse_instagram_message};
use crate::parsing::instagram::{InstagramExport, parse_instagram_message_owned};

#[cfg(feature = "streaming")]
use crate::streaming::{InstagramStreamingParser, StreamingConfig, StreamingParser};
Expand Down Expand Up @@ -64,10 +64,11 @@ impl InstagramParser {
let export: InstagramExport = serde_json::from_str(content)?;

let fix = self.config.fix_encoding;
// Consume the export with into_iter() so each message's strings are moved instead of cloned
let mut messages: Vec<Message> = export
.messages
.iter()
.filter_map(|msg| parse_instagram_message(msg, fix))
.into_iter()
.filter_map(|msg| parse_instagram_message_owned(msg, fix))
.collect();

// Instagram stores messages newest-first, reverse for chronological order
Expand Down
75 changes: 67 additions & 8 deletions src/parsing/instagram.rs
Original file line number Diff line number Diff line change
Expand Up @@ -64,6 +64,13 @@ pub struct InstagramExport {
/// // (actual mojibake text would be converted back to proper UTF-8)
/// ```
pub fn fix_mojibake_encoding(s: &str) -> String {
    // Fast path: pure-ASCII text cannot be mojibake, so hand it back unchanged.
    if s.is_ascii() {
        return s.to_string();
    }

    // The mojibake handled here is UTF-8 bytes that were decoded as Latin-1,
    // so every char of a damaged string lies in U+0000..=U+00FF. Map each char
    // back to its single byte, and give up as soon as a char falls outside
    // that range: such a string is not Latin-1 mojibake, and a plain `as u8`
    // cast would truncate it (e.g. 'Ł' U+0141 -> b'A') and corrupt real text.
    let latin1_bytes: Option<Vec<u8>> = s
        .chars()
        .map(|c| {
            let code = u32::from(c);
            if code <= 0xFF {
                // Lossless: the range check above guarantees the value fits.
                Some(code as u8)
            } else {
                None
            }
        })
        .collect();

    // Accept the repair only when the recovered bytes are valid UTF-8;
    // otherwise return the input untouched.
    latin1_bytes
        .and_then(|raw| String::from_utf8(raw).ok())
        .unwrap_or_else(|| s.to_string())
}
Expand All @@ -73,7 +80,58 @@ pub fn parse_ms_timestamp(timestamp_ms: i64) -> Option<DateTime<Utc>> {
Utc.timestamp_millis_opt(timestamp_ms).single()
}

/// Parses a raw Instagram message into a `Message`.
/// Converts a raw Instagram message into a `Message`, consuming the input.
///
/// Because the raw message is taken by value, its strings are moved into the
/// result rather than cloned. Prefer this variant whenever the
/// `InstagramRawMessage` is no longer needed afterwards.
///
/// The text is taken from `content`, falling back to the share's `share_text`.
/// Returns `None` when no text is present or the text is only whitespace.
///
/// When `fix_encoding` is true, the Mojibake fix is applied to both the sender
/// name and the text; pure-ASCII strings are passed through untouched.
pub fn parse_instagram_message_owned(
    msg: InstagramRawMessage,
    fix_encoding: bool,
) -> Option<Message> {
    // Repairs a string only when asked to and only when it holds non-ASCII
    // characters; otherwise the owned string is handed back as-is (a move).
    let repair = |text: String| -> String {
        if fix_encoding && !text.is_ascii() {
            fix_mojibake_encoding(&text)
        } else {
            text
        }
    };

    // Primary text first, shared-item text as the fallback (both moved out).
    let raw_body = match msg.content {
        Some(text) => Some(text),
        None => msg.share.and_then(|shared| shared.share_text),
    };

    // Drop messages whose (possibly repaired) text is missing or blank.
    let body = raw_body
        .map(&repair)
        .filter(|text| !text.trim().is_empty())?;

    let sent_at = parse_ms_timestamp(msg.timestamp_ms);
    let author = repair(msg.sender_name);

    Some(Message::with_metadata(
        author,
        body,
        sent_at,
        None, // Instagram doesn't have message IDs in export
        None, // No reply references
        None, // No edit timestamps
    ))
}

/// Parses a raw Instagram message into a `Message` (reference version).
///
/// Use this version when you need to borrow the message (e.g., streaming).
/// For better performance when ownership is available, use [`parse_instagram_message_owned`].
///
/// Returns `None` if the message has no content.
///
Expand All @@ -82,15 +140,15 @@ pub fn parse_instagram_message(msg: &InstagramRawMessage, fix_encoding: bool) ->
// Get content from various possible locations
let content = msg
.content
.clone()
.or_else(|| msg.share.as_ref().and_then(|s| s.share_text.clone()));
.as_ref()
.or_else(|| msg.share.as_ref().and_then(|s| s.share_text.as_ref()));

// Apply encoding fix if needed
// Apply encoding fix if needed, but skip for ASCII (no mojibake possible)
let content = content.map(|c| {
if fix_encoding {
fix_mojibake_encoding(&c)
if fix_encoding && !c.is_ascii() {
fix_mojibake_encoding(c)
} else {
c
c.clone()
}
});

Expand All @@ -102,7 +160,8 @@ pub fn parse_instagram_message(msg: &InstagramRawMessage, fix_encoding: bool) ->

let timestamp = parse_ms_timestamp(msg.timestamp_ms);

let sender = if fix_encoding {
// Apply encoding fix if needed, but skip for ASCII (no mojibake possible)
let sender = if fix_encoding && !msg.sender_name.is_ascii() {
fix_mojibake_encoding(&msg.sender_name)
} else {
msg.sender_name.clone()
Expand Down
5 changes: 4 additions & 1 deletion src/parsing/mod.rs
Original file line number Diff line number Diff line change
Expand Up @@ -20,7 +20,10 @@ pub mod discord;
pub use telegram::{TelegramRawMessage, extract_telegram_text, parse_telegram_message};

#[cfg(feature = "instagram")]
pub use instagram::{InstagramRawMessage, fix_mojibake_encoding, parse_instagram_message};
pub use instagram::{
InstagramRawMessage, fix_mojibake_encoding, parse_instagram_message,
parse_instagram_message_owned,
};

#[cfg(feature = "whatsapp")]
pub use whatsapp::{
Expand Down
Loading