diff --git a/src/core/filter.rs b/src/core/filter.rs index 00197137..040534c5 100644 --- a/src/core/filter.rs +++ b/src/core/filter.rs @@ -1,12 +1,21 @@ -//! Message filtering by date and sender. +//! Filter messages by date range and sender. //! -//! This module provides filtering capabilities for chat messages: -//! - Filter by date range (after/before) -//! - Filter by sender name (case-insensitive) +//! This module provides [`FilterConfig`] for defining filter criteria and +//! [`apply_filters`] for filtering message collections. //! -//! # Example +//! # Filter Types //! -//! ```rust +//! | Filter | Method | Description | +//! |--------|--------|-------------| +//! | Date from | [`with_date_from`](FilterConfig::with_date_from) | Messages on or after date | +//! | Date to | [`with_date_to`](FilterConfig::with_date_to) | Messages on or before date | +//! | Sender | [`with_sender`](FilterConfig::with_sender) | Messages from specific user | +//! +//! # Examples +//! +//! ## Filter by Sender +//! +//! ``` //! use chatpack::core::filter::{FilterConfig, apply_filters}; //! use chatpack::Message; //! @@ -16,91 +25,141 @@ //! Message::new("Alice", "How are you?"), //! ]; //! -//! // Filter to only Alice's messages -//! let config = FilterConfig::new().with_user("Alice".to_string()); +//! // Case-insensitive sender matching +//! let config = FilterConfig::new().with_sender("alice"); //! let filtered = apply_filters(messages, &config); //! //! assert_eq!(filtered.len(), 2); //! ``` +//! +//! ## Filter by Date Range +//! +//! ``` +//! use chatpack::core::filter::{FilterConfig, apply_filters}; +//! use chatpack::Message; +//! use chrono::{TimeZone, Utc}; +//! +//! # fn main() -> chatpack::Result<()> { +//! let messages = vec![ +//! Message::new("Alice", "Old").with_timestamp(Utc.with_ymd_and_hms(2024, 1, 1, 12, 0, 0).unwrap()), +//! Message::new("Alice", "New").with_timestamp(Utc.with_ymd_and_hms(2024, 6, 15, 12, 0, 0).unwrap()), +//! ]; +//! +//! 
let config = FilterConfig::new() +//! .with_date_from("2024-06-01")? +//! .with_date_to("2024-12-31")?; +//! +//! let filtered = apply_filters(messages, &config); +//! assert_eq!(filtered.len(), 1); +//! assert_eq!(filtered[0].content, "New"); +//! # Ok(()) +//! # } +//! ``` +//! +//! # Behavior Notes +//! +//! - Messages without timestamps are **excluded** when date filters are active +//! - Sender matching is case-insensitive for ASCII characters +//! - Multiple filters are combined with AND logic use chrono::{DateTime, NaiveDate, Utc}; use crate::Message; use crate::error::ChatpackError; -/// Configuration for filtering messages. +/// Configuration for filtering messages by date and sender. /// -/// Use the builder pattern to construct filter configurations: +/// Filters are combined with AND logic: a message must match all active +/// filters to be included in the result. /// -/// ```rust +/// # Examples +/// +/// ``` /// use chatpack::core::filter::FilterConfig; /// -/// let config = FilterConfig::new() -/// .after_date("2024-01-01").unwrap() -/// .before_date("2024-12-31").unwrap() -/// .with_user("Alice".to_string()); +/// # fn main() -> chatpack::Result<()> { +/// // Filter by sender only +/// let by_sender = FilterConfig::new().with_sender("Alice"); +/// +/// // Filter by date range +/// let by_date = FilterConfig::new() +/// .with_date_from("2024-01-01")? +/// .with_date_to("2024-12-31")?; +/// +/// // Combined filters +/// let combined = FilterConfig::new() +/// .with_sender("Alice") +/// .with_date_from("2024-06-01")?; +/// # Ok(()) +/// # } /// ``` #[derive(Debug, Clone, Default)] pub struct FilterConfig { - /// Only include messages after this date + /// Include only messages on or after this timestamp. pub after: Option>, - /// Only include messages before this date + + /// Include only messages on or before this timestamp. 
pub before: Option>, - /// Only include messages from this sender (case-insensitive) + + /// Include only messages from this sender (case-insensitive). pub from: Option, } impl FilterConfig { /// Creates a new empty filter configuration. /// - /// No filters are active by default. + /// No filters are active by default; all messages pass through. pub fn new() -> Self { Self::default() } - /// Parse date string in YYYY-MM-DD format and set as "after" filter. + /// Sets the start date filter (inclusive). /// - /// The time is set to 00:00:00 UTC, so the specified date is included. + /// Only messages on or after this date will be included. + /// Date format: `YYYY-MM-DD`. /// /// # Errors /// - /// Returns [`ChatpackError::InvalidDate`] if the date string - /// doesn't match YYYY-MM-DD format. + /// Returns [`ChatpackError::InvalidDate`] if the format is invalid. /// - /// # Example + /// # Examples /// - /// ```rust + /// ``` /// use chatpack::core::filter::FilterConfig; /// + /// # fn main() -> chatpack::Result<()> { /// let config = FilterConfig::new() - /// .after_date("2024-01-01") - /// .unwrap(); + /// .with_date_from("2024-01-01")?; + /// # Ok(()) + /// # } /// ``` - pub fn after_date(mut self, date_str: &str) -> Result { + pub fn with_date_from(mut self, date_str: &str) -> Result { let dt = parse_date_start(date_str)?; self.after = Some(dt); Ok(self) } - /// Parse date string in YYYY-MM-DD format and set as "before" filter. + /// Sets the end date filter (inclusive). /// - /// The time is set to 23:59:59 UTC to include the entire specified day. + /// Only messages on or before this date will be included. + /// Date format: `YYYY-MM-DD`. /// /// # Errors /// - /// Returns [`ChatpackError::InvalidDate`] if the date string - /// doesn't match YYYY-MM-DD format. + /// Returns [`ChatpackError::InvalidDate`] if the format is invalid. 
/// - /// # Example + /// # Examples /// - /// ```rust + /// ``` /// use chatpack::core::filter::FilterConfig; /// + /// # fn main() -> chatpack::Result<()> { /// let config = FilterConfig::new() - /// .before_date("2024-12-31") - /// .unwrap(); + /// .with_date_to("2024-12-31")?; + /// # Ok(()) + /// # } /// ``` - pub fn before_date(mut self, date_str: &str) -> Result { + pub fn with_date_to(mut self, date_str: &str) -> Result { let naive = NaiveDate::parse_from_str(date_str, "%Y-%m-%d") .map_err(|_| ChatpackError::invalid_date(date_str))?; @@ -111,55 +170,75 @@ impl FilterConfig { Ok(self) } - /// Set a `DateTime` directly as the "after" filter. - /// Use this when you already have a parsed `DateTime`. + /// Sets the sender filter. + /// + /// Only messages from this sender will be included. + /// Matching is case-insensitive for ASCII characters. + /// + /// # Examples + /// + /// ``` + /// use chatpack::core::filter::FilterConfig; + /// + /// // Matches "Alice", "alice", "ALICE" + /// let config = FilterConfig::new().with_sender("Alice"); + /// ``` #[must_use] - pub fn with_after(mut self, dt: DateTime) -> Self { - self.after = Some(dt); + pub fn with_sender(mut self, sender: impl Into) -> Self { + self.from = Some(sender.into()); self } - /// Set a `DateTime` directly as the "before" filter. + // Legacy method names for backwards compatibility + + /// Sets the start date filter. Alias for [`with_date_from`](Self::with_date_from). + #[doc(hidden)] + pub fn after_date(self, date_str: &str) -> Result { + self.with_date_from(date_str) + } + + /// Sets the end date filter. Alias for [`with_date_to`](Self::with_date_to). + #[doc(hidden)] + pub fn before_date(self, date_str: &str) -> Result { + self.with_date_to(date_str) + } + + /// Sets the sender filter. Alias for [`with_sender`](Self::with_sender). + #[doc(hidden)] + #[must_use] + pub fn with_user(self, user: String) -> Self { + self.with_sender(user) + } + + /// Sets the start timestamp directly. 
/// - /// Use this when you already have a parsed `DateTime`. + /// Use this when you already have a parsed [`DateTime`]. #[must_use] - pub fn with_before(mut self, dt: DateTime) -> Self { - self.before = Some(dt); + pub fn with_after(mut self, dt: DateTime) -> Self { + self.after = Some(dt); self } - /// Set the sender filter. + /// Sets the end timestamp directly. /// - /// Filtering is case-insensitive for ASCII characters. - /// - /// # Example - /// - /// ```rust - /// use chatpack::core::filter::FilterConfig; - /// - /// // Both "Alice" and "alice" will match - /// let config = FilterConfig::new() - /// .with_user("Alice".to_string()); - /// ``` + /// Use this when you already have a parsed [`DateTime`]. #[must_use] - pub fn with_user(mut self, user: String) -> Self { - self.from = Some(user); + pub fn with_before(mut self, dt: DateTime) -> Self { + self.before = Some(dt); self } - /// Check if any filter is active. - /// - /// Returns `true` if at least one of after, before, or from is set. + /// Returns `true` if any filter is active. pub fn is_active(&self) -> bool { self.after.is_some() || self.before.is_some() || self.from.is_some() } - /// Check if date filters are active. + /// Returns `true` if date filters are active. pub fn has_date_filter(&self) -> bool { self.after.is_some() || self.before.is_some() } - /// Check if sender filter is active. + /// Returns `true` if sender filter is active. pub fn has_user_filter(&self) -> bool { self.from.is_some() } @@ -175,36 +254,41 @@ fn parse_date_start(date_str: &str) -> Result, ChatpackError> { Ok(naive_dt.and_utc()) } -/// Apply filters to a vector of messages. -/// -/// # Behavior +/// Filters a collection of messages based on the provided configuration. 
/// -/// - Messages matching all active filters are kept -/// - Sender matching is case-insensitive (ASCII) -/// - Messages without timestamps are **excluded** when date filters are active +/// Returns a new vector containing only messages that match all active filters. +/// If no filters are active, returns the original messages unchanged. /// -/// # Performance +/// # Filter Behavior /// -/// This function consumes the input vector and returns a new filtered vector. -/// For large datasets, consider streaming approaches. +/// - **Sender filter**: Case-insensitive ASCII matching +/// - **Date filters**: Messages without timestamps are excluded +/// - **Multiple filters**: Combined with AND logic /// -/// # Example +/// # Examples /// -/// ```rust +/// ``` /// use chatpack::core::filter::{FilterConfig, apply_filters}; /// use chatpack::Message; /// /// let messages = vec![ /// Message::new("Alice", "Hello"), /// Message::new("Bob", "Hi"), +/// Message::new("Alice", "Goodbye"), /// ]; /// -/// let config = FilterConfig::new().with_user("Alice".to_string()); +/// // Filter by sender +/// let config = FilterConfig::new().with_sender("Alice"); /// let filtered = apply_filters(messages, &config); /// -/// assert_eq!(filtered.len(), 1); -/// assert_eq!(filtered[0].sender(), "Alice"); +/// assert_eq!(filtered.len(), 2); +/// assert!(filtered.iter().all(|m| m.sender() == "Alice")); /// ``` +/// +/// # Performance +/// +/// This function consumes the input vector. For streaming use cases, +/// apply filtering inline during iteration instead. pub fn apply_filters(messages: Vec, config: &FilterConfig) -> Vec { if !config.is_active() { return messages; diff --git a/src/core/models.rs b/src/core/models.rs index a4781de6..d398501c 100644 --- a/src/core/models.rs +++ b/src/core/models.rs @@ -1,65 +1,95 @@ -//! Core data models for chat output configuration. +//! Output configuration for message export. //! -//! 
This module provides the [`OutputConfig`] type for configuring output generation. +//! This module provides [`OutputConfig`] for controlling which metadata fields +//! are included when exporting messages to CSV, JSON, or JSONL formats. //! -//! # Example +//! # Overview //! -//! ```rust -//! use chatpack::Message; +//! By default, only `sender` and `content` are included in output. Use the +//! builder pattern to selectively enable additional fields: +//! +//! | Method | Field | Description | +//! |--------|-------|-------------| +//! | [`with_timestamps`](OutputConfig::with_timestamps) | `timestamp` | When message was sent | +//! | [`with_ids`](OutputConfig::with_ids) | `id` | Platform-specific message ID | +//! | [`with_replies`](OutputConfig::with_replies) | `reply_to` | Parent message reference | +//! | [`with_edited`](OutputConfig::with_edited) | `edited` | Last edit timestamp | +//! +//! # Examples +//! +//! ``` //! use chatpack::core::models::OutputConfig; -//! use chrono::Utc; //! -//! // Create a simple message -//! let msg = Message::new("Alice", "Hello, world!"); +//! // Minimal output (sender + content only) +//! let minimal = OutputConfig::new(); +//! assert!(!minimal.has_any()); //! -//! // Create with builder pattern -//! let msg_with_meta = Message::new("Bob", "Hi there!") -//! .with_id(12345) -//! .with_timestamp(Utc::now()); +//! // Include timestamps +//! let with_time = OutputConfig::new().with_timestamps(); //! -//! // Configure output -//! let config = OutputConfig::new() -//! .with_timestamps() -//! .with_replies(); +//! // Include everything +//! let full = OutputConfig::all(); +//! assert!(full.include_timestamps); +//! assert!(full.include_ids); //! ``` use serde::{Deserialize, Serialize}; -/// Configuration for output format. +/// Controls which message fields are included in output. 
+/// +/// Used by [`write_csv`](crate::core::output::write_csv), +/// [`write_json`](crate::core::output::write_json), and +/// [`write_jsonl`](crate::core::output::write_jsonl) to determine +/// which optional fields to include. /// -/// Controls which metadata fields are included in the output when writing -/// to CSV, JSON, or JSONL formats. +/// # Default Behavior /// -/// # Example +/// By default, only `sender` and `content` are included. This produces +/// the most compact output, optimal for LLM context windows. /// -/// ```rust -/// use chatpack::core::models::OutputConfig; +/// # Examples /// -/// // Default: only sender and content -/// let minimal = OutputConfig::new(); +/// ```no_run +/// # #[cfg(feature = "csv-output")] +/// # fn main() -> chatpack::Result<()> { +/// use chatpack::prelude::*; /// -/// // Include all available metadata -/// let full = OutputConfig::new() +/// let messages = vec![Message::new("Alice", "Hello!")]; +/// +/// // Minimal output +/// write_csv(&messages, "minimal.csv", &OutputConfig::new())?; +/// +/// // With timestamps and IDs +/// let config = OutputConfig::new() /// .with_timestamps() -/// .with_ids() -/// .with_replies() -/// .with_edited(); +/// .with_ids(); +/// write_csv(&messages, "detailed.csv", &config)?; /// -/// // Or use the convenience method -/// let full = OutputConfig::all(); +/// // Everything +/// write_csv(&messages, "full.csv", &OutputConfig::all())?; +/// # Ok(()) +/// # } +/// # #[cfg(not(feature = "csv-output"))] +/// # fn main() {} /// ``` #[derive(Debug, Clone, Default, PartialEq, Serialize, Deserialize)] pub struct OutputConfig { - /// Include timestamps in output + /// Include message timestamps in output. + /// + /// Timestamps are formatted as RFC 3339 in JSON, ISO 8601 in CSV. pub include_timestamps: bool, - /// Include message IDs in output + /// Include platform-specific message IDs in output. 
pub include_ids: bool, - /// Include reply references in output + /// Include reply-to references in output. + /// + /// Useful for reconstructing conversation threads. pub include_replies: bool, - /// Include edited timestamps in output + /// Include edit timestamps in output. + /// + /// Shows when messages were last modified. pub include_edited: bool, } diff --git a/src/core/output/csv_writer.rs b/src/core/output/csv_writer.rs index 55d563dd..9aa51c59 100644 --- a/src/core/output/csv_writer.rs +++ b/src/core/output/csv_writer.rs @@ -1,4 +1,7 @@ -//! CSV output writer. +//! CSV output writer with semicolon delimiter. +//! +//! CSV format provides the best token efficiency for LLM context windows, +//! achieving up to 13x compression compared to raw chat exports. use std::fs::File; @@ -6,17 +9,38 @@ use crate::Message; use crate::core::models::OutputConfig; use crate::error::ChatpackError; -/// Writes messages to CSV file with semicolon delimiter. +/// Writes messages to a CSV file. +/// +/// Uses semicolon (`;`) as delimiter for Excel compatibility and to avoid +/// conflicts with commas in message content. 
/// /// # Format -/// - Delimiter: `;` -/// - Columns: Depends on `OutputConfig` -/// - Basic: `Sender`, `Content` -/// - With timestamps: `Timestamp`, `Sender`, `Content` -/// - With IDs: `ID`, `Sender`, `Content` -/// - With replies: `Sender`, `Content`, `ReplyTo` -/// - With edited: `Sender`, `Content`, `Edited` -/// - Encoding: UTF-8 +/// +/// Columns depend on [`OutputConfig`]: +/// - Base: `Sender`, `Content` +/// - `with_timestamps()`: adds `Timestamp` column +/// - `with_ids()`: adds `ID` column +/// - `with_replies()`: adds `ReplyTo` column +/// - `with_edited()`: adds `Edited` column +/// +/// # Examples +/// +/// ```no_run +/// # #[cfg(feature = "csv-output")] +/// # fn main() -> chatpack::Result<()> { +/// use chatpack::prelude::*; +/// +/// let messages = vec![Message::new("Alice", "Hello!")]; +/// write_csv(&messages, "output.csv", &OutputConfig::new())?; +/// # Ok(()) +/// # } +/// # #[cfg(not(feature = "csv-output"))] +/// # fn main() {} +/// ``` +/// +/// # Errors +/// +/// Returns [`ChatpackError::Io`] if the file cannot be created or written. pub fn write_csv( messages: &[Message], output_path: &str, @@ -39,10 +63,31 @@ pub fn write_csv( Ok(()) } -/// Converts messages to CSV string with semicolon delimiter. +/// Converts messages to a CSV string. +/// +/// Same format as [`write_csv`], but returns a [`String`] instead of writing +/// to a file. Useful for WASM environments or when you need the output in memory. +/// +/// # Examples +/// +/// ``` +/// # #[cfg(feature = "csv-output")] +/// # fn main() -> chatpack::Result<()> { +/// use chatpack::prelude::*; +/// +/// let messages = vec![ +/// Message::new("Alice", "Hello"), +/// Message::new("Bob", "Hi"), +/// ]; /// -/// Same format as `write_csv`, but returns a String instead of writing to file. -/// Useful for WASM environments where file system access is not available. 
+/// let csv = to_csv(&messages, &OutputConfig::new())?; +/// assert!(csv.contains("Sender;Content")); +/// assert!(csv.contains("Alice;Hello")); +/// # Ok(()) +/// # } +/// # #[cfg(not(feature = "csv-output"))] +/// # fn main() {} +/// ``` pub fn to_csv(messages: &[Message], config: &OutputConfig) -> Result { let mut writer = csv::WriterBuilder::new() .delimiter(b';') diff --git a/src/core/output/json_writer.rs b/src/core/output/json_writer.rs index bebda30a..802f04da 100644 --- a/src/core/output/json_writer.rs +++ b/src/core/output/json_writer.rs @@ -1,4 +1,6 @@ -//! JSON output writer. +//! JSON array output writer. +//! +//! Writes messages as a JSON array, suitable for APIs and structured data processing. use std::fs::File; use std::io::Write; @@ -9,8 +11,9 @@ use crate::Message; use crate::core::models::OutputConfig; use crate::error::ChatpackError; -/// Minimal message structure for JSON output. -/// Only includes fields enabled in `OutputConfig`. +/// Internal message representation for JSON serialization. +/// +/// Only includes fields enabled in [`OutputConfig`]. #[derive(Serialize)] struct JsonMessage { sender: String, @@ -52,15 +55,38 @@ impl JsonMessage { } } -/// Writes messages to JSON file as an array. +/// Writes messages to a JSON file as an array. +/// +/// Produces a pretty-printed JSON array suitable for APIs and structured +/// data processing. 
/// /// # Format +/// /// ```json /// [ /// {"sender": "Alice", "content": "Hello"}, /// {"sender": "Bob", "content": "Hi"} /// ] /// ``` +/// +/// # Examples +/// +/// ```no_run +/// # #[cfg(feature = "json-output")] +/// # fn main() -> chatpack::Result<()> { +/// use chatpack::prelude::*; +/// +/// let messages = vec![Message::new("Alice", "Hello!")]; +/// write_json(&messages, "output.json", &OutputConfig::new())?; +/// # Ok(()) +/// # } +/// # #[cfg(not(feature = "json-output"))] +/// # fn main() {} +/// ``` +/// +/// # Errors +/// +/// Returns [`ChatpackError::Io`] if the file cannot be created or written. pub fn write_json( messages: &[Message], output_path: &str, @@ -72,10 +98,27 @@ pub fn write_json( Ok(()) } -/// Converts messages to JSON string as an array. +/// Converts messages to a JSON array string. +/// +/// Same format as [`write_json`], but returns a [`String`] instead of writing +/// to a file. Useful for WASM environments or API responses. /// -/// Same format as `write_json`, but returns a String instead of writing to file. -/// Useful for WASM environments where file system access is not available. +/// # Examples +/// +/// ``` +/// # #[cfg(feature = "json-output")] +/// # fn main() -> chatpack::Result<()> { +/// use chatpack::prelude::*; +/// +/// let messages = vec![Message::new("Alice", "Hello!")]; +/// let json = to_json(&messages, &OutputConfig::new())?; +/// +/// assert!(json.contains(r#""sender": "Alice""#)); +/// # Ok(()) +/// # } +/// # #[cfg(not(feature = "json-output"))] +/// # fn main() {} +/// ``` pub fn to_json(messages: &[Message], config: &OutputConfig) -> Result { let json_messages: Vec = messages .iter() diff --git a/src/core/output/jsonl_writer.rs b/src/core/output/jsonl_writer.rs index 528cc342..40cf45ce 100644 --- a/src/core/output/jsonl_writer.rs +++ b/src/core/output/jsonl_writer.rs @@ -1,8 +1,8 @@ -//! JSON Lines (JSONL) output writer. +//! JSON Lines (JSONL/NDJSON) output writer. //! -//! JSONL format is ideal for: -//! 
- Machine learning pipelines -//! - RAG (Retrieval-Augmented Generation) +//! JSONL format outputs one JSON object per line, making it ideal for: +//! - RAG (Retrieval-Augmented Generation) pipelines +//! - ML training data //! - Streaming processing //! - Large datasets that don't fit in memory @@ -15,8 +15,9 @@ use crate::Message; use crate::core::models::OutputConfig; use crate::error::ChatpackError; -/// Minimal message structure for JSONL output. -/// Only includes fields enabled in `OutputConfig`. +/// Internal message representation for JSONL serialization. +/// +/// Only includes fields enabled in [`OutputConfig`]. #[derive(Serialize)] struct JsonlMessage { sender: String, @@ -58,18 +59,39 @@ impl JsonlMessage { } } -/// Writes messages to JSONL (JSON Lines) file. +/// Writes messages to a JSONL (JSON Lines) file. +/// +/// Each line is a complete, valid JSON object that can be parsed independently. +/// Also known as NDJSON (Newline Delimited JSON). /// -/// Each line is a valid JSON object: -/// ```jsonl +/// # Format +/// +/// ```text /// {"sender":"Alice","content":"Hello"} /// {"sender":"Bob","content":"Hi"} /// ``` /// -/// This format is ideal for: -/// - Streaming processing (one record at a time) -/// - ML training data -/// - RAG document ingestion +/// # Examples +/// +/// ```no_run +/// # #[cfg(feature = "json-output")] +/// # fn main() -> chatpack::Result<()> { +/// use chatpack::prelude::*; +/// +/// let messages = vec![ +/// Message::new("Alice", "Hello!"), +/// Message::new("Bob", "Hi there!"), +/// ]; +/// write_jsonl(&messages, "output.jsonl", &OutputConfig::new())?; +/// # Ok(()) +/// # } +/// # #[cfg(not(feature = "json-output"))] +/// # fn main() {} +/// ``` +/// +/// # Errors +/// +/// Returns [`ChatpackError::Io`] if the file cannot be created or written. pub fn write_jsonl( messages: &[Message], output_path: &str, @@ -88,10 +110,33 @@ pub fn write_jsonl( Ok(()) } -/// Converts messages to JSONL (JSON Lines) string. 
+/// Converts messages to a JSONL string. /// -/// Same format as `write_jsonl`, but returns a String instead of writing to file. -/// Useful for WASM environments where file system access is not available. +/// Same format as [`write_jsonl`], but returns a [`String`] instead of writing +/// to a file. Useful for WASM environments or streaming to other destinations. +/// +/// # Examples +/// +/// ``` +/// # #[cfg(feature = "json-output")] +/// # fn main() -> chatpack::Result<()> { +/// use chatpack::prelude::*; +/// +/// let messages = vec![ +/// Message::new("Alice", "Hello"), +/// Message::new("Bob", "Hi"), +/// ]; +/// +/// let jsonl = to_jsonl(&messages, &OutputConfig::new())?; +/// let lines: Vec<&str> = jsonl.lines().collect(); +/// +/// assert_eq!(lines.len(), 2); +/// assert!(lines[0].contains("Alice")); +/// # Ok(()) +/// # } +/// # #[cfg(not(feature = "json-output"))] +/// # fn main() {} +/// ``` pub fn to_jsonl(messages: &[Message], config: &OutputConfig) -> Result { let mut output = String::new(); diff --git a/src/core/output/mod.rs b/src/core/output/mod.rs index 41abbe9a..7ca8743b 100644 --- a/src/core/output/mod.rs +++ b/src/core/output/mod.rs @@ -1,46 +1,61 @@ -//! Output format writers. +//! Export messages to CSV, JSON, and JSONL formats. //! -//! This module provides writers for different output formats: -//! - [`write_csv`] / [`to_csv`] - CSV with semicolon delimiter (best for LLMs) - requires `csv-output` feature -//! - [`write_json`] / [`to_json`] - JSON array of messages - requires `json-output` feature -//! - [`write_jsonl`] / [`to_jsonl`] - JSON Lines (one JSON per line, best for RAG) - requires `json-output` feature +//! This module provides format writers optimized for different use cases. +//! Each format has both file-writing and string-generating variants. //! -//! # Choosing a Format +//! # Format Comparison //! -//! | Format | Use Case | Token Efficiency | -//! |--------|----------|-----------------| -//! 
| CSV | ChatGPT/Claude context | ⭐⭐⭐ Best (13x compression) | -//! | JSON | Structured data, APIs | ⭐ Good | -//! | JSONL | RAG pipelines, streaming | ⭐⭐ Better | +//! | Format | Function | Feature | Best For | +//! |--------|----------|---------|----------| +//! | CSV | [`write_csv`] / [`to_csv`] | `csv-output` | LLM context (13x compression) | +//! | JSON | [`write_json`] / [`to_json`] | `json-output` | APIs, structured data | +//! | JSONL | [`write_jsonl`] / [`to_jsonl`] | `json-output` | RAG pipelines, streaming | //! -//! # Example +//! # Examples //! -//! ```rust,no_run +//! ## Write to Files +//! +//! ```no_run //! # #[cfg(all(feature = "csv-output", feature = "json-output"))] //! # fn main() -> chatpack::Result<()> { -//! use chatpack::core::output::{write_csv, write_json, write_jsonl, to_csv}; -//! use chatpack::core::models::OutputConfig; -//! use chatpack::Message; +//! use chatpack::prelude::*; //! //! let messages = vec![ //! Message::new("Alice", "Hello!"), //! Message::new("Bob", "Hi there!"), //! ]; -//! //! let config = OutputConfig::new().with_timestamps(); //! -//! // Write to files //! write_csv(&messages, "output.csv", &config)?; //! write_json(&messages, "output.json", &config)?; //! write_jsonl(&messages, "output.jsonl", &config)?; -//! -//! // Or get as strings (useful for WASM) -//! let csv_string = to_csv(&messages, &config)?; //! # Ok(()) //! # } //! # #[cfg(not(all(feature = "csv-output", feature = "json-output")))] //! # fn main() {} //! ``` +//! +//! ## Generate Strings (WASM-friendly) +//! +//! ``` +//! # #[cfg(feature = "csv-output")] +//! # fn main() -> chatpack::Result<()> { +//! use chatpack::prelude::*; +//! +//! let messages = vec![Message::new("Alice", "Hello!")]; +//! let csv = to_csv(&messages, &OutputConfig::new())?; +//! +//! assert!(csv.contains("Alice")); +//! # Ok(()) +//! # } +//! # #[cfg(not(feature = "csv-output"))] +//! # fn main() {} +//! ``` +//! +//! # Feature Flags +//! +//! 
- `csv-output`: Enables CSV functions ([`write_csv`], [`to_csv`]) +//! - `json-output`: Enables JSON functions ([`write_json`], [`to_json`], [`write_jsonl`], [`to_jsonl`]) #[cfg(feature = "csv-output")] mod csv_writer; diff --git a/src/core/processor.rs b/src/core/processor.rs index 73981915..027e985a 100644 --- a/src/core/processor.rs +++ b/src/core/processor.rs @@ -1,8 +1,44 @@ -//! Message processing utilities. +//! Message merging and processing statistics. //! -//! This module provides: -//! - [`merge_consecutive`] - Merge consecutive messages from same sender -//! - [`ProcessingStats`] - Statistics about processing results +//! This module provides utilities for reducing message count while preserving +//! conversation structure, primarily for LLM context optimization. +//! +//! # Overview +//! +//! | Function | Description | +//! |----------|-------------| +//! | [`merge_consecutive`] | Combine consecutive messages from same sender | +//! | [`ProcessingStats`] | Track compression metrics | +//! +//! # Token Compression +//! +//! Merging consecutive messages typically achieves 30-50% reduction in message count, +//! which translates to significant token savings when feeding to LLMs. +//! +//! # Examples +//! +//! ``` +//! use chatpack::core::processor::{merge_consecutive, ProcessingStats}; +//! use chatpack::Message; +//! +//! let messages = vec![ +//! Message::new("Alice", "Hey"), +//! Message::new("Alice", "Are you there?"), +//! Message::new("Alice", "Hello???"), +//! Message::new("Bob", "Sorry, was busy"), +//! ]; +//! +//! let original_count = messages.len(); +//! let merged = merge_consecutive(messages); +//! +//! // 4 messages -> 2 messages (50% reduction) +//! assert_eq!(merged.len(), 2); +//! assert!(merged[0].content.contains("Hey")); +//! assert!(merged[0].content.contains("Hello???")); +//! +//! let stats = ProcessingStats::new(original_count, merged.len()); +//! println!("{}", stats); // "4 → 2 messages (50.0% reduction)" +//! 
``` use crate::Message; diff --git a/src/lib.rs b/src/lib.rs index c8098fb5..f1c80538 100644 --- a/src/lib.rs +++ b/src/lib.rs @@ -1,77 +1,136 @@ -//! # Chatpack +//! Parse and convert chat exports from messaging platforms into LLM-friendly formats. //! -//! A Rust library for parsing and converting chat exports from popular messaging -//! platforms into LLM-friendly formats. +//! # Overview //! -//! ## Overview +//! Chatpack provides a unified API for parsing chat exports from popular messaging +//! platforms and converting them into formats optimized for Large Language Models. +//! It handles platform-specific quirks (encoding issues, date formats, message types) +//! and provides tools for filtering, merging, and exporting messages. //! -//! Chatpack provides a unified API for working with chat exports from: -//! - **Telegram** - JSON exports from Telegram Desktop -//! - **WhatsApp** - Text exports (both iOS and Android formats) -//! - **Instagram** - JSON exports from Instagram data download -//! - **Discord** - JSON/TXT/CSV exports from DiscordChatExporter +//! **Supported platforms:** //! -//! The library handles the complexity of different export formats and provides -//! tools for filtering, merging, and outputting messages in formats optimized -//! for use with Large Language Models. +//! | Platform | Export Format | Special Handling | +//! |----------|---------------|------------------| +//! | Telegram | JSON | Service messages, forwarded messages | +//! | WhatsApp | TXT | Auto-detects 4 locale-specific date formats | +//! | Instagram | JSON | Fixes Mojibake encoding from Meta exports | +//! | Discord | JSON/TXT/CSV | Attachments, stickers, replies | //! -//! ## Feature Flags +//! # Quick Start //! -//! Chatpack uses feature flags to minimize dependencies: +//! ```no_run +//! use chatpack::prelude::*; //! -//! | Feature | Description | Dependencies | -//! |---------|-------------|--------------| -//! | `telegram` | Telegram parser | `serde_json` | -//! 
| `whatsapp` | WhatsApp parser | `regex` | -//! | `instagram` | Instagram parser | `serde_json` | -//! | `discord` | Discord parser | `serde_json`, `regex`, `csv` | -//! | `csv-output` | CSV output writer | `csv` | -//! | `json-output` | JSON/JSONL output writers | `serde_json` | -//! | `streaming` | Streaming parsers for large files | (none) | -//! | `full` | Everything (default) | all above | +//! # #[cfg(all(feature = "telegram", feature = "csv-output"))] +//! # fn main() -> chatpack::Result<()> { +//! // Parse Telegram export +//! let parser = create_parser(Platform::Telegram); +//! let messages = parser.parse("export.json".as_ref())?; +//! +//! // Filter, merge, and export +//! let filtered = apply_filters(messages, &FilterConfig::new().with_sender("Alice")); +//! let merged = merge_consecutive(filtered); +//! write_csv(&merged, "output.csv", &OutputConfig::default())?; +//! # Ok(()) +//! # } +//! # #[cfg(not(all(feature = "telegram", feature = "csv-output")))] +//! # fn main() {} +//! ``` //! -//! ## Quick Start +//! # Core Concepts //! -//! The [`parser`] module provides a unified API with streaming support: +//! ## Message //! -//! ```rust,no_run -//! # #[cfg(all(feature = "telegram", feature = "json-output"))] -//! # fn main() -> chatpack::Result<()> { -//! use chatpack::parser::{Parser, Platform, create_parser}; -//! use chatpack::prelude::*; +//! [`Message`] is the universal representation of a chat message across all platforms: //! -//! // Parse a Telegram export -//! let parser = create_parser(Platform::Telegram); -//! let messages = parser.parse("telegram_export.json".as_ref())?; +//! ``` +//! use chatpack::Message; //! -//! // Merge consecutive messages from the same sender -//! let merged = merge_consecutive(messages); +//! let msg = Message::new("Alice", "Hello, world!"); +//! assert_eq!(msg.sender, "Alice"); +//! assert_eq!(msg.content, "Hello, world!"); +//! ``` //! -//! // Write to JSON -//! 
write_json(&merged, "output.json", &OutputConfig::new())?; +//! ## Parser Trait //! +//! All platform parsers implement the [`Parser`](parser::Parser) trait, providing +//! a consistent interface: +//! +//! ```no_run +//! # #[cfg(feature = "whatsapp")] +//! # fn main() -> chatpack::Result<()> { +//! use chatpack::parser::Parser; +//! use chatpack::parsers::WhatsAppParser; +//! +//! let parser = WhatsAppParser::new(); +//! +//! // Parse from file +//! let messages = parser.parse("chat.txt".as_ref())?; +//! +//! // Or parse from string +//! let content = "[1/15/24, 10:30:45 AM] Alice: Hello"; +//! let messages = parser.parse_str(content)?; //! # Ok(()) //! # } -//! # #[cfg(not(all(feature = "telegram", feature = "json-output")))] +//! # #[cfg(not(feature = "whatsapp"))] //! # fn main() {} //! ``` //! -//! ## Streaming for Large Files +//! # Common Patterns +//! +//! ## Filter by Date Range +//! +//! ``` +//! use chatpack::prelude::*; +//! +//! # fn main() -> chatpack::Result<()> { +//! let messages = vec![ +//! Message::new("Alice", "Old message"), +//! Message::new("Bob", "Recent message"), +//! ]; +//! +//! let filter = FilterConfig::new() +//! .with_date_from("2024-01-01")? +//! .with_date_to("2024-12-31")?; +//! +//! let filtered = apply_filters(messages, &filter); +//! # Ok(()) +//! # } +//! ``` +//! +//! ## Merge Consecutive Messages +//! +//! Combine messages from the same sender within a time window: +//! +//! ``` +//! use chatpack::prelude::*; //! -//! For files larger than 1GB, use the streaming API to avoid memory issues: +//! let messages = vec![ +//! Message::new("Alice", "Hello"), +//! Message::new("Alice", "How are you?"), +//! Message::new("Bob", "I'm fine!"), +//! ]; //! -//! ```rust,no_run +//! let merged = merge_consecutive(messages); +//! assert_eq!(merged.len(), 2); // Alice's messages merged +//! assert!(merged[0].content.contains("Hello")); +//! assert!(merged[0].content.contains("How are you?")); +//! ``` +//! +//! ## Stream Large Files +//! 
+//! Process files larger than available memory: +//! +//! ```no_run //! # #[cfg(all(feature = "telegram", feature = "streaming"))] //! # fn main() -> chatpack::Result<()> { -//! use chatpack::parser::{Parser, Platform, create_streaming_parser}; +//! use chatpack::prelude::*; //! //! let parser = create_streaming_parser(Platform::Telegram); //! -//! // Process messages one at a time //! for result in parser.stream("huge_export.json".as_ref())? { -//! if let Ok(msg) = result { -//! println!("{}: {}", msg.sender, msg.content); -//! } +//! let msg = result?; +//! println!("{}: {}", msg.sender, msg.content); //! } //! # Ok(()) //! # } @@ -79,29 +138,77 @@ //! # fn main() {} //! ``` //! -//! ## Module Structure -//! -//! - [`parser`] - **Unified parser API** (recommended) -//! - [`Parser`](parser::Parser) - Unified parser trait with streaming -//! - [`Platform`](parser::Platform) - Supported platforms enum -//! - [`create_parser`](parser::create_parser), [`create_streaming_parser`](parser::create_streaming_parser) -//! - [`config`] - Parser configuration types -//! - [`TelegramConfig`](config::TelegramConfig), [`WhatsAppConfig`](config::WhatsAppConfig), etc. -//! - [`core`] - Core types and functionality -//! - [`core::models`] - [`Message`], [`OutputConfig`] -//! - [`core::filter`] - [`FilterConfig`], [`apply_filters`] -//! - [`core::processor`] - [`merge_consecutive`], [`ProcessingStats`] -//! - [`core::output`] - [`write_json`], [`write_jsonl`], [`write_csv`] -//! - [`parsers`] - Platform-specific parser implementations -//! - [`TelegramParser`], [`WhatsAppParser`], [`InstagramParser`], [`DiscordParser`] -//! - [`streaming`] - Streaming parsers for large files (requires `streaming` feature) -//! - [`TelegramStreamingParser`], [`DiscordStreamingParser`] -//! - [`format`] - Output format types -//! - [`OutputFormat`](format::OutputFormat), [`write_to_format`](format::write_to_format) -//! - [`progress`] - Progress reporting for long-running operations -//! 
- [`Progress`](progress::Progress), [`ProgressCallback`](progress::ProgressCallback) -//! - [`error`] - Unified error types ([`ChatpackError`], [`Result`]) -//! - [`prelude`] - Convenient re-exports +//! ## Export to Multiple Formats +//! +//! ```no_run +//! # #[cfg(all(feature = "csv-output", feature = "json-output"))] +//! # fn main() -> chatpack::Result<()> { +//! use chatpack::prelude::*; +//! +//! let messages = vec![Message::new("Alice", "Hello!")]; +//! let config = OutputConfig::new().with_timestamps(); +//! +//! // CSV - best for LLM context (13x token compression) +//! write_csv(&messages, "output.csv", &config)?; +//! +//! // JSON - structured array for APIs +//! write_json(&messages, "output.json", &config)?; +//! +//! // JSONL - one object per line for RAG pipelines +//! write_jsonl(&messages, "output.jsonl", &config)?; +//! # Ok(()) +//! # } +//! # #[cfg(not(all(feature = "csv-output", feature = "json-output")))] +//! # fn main() {} +//! ``` +//! +//! # Module Structure +//! +//! | Module | Description | +//! |--------|-------------| +//! | [`parser`] | Unified parser API with [`Parser`](parser::Parser) trait and [`Platform`](parser::Platform) enum | +//! | [`parsers`] | Platform-specific implementations: [`TelegramParser`](parsers::TelegramParser), [`WhatsAppParser`](parsers::WhatsAppParser), etc. | +//! | [`config`] | Parser configurations: [`TelegramConfig`](config::TelegramConfig), [`WhatsAppConfig`](config::WhatsAppConfig), etc. | +//! | [`core`] | Core types: [`Message`], [`OutputConfig`](core::OutputConfig), [`FilterConfig`](core::FilterConfig) | +//! | [`streaming`] | Memory-efficient streaming parsers for large files | +//! | [`format`] | Output formats: [`OutputFormat`](format::OutputFormat), [`write_to_format`](format::write_to_format) | +//! | [`error`] | Error types: [`ChatpackError`], [`Result`] | +//! | [`prelude`] | Convenient re-exports for common usage | +//! +//! # Feature Flags +//! +//! 
Enable only the features you need to minimize compile time and dependencies: +//! +//! | Feature | Description | Dependencies | +//! |---------|-------------|--------------| +//! | `telegram` | Telegram JSON parser | `serde_json` | +//! | `whatsapp` | WhatsApp TXT parser | `regex` | +//! | `instagram` | Instagram JSON parser | `serde_json` | +//! | `discord` | Discord multi-format parser | `serde_json`, `regex`, `csv` | +//! | `csv-output` | CSV output writer | `csv` | +//! | `json-output` | JSON/JSONL output writers | `serde_json` | +//! | `streaming` | Streaming parsers for large files | - | +//! | `async` | Async parser support | `tokio` | +//! | `full` | All features (default) | all above | +//! +//! ```toml +//! # Cargo.toml - minimal configuration +//! [dependencies] +//! chatpack = { version = "0.5", default-features = false, features = ["telegram", "csv-output"] } +//! ``` +//! +//! # Serialization +//! +//! All public types implement [`serde::Serialize`] and [`serde::Deserialize`]: +//! +//! ``` +//! use chatpack::Message; +//! +//! let msg = Message::new("Alice", "Hello!"); +//! let json = serde_json::to_string(&msg).unwrap(); +//! let parsed: Message = serde_json::from_str(&json).unwrap(); +//! assert_eq!(msg.content, parsed.content); +//! ``` // Core modules (always available) pub mod config; @@ -157,12 +264,25 @@ pub mod async_parser; pub use error::{ChatpackError, Result}; pub use message::Message; -/// Convenient re-exports for common usage. +/// Convenient re-exports for common usage patterns. /// -/// Import everything you need with a single line: +/// This module provides a single import for the most commonly used types +/// and functions. It's designed to cover 90% of use cases with minimal imports. 
/// -/// ```rust +/// # Example +/// +/// ``` /// use chatpack::prelude::*; +/// +/// // Now you have access to: +/// // - Message, ChatpackError, Result +/// // - Platform, Parser, create_parser, create_streaming_parser +/// // - FilterConfig, apply_filters +/// // - OutputConfig, merge_consecutive +/// // - write_csv, write_json, write_jsonl (with features) +/// // - All platform parsers (with features) +/// +/// let msg = Message::new("Alice", "Hello!"); /// ``` pub mod prelude { // Core message type diff --git a/src/message.rs b/src/message.rs index f43918e9..2b518d72 100644 --- a/src/message.rs +++ b/src/message.rs @@ -1,85 +1,147 @@ -//! Core message type for chatpack. +//! Universal message type for all chat platforms. //! -//! This module provides the [`Message`] type, the universal representation -//! for chat messages from all supported platforms. +//! This module provides [`Message`], the normalized representation of chat messages. +//! All platform parsers convert their native formats into this structure, enabling +//! uniform processing regardless of source. //! -//! # Example +//! # Overview //! -//! ```rust +//! A message consists of: +//! - **Required**: `sender` and `content` +//! - **Optional**: `timestamp`, `id`, `reply_to`, `edited` +//! +//! # Examples +//! +//! ## Basic Usage +//! +//! ``` //! use chatpack::Message; -//! use chrono::Utc; //! -//! // Create a simple message //! let msg = Message::new("Alice", "Hello, world!"); +//! assert_eq!(msg.sender(), "Alice"); +//! assert_eq!(msg.content(), "Hello, world!"); +//! ``` +//! +//! ## Builder Pattern //! -//! // Create with builder pattern -//! let msg_with_meta = Message::new("Bob", "Hi there!") +//! ``` +//! use chatpack::Message; +//! use chrono::Utc; +//! +//! let msg = Message::new("Bob", "Check this out!") //! .with_id(12345) -//! .with_timestamp(Utc::now()); +//! .with_timestamp(Utc::now()) +//! .with_reply_to(12344); +//! +//! assert!(msg.has_metadata()); +//! ``` +//! +//! 
## Serialization +//! +//! ``` +//! use chatpack::Message; +//! +//! let msg = Message::new("Alice", "Hello!"); +//! let json = serde_json::to_string(&msg)?; +//! let parsed: Message = serde_json::from_str(&json)?; +//! +//! assert_eq!(msg, parsed); +//! # Ok::<(), serde_json::Error>(()) //! ``` use chrono::{DateTime, Utc}; use serde::{Deserialize, Serialize}; -/// A chat message with optional metadata. +/// A normalized chat message from any supported platform. /// -/// This is the universal message representation used across all chat sources. -/// All parsers convert their native format into this structure, enabling -/// uniform processing regardless of the original chat platform. +/// This struct is the core data type in chatpack. All platform-specific parsers +/// convert their native message formats into this universal representation, +/// enabling uniform processing, filtering, and export. /// /// # Fields /// -/// - `sender` and `content` are always present -/// - `timestamp`, `id`, `reply_to`, and `edited` are optional metadata +/// | Field | Type | Description | +/// |-------|------|-------------| +/// | `sender` | `String` | Display name or username of the message author | +/// | `content` | `String` | Text content of the message | +/// | `timestamp` | `Option>` | When the message was sent | +/// | `id` | `Option` | Platform-specific message identifier | +/// | `reply_to` | `Option` | ID of the parent message (for replies) | +/// | `edited` | `Option>` | When the message was last edited | /// -/// # Serialization -/// -/// The struct implements both `Serialize` and `Deserialize`, making it -/// suitable for: -/// - Saving/loading processed messages -/// - Inter-process communication -/// - Integration with other systems (RAG pipelines, databases, etc.) -/// -/// Optional fields are skipped during serialization when `None`. 
+/// # Construction /// -/// # Example +/// Use [`Message::new`] for simple messages or the builder pattern for metadata: /// -/// ```rust +/// ``` /// use chatpack::Message; /// use chrono::Utc; /// +/// // Simple message +/// let msg = Message::new("Alice", "Hello!"); +/// +/// // With metadata /// let msg = Message::new("Alice", "Hello!") /// .with_timestamp(Utc::now()) /// .with_id(12345); +/// ``` +/// +/// # Serialization +/// +/// Implements `Serialize` and `Deserialize` with these behaviors: +/// - Optional fields are omitted from JSON when `None` +/// - Timestamps use RFC 3339 format +/// - Suitable for storage, IPC, and RAG pipelines /// -/// assert_eq!(msg.sender(), "Alice"); -/// assert_eq!(msg.content(), "Hello!"); -/// assert!(msg.id().is_some()); +/// ``` +/// use chatpack::Message; +/// +/// let msg = Message::new("Alice", "Hello!").with_id(123); +/// let json = serde_json::to_string(&msg)?; +/// +/// // timestamp is omitted (None) +/// assert!(!json.contains("timestamp")); +/// assert!(json.contains("123")); +/// # Ok::<(), serde_json::Error>(()) /// ``` #[derive(Debug, Clone, PartialEq, Serialize, Deserialize)] pub struct Message { - /// Message sender name/username + /// Display name or username of the message author. pub sender: String, - /// Message text content + /// Text content of the message. + /// + /// May contain newlines for multiline messages. Platform-specific + /// attachments (images, files) are typically represented as text + /// placeholders like `[Attachment: image.png]`. pub content: String, - /// Message timestamp (if available from source) + /// When the message was originally sent. + /// + /// Available from most platforms except some WhatsApp export formats. #[serde(skip_serializing_if = "Option::is_none")] #[serde(default)] pub timestamp: Option<DateTime<Utc>>, - /// Platform-specific message ID (if available) + /// Platform-specific message identifier.
+ /// + /// - Telegram: message ID from the chat + /// - Discord: snowflake ID + /// - WhatsApp/Instagram: typically not available in exports #[serde(skip_serializing_if = "Option::is_none")] #[serde(default)] pub id: Option, - /// ID of the message this is replying to (if available) + /// ID of the message this is replying to. + /// + /// Enables reconstruction of reply chains and conversation threads. #[serde(skip_serializing_if = "Option::is_none")] #[serde(default)] pub reply_to: Option, - /// Timestamp when message was last edited (if available) + /// When the message was last edited. + /// + /// Present when the platform tracks edit history (Telegram, Discord). #[serde(skip_serializing_if = "Option::is_none")] #[serde(default)] pub edited: Option>, diff --git a/src/parser.rs b/src/parser.rs index 972e994d..98493b3c 100644 --- a/src/parser.rs +++ b/src/parser.rs @@ -1,47 +1,75 @@ -//! Unified parser trait for chat exports. +//! Unified parser API for chat exports. //! -//! This module provides a single entry point for parsing chat exports, with support -//! for both in-memory and streaming modes. +//! This module provides a platform-agnostic interface for parsing chat exports. +//! All platform parsers implement the [`Parser`] trait, enabling consistent +//! usage patterns across Telegram, WhatsApp, Instagram, and Discord. //! -//! # Example +//! # Overview //! -//! ```rust,no_run +//! The module provides: +//! - [`Parser`] - Unified trait for all parsers +//! - [`Platform`] - Enum for dynamic parser selection +//! - [`create_parser`] - Factory function for standard parsers +//! - [`create_streaming_parser`] - Factory function for memory-efficient streaming +//! +//! # Examples +//! +//! ## Basic Parsing +//! +//! ```no_run //! # #[cfg(feature = "telegram")] //! # fn main() -> chatpack::Result<()> { -//! use chatpack::parser::{Parser, Platform}; -//! use chatpack::parsers::TelegramParser; -//! use std::path::Path; -//! -//! 
let parser = TelegramParser::new(); +//! use chatpack::parser::{Parser, Platform, create_parser}; //! -//! // Parse entire file into memory -//! let messages = parser.parse(Path::new("chat_export.json"))?; +//! // Create parser dynamically +//! let parser = create_parser(Platform::Telegram); +//! let messages = parser.parse("export.json".as_ref())?; //! -//! // Or stream for large files -//! for result in parser.stream(Path::new("large_export.json"))? { -//! if let Ok(msg) = result { -//! println!("{}: {}", msg.sender, msg.content); -//! } -//! } +//! println!("Parsed {} messages", messages.len()); //! # Ok(()) //! # } //! # #[cfg(not(feature = "telegram"))] //! # fn main() {} //! ``` //! -//! # Platform Selection +//! ## Parse from String //! -//! Use [`Platform`] enum to dynamically select parsers: +//! Useful for WASM or testing: //! -//! ```rust -//! # #[cfg(feature = "telegram")] -//! # fn main() { -//! use chatpack::parser::{Platform, create_parser}; +//! ``` +//! # #[cfg(feature = "whatsapp")] +//! # fn main() -> chatpack::Result<()> { +//! use chatpack::parser::{Parser, create_parser, Platform}; //! -//! let parser = create_parser(Platform::Telegram); -//! // parser.parse("file.json")?; +//! let content = "[1/15/24, 10:30:45 AM] Alice: Hello!"; +//! let parser = create_parser(Platform::WhatsApp); +//! let messages = parser.parse_str(content)?; +//! +//! assert_eq!(messages[0].sender, "Alice"); +//! # Ok(()) //! # } -//! # #[cfg(not(feature = "telegram"))] +//! # #[cfg(not(feature = "whatsapp"))] +//! # fn main() {} +//! ``` +//! +//! ## Streaming Large Files +//! +//! Process files that don't fit in memory: +//! +//! ```no_run +//! # #[cfg(all(feature = "telegram", feature = "streaming"))] +//! # fn main() -> chatpack::Result<()> { +//! use chatpack::parser::{Parser, Platform, create_streaming_parser}; +//! +//! let parser = create_streaming_parser(Platform::Telegram); +//! +//! for result in parser.stream("10gb_export.json".as_ref())? { +//! 
let msg = result?; +//! // Process one message at a time +//! } +//! # Ok(()) +//! # } +//! # #[cfg(not(all(feature = "telegram", feature = "streaming")))] //! # fn main() {} //! ``` @@ -55,47 +83,98 @@ use crate::error::ChatpackError; #[cfg(feature = "streaming")] use crate::streaming::MessageIterator; -/// Supported messaging platforms. +/// Supported messaging platforms for chat export parsing. /// -/// This enum identifies the source platform for chat exports, enabling -/// dynamic parser selection without CLI dependencies. +/// Each variant corresponds to a specific export format and parser implementation. +/// Use with [`create_parser`] or [`create_streaming_parser`] for dynamic parser selection. /// -/// # Example +/// # Aliases /// -/// ```rust +/// All platforms support short aliases for convenience: +/// - `telegram` / `tg` +/// - `whatsapp` / `wa` +/// - `instagram` / `ig` +/// - `discord` / `dc` +/// +/// # Examples +/// +/// ``` /// use chatpack::parser::Platform; /// use std::str::FromStr; /// -/// let platform = Platform::from_str("telegram").unwrap(); +/// // Parse from string (case-insensitive) +/// let platform = Platform::from_str("telegram")?; /// assert_eq!(platform, Platform::Telegram); /// -/// // Aliases are supported -/// let platform = Platform::from_str("tg").unwrap(); +/// // Aliases work too +/// let platform = Platform::from_str("tg")?; /// assert_eq!(platform, Platform::Telegram); +/// +/// // Get file extension +/// assert_eq!(Platform::WhatsApp.default_extension(), "txt"); +/// assert_eq!(Platform::Telegram.default_extension(), "json"); +/// # Ok::<(), String>(()) +/// ``` +/// +/// # Serialization +/// +/// Serializes to lowercase strings, deserializes with alias support: +/// +/// ``` +/// use chatpack::parser::Platform; +/// +/// let json = serde_json::to_string(&Platform::Telegram)?; +/// assert_eq!(json, "\"telegram\""); +/// +/// // Deserialize with alias +/// let platform: Platform = serde_json::from_str("\"tg\"")?; +/// 
assert_eq!(platform, Platform::Telegram); +/// # Ok::<(), serde_json::Error>(()) /// ``` #[derive(Debug, Clone, Copy, PartialEq, Eq, Hash, Serialize, Deserialize)] #[serde(rename_all = "lowercase")] #[non_exhaustive] pub enum Platform { - /// Telegram JSON exports from Telegram Desktop + /// Telegram Desktop JSON exports. + /// + /// Parses the `result.json` file from "Export chat history" feature. + /// Handles service messages, forwarded messages, and reply chains. #[serde(alias = "tg")] Telegram, - /// WhatsApp TXT exports (iOS and Android) + /// WhatsApp TXT exports from iOS and Android. + /// + /// Auto-detects locale-specific date formats (US, EU, RU variants). + /// Handles multiline messages and system notifications. #[serde(alias = "wa")] WhatsApp, - /// Instagram JSON exports from data download + /// Instagram JSON exports from Meta's data download. + /// + /// Automatically fixes Mojibake encoding issues in non-ASCII text. + /// Parses direct messages from the `messages/` directory. #[serde(alias = "ig")] Instagram, - /// Discord exports from DiscordChatExporter (JSON/TXT/CSV) + /// Discord exports from DiscordChatExporter tool. + /// + /// Supports multiple formats: JSON, TXT, and CSV. + /// Preserves attachments, stickers, and reply references. #[serde(alias = "dc")] Discord, } impl Platform { /// Returns the default file extension for exports from this platform. + /// + /// # Examples + /// + /// ``` + /// use chatpack::parser::Platform; + /// + /// assert_eq!(Platform::Telegram.default_extension(), "json"); + /// assert_eq!(Platform::WhatsApp.default_extension(), "txt"); + /// ``` pub fn default_extension(&self) -> &'static str { match self { Platform::WhatsApp => "txt", @@ -103,7 +182,19 @@ impl Platform { } } - /// Returns all platform names including aliases. + /// Returns all valid platform names and aliases. + /// + /// Useful for CLI help text or validation messages. 
+ /// + /// # Examples + /// + /// ``` + /// use chatpack::parser::Platform; + /// + /// let names = Platform::all_names(); + /// assert!(names.contains(&"telegram")); + /// assert!(names.contains(&"tg")); // alias + /// ``` pub fn all_names() -> &'static [&'static str] { &[ "telegram", @@ -117,7 +208,17 @@ impl Platform { ] } - /// Returns all available platforms. + /// Returns all available platform variants. + /// + /// # Examples + /// + /// ``` + /// use chatpack::parser::Platform; + /// + /// for platform in Platform::all() { + /// println!("{}: .{}", platform, platform.default_extension()); + /// } + /// ``` pub fn all() -> &'static [Platform] { &[ Platform::Telegram, @@ -157,7 +258,33 @@ impl std::str::FromStr for Platform { } } -/// Iterator adapter that wraps StreamingError into ChatpackError. +/// Iterator over parsed messages with progress tracking. +/// +/// Wraps a streaming parser's [`MessageIterator`](crate::streaming::MessageIterator) +/// and converts errors to [`ChatpackError`]. Provides progress information for +/// long-running operations. +/// +/// # Examples +/// +/// ```no_run +/// # #[cfg(all(feature = "telegram", feature = "streaming"))] +/// # fn main() -> chatpack::Result<()> { +/// use chatpack::parser::{Parser, Platform, create_streaming_parser}; +/// +/// let parser = create_streaming_parser(Platform::Telegram); +/// let mut count = 0; +/// +/// for result in parser.stream("export.json".as_ref())? { +/// let msg = result?; +/// count += 1; +/// } +/// +/// println!("Processed {} messages", count); +/// # Ok(()) +/// # } +/// # #[cfg(not(all(feature = "telegram", feature = "streaming")))] +/// # fn main() {} +/// ``` #[cfg(feature = "streaming")] pub struct ParseIterator { inner: Box, @@ -170,7 +297,9 @@ impl ParseIterator { Self { inner } } - /// Returns the progress as a percentage (0.0 - 100.0). + /// Returns the progress as a percentage (0.0 to 100.0). 
+ /// + /// Returns `None` if progress cannot be determined (e.g., unknown file size). pub fn progress(&self) -> Option { self.inner.progress() } @@ -197,40 +326,65 @@ impl Iterator for ParseIterator { } } -/// Unified trait for parsing chat exports. +/// Unified trait for parsing chat exports from any platform. /// -/// This trait combines the functionality of the previous `ChatParser` and -/// `StreamingParser` traits into a single, cohesive API. +/// All platform-specific parsers implement this trait, providing a consistent +/// interface for parsing chat exports regardless of the source platform. /// -/// # Implementation Notes +/// # Required Methods /// -/// Parsers must implement: -/// - [`name`](Parser::name) - Parser identifier -/// - [`platform`](Parser::platform) - Platform this parser handles -/// - [`parse`](Parser::parse) - Load entire file into memory -/// - [`parse_str`](Parser::parse_str) - Parse from a string +/// | Method | Description | +/// |--------|-------------| +/// | [`name`](Parser::name) | Human-readable parser name | +/// | [`platform`](Parser::platform) | Platform enum variant | +/// | [`parse`](Parser::parse) | Parse file into memory | +/// | [`parse_str`](Parser::parse_str) | Parse from string | /// -/// Optionally override: -/// - [`stream`](Parser::stream) - Streaming for large files (default: falls back to parse) -/// - [`supports_streaming`](Parser::supports_streaming) - Whether native streaming is supported +/// # Optional Methods /// -/// # Example Implementation +/// | Method | Default | Description | +/// |--------|---------|-------------| +/// | [`stream`](Parser::stream) | Falls back to `parse` | Memory-efficient streaming | +/// | [`supports_streaming`](Parser::supports_streaming) | `false` | Native streaming support | +/// | [`recommended_buffer_size`](Parser::recommended_buffer_size) | 64KB | Buffer size hint | /// -/// ```rust,ignore -/// impl Parser for MyParser { -/// fn name(&self) -> &'static str { "MyParser" } -/// 
fn platform(&self) -> Platform { Platform::Telegram } +/// # Examples /// -/// fn parse(&self, path: &Path) -> Result, ChatpackError> { -/// let content = std::fs::read_to_string(path)?; -/// self.parse_str(&content) -/// } +/// Using a parser directly: /// -/// fn parse_str(&self, content: &str) -> Result, ChatpackError> { -/// // Parse logic here -/// Ok(vec![]) -/// } -/// } +/// ```no_run +/// # #[cfg(feature = "telegram")] +/// # fn main() -> chatpack::Result<()> { +/// use chatpack::parser::Parser; +/// use chatpack::parsers::TelegramParser; +/// +/// let parser = TelegramParser::new(); +/// +/// // Parse from file +/// let messages = parser.parse("export.json".as_ref())?; +/// +/// // Parse from string +/// let json = r#"{"messages": []}"#; +/// let messages = parser.parse_str(json)?; +/// # Ok(()) +/// # } +/// # #[cfg(not(feature = "telegram"))] +/// # fn main() {} +/// ``` +/// +/// Using the factory function: +/// +/// ```no_run +/// # #[cfg(feature = "telegram")] +/// # fn main() -> chatpack::Result<()> { +/// use chatpack::parser::{Parser, Platform, create_parser}; +/// +/// let parser = create_parser(Platform::Telegram); +/// let messages = parser.parse("export.json".as_ref())?; +/// # Ok(()) +/// # } +/// # #[cfg(not(feature = "telegram"))] +/// # fn main() {} /// ``` pub trait Parser: Send + Sync { /// Returns the human-readable name of this parser. @@ -362,19 +516,24 @@ pub trait Parser: Send + Sync { } } -/// Creates a parser for the specified platform. +/// Creates a parser for the specified platform with default configuration. /// -/// This is the main entry point for dynamic parser creation. +/// This is the primary factory function for creating parsers dynamically. +/// The returned parser loads the entire file into memory, which is suitable +/// for files up to ~500MB. /// -/// # Example +/// For larger files, use [`create_streaming_parser`] instead. 
/// -/// ```rust +/// # Examples +/// +/// ``` /// # #[cfg(feature = "telegram")] /// # fn main() { /// use chatpack::parser::{Platform, create_parser}; /// /// let parser = create_parser(Platform::Telegram); /// assert_eq!(parser.name(), "Telegram"); +/// assert_eq!(parser.platform(), Platform::Telegram); /// # } /// # #[cfg(not(feature = "telegram"))] /// # fn main() {} @@ -382,7 +541,12 @@ pub trait Parser: Send + Sync { /// /// # Panics /// -/// Panics if the corresponding parser feature is not enabled. +/// Panics if the corresponding feature is not enabled. Enable features in `Cargo.toml`: +/// +/// ```toml +/// [dependencies] +/// chatpack = { version = "0.5", features = ["telegram"] } +/// ``` pub fn create_parser(platform: Platform) -> Box { match platform { #[cfg(feature = "telegram")] @@ -402,31 +566,47 @@ pub fn create_parser(platform: Platform) -> Box { } } -/// Creates a parser for the specified platform with streaming support. +/// Creates a parser optimized for streaming large files. /// -/// This creates a parser configured for optimal streaming performance. -/// All platforms now support streaming. +/// The returned parser is configured for memory-efficient processing of files +/// that may be larger than available RAM. Use the [`stream`](Parser::stream) +/// method to process messages one at a time. /// -/// # Example +/// # When to Use /// -/// ```rust,no_run -/// # #[cfg(feature = "telegram")] +/// - Files larger than 500MB +/// - Memory-constrained environments +/// - When you need progress tracking +/// +/// For smaller files, [`create_parser`] is simpler and often faster. 
+/// +/// # Examples +/// +/// ```no_run +/// # #[cfg(all(feature = "telegram", feature = "streaming"))] /// # fn main() -> chatpack::Result<()> { -/// use chatpack::parser::{Platform, create_streaming_parser}; +/// use chatpack::parser::{Parser, Platform, create_streaming_parser}; /// /// let parser = create_streaming_parser(Platform::Telegram); -/// for result in parser.stream("large_file.json".as_ref())? { -/// // Process each message +/// assert!(parser.supports_streaming()); +/// +/// let mut count = 0; +/// for result in parser.stream("10gb_export.json".as_ref())? { +/// let _msg = result?; +/// count += 1; +/// if count % 100_000 == 0 { +/// println!("Processed {} messages", count); +/// } /// } /// # Ok(()) /// # } -/// # #[cfg(not(feature = "telegram"))] +/// # #[cfg(not(all(feature = "telegram", feature = "streaming")))] /// # fn main() {} /// ``` /// /// # Panics /// -/// Panics if the corresponding parser feature is not enabled. +/// Panics if the corresponding feature is not enabled. pub fn create_streaming_parser(platform: Platform) -> Box { match platform { #[cfg(feature = "telegram")] diff --git a/src/parsers/discord.rs b/src/parsers/discord.rs index b5510612..cc84dc48 100644 --- a/src/parsers/discord.rs +++ b/src/parsers/discord.rs @@ -1,7 +1,6 @@ //! Discord export parser. //! -//! Handles exports from DiscordChatExporter tool. -//! Supports multiple formats: JSON, TXT, CSV. +//! Parses exports from the DiscordChatExporter tool in JSON, TXT, or CSV format. use std::fs::{self, File}; use std::io::BufReader; @@ -19,18 +18,46 @@ use crate::parser::{Parser, Platform}; #[cfg(feature = "streaming")] use crate::streaming::{DiscordStreamingParser, StreamingConfig, StreamingParser}; -/// Parser for Discord exports (from DiscordChatExporter). -/// Supports JSON, TXT, and CSV formats. +/// Parser for Discord channel exports. /// -/// # Example +/// Handles exports created by [DiscordChatExporter](https://github.com/Tyrrrz/DiscordChatExporter). 
+/// Auto-detects format based on file extension. /// -/// ```rust,no_run +/// # Supported Formats +/// +/// | Extension | Format | Notes | +/// |-----------|--------|-------| +/// | `.json` | JSON | Full metadata, recommended | +/// | `.txt` | Plain text | Basic, regex-parsed | +/// | `.csv` | CSV | Tabular format | +/// +/// # Message Types +/// +/// - Regular messages +/// - Replies (preserves reference) +/// - Attachments (as placeholders) +/// - Stickers +/// - Embeds (text only) +/// +/// # Examples +/// +/// ```no_run /// use chatpack::parsers::DiscordParser; /// use chatpack::parser::Parser; /// +/// # fn main() -> chatpack::Result<()> { /// let parser = DiscordParser::new(); -/// let messages = parser.parse("discord_export.json".as_ref())?; -/// # Ok::<(), chatpack::ChatpackError>(()) +/// +/// // Auto-detects format from extension +/// let messages = parser.parse("channel.json".as_ref())?; +/// +/// for msg in &messages { +/// if let Some(id) = msg.id { +/// println!("[{}] {}: {}", id, msg.sender, msg.content); +/// } +/// } +/// # Ok(()) +/// # } /// ``` pub struct DiscordParser { config: DiscordConfig, diff --git a/src/parsers/instagram.rs b/src/parsers/instagram.rs index 8d2b1560..baf59013 100644 --- a/src/parsers/instagram.rs +++ b/src/parsers/instagram.rs @@ -1,10 +1,7 @@ //! Instagram JSON export parser. //! -//! Handles Meta's JSON exports with Mojibake encoding fix. -//! -//! Instagram exports messages as JSON (from "Download Your Data" feature). -//! The main quirk is that Meta exports UTF-8 text encoded as ISO-8859-1, -//! causing Cyrillic and other non-ASCII text to appear as garbage (Mojibake). +//! Parses JSON exports from Meta's "Download Your Data" feature with +//! automatic Mojibake encoding fix. 
use std::fs; use std::path::Path; @@ -18,17 +15,49 @@ use crate::parsing::instagram::{InstagramExport, parse_instagram_message_owned}; #[cfg(feature = "streaming")] use crate::streaming::{InstagramStreamingParser, StreamingConfig, StreamingParser}; -/// Parser for Instagram JSON exports. +/// Parser for Instagram DM JSON exports. +/// +/// Handles JSON files from Meta's "Download Your Data" feature (Settings > +/// Privacy > Download Your Information). Parses files from the `messages/` +/// directory. +/// +/// # Mojibake Fix +/// +/// Meta exports UTF-8 text encoded as ISO-8859-1, causing non-ASCII characters +/// (Cyrillic, emoji, etc.) to appear as garbage. This parser automatically +/// detects and fixes this encoding issue. /// -/// # Example +/// # JSON Structure /// -/// ```rust,no_run +/// ```json +/// { +/// "participants": [{"name": "Alice"}, {"name": "Bob"}], +/// "messages": [ +/// { +/// "sender_name": "Alice", +/// "timestamp_ms": 1234567890000, +/// "content": "Hello!" +/// } +/// ] +/// } +/// ``` +/// +/// # Examples +/// +/// ```no_run /// use chatpack::parsers::InstagramParser; /// use chatpack::parser::Parser; /// +/// # fn main() -> chatpack::Result<()> { /// let parser = InstagramParser::new(); -/// let messages = parser.parse("instagram_messages.json".as_ref())?; -/// # Ok::<(), chatpack::ChatpackError>(()) +/// let messages = parser.parse("message_1.json".as_ref())?; +/// +/// // Non-ASCII text is automatically fixed +/// for msg in &messages { +/// println!("{}: {}", msg.sender, msg.content); +/// } +/// # Ok(()) +/// # } /// ``` pub struct InstagramParser { config: InstagramConfig, diff --git a/src/parsers/mod.rs b/src/parsers/mod.rs index a5a3681b..a30c8cfc 100644 --- a/src/parsers/mod.rs +++ b/src/parsers/mod.rs @@ -1,35 +1,67 @@ -//! Chat export parsers for various platforms. +//! Platform-specific chat export parsers. //! -//! This module provides parsers for chat exports from different messaging platforms. -//! 
Each parser implements the [`Parser`] trait from [`crate::parser`]. +//! This module provides parser implementations for each supported messaging platform. +//! All parsers implement the [`Parser`] trait, providing a consistent interface. //! //! # Available Parsers //! -//! - [`TelegramParser`] - Parses Telegram JSON exports (requires `telegram` feature) -//! - [`WhatsAppParser`] - Parses WhatsApp TXT exports (requires `whatsapp` feature) -//! - [`InstagramParser`] - Parses Instagram JSON exports (requires `instagram` feature) -//! - [`DiscordParser`] - Parses Discord JSON/TXT/CSV exports (requires `discord` feature) +//! | Parser | Feature | Export Format | Special Handling | +//! |--------|---------|---------------|------------------| +//! | [`TelegramParser`] | `telegram` | JSON | Service messages, forwards | +//! | [`WhatsAppParser`] | `whatsapp` | TXT | Auto-detects 4 date formats | +//! | [`InstagramParser`] | `instagram` | JSON | Fixes Mojibake encoding | +//! | [`DiscordParser`] | `discord` | JSON/TXT/CSV | Attachments, stickers | //! -//! # Usage +//! # Examples //! -//! ```rust,no_run +//! ## Direct Parser Usage +//! +//! ```no_run +//! # #[cfg(feature = "telegram")] +//! # fn main() -> chatpack::Result<()> { +//! use chatpack::parser::Parser; +//! use chatpack::parsers::TelegramParser; +//! +//! let parser = TelegramParser::new(); +//! let messages = parser.parse("result.json".as_ref())?; +//! +//! println!("Parsed {} messages", messages.len()); +//! # Ok(()) +//! # } +//! # #[cfg(not(feature = "telegram"))] +//! # fn main() {} +//! ``` +//! +//! ## Dynamic Parser Selection +//! +//! ```no_run //! # #[cfg(feature = "telegram")] //! # fn main() -> chatpack::Result<()> { //! use chatpack::parser::{Parser, Platform, create_parser}; //! //! let parser = create_parser(Platform::Telegram); -//! let messages = parser.parse("telegram_export.json".as_ref())?; +//! let messages = parser.parse("result.json".as_ref())?; +//! # Ok(()) +//! # } +//! 
# #[cfg(not(feature = "telegram"))] +//! # fn main() {} +//! ``` +//! +//! ## Streaming Large Files //! -//! // Or stream for large files -//! # #[cfg(feature = "streaming")] -//! let parser = chatpack::parser::create_streaming_parser(Platform::Telegram); -//! # #[cfg(feature = "streaming")] +//! ```no_run +//! # #[cfg(all(feature = "telegram", feature = "streaming"))] +//! # fn main() -> chatpack::Result<()> { +//! use chatpack::parser::{Parser, Platform, create_streaming_parser}; +//! +//! let parser = create_streaming_parser(Platform::Telegram); //! for result in parser.stream("large_export.json".as_ref())? { -//! // Process each message +//! let msg = result?; +//! // Process one message at a time //! } //! # Ok(()) //! # } -//! # #[cfg(not(feature = "telegram"))] +//! # #[cfg(not(all(feature = "telegram", feature = "streaming")))] //! # fn main() {} //! ``` diff --git a/src/parsers/telegram.rs b/src/parsers/telegram.rs index c44958b5..db64e443 100644 --- a/src/parsers/telegram.rs +++ b/src/parsers/telegram.rs @@ -1,4 +1,6 @@ //! Telegram JSON export parser. +//! +//! Parses JSON exports from Telegram Desktop's "Export chat history" feature. use std::fs; use std::path::Path; @@ -12,9 +14,21 @@ use crate::parsing::telegram::{TelegramExport, parse_telegram_message}; #[cfg(feature = "streaming")] use crate::streaming::{StreamingConfig, StreamingParser, TelegramStreamingParser}; -/// Parser for Telegram JSON exports. +/// Parser for Telegram Desktop JSON exports. +/// +/// Handles the `result.json` file produced by Telegram Desktop's +/// "Export chat history" feature (Settings > Advanced > Export). 
+/// +/// # Supported Message Types +/// +/// - Text messages (plain and with entities like links, mentions) +/// - Service messages (joins, leaves, pins) +/// - Forwarded messages +/// - Replies (preserves `reply_to` reference) +/// - Edited messages (preserves edit timestamp) +/// +/// # JSON Structure /// -/// Telegram exports chats as JSON with the following structure: /// ```json /// { /// "name": "Chat Name", @@ -24,23 +38,27 @@ use crate::streaming::{StreamingConfig, StreamingParser, TelegramStreamingParser /// "type": "message", /// "date_unixtime": "1234567890", /// "from": "Sender Name", -/// "text": "Hello" | ["Hello", {"type": "link", "text": "url"}], -/// "reply_to_message_id": 12344, -/// "edited_unixtime": "1234567899" +/// "text": "Hello" /// } /// ] /// } /// ``` /// -/// # Example +/// # Examples /// -/// ```rust,no_run +/// ```no_run /// use chatpack::parsers::TelegramParser; /// use chatpack::parser::Parser; /// +/// # fn main() -> chatpack::Result<()> { /// let parser = TelegramParser::new(); -/// let messages = parser.parse("telegram_export.json".as_ref())?; -/// # Ok::<(), chatpack::ChatpackError>(()) +/// let messages = parser.parse("result.json".as_ref())?; +/// +/// for msg in &messages { +/// println!("{}: {}", msg.sender, msg.content); +/// } +/// # Ok(()) +/// # } /// ``` pub struct TelegramParser { config: TelegramConfig, diff --git a/src/parsers/whatsapp.rs b/src/parsers/whatsapp.rs index 44e9db69..0bd180a2 100644 --- a/src/parsers/whatsapp.rs +++ b/src/parsers/whatsapp.rs @@ -1,13 +1,7 @@ -//! `WhatsApp` TXT export parser. +//! WhatsApp TXT export parser. //! -//! `WhatsApp` exports vary by locale. This parser auto-detects the format -//! by analyzing the first 20 lines of the file. -//! -//! Supported formats: -//! - US: `[1/15/24, 10:30:45 AM] Sender: Message` -//! - EU: `[15.01.24, 10:30:45] Sender: Message` -//! - EU2: `15/01/2024, 10:30 - Sender: Message` -//! - RU: `15.01.2024, 10:30 - Sender: Message` +//! 
Parses plain text exports from WhatsApp's "Export Chat" feature. +//! Auto-detects locale-specific date formats. use std::fs; use std::path::Path; @@ -27,15 +21,38 @@ use crate::streaming::{StreamingConfig, StreamingParser, WhatsAppStreamingParser /// Parser for WhatsApp TXT exports. /// -/// # Example +/// Handles plain text chat exports from WhatsApp on iOS and Android. +/// The format varies by locale; this parser auto-detects the format +/// by analyzing the first few lines. +/// +/// # Supported Date Formats +/// +/// | Format | Example | Region | +/// |--------|---------|--------| +/// | US | `[1/15/24, 10:30:45 AM] Sender: Message` | United States | +/// | EU | `[15.01.24, 10:30:45] Sender: Message` | Europe | +/// | EU2 | `15/01/2024, 10:30 - Sender: Message` | Europe (alt) | +/// | RU | `15.01.2024, 10:30 - Sender: Message` | Russia | +/// +/// # Handling /// -/// ```rust,no_run +/// - Multiline messages are properly joined +/// - System messages (joins, leaves) are filtered out +/// - Attachments are represented as `[Attachment]` placeholders +/// +/// # Examples +/// +/// ```no_run /// use chatpack::parsers::WhatsAppParser; /// use chatpack::parser::Parser; /// +/// # fn main() -> chatpack::Result<()> { /// let parser = WhatsAppParser::new(); -/// let messages = parser.parse("whatsapp_chat.txt".as_ref())?; -/// # Ok::<(), chatpack::ChatpackError>(()) +/// let messages = parser.parse("_chat.txt".as_ref())?; +/// +/// println!("Parsed {} messages", messages.len()); +/// # Ok(()) +/// # } /// ``` pub struct WhatsAppParser { config: WhatsAppConfig, diff --git a/src/streaming/mod.rs b/src/streaming/mod.rs index 90b8305b..a3189bec 100644 --- a/src/streaming/mod.rs +++ b/src/streaming/mod.rs @@ -1,57 +1,87 @@ -//! Streaming parsers for memory-efficient processing of large chat exports. +//! Memory-efficient streaming parsers for large chat exports. //! -//! This module provides streaming alternatives to the standard parsers, -//! 
designed for files >1GB where loading everything into memory is impractical. +//! This module provides streaming alternatives to standard parsers, designed +//! for files that are too large to fit in memory (>500MB). //! -//! # Architecture +//! # When to Use Streaming //! -//! The streaming API is built around two core traits: -//! - [`StreamingParser`] - produces an iterator of messages -//! - [`MessageIterator`] - the actual iterator implementation +//! | File Size | Recommendation | +//! |-----------|----------------| +//! | < 100MB | Standard parser (faster) | +//! | 100-500MB | Either works | +//! | > 500MB | Streaming parser (required) | //! -//! # Example +//! # Memory Comparison //! -//! ```rust,no_run +//! | Approach | 1GB File | 10GB File | +//! |----------|----------|-----------| +//! | Standard parser | ~3GB RAM | ~30GB RAM | +//! | Streaming parser | ~50MB RAM | ~50MB RAM | +//! +//! # Core Types +//! +//! | Type | Description | +//! |------|-------------| +//! | [`StreamingParser`] | Trait for parsers that produce message iterators | +//! | [`MessageIterator`] | Iterator over messages with progress tracking | +//! | [`StreamingConfig`] | Configuration for buffer sizes and behavior | +//! +//! # Examples +//! +//! ## Basic Usage +//! +//! ```no_run //! # #[cfg(feature = "telegram")] -//! # fn main() -> Result<(), Box> { +//! # fn main() -> chatpack::Result<()> { //! use chatpack::streaming::{StreamingParser, TelegramStreamingParser}; -//! use chatpack::Message; //! //! let parser = TelegramStreamingParser::new(); //! -//! // Process messages one at a time, never loading all into memory -//! for result in parser.stream("large_export.json").unwrap() { -//! match result { -//! Ok(message) => println!("{}: {}", message.sender, message.content), -//! Err(e) => eprintln!("Skipped invalid message: {}", e), -//! } +//! for result in parser.stream("large_export.json")? { +//! let msg = result?; +//! println!("{}: {}", msg.sender, msg.content); //! } -//! -//! 
// Or collect with error handling -//! let messages: Vec = parser -//! .stream("large_export.json") -//! .unwrap() -//! .filter_map(Result::ok) -//! .collect(); //! # Ok(()) //! # } //! # #[cfg(not(feature = "telegram"))] //! # fn main() {} //! ``` //! -//! # Memory Usage +//! ## With Progress Tracking //! -//! | Approach | 1GB File | 10GB File | -//! |----------|----------|-----------| -//! | Standard parser | ~3GB RAM | ~30GB RAM | -//! | Streaming parser | ~50MB RAM | ~50MB RAM | +//! ```no_run +//! # #[cfg(feature = "telegram")] +//! # fn main() -> chatpack::Result<()> { +//! use chatpack::streaming::{StreamingParser, TelegramStreamingParser}; +//! +//! let parser = TelegramStreamingParser::new(); +//! let mut iter = parser.stream("large_export.json")?; +//! let mut count = 0; +//! +//! while let Some(result) = iter.next() { +//! let _msg = result?; +//! count += 1; +//! +//! if count % 100_000 == 0 { +//! if let Some(progress) = iter.progress() { +//! println!("{:.1}% complete", progress); +//! } +//! } +//! } +//! # Ok(()) +//! # } +//! # #[cfg(not(feature = "telegram"))] +//! # fn main() {} +//! ``` //! -//! # Supported Formats +//! # Available Parsers //! -//! - Telegram JSON (via [`TelegramStreamingParser`]) - requires `telegram` feature -//! - Discord JSONL/JSON (via [`DiscordStreamingParser`]) - requires `discord` feature -//! - Instagram JSON (via [`InstagramStreamingParser`]) - requires `instagram` feature -//! - WhatsApp TXT (via [`WhatsAppStreamingParser`]) - requires `whatsapp` feature +//! | Parser | Feature | Format | +//! |--------|---------|--------| +//! | [`TelegramStreamingParser`] | `telegram` | JSON | +//! | [`WhatsAppStreamingParser`] | `whatsapp` | TXT | +//! | [`InstagramStreamingParser`] | `instagram` | JSON | +//! 
| [`DiscordStreamingParser`] | `discord` | JSON/JSONL/CSV | #[cfg(feature = "discord")] mod discord; diff --git a/src/streaming/traits.rs b/src/streaming/traits.rs index 03aed2c7..e3a4a04c 100644 --- a/src/streaming/traits.rs +++ b/src/streaming/traits.rs @@ -1,18 +1,51 @@ //! Core traits for streaming parsers. +//! +//! This module defines the trait hierarchy for memory-efficient streaming: +//! - [`MessageIterator`] - Iterator with progress tracking +//! - [`StreamingParser`] - Parser that produces iterators +//! - [`StreamingConfig`] - Configuration options use crate::Message; use crate::error::ChatpackError; use super::StreamingResult; -/// Iterator over messages from a streaming parser. +/// Iterator over messages from a streaming parser with progress tracking. /// -/// This trait is object-safe and allows for dynamic dispatch, -/// enabling runtime selection of streaming parsers. +/// Extends the standard [`Iterator`] trait with methods for monitoring +/// parsing progress, useful for progress bars and logging. +/// +/// # Object Safety +/// +/// This trait is object-safe, enabling dynamic dispatch via `Box`. +/// +/// # Examples +/// +/// ```no_run +/// # #[cfg(feature = "telegram")] +/// # fn main() -> chatpack::Result<()> { +/// use chatpack::streaming::{StreamingParser, TelegramStreamingParser, MessageIterator}; +/// +/// let parser = TelegramStreamingParser::new(); +/// let mut iter = parser.stream("export.json")?; +/// +/// while let Some(result) = iter.next() { +/// let msg = result?; +/// +/// // Check progress periodically +/// if let Some(pct) = iter.progress() { +/// eprintln!("\r{:.1}%", pct); +/// } +/// } +/// # Ok(()) +/// # } +/// # #[cfg(not(feature = "telegram"))] +/// # fn main() {} +/// ``` pub trait MessageIterator: Iterator> + Send { - /// Returns approximate progress as a percentage (0.0 - 100.0). + /// Returns approximate progress as a percentage (0.0 to 100.0). /// - /// Returns `None` if progress cannot be determined. 
+ /// Returns `None` if progress cannot be determined (e.g., unknown file size). fn progress(&self) -> Option { None } @@ -26,54 +59,93 @@ pub trait MessageIterator: Iterator> + Send { } } -/// A parser that can stream messages from large files. +/// A parser that streams messages from files without loading everything into memory. /// -/// Unlike [`ChatParser`], which loads everything into memory, -/// `StreamingParser` produces an iterator that yields messages one at a time. +/// Unlike standard parsers that load the entire file, `StreamingParser` produces +/// an iterator that yields messages one at a time, enabling processing of +/// arbitrarily large files with constant memory usage. /// -/// # Implementation Notes +/// # Implementation Guidelines /// /// Implementors should: -/// - Use buffered I/O with reasonable buffer sizes (64KB - 1MB) -/// - Handle malformed records gracefully (skip and continue) -/// - Provide progress reporting when possible +/// - Use buffered I/O (64KB - 1MB buffers) +/// - Skip malformed records gracefully +/// - Track bytes processed for progress reporting +/// +/// # Examples +/// +/// ```no_run +/// # #[cfg(feature = "telegram")] +/// # fn main() -> chatpack::Result<()> { +/// use chatpack::streaming::{StreamingParser, TelegramStreamingParser}; /// -/// [`ChatParser`]: crate::parsers::ChatParser +/// let parser = TelegramStreamingParser::new(); +/// let messages: Vec<_> = parser +/// .stream("export.json")? +/// .filter_map(Result::ok) +/// .collect(); +/// # Ok(()) +/// # } +/// # #[cfg(not(feature = "telegram"))] +/// # fn main() {} +/// ``` pub trait StreamingParser: Send + Sync { - /// Returns the name of this parser. + /// Returns the human-readable name of this parser. fn name(&self) -> &'static str; /// Opens a file and returns an iterator over messages. /// /// # Errors /// - /// Returns an error if the file cannot be opened or has invalid format. + /// Returns [`ChatpackError::Io`] if the file cannot be opened. 
fn stream(&self, file_path: &str) -> Result<Box<dyn MessageIterator>, ChatpackError>; /// Returns the recommended buffer size for this parser. + /// + /// Default: 64KB fn recommended_buffer_size(&self) -> usize { 64 * 1024 // 64KB default } - /// Returns true if this parser supports progress reporting. + /// Returns `true` if this parser supports progress reporting. fn supports_progress(&self) -> bool { true } } -/// Configuration for streaming parsers. +/// Configuration options for streaming parsers. +/// +/// Controls buffer sizes, error handling, and progress reporting behavior. +/// +/// # Examples +/// +/// ``` +/// use chatpack::streaming::StreamingConfig; +/// +/// let config = StreamingConfig::new() +/// .with_buffer_size(128 * 1024) // 128KB buffer +/// .with_skip_invalid(false); // Return errors instead of skipping +/// ``` #[derive(Debug, Clone, Copy)] pub struct StreamingConfig { - /// Buffer size for reading (default: 64KB) + /// Buffer size for file reading. + /// + /// Default: 64KB. Larger buffers improve throughput but use more memory. pub buffer_size: usize, - /// Maximum size of a single message in bytes (default: 10MB) + /// Maximum size of a single message in bytes. + /// + /// Default: 10MB. Messages exceeding this are skipped or produce an error. pub max_message_size: usize, - /// Whether to skip invalid messages or return errors (default: skip) + /// Whether to skip invalid messages or return errors. + /// + /// Default: `true` (skip). Set to `false` for strict validation. pub skip_invalid: bool, - /// Report progress every N messages (default: 10000) + /// Report progress every N messages. + /// + /// Default: 10,000. Lower values provide more frequent updates. pub progress_interval: usize, }