From ebb45025a548a74d614f6a219e0deadc140f717c Mon Sep 17 00:00:00 2001 From: Claude Date: Sat, 27 Dec 2025 13:22:42 +0000 Subject: [PATCH] refactor: deduplicate parser logic, add async support, update API Major changes: - Extract shared parsing utilities into src/parsing/ module (DRY) - telegram.rs: extract_telegram_text, parse_telegram_message - instagram.rs: fix_mojibake_encoding, parse_instagram_message - whatsapp.rs: DateFormat, detect_whatsapp_format, is_whatsapp_system_message - discord.rs: DiscordRawMessage, parse_discord_message - Remove deprecated InternalMessage alias, use Message everywhere - Add async feature with tokio-based AsyncParser trait - Add AsyncTelegramParser for async/await parsing - Update README for library-only usage with new examples - Update all tests and examples to use Message instead of InternalMessage Breaking changes: - InternalMessage is removed, use Message instead - core::models no longer exports InternalMessage --- Cargo.lock | 65 +++++++ Cargo.toml | 8 + README.md | 289 ++++++++++++++++-------------- benches/parsing.rs | 12 +- examples/rag_integration.rs | 16 +- src/async_parser/mod.rs | 79 +++++++++ src/async_parser/telegram.rs | 111 ++++++++++++ src/core/mod.rs | 6 +- src/core/models.rs | 17 +- src/core/processor.rs | 34 ++-- src/lib.rs | 17 +- src/parsers/instagram.rs | 63 +------ src/parsers/telegram.rs | 98 +---------- src/parsers/whatsapp.rs | 275 +++++------------------------ src/parsing/discord.rs | 279 +++++++++++++++++++++++++++++ src/parsing/instagram.rs | 190 ++++++++++++++++++++ src/parsing/mod.rs | 32 ++++ src/parsing/telegram.rs | 225 ++++++++++++++++++++++++ src/parsing/whatsapp.rs | 328 +++++++++++++++++++++++++++++++++++ src/streaming/instagram.rs | 100 +---------- src/streaming/telegram.rs | 87 +--------- src/streaming/whatsapp.rs | 187 ++------------------ tests/integration.rs | 28 ++- tests/output_tests.rs | 38 ++-- tests/proptest.rs | 30 ++-- 25 files changed, 1652 insertions(+), 962 deletions(-) create mode 100644 src/async_parser/mod.rs create mode 100644 src/async_parser/telegram.rs create mode 100644 src/parsing/discord.rs create mode 100644 src/parsing/instagram.rs create mode 100644 src/parsing/mod.rs create mode 100644 src/parsing/telegram.rs create mode 100644 src/parsing/whatsapp.rs diff --git a/Cargo.lock b/Cargo.lock index 372ff101..c5b4c690 100644 --- a/Cargo.lock +++ b/Cargo.lock @@ -32,6 +32,17 @@ version = "1.0.13" source = "registry+https://github.com/rust-lang/crates.io-index" checksum = "5192cca8006f1fd4f7237516f40fa183bb07f8fbdfedaa0036de5ea9b0b45e78" +[[package]] +name = "async-trait" +version = "0.1.89" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "9035ad2d096bed7955a320ee7e2230574d28fd3c3a0f186cbea1ff3c7eed5dbb" +dependencies = [ + "proc-macro2", + "quote", + "syn", +] + [[package]] name = "autocfg" version = "1.5.0" @@ -65,6 +76,12 @@ version = "3.19.1" source = "registry+https://github.com/rust-lang/crates.io-index" checksum = "5dd9dc738b7a8311c7ade152424974d8115f2cdad61e8dab8dac9f2362298510" +[[package]] +name = "bytes" +version = "1.11.0" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "b35204fbdc0b3f4446b89fc1ac2cf84a8a68971995d0bf2e925ec7cd960f9cb3" + [[package]] name = "cargo-husky" version = "1.5.0" @@ -97,6 +114,7 @@ checksum = "9330f8b2ff13f34540b44e946ef35111825727b38d33286ef986142615121801" name = "chatpack" version = "0.5.0" dependencies = [ + "async-trait", "cargo-husky", "chrono", "criterion", @@ -107,6 +125,8 @@ dependencies = [ "serde_json", "tempfile", "thiserror", + "tokio", + "tokio-stream", ] [[package]] @@ -303,6 +323,12 @@ version = "1.0.7" source = "registry+https://github.com/rust-lang/crates.io-index" checksum = "3f9eec918d3f24069decb9af1554cad7c880e2da24a9afd88aca000531ab82c1" +[[package]] +name = "futures-core" +version = "0.3.31" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "05f29059c0c2090612e8d742178b0580d2dc940c837851ad723096f87af6663e" + [[package]] name = "getrandom" version = "0.3.4" @@ -437,6 +463,12 @@ version = "11.1.5" source = "registry+https://github.com/rust-lang/crates.io-index" checksum = "d6790f58c7ff633d8771f42965289203411a5e5c68388703c06e14f24770b41e" +[[package]] +name = "pin-project-lite" +version = "0.2.16" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "3b3cff922bd51709b605d9ead9aa71031d81447142d828eb4a6eba76fe619f9b" + [[package]] name = "plotters" version = "0.3.7" @@ -759,6 +791,39 @@ dependencies = [ "serde_json", ] +[[package]] +name = "tokio" +version = "1.48.0" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "ff360e02eab121e0bc37a2d3b4d4dc622e6eda3a8e5253d5435ecf5bd4c68408" +dependencies = [ + "bytes", + "pin-project-lite", + "tokio-macros", +] + +[[package]] +name = "tokio-macros" +version = "2.6.0" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "af407857209536a95c8e56f8231ef2c2e2aff839b22e07a1ffcbc617e9db9fa5" +dependencies = [ + "proc-macro2", + "quote", + "syn", +] + +[[package]] +name = "tokio-stream" +version = "0.1.17" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "eca58d7bba4a75707817a2c44174253f9236b2d5fbd055602e9d5c07c139a047" +dependencies = [ + "futures-core", + "pin-project-lite", + "tokio", +] + [[package]] name = "unarray" version = "0.1.4" diff --git a/Cargo.toml b/Cargo.toml index e02dbec6..b7fad534 100644 --- a/Cargo.toml +++ b/Cargo.toml @@ -30,6 +30,9 @@ json-output = ["dep:serde_json"] # Streaming support (no extra deps, but gates streaming module) streaming = [] +# Async support (enables tokio-based async parsers) +async = ["dep:tokio", "dep:tokio-stream", "dep:async-trait"] + [dependencies] # Core dependencies (always required) chrono = { version = "0.4", features = ["serde"] } @@ -41,6 +44,11 @@ regex = { version = "1.11", optional = true } serde_json = { version = "1.0", optional = true } csv = { version = "1.3", optional = true } +# Async dependencies +tokio = { version = "1", features = ["fs", "io-util", "rt", "macros"], optional = true } +tokio-stream = { version = "0.1", optional = true } +async-trait = { version = "0.1", optional = true } + [lib] name = "chatpack" path = "src/lib.rs" diff --git a/README.md b/README.md index 33ae7cdd..d22a59e8 100644 --- a/README.md +++ b/README.md @@ -13,8 +13,6 @@ [![Downloads](https://img.shields.io/crates/d/chatpack.svg)](https://crates.io/crates/chatpack) [![License: MIT](https://img.shields.io/badge/License-MIT-yellow.svg)](https://opensource.org/licenses/MIT) -**Platforms:** Windows • macOS • Linux - ## The Problem You want to ask Claude/ChatGPT about your conversations, but: @@ -27,7 +25,7 @@ You want to ask Claude/ChatGPT about your conversations, but: ``` ┌─────────────────┐ ┌──────────┐ ┌─────────────────┐ │ Telegram JSON │ │ │ │ Clean CSV │ -│ WhatsApp TXT │ ──▶│ chatpack │ ──▶ │ Ready for LLM │ +│ WhatsApp TXT │ ──▶ │ chatpack │ ──▶ │ Ready for LLM │ │ Instagram JSON │ │ │ │ 13x less tokens │ │ Discord Export │ │ │ │ │ └─────────────────┘ └──────────┘ └─────────────────┘ @@ -43,26 +41,6 @@ You want to ask Claude/ChatGPT about your conversations, but: > 💡 **Use CSV for maximum token savings.** JSONL is good for RAG pipelines. JSON keeps full structure but wastes tokens. -## Use Cases - -### 💬 Chat with your chat history -```bash -chatpack tg telegram_export.json -o context.txt -# Paste into ChatGPT: "Based on this conversation, what did we decide about...?" -``` - -### 🔍 Build RAG pipeline -```bash -chatpack tg chat.json -f jsonl -t -o dataset.jsonl -# Each line = one document with timestamp for vector DB -``` - -### 📊 Analyze conversations -```bash -chatpack wa chat.txt --from "Alice" --after 2024-01-01 -f json -# Filter and export specific messages -``` - ## Features - 🚀 **Fast** — 1.6M+ messages/sec (full pipeline) @@ -70,126 +48,163 @@ chatpack wa chat.txt --from "Alice" --after 2024-01-01 -f json - 🔀 **Smart merge** — Consecutive messages from same sender → one entry - 🎯 **Filters** — By date, by sender - 📄 **Formats** — CSV (13x compression), JSON, JSONL (for RAG) -- 📚 **Library** — Use as Rust crate in your projects +- 📡 **Streaming** — O(1) memory for large files +- ⚡ **Async** — Tokio-based async parsers (optional) ## Installation -### Pre-built binaries - -| Platform | Download | -|----------|----------| -| Windows | [chatpack-windows-x64.exe](https://github.com/berektassuly/chatpack/releases/latest/download/chatpack-windows-x64.exe) | -| macOS (Intel) | [chatpack-macos-x64](https://github.com/berektassuly/chatpack/releases/latest/download/chatpack-macos-x64) | -| macOS (Apple Silicon) | [chatpack-macos-arm64](https://github.com/berektassuly/chatpack/releases/latest/download/chatpack-macos-arm64) | -| Linux | [chatpack-linux-x64](https://github.com/berektassuly/chatpack/releases/latest/download/chatpack-linux-x64) | - -### Via Cargo - -```bash -cargo install chatpack -``` - -### As a library +Add to your `Cargo.toml`: ```toml [dependencies] -chatpack = "0.2" +chatpack = "0.5" ``` -## Quick Start (CLI) +### Feature Flags -```bash -# Telegram -chatpack tg result.json +| Feature | Description | Default | +|---------|-------------|---------| +| `full` | All parsers + all output formats | ✅ | +| `telegram` | Telegram JSON parser | ✅ | +| `whatsapp` | WhatsApp TXT parser | ✅ | +| `instagram` | Instagram JSON parser | ✅ | +| `discord` | Discord JSON/TXT/CSV parser | ✅ | +| `csv-output` | CSV output format | ✅ | +| `json-output` | JSON/JSONL output formats | ✅ | +| `streaming` | Streaming parsers for large files | ✅ | +| `async` | Async/await support with tokio | ❌ | -# WhatsApp -chatpack wa chat.txt +Enable only what you need: -# Instagram -chatpack ig message_1.json +```toml +# Minimal: just Telegram parser with CSV output +chatpack = { version = "0.5", default-features = false, features = ["telegram", "csv-output"] } -# Discord -chatpack dc chat.json +# With async support +chatpack = { version = "0.5", features = ["async"] } ``` -**Output:** `optimized_chat.csv` — ready to paste into ChatGPT/Claude. - -## Library Usage +## Quick Start ### Basic example ```rust use chatpack::prelude::*; +use chatpack::parser::{Platform, create_parser}; fn main() -> Result<(), Box> { // Parse a Telegram export - let parser = create_parser(Source::Telegram); - let messages = parser.parse("telegram_export.json")?; + let parser = create_parser(Platform::Telegram); + let messages = parser.parse("telegram_export.json".as_ref())?; // Merge consecutive messages from the same sender let merged = merge_consecutive(messages); - // Write to JSON - write_json(&merged, "output.json", &OutputConfig::new())?; + // Write to CSV (13x compression) + write_csv(&merged, "output.csv", &OutputConfig::new())?; + println!("Processed {} messages", merged.len()); Ok(()) } ``` -### Auto-detect format +### Filter messages ```rust -use chatpack::parsers::parse_auto; +use chatpack::prelude::*; +use chatpack::parser::{Platform, create_parser}; -// Automatically detects Telegram, WhatsApp, or Instagram -let messages = parse_auto("unknown_chat.json")?; +fn main() -> Result<()> { + let parser = create_parser(Platform::Telegram); + let messages = parser.parse("chat.json".as_ref())?; + + // Filter by sender + let config = FilterConfig::new().with_user("Alice".to_string()); + let alice_only = apply_filters(messages.clone(), &config); + + // Filter by date range + let config = FilterConfig::new() + .after_date("2024-01-01")? + .before_date("2024-06-01")?; + let filtered = apply_filters(messages, &config); + + Ok(()) +} ``` -### Filter messages +### Output formats ```rust use chatpack::prelude::*; -let parser = create_parser(Source::Telegram); -let messages = parser.parse("chat.json")?; +fn main() -> Result<()> { + let messages = vec![ + Message::new("Alice", "Hello!"), + Message::new("Bob", "Hi there!"), + ]; + + // Minimal output (sender + content only) + let config = OutputConfig::new(); + + // Full metadata (timestamps, IDs, replies, edits) + let config = OutputConfig::all(); -// Filter by sender -let config = FilterConfig::new() - .with_user("Alice".to_string()); -let alice_only = apply_filters(messages.clone(), &config); + // Custom selection + let config = OutputConfig::new() + .with_timestamps() + .with_ids(); -// Filter by date range -let config = FilterConfig::new() - .after_date("2024-01-01")? - .before_date("2024-06-01")?; -let filtered = apply_filters(messages, &config); + // Write to different formats + write_json(&messages, "output.json", &config)?; + write_jsonl(&messages, "output.jsonl", &config)?; + write_csv(&messages, "output.csv", &config)?; + + Ok(()) +} ``` -### Output formats +### Streaming large files + +For files that don't fit in memory, use streaming parsers: ```rust -use chatpack::prelude::*; +use chatpack::streaming::{StreamingParser, TelegramStreamingParser, StreamingConfig}; + +fn main() -> Result<(), Box> { + let config = StreamingConfig::new() + .with_buffer_size(128 * 1024) // 128KB buffer + .with_skip_invalid(true); -let messages = vec![ - InternalMessage::new("Alice", "Hello!"), - InternalMessage::new("Bob", "Hi there!"), -]; + let parser = TelegramStreamingParser::with_config(config); -// Minimal output (sender + content only) -let config = OutputConfig::new(); + // Process messages one at a time - O(1) memory! + for result in parser.stream("huge_export.json")? { + match result { + Ok(msg) => println!("{}: {}", msg.sender, msg.content), + Err(e) => eprintln!("Error: {}", e), + } + } -// Full metadata (timestamps, IDs, replies, edits) -let config = OutputConfig::all(); + Ok(()) +} +``` -// Custom selection -let config = OutputConfig::new() - .with_timestamps() - .with_ids(); +### Async parsing (requires `async` feature) + +```rust +use chatpack::async_parser::{AsyncParser, AsyncTelegramParser}; -// Write to different formats -write_json(&messages, "output.json", &config)?; -write_jsonl(&messages, "output.jsonl", &config)?; -write_csv(&messages, "output.csv", &config)?; +#[tokio::main] +async fn main() -> Result<(), Box> { + let parser = AsyncTelegramParser::new(); + let messages = parser.parse("telegram_export.json").await?; + + for msg in messages { + println!("{}: {}", msg.sender, msg.content); + } + + Ok(()) +} ``` ### Processing statistics @@ -197,50 +212,52 @@ write_csv(&messages, "output.csv", &config)?; ```rust use chatpack::prelude::*; -let original_count = messages.len(); -let merged = merge_consecutive(messages); +fn main() { + let messages = vec![ + Message::new("Alice", "Hi"), + Message::new("Alice", "How are you?"), + Message::new("Bob", "Good!"), + ]; -let stats = ProcessingStats::new(original_count, merged.len()); -println!("Compression: {:.1}%", stats.compression_ratio()); -println!("Messages saved: {}", stats.messages_saved()); + let original_count = messages.len(); + let merged = merge_consecutive(messages); + + let stats = ProcessingStats::new(original_count, merged.len()); + println!("Compression: {:.1}%", stats.compression_ratio()); + println!("Messages saved: {}", stats.messages_saved()); +} ``` -📚 **Full API documentation:** [docs.rs/chatpack](https://docs.rs/chatpack) +## API Overview -## CLI Reference - -```bash -# Output formats -chatpack tg chat.json -f csv # 13x compression (default) -chatpack tg chat.json -f json # Structured array -chatpack tg chat.json -f jsonl # One JSON per line (for RAG) - -# Filters -chatpack tg chat.json --after 2024-01-01 -chatpack tg chat.json --before 2024-06-01 -chatpack tg chat.json --from "Alice" - -# Metadata -chatpack tg chat.json -t # Add timestamps -chatpack tg chat.json -r # Add reply references -chatpack tg chat.json -e # Add edit timestamps -chatpack tg chat.json --ids # Add message IDs -chatpack tg chat.json -t -r -e --ids # All metadata - -# Other options -chatpack tg chat.json --no-merge # Don't merge consecutive messages -chatpack tg chat.json -o out.csv # Custom output path -``` +### Core Types -## Documentation +| Type | Description | +|------|-------------| +| `Message` | Universal message representation with optional metadata | +| `OutputConfig` | Controls which fields are included in output | +| `FilterConfig` | Filters by date range and/or sender | +| `ProcessingStats` | Statistics about compression and merging | -| Guide | Description | -|-------|-------------| -| 📤 [Export Guide](docs/EXPORT_GUIDE.md) | How to export from Telegram, WhatsApp, Instagram, Discord | -| 📖 [Usage Guide](docs/USAGE.md) | All commands, flags, filters, formats | -| 📊 [Benchmarks](docs/BENCHMARKS.md) | Performance stats and compression metrics | -| 🧪 [Stress Testing](docs/STRESS_TEST.md) | Generate toxic data and run stress tests | -| 📚 [API Docs](https://docs.rs/chatpack) | Full library documentation | +### Parsers + +| Parser | Platform | Format | +|--------|----------|--------| +| `TelegramParser` | Telegram | JSON export | +| `WhatsAppParser` | WhatsApp | TXT export (auto-detects locale) | +| `InstagramParser` | Instagram | JSON (with Mojibake fix) | +| `DiscordParser` | Discord | JSON/TXT/CSV via DiscordChatExporter | + +### Streaming Parsers + +| Parser | Memory | Use Case | +|--------|--------|----------| +| `TelegramStreamingParser` | O(1) | Files > 1GB | +| `InstagramStreamingParser` | O(1) | Files > 1GB | +| `DiscordStreamingParser` | O(1) | Files > 1GB | +| `WhatsAppStreamingParser` | O(1) | Files > 1GB | + +📚 **Full API documentation:** [docs.rs/chatpack](https://docs.rs/chatpack) ## Supported Platforms @@ -261,10 +278,18 @@ chatpack tg chat.json -o out.csv # Custom output path | Parsing (Discord) | 1.5-1.8 M messages/sec | | Operations (merge/filter) | 11-14 M messages/sec | | CSV compression | 13x (92% token reduction) | -| Tested file size | 500MB+ | +| Streaming memory | ~50MB for 10GB file | > Run `cargo bench --bench parsing` to reproduce benchmarks. +## Documentation + +| Guide | Description | +|-------|-------------| +| 📤 [Export Guide](docs/EXPORT_GUIDE.md) | How to export from Telegram, WhatsApp, Instagram, Discord | +| 📊 [Benchmarks](docs/BENCHMARKS.md) | Performance stats and compression metrics | +| 📚 [API Docs](https://docs.rs/chatpack) | Full library documentation | + ## License -[MIT](LICENSE) © [Mukhammedali Berektassuly](https://berektassuly.com) \ No newline at end of file +[MIT](LICENSE) © [Mukhammedali Berektassuly](https://berektassuly.com) diff --git a/benches/parsing.rs b/benches/parsing.rs index a0c10341..b8f6b657 100644 --- a/benches/parsing.rs +++ b/benches/parsing.rs @@ -6,9 +6,7 @@ use criterion::{BenchmarkId, Criterion, Throughput, black_box, criterion_group, criterion_main}; use chatpack::core::output::{to_csv, to_json, to_jsonl}; -use chatpack::core::{ - FilterConfig, InternalMessage, OutputConfig, apply_filters, merge_consecutive, -}; +use chatpack::core::{FilterConfig, Message, OutputConfig, apply_filters, merge_consecutive}; use chatpack::parser::Parser; use chatpack::parsers::{DiscordParser, InstagramParser, TelegramParser, WhatsAppParser}; @@ -81,7 +79,7 @@ fn generate_discord_json(count: usize) -> String { ) } -fn generate_messages(count: usize) -> Vec { +fn generate_messages(count: usize) -> Vec { let base_time = Utc.with_ymd_and_hms(2024, 1, 15, 12, 0, 0).unwrap(); (0..count) .map(|i| { @@ -91,7 +89,7 @@ fn generate_messages(count: usize) -> Vec { "Bob".to_string() }; let ts = base_time + Duration::minutes(i as i64); - InternalMessage::with_metadata( + Message::with_metadata( sender, format!("Message number {}", i), Some(ts), @@ -226,10 +224,10 @@ fn bench_filter_by_date(c: &mut Criterion) { let base_time = Utc.with_ymd_and_hms(2024, 1, 15, 12, 0, 0).unwrap(); for size in [100_usize, 1_000, 10_000, 100_000] { - let messages: Vec = (0..size) + let messages: Vec = (0..size) .map(|i| { let ts = base_time - Duration::hours(i as i64); - InternalMessage::with_metadata( + Message::with_metadata( "Alice".to_string(), format!("Message {}", i), Some(ts), diff --git a/examples/rag_integration.rs b/examples/rag_integration.rs index 622e48fd..899381f7 100644 --- a/examples/rag_integration.rs +++ b/examples/rag_integration.rs @@ -18,7 +18,7 @@ struct ChatChunk { } /// Example: Time-window based chunking strategy -fn chunk_by_time_window(messages: &[InternalMessage], window_minutes: i64) -> Vec { +fn chunk_by_time_window(messages: &[Message], window_minutes: i64) -> Vec { let mut chunks = Vec::new(); let mut current_chunk: Option = None; @@ -78,23 +78,23 @@ fn main() { let base_time = Utc.with_ymd_and_hms(2024, 6, 15, 10, 0, 0).unwrap(); let messages = vec![ - InternalMessage::new("Alice", "Hey, have you seen the new project specs?") + Message::new("Alice", "Hey, have you seen the new project specs?") .with_timestamp(base_time) .with_id(1), - InternalMessage::new("Bob", "Yes! They look good") + Message::new("Bob", "Yes! They look good") .with_timestamp(base_time + chrono::Duration::minutes(1)) .with_id(2), - InternalMessage::new("Bob", "But I have some concerns about the timeline") + Message::new("Bob", "But I have some concerns about the timeline") .with_timestamp(base_time + chrono::Duration::minutes(2)) .with_id(3), - InternalMessage::new("Alice", "Let's discuss in the meeting") + Message::new("Alice", "Let's discuss in the meeting") .with_timestamp(base_time + chrono::Duration::minutes(3)) .with_id(4), // Gap of 30 minutes - InternalMessage::new("Alice", "Meeting notes: decided to extend deadline") + Message::new("Alice", "Meeting notes: decided to extend deadline") .with_timestamp(base_time + chrono::Duration::minutes(35)) .with_id(5), - InternalMessage::new("Bob", "Great, I'll update the schedule") + Message::new("Bob", "Great, I'll update the schedule") .with_timestamp(base_time + chrono::Duration::minutes(36)) .with_id(6), ]; @@ -139,7 +139,7 @@ fn main() { println!("\n=== Integration complete! ==="); println!("\nKey chatpack types used:"); - println!(" - InternalMessage: Universal message format"); + println!(" - Message: Universal message format"); println!(" - create_parser(Source): Get appropriate parser"); println!(" - merge_consecutive(): Reduce message count"); println!(" - apply_filters(): Filter by date/sender"); diff --git a/src/async_parser/mod.rs b/src/async_parser/mod.rs new file mode 100644 index 00000000..bb812825 --- /dev/null +++ b/src/async_parser/mod.rs @@ -0,0 +1,79 @@ +//! Async parser support for chatpack. +//! +//! This module provides async/await-based parsers for use with tokio. +//! +//! # Example +//! +//! ```rust,no_run +//! use chatpack::async_parser::{AsyncParser, AsyncTelegramParser}; +//! +//! # async fn example() -> Result<(), chatpack::ChatpackError> { +//! let parser = AsyncTelegramParser::new(); +//! let messages = parser.parse("telegram_export.json").await?; +//! +//! for msg in messages { +//! println!("{}: {}", msg.sender, msg.content); +//! } +//! # Ok(()) +//! # } +//! ``` +//! +//! # Features +//! +//! This module requires the `async` feature to be enabled: +//! +//! ```toml +//! [dependencies] +//! chatpack = { version = "0.5", features = ["async", "telegram"] } +//! ``` + +use std::path::Path; + +use async_trait::async_trait; +use tokio::fs; + +use crate::Message; +use crate::error::ChatpackError; + +mod telegram; + +#[cfg(feature = "telegram")] +pub use telegram::AsyncTelegramParser; + +/// Trait for async parsers. +/// +/// This is the async equivalent of the synchronous `Parser` trait. +/// It allows parsing files asynchronously using tokio. +/// +/// # Example +/// +/// ```rust,no_run +/// use chatpack::async_parser::{AsyncParser, AsyncTelegramParser}; +/// +/// # async fn example() -> Result<(), chatpack::ChatpackError> { +/// let parser = AsyncTelegramParser::new(); +/// let messages = parser.parse("export.json").await?; +/// # Ok(()) +/// # } +/// ``` +#[async_trait] +pub trait AsyncParser: Send + Sync { + /// Returns the name of the parser. + fn name(&self) -> &'static str; + + /// Parses a file asynchronously. + /// + /// Reads the file using tokio's async I/O and parses its contents. + async fn parse(&self, path: impl AsRef + Send) -> Result, ChatpackError>; + + /// Parses content from a string. + /// + /// This is useful when you already have the content in memory. + fn parse_str(&self, content: &str) -> Result, ChatpackError>; +} + +/// Helper function to read a file asynchronously. +pub(crate) async fn read_file_async(path: impl AsRef) -> Result { + let content = fs::read_to_string(path).await?; + Ok(content) +} diff --git a/src/async_parser/telegram.rs b/src/async_parser/telegram.rs new file mode 100644 index 00000000..60d38c7d --- /dev/null +++ b/src/async_parser/telegram.rs @@ -0,0 +1,111 @@ +//! Async Telegram parser. + +use std::path::Path; + +use async_trait::async_trait; + +use crate::Message; +use crate::config::TelegramConfig; +use crate::error::ChatpackError; +use crate::parsing::telegram::{TelegramExport, parse_telegram_message}; + +use super::{AsyncParser, read_file_async}; + +/// Async parser for Telegram JSON exports. +/// +/// # Example +/// +/// ```rust,no_run +/// use chatpack::async_parser::{AsyncParser, AsyncTelegramParser}; +/// +/// # async fn example() -> Result<(), chatpack::ChatpackError> { +/// let parser = AsyncTelegramParser::new(); +/// let messages = parser.parse("telegram_export.json").await?; +/// +/// for msg in messages { +/// println!("{}: {}", msg.sender, msg.content); +/// } +/// # Ok(()) +/// # } +/// ``` +pub struct AsyncTelegramParser { + #[allow(dead_code)] + config: TelegramConfig, +} + +impl AsyncTelegramParser { + /// Creates a new async parser with default configuration. + pub fn new() -> Self { + Self { + config: TelegramConfig::default(), + } + } + + /// Creates a parser with custom configuration. + pub fn with_config(config: TelegramConfig) -> Self { + Self { config } + } +} + +impl Default for AsyncTelegramParser { + fn default() -> Self { + Self::new() + } +} + +#[async_trait] +impl AsyncParser for AsyncTelegramParser { + fn name(&self) -> &'static str { + "Telegram (Async)" + } + + async fn parse(&self, path: impl AsRef + Send) -> Result, ChatpackError> { + let content = read_file_async(path).await?; + self.parse_str(&content) + } + + fn parse_str(&self, content: &str) -> Result, ChatpackError> { + let export: TelegramExport = serde_json::from_str(content)?; + + let messages = export + .messages + .iter() + .filter_map(parse_telegram_message) + .collect(); + + Ok(messages) + } +} + +#[cfg(test)] +mod tests { + use super::*; + + #[tokio::test] + async fn test_async_parser_name() { + let parser = AsyncTelegramParser::new(); + assert_eq!(parser.name(), "Telegram (Async)"); + } + + #[test] + fn test_parse_str() { + let json = r#"{ + "messages": [ + { + "id": 1, + "type": "message", + "date_unixtime": "1705314600", + "from": "Alice", + "text": "Hello!" + } + ] + }"#; + + let parser = AsyncTelegramParser::new(); + let messages = parser.parse_str(json).unwrap(); + + assert_eq!(messages.len(), 1); + assert_eq!(messages[0].sender, "Alice"); + assert_eq!(messages[0].content, "Hello!"); + } +} diff --git a/src/core/mod.rs b/src/core/mod.rs index ffd2b01d..13d5a9b7 100644 --- a/src/core/mod.rs +++ b/src/core/mod.rs @@ -12,7 +12,7 @@ //! # #[cfg(all(feature = "csv-output", feature = "json-output"))] //! # fn example() { //! use chatpack::core::{ -//! InternalMessage, OutputConfig, FilterConfig, +//! Message, OutputConfig, FilterConfig, //! merge_consecutive, apply_filters, //! write_csv, write_json, write_jsonl, //! }; @@ -26,11 +26,9 @@ pub mod processor; // Re-export main types for convenience pub use filter::{FilterConfig, apply_filters}; -#[allow(deprecated)] -pub use models::InternalMessage; pub use models::OutputConfig; -// Re-export Message from the new location +// Re-export Message from the crate root pub use crate::Message; // Conditionally re-export output writers diff --git a/src/core/models.rs b/src/core/models.rs index 40c27815..a4781de6 100644 --- a/src/core/models.rs +++ b/src/core/models.rs @@ -1,10 +1,6 @@ -//! Core data models for chat messages. +//! Core data models for chat output configuration. //! -//! This module provides the [`InternalMessage`] type alias (which is deprecated) -//! and the [`OutputConfig`] type for configuring output generation. -//! -//! **Note**: [`InternalMessage`] is now a deprecated alias for [`crate::Message`]. -//! New code should use [`crate::Message`] directly. +//! This module provides the [`OutputConfig`] type for configuring output generation. //! //! # Example //! @@ -13,7 +9,7 @@ //! use chatpack::core::models::OutputConfig; //! use chrono::Utc; //! -//! // Create a simple message (new way) +//! // Create a simple message //! let msg = Message::new("Alice", "Hello, world!"); //! //! // Create with builder pattern @@ -29,13 +25,6 @@ use serde::{Deserialize, Serialize}; -// Re-export Message as InternalMessage for backward compatibility -#[deprecated( - since = "0.5.0", - note = "Use `chatpack::Message` instead. InternalMessage will be removed in v1.0.0" -)] -pub use crate::message::Message as InternalMessage; - /// Configuration for output format. /// /// Controls which metadata fields are included in the output when writing diff --git a/src/core/processor.rs b/src/core/processor.rs index 81cde668..73981915 100644 --- a/src/core/processor.rs +++ b/src/core/processor.rs @@ -4,7 +4,7 @@ //! - [`merge_consecutive`] - Merge consecutive messages from same sender //! - [`ProcessingStats`] - Statistics about processing results -use super::models::InternalMessage; +use crate::Message; /// Merges consecutive messages from the same sender into single entries. /// @@ -25,12 +25,12 @@ use super::models::InternalMessage; /// /// ```rust /// use chatpack::core::processor::merge_consecutive; -/// use chatpack::core::models::InternalMessage; +/// use chatpack::Message; /// /// let messages = vec![ -/// InternalMessage::new("Alice", "Hi"), -/// InternalMessage::new("Alice", "How are you?"), -/// InternalMessage::new("Bob", "Good!"), +/// Message::new("Alice", "Hi"), +/// Message::new("Alice", "How are you?"), +/// Message::new("Bob", "Good!"), /// ]; /// /// let merged = merge_consecutive(messages); @@ -46,8 +46,8 @@ use super::models::InternalMessage; /// - Consumes the input vector (no cloning of messages) /// - Allocates a new output vector /// - O(n) time complexity -pub fn merge_consecutive(messages: Vec) -> Vec { - let mut merged: Vec = Vec::with_capacity(messages.len()); +pub fn merge_consecutive(messages: Vec) -> Vec { + let mut merged: Vec = Vec::with_capacity(messages.len()); for msg in messages { match merged.last_mut() { @@ -170,11 +170,11 @@ mod tests { #[test] fn test_merge_consecutive() { let messages = vec![ - InternalMessage::new("Alice", "Hi"), - InternalMessage::new("Alice", "How are you?"), - InternalMessage::new("Bob", "Fine"), - InternalMessage::new("Bob", "Thanks"), - InternalMessage::new("Alice", "Great!"), + Message::new("Alice", "Hi"), + Message::new("Alice", "How are you?"), + Message::new("Bob", "Fine"), + Message::new("Bob", "Thanks"), + Message::new("Alice", "Great!"), ]; let merged = merge_consecutive(messages); @@ -190,14 +190,14 @@ mod tests { #[test] fn test_merge_empty() { - let messages: Vec = vec![]; + let messages: Vec = vec![]; let merged = merge_consecutive(messages); assert!(merged.is_empty()); } #[test] fn test_merge_single() { - let messages = vec![InternalMessage::new("Alice", "Hi")]; + let messages = vec![Message::new("Alice", "Hi")]; let merged = merge_consecutive(messages); assert_eq!(merged.len(), 1); } @@ -208,10 +208,8 @@ mod tests { let ts = Utc.with_ymd_and_hms(2024, 1, 1, 12, 0, 0).unwrap(); let messages = vec![ - InternalMessage::new("Alice", "First") - .with_timestamp(ts) - .with_id(1), - InternalMessage::new("Alice", "Second").with_id(2), + Message::new("Alice", "First").with_timestamp(ts).with_id(1), + Message::new("Alice", "Second").with_id(2), ]; let merged = merge_consecutive(messages); diff --git a/src/lib.rs b/src/lib.rs index bdc29d99..c8098fb5 100644 --- a/src/lib.rs +++ b/src/lib.rs @@ -88,7 +88,7 @@ //! - [`config`] - Parser configuration types //! - [`TelegramConfig`](config::TelegramConfig), [`WhatsAppConfig`](config::WhatsAppConfig), etc. //! - [`core`] - Core types and functionality -//! - [`core::models`] - [`InternalMessage`], [`OutputConfig`] +//! - [`core::models`] - [`Message`], [`OutputConfig`] //! - [`core::filter`] - [`FilterConfig`], [`apply_filters`] //! - [`core::processor`] - [`merge_consecutive`], [`ProcessingStats`] //! - [`core::output`] - [`write_json`], [`write_jsonl`], [`write_csv`] @@ -111,6 +111,15 @@ pub mod format; pub mod message; pub mod progress; +// Shared parsing utilities (DRY - used by both parsers and streaming) +#[cfg(any( + feature = "telegram", + feature = "whatsapp", + feature = "instagram", + feature = "discord" +))] +pub mod parsing; + // Parser modules - require at least one parser feature #[cfg(any( feature = "telegram", @@ -140,6 +149,10 @@ pub mod parsers; ))] pub mod streaming; +// Async parser module (requires async feature and at least one parser) +#[cfg(all(feature = "async", feature = "telegram"))] +pub mod async_parser; + // Re-export the main types at the crate root for convenience pub use error::{ChatpackError, Result}; pub use message::Message; @@ -171,7 +184,7 @@ pub mod prelude { pub use crate::config::{DiscordConfig, InstagramConfig, TelegramConfig, WhatsAppConfig}; // Models - pub use crate::core::models::{InternalMessage, OutputConfig}; + pub use crate::core::models::OutputConfig; // Filtering pub use crate::core::filter::{FilterConfig, apply_filters}; diff --git a/src/parsers/instagram.rs b/src/parsers/instagram.rs index 08eb29a2..29f5e5a1 100644 --- a/src/parsers/instagram.rs +++ b/src/parsers/instagram.rs @@ -9,43 +9,15 @@ use std::fs; use std::path::Path; -use chrono::{TimeZone, Utc}; -use serde::Deserialize; - use crate::Message; use crate::config::InstagramConfig; use crate::error::ChatpackError; use crate::parser::{Parser, Platform}; +use crate::parsing::instagram::{InstagramExport, parse_instagram_message}; #[cfg(feature = "streaming")] use crate::streaming::{InstagramStreamingParser, StreamingConfig, StreamingParser}; -#[derive(Debug, Deserialize)] -struct InstagramExport { - messages: Vec, -} - -#[derive(Debug, Deserialize)] -struct InstagramMessage { - sender_name: String, - timestamp_ms: i64, - content: Option, -} - -/// Fix Meta's broken encoding (Mojibake). -/// -/// Meta exports UTF-8 text encoded as if it were ISO-8859-1. -/// Each UTF-8 byte is stored as a separate Unicode codepoint. -/// Example: "Привет" becomes "Привет" -/// -/// This function reverses that process by: -/// 1. Taking each char as its byte value -/// 2. Reconstructing the original UTF-8 string -fn fix_encoding(s: &str) -> String { - let bytes: Vec = s.chars().map(|c| c as u8).collect(); - String::from_utf8(bytes).unwrap_or_else(|_| s.to_string()) -} - /// Parser for Instagram JSON exports. /// /// # Example @@ -94,37 +66,8 @@ impl InstagramParser { let fix = self.config.fix_encoding; let mut messages: Vec = export .messages - .into_iter() - .filter_map(|msg| { - // Skip messages without content (shares, reactions without text, etc.) - let msg_content = msg.content?; - if msg_content.is_empty() { - return None; - } - - let timestamp = Utc.timestamp_millis_opt(msg.timestamp_ms).single()?; - - let sender = if fix { - fix_encoding(&msg.sender_name) - } else { - msg.sender_name - }; - - let content = if fix { - fix_encoding(&msg_content) - } else { - msg_content - }; - - Some(Message { - id: None, - timestamp: Some(timestamp), - sender, - reply_to: None, - edited: None, - content, - }) - }) + .iter() + .filter_map(|msg| parse_instagram_message(msg, fix)) .collect(); // Instagram stores messages newest-first, reverse for chronological order diff --git a/src/parsers/telegram.rs b/src/parsers/telegram.rs index 7942abf1..e468daf0 100644 --- a/src/parsers/telegram.rs +++ b/src/parsers/telegram.rs @@ -3,14 +3,11 @@ use std::fs; use std::path::Path; -use chrono::DateTime; -use serde::Deserialize; -use serde_json::Value; - use crate::Message; use crate::config::TelegramConfig; use crate::error::ChatpackError; use crate::parser::{Parser, Platform}; +use crate::parsing::telegram::{TelegramExport, parse_telegram_message}; #[cfg(feature = "streaming")] use crate::streaming::{StreamingConfig, StreamingParser, TelegramStreamingParser}; @@ -79,44 +76,11 @@ impl TelegramParser { fn parse_content(&self, content: &str) -> Result, ChatpackError> { let export: TelegramExport = serde_json::from_str(content)?; + // Use shared parsing logic let messages = export .messages .iter() - .filter(|msg| msg.msg_type == "message") - .filter_map(|msg| { - let sender = msg.from.as_ref()?; - let text_value = msg.text.as_ref()?; - let msg_content = extract_text(text_value); - - if msg_content.trim().is_empty() { - return None; - } - - // Parse timestamp - let timestamp = msg.date_unixtime.as_ref().and_then(|ts_str| { - ts_str - .parse::() - .ok() - .and_then(|ts| DateTime::from_timestamp(ts, 0)) - }); - - // Parse edited timestamp - let edited = msg.edited_unixtime.as_ref().and_then(|ts_str| { - ts_str - .parse::() - .ok() - .and_then(|ts| DateTime::from_timestamp(ts, 0)) - }); - - Some(Message::with_metadata( - sender, - msg_content, - timestamp, - msg.id, - msg.reply_to_message_id, - edited, - )) - }) + .filter_map(parse_telegram_message) .collect(); Ok(messages) @@ -129,32 +93,6 @@ impl Default for TelegramParser { } } -// Internal structures for deserializing Telegram JSON - -#[derive(Debug, Deserialize)] -struct TelegramExport { - messages: Vec, -} - -#[derive(Debug, Deserialize)] -struct TelegramMessage { - /// Message ID - id: Option, - /// Message type (we only care about "message") - #[serde(rename = "type")] - msg_type: String, - /// Unix timestamp as string - date_unixtime: Option, - /// Sender name - from: Option, - /// Message text (can be string or array) - text: Option, - /// Reply reference - reply_to_message_id: Option, - /// Edit timestamp as string (if message was edited) - edited_unixtime: Option, -} - // Implement the new unified Parser trait impl Parser for TelegramParser { fn name(&self) -> &'static str { @@ -212,38 +150,16 @@ impl Parser for TelegramParser { } } -/// Extracts text content from Telegram's `text` field. -/// -/// The field can be: -/// - A simple string: `"Hello"` -/// - An array with strings and objects: `["Text", {"type": "link", "text": "url"}]` -fn extract_text(text_value: &Value) -> String { - match text_value { - Value::String(s) => s.clone(), - Value::Array(arr) => arr - .iter() - .filter_map(|item| match item { - Value::String(s) => Some(s.clone()), - Value::Object(obj) => obj - .get("text") - .and_then(|v| v.as_str()) - .map(ToString::to_string), - _ => None, - }) - .collect::(), - _ => String::new(), - } -} - #[cfg(test)] mod tests { use super::*; + use crate::parsing::telegram::extract_telegram_text; use serde_json::json; #[test] fn test_extract_text_string() { let value = json!("Hello world"); - assert_eq!(extract_text(&value), "Hello world"); + assert_eq!(extract_telegram_text(&value), "Hello world"); } #[test] @@ -254,7 +170,7 @@ mod tests { " cool!" ]); assert_eq!( - extract_text(&value), + extract_telegram_text(&value), "Check this: https://example.com cool!" ); } @@ -262,7 +178,7 @@ mod tests { #[test] fn test_extract_text_empty() { let value = json!(null); - assert_eq!(extract_text(&value), ""); + assert_eq!(extract_telegram_text(&value), ""); } #[test] diff --git a/src/parsers/whatsapp.rs b/src/parsers/whatsapp.rs index bbffe124..3adc82e1 100644 --- a/src/parsers/whatsapp.rs +++ b/src/parsers/whatsapp.rs @@ -12,13 +12,15 @@ use std::fs; use std::path::Path; -use chrono::{DateTime, NaiveDateTime, Utc}; use regex::Regex; use crate::Message; use crate::config::WhatsAppConfig; use crate::error::ChatpackError; use crate::parser::{Parser, Platform}; +use crate::parsing::whatsapp::{ + detect_whatsapp_format, is_whatsapp_system_message, parse_whatsapp_timestamp, +}; #[cfg(feature = "streaming")] use crate::streaming::{StreamingConfig, StreamingParser, WhatsAppStreamingParser}; @@ -71,217 +73,6 @@ impl Default for WhatsAppParser { } } -/// Detected date format variants. -#[derive(Debug, Clone, Copy, PartialEq)] -enum DateFormat { - /// US format: M/D/YY or M/D/YYYY with optional AM/PM - /// Example: [1/15/24, 10:30:45 AM] - US, - /// EU format with dots in brackets: DD.MM.YY or DD.MM.YYYY - /// Example: [15.01.24, 10:30:45] - EuDotBracketed, - /// EU format with dots, no brackets: DD.MM.YYYY - /// Example: 26.10.2025, 20:40 - Sender: Message - EuDotNoBracket, - /// EU format with slashes, no brackets: DD/MM/YYYY - /// Example: 15/01/2024, 10:30 - - EuSlash, - /// Bracketed EU with slashes - /// Example: [15/01/2024, 10:30:45] - EuSlashBracketed, -} - -impl DateFormat { - /// Returns regex pattern for this date format. - fn pattern(self) -> &'static str { - match self { - // [1/15/24, 10:30:45 AM] Sender: Message - DateFormat::US => { - r"^\[(\d{1,2}/\d{1,2}/\d{2,4}),\s(\d{1,2}:\d{2}(?::\d{2})?(?:\s?[APap][Mm])?)\]\s([^:]+):\s?(.*)" - } - // [15.01.24, 10:30:45] Sender: Message - DateFormat::EuDotBracketed => { - r"^\[(\d{2}\.\d{2}\.\d{2,4}),\s(\d{2}:\d{2}(?::\d{2})?)\]\s([^:]+):\s?(.*)" - } - // 26.10.2025, 20:40 - Sender: Message - DateFormat::EuDotNoBracket => { - r"^(\d{2}\.\d{2}\.\d{2,4}),\s(\d{2}:\d{2}(?::\d{2})?)\s-\s([^:]+):\s?(.*)" - } - // 15/01/2024, 10:30 - Sender: Message - DateFormat::EuSlash => { - r"^(\d{2}/\d{2}/\d{2,4}),\s(\d{2}:\d{2}(?::\d{2})?)\s-\s([^:]+):\s?(.*)" - } - // [15/01/2024, 10:30:45] Sender: Message - DateFormat::EuSlashBracketed => { - r"^\[(\d{2}/\d{2}/\d{2,4}),\s(\d{2}:\d{2}(?::\d{2})?)\]\s([^:]+):\s?(.*)" - } - } - } - - /// Returns date parsing format string for chrono. - fn date_parse_formats(&self) -> &'static [&'static str] { - match self { - DateFormat::US => &[ - "%m/%d/%y, %I:%M:%S %p", - "%m/%d/%y, %I:%M %p", - "%m/%d/%Y, %I:%M:%S %p", - "%m/%d/%Y, %I:%M %p", - "%m/%d/%y, %H:%M:%S", - "%m/%d/%y, %H:%M", - "%m/%d/%Y, %H:%M:%S", - "%m/%d/%Y, %H:%M", - ], - DateFormat::EuDotBracketed | DateFormat::EuDotNoBracket => &[ - "%d.%m.%y, %H:%M:%S", - "%d.%m.%y, %H:%M", - "%d.%m.%Y, %H:%M:%S", - "%d.%m.%Y, %H:%M", - ], - DateFormat::EuSlash | DateFormat::EuSlashBracketed => &[ - "%d/%m/%y, %H:%M:%S", - "%d/%m/%y, %H:%M", - "%d/%m/%Y, %H:%M:%S", - "%d/%m/%Y, %H:%M", - ], - } - } -} - -/// Detection patterns for format auto-detection. -struct FormatDetector { - format: DateFormat, - regex: Regex, -} - -impl FormatDetector { - fn new(format: DateFormat) -> Self { - Self { - format, - regex: Regex::new(format.pattern()).unwrap(), - } - } - - fn matches(&self, line: &str) -> bool { - self.regex.is_match(line) - } -} - -/// Auto-detect date format by analyzing first N lines. -fn detect_format(lines: &[&str]) -> Option { - let detectors = [ - FormatDetector::new(DateFormat::US), - FormatDetector::new(DateFormat::EuDotBracketed), - FormatDetector::new(DateFormat::EuDotNoBracket), - FormatDetector::new(DateFormat::EuSlash), - FormatDetector::new(DateFormat::EuSlashBracketed), - ]; - - let mut scores = [0usize; 5]; - - for line in lines { - for (i, detector) in detectors.iter().enumerate() { - if detector.matches(line) { - scores[i] += 1; - } - } - } - - // Find the winner (highest score) - let max_score = *scores.iter().max()?; - if max_score == 0 { - return None; - } - - let winner_idx = scores.iter().position(|&s| s == max_score)?; - Some(detectors[winner_idx].format) -} - -/// Check if a line is a system message (no actual sender). -/// System messages: group created, user added/left, encryption notice, etc. -fn is_system_message(sender: &str, content: &str) -> bool { - // English system indicators - let system_indicators_en = [ - "Messages and calls are end-to-end encrypted", - "created group", - "added", - "removed", - "left", - "changed the subject", - "changed this group's icon", - "changed the group description", - "deleted this group's icon", - "changed their phone number", - "joined using this group's invite link", - "security code changed", - "You're now an admin", - "is now an admin", - "disappeared", - "turned on disappearing messages", - "turned off disappearing messages", - ]; - - // Russian system indicators - let system_indicators_ru = [ - "Сообщения и звонки защищены сквозным шифрованием", - "создал(а) группу", - "добавил", - "удалил", - "вышел", - "покинул", - "изменил тему", - "изменил иконку группы", - "изменил описание группы", - "удалил иконку группы", - "изменил номер телефона", - "присоединился по ссылке", - "код безопасности изменён", - "теперь администратор", - "включил исчезающие сообщения", - "выключил исчезающие сообщения", - "Подробнее", - ]; - - let content_lower = content.to_lowercase(); - let sender_lower = sender.to_lowercase(); - - // Check English indicators - for indicator in &system_indicators_en { - if content_lower.contains(&indicator.to_lowercase()) { - return true; - } - } - - // Check Russian indicators (case-sensitive for Cyrillic) - for indicator in &system_indicators_ru { - if content.contains(indicator) { - return true; - } - } - - // Check if sender is empty or system-like - if sender.trim().is_empty() - || sender_lower.contains("whatsapp") - || sender_lower.contains("system") - { - return true; - } - - false -} - -/// Parse timestamp from date and time strings. -fn parse_timestamp(date_str: &str, time_str: &str, format: DateFormat) -> Option> { - let datetime_str = format!("{date_str}, {time_str}"); - - for parse_format in format.date_parse_formats() { - if let Ok(naive) = NaiveDateTime::parse_from_str(&datetime_str, parse_format) { - return Some(naive.and_utc()); - } - } - - None -} - impl WhatsAppParser { /// Parses content from a string (internal implementation). fn parse_content(&self, content: &str) -> Result, ChatpackError> { @@ -293,7 +84,7 @@ impl WhatsAppParser { // Step 1: Auto-detect format from first 20 lines let sample_size = std::cmp::min(20, lines.len()); - let format = detect_format(&lines[..sample_size]).ok_or_else(|| { + let format = detect_whatsapp_format(&lines[..sample_size]).ok_or_else(|| { ChatpackError::invalid_format( "WhatsApp", "Could not detect WhatsApp export format. \ @@ -321,11 +112,13 @@ impl WhatsAppParser { let msg_content = caps.get(4).map_or("", |m| m.as_str()); // Skip system messages (if configured) - if self.config.skip_system_messages && is_system_message(sender, msg_content) { + if self.config.skip_system_messages + && is_whatsapp_system_message(sender, msg_content) + { continue; } - let timestamp = parse_timestamp(date_str, time_str, format); + let timestamp = parse_whatsapp_timestamp(date_str, time_str, format); let msg = Message::with_metadata( sender, @@ -410,6 +203,7 @@ impl Parser for WhatsAppParser { #[cfg(test)] mod tests { use super::*; + use crate::parsing::whatsapp::DateFormat; #[test] fn test_parser_name() { @@ -423,7 +217,7 @@ mod tests { "[1/15/24, 10:30:45 AM] Alice: Hello", "[1/15/24, 10:31:00 AM] Bob: Hi there", ]; - assert_eq!(detect_format(&lines), Some(DateFormat::US)); + assert_eq!(detect_whatsapp_format(&lines), Some(DateFormat::US)); } #[test] @@ -432,7 +226,10 @@ mod tests { "[15.01.24, 10:30:45] Alice: Hello", "[15.01.24, 10:31:00] Bob: Hi there", ]; - assert_eq!(detect_format(&lines), Some(DateFormat::EuDotBracketed)); + assert_eq!( + detect_whatsapp_format(&lines), + Some(DateFormat::EuDotBracketed) + ); } #[test] @@ -441,7 +238,10 @@ mod tests { "26.10.2025, 20:40 - Alice: Hello", "26.10.2025, 20:41 - Bob: Hi there", ]; - assert_eq!(detect_format(&lines), Some(DateFormat::EuDotNoBracket)); + assert_eq!( + detect_whatsapp_format(&lines), + Some(DateFormat::EuDotNoBracket) + ); } #[test] @@ -450,58 +250,61 @@ mod tests { "15/01/2024, 10:30 - Alice: Hello", "15/01/2024, 10:31 - Bob: Hi there", ]; - assert_eq!(detect_format(&lines), Some(DateFormat::EuSlash)); + assert_eq!(detect_whatsapp_format(&lines), Some(DateFormat::EuSlash)); } #[test] fn test_is_system_message_english() { - assert!(is_system_message( + assert!(is_whatsapp_system_message( "Alice", "Messages and calls are end-to-end encrypted" )); - assert!(is_system_message("Bob", "added Charlie to the group")); - assert!(is_system_message("Alice", "left")); - assert!(!is_system_message("Alice", "Hello everyone!")); - assert!(!is_system_message("Bob", "")); + assert!(is_whatsapp_system_message( + "Bob", + "added Charlie to the group" + )); + assert!(is_whatsapp_system_message("Alice", "left")); + assert!(!is_whatsapp_system_message("Alice", "Hello everyone!")); + assert!(!is_whatsapp_system_message("Bob", "")); } #[test] fn test_is_system_message_russian() { - assert!(is_system_message( + assert!(is_whatsapp_system_message( "Система", "Сообщения и звонки защищены сквозным шифрованием" )); - assert!(is_system_message("Bob", "Подробнее")); - assert!(!is_system_message("Муха", "Добрый вечер")); - assert!(!is_system_message("Bob", "<Без медиафайлов>")); + assert!(is_whatsapp_system_message("Bob", "Подробнее")); + assert!(!is_whatsapp_system_message("Муха", "Добрый вечер")); + assert!(!is_whatsapp_system_message("Bob", "<Без медиафайлов>")); } #[test] fn test_parse_timestamp_us() { - let ts = parse_timestamp("1/15/24", "10:30:45 AM", DateFormat::US); + let ts = parse_whatsapp_timestamp("1/15/24", "10:30:45 AM", DateFormat::US); assert!(ts.is_some()); } #[test] fn test_parse_timestamp_eu_dot() { - let ts = parse_timestamp("15.01.24", "10:30:45", DateFormat::EuDotBracketed); + let ts = parse_whatsapp_timestamp("15.01.24", "10:30:45", DateFormat::EuDotBracketed); assert!(ts.is_some()); - let ts2 = parse_timestamp("26.10.2025", "20:40", DateFormat::EuDotNoBracket); + let ts2 = parse_whatsapp_timestamp("26.10.2025", "20:40", DateFormat::EuDotNoBracket); assert!(ts2.is_some()); } #[test] fn test_media_not_filtered() { // should NOT be treated as system message - assert!(!is_system_message("Alice", "")); - assert!(!is_system_message("Bob", "image omitted")); - assert!(!is_system_message("Муха", "<Без медиафайлов>")); + assert!(!is_whatsapp_system_message("Alice", "")); + assert!(!is_whatsapp_system_message("Bob", "image omitted")); + assert!(!is_whatsapp_system_message("Муха", "<Без медиафайлов>")); } #[test] fn test_empty_sender_is_system() { - assert!(is_system_message("", "Some message")); - assert!(is_system_message(" ", "Some message")); + assert!(is_whatsapp_system_message("", "Some message")); + assert!(is_whatsapp_system_message(" ", "Some message")); } } diff --git a/src/parsing/discord.rs b/src/parsing/discord.rs new file mode 100644 index 00000000..98e40fc7 --- /dev/null +++ b/src/parsing/discord.rs @@ -0,0 +1,279 @@ +//! Shared Discord parsing utilities. +//! +//! This module contains types and functions shared between the standard +//! and streaming Discord parsers. + +use chrono::DateTime; +use serde::Deserialize; + +use crate::Message; + +/// Raw Discord message structure for deserialization. +#[derive(Debug, Deserialize)] +#[serde(rename_all = "camelCase")] +pub struct DiscordRawMessage { + pub id: String, + pub timestamp: String, + pub timestamp_edited: Option, + pub content: String, + pub author: DiscordAuthor, + pub reference: Option, + pub attachments: Option>, + pub stickers: Option>, +} + +/// Discord author structure. +#[derive(Debug, Deserialize)] +#[serde(rename_all = "camelCase")] +pub struct DiscordAuthor { + pub name: String, + pub nickname: Option, +} + +/// Discord message reference (for replies). +#[derive(Debug, Deserialize)] +#[serde(rename_all = "camelCase")] +pub struct DiscordReference { + pub message_id: Option, +} + +/// Discord attachment structure. +#[derive(Debug, Deserialize)] +#[serde(rename_all = "camelCase")] +pub struct DiscordAttachment { + pub file_name: String, +} + +/// Discord sticker structure. +#[derive(Debug, Deserialize)] +#[serde(rename_all = "camelCase")] +pub struct DiscordSticker { + pub name: String, +} + +/// Discord export wrapper. +#[derive(Debug, Deserialize)] +#[serde(rename_all = "camelCase")] +pub struct DiscordExport { + pub messages: Vec, +} + +/// Parses a raw Discord message into a `Message`. +/// +/// Returns `None` if the message has no content and no attachments/stickers. +pub fn parse_discord_message(msg: &DiscordRawMessage) -> Option { + // Skip empty messages without attachments/stickers + if msg.content.trim().is_empty() + && msg.attachments.as_ref().is_none_or(|a| a.is_empty()) + && msg.stickers.as_ref().is_none_or(|s| s.is_empty()) + { + return None; + } + + // Build content: text + attachment/sticker info + let mut content = msg.content.clone(); + + // Append attachment filenames + if let Some(attachments) = &msg.attachments { + for att in attachments { + if !content.is_empty() { + content.push('\n'); + } + content.push_str(&format!("[Attachment: {}]", att.file_name)); + } + } + + // Append sticker names + if let Some(stickers) = &msg.stickers { + for sticker in stickers { + if !content.is_empty() { + content.push('\n'); + } + content.push_str(&format!("[Sticker: {}]", sticker.name)); + } + } + + // Use nickname if available, fallback to username + let sender = msg + .author + .nickname + .as_ref() + .unwrap_or(&msg.author.name) + .clone(); + + // Parse timestamp (ISO 8601 / RFC3339) + let timestamp = DateTime::parse_from_rfc3339(&msg.timestamp) + .ok() + .map(|dt| dt.to_utc()); + + // Parse edited timestamp + let edited = msg + .timestamp_edited + .as_ref() + .and_then(|ts| DateTime::parse_from_rfc3339(ts).ok()) + .map(|dt| dt.to_utc()); + + // Parse message ID (Discord snowflake) + let id = msg.id.parse::().ok(); + + // Parse reply reference + let reply_to = msg + .reference + .as_ref() + .and_then(|r| r.message_id.as_ref()) + .and_then(|id_str| id_str.parse::().ok()); + + Some(Message::with_metadata( + sender, content, timestamp, id, reply_to, edited, + )) +} + +/// Lightweight Discord message for streaming (without attachments/stickers). +/// +/// Used by JSONL streaming where each line is a complete message. +#[derive(Debug, Deserialize)] +#[serde(rename_all = "camelCase")] +pub struct DiscordStreamMessage { + pub id: String, + pub timestamp: String, + pub timestamp_edited: Option, + pub content: String, + pub author: DiscordAuthor, + pub reference: Option, +} + +/// Parses a streaming Discord message (simpler, no attachments). +pub fn parse_discord_stream_message(msg: &DiscordStreamMessage) -> Option { + if msg.content.trim().is_empty() { + return None; + } + + let sender = msg + .author + .nickname + .as_ref() + .unwrap_or(&msg.author.name) + .clone(); + + let timestamp = DateTime::parse_from_rfc3339(&msg.timestamp) + .ok() + .map(|dt| dt.to_utc()); + + let edited = msg + .timestamp_edited + .as_ref() + .and_then(|ts| DateTime::parse_from_rfc3339(ts).ok()) + .map(|dt| dt.to_utc()); + + let id = msg.id.parse::().ok(); + + let reply_to = msg + .reference + .as_ref() + .and_then(|r| r.message_id.as_ref()) + .and_then(|id_str| id_str.parse::().ok()); + + Some(Message::with_metadata( + sender, + msg.content.clone(), + timestamp, + id, + reply_to, + edited, + )) +} + +#[cfg(test)] +mod tests { + use super::*; + + #[test] + fn test_parse_discord_message_basic() { + let msg = DiscordRawMessage { + id: "123456789".to_string(), + timestamp: "2024-01-15T10:30:00+00:00".to_string(), + timestamp_edited: None, + content: "Hello world".to_string(), + author: DiscordAuthor { + name: "alice".to_string(), + nickname: None, + }, + reference: None, + attachments: None, + stickers: None, + }; + + let result = parse_discord_message(&msg); + assert!(result.is_some()); + + let parsed = result.unwrap(); + assert_eq!(parsed.sender, "alice"); + assert_eq!(parsed.content, "Hello world"); + assert!(parsed.timestamp.is_some()); + } + + #[test] + fn test_parse_discord_message_with_nickname() { + let msg = DiscordRawMessage { + id: "123".to_string(), + timestamp: "2024-01-15T10:30:00+00:00".to_string(), + timestamp_edited: None, + content: "Hi".to_string(), + author: DiscordAuthor { + name: "alice123".to_string(), + nickname: Some("Alice".to_string()), + }, + reference: None, + attachments: None, + stickers: None, + }; + + let result = parse_discord_message(&msg); + assert!(result.is_some()); + assert_eq!(result.unwrap().sender, "Alice"); + } + + #[test] + fn test_parse_discord_message_with_attachments() { + let msg = DiscordRawMessage { + id: "123".to_string(), + timestamp: "2024-01-15T10:30:00+00:00".to_string(), + timestamp_edited: None, + content: "Check this".to_string(), + author: DiscordAuthor { + name: "bob".to_string(), + nickname: None, + }, + reference: None, + attachments: Some(vec![DiscordAttachment { + file_name: "image.png".to_string(), + }]), + stickers: None, + }; + + let result = parse_discord_message(&msg); + assert!(result.is_some()); + + let parsed = result.unwrap(); + assert!(parsed.content.contains("[Attachment: image.png]")); + } + + #[test] + fn test_parse_discord_message_empty() { + let msg = DiscordRawMessage { + id: "123".to_string(), + timestamp: "2024-01-15T10:30:00+00:00".to_string(), + timestamp_edited: None, + content: "".to_string(), + author: DiscordAuthor { + name: "bob".to_string(), + nickname: None, + }, + reference: None, + attachments: None, + stickers: None, + }; + + assert!(parse_discord_message(&msg).is_none()); + } +} diff --git a/src/parsing/instagram.rs b/src/parsing/instagram.rs new file mode 100644 index 00000000..55cb1655 --- /dev/null +++ b/src/parsing/instagram.rs @@ -0,0 +1,190 @@ +//! Shared Instagram parsing utilities. +//! +//! This module contains types and functions shared between the standard +//! and streaming Instagram parsers. + +use chrono::{DateTime, TimeZone, Utc}; +use serde::Deserialize; + +use crate::Message; + +/// Raw Instagram message structure for deserialization. +#[derive(Debug, Deserialize)] +pub struct InstagramRawMessage { + pub sender_name: String, + pub timestamp_ms: i64, + pub content: Option, + pub share: Option, + #[serde(default)] + pub photos: Option>, + #[serde(default)] + pub videos: Option>, + #[serde(default)] + pub audio_files: Option>, +} + +/// Instagram share structure. +#[derive(Debug, Deserialize)] +pub struct InstagramShare { + pub share_text: Option, + pub link: Option, +} + +/// Instagram media (photo/video/audio) structure. +#[derive(Debug, Deserialize)] +pub struct InstagramMedia { + pub uri: Option, +} + +/// Instagram export wrapper. +#[derive(Debug, Deserialize)] +pub struct InstagramExport { + pub messages: Vec, +} + +/// Fix Meta's broken encoding (Mojibake). +/// +/// Meta exports UTF-8 text encoded as if it were ISO-8859-1. +/// Each UTF-8 byte is stored as a separate Unicode codepoint. +/// Example: "Привет" becomes "Привет" +/// +/// This function reverses that process by: +/// 1. Taking each char as its byte value +/// 2. Reconstructing the original UTF-8 string +/// +/// # Example +/// +/// ```ignore +/// use chatpack::parsing::instagram::fix_mojibake_encoding; +/// +/// // ASCII passes through unchanged +/// assert_eq!(fix_mojibake_encoding("Hello"), "Hello"); +/// +/// // Mojibake gets fixed +/// // (actual mojibake text would be converted back to proper UTF-8) +/// ``` +pub fn fix_mojibake_encoding(s: &str) -> String { + let bytes: Vec = s.chars().map(|c| c as u8).collect(); + String::from_utf8(bytes).unwrap_or_else(|_| s.to_string()) +} + +/// Parses a millisecond timestamp to DateTime. +pub fn parse_ms_timestamp(timestamp_ms: i64) -> Option> { + Utc.timestamp_millis_opt(timestamp_ms).single() +} + +/// Parses a raw Instagram message into a `Message`. +/// +/// Returns `None` if the message has no content. +/// +/// If `fix_encoding` is true, applies Mojibake fix to sender and content. +pub fn parse_instagram_message(msg: &InstagramRawMessage, fix_encoding: bool) -> Option { + // Get content from various possible locations + let content = msg + .content + .clone() + .or_else(|| msg.share.as_ref().and_then(|s| s.share_text.clone())); + + // Apply encoding fix if needed + let content = content.map(|c| { + if fix_encoding { + fix_mojibake_encoding(&c) + } else { + c + } + }); + + // Skip messages without content + let content = match content { + Some(c) if !c.trim().is_empty() => c, + _ => return None, + }; + + let timestamp = parse_ms_timestamp(msg.timestamp_ms); + + let sender = if fix_encoding { + fix_mojibake_encoding(&msg.sender_name) + } else { + msg.sender_name.clone() + }; + + Some(Message::with_metadata( + sender, content, timestamp, None, // Instagram doesn't have message IDs in export + None, // No reply references + None, // No edit timestamps + )) +} + +#[cfg(test)] +mod tests { + use super::*; + + #[test] + fn test_fix_encoding_ascii() { + assert_eq!(fix_mojibake_encoding("Hello"), "Hello"); + assert_eq!(fix_mojibake_encoding("Test 123"), "Test 123"); + } + + #[test] + fn test_parse_ms_timestamp() { + let ts = parse_ms_timestamp(1705315800000); + assert!(ts.is_some()); + } + + #[test] + fn test_parse_instagram_message_basic() { + let msg = InstagramRawMessage { + sender_name: "user_one".to_string(), + timestamp_ms: 1705315800000, + content: Some("Hello!".to_string()), + share: None, + photos: None, + videos: None, + audio_files: None, + }; + + let result = parse_instagram_message(&msg, false); + assert!(result.is_some()); + + let parsed = result.unwrap(); + assert_eq!(parsed.sender, "user_one"); + assert_eq!(parsed.content, "Hello!"); + } + + #[test] + fn test_parse_instagram_message_with_share() { + let msg = InstagramRawMessage { + sender_name: "user".to_string(), + timestamp_ms: 1705315800000, + content: None, + share: Some(InstagramShare { + share_text: Some("Check this out!".to_string()), + link: Some("https://example.com".to_string()), + }), + photos: None, + videos: None, + audio_files: None, + }; + + let result = parse_instagram_message(&msg, false); + assert!(result.is_some()); + + let parsed = result.unwrap(); + assert_eq!(parsed.content, "Check this out!"); + } + + #[test] + fn test_parse_instagram_message_empty() { + let msg = InstagramRawMessage { + sender_name: "user".to_string(), + timestamp_ms: 1705315800000, + content: None, + share: None, + photos: None, + videos: None, + audio_files: None, + }; + + assert!(parse_instagram_message(&msg, false).is_none()); + } +} diff --git a/src/parsing/mod.rs b/src/parsing/mod.rs new file mode 100644 index 00000000..799c0d4e --- /dev/null +++ b/src/parsing/mod.rs @@ -0,0 +1,32 @@ +//! Shared parsing utilities for all platforms. +//! +//! This module contains common types and functions used by both +//! standard (in-memory) and streaming parsers to avoid code duplication. + +#[cfg(feature = "telegram")] +pub mod telegram; + +#[cfg(feature = "instagram")] +pub mod instagram; + +#[cfg(feature = "whatsapp")] +pub mod whatsapp; + +#[cfg(feature = "discord")] +pub mod discord; + +// Re-export commonly used items +#[cfg(feature = "telegram")] +pub use telegram::{TelegramRawMessage, extract_telegram_text, parse_telegram_message}; + +#[cfg(feature = "instagram")] +pub use instagram::{InstagramRawMessage, fix_mojibake_encoding, parse_instagram_message}; + +#[cfg(feature = "whatsapp")] +pub use whatsapp::{ + DateFormat as WhatsAppDateFormat, detect_whatsapp_format, is_whatsapp_system_message, + parse_whatsapp_timestamp, +}; + +#[cfg(feature = "discord")] +pub use discord::{DiscordRawMessage, parse_discord_message}; diff --git a/src/parsing/telegram.rs b/src/parsing/telegram.rs new file mode 100644 index 00000000..ada97b48 --- /dev/null +++ b/src/parsing/telegram.rs @@ -0,0 +1,225 @@ +//! Shared Telegram parsing utilities. +//! +//! This module contains types and functions shared between the standard +//! and streaming Telegram parsers. + +use chrono::{DateTime, Utc}; +use serde::Deserialize; +use serde_json::Value; + +use crate::Message; + +/// Raw Telegram message structure for deserialization. +/// +/// Used by both standard and streaming parsers. +#[derive(Debug, Deserialize)] +pub struct TelegramRawMessage { + /// Message ID + pub id: Option, + /// Message type (we only care about "message") + #[serde(rename = "type")] + pub msg_type: String, + /// Unix timestamp as string + pub date_unixtime: Option, + /// Sender name + pub from: Option, + /// Message text (can be string or array) + pub text: Option, + /// Reply reference + pub reply_to_message_id: Option, + /// Edit timestamp as string (if message was edited) + pub edited_unixtime: Option, +} + +/// Telegram export wrapper. +#[derive(Debug, Deserialize)] +pub struct TelegramExport { + pub messages: Vec, +} + +/// Extracts text content from Telegram's complex `text` field. +/// +/// The `text` field in Telegram exports can be: +/// - A simple string: `"Hello"` +/// - An array with strings and objects: `["Text", {"type": "link", "text": "url"}]` +/// +/// This function handles both cases and returns a single string. +/// +/// # Example +/// +/// ```ignore +/// use serde_json::json; +/// use chatpack::parsing::telegram::extract_telegram_text; +/// +/// let simple = json!("Hello world"); +/// assert_eq!(extract_telegram_text(&simple), "Hello world"); +/// +/// let complex = json!([ +/// "Check this: ", +/// {"type": "link", "text": "https://example.com"} +/// ]); +/// assert_eq!(extract_telegram_text(&complex), "Check this: https://example.com"); +/// ``` +pub fn extract_telegram_text(text_value: &Value) -> String { + match text_value { + Value::String(s) => s.clone(), + Value::Array(arr) => arr + .iter() + .filter_map(|item| match item { + Value::String(s) => Some(s.clone()), + Value::Object(obj) => obj + .get("text") + .and_then(|v| v.as_str()) + .map(ToString::to_string), + _ => None, + }) + .collect::(), + _ => String::new(), + } +} + +/// Parses a Unix timestamp string to DateTime. +/// +/// Telegram stores timestamps as strings like "1234567890". +pub fn parse_unix_timestamp(ts_str: &str) -> Option> { + ts_str + .parse::() + .ok() + .and_then(|ts| DateTime::from_timestamp(ts, 0)) +} + +/// Parses a raw Telegram message into a `Message`. +/// +/// Returns `None` if: +/// - The message type is not "message" +/// - The sender is missing +/// - The content is empty +/// +/// This is the core parsing logic shared between standard and streaming parsers. +pub fn parse_telegram_message(msg: &TelegramRawMessage) -> Option { + // Skip non-message types + if msg.msg_type != "message" { + return None; + } + + let sender = msg.from.as_ref()?; + let text_value = msg.text.as_ref()?; + let content = extract_telegram_text(text_value); + + if content.trim().is_empty() { + return None; + } + + let timestamp = msg + .date_unixtime + .as_ref() + .and_then(|ts| parse_unix_timestamp(ts)); + let edited = msg + .edited_unixtime + .as_ref() + .and_then(|ts| parse_unix_timestamp(ts)); + + Some(Message::with_metadata( + sender, + content, + timestamp, + msg.id, + msg.reply_to_message_id, + edited, + )) +} + +#[cfg(test)] +mod tests { + use super::*; + use serde_json::json; + + #[test] + fn test_extract_text_string() { + let value = json!("Hello world"); + assert_eq!(extract_telegram_text(&value), "Hello world"); + } + + #[test] + fn test_extract_text_array_with_link() { + let value = json!([ + "Check this: ", + {"type": "link", "text": "https://example.com"}, + " cool!" + ]); + assert_eq!( + extract_telegram_text(&value), + "Check this: https://example.com cool!" + ); + } + + #[test] + fn test_extract_text_empty() { + let value = json!(null); + assert_eq!(extract_telegram_text(&value), ""); + } + + #[test] + fn test_parse_unix_timestamp() { + let ts = parse_unix_timestamp("1705314600"); + assert!(ts.is_some()); + assert_eq!(ts.unwrap().timestamp(), 1705314600); + } + + #[test] + fn test_parse_unix_timestamp_invalid() { + assert!(parse_unix_timestamp("not-a-number").is_none()); + assert!(parse_unix_timestamp("").is_none()); + } + + #[test] + fn test_parse_telegram_message_basic() { + let msg = TelegramRawMessage { + id: Some(123), + msg_type: "message".to_string(), + date_unixtime: Some("1705314600".to_string()), + from: Some("Alice".to_string()), + text: Some(json!("Hello!")), + reply_to_message_id: None, + edited_unixtime: None, + }; + + let result = parse_telegram_message(&msg); + assert!(result.is_some()); + + let parsed = result.unwrap(); + assert_eq!(parsed.sender, "Alice"); + assert_eq!(parsed.content, "Hello!"); + assert!(parsed.timestamp.is_some()); + } + + #[test] + fn test_parse_telegram_message_skip_service() { + let msg = TelegramRawMessage { + id: Some(123), + msg_type: "service".to_string(), + date_unixtime: Some("1705314600".to_string()), + from: Some("Alice".to_string()), + text: Some(json!("pinned a message")), + reply_to_message_id: None, + edited_unixtime: None, + }; + + assert!(parse_telegram_message(&msg).is_none()); + } + + #[test] + fn test_parse_telegram_message_skip_empty() { + let msg = TelegramRawMessage { + id: Some(123), + msg_type: "message".to_string(), + date_unixtime: Some("1705314600".to_string()), + from: Some("Alice".to_string()), + text: Some(json!(" ")), + reply_to_message_id: None, + edited_unixtime: None, + }; + + assert!(parse_telegram_message(&msg).is_none()); + } +} diff --git a/src/parsing/whatsapp.rs b/src/parsing/whatsapp.rs new file mode 100644 index 00000000..3487ce0f --- /dev/null +++ b/src/parsing/whatsapp.rs @@ -0,0 +1,328 @@ +//! Shared WhatsApp parsing utilities. +//! +//! This module contains types and functions shared between the standard +//! and streaming WhatsApp parsers. + +use chrono::{DateTime, NaiveDateTime, Utc}; +use regex::Regex; + +/// Detected date format variants for WhatsApp exports. +/// +/// WhatsApp exports vary by locale and platform. +#[derive(Debug, Clone, Copy, PartialEq, Eq)] +pub enum DateFormat { + /// US format: M/D/YY or M/D/YYYY with optional AM/PM + /// Example: [1/15/24, 10:30:45 AM] + US, + /// EU format with dots in brackets: DD.MM.YY or DD.MM.YYYY + /// Example: [15.01.24, 10:30:45] + EuDotBracketed, + /// EU format with dots, no brackets: DD.MM.YYYY + /// Example: 26.10.2025, 20:40 - Sender: Message + EuDotNoBracket, + /// EU format with slashes, no brackets: DD/MM/YYYY + /// Example: 15/01/2024, 10:30 - + EuSlash, + /// Bracketed EU with slashes + /// Example: [15/01/2024, 10:30:45] + EuSlashBracketed, +} + +impl DateFormat { + /// Returns regex pattern for this date format. + pub fn pattern(self) -> &'static str { + match self { + // [1/15/24, 10:30:45 AM] Sender: Message + DateFormat::US => { + r"^\[(\d{1,2}/\d{1,2}/\d{2,4}),\s(\d{1,2}:\d{2}(?::\d{2})?(?:\s?[APap][Mm])?)\]\s([^:]+):\s?(.*)" + } + // [15.01.24, 10:30:45] Sender: Message + DateFormat::EuDotBracketed => { + r"^\[(\d{2}\.\d{2}\.\d{2,4}),\s(\d{2}:\d{2}(?::\d{2})?)\]\s([^:]+):\s?(.*)" + } + // 26.10.2025, 20:40 - Sender: Message + DateFormat::EuDotNoBracket => { + r"^(\d{2}\.\d{2}\.\d{2,4}),\s(\d{2}:\d{2}(?::\d{2})?)\s-\s([^:]+):\s?(.*)" + } + // 15/01/2024, 10:30 - Sender: Message + DateFormat::EuSlash => { + r"^(\d{2}/\d{2}/\d{2,4}),\s(\d{2}:\d{2}(?::\d{2})?)\s-\s([^:]+):\s?(.*)" + } + // [15/01/2024, 10:30:45] Sender: Message + DateFormat::EuSlashBracketed => { + r"^\[(\d{2}/\d{2}/\d{2,4}),\s(\d{2}:\d{2}(?::\d{2})?)\]\s([^:]+):\s?(.*)" + } + } + } + + /// Returns date parsing format strings for chrono. + pub fn date_parse_formats(self) -> &'static [&'static str] { + match self { + DateFormat::US => &[ + "%m/%d/%y, %I:%M:%S %p", + "%m/%d/%y, %I:%M %p", + "%m/%d/%Y, %I:%M:%S %p", + "%m/%d/%Y, %I:%M %p", + "%m/%d/%y, %H:%M:%S", + "%m/%d/%y, %H:%M", + "%m/%d/%Y, %H:%M:%S", + "%m/%d/%Y, %H:%M", + ], + DateFormat::EuDotBracketed | DateFormat::EuDotNoBracket => &[ + "%d.%m.%y, %H:%M:%S", + "%d.%m.%y, %H:%M", + "%d.%m.%Y, %H:%M:%S", + "%d.%m.%Y, %H:%M", + ], + DateFormat::EuSlash | DateFormat::EuSlashBracketed => &[ + "%d/%m/%y, %H:%M:%S", + "%d/%m/%y, %H:%M", + "%d/%m/%Y, %H:%M:%S", + "%d/%m/%Y, %H:%M", + ], + } + } + + /// Returns all format variants. + pub fn all() -> &'static [DateFormat] { + &[ + DateFormat::US, + DateFormat::EuDotBracketed, + DateFormat::EuDotNoBracket, + DateFormat::EuSlash, + DateFormat::EuSlashBracketed, + ] + } +} + +/// Parse timestamp from date and time strings. +pub fn parse_whatsapp_timestamp( + date_str: &str, + time_str: &str, + format: DateFormat, +) -> Option> { + let datetime_str = format!("{date_str}, {time_str}"); + + for parse_format in format.date_parse_formats() { + if let Ok(naive) = NaiveDateTime::parse_from_str(&datetime_str, parse_format) { + return Some(naive.and_utc()); + } + } + + None +} + +/// Check if a line is a system message (no actual sender). +/// +/// System messages include: group created, user added/left, encryption notice, etc. +pub fn is_whatsapp_system_message(sender: &str, content: &str) -> bool { + // English system indicators + let system_indicators_en = [ + "Messages and calls are end-to-end encrypted", + "created group", + "added", + "removed", + "left", + "changed the subject", + "changed this group's icon", + "changed the group description", + "deleted this group's icon", + "changed their phone number", + "joined using this group's invite link", + "security code changed", + "You're now an admin", + "is now an admin", + "disappeared", + "turned on disappearing messages", + "turned off disappearing messages", + ]; + + // Russian system indicators + let system_indicators_ru = [ + "Сообщения и звонки защищены сквозным шифрованием", + "создал(а) группу", + "добавил", + "удалил", + "вышел", + "покинул", + "изменил тему", + "изменил иконку группы", + "изменил описание группы", + "удалил иконку группы", + "изменил номер телефона", + "присоединился по ссылке", + "код безопасности изменён", + "теперь администратор", + "включил исчезающие сообщения", + "выключил исчезающие сообщения", + "Подробнее", + ]; + + let content_lower = content.to_lowercase(); + let sender_lower = sender.to_lowercase(); + + // Check English indicators + for indicator in &system_indicators_en { + if content_lower.contains(&indicator.to_lowercase()) { + return true; + } + } + + // Check Russian indicators (case-sensitive for Cyrillic) + for indicator in &system_indicators_ru { + if content.contains(indicator) { + return true; + } + } + + // Check if sender is empty or system-like + sender.trim().is_empty() || sender_lower.contains("whatsapp") || sender_lower.contains("system") +} + +/// Detection result for format auto-detection. +struct FormatDetector { + format: DateFormat, + regex: Regex, +} + +impl FormatDetector { + fn new(format: DateFormat) -> Self { + Self { + format, + regex: Regex::new(format.pattern()).unwrap(), + } + } + + fn matches(&self, line: &str) -> bool { + self.regex.is_match(line) + } +} + +/// Auto-detect date format by analyzing sample lines. +/// +/// Analyzes the provided lines and returns the most likely format. +/// Returns `None` if no format matches any lines. +pub fn detect_whatsapp_format(lines: &[&str]) -> Option { + let detectors: Vec = DateFormat::all() + .iter() + .map(|&f| FormatDetector::new(f)) + .collect(); + + let mut scores = vec![0usize; detectors.len()]; + + for line in lines { + for (i, detector) in detectors.iter().enumerate() { + if detector.matches(line) { + scores[i] += 1; + } + } + } + + // Find the winner (highest score) + let max_score = *scores.iter().max()?; + if max_score == 0 { + return None; + } + + let winner_idx = scores.iter().position(|&s| s == max_score)?; + Some(detectors[winner_idx].format) +} + +/// Auto-detect date format from owned strings (for streaming). +pub fn detect_whatsapp_format_owned(lines: &[String]) -> Option { + let refs: Vec<&str> = lines.iter().map(String::as_str).collect(); + detect_whatsapp_format(&refs) +} + +#[cfg(test)] +mod tests { + use super::*; + + #[test] + fn test_detect_format_us() { + let lines = vec![ + "[1/15/24, 10:30:45 AM] Alice: Hello", + "[1/15/24, 10:31:00 AM] Bob: Hi there", + ]; + assert_eq!(detect_whatsapp_format(&lines), Some(DateFormat::US)); + } + + #[test] + fn test_detect_format_eu_dot_bracketed() { + let lines = vec![ + "[15.01.24, 10:30:45] Alice: Hello", + "[15.01.24, 10:31:00] Bob: Hi there", + ]; + assert_eq!( + detect_whatsapp_format(&lines), + Some(DateFormat::EuDotBracketed) + ); + } + + #[test] + fn test_detect_format_eu_dot_no_bracket() { + let lines = vec![ + "26.10.2025, 20:40 - Alice: Hello", + "26.10.2025, 20:41 - Bob: Hi there", + ]; + assert_eq!( + detect_whatsapp_format(&lines), + Some(DateFormat::EuDotNoBracket) + ); + } + + #[test] + fn test_detect_format_eu_slash() { + let lines = vec![ + "15/01/2024, 10:30 - Alice: Hello", + "15/01/2024, 10:31 - Bob: Hi there", + ]; + assert_eq!(detect_whatsapp_format(&lines), Some(DateFormat::EuSlash)); + } + + #[test] + fn test_is_system_message_english() { + assert!(is_whatsapp_system_message( + "Alice", + "Messages and calls are end-to-end encrypted" + )); + assert!(is_whatsapp_system_message( + "Bob", + "added Charlie to the group" + )); + assert!(is_whatsapp_system_message("Alice", "left")); + assert!(!is_whatsapp_system_message("Alice", "Hello everyone!")); + assert!(!is_whatsapp_system_message("Bob", "")); + } + + #[test] + fn test_is_system_message_russian() { + assert!(is_whatsapp_system_message( + "Система", + "Сообщения и звонки защищены сквозным шифрованием" + )); + assert!(is_whatsapp_system_message("Bob", "Подробнее")); + assert!(!is_whatsapp_system_message("Муха", "Добрый вечер")); + } + + #[test] + fn test_parse_timestamp_us() { + let ts = parse_whatsapp_timestamp("1/15/24", "10:30:45 AM", DateFormat::US); + assert!(ts.is_some()); + } + + #[test] + fn test_parse_timestamp_eu_dot() { + let ts = parse_whatsapp_timestamp("15.01.24", "10:30:45", DateFormat::EuDotBracketed); + assert!(ts.is_some()); + + let ts2 = parse_whatsapp_timestamp("26.10.2025", "20:40", DateFormat::EuDotNoBracket); + assert!(ts2.is_some()); + } + + #[test] + fn test_empty_sender_is_system() { + assert!(is_whatsapp_system_message("", "Some message")); + assert!(is_whatsapp_system_message(" ", "Some message")); + } +} diff --git a/src/streaming/instagram.rs b/src/streaming/instagram.rs index 9c55e84a..41d1c786 100644 --- a/src/streaming/instagram.rs +++ b/src/streaming/instagram.rs @@ -18,11 +18,9 @@ use std::fs::File; use std::io::{BufRead, BufReader, Seek}; use std::path::Path; -use chrono::{TimeZone, Utc}; -use serde::Deserialize; - use crate::Message; use crate::error::ChatpackError; +use crate::parsing::instagram::{InstagramRawMessage, parse_instagram_message}; use super::{MessageIterator, StreamingConfig, StreamingError, StreamingParser, StreamingResult}; @@ -210,50 +208,14 @@ impl InstagramMessageIterator { } } - /// Parses a JSON string into a Message. - fn parse_message(json_str: &str) -> StreamingResult> { + /// Parses a JSON string into a Message using shared parsing logic. + fn parse_message_from_json(json_str: &str) -> StreamingResult> { let msg: InstagramRawMessage = serde_json::from_str(json_str)?; - - // Get content from various possible locations - let content = msg - .content - .or_else(|| msg.share.as_ref().and_then(|s| s.share_text.clone())) - .map(|s| fix_encoding(&s)); - - // Skip messages without content - let content = match content { - Some(c) if !c.trim().is_empty() => c, - _ => return Ok(None), - }; - - // Fix encoding on sender name - let sender = fix_encoding(&msg.sender_name); - - // Parse timestamp (milliseconds to DateTime) - let timestamp = Utc.timestamp_millis_opt(msg.timestamp_ms).single(); - - Ok(Some(Message::with_metadata( - sender, content, timestamp, None, // Instagram doesn't have message IDs in export - None, // No reply references - None, // No edit timestamps - ))) + // Streaming always fixes encoding + Ok(parse_instagram_message(&msg, true)) } } -/// Fix Meta's broken encoding (Mojibake). -/// -/// Meta exports UTF-8 text encoded as if it were ISO-8859-1. -/// Each UTF-8 byte is stored as a separate Unicode codepoint. -/// Example: "Привет" becomes "Привет" -/// -/// This function reverses that process by: -/// 1. Taking each char as its byte value -/// 2. Reconstructing the original UTF-8 string -fn fix_encoding(s: &str) -> String { - let bytes: Vec = s.chars().map(|c| c as u8).collect(); - String::from_utf8(bytes).unwrap_or_else(|_| s.to_string()) -} - impl MessageIterator for InstagramMessageIterator { fn progress(&self) -> Option { if self.file_size == 0 { @@ -282,7 +244,7 @@ impl Iterator for InstagramMessageIterator { loop { match self.read_next_object() { Ok(Some(json_str)) => { - match Self::parse_message(&json_str) { + match Self::parse_message_from_json(&json_str) { Ok(Some(msg)) => return Some(Ok(msg)), Ok(None) => {} // Skip messages without content, try next Err(_) if self.config.skip_invalid => {} // Skip invalid @@ -297,52 +259,10 @@ impl Iterator for InstagramMessageIterator { } } -/// Raw message structure for deserialization. -// Add #[allow(dead_code)] to the structs causing warnings - -#[derive(Debug, Deserialize)] -#[allow(dead_code)] -struct InstagramRawMessage { - sender_name: String, - timestamp_ms: i64, - content: Option, - share: Option, - #[serde(default)] - photos: Option>, - #[serde(default)] - videos: Option>, - #[serde(default)] - audio_files: Option>, -} - -#[derive(Debug, Deserialize)] -#[allow(dead_code)] -struct InstagramShare { - share_text: Option, - link: Option, -} - -#[derive(Debug, Deserialize)] -#[allow(dead_code)] -struct InstagramPhoto { - uri: Option, -} - -#[derive(Debug, Deserialize)] -#[allow(dead_code)] -struct InstagramVideo { - uri: Option, -} - -#[derive(Debug, Deserialize)] -#[allow(dead_code)] -struct InstagramAudio { - uri: Option, -} - #[cfg(test)] mod tests { use super::*; + use crate::parsing::instagram::fix_mojibake_encoding; use std::io::Cursor; fn create_test_json() -> String { @@ -396,11 +316,7 @@ mod tests { #[test] fn test_fix_encoding() { // Test that normal ASCII passes through - assert_eq!(fix_encoding("Hello"), "Hello"); - - // Test that already correct UTF-8 might get corrupted (this is expected - // since we can't distinguish between mojibake and intentional Latin-1) - // The fix_encoding is only meant for Meta's specific encoding issue + assert_eq!(fix_mojibake_encoding("Hello"), "Hello"); } #[test] diff --git a/src/streaming/telegram.rs b/src/streaming/telegram.rs index c34e7136..94b87ad7 100644 --- a/src/streaming/telegram.rs +++ b/src/streaming/telegram.rs @@ -18,12 +18,9 @@ use std::fs::File; use std::io::{BufRead, BufReader, Seek}; use std::path::Path; -use chrono::DateTime; -use serde::Deserialize; -use serde_json::Value; - use crate::Message; use crate::error::ChatpackError; +use crate::parsing::telegram::{TelegramRawMessage, parse_telegram_message}; use super::{MessageIterator, StreamingConfig, StreamingError, StreamingParser, StreamingResult}; @@ -211,49 +208,10 @@ impl TelegramMessageIterator { } } - /// Parses a JSON string into a Message. - fn parse_message(json_str: &str) -> StreamingResult> { + /// Parses a JSON string into a Message using shared parsing logic. + fn parse_message_from_json(json_str: &str) -> StreamingResult> { let msg: TelegramRawMessage = serde_json::from_str(json_str)?; - - // Skip non-message types - if msg.msg_type != "message" { - return Ok(None); - } - - let Some(sender) = msg.from else { - return Ok(None); - }; - let content = match msg.text { - Some(text) => extract_text(&text), - None => return Ok(None), - }; - - if content.trim().is_empty() { - return Ok(None); - } - - let timestamp = msg.date_unixtime.as_ref().and_then(|ts_str| { - ts_str - .parse::() - .ok() - .and_then(|ts| DateTime::from_timestamp(ts, 0)) - }); - - let edited = msg.edited_unixtime.as_ref().and_then(|ts_str| { - ts_str - .parse::() - .ok() - .and_then(|ts| DateTime::from_timestamp(ts, 0)) - }); - - Ok(Some(Message::with_metadata( - sender, - content, - timestamp, - msg.id, - msg.reply_to_message_id, - edited, - ))) + Ok(parse_telegram_message(&msg)) } } @@ -285,7 +243,7 @@ impl Iterator for TelegramMessageIterator { loop { match self.read_next_object() { Ok(Some(json_str)) => { - match Self::parse_message(&json_str) { + match Self::parse_message_from_json(&json_str) { Ok(Some(msg)) => return Some(Ok(msg)), Ok(None) => {} // Skip non-messages, try next Err(_) if self.config.skip_invalid => {} // Skip invalid @@ -300,41 +258,10 @@ impl Iterator for TelegramMessageIterator { } } -/// Raw message structure for deserialization. -#[derive(Debug, Deserialize)] -struct TelegramRawMessage { - id: Option, - #[serde(rename = "type")] - msg_type: String, - date_unixtime: Option, - from: Option, - text: Option, - reply_to_message_id: Option, - edited_unixtime: Option, -} - -/// Extracts text content from Telegram's complex text field. -fn extract_text(text_value: &Value) -> String { - match text_value { - Value::String(s) => s.clone(), - Value::Array(arr) => arr - .iter() - .filter_map(|item| match item { - Value::String(s) => Some(s.clone()), - Value::Object(obj) => obj - .get("text") - .and_then(|v| v.as_str()) - .map(ToString::to_string), - _ => None, - }) - .collect::(), - _ => String::new(), - } -} - #[cfg(test)] mod tests { use super::*; + use crate::parsing::telegram::extract_telegram_text; use std::io::Cursor; fn create_test_json() -> String { @@ -394,7 +321,7 @@ mod tests { {"type": "bold", "text": "world"}, "!" ]); - assert_eq!(extract_text(&value), "Hello, world!"); + assert_eq!(extract_telegram_text(&value), "Hello, world!"); } #[test] diff --git a/src/streaming/whatsapp.rs b/src/streaming/whatsapp.rs index d0e9a4a0..22fcfce1 100644 --- a/src/streaming/whatsapp.rs +++ b/src/streaming/whatsapp.rs @@ -13,11 +13,14 @@ use std::fs::File; use std::io::{BufRead, BufReader}; use std::path::Path; -use chrono::{DateTime, NaiveDateTime, Utc}; +use chrono::{DateTime, Utc}; use regex::Regex; use crate::Message; use crate::error::ChatpackError; +use crate::parsing::whatsapp::{ + DateFormat, detect_whatsapp_format_owned, is_whatsapp_system_message, parse_whatsapp_timestamp, +}; use super::{MessageIterator, StreamingConfig, StreamingParser, StreamingResult}; @@ -65,65 +68,6 @@ impl StreamingParser for WhatsAppStreamingParser { } } -/// Detected date format variants. -#[derive(Debug, Clone, Copy, PartialEq)] -enum DateFormat { - US, - EuDotBracketed, - EuDotNoBracket, - EuSlash, - EuSlashBracketed, -} - -impl DateFormat { - fn pattern(self) -> &'static str { - match self { - DateFormat::US => { - r"^\[(\d{1,2}/\d{1,2}/\d{2,4}),\s(\d{1,2}:\d{2}(?::\d{2})?(?:\s?[APap][Mm])?)\]\s([^:]+):\s?(.*)" - } - DateFormat::EuDotBracketed => { - r"^\[(\d{2}\.\d{2}\.\d{2,4}),\s(\d{2}:\d{2}(?::\d{2})?)\]\s([^:]+):\s?(.*)" - } - DateFormat::EuDotNoBracket => { - r"^(\d{2}\.\d{2}\.\d{2,4}),\s(\d{2}:\d{2}(?::\d{2})?)\s-\s([^:]+):\s?(.*)" - } - DateFormat::EuSlash => { - r"^(\d{2}/\d{2}/\d{2,4}),\s(\d{2}:\d{2}(?::\d{2})?)\s-\s([^:]+):\s?(.*)" - } - DateFormat::EuSlashBracketed => { - r"^\[(\d{2}/\d{2}/\d{2,4}),\s(\d{2}:\d{2}(?::\d{2})?)\]\s([^:]+):\s?(.*)" - } - } - } - - fn date_parse_formats(self) -> &'static [&'static str] { - match self { - DateFormat::US => &[ - "%m/%d/%y, %I:%M:%S %p", - "%m/%d/%y, %I:%M %p", - "%m/%d/%Y, %I:%M:%S %p", - "%m/%d/%Y, %I:%M %p", - "%m/%d/%y, %H:%M:%S", - "%m/%d/%y, %H:%M", - "%m/%d/%Y, %H:%M:%S", - "%m/%d/%Y, %H:%M", - ], - DateFormat::EuDotBracketed | DateFormat::EuDotNoBracket => &[ - "%d.%m.%y, %H:%M:%S", - "%d.%m.%y, %H:%M", - "%d.%m.%Y, %H:%M:%S", - "%d.%m.%Y, %H:%M", - ], - DateFormat::EuSlash | DateFormat::EuSlashBracketed => &[ - "%d/%m/%y, %H:%M:%S", - "%d/%m/%y, %H:%M", - "%d/%m/%Y, %H:%M:%S", - "%d/%m/%Y, %H:%M", - ], - } - } -} - #[derive(Debug, Default)] struct PendingMessage { sender: String, @@ -145,7 +89,7 @@ impl PendingMessage { return None; } - if is_system_message(&self.sender, &self.content) { + if is_whatsapp_system_message(&self.sender, &self.content) { return None; } @@ -160,77 +104,6 @@ impl PendingMessage { } } -fn is_system_message(sender: &str, content: &str) -> bool { - let system_indicators_en = [ - "Messages and calls are end-to-end encrypted", - "created group", - "added", - "removed", - "left", - "changed the subject", - "changed this group's icon", - "changed the group description", - "deleted this group's icon", - "changed their phone number", - "joined using this group's invite link", - "security code changed", - "You're now an admin", - "is now an admin", - "disappeared", - "turned on disappearing messages", - "turned off disappearing messages", - ]; - - let system_indicators_ru = [ - "Сообщения и звонки защищены сквозным шифрованием", - "создал(а) группу", - "добавил", - "удалил", - "вышел", - "покинул", - "изменил тему", - "изменил иконку группы", - "изменил описание группы", - "удалил иконку группы", - "изменил номер телефона", - "присоединился по ссылке", - "код безопасности изменён", - "теперь администратор", - "включил исчезающие сообщения", - "выключил исчезающие сообщения", - "Подробнее", - ]; - - let content_lower = content.to_lowercase(); - let sender_lower = sender.to_lowercase(); - - for indicator in &system_indicators_en { - if content_lower.contains(&indicator.to_lowercase()) { - return true; - } - } - - for indicator in &system_indicators_ru { - if content.contains(indicator) { - return true; - } - } - - sender.trim().is_empty() || sender_lower.contains("whatsapp") || sender_lower.contains("system") -} - -fn parse_timestamp(date_str: &str, time_str: &str, format: DateFormat) -> Option> { - let datetime_str = format!("{}, {}", date_str, time_str); - - for parse_format in format.date_parse_formats() { - if let Ok(naive) = NaiveDateTime::parse_from_str(&datetime_str, parse_format) { - return Some(naive.and_utc()); - } - } - - None -} - /// Iterator over WhatsApp messages. pub struct WhatsAppMessageIterator { reader: R, @@ -261,7 +134,7 @@ impl WhatsAppMessageIterator { sample_lines.push(line); } - let detected_format = detect_format(&sample_lines); + let detected_format = detect_whatsapp_format_owned(&sample_lines); let format_regex = detected_format.map(|f| Regex::new(f.pattern()).unwrap()); let mut iter = Self { @@ -307,7 +180,7 @@ impl WhatsAppMessageIterator { self.pending.sender = sender.to_string(); self.pending.content = content.to_string(); - self.pending.timestamp = parse_timestamp(date_str, time_str, format); + self.pending.timestamp = parse_whatsapp_timestamp(date_str, time_str, format); return; } } @@ -330,36 +203,6 @@ impl WhatsAppMessageIterator { } } -fn detect_format(lines: &[String]) -> Option { - let formats = [ - DateFormat::US, - DateFormat::EuDotBracketed, - DateFormat::EuDotNoBracket, - DateFormat::EuSlash, - DateFormat::EuSlashBracketed, - ]; - - let mut scores = [0usize; 5]; - - for line in lines { - for (i, format) in formats.iter().enumerate() { - if let Ok(regex) = Regex::new(format.pattern()) { - if regex.is_match(line) { - scores[i] += 1; - } - } - } - } - - let max_score = *scores.iter().max()?; - if max_score == 0 { - return None; - } - - let winner_idx = scores.iter().position(|&s| s == max_score)?; - Some(formats[winner_idx]) -} - impl MessageIterator for WhatsAppMessageIterator { fn progress(&self) -> Option { if self.file_size == 0 { @@ -511,12 +354,15 @@ This is a continuation line #[test] fn test_system_message_detection() { - assert!(is_system_message( + assert!(is_whatsapp_system_message( "Alice", "Messages and calls are end-to-end encrypted" )); - assert!(!is_system_message("Alice", "Hello everyone!")); - assert!(is_system_message("Bob", "added Charlie to the group")); + assert!(!is_whatsapp_system_message("Alice", "Hello everyone!")); + assert!(is_whatsapp_system_message( + "Bob", + "added Charlie to the group" + )); } #[test] @@ -525,7 +371,7 @@ This is a continuation line "[1/15/24, 10:30:45 AM] Alice: Hello".to_string(), "[1/15/24, 10:31:00 AM] Bob: Hi there".to_string(), ]; - assert_eq!(detect_format(&lines), Some(DateFormat::US)); + assert_eq!(detect_whatsapp_format_owned(&lines), Some(DateFormat::US)); } #[test] @@ -534,7 +380,10 @@ This is a continuation line "[15.01.24, 10:30:45] Alice: Hello".to_string(), "[15.01.24, 10:31:00] Bob: Hi there".to_string(), ]; - assert_eq!(detect_format(&lines), Some(DateFormat::EuDotBracketed)); + assert_eq!( + detect_whatsapp_format_owned(&lines), + Some(DateFormat::EuDotBracketed) + ); } #[test] diff --git a/tests/integration.rs b/tests/integration.rs index fa123f41..5810bcbe 100644 --- a/tests/integration.rs +++ b/tests/integration.rs @@ -750,7 +750,7 @@ mod output_config_tests { } // ============================================================================ -// InternalMessage Tests +// Message Tests // ============================================================================ mod message_tests { @@ -762,7 +762,7 @@ mod message_tests { let ts = Utc.with_ymd_and_hms(2024, 1, 15, 10, 30, 0).unwrap(); let edit_ts = Utc.with_ymd_and_hms(2024, 1, 15, 11, 0, 0).unwrap(); - let msg = InternalMessage::new("Alice", "Hello") + let msg = Message::new("Alice", "Hello") .with_id(123) .with_timestamp(ts) .with_reply_to(122) @@ -778,25 +778,25 @@ mod message_tests { #[test] fn test_message_has_metadata() { - let simple = InternalMessage::new("Bob", "Hi"); + let simple = Message::new("Bob", "Hi"); assert!(!simple.has_metadata()); - let with_id = InternalMessage::new("Bob", "Hi").with_id(1); + let with_id = Message::new("Bob", "Hi").with_id(1); assert!(with_id.has_metadata()); } #[test] fn test_message_is_empty() { - let empty = InternalMessage::new("Alice", ""); + let empty = Message::new("Alice", ""); assert!(empty.is_empty()); - let not_empty = InternalMessage::new("Alice", "Hello"); + let not_empty = Message::new("Alice", "Hello"); assert!(!not_empty.is_empty()); } #[test] fn test_message_clone() { - let msg = InternalMessage::new("Alice", "Hello").with_id(1); + let msg = Message::new("Alice", "Hello").with_id(1); let cloned = msg.clone(); assert_eq!(msg.sender, cloned.sender); @@ -806,7 +806,7 @@ mod message_tests { #[test] fn test_message_debug() { - let msg = InternalMessage::new("Alice", "Hello"); + let msg = Message::new("Alice", "Hello"); let debug = format!("{msg:?}"); assert!(debug.contains("Alice")); @@ -815,9 +815,9 @@ mod message_tests { #[test] fn test_message_partial_eq() { - let msg1 = InternalMessage::new("Alice", "Hello"); - let msg2 = InternalMessage::new("Alice", "Hello"); - let msg3 = InternalMessage::new("Bob", "Hello"); + let msg1 = Message::new("Alice", "Hello"); + let msg2 = Message::new("Alice", "Hello"); + let msg3 = Message::new("Bob", "Hello"); assert_eq!(msg1, msg2); assert_ne!(msg1, msg3); @@ -979,12 +979,10 @@ mod serde_tests { #[test] fn test_message_serialize_deserialize() { let ts = Utc.with_ymd_and_hms(2024, 1, 15, 10, 30, 0).unwrap(); - let msg = InternalMessage::new("Alice", "Hello") - .with_id(1) - .with_timestamp(ts); + let msg = Message::new("Alice", "Hello").with_id(1).with_timestamp(ts); let json = serde_json::to_string(&msg).unwrap(); - let deserialized: InternalMessage = serde_json::from_str(&json).unwrap(); + let deserialized: Message = serde_json::from_str(&json).unwrap(); assert_eq!(msg, deserialized); } diff --git a/tests/output_tests.rs b/tests/output_tests.rs index a697dd9b..6c3759eb 100644 --- a/tests/output_tests.rs +++ b/tests/output_tests.rs @@ -1,26 +1,26 @@ //! Tests for output writers (JSON, JSONL, CSV) use chatpack::core::output::{write_csv, write_json, write_jsonl}; -use chatpack::core::{InternalMessage, OutputConfig}; +use chatpack::core::{Message, OutputConfig}; use chrono::{TimeZone, Utc}; use std::fs; use tempfile::tempdir; -fn sample_messages() -> Vec { +fn sample_messages() -> Vec { let ts1 = Utc.with_ymd_and_hms(2024, 1, 15, 10, 30, 0).unwrap(); let ts2 = Utc.with_ymd_and_hms(2024, 1, 15, 10, 31, 0).unwrap(); let ts3 = Utc.with_ymd_and_hms(2024, 1, 15, 10, 32, 0).unwrap(); let edit_ts = Utc.with_ymd_and_hms(2024, 1, 15, 11, 0, 0).unwrap(); vec![ - InternalMessage::new("Alice", "Hello!") + Message::new("Alice", "Hello!") .with_id(1) .with_timestamp(ts1), - InternalMessage::new("Bob", "Hi Alice!") + Message::new("Bob", "Hi Alice!") .with_id(2) .with_timestamp(ts2) .with_reply_to(1), - InternalMessage::new("Alice", "How are you?") + Message::new("Alice", "How are you?") .with_id(3) .with_timestamp(ts3) .with_edited(edit_ts), @@ -97,7 +97,7 @@ mod json_writer_tests { let path = dir.path().join("output.json"); let path_str = path.to_str().unwrap(); - let messages: Vec = vec![]; + let messages: Vec = vec![]; let config = OutputConfig::new(); write_json(&messages, path_str, &config).unwrap(); @@ -113,8 +113,8 @@ mod json_writer_tests { let path_str = path.to_str().unwrap(); let messages = vec![ - InternalMessage::new("Алиса", "Привет! 🎉"), - InternalMessage::new("田中", "こんにちは"), + Message::new("Алиса", "Привет! 🎉"), + Message::new("田中", "こんにちは"), ]; let config = OutputConfig::new(); @@ -195,7 +195,7 @@ mod jsonl_writer_tests { let path = dir.path().join("output.jsonl"); let path_str = path.to_str().unwrap(); - let messages: Vec = vec![]; + let messages: Vec = vec![]; let config = OutputConfig::new(); write_jsonl(&messages, path_str, &config).unwrap(); @@ -218,7 +218,7 @@ mod csv_writer_tests { let path = dir.path().join("output.csv"); let path_str = path.to_str().unwrap(); - let messages = vec![InternalMessage::new("Alice", "Hello, World!")]; + let messages = vec![Message::new("Alice", "Hello, World!")]; let config = OutputConfig::new(); write_csv(&messages, path_str, &config).unwrap(); @@ -233,7 +233,7 @@ mod csv_writer_tests { let path = dir.path().join("output.csv"); let path_str = path.to_str().unwrap(); - let messages = vec![InternalMessage::new("Alice", "She said \"hello\"")]; + let messages = vec![Message::new("Alice", "She said \"hello\"")]; let config = OutputConfig::new(); write_csv(&messages, path_str, &config).unwrap(); @@ -248,7 +248,7 @@ mod csv_writer_tests { let path = dir.path().join("output.csv"); let path_str = path.to_str().unwrap(); - let messages: Vec = vec![]; + let messages: Vec = vec![]; let config = OutputConfig::new(); write_csv(&messages, path_str, &config).unwrap(); @@ -265,7 +265,7 @@ mod csv_writer_tests { let path = dir.path().join("output.csv"); let path_str = path.to_str().unwrap(); - let messages = vec![InternalMessage::new("Алиса", "Привет! 🎉")]; + let messages = vec![Message::new("Алиса", "Привет! 🎉")]; let config = OutputConfig::new(); write_csv(&messages, path_str, &config).unwrap(); @@ -281,7 +281,7 @@ mod csv_writer_tests { let path = dir.path().join("output.csv"); let path_str = path.to_str().unwrap(); - let messages = vec![InternalMessage::new("Alice", "Line 1\nLine 2\nLine 3")]; + let messages = vec![Message::new("Alice", "Line 1\nLine 2\nLine 3")]; let config = OutputConfig::new(); write_csv(&messages, path_str, &config).unwrap(); @@ -302,9 +302,9 @@ mod edge_cases { let dir = tempdir().unwrap(); let messages = vec![ - InternalMessage::new("Alice", "Test <>&\"'"), - InternalMessage::new("Bob", "Tab:\tNewline:\n"), - InternalMessage::new("Charlie", "Backslash: \\"), + Message::new("Alice", "Test <>&\"'"), + Message::new("Bob", "Tab:\tNewline:\n"), + Message::new("Charlie", "Backslash: \\"), ]; let config = OutputConfig::new(); @@ -328,7 +328,7 @@ mod edge_cases { let dir = tempdir().unwrap(); let long_content = "A".repeat(10000); - let messages = vec![InternalMessage::new("Alice", &long_content)]; + let messages = vec![Message::new("Alice", &long_content)]; let config = OutputConfig::new(); @@ -343,7 +343,7 @@ mod edge_cases { fn test_empty_sender() { let dir = tempdir().unwrap(); - let messages = vec![InternalMessage::new("", "Message with empty sender")]; + let messages = vec![Message::new("", "Message with empty sender")]; let config = OutputConfig::new(); diff --git a/tests/proptest.rs b/tests/proptest.rs index d64d782e..077875d2 100644 --- a/tests/proptest.rs +++ b/tests/proptest.rs @@ -4,10 +4,10 @@ use proptest::prelude::*; -use chatpack::core::{FilterConfig, InternalMessage, apply_filters, merge_consecutive}; +use chatpack::core::{FilterConfig, Message, apply_filters, merge_consecutive}; -/// Generate a random InternalMessage using fast strategies (no regex!) -fn arb_message() -> impl Strategy { +/// Generate a random Message using fast strategies (no regex!) +fn arb_message() -> impl Strategy { ( // Fast: select from predefined senders prop::sample::select(vec![ @@ -32,7 +32,7 @@ fn arb_message() -> impl Strategy { "🎉🔥💀 emoji".to_string(), ]), ) - .prop_map(|(sender, content)| InternalMessage { + .prop_map(|(sender, content)| Message { sender, content, timestamp: None, @@ -43,7 +43,7 @@ fn arb_message() -> impl Strategy { } /// Generate a vector of random messages -fn arb_messages(max_len: usize) -> impl Strategy> { +fn arb_messages(max_len: usize) -> impl Strategy> { prop::collection::vec(arb_message(), 0..max_len) } @@ -136,7 +136,7 @@ proptest! { /// Messages with special characters don't break merge #[test] fn special_chars_dont_break_merge(sender in prop::sample::select(vec!["A", "B", "C"])) { - let msg = InternalMessage { + let msg = Message { sender: sender.to_string(), content: "test;with\"special\nchars\ttab".to_string(), timestamp: None, @@ -152,7 +152,7 @@ proptest! { fn unicode_content_preserved(idx in 0usize..5) { let contents = ["Привет", "こんにちは", "مرحبا", "🎉🔥💀", "Mixed Тест 日本"]; let content = contents[idx].to_string(); - let msg = InternalMessage { + let msg = Message { sender: "User".to_string(), content: content.clone(), timestamp: None, @@ -176,7 +176,7 @@ mod edge_cases { #[test] fn merge_consecutive_same_sender() { let messages = vec![ - InternalMessage { + Message { sender: "Alice".into(), content: "Hello".into(), timestamp: None, @@ -184,7 +184,7 @@ mod edge_cases { reply_to: None, edited: None, }, - InternalMessage { + Message { sender: "Alice".into(), content: "World".into(), timestamp: None, @@ -203,7 +203,7 @@ mod edge_cases { #[test] fn merge_alternating_senders() { let messages = vec![ - InternalMessage { + Message { sender: "Alice".into(), content: "Hi".into(), timestamp: None, @@ -211,7 +211,7 @@ mod edge_cases { reply_to: None, edited: None, }, - InternalMessage { + Message { sender: "Bob".into(), content: "Hey".into(), timestamp: None, @@ -219,7 +219,7 @@ mod edge_cases { reply_to: None, edited: None, }, - InternalMessage { + Message { sender: "Alice".into(), content: "Bye".into(), timestamp: None, @@ -235,7 +235,7 @@ mod edge_cases { #[test] fn filter_empty_messages() { - let messages: Vec = vec![]; + let messages: Vec = vec![]; let config = FilterConfig::new().with_user("Anyone".into()); let filtered = apply_filters(messages, &config); assert!(filtered.is_empty()); @@ -244,7 +244,7 @@ mod edge_cases { #[test] fn merge_with_empty_content() { let messages = vec![ - InternalMessage { + Message { sender: "Alice".into(), content: String::new(), timestamp: None, @@ -252,7 +252,7 @@ mod edge_cases { reply_to: None, edited: None, }, - InternalMessage { + Message { sender: "Alice".into(), content: "Real message".into(), timestamp: None,