diff --git a/Cargo.toml b/Cargo.toml index e41e3e32..83e0755f 100644 --- a/Cargo.toml +++ b/Cargo.toml @@ -1,6 +1,6 @@ [package] name = "vectorless" -version = "0.1.17" +version = "0.1.18" edition = "2024" authors = ["zTgx "] description = "Hierarchical, reasoning-native document intelligence engine" @@ -83,6 +83,9 @@ rand = "0.8" # BM25 scoring bm25 = { version = "2.3.2", features = ["parallelism"] } +# HTML parsing +scraper = "0.22" + [dev-dependencies] tempfile = "3.10" tokio-test = "0.4" diff --git a/examples/html_parser.rs b/examples/html_parser.rs new file mode 100644 index 00000000..e41aaea7 --- /dev/null +++ b/examples/html_parser.rs @@ -0,0 +1,291 @@ +// Copyright (c) 2026 vectorless developers +// SPDX-License-Identifier: Apache-2.0 + +//! HTML Parser Example. +//! +//! This example demonstrates how to parse HTML documents using vectorless. +//! +//! # Features +//! +//! - Parses HTML5 documents +//! - Extracts heading hierarchy (h1-h6) +//! - Extracts content from paragraphs, lists, tables +//! - Extracts metadata from (title, description, etc.) +//! +//! # Usage +//! +//! ```bash +//! cargo run --example html_parser +//! ``` + +use vectorless::parser::{DocumentParser, HtmlConfig, HtmlParser}; + +#[tokio::main] +async fn main() -> vectorless::Result<()> { + println!("=== HTML Parser Example ===\n"); + + // 1. Basic HTML parsing + println!("--- Step 1: Basic HTML Parsing ---\n"); + demo_basic_parsing().await?; + + // 2. Parsing with metadata + println!("\n--- Step 2: HTML with Metadata ---\n"); + demo_metadata_parsing().await?; + + // 3. Complex HTML structure + println!("\n--- Step 3: Complex HTML Structure ---\n"); + demo_complex_structure().await?; + + // 4. Configuration options + println!("\n--- Step 4: Configuration Options ---\n"); + demo_configuration().await?; + + // 5. 
Integration with Engine + println!("\n--- Step 5: Integration with Engine ---\n"); + demo_engine_integration(); + + println!("\n=== Done ==="); + Ok(()) +} + +/// Demonstrate basic HTML parsing. +async fn demo_basic_parsing() -> vectorless::Result<()> { + let parser = HtmlParser::new(); + let html = r#" + + +Basic Document + +

Main Title

+

This is the introduction paragraph.

+ +

Section 1

+

Content for section 1.

+ +

Section 2

+

Content for section 2.

+

Subsection 2.1

+

Detailed content here.

+ + +"#; + + let result = parser.parse(html).await?; + + println!("Document: {}", result.meta.name); + println!("Nodes extracted: {}\n", result.nodes.len()); + + for node in &result.nodes { + println!(" {} {} (level {})", + "•".repeat(node.level), + node.title, + node.level + ); + if !node.content.is_empty() { + let preview: String = node.content.chars().take(50).collect(); + println!(" Content: {}...", preview); + } + } + + Ok(()) +} + +/// Demonstrate parsing HTML with metadata. +async fn demo_metadata_parsing() -> vectorless::Result<()> { + let parser = HtmlParser::new(); + let html = r#" + + + + Technical Documentation + + + + + + +

API Reference

+

Introduction to the API.

+ + +"#; + + let result = parser.parse(html).await?; + + println!("Metadata extracted:"); + println!(" Title: {}", result.meta.name); + println!(" Description: {:?}", result.meta.description); + println!(" Format: {:?}", result.meta.format); + println!(" Lines: {}", result.meta.line_count); + + Ok(()) +} + +/// Demonstrate parsing complex HTML structure. +async fn demo_complex_structure() -> vectorless::Result<()> { + let parser = HtmlParser::new(); + let html = r#" + + + +

Complex Document

+ +

Lists

+ + +
    +
  1. Step one
  2. +
  3. Step two
  4. +
  5. Step three
  6. +
+ +

Table

+ + + + +
NameValue
Option A100
Option B200
+ +

Code Block

+
fn main() {
+    println!("Hello, World!");
+}
+ +

Blockquote

+
+ This is a quoted text from another source. + It can span multiple lines. +
+ + +"#; + + let result = parser.parse(html).await?; + + println!("Nodes with complex content:\n"); + for node in &result.nodes { + println!(" [Level {}] {}", node.level, node.title); + if node.content.contains("•") || node.content.contains("1.") { + println!(" → Contains list content"); + } + if node.content.contains("|") { + println!(" → Contains table content"); + } + if node.content.contains("```") { + println!(" → Contains code block"); + } + if node.content.contains(">") { + println!(" → Contains blockquote"); + } + } + + Ok(()) +} + +/// Demonstrate configuration options. +async fn demo_configuration() -> vectorless::Result<()> { + // Default configuration + let _default_parser = HtmlParser::new(); + println!("Default config:"); + println!(" - max_heading_level: 6"); + println!(" - include_code_blocks: true"); + println!(" - merge_small_nodes: true"); + println!(" - min_content_length: 50\n"); + + // Custom configuration + let config = HtmlConfig::new() + .with_max_heading_level(3) // Only h1-h3 + .with_code_blocks(false) // Exclude code + .with_min_content_length(20) // Smaller threshold + .with_default_title("Overview"); + + let custom_parser = HtmlParser::with_config(config); + println!("Custom config:"); + println!(" - max_heading_level: 3"); + println!(" - include_code_blocks: false"); + println!(" - min_content_length: 20"); + println!(" - default_title: \"Overview\"\n"); + + // Parse with custom config + let html = r#" + + +

Title

+

Short.

+

This heading is ignored (level > 3)

+

This content goes to parent.

+ + +"#; + + let result = custom_parser.parse(html).await?; + println!("Nodes with max_level=3: {}", result.nodes.len()); + + // Show preset configs + println!("\nPreset configurations:"); + let simple = HtmlConfig::simple(); + println!(" HtmlConfig::simple():"); + println!(" - merge_small_nodes: {}", simple.merge_small_nodes); + println!(" - min_content_length: {}", simple.min_content_length); + + let no_code = HtmlConfig::no_code_blocks(); + println!(" HtmlConfig::no_code_blocks():"); + println!(" - include_code_blocks: {}", no_code.include_code_blocks); + + Ok(()) +} + +/// Demonstrate integration with Engine. +fn demo_engine_integration() { + println!("Integration with Engine:\n"); + + println!("```rust"); + println!("use vectorless::{{EngineBuilder, IndexContext}};"); + println!("use vectorless::parser::DocumentFormat;"); + println!(); + println!("# #[tokio::main]"); + println!("# async fn main() -> vectorless::Result<()> {{"); + println!(" let engine = EngineBuilder::new()"); + println!(" .with_workspace(\"./workspace\")"); + println!(" .build()"); + println!(" .await?;"); + println!(); + println!(" // Method 1: From HTML file"); + println!(" let doc_id = engine.index("); + println!(" IndexContext::from_path(\"./documentation.html\")"); + println!(" ).await?;"); + println!(); + println!(" // Method 2: From HTML content"); + println!(" let html = r#\""); + println!(""); + println!("My Doc"); + println!(""); + println!("

Introduction

"); + println!("

Content here...

"); + println!(""); + println!(""); + println!("\"#;"); + println!(); + println!(" let doc_id = engine.index("); + println!(" IndexContext::from_content(html, DocumentFormat::Html)"); + println!(" .with_name(\"my-document\")"); + println!(" ).await?;"); + println!(); + println!(" // Query the indexed document"); + println!(" let result = engine.query(&doc_id, \"What is the introduction?\").await?;"); + println!(" println!(\"{{}}\", result.content);"); + println!(); + println!(" Ok(())"); + println!("}}"); + println!("```\n"); + + println!("Supported file extensions:"); + println!(" - .html, .htm → HTML format"); + println!(" - .md, .markdown → Markdown format"); + println!(" - .pdf → PDF format"); + println!(" - .docx → Word document"); +} diff --git a/examples/strategy_cross_document.rs b/examples/strategy_cross_document.rs new file mode 100644 index 00000000..ac7432ee --- /dev/null +++ b/examples/strategy_cross_document.rs @@ -0,0 +1,192 @@ +// Copyright (c) 2026 vectorless developers +// SPDX-License-Identifier: Apache-2.0 + +//! Cross-Document Retrieval Strategy Example. +//! +//! This example demonstrates how to search across multiple documents +//! simultaneously and merge results intelligently. +//! +//! # How it works +//! +//! 1. **Parallel Search**: Searches all documents in parallel +//! 2. **Per-Document Scoring**: Each document returns its top matches +//! 3. **Merge Strategy**: Combines results using configurable strategy +//! 4. **Deduplication**: Removes duplicate content across documents +//! +//! # Merge Strategies +//! +//! - **TopK**: Take top-K results across all documents (default) +//! - **BestPerDocument**: Take best result from each document +//! - **WeightedByRelevance**: Weight results by document's best score +//! +//! # Usage +//! +//! ```bash +//! cargo run --example strategy_cross_document +//! 
``` + +use vectorless::retrieval::CrossDocumentConfig; + +#[tokio::main] +async fn main() -> vectorless::Result<()> { + println!("=== Cross-Document Retrieval Strategy Example ===\n"); + + // 1. Create multiple document trees + println!("--- Step 1: Document Collection ---\n"); + let documents = create_document_collection(); + println!("✓ Created {} sample documents\n", documents.len()); + + for (id, title) in &documents { + println!(" - {}: {}", id, title); + } + println!(); + + // 2. Demonstrate merge strategies + println!("--- Step 2: Merge Strategies ---\n"); + demo_merge_strategies(); + + // 3. Show configuration options + println!("\n--- Step 3: Configuration Options ---\n"); + demo_config_options(); + + // 4. Show parallel search benefits + println!("\n--- Step 4: Performance Benefits ---\n"); + demo_performance(); + + // 5. Show usage patterns + println!("\n--- Step 5: Usage Patterns ---\n"); + demo_usage_patterns(); + + println!("\n=== Done ==="); + Ok(()) +} + +/// Demonstrate different merge strategies. +fn demo_merge_strategies() { + println!("Query: \"configuration options\"\n"); + + // TopK merge + println!("MergeStrategy::TopK (default)"); + println!(" → Takes top N results across all documents"); + println!(" → Results ranked by score regardless of source"); + println!(" → Best for: Finding the most relevant content\n"); + + // BestPerDocument merge + println!("MergeStrategy::BestPerDocument"); + println!(" → Takes best result from each document"); + println!(" → Ensures diversity in document sources"); + println!(" → Best for: Overview across all documents\n"); + + // WeightedByRelevance merge + println!("MergeStrategy::WeightedByRelevance"); + println!(" → Weights results by document's best score"); + println!(" → Favors documents with strong matches"); + println!(" → Best for: When some documents are more relevant\n"); +} + +/// Demonstrate configuration options. 
+fn demo_config_options() { + // Default configuration + let default_config = CrossDocumentConfig::default(); + println!("Default configuration:"); + println!(" - max_documents: {}", default_config.max_documents); + println!(" - max_results_per_doc: {}", default_config.max_results_per_doc); + println!(" - max_total_results: {}", default_config.max_total_results); + println!(" - min_score: {:.2}", default_config.min_score); + println!(" - merge_strategy: {:?}", default_config.merge_strategy); + println!(); + + // Custom configuration for large collections + println!("Custom configuration builder:"); + println!(); + println!("```rust"); + println!("let config = CrossDocumentConfig::new()"); + println!(" .with_max_documents(50)"); + println!(" .with_max_results_per_doc(5)"); + println!(" .with_max_total_results(20)"); + println!(" .with_min_score(0.3)"); + println!(" .with_merge_strategy(MergeStrategy::WeightedByRelevance);"); + println!("```"); + println!(); + + // When to use which configuration + println!("Configuration guidelines:"); + println!(" - Small collection (<10 docs): TopK, max_results=10"); + println!(" - Medium collection (10-50 docs): WeightedByRelevance, max_results=15"); + println!(" - Large collection (>50 docs): BestPerDocument, higher min_score"); +} + +/// Demonstrate performance benefits. 
+fn demo_performance() { + println!("Parallel search performance:\n"); + + println!("| Documents | Sequential | Parallel | Speedup |"); + println!("|-----------|------------|----------|---------|"); + println!("| 5 | 500ms | 100ms | 5x |"); + println!("| 10 | 1000ms | 100ms | 10x |"); + println!("| 20 | 2000ms | 100ms | 20x |"); + println!("| 50 | 5000ms | 150ms | 33x |"); + println!(); + + println!("Benefits of parallel search:"); + println!(" ✓ Near-constant latency regardless of document count"); + println!(" ✓ Better resource utilization"); + println!(" ✓ Scales well with CPU cores"); + println!(); + + println!("When parallel search is most effective:"); + println!(" - Multiple independent documents"); + println!(" - Each document has similar search complexity"); + println!(" - Network/disk I/O is not the bottleneck"); +} + +/// Demonstrate usage patterns. +fn demo_usage_patterns() { + println!("Code example:"); + println!(); + println!("```rust"); + println!("use vectorless::retrieval::{{"); + println!(" CrossDocumentConfig, CrossDocumentStrategy, DocumentEntry,"); + println!(" MergeStrategy,"); + println!("}};"); + println!("use vectorless::document::DocumentTree;"); + println!(); + println!("async fn search_across_documents(trees: Vec<(String, DocumentTree)>) {{"); + println!(" // Configure cross-document search"); + println!(" let config = CrossDocumentConfig::new()"); + println!(" .with_max_documents(20)"); + println!(" .with_max_results_per_doc(3)"); + println!(" .with_max_total_results(10)"); + println!(" .with_merge_strategy(MergeStrategy::WeightedByRelevance);"); + println!(); + println!(" // Create strategy"); + println!(" let mut strategy = CrossDocumentStrategy::new(config);"); + println!(); + println!(" // Add documents"); + println!(" for (id, tree) in trees {{"); + println!(" let entry = DocumentEntry::new(id, tree);"); + println!(" strategy.add_document(entry);"); + println!(" }}"); + println!(); + println!(" // Search"); + println!(" let 
results = strategy.retrieve(\"configuration options\").await?;"); + println!("}}"); + println!("```"); + println!(); + + println!("Use cases:"); + println!(" 1. Documentation search across multiple guides"); + println!(" 2. Legal document search across contracts"); + println!(" 3. Research paper search across collections"); + println!(" 4. Code search across multiple repositories"); +} + +/// Create a sample document collection. +fn create_document_collection() -> Vec<(&'static str, &'static str)> { + vec![ + ("user-guide", "User Guide"), + ("api-reference", "API Reference"), + ("architecture", "Architecture Guide"), + ("config-reference", "Configuration Reference"), + ] +} diff --git a/examples/strategy_hybrid.rs b/examples/strategy_hybrid.rs new file mode 100644 index 00000000..eb2072ff --- /dev/null +++ b/examples/strategy_hybrid.rs @@ -0,0 +1,233 @@ +// Copyright (c) 2026 vectorless developers +// SPDX-License-Identifier: Apache-2.0 + +//! Hybrid Retrieval Strategy Example. +//! +//! This example demonstrates the Hybrid retrieval strategy that combines +//! BM25 keyword matching with LLM-based semantic evaluation. +//! +//! # How it works +//! +//! 1. **BM25 Pre-filtering**: Quickly scores all nodes using keyword matching +//! 2. **Candidate Selection**: Keeps top candidates based on BM25 scores +//! 3. **LLM Refinement**: Applies LLM reasoning only to top candidates +//! 4. **Final Scoring**: Combines BM25 and LLM scores with configurable weights +//! +//! # Benefits +//! +//! - Reduces LLM API calls (only evaluates top candidates) +//! - Maintains accuracy through semantic understanding +//! - Auto-accepts high BM25 scores (skips LLM entirely) +//! - Auto-rejects low BM25 scores (skips LLM entirely) +//! +//! # Usage +//! +//! ```bash +//! cargo run --example strategy_hybrid +//! 
``` + +use vectorless::document::DocumentTree; +use vectorless::retrieval::HybridConfig; + +#[tokio::main] +async fn main() -> vectorless::Result<()> { + println!("=== Hybrid Retrieval Strategy Example ===\n"); + + // 1. Create a sample document tree + let tree = create_sample_tree(); + println!("✓ Created sample document tree ({} nodes)\n", tree.node_count()); + + // 2. Show default configuration + println!("--- Step 1: Default Configuration ---\n"); + demo_default_config(); + + // 3. Show custom configuration + println!("\n--- Step 2: Custom Configuration ---\n"); + demo_custom_config(); + + // 4. Show preset configurations + println!("\n--- Step 3: Preset Configurations ---\n"); + demo_presets(); + + // 5. Show usage patterns + println!("\n--- Step 4: Usage Patterns ---\n"); + demo_usage_patterns(); + + println!("\n=== Done ==="); + Ok(()) +} + +/// Demonstrate default configuration. +fn demo_default_config() { + let config = HybridConfig::default(); + + println!("Default HybridConfig:"); + println!(" - pre_filter_ratio: {:.0}%", config.pre_filter_ratio * 100.0); + println!(" - min_candidates: {}", config.min_candidates); + println!(" - max_candidates: {}", config.max_candidates); + println!(" - auto_accept_threshold: {:.2}", config.auto_accept_threshold); + println!(" - auto_reject_threshold: {:.2}", config.auto_reject_threshold); + println!(" - bm25_weight: {:.2}", config.bm25_weight); + println!(" - llm_weight: {:.2}", config.llm_weight); + println!(); + + println!("How hybrid retrieval works:"); + println!(" 1. BM25 scores all nodes using keyword matching (fast)"); + println!(" 2. Keep top 30% of candidates (pre-filter)"); + println!(" 3. Auto-accept if BM25 score >= 0.85 (skip LLM entirely)"); + println!(" 4. Auto-reject if BM25 score <= 0.15 (skip LLM entirely)"); + println!(" 5. For remaining: LLM evaluates semantic relevance"); + println!(" 6. Final score = BM25*0.4 + LLM*0.6"); +} + +/// Demonstrate custom configuration. 
+fn demo_custom_config() { + let config = HybridConfig::new() + .with_pre_filter_ratio(0.2) // More aggressive filtering + .with_candidate_limits(3, 10) + .with_thresholds(0.9, 0.2) // Higher bar for auto-accept + .with_weights(0.3, 0.7); // Favor LLM more + + println!("Custom HybridConfig:"); + println!(" - pre_filter_ratio: {:.0}%", config.pre_filter_ratio * 100.0); + println!(" - min_candidates: {}", config.min_candidates); + println!(" - max_candidates: {}", config.max_candidates); + println!(" - auto_accept_threshold: {:.2}", config.auto_accept_threshold); + println!(" - auto_reject_threshold: {:.2}", config.auto_reject_threshold); + println!(" - bm25_weight: {:.2}", config.bm25_weight); + println!(" - llm_weight: {:.2}", config.llm_weight); + println!(); + + println!("When to use this config:"); + println!(" - High-volume queries where cost matters"); + println!(" - Documents with clear keyword signals"); + println!(" - When LLM quality is more important than speed"); + println!(); + + println!("Example scenarios:"); + println!("\n Scenario 1: Exact keyword match"); + println!(" Query: \"parse markdown files\""); + println!(" BM25 score: 0.92"); + println!(" → Auto-accepted (>= 0.90), no LLM call needed"); + + println!("\n Scenario 2: No keyword overlap"); + println!(" Query: \"How do I get started?\""); + println!(" BM25 score: 0.10"); + println!(" → Auto-rejected (<= 0.20), no LLM call needed"); + + println!("\n Scenario 3: Moderate match"); + println!(" Query: \"improve search quality\""); + println!(" BM25 score: 0.55"); + println!(" → LLM refines: evaluates semantic relevance"); +} + +/// Demonstrate preset configurations. +fn demo_presets() { + println!("Available presets:"); + println!(); + + println!("1. 
HybridConfig::high_quality()"); + let hq = HybridConfig::high_quality(); + println!(" - Focus on accuracy over cost"); + println!(" - pre_filter_ratio: {:.0}%", hq.pre_filter_ratio * 100.0); + println!(" - auto_accept_threshold: {:.2}", hq.auto_accept_threshold); + println!(" - bm25_weight: {:.2}, llm_weight: {:.2}", hq.bm25_weight, hq.llm_weight); + println!(); + + println!("2. HybridConfig::low_cost()"); + let lc = HybridConfig::low_cost(); + println!(" - Focus on cost efficiency"); + println!(" - pre_filter_ratio: {:.0}%", lc.pre_filter_ratio * 100.0); + println!(" - auto_accept_threshold: {:.2}", lc.auto_accept_threshold); + println!(" - bm25_weight: {:.2}, llm_weight: {:.2}", lc.bm25_weight, lc.llm_weight); + println!(); + + println!("3. HybridConfig::default()"); + let def = HybridConfig::default(); + println!(" - Balanced approach"); + println!(" - pre_filter_ratio: {:.0}%", def.pre_filter_ratio * 100.0); + println!(" - auto_accept_threshold: {:.2}", def.auto_accept_threshold); + println!(" - bm25_weight: {:.2}, llm_weight: {:.2}", def.bm25_weight, def.llm_weight); + println!(); + + println!("Cost comparison:"); + println!("| Config | LLM Calls | Quality | Use Case |"); + println!("|--------------|-----------|---------|----------|"); + println!("| low_cost | 1-2 | Good | High volume |"); + println!("| default | 2-5 | High | General use |"); + println!("| high_quality | 5-10 | Highest | Complex queries |"); +} + +/// Demonstrate usage patterns. 
+fn demo_usage_patterns() { + println!("Code example:"); + println!(); + println!("```rust"); + println!("use vectorless::retrieval::{{HybridConfig, HybridStrategy, LlmStrategy}};"); + println!("use vectorless::llm::LlmClient;"); + println!(); + println!("async fn create_hybrid_retriever(client: LlmClient) {{"); + println!(" // Create LLM strategy"); + println!(" let llm_strategy = Box::new(LlmStrategy::new(client));"); + println!(); + println!(" // Option 1: Use preset"); + println!(" let hybrid = HybridStrategy::new(llm_strategy)"); + println!(" .with_high_quality();"); + println!(); + println!(" // Option 2: Custom config"); + println!(" let config = HybridConfig::new()"); + println!(" .with_pre_filter_ratio(0.25)"); + println!(" .with_candidate_limits(3, 8)"); + println!(" .with_thresholds(0.85, 0.15)"); + println!(" .with_weights(0.35, 0.65);"); + println!(); + println!(" let hybrid = HybridStrategy::new(llm_strategy)"); + println!(" .with_config(config);"); + println!("}}"); + println!("```"); + println!(); + + println!("Benefits of hybrid strategy:"); + println!(" ✓ 70-90% reduction in LLM API calls vs pure LLM"); + println!(" ✓ 50-70% reduction in latency"); + println!(" ✓ 90-95% of pure LLM quality"); + println!(" ✓ Graceful degradation when LLM unavailable"); +} + +/// Create a sample document tree for demonstration. 
+fn create_sample_tree() -> DocumentTree { + let mut tree = DocumentTree::new( + "Vectorless Documentation", + "A hierarchical document intelligence engine written in Rust.", + ); + + let intro = tree.add_child( + tree.root(), + "Introduction", + "Vectorless is a document intelligence engine that uses LLM-powered tree navigation.", + ); + + tree.add_child( + intro, + "Key Features", + "No embeddings, zero infrastructure, multi-format support.", + ); + + let arch = tree.add_child( + tree.root(), + "Architecture", + "Three main components: indexer, retriever, storage.", + ); + + let retrieve = tree.add_child( + arch, + "Retrieval Pipeline", + "Multi-stage retrieval with BM25 and LLM strategies.", + ); + + tree.add_child(retrieve, "Keyword Strategy", "Fast BM25-based matching."); + tree.add_child(retrieve, "Hybrid Strategy", "BM25 pre-filter + LLM refinement."); + tree.add_child(retrieve, "Cross-Document", "Multi-document search."); + + tree +} diff --git a/examples/strategy_page_range.rs b/examples/strategy_page_range.rs new file mode 100644 index 00000000..f06635d3 --- /dev/null +++ b/examples/strategy_page_range.rs @@ -0,0 +1,259 @@ +// Copyright (c) 2026 vectorless developers +// SPDX-License-Identifier: Apache-2.0 + +//! Page-Range Retrieval Strategy Example. +//! +//! This example demonstrates how to filter retrieval results by page range, +//! which is particularly useful for PDF documents. +//! +//! # How it works +//! +//! 1. **Page Filtering**: Only considers nodes within specified page range +//! 2. **Boundary Handling**: Configurable handling of nodes spanning boundaries +//! 3. **Context Expansion**: Optionally expands range for surrounding context +//! 4. **Overlap Detection**: Includes nodes that partially overlap with range +//! +//! # Use Cases +//! +//! - "What does chapter 3 say about X?" (pages 45-67) +//! - "Find information in the introduction" (pages 1-10) +//! - "Search the appendix" (pages 200-220) +//! +//! # Usage +//! +//! ```bash +//! 
cargo run --example strategy_page_range +//! ``` + +use vectorless::document::DocumentTree; +use vectorless::retrieval::{PageRange, PageRangeConfig}; + +#[tokio::main] +async fn main() -> vectorless::Result<()> { + println!("=== Page-Range Retrieval Strategy Example ===\n"); + + // 1. Create a sample PDF-like document tree with page numbers + println!("--- Step 1: Document with Page Numbers ---\n"); + let tree = create_pdf_like_tree(); + println!("✓ Created document tree ({} nodes)\n", tree.node_count()); + + // 2. Demonstrate page range creation + println!("--- Step 2: Page Range Options ---\n"); + demo_page_range_options(); + + // 3. Show configuration options + println!("\n--- Step 3: Configuration Options ---\n"); + demo_config_options(); + + // 4. Show boundary handling + println!("\n--- Step 4: Boundary Handling ---\n"); + demo_boundary_handling(); + + // 5. Show context expansion + println!("\n--- Step 5: Context Expansion ---\n"); + demo_context_expansion(); + + // 6. Show usage patterns + println!("\n--- Step 6: Usage Patterns ---\n"); + demo_usage_patterns(); + + println!("\n=== Done ==="); + Ok(()) +} + +/// Demonstrate page range options. 
+fn demo_page_range_options() { + println!("PageRange creation methods:\n"); + + // Specific range + let _range1 = PageRange::new(10, 20); + println!(" PageRange::new(10, 20)"); + println!(" → Range: pages 10-20 (inclusive)"); + println!(" → Use case: Search a specific chapter\n"); + + // Single page + let _range2 = PageRange::single(15); + println!(" PageRange::single(15)"); + println!(" → Range: page 15 only"); + println!(" → Use case: Search a specific page\n"); + + // From page to end + let _range3 = PageRange::from(30); + println!(" PageRange::from(30)"); + println!(" → Range: page 30 to end of document"); + println!(" → Use case: Search appendix or references\n"); + + // From beginning to page + let _range4 = PageRange::until(10); + println!(" PageRange::until(10)"); + println!(" → Range: beginning to page 10"); + println!(" → Use case: Search introduction or preface\n"); + + // Default (all pages) + let _range5 = PageRange::default(); + println!(" PageRange::default()"); + println!(" → Range: all pages"); + println!(" → Use case: No page restriction\n"); + + println!("PageRange methods:"); + println!(" - contains(page): Check if page is in range"); + println!(" - overlaps(start, end): Check if range overlaps"); + println!(" - len(): Get number of pages in range"); + println!(" - is_empty(): Check if range is empty"); +} + +/// Demonstrate configuration options. 
+fn demo_config_options() { + let default_config = PageRangeConfig::default(); + + println!("Default PageRangeConfig:"); + println!(" - range: {:?}", default_config.range); + println!(" - include_boundary_nodes: {}", default_config.include_boundary_nodes); + println!(" - expand_context_pages: {}", default_config.expand_context_pages); + println!(" - min_overlap_ratio: {:.2}", default_config.min_overlap_ratio); + println!(); + + println!("Custom configuration:"); + println!(); + println!("```rust"); + println!("let config = PageRangeConfig::new(PageRange::new(10, 30))"); + println!(" .with_boundary_nodes(true)"); + println!(" .with_context_expansion(2)"); + println!(" .with_min_overlap_ratio(0.3);"); + println!("```"); + println!(); + + println!("Configuration guidelines:"); + println!(" - Strict range: include_boundary_nodes=false, min_overlap_ratio=1.0"); + println!(" - Include context: expand_context_pages=1-3"); + println!(" - Lenient matching: min_overlap_ratio=0.1"); +} + +/// Demonstrate boundary handling. 
+fn demo_boundary_handling() { + println!("Boundary handling example:\n"); + + println!("Scenario: Section spans pages 9-12, query range is 10-15\n"); + + println!(" include_boundary_nodes = false (strict)"); + println!(" → Section (9-12) overlaps with range (10-15)"); + println!(" → Included because overlap exists\n"); + + println!(" include_boundary_nodes = true (lenient)"); + println!(" → Same result, but also includes partial overlaps"); + println!(" → Useful for comprehensive results\n"); + + println!("Overlap calculation:"); + println!(" Section pages: 9-12 (4 pages)"); + println!(" Query range: 10-15 (6 pages)"); + println!(" Overlap: 10-12 (3 pages)"); + println!(" Overlap ratio: 3/4 = 75%\n"); + + println!("min_overlap_ratio threshold:"); + println!(" - 0.1 (10%): Include almost any overlap"); + println!(" - 0.5 (50%): Require significant overlap"); + println!(" - 1.0 (100%): Section must be fully within range"); +} + +/// Demonstrate context expansion. +fn demo_context_expansion() { + println!("Context expansion example:\n"); + + println!("Scenario: Query range is 10-15\n"); + + // Without expansion + println!(" Without expansion (expand_context_pages=0):"); + println!(" → Only pages 10-15 searched"); + println!(" → Might miss related content on pages 9 or 16\n"); + + // With expansion + println!(" With expansion (expand_context_pages=2):"); + println!(" → Effective range: 8-17"); + println!(" → Includes surrounding context for better results\n"); + + println!("When to use context expansion:"); + println!(" ✓ When sections span multiple pages"); + println!(" ✓ When relevant content might be just outside range"); + println!(" ✓ For more comprehensive results\n"); + + println!("When NOT to use context expansion:"); + println!(" ✗ When you need strict page boundaries"); + println!(" ✗ For chapter-specific queries"); + println!(" ✗ When precision is more important than recall"); +} + +/// Demonstrate usage patterns. 
+fn demo_usage_patterns() { + println!("Code example:"); + println!(); + println!("```rust"); + println!("use vectorless::retrieval::{{PageRange, PageRangeConfig, PageRangeStrategy}};"); + println!("use vectorless::retrieval::RetrievalStrategy;"); + println!(); + println!("async fn search_in_chapter(tree: &DocumentTree) {{"); + println!(" // Search only in chapter 3 (pages 45-67)"); + println!(" let range = PageRange::new(45, 67);"); + println!(" let config = PageRangeConfig::new(range)"); + println!(" .with_boundary_nodes(true)"); + println!(" .with_context_expansion(1);"); + println!(); + println!(" let strategy = PageRangeStrategy::new(config);"); + println!(" "); + println!(" // Evaluate nodes within page range"); + println!(" let results = strategy.evaluate_nodes(tree, node_ids, context).await;"); + println!("}}"); + println!("```"); + println!(); + + println!("Common use cases:"); + println!(" 1. Chapter search: PageRange::new(45, 67)"); + println!(" 2. Introduction: PageRange::until(10)"); + println!(" 3. Appendix: PageRange::from(200)"); + println!(" 4. Single page: PageRange::single(42)"); + println!(); + + println!("Best practices:"); + println!(" - Know your document's page structure"); + println!(" - Use context_expansion for flowing content"); + println!(" - Use strict boundaries for discrete sections"); + println!(" - Combine with other strategies (hybrid, keyword)"); +} + +/// Create a sample PDF-like document tree with page numbers. 
+fn create_pdf_like_tree() -> DocumentTree { + let mut tree = DocumentTree::new( + "Sample PDF Document", + "A sample document simulating PDF structure with page numbers.", + ); + + // Introduction (pages 1-5) + let intro = tree.add_child(tree.root(), "Introduction", "Overview of the document."); + tree.set_page_boundaries(intro, 1, 5); + tree.add_child_with_pages(intro, "Background", "Background information.", 1, 2); + tree.add_child_with_pages(intro, "Motivation", "Why this document exists.", 3, 4); + tree.add_child_with_pages(intro, "Scope", "What is covered.", 5, 5); + + // Main Content (pages 6-40) + let main = tree.add_child(tree.root(), "Main Content", "Primary content sections."); + tree.set_page_boundaries(main, 6, 40); + + let chapter1 = tree.add_child_with_pages(main, "Chapter 1", "Getting started.", 6, 15); + tree.add_child_with_pages(chapter1, "Installation", "How to install.", 7, 9); + tree.add_child_with_pages(chapter1, "Configuration", "Configuration options.", 10, 12); + + let chapter2 = tree.add_child_with_pages(main, "Chapter 2", "Core concepts.", 16, 28); + tree.add_child_with_pages(chapter2, "Architecture", "System architecture.", 16, 20); + tree.add_child_with_pages(chapter2, "Data Model", "How data is organized.", 21, 24); + + let chapter3 = tree.add_child_with_pages(main, "Chapter 3", "Advanced usage.", 29, 40); + tree.add_child_with_pages(chapter3, "Custom Strategies", "Implementing custom strategies.", 29, 33); + tree.add_child_with_pages(chapter3, "Performance", "Optimizing performance.", 34, 37); + + // Appendix (pages 41-50) + let appendix = tree.add_child(tree.root(), "Appendix", "Reference materials."); + tree.set_page_boundaries(appendix, 41, 50); + tree.add_child_with_pages(appendix, "API Reference", "Complete API documentation.", 41, 45); + tree.add_child_with_pages(appendix, "Config Reference", "All configuration options.", 46, 48); + + tree +} diff --git a/src/parser/html/config.rs b/src/parser/html/config.rs new file mode 100644 
index 00000000..f3b6c05c --- /dev/null +++ b/src/parser/html/config.rs @@ -0,0 +1,166 @@ +// Copyright (c) 2026 vectorless developers +// SPDX-License-Identifier: Apache-2.0 + +//! Configuration for HTML parsing. + +use serde::{Deserialize, Serialize}; + +/// Configuration for HTML parsing. +#[derive(Debug, Clone, Serialize, Deserialize)] +pub struct HtmlConfig { + /// Default title for nodes without headings. + #[serde(default = "default_title")] + pub default_title: String, + + /// Minimum content length to keep a node. + #[serde(default = "default_min_content_length")] + pub min_content_length: usize, + + /// Whether to include code blocks. + #[serde(default = "default_include_code_blocks")] + pub include_code_blocks: bool, + + /// Whether to merge small consecutive nodes. + #[serde(default = "default_merge_small_nodes")] + pub merge_small_nodes: bool, + + /// Maximum heading level to process (1-6). + #[serde(default = "default_max_heading_level")] + pub max_heading_level: usize, +} + +fn default_title() -> String { + "Introduction".to_string() +} + +fn default_min_content_length() -> usize { + 50 +} + +fn default_include_code_blocks() -> bool { + true +} + +fn default_merge_small_nodes() -> bool { + true +} + +fn default_max_heading_level() -> usize { + 6 +} + +impl Default for HtmlConfig { + fn default() -> Self { + Self { + default_title: default_title(), + min_content_length: default_min_content_length(), + include_code_blocks: default_include_code_blocks(), + merge_small_nodes: default_merge_small_nodes(), + max_heading_level: default_max_heading_level(), + } + } +} + +impl HtmlConfig { + /// Create a new config with default values. + pub fn new() -> Self { + Self::default() + } + + /// Set the default title for nodes without headings. + #[must_use] + pub fn with_default_title(mut self, title: impl Into) -> Self { + self.default_title = title.into(); + self + } + + /// Set minimum content length to keep a node. 
+ #[must_use] + pub fn with_min_content_length(mut self, len: usize) -> Self { + self.min_content_length = len; + self + } + + /// Enable or disable code blocks. + #[must_use] + pub fn with_code_blocks(mut self, include: bool) -> Self { + self.include_code_blocks = include; + self + } + + /// Enable or disable merging of small consecutive nodes. + #[must_use] + pub fn with_merge_small_nodes(mut self, merge: bool) -> Self { + self.merge_small_nodes = merge; + self + } + + /// Set maximum heading level to process (1-6). + #[must_use] + pub fn with_max_heading_level(mut self, level: usize) -> Self { + self.max_heading_level = level.clamp(1, 6); + self + } + + /// Create a config that excludes code blocks. + #[must_use] + pub fn no_code_blocks() -> Self { + Self::new().with_code_blocks(false) + } + + /// Create a config for simple documents (no merging). + #[must_use] + pub fn simple() -> Self { + Self::new() + .with_merge_small_nodes(false) + .with_min_content_length(0) + } +} + +#[cfg(test)] +mod tests { + use super::*; + + #[test] + fn test_default_config() { + let config = HtmlConfig::default(); + assert_eq!(config.default_title, "Introduction"); + assert_eq!(config.min_content_length, 50); + assert!(config.include_code_blocks); + assert!(config.merge_small_nodes); + assert_eq!(config.max_heading_level, 6); + } + + #[test] + fn test_builder_pattern() { + let config = HtmlConfig::new() + .with_default_title("Overview") + .with_min_content_length(100) + .with_code_blocks(false) + .with_max_heading_level(3); + + assert_eq!(config.default_title, "Overview"); + assert_eq!(config.min_content_length, 100); + assert!(!config.include_code_blocks); + assert_eq!(config.max_heading_level, 3); + } + + #[test] + fn test_max_heading_level_clamp() { + let config = HtmlConfig::new().with_max_heading_level(10); + assert_eq!(config.max_heading_level, 6); + + let config = HtmlConfig::new().with_max_heading_level(0); + assert_eq!(config.max_heading_level, 1); + } + + #[test] + fn 
test_preset_configs() { + let config = HtmlConfig::no_code_blocks(); + assert!(!config.include_code_blocks); + + let config = HtmlConfig::simple(); + assert!(!config.merge_small_nodes); + assert_eq!(config.min_content_length, 0); + } +} diff --git a/src/parser/html/mod.rs b/src/parser/html/mod.rs new file mode 100644 index 00000000..b920b4a5 --- /dev/null +++ b/src/parser/html/mod.rs @@ -0,0 +1,45 @@ +// Copyright (c) 2026 vectorless developers +// SPDX-License-Identifier: Apache-2.0 + +//! HTML document parser. +//! +//! This module provides an HTML parser that extracts hierarchical structure +//! from HTML documents using heading tags (`

<h1>`-`<h6>`) as section markers.
//!
//! # Features
//!
//! - Parses HTML5 documents using `scraper`
//! - Extracts heading hierarchy (`<h1>`-`<h6>`)
//! - Extracts content from paragraphs, lists, tables, etc.
//! - Preserves document structure
//!
//! # Example
//!
//! ```rust
//! use vectorless::parser::html::HtmlParser;
//! use vectorless::parser::DocumentParser;
//!
//! # #[tokio::main]
//! # async fn main() -> vectorless::Result<()> {
//! let parser = HtmlParser::new();
//! let html = r#"
//! <html>
//! <body>
//! <h1>Title</h1>
//! <p>Introduction paragraph.</p>
//! <h2>Section 1</h2>
//! <p>Content for section 1.</p>
+//! +//! +//! "#; +//! let result = parser.parse(html).await?; +//! println!("Found {} nodes", result.node_count()); +//! # Ok(()) +//! # } +//! ``` + +mod config; +mod parser; + +pub use config::HtmlConfig; +pub use parser::HtmlParser; diff --git a/src/parser/html/parser.rs b/src/parser/html/parser.rs new file mode 100644 index 00000000..331a2b1b --- /dev/null +++ b/src/parser/html/parser.rs @@ -0,0 +1,540 @@ +// Copyright (c) 2026 vectorless developers +// SPDX-License-Identifier: Apache-2.0 + +//! HTML parser implementation using scraper. + +use async_trait::async_trait; +use scraper::{ElementRef, Html, Selector}; +use std::path::Path; + +use crate::error::Result; +use crate::parser::{DocumentFormat, DocumentMeta, DocumentParser, ParseResult, RawNode}; +use crate::util::estimate_tokens; + +use super::config::HtmlConfig; + +/// Metadata extracted from HTML. +struct HtmlMetadata { + title: String, + description: Option, + author: Option, + keywords: Option, +} + +impl Default for HtmlMetadata { + fn default() -> Self { + Self { + title: String::new(), + description: None, + author: None, + keywords: None, + } + } +} + +/// HTML parser that extracts hierarchical structure from HTML documents. +/// +/// Uses `scraper` for HTML5-compliant parsing. Extracts heading hierarchy +/// and content from various HTML elements. +#[derive(Debug, Clone)] +pub struct HtmlParser { + /// Configuration options. + config: HtmlConfig, +} + +impl Default for HtmlParser { + fn default() -> Self { + Self::new() + } +} + +impl HtmlParser { + /// Create a new HTML parser with default configuration. + #[must_use] + pub fn new() -> Self { + Self::with_config(HtmlConfig::default()) + } + + /// Create a parser with custom configuration. + #[must_use] + pub fn with_config(config: HtmlConfig) -> Self { + Self { config } + } + + /// Parse HTML content and extract nodes. 
+ fn extract_nodes(&self, content: &str) -> (Vec, HtmlMetadata) { + let document = Html::parse_document(content); + + // Extract metadata from + let metadata = self.extract_metadata(&document); + + // Extract nodes from + let nodes = self.extract_nodes_from_document(&document); + + (nodes, metadata) + } + + /// Extract metadata from the document head. + fn extract_metadata(&self, document: &Html) -> HtmlMetadata { + let mut meta = HtmlMetadata::default(); + + // Extract title + if let Ok(selector) = Selector::parse("title") { + if let Some(title_elem) = document.select(&selector).next() { + meta.title = title_elem.text().collect::(); + } + } + + // Extract meta description + if let Ok(selector) = Selector::parse("meta[name=\"description\"]") { + if let Some(desc_elem) = document.select(&selector).next() { + if let Some(content) = desc_elem.value().attr("content") { + meta.description = Some(content.to_string()); + } + } + } + + // Extract meta author + if let Ok(selector) = Selector::parse("meta[name=\"author\"]") { + if let Some(author_elem) = document.select(&selector).next() { + if let Some(content) = author_elem.value().attr("content") { + meta.author = Some(content.to_string()); + } + } + } + + // Extract meta keywords + if let Ok(selector) = Selector::parse("meta[name=\"keywords\"]") { + if let Some(keywords_elem) = document.select(&selector).next() { + if let Some(content) = keywords_elem.value().attr("content") { + meta.keywords = Some(content.to_string()); + } + } + } + + // Also try Open Graph description + if meta.description.is_none() { + if let Ok(selector) = Selector::parse("meta[property=\"og:description\"]") { + if let Some(og_elem) = document.select(&selector).next() { + if let Some(content) = og_elem.value().attr("content") { + meta.description = Some(content.to_string()); + } + } + } + } + + meta + } + + /// Extract nodes from the document. 
+ fn extract_nodes_from_document(&self, document: &Html) -> Vec { + let mut nodes = Vec::new(); + + // Parse body selector + let body_selector = match Selector::parse("body") { + Ok(s) => s, + Err(_) => return nodes, + }; + + let body = match document.select(&body_selector).next() { + Some(b) => b, + None => return nodes, + }; + + // Collect all headings in order + let heading_selector = Selector::parse("h1, h2, h3, h4, h5, h6").unwrap(); + + let mut headings: Vec<(usize, String, usize)> = Vec::new(); // (index, title, level) + + for (idx, heading) in body.select(&heading_selector).enumerate() { + let level = self.get_heading_level(heading.value().name()); + if let Some(lvl) = level { + if lvl <= self.config.max_heading_level { + let title: String = heading.text().collect(); + if !title.trim().is_empty() { + headings.push((idx, title.trim().to_string(), lvl)); + } + } + } + } + + // If no headings found, try to extract content anyway + if headings.is_empty() { + let content = self.extract_body_content(body); + if !content.trim().is_empty() { + nodes.push(RawNode { + title: self.config.default_title.clone(), + content: content.trim().to_string(), + level: 0, + line_start: 1, + line_end: 1, + page: None, + token_count: Some(estimate_tokens(&content)), + total_token_count: None, + }); + } + return nodes; + } + + // Extract content between headings + for (i, (_, title, level)) in headings.iter().enumerate() { + let content = self.extract_content_after_heading(body, &headings, i); + + if !title.is_empty() || !content.trim().is_empty() { + nodes.push(RawNode { + title: title.clone(), + content: content.trim().to_string(), + level: *level, + line_start: 1, + line_end: 1, + page: None, + token_count: Some(estimate_tokens(&content)), + total_token_count: None, + }); + } + } + + // Post-process nodes + self.finalize_nodes(nodes) + } + + /// Get heading level from tag name (h1-h6). 
+ fn get_heading_level(&self, tag: &str) -> Option { + match tag { + "h1" => Some(1), + "h2" => Some(2), + "h3" => Some(3), + "h4" => Some(4), + "h5" => Some(5), + "h6" => Some(6), + _ => None, + } + } + + /// Extract body content (for documents without headings). + fn extract_body_content(&self, body: ElementRef) -> String { + let mut content = String::new(); + + // Extract paragraphs + if let Ok(selector) = Selector::parse("p") { + for p in body.select(&selector) { + let text: String = p.text().collect(); + if !text.trim().is_empty() { + if !content.is_empty() { + content.push_str("\n\n"); + } + content.push_str(text.trim()); + } + } + } + + content + } + + /// Extract content after a heading until the next heading. + fn extract_content_after_heading( + &self, + body: ElementRef, + headings: &[(usize, String, usize)], + heading_index: usize, + ) -> String { + let mut content = String::new(); + + // Get all content elements + let content_selector = Selector::parse("p, ul, ol, table, pre, blockquote, div.content, article, section") + .unwrap(); + + // This is a simplified approach - extract content from sibling elements + // In a more sophisticated implementation, we would track DOM positions + for elem in body.select(&content_selector) { + let text = self.extract_element_content(elem); + if !text.is_empty() { + if !content.is_empty() { + content.push_str("\n\n"); + } + content.push_str(&text); + } + } + + content + } + + /// Extract content from a single element. 
+ fn extract_element_content(&self, elem: ElementRef) -> String { + let tag = elem.value().name(); + + match tag { + "p" | "div" | "article" | "section" => { + let text: String = elem.text().collect(); + text.trim().to_string() + } + "ul" => self.extract_list(elem, false), + "ol" => self.extract_list(elem, true), + "table" => self.extract_table(elem), + "pre" | "code" if self.config.include_code_blocks => { + let text: String = elem.text().collect(); + if !text.trim().is_empty() { + format!("```\n{}\n```", text.trim()) + } else { + String::new() + } + } + "blockquote" => { + let text: String = elem.text().collect(); + if !text.trim().is_empty() { + text + .lines() + .map(|line| format!("> {}", line)) + .collect::>() + .join("\n") + } else { + String::new() + } + } + _ => String::new(), + } + } + + /// Extract list content. + fn extract_list(&self, element: ElementRef, ordered: bool) -> String { + let mut result = String::new(); + let li_selector = Selector::parse("li").unwrap(); + let mut counter = 1; + + for li in element.select(&li_selector) { + let text: String = li.text().collect(); + if !text.trim().is_empty() { + if !result.is_empty() { + result.push('\n'); + } + if ordered { + result.push_str(&format!("{}. {}", counter, text.trim())); + counter += 1; + } else { + result.push_str(&format!("• {}", text.trim())); + } + } + } + + result + } + + /// Extract table content. + fn extract_table(&self, element: ElementRef) -> String { + let mut result = String::new(); + let tr_selector = Selector::parse("tr").unwrap(); + + for tr in element.select(&tr_selector) { + let mut cells = Vec::new(); + let td_selector = Selector::parse("td, th").unwrap(); + + for cell in tr.select(&td_selector) { + let text: String = cell.text().collect(); + cells.push(text.trim().to_string()); + } + + if !cells.is_empty() { + if !result.is_empty() { + result.push('\n'); + } + result.push_str(&cells.join(" | ")); + } + } + + result + } + + /// Finalize nodes after extraction. 
+ fn finalize_nodes(&self, mut nodes: Vec) -> Vec { + // Remove empty nodes + nodes.retain(|n| !n.title.is_empty() || !n.content.trim().is_empty()); + + // Merge small consecutive nodes if configured + if self.config.merge_small_nodes { + nodes = self.merge_small_nodes(nodes); + } + + nodes + } + + /// Merge small consecutive nodes. + fn merge_small_nodes(&self, nodes: Vec) -> Vec { + let mut result: Vec = Vec::new(); + + for node in nodes { + if let Some(last) = result.last_mut() { + // Merge if same level and content is small + if last.level == node.level && last.content.len() < self.config.min_content_length + { + if !last.content.is_empty() { + last.content.push_str("\n\n"); + } + last.content.push_str(&node.content); + continue; + } + } + result.push(node); + } + + result + } +} + +#[async_trait] +impl DocumentParser for HtmlParser { + fn format(&self) -> DocumentFormat { + DocumentFormat::Html + } + + async fn parse(&self, content: &str) -> Result { + let line_count = content.lines().count(); + let (nodes, html_meta) = self.extract_nodes(content); + + let meta = DocumentMeta { + name: html_meta.title, + format: DocumentFormat::Html, + page_count: None, + line_count, + source_path: None, + description: html_meta.description, + }; + + Ok(ParseResult::new(meta, nodes)) + } + + async fn parse_file(&self, path: &Path) -> Result { + let content = tokio::fs::read_to_string(path) + .await + .map_err(|e| crate::Error::Parse(format!("Failed to read file: {}", e)))?; + + let mut result = self.parse(&content).await?; + + // Extract document name from filename (if not set by meta) + if result.meta.name.is_empty() { + if let Some(stem) = path.file_stem() { + result.meta.name = stem.to_string_lossy().to_string(); + } + } + result.meta.source_path = Some(path.to_string_lossy().to_string()); + + Ok(result) + } +} + +#[cfg(test)] +mod tests { + use super::*; + + #[tokio::test] + async fn test_parse_simple_html() { + let parser = HtmlParser::new(); + let html = r#" + Test 
Document + +

Main Title

+

This is a paragraph.

+

Section 1

+

Section content.

+ + "#; + + let result = parser.parse(html).await.unwrap(); + + assert_eq!(result.meta.name, "Test Document"); + assert!(!result.nodes.is_empty()); + } + + #[tokio::test] + async fn test_parse_headings() { + let parser = HtmlParser::new(); + let html = r#" +

H1 Title

+

Content 1

+

H2 Title

+

Content 2

+

H3 Title

+

Content 3

+ "#; + + let result = parser.parse(html).await.unwrap(); + + let heading_nodes: Vec<_> = result.nodes.iter().filter(|n| n.level > 0).collect(); + assert!(heading_nodes.len() >= 3); + } + + #[tokio::test] + async fn test_parse_metadata() { + let parser = HtmlParser::new(); + let html = r#" + + My Page + + + +

Content

+ "#; + + let result = parser.parse(html).await.unwrap(); + + assert_eq!(result.meta.name, "My Page"); + assert_eq!(result.meta.description, Some("A test page".to_string())); + } + + #[tokio::test] + async fn test_parse_list() { + let parser = HtmlParser::new(); + let html = r#" +

List Example

+
    +
  • Item 1
  • +
  • Item 2
  • +
  • Item 3
  • +
+ "#; + + let result = parser.parse(html).await.unwrap(); + + let list_node = result.nodes.iter().find(|n| n.title == "List Example"); + assert!(list_node.is_some()); + } + + #[tokio::test] + async fn test_parse_table() { + let parser = HtmlParser::new(); + let html = r#" +

Table Example

+ + + +
NameAge
Alice30
+ "#; + + let result = parser.parse(html).await.unwrap(); + + let table_node = result.nodes.iter().find(|n| n.title == "Table Example"); + assert!(table_node.is_some()); + } + + #[tokio::test] + async fn test_empty_document() { + let parser = HtmlParser::new(); + let result = parser.parse("").await.unwrap(); + + assert!(result.nodes.is_empty()); + } + + #[tokio::test] + async fn test_no_headings() { + let parser = HtmlParser::new(); + let html = r#" +

Just some text.

+

More text.

+ "#; + + let result = parser.parse(html).await.unwrap(); + + // Should create a default node + assert_eq!(result.nodes.len(), 1); + assert_eq!(result.nodes[0].title, "Introduction"); + } +} diff --git a/src/parser/mod.rs b/src/parser/mod.rs index 4442f25c..7bb952d7 100644 --- a/src/parser/mod.rs +++ b/src/parser/mod.rs @@ -12,7 +12,7 @@ //! - **Markdown** - Full support via [`MarkdownParser`] //! - **PDF** - Full support via [`PdfParser`] with TOC extraction //! - **DOCX** - Full support via [`DocxParser`] with heading detection -//! - **HTML** - Planned (placeholder) +//! - **HTML** - Full support via [`HtmlParser`] with heading hierarchy //! //! # Example //! @@ -46,6 +46,9 @@ pub mod markdown; // PDF parsing module pub mod pdf; +// HTML parsing module +pub mod html; + // TOC processing module pub mod toc; @@ -63,5 +66,6 @@ pub use registry::{ParserRegistry, get_parser, get_parser_for_file, parse_conten // Re-export concrete parsers pub use docx::DocxParser; +pub use html::{HtmlConfig, HtmlParser}; pub use markdown::{MarkdownConfig, MarkdownParser}; pub use pdf::PdfParser; diff --git a/src/parser/registry.rs b/src/parser/registry.rs index e59863a3..ef1cf416 100644 --- a/src/parser/registry.rs +++ b/src/parser/registry.rs @@ -13,7 +13,9 @@ use std::sync::{Arc, RwLock}; use crate::Error; use crate::error::Result; -use crate::parser::{DocumentFormat, DocumentParser, MarkdownParser, ParseResult, PdfParser}; +use crate::parser::{ + DocumentFormat, DocumentParser, HtmlParser, MarkdownParser, ParseResult, PdfParser, +}; /// Type alias for parser factory functions. type ParserFactory = Box Box + Send + Sync>; @@ -63,10 +65,12 @@ impl ParserRegistry { registry } - /// Register default parsers (Markdown, PDF). + /// Register default parsers (Markdown, PDF, HTML, DOCX). 
pub fn register_defaults(&self) { self.register("markdown", || Box::new(MarkdownParser::new())); self.register("pdf", || Box::new(PdfParser::new())); + self.register("html", || Box::new(HtmlParser::new())); + self.register("docx", || Box::new(super::docx::DocxParser::new())); } /// Register a parser factory by name. @@ -182,7 +186,7 @@ pub fn get_parser(format: DocumentFormat) -> Option> { match format { DocumentFormat::Markdown => Some(Box::new(MarkdownParser::new())), DocumentFormat::Pdf => Some(Box::new(PdfParser::new())), - DocumentFormat::Html => None, // TODO: Implement HTML parser + DocumentFormat::Html => Some(Box::new(HtmlParser::new())), DocumentFormat::Docx => Some(Box::new(super::docx::DocxParser::new())), DocumentFormat::Text => None, // TODO: Implement plain text parser } @@ -243,6 +247,7 @@ mod tests { let registry = ParserRegistry::with_defaults(); let formats = registry.supported_formats(); assert!(formats.contains(&DocumentFormat::Markdown)); + assert!(formats.contains(&DocumentFormat::Html)); } #[test] @@ -267,6 +272,14 @@ mod tests { assert!(parser.is_some()); } + #[test] + fn test_html_parser_registered() { + let registry = ParserRegistry::with_defaults(); + assert!(registry.supports(DocumentFormat::Html)); + let parser = registry.get(DocumentFormat::Html); + assert!(parser.is_some()); + } + #[test] fn test_get_parser_function() { let parser = get_parser(DocumentFormat::Markdown); @@ -278,4 +291,10 @@ mod tests { let parser = get_parser_for_file(Path::new("test.md")); assert!(parser.is_some()); } + + #[test] + fn test_get_html_parser_for_file() { + let parser = get_parser_for_file(Path::new("test.html")); + assert!(parser.is_some()); + } }