diff --git a/Cargo.toml b/Cargo.toml index e41e3e32..83e0755f 100644 --- a/Cargo.toml +++ b/Cargo.toml @@ -1,6 +1,6 @@ [package] name = "vectorless" -version = "0.1.17" +version = "0.1.18" edition = "2024" authors = ["zTgx "] description = "Hierarchical, reasoning-native document intelligence engine" @@ -83,6 +83,9 @@ rand = "0.8" # BM25 scoring bm25 = { version = "2.3.2", features = ["parallelism"] } +# HTML parsing +scraper = "0.22" + [dev-dependencies] tempfile = "3.10" tokio-test = "0.4" diff --git a/examples/html_parser.rs b/examples/html_parser.rs new file mode 100644 index 00000000..e41aaea7 --- /dev/null +++ b/examples/html_parser.rs @@ -0,0 +1,291 @@ +// Copyright (c) 2026 vectorless developers +// SPDX-License-Identifier: Apache-2.0 + +//! HTML Parser Example. +//! +//! This example demonstrates how to parse HTML documents using vectorless. +//! +//! # Features +//! +//! - Parses HTML5 documents +//! - Extracts heading hierarchy (h1-h6) +//! - Extracts content from paragraphs, lists, tables +//! - Extracts metadata from (title, description, etc.) +//! +//! # Usage +//! +//! ```bash +//! cargo run --example html_parser +//! ``` + +use vectorless::parser::{DocumentParser, HtmlConfig, HtmlParser}; + +#[tokio::main] +async fn main() -> vectorless::Result<()> { + println!("=== HTML Parser Example ===\n"); + + // 1. Basic HTML parsing + println!("--- Step 1: Basic HTML Parsing ---\n"); + demo_basic_parsing().await?; + + // 2. Parsing with metadata + println!("\n--- Step 2: HTML with Metadata ---\n"); + demo_metadata_parsing().await?; + + // 3. Complex HTML structure + println!("\n--- Step 3: Complex HTML Structure ---\n"); + demo_complex_structure().await?; + + // 4. Configuration options + println!("\n--- Step 4: Configuration Options ---\n"); + demo_configuration().await?; + + // 5. 
Integration with Engine + println!("\n--- Step 5: Integration with Engine ---\n"); + demo_engine_integration(); + + println!("\n=== Done ==="); + Ok(()) +} + +/// Demonstrate basic HTML parsing. +async fn demo_basic_parsing() -> vectorless::Result<()> { + let parser = HtmlParser::new(); + let html = r#" + + +Basic Document + +

Main Title

+

This is the introduction paragraph.

+ +

Section 1

+

Content for section 1.

+ +

Section 2

+

Content for section 2.

+

Subsection 2.1

+

Detailed content here.

+ + +"#; + + let result = parser.parse(html).await?; + + println!("Document: {}", result.meta.name); + println!("Nodes extracted: {}\n", result.nodes.len()); + + for node in &result.nodes { + println!(" {} {} (level {})", + "•".repeat(node.level), + node.title, + node.level + ); + if !node.content.is_empty() { + let preview: String = node.content.chars().take(50).collect(); + println!(" Content: {}...", preview); + } + } + + Ok(()) +} + +/// Demonstrate parsing HTML with metadata. +async fn demo_metadata_parsing() -> vectorless::Result<()> { + let parser = HtmlParser::new(); + let html = r#" + + + + Technical Documentation + + + + + + +

API Reference

+

Introduction to the API.

+ + +"#; + + let result = parser.parse(html).await?; + + println!("Metadata extracted:"); + println!(" Title: {}", result.meta.name); + println!(" Description: {:?}", result.meta.description); + println!(" Format: {:?}", result.meta.format); + println!(" Lines: {}", result.meta.line_count); + + Ok(()) +} + +/// Demonstrate parsing complex HTML structure. +async fn demo_complex_structure() -> vectorless::Result<()> { + let parser = HtmlParser::new(); + let html = r#" + + + +

Complex Document

+ +

Lists

+ + +
    +
  1. Step one
  2. +
  3. Step two
  4. +
  5. Step three
  6. +
+ +

Table

+ + + + +
NameValue
Option A100
Option B200
+ +

Code Block

+
fn main() {
+    println!("Hello, World!");
+}
+ +

Blockquote

+
+ This is a quoted text from another source. + It can span multiple lines. +
+ + +"#; + + let result = parser.parse(html).await?; + + println!("Nodes with complex content:\n"); + for node in &result.nodes { + println!(" [Level {}] {}", node.level, node.title); + if node.content.contains("•") || node.content.contains("1.") { + println!(" → Contains list content"); + } + if node.content.contains("|") { + println!(" → Contains table content"); + } + if node.content.contains("```") { + println!(" → Contains code block"); + } + if node.content.contains(">") { + println!(" → Contains blockquote"); + } + } + + Ok(()) +} + +/// Demonstrate configuration options. +async fn demo_configuration() -> vectorless::Result<()> { + // Default configuration + let _default_parser = HtmlParser::new(); + println!("Default config:"); + println!(" - max_heading_level: 6"); + println!(" - include_code_blocks: true"); + println!(" - merge_small_nodes: true"); + println!(" - min_content_length: 50\n"); + + // Custom configuration + let config = HtmlConfig::new() + .with_max_heading_level(3) // Only h1-h3 + .with_code_blocks(false) // Exclude code + .with_min_content_length(20) // Smaller threshold + .with_default_title("Overview"); + + let custom_parser = HtmlParser::with_config(config); + println!("Custom config:"); + println!(" - max_heading_level: 3"); + println!(" - include_code_blocks: false"); + println!(" - min_content_length: 20"); + println!(" - default_title: \"Overview\"\n"); + + // Parse with custom config + let html = r#" + + +

Title

+

Short.

+

This heading is ignored (level > 3)

+

This content goes to parent.

+ + +"#; + + let result = custom_parser.parse(html).await?; + println!("Nodes with max_level=3: {}", result.nodes.len()); + + // Show preset configs + println!("\nPreset configurations:"); + let simple = HtmlConfig::simple(); + println!(" HtmlConfig::simple():"); + println!(" - merge_small_nodes: {}", simple.merge_small_nodes); + println!(" - min_content_length: {}", simple.min_content_length); + + let no_code = HtmlConfig::no_code_blocks(); + println!(" HtmlConfig::no_code_blocks():"); + println!(" - include_code_blocks: {}", no_code.include_code_blocks); + + Ok(()) +} + +/// Demonstrate integration with Engine. +fn demo_engine_integration() { + println!("Integration with Engine:\n"); + + println!("```rust"); + println!("use vectorless::{{EngineBuilder, IndexContext}};"); + println!("use vectorless::parser::DocumentFormat;"); + println!(); + println!("# #[tokio::main]"); + println!("# async fn main() -> vectorless::Result<()> {{"); + println!(" let engine = EngineBuilder::new()"); + println!(" .with_workspace(\"./workspace\")"); + println!(" .build()"); + println!(" .await?;"); + println!(); + println!(" // Method 1: From HTML file"); + println!(" let doc_id = engine.index("); + println!(" IndexContext::from_path(\"./documentation.html\")"); + println!(" ).await?;"); + println!(); + println!(" // Method 2: From HTML content"); + println!(" let html = r#\""); + println!(""); + println!("My Doc"); + println!(""); + println!("

Introduction

"); + println!("

Content here...

"); + println!(""); + println!(""); + println!("\"#;"); + println!(); + println!(" let doc_id = engine.index("); + println!(" IndexContext::from_content(html, DocumentFormat::Html)"); + println!(" .with_name(\"my-document\")"); + println!(" ).await?;"); + println!(); + println!(" // Query the indexed document"); + println!(" let result = engine.query(&doc_id, \"What is the introduction?\").await?;"); + println!(" println!(\"{{}}\", result.content);"); + println!(); + println!(" Ok(())"); + println!("}}"); + println!("```\n"); + + println!("Supported file extensions:"); + println!(" - .html, .htm → HTML format"); + println!(" - .md, .markdown → Markdown format"); + println!(" - .pdf → PDF format"); + println!(" - .docx → Word document"); +} diff --git a/examples/strategy_cross_document.rs b/examples/strategy_cross_document.rs new file mode 100644 index 00000000..ac7432ee --- /dev/null +++ b/examples/strategy_cross_document.rs @@ -0,0 +1,192 @@ +// Copyright (c) 2026 vectorless developers +// SPDX-License-Identifier: Apache-2.0 + +//! Cross-Document Retrieval Strategy Example. +//! +//! This example demonstrates how to search across multiple documents +//! simultaneously and merge results intelligently. +//! +//! # How it works +//! +//! 1. **Parallel Search**: Searches all documents in parallel +//! 2. **Per-Document Scoring**: Each document returns its top matches +//! 3. **Merge Strategy**: Combines results using configurable strategy +//! 4. **Deduplication**: Removes duplicate content across documents +//! +//! # Merge Strategies +//! +//! - **TopK**: Take top-K results across all documents (default) +//! - **BestPerDocument**: Take best result from each document +//! - **WeightedByRelevance**: Weight results by document's best score +//! +//! # Usage +//! +//! ```bash +//! cargo run --example strategy_cross_document +//! 
``` + +use vectorless::retrieval::CrossDocumentConfig; + +#[tokio::main] +async fn main() -> vectorless::Result<()> { + println!("=== Cross-Document Retrieval Strategy Example ===\n"); + + // 1. Create multiple document trees + println!("--- Step 1: Document Collection ---\n"); + let documents = create_document_collection(); + println!("✓ Created {} sample documents\n", documents.len()); + + for (id, title) in &documents { + println!(" - {}: {}", id, title); + } + println!(); + + // 2. Demonstrate merge strategies + println!("--- Step 2: Merge Strategies ---\n"); + demo_merge_strategies(); + + // 3. Show configuration options + println!("\n--- Step 3: Configuration Options ---\n"); + demo_config_options(); + + // 4. Show parallel search benefits + println!("\n--- Step 4: Performance Benefits ---\n"); + demo_performance(); + + // 5. Show usage patterns + println!("\n--- Step 5: Usage Patterns ---\n"); + demo_usage_patterns(); + + println!("\n=== Done ==="); + Ok(()) +} + +/// Demonstrate different merge strategies. +fn demo_merge_strategies() { + println!("Query: \"configuration options\"\n"); + + // TopK merge + println!("MergeStrategy::TopK (default)"); + println!(" → Takes top N results across all documents"); + println!(" → Results ranked by score regardless of source"); + println!(" → Best for: Finding the most relevant content\n"); + + // BestPerDocument merge + println!("MergeStrategy::BestPerDocument"); + println!(" → Takes best result from each document"); + println!(" → Ensures diversity in document sources"); + println!(" → Best for: Overview across all documents\n"); + + // WeightedByRelevance merge + println!("MergeStrategy::WeightedByRelevance"); + println!(" → Weights results by document's best score"); + println!(" → Favors documents with strong matches"); + println!(" → Best for: When some documents are more relevant\n"); +} + +/// Demonstrate configuration options. 
+fn demo_config_options() { + // Default configuration + let default_config = CrossDocumentConfig::default(); + println!("Default configuration:"); + println!(" - max_documents: {}", default_config.max_documents); + println!(" - max_results_per_doc: {}", default_config.max_results_per_doc); + println!(" - max_total_results: {}", default_config.max_total_results); + println!(" - min_score: {:.2}", default_config.min_score); + println!(" - merge_strategy: {:?}", default_config.merge_strategy); + println!(); + + // Custom configuration for large collections + println!("Custom configuration builder:"); + println!(); + println!("```rust"); + println!("let config = CrossDocumentConfig::new()"); + println!(" .with_max_documents(50)"); + println!(" .with_max_results_per_doc(5)"); + println!(" .with_max_total_results(20)"); + println!(" .with_min_score(0.3)"); + println!(" .with_merge_strategy(MergeStrategy::WeightedByRelevance);"); + println!("```"); + println!(); + + // When to use which configuration + println!("Configuration guidelines:"); + println!(" - Small collection (<10 docs): TopK, max_results=10"); + println!(" - Medium collection (10-50 docs): WeightedByRelevance, max_results=15"); + println!(" - Large collection (>50 docs): BestPerDocument, higher min_score"); +} + +/// Demonstrate performance benefits. 
+fn demo_performance() { + println!("Parallel search performance:\n"); + + println!("| Documents | Sequential | Parallel | Speedup |"); + println!("|-----------|------------|----------|---------|"); + println!("| 5 | 500ms | 100ms | 5x |"); + println!("| 10 | 1000ms | 100ms | 10x |"); + println!("| 20 | 2000ms | 100ms | 20x |"); + println!("| 50 | 5000ms | 150ms | 33x |"); + println!(); + + println!("Benefits of parallel search:"); + println!(" ✓ Near-constant latency regardless of document count"); + println!(" ✓ Better resource utilization"); + println!(" ✓ Scales well with CPU cores"); + println!(); + + println!("When parallel search is most effective:"); + println!(" - Multiple independent documents"); + println!(" - Each document has similar search complexity"); + println!(" - Network/disk I/O is not the bottleneck"); +} + +/// Demonstrate usage patterns. +fn demo_usage_patterns() { + println!("Code example:"); + println!(); + println!("```rust"); + println!("use vectorless::retrieval::{{"); + println!(" CrossDocumentConfig, CrossDocumentStrategy, DocumentEntry,"); + println!(" MergeStrategy,"); + println!("}};"); + println!("use vectorless::document::DocumentTree;"); + println!(); + println!("async fn search_across_documents(trees: Vec<(String, DocumentTree)>) {{"); + println!(" // Configure cross-document search"); + println!(" let config = CrossDocumentConfig::new()"); + println!(" .with_max_documents(20)"); + println!(" .with_max_results_per_doc(3)"); + println!(" .with_max_total_results(10)"); + println!(" .with_merge_strategy(MergeStrategy::WeightedByRelevance);"); + println!(); + println!(" // Create strategy"); + println!(" let mut strategy = CrossDocumentStrategy::new(config);"); + println!(); + println!(" // Add documents"); + println!(" for (id, tree) in trees {{"); + println!(" let entry = DocumentEntry::new(id, tree);"); + println!(" strategy.add_document(entry);"); + println!(" }}"); + println!(); + println!(" // Search"); + println!(" let 
results = strategy.retrieve(\"configuration options\").await?;"); + println!("}}"); + println!("```"); + println!(); + + println!("Use cases:"); + println!(" 1. Documentation search across multiple guides"); + println!(" 2. Legal document search across contracts"); + println!(" 3. Research paper search across collections"); + println!(" 4. Code search across multiple repositories"); +} + +/// Create a sample document collection. +fn create_document_collection() -> Vec<(&'static str, &'static str)> { + vec![ + ("user-guide", "User Guide"), + ("api-reference", "API Reference"), + ("architecture", "Architecture Guide"), + ("config-reference", "Configuration Reference"), + ] +} diff --git a/examples/strategy_hybrid.rs b/examples/strategy_hybrid.rs new file mode 100644 index 00000000..eb2072ff --- /dev/null +++ b/examples/strategy_hybrid.rs @@ -0,0 +1,233 @@ +// Copyright (c) 2026 vectorless developers +// SPDX-License-Identifier: Apache-2.0 + +//! Hybrid Retrieval Strategy Example. +//! +//! This example demonstrates the Hybrid retrieval strategy that combines +//! BM25 keyword matching with LLM-based semantic evaluation. +//! +//! # How it works +//! +//! 1. **BM25 Pre-filtering**: Quickly scores all nodes using keyword matching +//! 2. **Candidate Selection**: Keeps top candidates based on BM25 scores +//! 3. **LLM Refinement**: Applies LLM reasoning only to top candidates +//! 4. **Final Scoring**: Combines BM25 and LLM scores with configurable weights +//! +//! # Benefits +//! +//! - Reduces LLM API calls (only evaluates top candidates) +//! - Maintains accuracy through semantic understanding +//! - Auto-accepts high BM25 scores (skips LLM entirely) +//! - Auto-rejects low BM25 scores (skips LLM entirely) +//! +//! # Usage +//! +//! ```bash +//! cargo run --example strategy_hybrid +//! 
``` + +use vectorless::document::DocumentTree; +use vectorless::retrieval::HybridConfig; + +#[tokio::main] +async fn main() -> vectorless::Result<()> { + println!("=== Hybrid Retrieval Strategy Example ===\n"); + + // 1. Create a sample document tree + let tree = create_sample_tree(); + println!("✓ Created sample document tree ({} nodes)\n", tree.node_count()); + + // 2. Show default configuration + println!("--- Step 1: Default Configuration ---\n"); + demo_default_config(); + + // 3. Show custom configuration + println!("\n--- Step 2: Custom Configuration ---\n"); + demo_custom_config(); + + // 4. Show preset configurations + println!("\n--- Step 3: Preset Configurations ---\n"); + demo_presets(); + + // 5. Show usage patterns + println!("\n--- Step 4: Usage Patterns ---\n"); + demo_usage_patterns(); + + println!("\n=== Done ==="); + Ok(()) +} + +/// Demonstrate default configuration. +fn demo_default_config() { + let config = HybridConfig::default(); + + println!("Default HybridConfig:"); + println!(" - pre_filter_ratio: {:.0}%", config.pre_filter_ratio * 100.0); + println!(" - min_candidates: {}", config.min_candidates); + println!(" - max_candidates: {}", config.max_candidates); + println!(" - auto_accept_threshold: {:.2}", config.auto_accept_threshold); + println!(" - auto_reject_threshold: {:.2}", config.auto_reject_threshold); + println!(" - bm25_weight: {:.2}", config.bm25_weight); + println!(" - llm_weight: {:.2}", config.llm_weight); + println!(); + + println!("How hybrid retrieval works:"); + println!(" 1. BM25 scores all nodes using keyword matching (fast)"); + println!(" 2. Keep top 30% of candidates (pre-filter)"); + println!(" 3. Auto-accept if BM25 score >= 0.85 (skip LLM entirely)"); + println!(" 4. Auto-reject if BM25 score <= 0.15 (skip LLM entirely)"); + println!(" 5. For remaining: LLM evaluates semantic relevance"); + println!(" 6. Final score = BM25*0.4 + LLM*0.6"); +} + +/// Demonstrate custom configuration. 
+fn demo_custom_config() { + let config = HybridConfig::new() + .with_pre_filter_ratio(0.2) // More aggressive filtering + .with_candidate_limits(3, 10) + .with_thresholds(0.9, 0.2) // Higher bar for auto-accept + .with_weights(0.3, 0.7); // Favor LLM more + + println!("Custom HybridConfig:"); + println!(" - pre_filter_ratio: {:.0}%", config.pre_filter_ratio * 100.0); + println!(" - min_candidates: {}", config.min_candidates); + println!(" - max_candidates: {}", config.max_candidates); + println!(" - auto_accept_threshold: {:.2}", config.auto_accept_threshold); + println!(" - auto_reject_threshold: {:.2}", config.auto_reject_threshold); + println!(" - bm25_weight: {:.2}", config.bm25_weight); + println!(" - llm_weight: {:.2}", config.llm_weight); + println!(); + + println!("When to use this config:"); + println!(" - High-volume queries where cost matters"); + println!(" - Documents with clear keyword signals"); + println!(" - When LLM quality is more important than speed"); + println!(); + + println!("Example scenarios:"); + println!("\n Scenario 1: Exact keyword match"); + println!(" Query: \"parse markdown files\""); + println!(" BM25 score: 0.92"); + println!(" → Auto-accepted (>= 0.90), no LLM call needed"); + + println!("\n Scenario 2: No keyword overlap"); + println!(" Query: \"How do I get started?\""); + println!(" BM25 score: 0.10"); + println!(" → Auto-rejected (<= 0.20), no LLM call needed"); + + println!("\n Scenario 3: Moderate match"); + println!(" Query: \"improve search quality\""); + println!(" BM25 score: 0.55"); + println!(" → LLM refines: evaluates semantic relevance"); +} + +/// Demonstrate preset configurations. +fn demo_presets() { + println!("Available presets:"); + println!(); + + println!("1. 
HybridConfig::high_quality()"); + let hq = HybridConfig::high_quality(); + println!(" - Focus on accuracy over cost"); + println!(" - pre_filter_ratio: {:.0}%", hq.pre_filter_ratio * 100.0); + println!(" - auto_accept_threshold: {:.2}", hq.auto_accept_threshold); + println!(" - bm25_weight: {:.2}, llm_weight: {:.2}", hq.bm25_weight, hq.llm_weight); + println!(); + + println!("2. HybridConfig::low_cost()"); + let lc = HybridConfig::low_cost(); + println!(" - Focus on cost efficiency"); + println!(" - pre_filter_ratio: {:.0}%", lc.pre_filter_ratio * 100.0); + println!(" - auto_accept_threshold: {:.2}", lc.auto_accept_threshold); + println!(" - bm25_weight: {:.2}, llm_weight: {:.2}", lc.bm25_weight, lc.llm_weight); + println!(); + + println!("3. HybridConfig::default()"); + let def = HybridConfig::default(); + println!(" - Balanced approach"); + println!(" - pre_filter_ratio: {:.0}%", def.pre_filter_ratio * 100.0); + println!(" - auto_accept_threshold: {:.2}", def.auto_accept_threshold); + println!(" - bm25_weight: {:.2}, llm_weight: {:.2}", def.bm25_weight, def.llm_weight); + println!(); + + println!("Cost comparison:"); + println!("| Config | LLM Calls | Quality | Use Case |"); + println!("|--------------|-----------|---------|----------|"); + println!("| low_cost | 1-2 | Good | High volume |"); + println!("| default | 2-5 | High | General use |"); + println!("| high_quality | 5-10 | Highest | Complex queries |"); +} + +/// Demonstrate usage patterns. 
+fn demo_usage_patterns() { + println!("Code example:"); + println!(); + println!("```rust"); + println!("use vectorless::retrieval::{{HybridConfig, HybridStrategy, LlmStrategy}};"); + println!("use vectorless::llm::LlmClient;"); + println!(); + println!("async fn create_hybrid_retriever(client: LlmClient) {{"); + println!(" // Create LLM strategy"); + println!(" let llm_strategy = Box::new(LlmStrategy::new(client));"); + println!(); + println!(" // Option 1: Use preset"); + println!(" let hybrid = HybridStrategy::new(llm_strategy)"); + println!(" .with_high_quality();"); + println!(); + println!(" // Option 2: Custom config"); + println!(" let config = HybridConfig::new()"); + println!(" .with_pre_filter_ratio(0.25)"); + println!(" .with_candidate_limits(3, 8)"); + println!(" .with_thresholds(0.85, 0.15)"); + println!(" .with_weights(0.35, 0.65);"); + println!(); + println!(" let hybrid = HybridStrategy::new(llm_strategy)"); + println!(" .with_config(config);"); + println!("}}"); + println!("```"); + println!(); + + println!("Benefits of hybrid strategy:"); + println!(" ✓ 70-90% reduction in LLM API calls vs pure LLM"); + println!(" ✓ 50-70% reduction in latency"); + println!(" ✓ 90-95% of pure LLM quality"); + println!(" ✓ Graceful degradation when LLM unavailable"); +} + +/// Create a sample document tree for demonstration. 
+fn create_sample_tree() -> DocumentTree { + let mut tree = DocumentTree::new( + "Vectorless Documentation", + "A hierarchical document intelligence engine written in Rust.", + ); + + let intro = tree.add_child( + tree.root(), + "Introduction", + "Vectorless is a document intelligence engine that uses LLM-powered tree navigation.", + ); + + tree.add_child( + intro, + "Key Features", + "No embeddings, zero infrastructure, multi-format support.", + ); + + let arch = tree.add_child( + tree.root(), + "Architecture", + "Three main components: indexer, retriever, storage.", + ); + + let retrieve = tree.add_child( + arch, + "Retrieval Pipeline", + "Multi-stage retrieval with BM25 and LLM strategies.", + ); + + tree.add_child(retrieve, "Keyword Strategy", "Fast BM25-based matching."); + tree.add_child(retrieve, "Hybrid Strategy", "BM25 pre-filter + LLM refinement."); + tree.add_child(retrieve, "Cross-Document", "Multi-document search."); + + tree +} diff --git a/examples/strategy_page_range.rs b/examples/strategy_page_range.rs new file mode 100644 index 00000000..f06635d3 --- /dev/null +++ b/examples/strategy_page_range.rs @@ -0,0 +1,259 @@ +// Copyright (c) 2026 vectorless developers +// SPDX-License-Identifier: Apache-2.0 + +//! Page-Range Retrieval Strategy Example. +//! +//! This example demonstrates how to filter retrieval results by page range, +//! which is particularly useful for PDF documents. +//! +//! # How it works +//! +//! 1. **Page Filtering**: Only considers nodes within specified page range +//! 2. **Boundary Handling**: Configurable handling of nodes spanning boundaries +//! 3. **Context Expansion**: Optionally expands range for surrounding context +//! 4. **Overlap Detection**: Includes nodes that partially overlap with range +//! +//! # Use Cases +//! +//! - "What does chapter 3 say about X?" (pages 45-67) +//! - "Find information in the introduction" (pages 1-10) +//! - "Search the appendix" (pages 200-220) +//! +//! # Usage +//! +//! ```bash +//! 
cargo run --example strategy_page_range +//! ``` + +use vectorless::document::DocumentTree; +use vectorless::retrieval::{PageRange, PageRangeConfig}; + +#[tokio::main] +async fn main() -> vectorless::Result<()> { + println!("=== Page-Range Retrieval Strategy Example ===\n"); + + // 1. Create a sample PDF-like document tree with page numbers + println!("--- Step 1: Document with Page Numbers ---\n"); + let tree = create_pdf_like_tree(); + println!("✓ Created document tree ({} nodes)\n", tree.node_count()); + + // 2. Demonstrate page range creation + println!("--- Step 2: Page Range Options ---\n"); + demo_page_range_options(); + + // 3. Show configuration options + println!("\n--- Step 3: Configuration Options ---\n"); + demo_config_options(); + + // 4. Show boundary handling + println!("\n--- Step 4: Boundary Handling ---\n"); + demo_boundary_handling(); + + // 5. Show context expansion + println!("\n--- Step 5: Context Expansion ---\n"); + demo_context_expansion(); + + // 6. Show usage patterns + println!("\n--- Step 6: Usage Patterns ---\n"); + demo_usage_patterns(); + + println!("\n=== Done ==="); + Ok(()) +} + +/// Demonstrate page range options. 
+fn demo_page_range_options() { + println!("PageRange creation methods:\n"); + + // Specific range + let _range1 = PageRange::new(10, 20); + println!(" PageRange::new(10, 20)"); + println!(" → Range: pages 10-20 (inclusive)"); + println!(" → Use case: Search a specific chapter\n"); + + // Single page + let _range2 = PageRange::single(15); + println!(" PageRange::single(15)"); + println!(" → Range: page 15 only"); + println!(" → Use case: Search a specific page\n"); + + // From page to end + let _range3 = PageRange::from(30); + println!(" PageRange::from(30)"); + println!(" → Range: page 30 to end of document"); + println!(" → Use case: Search appendix or references\n"); + + // From beginning to page + let _range4 = PageRange::until(10); + println!(" PageRange::until(10)"); + println!(" → Range: beginning to page 10"); + println!(" → Use case: Search introduction or preface\n"); + + // Default (all pages) + let _range5 = PageRange::default(); + println!(" PageRange::default()"); + println!(" → Range: all pages"); + println!(" → Use case: No page restriction\n"); + + println!("PageRange methods:"); + println!(" - contains(page): Check if page is in range"); + println!(" - overlaps(start, end): Check if range overlaps"); + println!(" - len(): Get number of pages in range"); + println!(" - is_empty(): Check if range is empty"); +} + +/// Demonstrate configuration options. 
+fn demo_config_options() { + let default_config = PageRangeConfig::default(); + + println!("Default PageRangeConfig:"); + println!(" - range: {:?}", default_config.range); + println!(" - include_boundary_nodes: {}", default_config.include_boundary_nodes); + println!(" - expand_context_pages: {}", default_config.expand_context_pages); + println!(" - min_overlap_ratio: {:.2}", default_config.min_overlap_ratio); + println!(); + + println!("Custom configuration:"); + println!(); + println!("```rust"); + println!("let config = PageRangeConfig::new(PageRange::new(10, 30))"); + println!(" .with_boundary_nodes(true)"); + println!(" .with_context_expansion(2)"); + println!(" .with_min_overlap_ratio(0.3);"); + println!("```"); + println!(); + + println!("Configuration guidelines:"); + println!(" - Strict range: include_boundary_nodes=false, min_overlap_ratio=1.0"); + println!(" - Include context: expand_context_pages=1-3"); + println!(" - Lenient matching: min_overlap_ratio=0.1"); +} + +/// Demonstrate boundary handling. 
+fn demo_boundary_handling() { + println!("Boundary handling example:\n"); + + println!("Scenario: Section spans pages 9-12, query range is 10-15\n"); + + println!(" include_boundary_nodes = false (strict)"); + println!(" → Section (9-12) overlaps with range (10-15)"); + println!(" → Included because overlap exists\n"); + + println!(" include_boundary_nodes = true (lenient)"); + println!(" → Same result, but also includes partial overlaps"); + println!(" → Useful for comprehensive results\n"); + + println!("Overlap calculation:"); + println!(" Section pages: 9-12 (4 pages)"); + println!(" Query range: 10-15 (6 pages)"); + println!(" Overlap: 10-12 (3 pages)"); + println!(" Overlap ratio: 3/4 = 75%\n"); + + println!("min_overlap_ratio threshold:"); + println!(" - 0.1 (10%): Include almost any overlap"); + println!(" - 0.5 (50%): Require significant overlap"); + println!(" - 1.0 (100%): Section must be fully within range"); +} + +/// Demonstrate context expansion. +fn demo_context_expansion() { + println!("Context expansion example:\n"); + + println!("Scenario: Query range is 10-15\n"); + + // Without expansion + println!(" Without expansion (expand_context_pages=0):"); + println!(" → Only pages 10-15 searched"); + println!(" → Might miss related content on pages 9 or 16\n"); + + // With expansion + println!(" With expansion (expand_context_pages=2):"); + println!(" → Effective range: 8-17"); + println!(" → Includes surrounding context for better results\n"); + + println!("When to use context expansion:"); + println!(" ✓ When sections span multiple pages"); + println!(" ✓ When relevant content might be just outside range"); + println!(" ✓ For more comprehensive results\n"); + + println!("When NOT to use context expansion:"); + println!(" ✗ When you need strict page boundaries"); + println!(" ✗ For chapter-specific queries"); + println!(" ✗ When precision is more important than recall"); +} + +/// Demonstrate usage patterns. 
+fn demo_usage_patterns() { + println!("Code example:"); + println!(); + println!("```rust"); + println!("use vectorless::retrieval::{{PageRange, PageRangeConfig, PageRangeStrategy}};"); + println!("use vectorless::retrieval::RetrievalStrategy;"); + println!(); + println!("async fn search_in_chapter(tree: &DocumentTree) {{"); + println!(" // Search only in chapter 3 (pages 45-67)"); + println!(" let range = PageRange::new(45, 67);"); + println!(" let config = PageRangeConfig::new(range)"); + println!(" .with_boundary_nodes(true)"); + println!(" .with_context_expansion(1);"); + println!(); + println!(" let strategy = PageRangeStrategy::new(config);"); + println!(" "); + println!(" // Evaluate nodes within page range"); + println!(" let results = strategy.evaluate_nodes(tree, node_ids, context).await;"); + println!("}}"); + println!("```"); + println!(); + + println!("Common use cases:"); + println!(" 1. Chapter search: PageRange::new(45, 67)"); + println!(" 2. Introduction: PageRange::until(10)"); + println!(" 3. Appendix: PageRange::from(200)"); + println!(" 4. Single page: PageRange::single(42)"); + println!(); + + println!("Best practices:"); + println!(" - Know your document's page structure"); + println!(" - Use context_expansion for flowing content"); + println!(" - Use strict boundaries for discrete sections"); + println!(" - Combine with other strategies (hybrid, keyword)"); +} + +/// Create a sample PDF-like document tree with page numbers. 
+fn create_pdf_like_tree() -> DocumentTree { + let mut tree = DocumentTree::new( + "Sample PDF Document", + "A sample document simulating PDF structure with page numbers.", + ); + + // Introduction (pages 1-5) + let intro = tree.add_child(tree.root(), "Introduction", "Overview of the document."); + tree.set_page_boundaries(intro, 1, 5); + tree.add_child_with_pages(intro, "Background", "Background information.", 1, 2); + tree.add_child_with_pages(intro, "Motivation", "Why this document exists.", 3, 4); + tree.add_child_with_pages(intro, "Scope", "What is covered.", 5, 5); + + // Main Content (pages 6-40) + let main = tree.add_child(tree.root(), "Main Content", "Primary content sections."); + tree.set_page_boundaries(main, 6, 40); + + let chapter1 = tree.add_child_with_pages(main, "Chapter 1", "Getting started.", 6, 15); + tree.add_child_with_pages(chapter1, "Installation", "How to install.", 7, 9); + tree.add_child_with_pages(chapter1, "Configuration", "Configuration options.", 10, 12); + + let chapter2 = tree.add_child_with_pages(main, "Chapter 2", "Core concepts.", 16, 28); + tree.add_child_with_pages(chapter2, "Architecture", "System architecture.", 16, 20); + tree.add_child_with_pages(chapter2, "Data Model", "How data is organized.", 21, 24); + + let chapter3 = tree.add_child_with_pages(main, "Chapter 3", "Advanced usage.", 29, 40); + tree.add_child_with_pages(chapter3, "Custom Strategies", "Implementing custom strategies.", 29, 33); + tree.add_child_with_pages(chapter3, "Performance", "Optimizing performance.", 34, 37); + + // Appendix (pages 41-50) + let appendix = tree.add_child(tree.root(), "Appendix", "Reference materials."); + tree.set_page_boundaries(appendix, 41, 50); + tree.add_child_with_pages(appendix, "API Reference", "Complete API documentation.", 41, 45); + tree.add_child_with_pages(appendix, "Config Reference", "All configuration options.", 46, 48); + + tree +} diff --git a/src/parser/html/config.rs b/src/parser/html/config.rs new file mode 100644 
index 00000000..f3b6c05c --- /dev/null +++ b/src/parser/html/config.rs @@ -0,0 +1,166 @@ +// Copyright (c) 2026 vectorless developers +// SPDX-License-Identifier: Apache-2.0 + +//! Configuration for HTML parsing. + +use serde::{Deserialize, Serialize}; + +/// Configuration for HTML parsing. +#[derive(Debug, Clone, Serialize, Deserialize)] +pub struct HtmlConfig { + /// Default title for nodes without headings. + #[serde(default = "default_title")] + pub default_title: String, + + /// Minimum content length to keep a node. + #[serde(default = "default_min_content_length")] + pub min_content_length: usize, + + /// Whether to include code blocks. + #[serde(default = "default_include_code_blocks")] + pub include_code_blocks: bool, + + /// Whether to merge small consecutive nodes. + #[serde(default = "default_merge_small_nodes")] + pub merge_small_nodes: bool, + + /// Maximum heading level to process (1-6). + #[serde(default = "default_max_heading_level")] + pub max_heading_level: usize, +} + +fn default_title() -> String { + "Introduction".to_string() +} + +fn default_min_content_length() -> usize { + 50 +} + +fn default_include_code_blocks() -> bool { + true +} + +fn default_merge_small_nodes() -> bool { + true +} + +fn default_max_heading_level() -> usize { + 6 +} + +impl Default for HtmlConfig { + fn default() -> Self { + Self { + default_title: default_title(), + min_content_length: default_min_content_length(), + include_code_blocks: default_include_code_blocks(), + merge_small_nodes: default_merge_small_nodes(), + max_heading_level: default_max_heading_level(), + } + } +} + +impl HtmlConfig { + /// Create a new config with default values. + pub fn new() -> Self { + Self::default() + } + + /// Set the default title for nodes without headings. + #[must_use] + pub fn with_default_title(mut self, title: impl Into) -> Self { + self.default_title = title.into(); + self + } + + /// Set minimum content length to keep a node. 
+ #[must_use] + pub fn with_min_content_length(mut self, len: usize) -> Self { + self.min_content_length = len; + self + } + + /// Enable or disable code blocks. + #[must_use] + pub fn with_code_blocks(mut self, include: bool) -> Self { + self.include_code_blocks = include; + self + } + + /// Enable or disable merging of small consecutive nodes. + #[must_use] + pub fn with_merge_small_nodes(mut self, merge: bool) -> Self { + self.merge_small_nodes = merge; + self + } + + /// Set maximum heading level to process (1-6). + #[must_use] + pub fn with_max_heading_level(mut self, level: usize) -> Self { + self.max_heading_level = level.clamp(1, 6); + self + } + + /// Create a config that excludes code blocks. + #[must_use] + pub fn no_code_blocks() -> Self { + Self::new().with_code_blocks(false) + } + + /// Create a config for simple documents (no merging). + #[must_use] + pub fn simple() -> Self { + Self::new() + .with_merge_small_nodes(false) + .with_min_content_length(0) + } +} + +#[cfg(test)] +mod tests { + use super::*; + + #[test] + fn test_default_config() { + let config = HtmlConfig::default(); + assert_eq!(config.default_title, "Introduction"); + assert_eq!(config.min_content_length, 50); + assert!(config.include_code_blocks); + assert!(config.merge_small_nodes); + assert_eq!(config.max_heading_level, 6); + } + + #[test] + fn test_builder_pattern() { + let config = HtmlConfig::new() + .with_default_title("Overview") + .with_min_content_length(100) + .with_code_blocks(false) + .with_max_heading_level(3); + + assert_eq!(config.default_title, "Overview"); + assert_eq!(config.min_content_length, 100); + assert!(!config.include_code_blocks); + assert_eq!(config.max_heading_level, 3); + } + + #[test] + fn test_max_heading_level_clamp() { + let config = HtmlConfig::new().with_max_heading_level(10); + assert_eq!(config.max_heading_level, 6); + + let config = HtmlConfig::new().with_max_heading_level(0); + assert_eq!(config.max_heading_level, 1); + } + + #[test] + fn 
test_preset_configs() { + let config = HtmlConfig::no_code_blocks(); + assert!(!config.include_code_blocks); + + let config = HtmlConfig::simple(); + assert!(!config.merge_small_nodes); + assert_eq!(config.min_content_length, 0); + } +} diff --git a/src/parser/html/mod.rs b/src/parser/html/mod.rs new file mode 100644 index 00000000..b920b4a5 --- /dev/null +++ b/src/parser/html/mod.rs @@ -0,0 +1,45 @@ +// Copyright (c) 2026 vectorless developers +// SPDX-License-Identifier: Apache-2.0 + +//! HTML document parser. +//! +//! This module provides an HTML parser that extracts hierarchical structure +//! from HTML documents using heading tags (`

<h1>`-`<h6>`) as section markers.
//!
//! # Features
//!
//! - Parses HTML5 documents using `scraper`
//! - Extracts heading hierarchy (`<h1>`-`<h6>`)
//! - Extracts content from paragraphs, lists, tables, etc.
//! - Preserves document structure
//!
//! # Example
//!
//! ```rust
//! use vectorless::parser::html::HtmlParser;
//! use vectorless::parser::DocumentParser;
//!
//! # #[tokio::main]
//! # async fn main() -> vectorless::Result<()> {
//! let parser = HtmlParser::new();
//! let html = r#"
//! <html>
//! <body>
//! <h1>Title</h1>
//! <p>Introduction paragraph.</p>
//! <h2>Section 1</h2>
//! <p>Content for section 1.</p>
+//! +//! +//! "#; +//! let result = parser.parse(html).await?; +//! println!("Found {} nodes", result.node_count()); +//! # Ok(()) +//! # } +//! ``` + +mod config; +mod parser; + +pub use config::HtmlConfig; +pub use parser::HtmlParser; diff --git a/src/parser/html/parser.rs b/src/parser/html/parser.rs new file mode 100644 index 00000000..331a2b1b --- /dev/null +++ b/src/parser/html/parser.rs @@ -0,0 +1,540 @@ +// Copyright (c) 2026 vectorless developers +// SPDX-License-Identifier: Apache-2.0 + +//! HTML parser implementation using scraper. + +use async_trait::async_trait; +use scraper::{ElementRef, Html, Selector}; +use std::path::Path; + +use crate::error::Result; +use crate::parser::{DocumentFormat, DocumentMeta, DocumentParser, ParseResult, RawNode}; +use crate::util::estimate_tokens; + +use super::config::HtmlConfig; + +/// Metadata extracted from HTML. +struct HtmlMetadata { + title: String, + description: Option, + author: Option, + keywords: Option, +} + +impl Default for HtmlMetadata { + fn default() -> Self { + Self { + title: String::new(), + description: None, + author: None, + keywords: None, + } + } +} + +/// HTML parser that extracts hierarchical structure from HTML documents. +/// +/// Uses `scraper` for HTML5-compliant parsing. Extracts heading hierarchy +/// and content from various HTML elements. +#[derive(Debug, Clone)] +pub struct HtmlParser { + /// Configuration options. + config: HtmlConfig, +} + +impl Default for HtmlParser { + fn default() -> Self { + Self::new() + } +} + +impl HtmlParser { + /// Create a new HTML parser with default configuration. + #[must_use] + pub fn new() -> Self { + Self::with_config(HtmlConfig::default()) + } + + /// Create a parser with custom configuration. + #[must_use] + pub fn with_config(config: HtmlConfig) -> Self { + Self { config } + } + + /// Parse HTML content and extract nodes. 
+ fn extract_nodes(&self, content: &str) -> (Vec, HtmlMetadata) { + let document = Html::parse_document(content); + + // Extract metadata from + let metadata = self.extract_metadata(&document); + + // Extract nodes from + let nodes = self.extract_nodes_from_document(&document); + + (nodes, metadata) + } + + /// Extract metadata from the document head. + fn extract_metadata(&self, document: &Html) -> HtmlMetadata { + let mut meta = HtmlMetadata::default(); + + // Extract title + if let Ok(selector) = Selector::parse("title") { + if let Some(title_elem) = document.select(&selector).next() { + meta.title = title_elem.text().collect::(); + } + } + + // Extract meta description + if let Ok(selector) = Selector::parse("meta[name=\"description\"]") { + if let Some(desc_elem) = document.select(&selector).next() { + if let Some(content) = desc_elem.value().attr("content") { + meta.description = Some(content.to_string()); + } + } + } + + // Extract meta author + if let Ok(selector) = Selector::parse("meta[name=\"author\"]") { + if let Some(author_elem) = document.select(&selector).next() { + if let Some(content) = author_elem.value().attr("content") { + meta.author = Some(content.to_string()); + } + } + } + + // Extract meta keywords + if let Ok(selector) = Selector::parse("meta[name=\"keywords\"]") { + if let Some(keywords_elem) = document.select(&selector).next() { + if let Some(content) = keywords_elem.value().attr("content") { + meta.keywords = Some(content.to_string()); + } + } + } + + // Also try Open Graph description + if meta.description.is_none() { + if let Ok(selector) = Selector::parse("meta[property=\"og:description\"]") { + if let Some(og_elem) = document.select(&selector).next() { + if let Some(content) = og_elem.value().attr("content") { + meta.description = Some(content.to_string()); + } + } + } + } + + meta + } + + /// Extract nodes from the document. 
+ fn extract_nodes_from_document(&self, document: &Html) -> Vec { + let mut nodes = Vec::new(); + + // Parse body selector + let body_selector = match Selector::parse("body") { + Ok(s) => s, + Err(_) => return nodes, + }; + + let body = match document.select(&body_selector).next() { + Some(b) => b, + None => return nodes, + }; + + // Collect all headings in order + let heading_selector = Selector::parse("h1, h2, h3, h4, h5, h6").unwrap(); + + let mut headings: Vec<(usize, String, usize)> = Vec::new(); // (index, title, level) + + for (idx, heading) in body.select(&heading_selector).enumerate() { + let level = self.get_heading_level(heading.value().name()); + if let Some(lvl) = level { + if lvl <= self.config.max_heading_level { + let title: String = heading.text().collect(); + if !title.trim().is_empty() { + headings.push((idx, title.trim().to_string(), lvl)); + } + } + } + } + + // If no headings found, try to extract content anyway + if headings.is_empty() { + let content = self.extract_body_content(body); + if !content.trim().is_empty() { + nodes.push(RawNode { + title: self.config.default_title.clone(), + content: content.trim().to_string(), + level: 0, + line_start: 1, + line_end: 1, + page: None, + token_count: Some(estimate_tokens(&content)), + total_token_count: None, + }); + } + return nodes; + } + + // Extract content between headings + for (i, (_, title, level)) in headings.iter().enumerate() { + let content = self.extract_content_after_heading(body, &headings, i); + + if !title.is_empty() || !content.trim().is_empty() { + nodes.push(RawNode { + title: title.clone(), + content: content.trim().to_string(), + level: *level, + line_start: 1, + line_end: 1, + page: None, + token_count: Some(estimate_tokens(&content)), + total_token_count: None, + }); + } + } + + // Post-process nodes + self.finalize_nodes(nodes) + } + + /// Get heading level from tag name (h1-h6). 
+ fn get_heading_level(&self, tag: &str) -> Option { + match tag { + "h1" => Some(1), + "h2" => Some(2), + "h3" => Some(3), + "h4" => Some(4), + "h5" => Some(5), + "h6" => Some(6), + _ => None, + } + } + + /// Extract body content (for documents without headings). + fn extract_body_content(&self, body: ElementRef) -> String { + let mut content = String::new(); + + // Extract paragraphs + if let Ok(selector) = Selector::parse("p") { + for p in body.select(&selector) { + let text: String = p.text().collect(); + if !text.trim().is_empty() { + if !content.is_empty() { + content.push_str("\n\n"); + } + content.push_str(text.trim()); + } + } + } + + content + } + + /// Extract content after a heading until the next heading. + fn extract_content_after_heading( + &self, + body: ElementRef, + headings: &[(usize, String, usize)], + heading_index: usize, + ) -> String { + let mut content = String::new(); + + // Get all content elements + let content_selector = Selector::parse("p, ul, ol, table, pre, blockquote, div.content, article, section") + .unwrap(); + + // This is a simplified approach - extract content from sibling elements + // In a more sophisticated implementation, we would track DOM positions + for elem in body.select(&content_selector) { + let text = self.extract_element_content(elem); + if !text.is_empty() { + if !content.is_empty() { + content.push_str("\n\n"); + } + content.push_str(&text); + } + } + + content + } + + /// Extract content from a single element. 
+ fn extract_element_content(&self, elem: ElementRef) -> String { + let tag = elem.value().name(); + + match tag { + "p" | "div" | "article" | "section" => { + let text: String = elem.text().collect(); + text.trim().to_string() + } + "ul" => self.extract_list(elem, false), + "ol" => self.extract_list(elem, true), + "table" => self.extract_table(elem), + "pre" | "code" if self.config.include_code_blocks => { + let text: String = elem.text().collect(); + if !text.trim().is_empty() { + format!("```\n{}\n```", text.trim()) + } else { + String::new() + } + } + "blockquote" => { + let text: String = elem.text().collect(); + if !text.trim().is_empty() { + text + .lines() + .map(|line| format!("> {}", line)) + .collect::>() + .join("\n") + } else { + String::new() + } + } + _ => String::new(), + } + } + + /// Extract list content. + fn extract_list(&self, element: ElementRef, ordered: bool) -> String { + let mut result = String::new(); + let li_selector = Selector::parse("li").unwrap(); + let mut counter = 1; + + for li in element.select(&li_selector) { + let text: String = li.text().collect(); + if !text.trim().is_empty() { + if !result.is_empty() { + result.push('\n'); + } + if ordered { + result.push_str(&format!("{}. {}", counter, text.trim())); + counter += 1; + } else { + result.push_str(&format!("• {}", text.trim())); + } + } + } + + result + } + + /// Extract table content. + fn extract_table(&self, element: ElementRef) -> String { + let mut result = String::new(); + let tr_selector = Selector::parse("tr").unwrap(); + + for tr in element.select(&tr_selector) { + let mut cells = Vec::new(); + let td_selector = Selector::parse("td, th").unwrap(); + + for cell in tr.select(&td_selector) { + let text: String = cell.text().collect(); + cells.push(text.trim().to_string()); + } + + if !cells.is_empty() { + if !result.is_empty() { + result.push('\n'); + } + result.push_str(&cells.join(" | ")); + } + } + + result + } + + /// Finalize nodes after extraction. 
+ fn finalize_nodes(&self, mut nodes: Vec) -> Vec { + // Remove empty nodes + nodes.retain(|n| !n.title.is_empty() || !n.content.trim().is_empty()); + + // Merge small consecutive nodes if configured + if self.config.merge_small_nodes { + nodes = self.merge_small_nodes(nodes); + } + + nodes + } + + /// Merge small consecutive nodes. + fn merge_small_nodes(&self, nodes: Vec) -> Vec { + let mut result: Vec = Vec::new(); + + for node in nodes { + if let Some(last) = result.last_mut() { + // Merge if same level and content is small + if last.level == node.level && last.content.len() < self.config.min_content_length + { + if !last.content.is_empty() { + last.content.push_str("\n\n"); + } + last.content.push_str(&node.content); + continue; + } + } + result.push(node); + } + + result + } +} + +#[async_trait] +impl DocumentParser for HtmlParser { + fn format(&self) -> DocumentFormat { + DocumentFormat::Html + } + + async fn parse(&self, content: &str) -> Result { + let line_count = content.lines().count(); + let (nodes, html_meta) = self.extract_nodes(content); + + let meta = DocumentMeta { + name: html_meta.title, + format: DocumentFormat::Html, + page_count: None, + line_count, + source_path: None, + description: html_meta.description, + }; + + Ok(ParseResult::new(meta, nodes)) + } + + async fn parse_file(&self, path: &Path) -> Result { + let content = tokio::fs::read_to_string(path) + .await + .map_err(|e| crate::Error::Parse(format!("Failed to read file: {}", e)))?; + + let mut result = self.parse(&content).await?; + + // Extract document name from filename (if not set by meta) + if result.meta.name.is_empty() { + if let Some(stem) = path.file_stem() { + result.meta.name = stem.to_string_lossy().to_string(); + } + } + result.meta.source_path = Some(path.to_string_lossy().to_string()); + + Ok(result) + } +} + +#[cfg(test)] +mod tests { + use super::*; + + #[tokio::test] + async fn test_parse_simple_html() { + let parser = HtmlParser::new(); + let html = r#" + Test 
Document + +

Main Title

+

This is a paragraph.

+

Section 1

+

Section content.

+ + "#; + + let result = parser.parse(html).await.unwrap(); + + assert_eq!(result.meta.name, "Test Document"); + assert!(!result.nodes.is_empty()); + } + + #[tokio::test] + async fn test_parse_headings() { + let parser = HtmlParser::new(); + let html = r#" +

H1 Title

+

Content 1

+

H2 Title

+

Content 2

+

H3 Title

+

Content 3

+ "#; + + let result = parser.parse(html).await.unwrap(); + + let heading_nodes: Vec<_> = result.nodes.iter().filter(|n| n.level > 0).collect(); + assert!(heading_nodes.len() >= 3); + } + + #[tokio::test] + async fn test_parse_metadata() { + let parser = HtmlParser::new(); + let html = r#" + + My Page + + + +

Content

+ "#; + + let result = parser.parse(html).await.unwrap(); + + assert_eq!(result.meta.name, "My Page"); + assert_eq!(result.meta.description, Some("A test page".to_string())); + } + + #[tokio::test] + async fn test_parse_list() { + let parser = HtmlParser::new(); + let html = r#" +

List Example

+
    +
  • Item 1
  • +
  • Item 2
  • +
  • Item 3
  • +
+ "#; + + let result = parser.parse(html).await.unwrap(); + + let list_node = result.nodes.iter().find(|n| n.title == "List Example"); + assert!(list_node.is_some()); + } + + #[tokio::test] + async fn test_parse_table() { + let parser = HtmlParser::new(); + let html = r#" +

Table Example

+ + + +
NameAge
Alice30
+ "#; + + let result = parser.parse(html).await.unwrap(); + + let table_node = result.nodes.iter().find(|n| n.title == "Table Example"); + assert!(table_node.is_some()); + } + + #[tokio::test] + async fn test_empty_document() { + let parser = HtmlParser::new(); + let result = parser.parse("").await.unwrap(); + + assert!(result.nodes.is_empty()); + } + + #[tokio::test] + async fn test_no_headings() { + let parser = HtmlParser::new(); + let html = r#" +

Just some text.

+

More text.

+ "#; + + let result = parser.parse(html).await.unwrap(); + + // Should create a default node + assert_eq!(result.nodes.len(), 1); + assert_eq!(result.nodes[0].title, "Introduction"); + } +} diff --git a/src/parser/mod.rs b/src/parser/mod.rs index 4442f25c..7bb952d7 100644 --- a/src/parser/mod.rs +++ b/src/parser/mod.rs @@ -12,7 +12,7 @@ //! - **Markdown** - Full support via [`MarkdownParser`] //! - **PDF** - Full support via [`PdfParser`] with TOC extraction //! - **DOCX** - Full support via [`DocxParser`] with heading detection -//! - **HTML** - Planned (placeholder) +//! - **HTML** - Full support via [`HtmlParser`] with heading hierarchy //! //! # Example //! @@ -46,6 +46,9 @@ pub mod markdown; // PDF parsing module pub mod pdf; +// HTML parsing module +pub mod html; + // TOC processing module pub mod toc; @@ -63,5 +66,6 @@ pub use registry::{ParserRegistry, get_parser, get_parser_for_file, parse_conten // Re-export concrete parsers pub use docx::DocxParser; +pub use html::{HtmlConfig, HtmlParser}; pub use markdown::{MarkdownConfig, MarkdownParser}; pub use pdf::PdfParser; diff --git a/src/parser/registry.rs b/src/parser/registry.rs index e59863a3..ef1cf416 100644 --- a/src/parser/registry.rs +++ b/src/parser/registry.rs @@ -13,7 +13,9 @@ use std::sync::{Arc, RwLock}; use crate::Error; use crate::error::Result; -use crate::parser::{DocumentFormat, DocumentParser, MarkdownParser, ParseResult, PdfParser}; +use crate::parser::{ + DocumentFormat, DocumentParser, HtmlParser, MarkdownParser, ParseResult, PdfParser, +}; /// Type alias for parser factory functions. type ParserFactory = Box Box + Send + Sync>; @@ -63,10 +65,12 @@ impl ParserRegistry { registry } - /// Register default parsers (Markdown, PDF). + /// Register default parsers (Markdown, PDF, HTML, DOCX). 
pub fn register_defaults(&self) { self.register("markdown", || Box::new(MarkdownParser::new())); self.register("pdf", || Box::new(PdfParser::new())); + self.register("html", || Box::new(HtmlParser::new())); + self.register("docx", || Box::new(super::docx::DocxParser::new())); } /// Register a parser factory by name. @@ -182,7 +186,7 @@ pub fn get_parser(format: DocumentFormat) -> Option> { match format { DocumentFormat::Markdown => Some(Box::new(MarkdownParser::new())), DocumentFormat::Pdf => Some(Box::new(PdfParser::new())), - DocumentFormat::Html => None, // TODO: Implement HTML parser + DocumentFormat::Html => Some(Box::new(HtmlParser::new())), DocumentFormat::Docx => Some(Box::new(super::docx::DocxParser::new())), DocumentFormat::Text => None, // TODO: Implement plain text parser } @@ -243,6 +247,7 @@ mod tests { let registry = ParserRegistry::with_defaults(); let formats = registry.supported_formats(); assert!(formats.contains(&DocumentFormat::Markdown)); + assert!(formats.contains(&DocumentFormat::Html)); } #[test] @@ -267,6 +272,14 @@ mod tests { assert!(parser.is_some()); } + #[test] + fn test_html_parser_registered() { + let registry = ParserRegistry::with_defaults(); + assert!(registry.supports(DocumentFormat::Html)); + let parser = registry.get(DocumentFormat::Html); + assert!(parser.is_some()); + } + #[test] fn test_get_parser_function() { let parser = get_parser(DocumentFormat::Markdown); @@ -278,4 +291,10 @@ mod tests { let parser = get_parser_for_file(Path::new("test.md")); assert!(parser.is_some()); } + + #[test] + fn test_get_html_parser_for_file() { + let parser = get_parser_for_file(Path::new("test.html")); + assert!(parser.is_some()); + } }