diff --git a/rust/examples/advanced.rs b/rust/examples/advanced.rs
index a5c367b4..a75608d1 100644
--- a/rust/examples/advanced.rs
+++ b/rust/examples/advanced.rs
@@ -9,10 +9,10 @@
 //! # Usage
 //!
 //! ```bash
-//! # First, copy the example config and edit it
-//! cp config.toml ./my_vectorless.toml
-//! # Edit my_vectorless.toml to customize settings
+//! # Using environment variables for LLM config (overrides config file):
+//! LLM_API_KEY=sk-xxx LLM_MODEL=gpt-4o cargo run --example advanced
 //!
+//! # Or with defaults (using config file):
 //! cargo run --example advanced
 //! ```
@@ -20,12 +20,28 @@ use vectorless::{EngineBuilder, IndexContext, QueryContext};
 #[tokio::main]
 async fn main() -> vectorless::Result<()> {
+    // Initialize tracing for debug output (set RUST_LOG=debug to see more)
+    tracing_subscriber::fmt::init();
+
     println!("=== Vectorless Advanced Example (Config File) ===\n");
     // Load all settings from the specified config file.
     // The config file must include api_key and model.
-    let client = EngineBuilder::new()
-        .with_config_path("./config.toml")
+    // If environment variables are set, they override the config file values.
+    let mut builder = EngineBuilder::new().with_config_path("./config.toml");
+
+    // Override config with env vars if present
+    if let Ok(api_key) = std::env::var("LLM_API_KEY") {
+        builder = builder.with_key(&api_key);
+    }
+    if let Ok(model) = std::env::var("LLM_MODEL") {
+        builder = builder.with_model(&model);
+    }
+    if let Ok(endpoint) = std::env::var("LLM_ENDPOINT") {
+        builder = builder.with_endpoint(&endpoint);
+    }
+
+    let client = builder
         .build()
         .await
         .map_err(|e: vectorless::BuildError| vectorless::Error::Config(e.to_string()))?;
@@ -56,4 +72,4 @@ async fn main() -> vectorless::Result<()> {
     println!("\n=== Done ===");
     Ok(())
-}
+}
\ No newline at end of file
diff --git a/rust/examples/events.rs b/rust/examples/events.rs
index 65176751..b0433dc7 100644
--- a/rust/examples/events.rs
+++ b/rust/examples/events.rs
@@ -11,6 +11,11 @@
 //! # Usage
 //!
 //! ```bash
+//! # Using environment variables for LLM config:
+//! LLM_API_KEY=sk-xxx LLM_MODEL=gpt-4o \
+//! LLM_ENDPOINT=https://api.openai.com/v1 cargo run --example events
+//!
+//! # Or with defaults (edit the code to set your key/endpoint):
 //! cargo run --example events
 //! ```
@@ -22,6 +27,9 @@ use vectorless::events::{EventEmitter, IndexEvent, QueryEvent};
 #[tokio::main]
 async fn main() -> Result<(), Box<dyn std::error::Error>> {
+    // Initialize tracing for debug output (set RUST_LOG=debug to see more)
+    tracing_subscriber::fmt::init();
+
     println!("=== Event Callbacks Example ===\n");
     // 1. Create event emitter with handlers
@@ -90,87 +98,61 @@ async fn main() -> Result<(), Box<dyn std::error::Error>> {
     println!("  ✓ Event handlers configured\n");
+    // Build engine with LLM configuration from environment or defaults.
+    // Adjust the defaults below to match your setup.
+    let api_key = std::env::var("LLM_API_KEY")
+        .unwrap_or_else(|_| "sk-...".to_string());
+    let model = std::env::var("LLM_MODEL")
+        .unwrap_or_else(|_| "gpt-4o".to_string());
+    let endpoint = std::env::var("LLM_ENDPOINT")
+        .unwrap_or_else(|_| "https://api.openai.com/v1".to_string());
+
+    // 2. 
Create engine with events println!("Step 2: Creating engine with event emitter..."); let engine = EngineBuilder::new() .with_workspace("./workspace_events_example") - .with_key("sk-...") - .with_model("gpt-4o") + .with_key(&api_key) + .with_model(&model) + .with_endpoint(&endpoint) .with_events(events) .build() - .await - .map_err(|e: vectorless::BuildError| vectorless::Error::Config(e.to_string()))?; + .await?; println!(" ✓ Engine created\n"); - // 3. Index a document (events will fire) - println!("Step 3: Indexing document (watch events)...\n"); - - let temp_dir = tempfile::tempdir()?; - let doc_content = r#"# Example Document - -## Introduction - -This is an example document for demonstrating event callbacks. - -## Features - -- Event monitoring for indexing -- Event monitoring for queries -- Progress tracking - -## Architecture - -The event system uses handlers that can be attached to the engine builder. -"#; - - let doc_path = temp_dir.path().join("example.md"); - tokio::fs::write(&doc_path, doc_content).await?; - - let index_result = engine.index(IndexContext::from_path(&doc_path)).await?; - let doc_id = index_result.doc_id().unwrap().to_string(); - println!(); - - // 4. Query the document (events will fire) - println!("Step 4: Querying document (watch events)...\n"); - + // 3. Index a document with events + println!("Step 3: Indexing document (with events)..."); let result = engine - .query(QueryContext::new("What features are available?").with_doc_id(&doc_id)) + .index(IndexContext::from_path("../README.md")) .await?; - println!(); + let doc_id = result.doc_id().unwrap().to_string(); + println!(" ✓ Indexed: {doc_id}\n"); - // 5. Show results - println!("Step 5: Query result:"); + // 4. Query with events + println!("Step 4: Querying (with events)..."); + let result = engine + .query( + QueryContext::new("What is vectorless?") + .with_doc_id(&doc_id) + ) + .await?; if let Some(item) = result.single() { - println!(" - Score: {:.2}", item.score); - println!(" - Nodes: {}", item.node_ids.len()); + println!(" ✓ Found result ({} chars)", item.content.len()); if !item.content.is_empty() { - let preview: String = item.content.chars().take(100).collect(); - println!(" - Content: {}...", preview); + let preview: String = item.content.chars().take(200).collect(); + println!(" Preview: {}...", preview); } } - println!(); - - // 6. Show statistics - println!("Step 6: Event statistics:"); - println!( - " - Index events fired: {}", - index_count.load(Ordering::SeqCst) - ); - println!( - " - Query events fired: {}", - query_count.load(Ordering::SeqCst) - ); - println!( - " - Nodes visited: {}", - nodes_visited.load(Ordering::SeqCst) - ); - println!(); - - // 7. Cleanup - println!("Step 7: Cleanup..."); + + // 5. Stats + println!("\n--- Stats ---"); + println!(" Documents indexed: {}", index_count.load(Ordering::SeqCst)); + println!(" Queries executed: {}", query_count.load(Ordering::SeqCst)); + println!(" Nodes visited: {}", nodes_visited.load(Ordering::SeqCst)); + + // Cleanup engine.remove(&doc_id).await?; - println!(" ✓ Document removed\n"); + println!("\n Cleaned up"); - println!("=== Example Complete ==="); + println!("\n=== Done ==="); Ok(()) } diff --git a/rust/examples/flow.rs b/rust/examples/flow.rs index ff1b6ca7..4778bd44 100644 --- a/rust/examples/flow.rs +++ b/rust/examples/flow.rs @@ -12,6 +12,11 @@ //! # Usage //! //! ```bash +//! # Using environment variables for LLM config: +//! LLM_API_KEY=sk-xxx LLM_MODEL=gpt-4o \ +//! 
LLM_ENDPOINT=https://api.openai.com/v1 cargo run --example flow +//! +//! # Or with defaults (edit the code to set your key/endpoint): //! cargo run --example flow //! ``` @@ -54,14 +59,23 @@ async fn main() -> vectorless::Result<()> { println!("=== Vectorless Flow Example ===\n"); + // Build engine with LLM configuration from environment or defaults. + // Adjust the defaults below to match your setup. + let api_key = std::env::var("LLM_API_KEY") + .unwrap_or_else(|_| "sk-...".to_string()); + let model = std::env::var("LLM_MODEL") + .unwrap_or_else(|_| "gpt-4o".to_string()); + let endpoint = std::env::var("LLM_ENDPOINT") + .unwrap_or_else(|_| "https://api".to_string()); + // Step 1: Create a Vectorless client println!("Step 1: Creating Vectorless client..."); let engine = EngineBuilder::new() .with_workspace("./worksspace_flow_example") - .with_key("sk...") - .with_model("gpt-4o") - .with_endpoint("https://api") + .with_key(&api_key) + .with_model(&model) + .with_endpoint(&endpoint) .build() .await .map_err(|e| vectorless::Error::Config(e.to_string()))?; @@ -130,12 +144,10 @@ async fn main() -> vectorless::Result<()> { println!(); } - // Step 5: Cleanup - println!("Step 5: Cleanup..."); - - // engine.remove(&doc_id).await?; - // println!(" - Document removed"); + // Cleanup + for doc in engine.list().await? { + engine.remove(&doc.id).await?; + } - println!("\n=== Example Complete ==="); Ok(()) } diff --git a/rust/examples/graph.rs b/rust/examples/graph.rs index cdefb451..ac87a673 100644 --- a/rust/examples/graph.rs +++ b/rust/examples/graph.rs @@ -10,6 +10,11 @@ //! # Usage //! //! ```bash +//! # Using environment variables for LLM config: +//! LLM_API_KEY=sk-xxx LLM_MODEL=gpt-4o \ +//! cargo run --example graph +//! +//! # Or with defaults (edit the code to set your key/endpoint): //! cargo run --example graph //! ``` @@ -17,13 +22,23 @@ use vectorless::{EngineBuilder, IndexContext}; #[tokio::main] async fn main() -> vectorless::Result<()> { + // Initialize tracing for debug output (set RUST_LOG=debug to see more) + tracing_subscriber::fmt::init(); + println!("=== Document Graph Example ===\n"); + // Build engine with LLM configuration from environment or defaults. + // Adjust the defaults below to match your setup. + let api_key = std::env::var("LLM_API_KEY") + .unwrap_or_else(|_| "sk-...".to_string()); + let model = std::env::var("LLM_MODEL") + .unwrap_or_else(|_| "gpt-4o".to_string()); + // 1. Create engine let engine = EngineBuilder::new() .with_workspace("./workspace_graph_example") - .with_key("sk-...") - .with_model("gpt-4o") + .with_key(&api_key) + .with_model(&model) .build() .await .map_err(|e: vectorless::BuildError| vectorless::Error::Config(e.to_string()))?; @@ -91,4 +106,4 @@ async fn main() -> vectorless::Result<()> { println!("\n=== Done ==="); Ok(()) -} +} \ No newline at end of file diff --git a/rust/examples/index_incremental.rs b/rust/examples/index_incremental.rs index 6b710a93..32254d7d 100644 --- a/rust/examples/index_incremental.rs +++ b/rust/examples/index_incremental.rs @@ -4,6 +4,11 @@ //! Incremental indexing example — re-index with change detection. //! //! ```bash +//! # Using environment variables for LLM config: +//! LLM_API_KEY=sk-xxx LLM_MODEL=google/gemini-3-flash-preview \ +//! LLM_ENDPOINT=http://localhost:4000/api/v1 cargo run --example index_incremental +//! +//! # Or with defaults (edit the code to set your key/endpoint): //! cargo run --example index_incremental //! 
```
@@ -11,11 +16,23 @@ use vectorless::{DocumentFormat, EngineBuilder, IndexContext, IndexMode};
 
 #[tokio::main]
 async fn main() -> vectorless::Result<()> {
+    // Initialize tracing for debug output (set RUST_LOG=debug to see more)
+    tracing_subscriber::fmt::init();
+
+    // Build engine with LLM configuration from environment or defaults.
+    // Adjust the defaults below to match your setup.
+    let api_key = std::env::var("LLM_API_KEY")
+        .unwrap_or_else(|_| "sk-or-v1-...".to_string());
+    let model = std::env::var("LLM_MODEL")
+        .unwrap_or_else(|_| "google/gemini-3-flash-preview".to_string());
+    let endpoint = std::env::var("LLM_ENDPOINT")
+        .unwrap_or_else(|_| "http://localhost:4000/api/v1".to_string());
+
     let engine = EngineBuilder::new()
         .with_workspace("./workspace_incremental_example")
-        .with_key("sk-or-v1-...")
-        .with_model("google/gemini-3-flash-preview")
-        .with_endpoint("http://localhost:4000/api/v1")
+        .with_key(&api_key)
+        .with_model(&model)
+        .with_endpoint(&endpoint)
         .build()
         .await
         .map_err(|e| vectorless::Error::Config(e.to_string()))?;
@@ -93,4 +110,4 @@ Deletes a user by their unique identifier.
     }
 
     Ok(())
-}
+}
\ No newline at end of file
diff --git a/rust/examples/index_pdf.rs b/rust/examples/index_pdf.rs
new file mode 100644
index 00000000..c7840e14
--- /dev/null
+++ b/rust/examples/index_pdf.rs
@@ -0,0 +1,110 @@
+// Copyright (c) 2026 vectorless developers
+// SPDX-License-Identifier: Apache-2.0
+
+//! PDF indexing example — index a PDF document via the vectorless engine.
+//!
+//! ```bash
+//! # Using environment variables for LLM config:
+//! LLM_API_KEY=sk-xxx LLM_MODEL=google/gemini-3-flash-preview \
+//! cargo run --example index_pdf -- ../samples/Docker_Cheat_Sheet.pdf
+//!
+//! # Or with defaults (edit the code to set your key/endpoint):
+//! cargo run --example index_pdf -- ../samples/Docker_Cheat_Sheet.pdf
+//! ```
+
+use std::path::Path;
+
+use vectorless::{EngineBuilder, IndexContext};
+
+#[tokio::main]
+async fn main() -> vectorless::Result<()> {
+    // Initialize tracing so we can see pipeline logs.
+    // Set RUST_LOG=info or RUST_LOG=debug for more detail.
+    tracing_subscriber::fmt::init();
+
+    let args: Vec<String> = std::env::args().collect();
+
+    let pdf_path = args.get(1).map(|s| s.as_str()).unwrap_or_else(|| {
+        eprintln!("Usage: cargo run --example index_pdf -- <path-to-pdf>");
+        std::process::exit(1);
+    });
+
+    if !Path::new(pdf_path).exists() {
+        eprintln!("Error: file not found: {}", pdf_path);
+        std::process::exit(1);
+    }
+
+    println!("=== Indexing PDF: {} ===\n", pdf_path);
+
+    // Build engine with LLM configuration from environment or defaults. 
+    let api_key = std::env::var("LLM_API_KEY")
+        .unwrap_or_else(|_| "sk-or-v1-...".to_string());
+    let model = std::env::var("LLM_MODEL")
+        .unwrap_or_else(|_| "google/gemini-3-flash-preview".to_string());
+    let endpoint = std::env::var("LLM_ENDPOINT")
+        .unwrap_or_else(|_| "http://localhost:4000/api/v1".to_string());
+
+    tracing::info!(
+        "LLM config — key: {}..., model: {}, endpoint: {}",
+        &api_key[..api_key.len().min(8)],
+        model,
+        endpoint
+    );
+
+    let engine = EngineBuilder::new()
+        .with_workspace("./workspace_pdf_example")
+        .with_key(&api_key)
+        .with_model(&model)
+        .with_endpoint(&endpoint)
+        .build()
+        .await
+        .map_err(|e| vectorless::Error::Config(e.to_string()))?;
+
+    let result = engine
+        .index(IndexContext::from_path(pdf_path))
+        .await?;
+
+    println!(
+        "Indexed: {}, Failed: {}",
+        result.items.len(),
+        result.failed.len()
+    );
+
+    for item in &result.items {
+        println!("\n--- {} ---", item.name);
+        println!("doc_id: {}", item.doc_id);
+        println!("format: {:?}", item.format);
+
+        if let Some(metrics) = &item.metrics {
+            println!("\nMetrics:");
+            println!("  total time: {}ms", metrics.total_time_ms());
+            println!("  parse: {}ms", metrics.parse_time_ms);
+            println!("  build: {}ms", metrics.build_time_ms);
+            println!("  enhance: {}ms", metrics.enhance_time_ms);
+            println!("  nodes: {}", metrics.nodes_processed);
+            println!("  summaries: {}", metrics.summaries_generated);
+            println!("  llm calls: {}", metrics.llm_calls);
+            println!("  tokens: {}", metrics.total_tokens_generated);
+            println!("  topics: {}", metrics.topics_indexed);
+            println!("  keywords: {}", metrics.keywords_indexed);
+
+            if metrics.llm_calls == 0 {
+                println!("\n  *** WARNING: No LLM calls were made. ***");
+                println!("  Set RUST_LOG=info to see pipeline logs:");
+                println!("  RUST_LOG=info cargo run --example index_pdf -- <path-to-pdf>");
+                println!("  Check LLM_API_KEY, LLM_MODEL, and LLM_ENDPOINT are valid.");
+            }
+        }
+    }
+
+    for fail in &result.failed {
+        eprintln!("FAILED: {} — {}", fail.source, fail.error);
+    }
+
+    // Cleanup workspace: remove the indexed documents after the run
+    for doc in engine.list().await? {
+        engine.remove(&doc.id).await?;
+    }
+
+    Ok(())
+}
diff --git a/rust/examples/index_single.rs b/rust/examples/index_single.rs
index 3a5632f0..55ec52d5 100644
--- a/rust/examples/index_single.rs
+++ b/rust/examples/index_single.rs
@@ -4,6 +4,11 @@
 //! Single document indexing example — index one document from content.
 //!
 //! ```bash
+//! # Using environment variables for LLM config:
+//! LLM_API_KEY=sk-xxx LLM_MODEL=google/gemini-3-flash-preview \
+//! LLM_ENDPOINT=http://localhost:4000/api/v1 cargo run --example index_single
+//!
+//! # Or with defaults (edit the code to set your key/endpoint):
 //! cargo run --example index_single
 //! ```
 
@@ -11,11 +16,23 @@ use vectorless::{DocumentFormat, EngineBuilder, IndexContext};
 
 #[tokio::main]
 async fn main() -> vectorless::Result<()> {
+    // Initialize tracing for debug output (set RUST_LOG=debug to see more)
+    tracing_subscriber::fmt::init();
+
+    // Build engine with LLM configuration from environment or defaults.
+    // Adjust the defaults below to match your setup. 
+ let api_key = std::env::var("LLM_API_KEY") + .unwrap_or_else(|_| "sk-or-v1-...".to_string()); + let model = std::env::var("LLM_MODEL") + .unwrap_or_else(|_| "google/gemini-3-flash-preview".to_string()); + let endpoint = std::env::var("LLM_ENDPOINT") + .unwrap_or_else(|_| "http://localhost:4000/api/v1".to_string()); + let engine = EngineBuilder::new() .with_workspace("./workspace_single_example") - .with_key("sk-or-v1-...") - .with_model("google/gemini-3-flash-preview") - .with_endpoint("http://localhost:4000/api/v1") + .with_key(&api_key) + .with_model(&model) + .with_endpoint(&endpoint) .build() .await .map_err(|e| vectorless::Error::Config(e.to_string()))?; @@ -69,21 +86,10 @@ Monitoring is implemented using a Prometheus and Grafana stack, with custom metr println!("name: {}", item.name); println!("format: {:?}", item.format); - if let Some(metrics) = &item.metrics { - println!(" metrics:"); - println!(" total time: {}ms", metrics.total_time_ms()); - println!(" parse: {}ms", metrics.parse_time_ms); - println!(" build: {}ms", metrics.build_time_ms); - println!(" enhance: {}ms", metrics.enhance_time_ms); - println!(" enrich: {}ms", metrics.enrich_time_ms); - println!(" optimize: {}ms", metrics.optimize_time_ms); - println!(" reasoning: {}ms", metrics.reasoning_index_time_ms); - println!(" nodes: {}", metrics.nodes_processed); - println!(" summaries: {}", metrics.summaries_generated); - println!(" llm calls: {}", metrics.llm_calls); - println!(" tokens: {}", metrics.total_tokens_generated); - println!(" topics: {}", metrics.topics_indexed); - println!(" keywords: {}", metrics.keywords_indexed); + if let Some(ref metrics) = item.metrics { + println!("time: {}ms", metrics.total_time_ms()); + println!("nodes: {}", metrics.nodes_processed); + println!("tokens: {}", metrics.total_tokens_generated); } } @@ -93,4 +99,4 @@ Monitoring is implemented using a Prometheus and Grafana stack, with custom metr } Ok(()) -} +} \ No newline at end of file diff --git a/rust/examples/indexing.rs b/rust/examples/indexing.rs index 53d8fe92..e4489d29 100644 --- a/rust/examples/indexing.rs +++ b/rust/examples/indexing.rs @@ -1,9 +1,14 @@ // Copyright (c) 2026 vectorless developers // SPDX-License-Identifier: Apache-2.0 -//! Batch indexing example — index multiple documents at once. +//! Batch indexing example — index multiple documents via the vectorless engine. //! //! ```bash +//! # Using environment variables for LLM config: +//! LLM_API_KEY=sk-xxx LLM_MODEL=google/gemini-3-flash-preview \ +//! LLM_ENDPOINT=http://localhost:4000/api/v1 cargo run --example indexing +//! +//! # Or with defaults (edit the code to set your key/endpoint): //! cargo run --example indexing //! ``` @@ -11,30 +16,41 @@ use vectorless::{EngineBuilder, IndexContext}; #[tokio::main] async fn main() -> vectorless::Result<()> { + // Initialize tracing for debug output (set RUST_LOG=debug to see more) + tracing_subscriber::fmt::init(); + + // Build engine with LLM configuration from environment or defaults. + // Adjust the defaults below to match your setup. 
+    let api_key = std::env::var("LLM_API_KEY")
+        .unwrap_or_else(|_| "sk-or-v1-...".to_string());
+    let model = std::env::var("LLM_MODEL")
+        .unwrap_or_else(|_| "google/gemini-3-flash-preview".to_string());
+    let endpoint = std::env::var("LLM_ENDPOINT")
+        .unwrap_or_else(|_| "http://localhost:4000/api/v1".to_string());
+
     let engine = EngineBuilder::new()
         .with_workspace("./workspace_batch_example")
-        .with_key("sk-or-v1-...")
-        .with_model("google/gemini-3-flash-preview")
-        .with_endpoint("http://localhost:4000/api/v1")
+        .with_key(&api_key)
+        .with_model(&model)
+        .with_endpoint(&endpoint)
         .build()
         .await
         .map_err(|e| vectorless::Error::Config(e.to_string()))?;
 
-    // Index multiple files from different paths
+    // Index multiple documents in a single call.
+    // Paths are resolved relative to the workspace directory.
     let result = engine
-        .index(IndexContext::from_paths(&[
-            "../README.md",
-            "../CLAUDE.md",
-            "../LICENSE",
-        ]))
+        .index(IndexContext::from_paths(&["../README.md", "../CLAUDE.md"]))
         .await?;
 
-    println!("indexed: {}, failed: {}", result.items.len(), result.failed.len());
+    println!("Indexed {} document(s)", result.items.len());
     for item in &result.items {
-        println!("  {} — doc_id: {}", item.name, item.doc_id);
-    }
-    for fail in &result.failed {
-        println!("  FAILED: {} — {}", fail.source, fail.error);
+        println!("  - {} ({})", item.name, item.doc_id);
+        if let Some(metrics) = &item.metrics {
+            println!("    Time: {}ms", metrics.total_time_ms());
+            println!("    Nodes: {}", metrics.nodes_processed);
+        }
     }
 
     // Cleanup
@@ -43,4 +59,4 @@ async fn main() -> vectorless::Result<()> {
     }
 
     Ok(())
-}
+}
\ No newline at end of file
diff --git a/rust/src/index/parse/mod.rs b/rust/src/index/parse/mod.rs
index 9fd5a042..0bcba9f4 100644
--- a/rust/src/index/parse/mod.rs
+++ b/rust/src/index/parse/mod.rs
@@ -27,9 +27,14 @@ use std::path::Path;
 
 use crate::error::Result;
 use crate::index::parse::markdown::MarkdownParser;
+use crate::llm::LlmClient;
 
 /// Parse a string content document.
-pub async fn parse_content(content: &str, format: DocumentFormat) -> Result<ParseResult> {
+pub async fn parse_content(
+    content: &str,
+    format: DocumentFormat,
+    _llm_client: Option<LlmClient>,
+) -> Result<ParseResult> {
     match format {
         DocumentFormat::Markdown => {
             let parser = MarkdownParser::new();
@@ -42,21 +47,32 @@ pub async fn parse_content(content: &str, format: DocumentFormat) -> Result<ParseResult>
 }
 
 /// Parse a file.
-pub async fn parse_file(path: &Path, format: DocumentFormat) -> Result<ParseResult> {
+pub async fn parse_file(
+    path: &Path,
+    format: DocumentFormat,
+    llm_client: Option<LlmClient>,
+) -> Result<ParseResult> {
     match format {
         DocumentFormat::Markdown => {
            let parser = MarkdownParser::new();
            parser.parse_file(path).await
        }
        DocumentFormat::Pdf => {
-            let parser = pdf::PdfParser::new();
+            let parser = match llm_client {
+                Some(client) => pdf::PdfParser::with_llm_client(client),
+                None => pdf::PdfParser::new(),
+            };
            parser.parse_file(path).await
        }
    }
 }
 
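All three entry points now take the LLM client as an explicit `Option`, so callers that don't hold one pay nothing. A hypothetical call site for the new signature (the wrapper name and extension check below are illustrative, not from this diff):

```rust
// Sketch of a caller threading an optional LLM client into parse_file.
// Markdown parsing ignores the client; PDF parsing uses it for LLM-backed
// TOC extraction when present, and falls back to plain parsing otherwise.
async fn parse_document(
    path: &Path,
    engine_llm_client: Option<LlmClient>,
) -> Result<ParseResult> {
    let format = if path
        .extension()
        .map_or(false, |e| e.eq_ignore_ascii_case("pdf"))
    {
        DocumentFormat::Pdf
    } else {
        DocumentFormat::Markdown
    };
    parse_file(path, format, engine_llm_client).await
}
```
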
 /// Parse binary data.
-pub async fn parse_bytes(bytes: &[u8], format: DocumentFormat) -> Result<ParseResult> {
+pub async fn parse_bytes(
+    bytes: &[u8],
+    format: DocumentFormat,
+    llm_client: Option<LlmClient>,
+) -> Result<ParseResult> {
     match format {
         DocumentFormat::Markdown => {
             let content = std::str::from_utf8(bytes)
@@ -65,7 +81,10 @@ pub async fn parse_bytes(bytes: &[u8], format: DocumentFormat) -> Result<ParseResult>
         }
         DocumentFormat::Pdf => {
-            let parser = pdf::PdfParser::new();
+            let parser = match llm_client {
+                Some(client) => pdf::PdfParser::with_llm_client(client),
+                None => pdf::PdfParser::new(),
+            };
             parser.parse_bytes_async(bytes, None).await
         }
     }
 }
diff --git a/rust/src/index/parse/pdf/parser.rs b/rust/src/index/parse/pdf/parser.rs
index 4684ae1a..7702872b 100644
--- a/rust/src/index/parse/pdf/parser.rs
+++ b/rust/src/index/parse/pdf/parser.rs
@@ -1,7 +1,11 @@
 // Copyright (c) 2026 vectorless developers
 // SPDX-License-Identifier: Apache-2.0
 
-//! PDF document parser using lopdf.
+//! PDF document parser.
+//!
+//! Uses [`pdf_extract`] for reliable text extraction (handles CJK, ToUnicode
+//! CMap, font encoding, etc.) and [`lopdf`] only for metadata extraction from
+//! the PDF Info dictionary.
 
 use std::path::Path;
 
@@ -11,14 +15,16 @@ use tracing::{info, warn};
 use crate::Error;
 use crate::error::Result;
 use crate::index::parse::toc::TocProcessor;
+use crate::llm::LlmClient;
 
 use super::types::{PdfMetadata, PdfPage, PdfParseResult};
 use crate::index::parse::{DocumentFormat, DocumentMeta, ParseResult, RawNode};
 
 /// PDF document parser.
-#[derive(Debug, Clone)]
 pub struct PdfParser {
     config: PdfParserConfig,
+    /// Optional LLM client for TOC extraction and structure analysis.
+    llm_client: Option<LlmClient>,
 }
 
 /// PDF parser configuration.
@@ -35,7 +41,7 @@ impl Default for PdfParserConfig {
     fn default() -> Self {
         Self {
             max_pages: 0,
-            extract_toc: true, // Default enabled
+            extract_toc: true,
         }
     }
 }
@@ -46,17 +52,31 @@ impl PdfParser {
         Self::default()
     }
 
+    /// Create a PDF parser with an externally provided LLM client.
+    pub fn with_llm_client(client: LlmClient) -> Self {
+        Self {
+            config: PdfParserConfig::default(),
+            llm_client: Some(client),
+        }
+    }
+
     /// Create a parser with custom configuration.
     pub fn with_config(config: PdfParserConfig) -> Self {
-        Self { config }
+        Self {
+            config,
+            llm_client: None,
+        }
     }
 
     /// Create a parser without TOC extraction.
     pub fn without_toc() -> Self {
-        Self::with_config(PdfParserConfig {
-            extract_toc: false,
-            ..Default::default()
-        })
+        Self {
+            config: PdfParserConfig {
+                extract_toc: false,
+                ..Default::default()
+            },
+            llm_client: None,
+        }
     }
 
     /// Parse PDF from bytes and return raw pages.
@@ -65,19 +85,42 @@ impl PdfParser {
         bytes: &[u8],
         filename: Option<&str>,
     ) -> Result<PdfParseResult> {
-        let doc = LopdfDocument::load_mem(bytes)
-            .map_err(|e| Error::Parse(format!("Failed to parse PDF: {}", e)))?;
+        // Use pdf-extract for text (handles CJK, ToUnicode CMap, etc.)
+        let pages = self.extract_pages(bytes)?;
+
+        // Use lopdf only for metadata; fall back gracefully if it fails
+        let metadata = match LopdfDocument::load_mem(bytes) {
+            Ok(doc) => self.extract_metadata(&doc, filename),
+            Err(_) => PdfMetadata {
+                title: filename.unwrap_or("Document").to_string(),
+                page_count: pages.len(),
+                ..Default::default()
+            },
+        };
 
-        // Extract metadata
-        let metadata = self.extract_metadata(&doc, filename);
+        Ok(PdfParseResult::new(metadata, pages))
+    }
 
-        // Extract pages
-        let pages = self.extract_pages(&doc)?;
+    /// Extract text from all pages using pdf-extract. 
+    fn extract_pages(&self, bytes: &[u8]) -> Result<Vec<PdfPage>> {
+        let page_texts = pdf_extract::extract_text_from_mem_by_pages(bytes)
+            .map_err(|e| Error::Parse(format!("pdf-extract failed: {}", e)))?;
 
-        Ok(PdfParseResult::new(metadata, pages))
+        let mut pages = Vec::new();
+        for (i, text) in page_texts.iter().enumerate() {
+            if self.config.max_pages > 0 && i >= self.config.max_pages {
+                break;
+            }
+            let page_num = i + 1; // 1-based
+            if !text.trim().is_empty() {
+                pages.push(PdfPage::new(page_num, text.clone()));
+            }
+        }
+
+        Ok(pages)
     }
 
-    /// Extract metadata from PDF document.
+    /// Extract metadata from PDF Info dictionary via lopdf.
     fn extract_metadata(&self, doc: &LopdfDocument, filename: Option<&str>) -> PdfMetadata {
         let mut metadata = PdfMetadata {
             title: filename.unwrap_or("Document").to_string(),
@@ -85,26 +128,22 @@ impl PdfParser {
             ..Default::default()
         };
 
-        // Try to extract metadata from Info dictionary
         if let Ok(info) = doc.trailer.get(b"Info") {
             if let Ok(info_ref) = info.as_reference() {
                 if let Ok(info_obj) = doc.get_object(info_ref) {
                     if let Ok(dict) = info_obj.as_dict() {
-                        // Title
                         if let Ok(title_obj) = dict.get(b"Title") {
                             if let Ok(title) = title_obj.as_str() {
                                 metadata.title = self.decode_pdf_string(title);
                             }
                         }
 
-                        // Author
                         if let Ok(author_obj) = dict.get(b"Author") {
                             if let Ok(author) = author_obj.as_str() {
                                 metadata.author = Some(self.decode_pdf_string(author));
                             }
                         }
 
-                        // Subject
                         if let Ok(subject_obj) = dict.get(b"Subject") {
                             if let Ok(subject) = subject_obj.as_str() {
                                 metadata.subject = Some(self.decode_pdf_string(subject));
@@ -118,158 +157,9 @@ impl PdfParser {
         metadata
     }
 
-    /// Extract text from all pages.
-    fn extract_pages(&self, doc: &LopdfDocument) -> Result<Vec<PdfPage>> {
-        let page_map = doc.get_pages();
-        let mut pages = Vec::new();
-
-        for (i, (page_num, object_id)) in page_map.iter().enumerate() {
-            // Check max pages limit
-            if self.config.max_pages > 0 && i >= self.config.max_pages {
-                break;
-            }
-
-            let text = self.extract_page_text(doc, *object_id, *page_num as usize);
-
-            // Skip empty pages
-            if !text.trim().is_empty() {
-                pages.push(PdfPage::new(*page_num as usize, text));
-            }
-        }
-
-        Ok(pages)
-    }
-
-    /// Extract text from a single page.
-    fn extract_page_text(
-        &self,
-        doc: &LopdfDocument,
-        object_id: lopdf::ObjectId,
-        _page_num: usize,
-    ) -> String {
-        let mut text = String::new();
-
-        if let Ok(page_obj) = doc.get_object(object_id) {
-            if let Ok(page_dict) = page_obj.as_dict() {
-                if let Ok(contents) = page_dict.get(b"Contents") {
-                    match contents {
-                        lopdf::Object::Reference(ref_id) => {
-                            if let Ok(content_obj) = doc.get_object(*ref_id) {
-                                if let Ok(stream) = content_obj.as_stream() {
-                                    text = self.decode_stream_content(stream);
-                                }
-                            }
-                        }
-                        lopdf::Object::Array(arr) => {
-                            for obj in arr {
-                                if let Ok(ref_id) = obj.as_reference() {
-                                    if let Ok(content_obj) = doc.get_object(ref_id) {
-                                        if let Ok(stream) = content_obj.as_stream() {
-                                            let content = self.decode_stream_content(stream);
-                                            if !text.is_empty() {
-                                                text.push('\n');
-                                            }
-                                            text.push_str(&content);
-                                        }
-                                    }
-                                }
-                            }
-                        }
-                        _ => {}
-                    }
-                }
-            }
-        }
-
-        // Post-process text
-        self.post_process_text(&text)
-    }
-
-    /// Decode stream content to text.
-    fn decode_stream_content(&self, stream: &lopdf::Stream) -> String {
-        // Try to decode the stream
-        if let Ok(content) = stream.decompressed_content() {
-            self.extract_text_from_content(&content)
-        } else {
-            self.extract_text_from_content(&stream.content)
-        }
-    }
-
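Everything deleted below is the hand-rolled lopdf content-stream scanning that the single `pdf_extract` call above replaces. For a standalone sanity check of that call, a minimal sketch (assuming a pdf-extract version that exposes `extract_text_from_mem_by_pages`, and a hypothetical `sample.pdf`):

```rust
// Standalone check of the pdf-extract call used by the new extract_pages.
fn main() -> Result<(), Box<dyn std::error::Error>> {
    let bytes = std::fs::read("sample.pdf")?;
    // One String per page, in document order; extract_pages derives the
    // 1-based page number from the position (i + 1), as above.
    let pages: Vec<String> = pdf_extract::extract_text_from_mem_by_pages(&bytes)?;
    for (i, text) in pages.iter().enumerate() {
        println!("page {}: {} chars", i + 1, text.chars().count());
    }
    Ok(())
}
```
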
-    /// Extract text from PDF content stream (simplified).
-    fn extract_text_from_content(&self, content: &[u8]) -> String {
-        let content_str = String::from_utf8_lossy(content);
-        let mut text = String::new();
-
-        for line in content_str.lines() {
-            let line = line.trim();
-
-            // Tj operator: (text) Tj
-            if line.ends_with("Tj") {
-                if let Some(text_part) = self.extract_parentheses_text(line) {
-                    text.push_str(&text_part);
-                }
-            }
-            // TJ operator: [(text) ...] TJ
-            else if line.ends_with("TJ") {
-                if let Some(text_parts) = self.extract_array_text(line) {
-                    text.push_str(&text_parts);
-                }
-            }
-        }
-
-        text
-    }
-
-    /// Extract text from parentheses in Tj operator.
-    fn extract_parentheses_text(&self, line: &str) -> Option<String> {
-        let start = line.find('(')?;
-        let end = line.rfind(')')?;
-        if end > start {
-            let raw = &line[start + 1..end];
-            Some(self.decode_pdf_string(raw.as_bytes()))
-        } else {
-            None
-        }
-    }
-
-    /// Extract text from array in TJ operator.
-    fn extract_array_text(&self, line: &str) -> Option<String> {
-        let start = line.find('[')?;
-        let end = line.rfind(']')?;
-        if end > start {
-            let content = &line[start + 1..end];
-            let mut text = String::new();
-
-            let mut in_parens = false;
-            let mut current = String::new();
-
-            for ch in content.chars() {
-                match ch {
-                    '(' => {
-                        in_parens = true;
-                        current.clear();
-                    }
-                    ')' => {
-                        if in_parens {
-                            text.push_str(&self.decode_pdf_string(current.as_bytes()));
-                        }
-                        in_parens = false;
-                    }
-                    _ => {
-                        if in_parens {
-                            current.push(ch);
-                        }
-                    }
-                }
-            }
-
-            Some(text)
-        } else {
-            None
-        }
-    }
-
-    /// Decode PDF string (handle escape sequences).
+    /// Decode PDF string literal (handles escape sequences).
+    ///
+    /// Used only for metadata field values extracted via lopdf.
     fn decode_pdf_string(&self, bytes: &[u8]) -> String {
         let mut result = String::new();
         let mut i = 0;
@@ -299,26 +189,6 @@ impl PdfParser {
         result
     }
 
-    /// Post-process extracted text.
-    fn post_process_text(&self, text: &str) -> String {
-        let mut result = String::new();
-        let mut prev_space = false;
-
-        for ch in text.chars() {
-            if ch.is_whitespace() {
-                if !prev_space {
-                    result.push(' ');
-                    prev_space = true;
-                }
-            } else {
-                result.push(ch);
-                prev_space = false;
-            }
-        }
-
-        result.trim().to_string()
-    }
-
     /// Convert TOC entries to RawNodes. 
fn toc_entries_to_raw_nodes( &self, @@ -328,7 +198,6 @@ impl PdfParser { let mut nodes = Vec::new(); for entry in entries { - // Get content from the page range let content = self.get_content_for_entry(entry, pages); let mut node = RawNode::new(&entry.title) @@ -353,12 +222,10 @@ impl PdfParser { ) -> String { let start_page = entry.physical_page.unwrap_or(1); - // Find content on this page pages .iter() .find(|p| p.number == start_page) .map(|p| { - // Try to find the title position and extract content after it let text = &p.text; if let Some(pos) = text.find(&entry.title) { text[pos + entry.title.len()..].trim().to_string() @@ -423,7 +290,16 @@ impl PdfParser { let nodes = if self.config.extract_toc { info!("Extracting TOC from PDF with {} pages", page_count); - let processor = TocProcessor::new(); + let processor = match &self.llm_client { + Some(client) => { + info!("PdfParser: creating TocProcessor with LLM client"); + TocProcessor::with_llm_client(client.clone()) + } + None => { + info!("PdfParser: creating TocProcessor without LLM client (no key configured)"); + TocProcessor::new() + } + }; match processor.process(&result.pages).await { Ok(entries) if !entries.is_empty() => { info!("Extracted {} TOC entries", entries.len()); @@ -445,7 +321,6 @@ impl PdfParser { self.pages_to_raw_nodes(&result.pages) }; - // Build metadata let meta = DocumentMeta { name: result.metadata.title, format: DocumentFormat::Pdf, @@ -486,15 +361,4 @@ mod tests { let decoded = parser.decode_pdf_string(b"Hello\\nWorld"); assert_eq!(decoded, "Hello\nWorld"); } - - #[test] - fn test_post_process_text() { - let parser = PdfParser::new(); - - let processed = parser.post_process_text("Hello World"); - assert_eq!(processed, "Hello World"); - - let processed = parser.post_process_text(" Hello World "); - assert_eq!(processed, "Hello World"); - } } diff --git a/rust/src/index/parse/toc/assigner.rs b/rust/src/index/parse/toc/assigner.rs index fc97c420..beff3021 100644 --- a/rust/src/index/parse/toc/assigner.rs +++ b/rust/src/index/parse/toc/assigner.rs @@ -4,6 +4,7 @@ //! Page assigner - assigns physical page numbers to TOC entries. use std::collections::HashMap; +use futures::future::join_all; use tracing::{debug, info}; use crate::config::LlmConfig; @@ -49,6 +50,14 @@ impl PageAssigner { Self { config, client } } + /// Create an assigner with an externally provided LLM client. + pub fn with_client(client: LlmClient) -> Self { + Self { + config: PageAssignerConfig::default(), + client, + } + } + /// Create an assigner with default configuration. pub fn with_defaults() -> Self { Self::new(PageAssignerConfig::default()) @@ -121,7 +130,7 @@ impl PageAssigner { .collect() } - /// Calculate page offset by verifying anchors. + /// Calculate page offset by verifying anchors concurrently. async fn calculate_offset( &self, anchors: Vec<&TocEntry>, @@ -132,26 +141,41 @@ impl PageAssigner { } let anchor_count = anchors.len(); - let mut verified_offsets: Vec<(i32, bool)> = Vec::new(); - - for anchor in anchors { - let toc_page = anchor.toc_page.unwrap(); - - // Find the physical page where this title appears - if let Some(physical) = self - .locate_title_in_range(anchor.title.as_str(), pages, toc_page) - .await? 
-            {
-                let offset = physical as i32 - toc_page as i32;
-                verified_offsets.push((offset, true));
-                debug!(
-                    "Anchor '{}' found: toc={}, physical={}, offset={}",
-                    anchor.title, toc_page, physical, offset
-                );
-            } else {
-                verified_offsets.push((0, false));
-            }
-        }
+
+        // Verify all anchors concurrently
+        let client = self.client.clone();
+        let pages_owned = pages.to_vec();
+        let futures: Vec<_> = anchors
+            .into_iter()
+            .map(|anchor| {
+                let title = anchor.title.clone();
+                let toc_page = anchor.toc_page.unwrap();
+                let client = client.clone();
+                let pages = pages_owned.clone();
+
+                async move {
+                    let range_pages = Self::pages_around(&pages, toc_page, 3);
+                    if range_pages.is_empty() {
+                        return (0, false);
+                    }
+
+                    let content = Self::format_range_pages(&range_pages);
+                    match Self::locate_with_client(&client, &title, &content).await {
+                        Ok(Some(physical)) => {
+                            let offset = physical as i32 - toc_page as i32;
+                            debug!(
+                                "Anchor '{}' found: toc={}, physical={}, offset={}",
+                                title, toc_page, physical, offset
+                            );
+                            (offset, true)
+                        }
+                        _ => (0, false),
+                    }
+                }
+            })
+            .collect();
+
+        let verified_offsets = join_all(futures).await;
 
         // Calculate the mode (most common offset)
         let successful: Vec<_> = verified_offsets
@@ -164,7 +188,7 @@ impl PageAssigner {
             return Ok(PageOffset::new(0, 0, 0.0));
         }
 
-        let mode = self.calculate_mode(&successful);
+        let mode = Self::calculate_mode_static(&successful);
         let sample_count = successful.len();
         let confidence = sample_count as f32 / anchor_count as f32;
 
@@ -173,6 +197,11 @@ impl PageAssigner {
 
     /// Calculate mode of offset values.
     fn calculate_mode(&self, values: &[i32]) -> i32 {
+        Self::calculate_mode_static(values)
+    }
+
+    /// Static version for use in concurrent contexts.
+    fn calculate_mode_static(values: &[i32]) -> i32 {
         let mut counts: HashMap<i32, usize> = HashMap::new();
         for &v in values {
             *counts.entry(v).or_insert(0) += 1;
         }
@@ -184,25 +213,18 @@ impl PageAssigner {
             .unwrap_or(0)
     }
 
-    /// Locate a title in a range of pages using LLM.
-    async fn locate_title_in_range(
-        &self,
-        title: &str,
-        pages: &[PdfPage],
-        near_page: usize,
-    ) -> Result<Option<usize>> {
-        // Search in a range around the expected page
-        let start = (near_page.saturating_sub(3)).max(1);
-        let end = (near_page + 3).min(pages.len());
-
-        let range_pages: Vec<_> = (start..=end).filter_map(|i| pages.get(i - 1)).collect();
-
-        if range_pages.is_empty() {
-            return Ok(None);
-        }
+    /// Collect pages around a center page number.
+    fn pages_around(pages: &[PdfPage], center: usize, range: usize) -> Vec<PdfPage> {
+        let start = center.saturating_sub(range).max(1);
+        let end = (center + range).min(pages.len());
+        (start..=end)
+            .filter_map(|i| pages.get(i - 1).cloned())
+            .collect()
+    }
 
-        // Use LLM to find the exact page
-        let content = range_pages
+    /// Format pages into tagged text for LLM.
+    fn format_range_pages(pages: &[PdfPage]) -> String {
+        pages
             .iter()
             .map(|p| {
                 format!(
                    "<page {}>\n{}\n</page {}>",
                    p.number,
                    p.text.trim(),
                    p.number
                )
            })
             .collect::<Vec<_>>()
-            .join("\n\n");
+            .join("\n\n")
+    }
 
+    /// Locate a title in pre-formatted content using LLM (static, for concurrent use).
+    async fn locate_with_client(
+        client: &LlmClient,
+        title: &str,
+        content: &str,
+    ) -> Result<Option<usize>> {
         let system = "You are a document analysis assistant. 
Find which page contains a specific section title.";
         let user = format!(
             r#"Find which page contains the section titled: "{}"
 
@@ -232,21 +261,37 @@ Reply in JSON format:
             page: Option<usize>,
         }
 
-        let result: LocateResult = self.client.complete_json(system, &user).await?;
+        let result: LocateResult = client.complete_json(system, &user).await?;
         Ok(result.page)
     }
 
-    /// Assign pages using LLM for each entry.
+    /// Assign pages using LLM for each entry (concurrently).
     async fn assign_with_llm(&self, entries: &mut [TocEntry], pages: &[PdfPage]) -> Result<()> {
         info!("Assigning pages using LLM positioning");
 
-        // Group pages for efficient processing
-        let page_groups = self.group_pages(pages, 5);
+        let client = self.client.clone();
+        let pages_owned = pages.to_vec();
 
-        for entry in entries.iter_mut() {
-            let physical = self
-                .locate_title_in_groups(entry.title.as_str(), &page_groups)
-                .await?;
+        // Launch all entry searches concurrently
+        let futures: Vec<_> = entries
+            .iter()
+            .map(|entry| {
+                let title = entry.title.clone();
+                let client = client.clone();
+                let pages = pages_owned.clone();
+
+                async move {
+                    let groups = Self::group_pages_owned(&pages, 5);
+                    Self::locate_title_in_groups_static(&client, &title, &groups).await
+                }
+            })
+            .collect();
+
+        let results = join_all(futures).await;
+
+        // Write results back
+        for (entry, result) in entries.iter_mut().zip(results.into_iter()) {
+            let physical = result?;
             entry.physical_page = physical;
             entry.confidence = if physical.is_some() { 0.8 } else { 0.3 };
         }
@@ -254,19 +299,22 @@ Reply in JSON format:
         Ok(())
     }
 
-    /// Group pages for batch processing.
-    fn group_pages<'a>(&self, pages: &'a [PdfPage], group_size: usize) -> Vec<Vec<&'a PdfPage>> {
+    /// Group owned pages for batch processing.
+    fn group_pages_owned(pages: &[PdfPage], group_size: usize) -> Vec<Vec<PdfPage>> {
         pages
             .chunks(group_size)
-            .map(|chunk| chunk.iter().collect())
+            .map(|chunk| chunk.to_vec())
             .collect()
     }
 
-    /// Locate a title across page groups.
-    async fn locate_title_in_groups(
-        &self,
+    /// Locate a title across page groups (static, for concurrent use).
+    ///
+    /// Searches groups sequentially (early return on first match),
+    /// but multiple title searches can run concurrently.
+    async fn locate_title_in_groups_static(
+        client: &LlmClient,
         title: &str,
-        groups: &[Vec<&PdfPage>],
+        groups: &[Vec<PdfPage>],
     ) -> Result<Option<usize>> {
         let system = "You are a document analysis assistant.
 Find which page contains a specific section title.";
@@ -301,7 +349,7 @@ Reply in JSON format:
             page: Option<usize>,
         }
 
-        let result: SearchResult = self.client.complete_json(system, &user).await?;
+        let result: SearchResult = client.complete_json(system, &user).await?;
 
         if result.found {
             return Ok(result.page);
diff --git a/rust/src/index/parse/toc/detector.rs b/rust/src/index/parse/toc/detector.rs
index f179c507..032a18af 100644
--- a/rust/src/index/parse/toc/detector.rs
+++ b/rust/src/index/parse/toc/detector.rs
@@ -74,6 +74,20 @@ impl TocDetector {
         }
     }
 
+    /// Create a detector with an externally provided LLM client.
+    pub fn with_client(config: TocDetectorConfig, client: LlmClient) -> Self {
+        let use_llm = config.use_llm_fallback;
+        Self {
+            config,
+            llm_client: if use_llm {
+                Some(client)
+            } else {
+                None
+            },
+            patterns: Self::build_patterns(),
+        }
+    }
+
     /// Create a detector with default configuration. 
pub fn with_defaults() -> Self { Self::new(TocDetectorConfig::default()) diff --git a/rust/src/index/parse/toc/mod.rs b/rust/src/index/parse/toc/mod.rs index a540cd1a..beac24d7 100644 --- a/rust/src/index/parse/toc/mod.rs +++ b/rust/src/index/parse/toc/mod.rs @@ -17,6 +17,7 @@ mod detector; mod parser; mod processor; mod repairer; +mod structure_extractor; mod types; mod verifier; diff --git a/rust/src/index/parse/toc/parser.rs b/rust/src/index/parse/toc/parser.rs index 20b61af2..06aaade3 100644 --- a/rust/src/index/parse/toc/parser.rs +++ b/rust/src/index/parse/toc/parser.rs @@ -47,6 +47,14 @@ impl TocParser { Self { config, client } } + /// Create a parser with an externally provided LLM client. + pub fn with_client(client: LlmClient) -> Self { + Self { + config: TocParserConfig::default(), + client, + } + } + /// Create a parser with default configuration. pub fn with_defaults() -> Self { Self::new(TocParserConfig::default()) diff --git a/rust/src/index/parse/toc/processor.rs b/rust/src/index/parse/toc/processor.rs index 79ef9a15..9ed2c95b 100644 --- a/rust/src/index/parse/toc/processor.rs +++ b/rust/src/index/parse/toc/processor.rs @@ -2,17 +2,24 @@ // SPDX-License-Identifier: Apache-2.0 //! TOC processor - integrates all TOC processing components. +//! +//! The processor orchestrates a multi-mode extraction pipeline with automatic +//! degradation: if one mode fails verification, it falls back to a lower-quality +//! but more reliable mode. +use futures::future::join_all; use tracing::{debug, info, warn}; use crate::error::Result; use crate::index::parse::pdf::PdfPage; +use crate::llm::LlmClient; use super::assigner::{PageAssigner, PageAssignerConfig}; use super::detector::{TocDetector, TocDetectorConfig}; use super::parser::{TocParser, TocParserConfig}; use super::repairer::{IndexRepairer, RepairerConfig}; -use super::types::{TocEntry, VerificationReport}; +use super::structure_extractor::{StructureExtractor, StructureExtractorConfig}; +use super::types::{ProcessingMode, TocEntry, VerificationReport}; use super::verifier::{IndexVerifier, VerifierConfig}; /// TOC processor configuration. @@ -33,11 +40,17 @@ pub struct TocProcessorConfig { /// Repairer configuration. pub repairer: RepairerConfig, - /// Accuracy threshold for acceptance. + /// Accuracy threshold for acceptance (0.0 - 1.0). pub accuracy_threshold: f32, - /// Maximum repair attempts. + /// Maximum repair attempts per verification cycle. pub max_repair_attempts: usize, + + /// Maximum page span for a single entry before recursive refinement. + pub max_pages_per_entry: usize, + + /// Maximum estimated tokens for a single entry before recursive refinement. + pub max_tokens_per_entry: usize, } impl Default for TocProcessorConfig { @@ -50,6 +63,8 @@ impl Default for TocProcessorConfig { repairer: RepairerConfig::default(), accuracy_threshold: 0.6, max_repair_attempts: 3, + max_pages_per_entry: 30, + max_tokens_per_entry: 20000, } } } @@ -64,6 +79,18 @@ impl Default for TocProcessorConfig { /// 4. **Assign** - Map TOC pages to physical pages /// 5. **Verify** - Sample verification of page assignments /// 6. **Repair** - Fix incorrect assignments (if needed) +/// 7. **Refine** - Sub-divide oversized entries (if needed) +/// +/// # Degradation Strategy +/// +/// The pipeline tries three modes in order of quality: +/// +/// 1. `TocWithPageNumbers` - TOC found with page numbers (offset calculation) +/// 2. `TocWithoutPageNumbers` - TOC found without page numbers (LLM positioning) +/// 3. 
`NoToc` - No TOC available (LLM structure extraction from content)
+///
+/// If a mode fails verification (accuracy < threshold), it automatically
+/// degrades to the next mode.
 ///
 /// # Example
 ///
@@ -73,11 +100,9 @@ impl Default for TocProcessorConfig {
 ///
 /// # #[tokio::main]
 /// # async fn main() -> vectorless::Result<()> {
-/// // Parse PDF
 /// let pdf_parser = PdfParser::new();
-/// let result = pdf_parser.parse_file("document.pdf".as_ref())?;
+/// let result = pdf_parser.parse_file("document.pdf".as_ref()).await?;
 ///
-/// // Extract TOC
 /// let processor = TocProcessor::new();
 /// let entries = processor.process(&result.pages).await?;
 ///
@@ -94,6 +119,8 @@ pub struct TocProcessor {
     assigner: PageAssigner,
     verifier: IndexVerifier,
     repairer: IndexRepairer,
+    /// Optional LLM client for StructureExtractor (no-TOC mode and refinement).
+    llm_client: Option<LlmClient>,
 }
 
 impl TocProcessor {
@@ -102,21 +129,42 @@ impl TocProcessor {
         Self::with_config(TocProcessorConfig::default())
     }
 
+    /// Create a TOC processor with an externally provided LLM client.
+    ///
+    /// All sub-components (detector, parser, assigner, verifier, repairer)
+    /// will use this client instead of creating their own from default config.
+    pub fn with_llm_client(client: LlmClient) -> Self {
+        info!("TocProcessor: created with external LLM client");
+        let config = TocProcessorConfig::default();
+        Self {
+            detector: TocDetector::with_client(config.detector.clone(), client.clone()),
+            parser: TocParser::with_client(client.clone()),
+            assigner: PageAssigner::with_client(client.clone()),
+            verifier: IndexVerifier::with_client(client.clone()),
+            repairer: IndexRepairer::with_client(client.clone()),
+            llm_client: Some(client),
+            config,
+        }
+    }
+
     /// Create a TOC processor with custom configuration.
     pub fn with_config(config: TocProcessorConfig) -> Self {
+        info!("TocProcessor: created with config (no external LLM client)");
         Self {
             detector: TocDetector::new(config.detector.clone()),
             parser: TocParser::new(config.parser.clone()),
             assigner: PageAssigner::new(config.assigner.clone()),
             verifier: IndexVerifier::new(config.verifier.clone()),
             repairer: IndexRepairer::new(config.repairer.clone()),
+            llm_client: None,
             config,
         }
     }
 
-    /// Process PDF pages and extract TOC.
+    /// Process PDF pages and extract hierarchical structure.
     ///
-    /// This is the main entry point for TOC extraction.
+    /// This is the main entry point. It detects TOC, selects the best
+    /// processing mode, and automatically degrades if needed.
     pub async fn process(&self, pages: &[PdfPage]) -> Result<Vec<TocEntry>> {
         if pages.is_empty() {
             return Ok(Vec::new());
@@ -126,45 +174,190 @@ impl TocProcessor {
 
         // Step 1: Detect TOC
         let detection = self.detector.detect(pages).await?;
-        if !detection.found {
+
+        // Step 2: Determine initial mode based on detection result
+        let initial_mode = if !detection.found {
             info!("No TOC found in document");
-            return self.process_without_toc(pages).await;
+            ProcessingMode::NoToc
+        } else if detection.has_page_numbers {
+            info!(
+                "TOC found on pages {:?}, has page numbers",
+                detection.pages
+            );
+            ProcessingMode::TocWithPageNumbers
+        } else {
+            info!(
+                "TOC found on pages {:?}, no page numbers",
+                detection.pages
+            );
+            ProcessingMode::TocWithoutPageNumbers
+        };
+
+        // Step 3: Process with degradation
+        let entries = self
+            .process_with_degradation(initial_mode, &detection, pages)
+            .await?;
+
+        // Step 4: Refine oversized entries
+        self.refine_large_entries(entries, pages).await
+    }
+
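`ProcessingMode` and its `degrade()` helper live in `toc/types.rs`, which this diff doesn't touch. A minimal sketch of what the degradation loop below relies on, inferred from the mode list documented above (illustrative — the real definition may carry more data):

```rust
// Hypothetical reconstruction of ProcessingMode from toc/types.rs.
#[derive(Debug, Clone, Copy, PartialEq, Eq)]
pub enum ProcessingMode {
    /// TOC with page numbers: highest quality (offset calculation).
    TocWithPageNumbers,
    /// TOC without page numbers: LLM positioning per entry.
    TocWithoutPageNumbers,
    /// No TOC: LLM structure extraction from raw page content.
    NoToc,
}

impl ProcessingMode {
    /// Step down to the next lower-quality mode; None once at the bottom.
    pub fn degrade(self) -> Option<Self> {
        match self {
            Self::TocWithPageNumbers => Some(Self::TocWithoutPageNumbers),
            Self::TocWithoutPageNumbers => Some(Self::NoToc),
            Self::NoToc => None,
        }
    }
}
```
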
+    /// Process with automatic mode degradation.
+    ///
+    /// Tries the given mode, verifies the result, and degrades to a
+    /// lower-quality mode if accuracy is below threshold.
+    async fn process_with_degradation(
+        &self,
+        initial_mode: ProcessingMode,
+        detection: &super::types::TocDetection,
+        pages: &[PdfPage],
+    ) -> Result<Vec<TocEntry>> {
+        let mut mode = initial_mode;
+
+        loop {
+            info!("Attempting extraction with mode {:?}", mode);
+
+            let result = match mode {
+                ProcessingMode::TocWithPageNumbers => {
+                    self.process_toc_with_page_numbers(detection, pages).await
+                }
+                ProcessingMode::TocWithoutPageNumbers => {
+                    self.process_toc_without_page_numbers(detection, pages).await
+                }
+                ProcessingMode::NoToc => {
+                    // NoToc always succeeds (produces some structure)
+                    return self.process_without_toc(pages).await;
+                }
+            };
+
+            match result {
+                Ok(entries) if !entries.is_empty() => {
+                    // Verify the entries
+                    let mut mutable_entries = entries;
+                    let report = self
+                        .verify_and_repair(&mut mutable_entries, pages)
+                        .await?;
+
+                    if report.accuracy >= self.config.accuracy_threshold {
+                        info!(
+                            "Mode {:?} succeeded: {} entries, accuracy {:.1}%",
+                            mode,
+                            mutable_entries.len(),
+                            report.accuracy * 100.0
+                        );
+                        return Ok(mutable_entries);
+                    }
+
+                    // Accuracy too low, try degrading
+                    warn!(
+                        "Mode {:?} accuracy {:.1}% below threshold {:.1}%",
+                        mode,
+                        report.accuracy * 100.0,
+                        self.config.accuracy_threshold * 100.0
+                    );
+
+                    match mode.degrade() {
+                        Some(next) => {
+                            info!("Degrading from {:?} to {:?}", mode, next);
+                            mode = next;
+                            // Continue loop with degraded mode
+                        }
+                        None => {
+                            warn!("No further degradation possible, returning best effort");
+                            return Ok(mutable_entries);
+                        }
+                    }
+                }
+                Ok(_) => {
+                    // Empty entries, degrade
+                    warn!("Mode {:?} produced no entries", mode);
+                    match mode.degrade() {
+                        Some(next) => {
+                            mode = next;
+                        }
+                        None => return Ok(Vec::new()),
+                    }
+                }
+                Err(e) => {
+                    warn!("Mode {:?} failed: {}", mode, e);
+                    match mode.degrade() {
+                        Some(next) => {
+                            mode = next;
+                        }
+                        None => return Err(e),
+                    }
+                }
+            }
+        }
+    }
 
-        info!(
-            "TOC found on pages {:?}, has_page_numbers: {}",
-            detection.pages, detection.has_page_numbers
-        );
+    /// Mode 1: TOC with page numbers.
+    ///
+    /// Parse the TOC, calculate physical-page offset from anchor entries,
+    /// and apply the offset to all entries.
+    async fn process_toc_with_page_numbers(
+        &self,
+        detection: &super::types::TocDetection,
+        pages: &[PdfPage],
+    ) -> Result<Vec<TocEntry>> {
+        let toc_text = self.extract_toc_text(pages, &detection.pages);
+        if toc_text.trim().is_empty() {
+            return Ok(Vec::new());
+        }
 
-        // Step 2: Extract TOC text
+        let mut entries = self.parser.parse(&toc_text).await?;
+        if entries.is_empty() {
+            return Ok(Vec::new());
+        }
+
+        // Assign physical pages using offset calculation
+        self.assigner.assign(&mut entries, pages).await?;
+
+        Ok(entries)
+    }
+
+    /// Mode 2: TOC without page numbers.
+    ///
+    /// Parse the TOC, then use LLM to locate each entry in the document. 
+    async fn process_toc_without_page_numbers(
+        &self,
+        detection: &super::types::TocDetection,
+        pages: &[PdfPage],
+    ) -> Result<Vec<TocEntry>> {
         let toc_text = self.extract_toc_text(pages, &detection.pages);
         if toc_text.trim().is_empty() {
-            warn!("TOC text is empty, falling back to structure extraction");
-            return self.process_without_toc(pages).await;
+            return Ok(Vec::new());
         }
 
-        // Step 3: Parse TOC
         let mut entries = self.parser.parse(&toc_text).await?;
         if entries.is_empty() {
-            warn!("No entries parsed from TOC");
             return Ok(Vec::new());
         }
 
-        info!("Parsed {} TOC entries", entries.len());
+        // Clear any TOC page numbers (they're unreliable in this mode)
+        for entry in &mut entries {
+            entry.toc_page = None;
+        }
 
-        // Step 4: Assign physical pages
+        // Assign physical pages using LLM positioning
         self.assigner.assign(&mut entries, pages).await?;
 
-        // Step 5: Verify and repair
-        let report = self.verify_and_repair(&mut entries, pages).await?;
+        Ok(entries)
+    }
 
-        info!(
-            "TOC processing complete: {} entries, accuracy {:.1}%",
-            entries.len(),
-            report.accuracy * 100.0
-        );
+    /// Mode 3: No TOC available.
+    ///
+    /// Extract document structure directly from page content using LLM.
+    async fn process_without_toc(&self, pages: &[PdfPage]) -> Result<Vec<TocEntry>> {
+        info!("Extracting structure from page content (no TOC available)");
 
-        Ok(entries)
+        let extractor = match &self.llm_client {
+            Some(client) => {
+                StructureExtractor::with_client(StructureExtractorConfig::default(), client.clone())
+            }
+            None => StructureExtractor::new(StructureExtractorConfig::default()),
+        };
+        extractor.extract(pages).await
     }
 
     /// Extract TOC text from pages.
@@ -177,37 +370,6 @@ impl TocProcessor {
             .join("\n\n")
     }
 
-    /// Process document without TOC (structure extraction).
-    async fn process_without_toc(&self, pages: &[PdfPage]) -> Result<Vec<TocEntry>> {
-        warn!("Processing without TOC - this is a placeholder implementation");
-
-        // TODO: Implement structure extraction for documents without TOC
-        // For now, return a simple structure based on page count
-
-        let mut entries = Vec::new();
-
-        // Group pages into chunks
-        let chunk_size = 10;
-        for chunk in pages.chunks(chunk_size) {
-            let start_page = chunk.first().map(|p| p.number).unwrap_or(1);
-            let end_page = chunk.last().map(|p| p.number).unwrap_or(1);
-
-            let title = if chunk.len() == 1 {
-                format!("Page {}", start_page)
-            } else {
-                format!("Pages {}-{}", start_page, end_page)
-            };
-
-            entries.push(
-                TocEntry::new(title, 1)
-                    .with_physical_page(start_page)
-                    .with_confidence(0.5),
-            );
-        }
-
-        Ok(entries)
-    }
-
     /// Verify entries and repair if needed.
     async fn verify_and_repair(
         &self,
         entries: &mut [TocEntry],
         pages: &[PdfPage],
     ) -> Result<VerificationReport> {
         let mut attempts = 0;
 
         while attempts < self.config.max_repair_attempts {
-            // Verify
             let report = self.verifier.verify(entries, pages).await?;
 
             if report.accuracy >= self.config.accuracy_threshold {
@@ -232,7 +393,6 @@ impl TocProcessor {
                 return Ok(report);
             }
 
-            // Repair
             let repaired = self.repairer.repair(entries, &report.errors, pages).await?;
 
             if repaired == 0 {
@@ -244,9 +404,129 @@ impl TocProcessor {
             debug!("Repair attempt {} complete", attempts);
         }
 
-        // Final verification
         self.verifier.verify(entries, pages).await
     }
+
+    /// Refine oversized entries by extracting sub-structure.
+    ///
+    /// Entries that span too many pages or tokens are broken down using
+    /// the same structure extraction approach used for no-TOC documents. 
+    async fn refine_large_entries(
+        &self,
+        entries: Vec<TocEntry>,
+        pages: &[PdfPage],
+    ) -> Result<Vec<TocEntry>> {
+        if entries.is_empty() {
+            return Ok(entries);
+        }
+
+        let page_count = pages.len();
+
+        // Pre-compute next-entry page numbers and classify entries
+        let next_pages: Vec<Option<usize>> = entries
+            .iter()
+            .enumerate()
+            .map(|(i, _)| entries.get(i + 1).and_then(|e| e.physical_page))
+            .collect();
+
+        // Identify oversized entries and launch extractions concurrently
+        let llm_client = self.llm_client.clone();
+        let oversized_futures: Vec<_> = entries
+            .iter()
+            .enumerate()
+            .filter(|(i, entry)| {
+                let span = entry_page_span(entry, next_pages[*i], page_count);
+                let tokens = entry_token_count(entry, pages);
+                span > self.config.max_pages_per_entry
+                    && tokens > self.config.max_tokens_per_entry
+            })
+            .map(|(i, entry)| {
+                let start = entry.physical_page.unwrap_or(1);
+                let end = next_pages[i].unwrap_or(page_count);
+                let sub_pages: Vec<PdfPage> = pages
+                    .iter()
+                    .filter(|p| p.number >= start && p.number <= end)
+                    .cloned()
+                    .collect();
+
+                let entry_title = entry.title.clone();
+                let entry_level = entry.level;
+                let llm_client = llm_client.clone();
+
+                async move {
+                    if sub_pages.is_empty() {
+                        return (i, Vec::new());
+                    }
+                    debug!(
+                        "Refining oversized entry '{}' (pages {}-{})",
+                        entry_title, start, end
+                    );
+                    let extractor = match &llm_client {
+                        Some(client) => StructureExtractor::with_client(
+                            StructureExtractorConfig::default(),
+                            client.clone(),
+                        ),
+                        None => StructureExtractor::new(StructureExtractorConfig::default()),
+                    };
+                    match extractor.extract(&sub_pages).await {
+                        Ok(sub_entries) => {
+                            let skip = if sub_entries
+                                .first()
+                                .map(|e| e.title.trim() == entry_title.trim())
+                                .unwrap_or(false)
+                            {
+                                1
+                            } else {
+                                0
+                            };
+
+                            let refined: Vec<TocEntry> = sub_entries[skip..]
+                                .iter()
+                                .map(|sub| {
+                                    TocEntry::new(&sub.title, sub.level + entry_level)
+                                        .with_physical_page(sub.physical_page.unwrap_or(start))
+                                        .with_confidence(sub.confidence * 0.9)
+                                })
+                                .collect();
+
+                            info!(
+                                "Refined '{}' into {} sub-entries",
+                                entry_title,
+                                refined.len()
+                            );
+                            (i, refined)
+                        }
+                        Err(e) => {
+                            warn!("Sub-extraction failed for '{}': {}", entry_title, e);
+                            (i, Vec::new())
+                        }
+                    }
+                }
+            })
+            .collect();
+
+        let extraction_results = join_all(oversized_futures).await;
+
+        // Build a lookup from index → refined sub-entries
+        let mut refined_map = std::collections::HashMap::new();
+        for (idx, sub_entries) in extraction_results {
+            if !sub_entries.is_empty() {
+                refined_map.insert(idx, sub_entries);
+            }
+        }
+
+        // Assemble final output
+        let mut result = Vec::with_capacity(entries.len() * 2);
+        for (i, entry) in entries.into_iter().enumerate() {
+            if let Some(sub_entries) = refined_map.remove(&i) {
+                result.extend(sub_entries);
+            } else {
+                result.push(entry);
+            }
+        }
+
+        Ok(result)
+    }
 }
 
 impl Default for TocProcessor {
@@ -255,6 +535,26 @@ impl Default for TocProcessor {
     }
 }
 
+/// Calculate how many pages an entry spans.
+///
+/// From its physical_page to the next entry's physical_page (or document end).
+fn entry_page_span(entry: &TocEntry, next_physical_page: Option<usize>, total_pages: usize) -> usize {
+    let start = entry.physical_page.unwrap_or(1);
+    let end = next_physical_page.unwrap_or(total_pages);
+    end.saturating_sub(start)
+}
+
+/// Estimate total tokens for the content covered by an entry. 
+fn entry_token_count(entry: &TocEntry, pages: &[PdfPage]) -> usize {
+    let start = entry.physical_page.unwrap_or(1);
+    pages
+        .iter()
+        .filter(|p| p.number >= start)
+        .take(30) // cap the scan at the default max_pages_per_entry
+        .map(|p| p.token_count)
+        .sum()
+}
+
 #[cfg(test)]
 mod tests {
     use super::*;
diff --git a/rust/src/index/parse/toc/repairer.rs b/rust/src/index/parse/toc/repairer.rs
index 4062f215..51931674 100644
--- a/rust/src/index/parse/toc/repairer.rs
+++ b/rust/src/index/parse/toc/repairer.rs
@@ -3,6 +3,7 @@
 
 //! Index repairer - fixes incorrect TOC entry page assignments.
 
+use futures::future::join_all;
 use tracing::{debug, info};
 
 use crate::config::LlmConfig;
@@ -49,12 +50,20 @@ impl IndexRepairer {
         Self { config, client }
     }
 
+    /// Create a repairer with an externally provided LLM client.
+    pub fn with_client(client: LlmClient) -> Self {
+        Self {
+            config: RepairerConfig::default(),
+            client,
+        }
+    }
+
     /// Create a repairer with default configuration.
     pub fn with_defaults() -> Self {
         Self::new(RepairerConfig::default())
     }
 
-    /// Repair incorrect entries.
+    /// Repair incorrect entries concurrently.
     pub async fn repair(
         &self,
         entries: &mut [TocEntry],
@@ -66,38 +75,67 @@ impl IndexRepairer {
         }
 
         info!("Repairing {} incorrect entries", errors.len());
-        let mut repaired_count = 0;
 
-        for error in errors {
-            if error.index >= entries.len() {
-                continue;
-            }
-
-            let entry = &mut entries[error.index];
-            let expected_page = error.expected_page;
-
-            // Search around the expected page
-            let start = expected_page
-                .saturating_sub(self.config.search_range)
-                .max(1);
-            let end = (expected_page + self.config.search_range).min(pages.len());
-
-            if let Some(correct_page) = self
-                .find_correct_page(&entry.title, pages, start..=end)
-                .await?
-            {
-                debug!(
-                    "Repaired '{}' : page {} → {}",
-                    entry.title, expected_page, correct_page
-                );
-                entry.physical_page = Some(correct_page);
-                entry.confidence = 0.9;
-                repaired_count += 1;
-            } else {
-                debug!(
-                    "Could not repair '{}' (searched pages {}-{})",
-                    entry.title, start, end
-                );
+        // Collect repair tasks (don't borrow entries mutably yet)
+        let client = self.client.clone();
+        let pages_owned = pages.to_vec();
+        let search_range = self.config.search_range;
+
+        let tasks: Vec<_> = errors
+            .iter()
+            .filter(|error| error.index < entries.len())
+            .map(|error| {
+                let title = entries[error.index].title.clone();
+                let expected_page = error.expected_page;
+                let client = client.clone();
+                let pages = pages_owned.clone();
+
+                async move {
+                    let start = expected_page.saturating_sub(search_range).max(1);
+                    let end = (expected_page + search_range).min(pages.len());
+
+                    let result = Self::find_correct_page_static(
+                        &client,
+                        &title,
+                        &pages,
+                        start..=end,
+                    )
+                    .await;
+
+                    (title, expected_page, result)
+                }
+            })
+            .collect();
+
+        let results = join_all(tasks).await;
+
+        // Apply repairs
+        let mut repaired_count = 0;
+        for (title, expected_page, result) in results {
+            match result {
+                Ok(Some(correct_page)) => {
+                    // Find the corresponding error entry and fix it
+                    if let Some(error) = errors.iter().find(|e| e.title == title) {
+                        if error.index < entries.len() {
+                            debug!(
+                                "Repaired '{}': page {} → {}",
+                                title, expected_page, correct_page
+                            );
+                            entries[error.index].physical_page = Some(correct_page);
+                            entries[error.index].confidence = 0.9;
+                            repaired_count += 1;
+                        }
+                    }
+                }
+                Ok(None) => {
+                    debug!(
+                        "Could not repair '{}' (searched around page {})",
+                        title, expected_page
+                    );
+                }
+                Err(e) => {
+                    debug!("Repair failed for '{}': {}", title, e);
+                }
             }
         }
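+
+        // Note: join_all preserves input order, and each result is matched
+        // back to its entry by title; if two error entries share a title,
+        // the repair is applied to the first matching one.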
@@ -105,9 +143,9 @@ impl IndexRepairer {
         Ok(repaired_count)
     }
 
-    /// Find the correct page for a title within a range.
-    async fn find_correct_page(
-        &self,
+    /// Find the correct page for a title within a range (static, for concurrent use).
+    async fn find_correct_page_static(
+        client: &LlmClient,
         title: &str,
         pages: &[PdfPage],
         range: std::ops::RangeInclusive<usize>,
@@ -152,7 +190,7 @@ Reply in JSON format:
             page: Option<usize>,
         }
 
-        let result: FindResult = self.client.complete_json(system, &user).await?;
+        let result: FindResult = client.complete_json(system, &user).await?;
 
         if result.found {
             Ok(result.page)
diff --git a/rust/src/index/parse/toc/structure_extractor.rs b/rust/src/index/parse/toc/structure_extractor.rs
new file mode 100644
index 00000000..17511b36
--- /dev/null
+++ b/rust/src/index/parse/toc/structure_extractor.rs
@@ -0,0 +1,367 @@
+// Copyright (c) 2026 vectorless developers
+// SPDX-License-Identifier: Apache-2.0
+
+//! Structure extraction from documents without a Table of Contents.
+//!
+//! When a PDF has no TOC (or all TOC-based extraction modes failed), this
+//! module uses an LLM to analyze page content and extract the document's
+//! hierarchical structure directly.
+
+use tracing::{debug, info, warn};
+
+use crate::config::LlmConfig;
+use crate::error::Result;
+use crate::index::parse::pdf::PdfPage;
+
+use super::types::TocEntry;
+use crate::llm::LlmClient;
+
+/// Configuration for structure extraction.
+#[derive(Debug, Clone)]
+pub struct StructureExtractorConfig {
+    /// Maximum estimated tokens per page group sent to the LLM.
+    pub max_tokens_per_group: usize,
+
+    /// Number of overlap pages between consecutive groups.
+    pub overlap_pages: usize,
+
+    /// LLM configuration.
+    pub llm_config: LlmConfig,
+}
+
+impl Default for StructureExtractorConfig {
+    fn default() -> Self {
+        Self {
+            max_tokens_per_group: 20_000,
+            overlap_pages: 1,
+            llm_config: LlmConfig::default(),
+        }
+    }
+}
+
+/// A group of consecutive pages with their combined text.
+struct PageGroup {
+    /// Combined text with page markers: `<page_N>\n...\n</page_N>`.
+    text: String,
+    /// Start page number (1-based).
+    start_page: usize,
+    /// End page number (1-based, inclusive).
+    end_page: usize,
+}
+
+/// Extracts document structure from page content using an LLM.
+///
+/// Used when a document has no Table of Contents, or when TOC-based extraction
+/// failed. Pages are grouped by token count and analyzed sequentially: the
+/// first group generates an initial structure, and subsequent groups append to it.
+pub struct StructureExtractor {
+    config: StructureExtractorConfig,
+    client: LlmClient,
+}
+
+impl StructureExtractor {
+    /// Create a new structure extractor.
+    pub fn new(config: StructureExtractorConfig) -> Self {
+        let client = LlmClient::new(config.llm_config.clone().into());
+        Self { config, client }
+    }
+
+    /// Create a structure extractor with an externally provided LLM client.
+    pub fn with_client(config: StructureExtractorConfig, client: LlmClient) -> Self {
+        Self { config, client }
+    }
+
+    /// Create an extractor with default configuration.
+    pub fn with_defaults() -> Self {
+        Self::new(StructureExtractorConfig::default())
+    }
+
+    /// Extract hierarchical structure from all pages.
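+    ///
+    /// A rough usage sketch (`pages` assumed already parsed from a PDF; marked
+    /// `ignore` since it needs a live LLM client and is not a compiled doctest):
+    ///
+    /// ```ignore
+    /// let extractor = StructureExtractor::with_defaults();
+    /// let entries = extractor.extract(&pages).await?;
+    /// for e in &entries {
+    ///     println!("{} (level {}, page {:?})", e.title, e.level, e.physical_page);
+    /// }
+    /// ```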
+    pub async fn extract(&self, pages: &[PdfPage]) -> Result<Vec<TocEntry>> {
+        if pages.is_empty() {
+            return Ok(Vec::new());
+        }
+
+        let groups = self.group_pages(pages);
+        info!(
+            "Extracting structure from {} pages in {} groups",
+            pages.len(),
+            groups.len()
+        );
+
+        let mut all_entries = Vec::new();
+        let page_count = pages.len();
+
+        for (i, group) in groups.iter().enumerate() {
+            let group_entries = if i == 0 {
+                self.generate_initial(group).await?
+            } else {
+                self.generate_continuation(group, &all_entries).await?
+            };
+
+            debug!(
+                "Group {}/{} (pages {}-{}): extracted {} entries",
+                i + 1,
+                groups.len(),
+                group.start_page,
+                group.end_page,
+                group_entries.len()
+            );
+
+            all_entries.extend(group_entries);
+        }
+
+        // Clamp physical_page values that exceed document length
+        for entry in &mut all_entries {
+            if let Some(p) = entry.physical_page {
+                if p > page_count {
+                    warn!(
+                        "Clamping out-of-range page {} for '{}'",
+                        p, entry.title
+                    );
+                    entry.physical_page = Some(page_count);
+                }
+            }
+        }
+
+        info!("Structure extraction complete: {} entries", all_entries.len());
+        Ok(all_entries)
+    }
+
+    /// Group pages by estimated token count.
+    ///
+    /// Each group stays under `max_tokens_per_group`. Consecutive groups
+    /// overlap by `overlap_pages` pages to avoid splitting content at
+    /// section boundaries.
+    fn group_pages(&self, pages: &[PdfPage]) -> Vec<PageGroup> {
+        let mut groups = Vec::new();
+        let mut group_tokens = 0usize;
+        let mut group_pages_buf = Vec::new();
+
+        for (i, page) in pages.iter().enumerate() {
+            let new_tokens = group_tokens + page.token_count;
+
+            if new_tokens > self.config.max_tokens_per_group && !group_pages_buf.is_empty() {
+                // Finalize current group
+                let text = format_group_text(&group_pages_buf);
+                groups.push(PageGroup {
+                    text,
+                    start_page: group_pages_buf.first().unwrap().number,
+                    end_page: group_pages_buf.last().unwrap().number,
+                });
+
+                // Start new group with overlap
+                let overlap_start = i.saturating_sub(self.config.overlap_pages);
+                group_pages_buf = pages[overlap_start..=i].to_vec();
+                group_tokens = group_pages_buf.iter().map(|p| p.token_count).sum();
+            } else {
+                group_tokens = new_tokens;
+                group_pages_buf.push(page.clone());
+            }
+        }
+
+        // Final group
+        if !group_pages_buf.is_empty() {
+            let text = format_group_text(&group_pages_buf);
+            groups.push(PageGroup {
+                text,
+                start_page: group_pages_buf.first().unwrap().number,
+                end_page: group_pages_buf.last().unwrap().number,
+            });
+        }
+
+        groups
+    }
+
+    /// Generate initial structure from the first page group.
+    async fn generate_initial(&self, group: &PageGroup) -> Result<Vec<TocEntry>> {
+        let system = STRUCTURE_EXTRACTION_SYSTEM_PROMPT;
+        let user = format!(
+            r#"Analyze this document content and extract its hierarchical structure.
+
+Document content:
+{}
+
+Return a JSON array:
+[
+  {{"title": "Section Title", "level": 1, "physical_page": 1}},
+  {{"title": "Subsection", "level": 2, "physical_page": 3}},
+  ...
+]
+
+Rules:
+- "level" reflects the hierarchy (1 = chapter/top, 2 = section, 3 = subsection)
+- "physical_page" is the page number where the section begins
+- Preserve original titles as closely as possible
+- Only output the JSON array, no other text"#,
+            group.text
+        );
+
+        let sections: Vec<ExtractedSection> = self.client.complete_json(system, &user).await?;
+
+        Ok(sections
+            .into_iter()
+            .map(|s| {
+                TocEntry::new(s.title, s.level)
+                    .with_physical_page(s.physical_page)
+                    .with_confidence(0.7)
+            })
+            .collect())
+    }
+
+    /// Continue structure extraction for a subsequent group.
+    ///
+    /// Passes previously extracted entries as context so the LLM can
+    /// continue the structure rather than restart.
+    async fn generate_continuation(
+        &self,
+        group: &PageGroup,
+        previous: &[TocEntry],
+    ) -> Result<Vec<TocEntry>> {
+        let system = STRUCTURE_EXTRACTION_SYSTEM_PROMPT;
+
+        // Summarize the ten most recent previous entries as context
+        let prev_summary = previous
+            .iter()
+            .rev()
+            .take(10)
+            .rev()
+            .map(|e| {
+                format!(
+                    "  {{\"title\": \"{}\", \"level\": {}, \"physical_page\": {}}}",
+                    e.title,
+                    e.level,
+                    e.physical_page.unwrap_or(0)
+                )
+            })
+            .collect::<Vec<_>>()
+            .join(",\n");
+
+        let user = format!(
+            r#"Previously extracted structure:
+[
+{}
+]
+
+Continue extracting structure from these pages:
+{}
+
+Return ONLY the NEW entries (do not repeat previous ones):
+[
+  {{"title": "...", "level": N, "physical_page": M}},
+  ...
+]
+
+If no new structural elements are found, return: []"#,
+            prev_summary, group.text
+        );
+
+        let sections: Vec<ExtractedSection> = self.client.complete_json(system, &user).await?;
+
+        Ok(sections
+            .into_iter()
+            .map(|s| {
+                TocEntry::new(s.title, s.level)
+                    .with_physical_page(s.physical_page)
+                    .with_confidence(0.7)
+            })
+            .collect())
+    }
+}
+
+/// Format pages into tagged text for LLM consumption.
+fn format_group_text(pages: &[PdfPage]) -> String {
+    pages
+        .iter()
+        .map(|p| {
+            // Truncate very long page text (char-boundary safe)
+            let text: String = p.text.chars().take(3000).collect();
+            format!("<page_{}>\n{}\n</page_{}>", p.number, text, p.number)
+        })
+        .collect::<Vec<_>>()
+        .join("\n\n")
+}
+
+const STRUCTURE_EXTRACTION_SYSTEM_PROMPT: &str = r#"You are a document structure extraction expert. Your task is to analyze document content and extract its hierarchical structure (chapters, sections, subsections).
+
+For each structural element you find, provide:
+- title: The section title exactly as it appears
+- level: The hierarchy level (1 = chapter/top level, 2 = section, 3 = subsection)
+- physical_page: The page number where this section begins
+
+Important:
+- Focus on genuine structural elements (chapters, sections), not paragraph topics
+- Do NOT include the abstract, summary, or bibliography as structural elements unless they are major sections
+- Be conservative: fewer high-quality entries are better than many low-quality ones"#;
+
+/// LLM response type for structure extraction.
+#[derive(serde::Deserialize)]
+struct ExtractedSection {
+    title: String,
+    level: usize,
+    physical_page: usize,
+}
+
+#[cfg(test)]
+mod tests {
+    use super::*;
+
+    #[test]
+    fn test_default_config() {
+        let config = StructureExtractorConfig::default();
+        assert_eq!(config.max_tokens_per_group, 20_000);
+        assert_eq!(config.overlap_pages, 1);
+    }
+
+    #[test]
+    fn test_group_pages_single_group() {
+        let extractor = StructureExtractor::with_defaults();
+
+        let pages: Vec<PdfPage> = (1..=5)
+            .map(|i| PdfPage::new(i, format!("Page {} content", i)))
+            .collect();
+
+        let groups = extractor.group_pages(&pages);
+        assert_eq!(groups.len(), 1);
+        assert_eq!(groups[0].start_page, 1);
+        assert_eq!(groups[0].end_page, 5);
+    }
+
+    #[test]
+    fn test_group_pages_multiple_groups() {
+        let config = StructureExtractorConfig {
+            max_tokens_per_group: 50,
+            overlap_pages: 1,
+            ..Default::default()
+        };
+        let extractor = StructureExtractor::new(config);
+
+        // Create pages with enough text to span multiple groups
+        let pages: Vec<PdfPage> = (1..=10)
+            .map(|i| {
+                let text = format!("Page {} content. This is a longer text to use more tokens. ", i).repeat(10);
", i).repeat(10); + PdfPage::new(i, text) + }) + .collect(); + + let groups = extractor.group_pages(&pages); + assert!(groups.len() > 1, "Expected multiple groups, got {}", groups.len()); + } + + #[test] + fn test_format_group_text() { + let pages = vec![ + PdfPage::new(1, "Hello"), + PdfPage::new(2, "World"), + ]; + let text = format_group_text(&pages); + assert!(text.contains("")); + assert!(text.contains("")); + assert!(text.contains("Hello")); + assert!(text.contains("World")); + } +} diff --git a/rust/src/index/parse/toc/types.rs b/rust/src/index/parse/toc/types.rs index 9465311b..0438c0d3 100644 --- a/rust/src/index/parse/toc/types.rs +++ b/rust/src/index/parse/toc/types.rs @@ -266,6 +266,33 @@ impl VerificationReport { } } +/// Processing mode for the TOC extraction pipeline. +/// +/// Modes are ordered by quality: higher modes produce more accurate results +/// when they succeed, but can degrade to lower modes on failure. +#[derive(Debug, Clone, Copy, PartialEq, Eq)] +pub enum ProcessingMode { + /// TOC found with page numbers. Highest quality path. + TocWithPageNumbers, + /// TOC found without page numbers, or page-number accuracy was too low. + TocWithoutPageNumbers, + /// No TOC, or all TOC-based modes failed. LLM-driven structure extraction. + NoToc, +} + +impl ProcessingMode { + /// Degrade to the next lower quality mode. + /// + /// Returns `None` if already at the lowest mode (`NoToc`). + pub fn degrade(self) -> Option { + match self { + Self::TocWithPageNumbers => Some(Self::TocWithoutPageNumbers), + Self::TocWithoutPageNumbers => Some(Self::NoToc), + Self::NoToc => None, + } + } +} + #[cfg(test)] mod tests { use super::*; @@ -307,4 +334,17 @@ mod tests { "Title not found on page" ); } + + #[test] + fn test_processing_mode_degrade() { + assert_eq!( + ProcessingMode::TocWithPageNumbers.degrade(), + Some(ProcessingMode::TocWithoutPageNumbers) + ); + assert_eq!( + ProcessingMode::TocWithoutPageNumbers.degrade(), + Some(ProcessingMode::NoToc) + ); + assert_eq!(ProcessingMode::NoToc.degrade(), None); + } } diff --git a/rust/src/index/parse/toc/verifier.rs b/rust/src/index/parse/toc/verifier.rs index d0c3883e..09b28059 100644 --- a/rust/src/index/parse/toc/verifier.rs +++ b/rust/src/index/parse/toc/verifier.rs @@ -3,6 +3,7 @@ //! Index verifier - verifies TOC entry page assignments. +use futures::future::join_all; use rand::seq::SliceRandom; use tracing::{debug, info}; @@ -49,12 +50,22 @@ impl IndexVerifier { Self { config, client } } + /// Create a verifier with an externally provided LLM client. + pub fn with_client(client: LlmClient) -> Self { + Self { + config: VerifierConfig::default(), + client, + } + } + /// Create a verifier with default configuration. pub fn with_defaults() -> Self { Self::new(VerifierConfig::default()) } /// Verify TOC entries against PDF pages. + /// + /// All sample entries are verified concurrently via LLM calls. 
     pub async fn verify(
         &self,
         entries: &[TocEntry],
@@ -64,38 +75,58 @@ impl IndexVerifier {
             return Ok(VerificationReport::all_correct(0));
         }
 
-        // Select sample
         let sample = self.select_sample(entries);
 
-        // Verify each sample entry
+        // Launch all verification checks concurrently
+        let client = self.client.clone();
+        let futures: Vec<_> = sample
+            .iter()
+            .map(|(index, entry)| {
+                let index = *index;
+                let title = entry.title.clone();
+                let physical_page = entry.physical_page;
+                let client = client.clone();
+                let pages = pages.to_vec();
+
+                async move {
+                    match physical_page {
+                        Some(page) => {
+                            let result =
+                                Self::verify_entry_with_client(&client, &title, page, &pages).await;
+                            (index, title, page, result)
+                        }
+                        None => (
+                            index,
+                            title,
+                            0,
+                            Ok(Err(ErrorType::PageOutOfRange)),
+                        ),
+                    }
+                }
+            })
+            .collect();
+
+        let results = join_all(futures).await;
+
+        // Aggregate results
+        let total = results.len();
         let mut errors = Vec::new();
         let mut correct = 0;
 
-        for (index, entry) in &sample {
-            if let Some(physical_page) = entry.physical_page {
-                match self.verify_entry(entry, physical_page, pages).await? {
-                    Ok(()) => correct += 1,
-                    Err(error_type) => {
-                        errors.push(VerificationError::new(
-                            *index,
-                            entry.title.clone(),
-                            physical_page,
-                            error_type,
-                        ));
-                    }
+        for (index, title, page, result) in results {
+            match result {
+                Ok(Ok(())) => correct += 1,
+                Ok(Err(error_type)) => {
+                    errors.push(VerificationError::new(index, title, page, error_type));
+                }
+                Err(e) => {
+                    debug!("Verification LLM call failed: {}", e);
+                    errors.push(VerificationError::new(index, title, page, ErrorType::TitleNotFound));
                 }
-            } else {
-                // No physical page assigned
-                errors.push(VerificationError::new(
-                    *index,
-                    entry.title.clone(),
-                    0,
-                    ErrorType::PageOutOfRange,
-                ));
             }
         }
 
-        let report = VerificationReport::new(sample.len(), correct, errors);
+        let report = VerificationReport::new(total, correct, errors);
 
         info!(
             "Verification complete: {}/{} correct ({:.1}% accuracy)",
             report.correct,
@@ -126,28 +157,23 @@ impl IndexVerifier {
         }
     }
 
-    /// Verify a single entry.
-    async fn verify_entry(
-        &self,
-        entry: &TocEntry,
+    /// Verify a single entry using a cloned client (for concurrent use).
+    async fn verify_entry_with_client(
+        client: &LlmClient,
+        title: &str,
         physical_page: usize,
         pages: &[PdfPage],
     ) -> Result<std::result::Result<(), ErrorType>> {
-        // Check page bounds
         if physical_page == 0 || physical_page > pages.len() {
             return Ok(Err(ErrorType::PageOutOfRange));
         }
 
         let page = &pages[physical_page - 1];
 
-        // Use LLM to check if title appears on this page
-        let found = self.check_title_on_page(&entry.title, &page.text).await?;
+        let found = Self::check_title_on_page_with_client(client, title, &page.text).await?;
 
         if !found {
-            debug!(
-                "Title '{}' not found on page {}",
-                entry.title, physical_page
-            );
+            debug!("Title '{}' not found on page {}", title, physical_page);
             return Ok(Err(ErrorType::TitleNotFound));
         }
 
@@ -155,10 +181,13 @@
     }
 
     /// Check if a title appears on a page using LLM.
-    async fn check_title_on_page(&self, title: &str, page_text: &str) -> Result<bool> {
+    async fn check_title_on_page_with_client(
+        client: &LlmClient,
+        title: &str,
+        page_text: &str,
+    ) -> Result<bool> {
         let system = "You are a document analysis assistant. Determine if a section title appears in the given text.";
-        // Truncate page text if too long
         let text = if page_text.len() > 1000 {
             &page_text[..1000]
         } else {
             page_text
         };
@@ -181,7 +210,7 @@ Reply in JSON format:
             found: bool,
         }
 
-        let result: CheckResult = self.client.complete_json(system, &user).await?;
+        let result: CheckResult = client.complete_json(system, &user).await?;
 
         Ok(result.found)
     }
diff --git a/rust/src/index/pipeline/executor.rs b/rust/src/index/pipeline/executor.rs
index a80cf176..1538c7b3 100644
--- a/rust/src/index/pipeline/executor.rs
+++ b/rust/src/index/pipeline/executor.rs
@@ -81,8 +81,9 @@ impl PipelineExecutor {
     /// 7. `reasoning_index` - Build pre-computed reasoning index
     /// 8. `optimize` - Optimize tree
     pub fn with_llm(client: LlmClient) -> Self {
+        tracing::info!("PipelineExecutor::with_llm: passing the LLM client to ParseStage and EnhanceStage");
         let orchestrator = PipelineOrchestrator::new()
-            .stage_with_priority(ParseStage::new(), 10)
+            .stage_with_priority(ParseStage::with_llm_client(client.clone()), 10)
             .stage_with_priority(BuildStage::new(), 20)
             .stage_with_priority(ValidateStage::new(), 22)
             .stage_with_priority(SplitStage::new(), 25)
diff --git a/rust/src/index/stages/enhance.rs b/rust/src/index/stages/enhance.rs
index 452089c0..5550de45 100644
--- a/rust/src/index/stages/enhance.rs
+++ b/rust/src/index/stages/enhance.rs
@@ -109,6 +109,12 @@ impl IndexStage for EnhanceStage {
     async fn execute(&mut self, ctx: &mut IndexContext) -> Result {
         let start = Instant::now();
 
+        info!(
+            "EnhanceStage: llm_client={}, strategy={:?}",
+            self.llm_client.is_some(),
+            ctx.options.summary_strategy
+        );
+
         // Check if we need summaries
         if !self.needs_summaries(ctx) {
             info!(
diff --git a/rust/src/index/stages/parse.rs b/rust/src/index/stages/parse.rs
index 98ef911b..6c8166b6 100644
--- a/rust/src/index/stages/parse.rs
+++ b/rust/src/index/stages/parse.rs
@@ -15,12 +15,22 @@ use crate::index::IndexMode;
 use crate::index::pipeline::{IndexContext, IndexInput};
 
 /// Parse stage - extracts raw nodes from documents.
-pub struct ParseStage;
+pub struct ParseStage {
+    /// Optional LLM client for PDF structure extraction.
+    llm_client: Option<crate::llm::LlmClient>,
+}
 
 impl ParseStage {
     /// Create a new parse stage.
     pub fn new() -> Self {
-        Self
+        Self { llm_client: None }
+    }
+
+    /// Create a parse stage with an LLM client.
+    pub fn with_llm_client(client: crate::llm::LlmClient) -> Self {
+        Self {
+            llm_client: Some(client),
+        }
     }
 
     /// Detect document format from path and options.
@@ -61,6 +71,10 @@ impl IndexStage for ParseStage {
         ctx.format = format;
 
         info!("Parsing document with format: {:?}", format);
+        info!(
+            "ParseStage llm_client present: {}",
+            self.llm_client.is_some()
+        );
 
         // Parse based on input type
         let result = match &ctx.input {
@@ -77,7 +91,7 @@ impl IndexStage for ParseStage {
                     .to_string();
 
                 // Parse directly
-                crate::index::parse::parse_file(&path, format).await?
+                crate::index::parse::parse_file(&path, format, self.llm_client.clone()).await?
             }
             IndexInput::Content {
                 content,
@@ -88,14 +102,14 @@ impl IndexStage for ParseStage {
                 ctx.name = name.clone();
 
                 // Parse content directly
-                crate::index::parse::parse_content(content, *format).await?
+                crate::index::parse::parse_content(content, *format, self.llm_client.clone()).await?
             }
             IndexInput::Bytes { data, name, format } => {
                 // Set name
                 ctx.name = name.clone();
 
                 // Parse bytes
-                crate::index::parse::parse_bytes(data, *format).await?
+                crate::index::parse::parse_bytes(data, *format, self.llm_client.clone()).await?
} }; diff --git a/samples/Docker_Cheat_Sheet.pdf b/samples/Docker_Cheat_Sheet.pdf new file mode 100755 index 00000000..0768f1c3 Binary files /dev/null and b/samples/Docker_Cheat_Sheet.pdf differ