From 68231d4d127324c668701f406c53d3c82fe48c9e Mon Sep 17 00:00:00 2001
From: zTgx <747674262@qq.com>
Date: Mon, 13 Apr 2026 13:08:13 +0800
Subject: [PATCH 1/5] feat(pdf-parser): switch to pdf-extract for reliable
 text extraction

Uses pdf-extract for text extraction, which handles CJK, ToUnicode CMap,
font encoding, and other complex PDF text scenarios more reliably than the
previous lopdf-based approach. Falls back gracefully to basic metadata
extraction when lopdf parsing fails.

BREAKING CHANGE: Changes the internal PDF parsing mechanism from lopdf to
pdf-extract while maintaining the same public API.

feat(toc-processor): add multi-mode extraction with automatic degradation

Introduces a three-mode TOC extraction pipeline with automatic fallback:

1. TocWithPageNumbers - when a TOC with page numbers is available
2. TocWithoutPageNumbers - when a TOC exists but lacks page numbers
3. NoToc - direct structure extraction from content using LLM

Each mode degrades to the next when accuracy thresholds aren't met.

feat(structure-extractor): add LLM-powered structure extraction for no-TOC docs

Implements document structure extraction from page content when no TOC is
available. Groups pages by token count and uses LLM analysis to identify
hierarchical sections. Adds support for continuation across page groups with
overlap handling.

feat(toc-processor): add refinement for oversized TOC entries

Adds the capability to recursively split large TOC entries that span too many
pages or exceed token limits. Uses the same structure extraction approach to
identify sub-sections within oversized entries, improving the granularity of
the document structure.
---
 rust/src/index/parse/pdf/parser.rs            | 241 ++---------
 rust/src/index/parse/toc/mod.rs               |   1 +
 rust/src/index/parse/toc/processor.rs         | 377 +++++++++++++++---
 .../index/parse/toc/structure_extractor.rs    | 362 +++++++++++++++++
 rust/src/index/parse/toc/types.rs             |  40 ++
 5 files changed, 757 insertions(+), 264 deletions(-)
 create mode 100644 rust/src/index/parse/toc/structure_extractor.rs

diff --git a/rust/src/index/parse/pdf/parser.rs b/rust/src/index/parse/pdf/parser.rs
index 4684ae1a..b2ae6b5d 100644
--- a/rust/src/index/parse/pdf/parser.rs
+++ b/rust/src/index/parse/pdf/parser.rs
@@ -1,7 +1,11 @@
 // Copyright (c) 2026 vectorless developers
 // SPDX-License-Identifier: Apache-2.0
 
-//! PDF document parser using lopdf.
+//! PDF document parser.
+//!
+//! Uses [`pdf_extract`] for reliable text extraction (handles CJK, ToUnicode
+//! CMap, font encoding, etc.) and [`lopdf`] only for metadata extraction from
+//! the PDF Info dictionary.
 
 use std::path::Path;
 
@@ -35,7 +39,7 @@ impl Default for PdfParserConfig {
     fn default() -> Self {
         Self {
             max_pages: 0,
-            extract_toc: true, // Default enabled
+            extract_toc: true,
         }
     }
 }
@@ -65,19 +69,42 @@ impl PdfParser {
         bytes: &[u8],
         filename: Option<&str>,
     ) -> Result<PdfParseResult> {
-        let doc = LopdfDocument::load_mem(bytes)
-            .map_err(|e| Error::Parse(format!("Failed to parse PDF: {}", e)))?;
+        // Use pdf-extract for text (handles CJK, ToUnicode CMap, etc.)
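+        // pdf-extract returns one text string per page; empty pages are
+        // dropped inside extract_pages below.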
+        let pages = self.extract_pages(bytes)?;
+
+        // Use lopdf only for metadata; fall back gracefully if it fails
+        let metadata = match LopdfDocument::load_mem(bytes) {
+            Ok(doc) => self.extract_metadata(&doc, filename),
+            Err(_) => PdfMetadata {
+                title: filename.unwrap_or("Document").to_string(),
+                page_count: pages.len(),
+                ..Default::default()
+            },
+        };
 
-        // Extract metadata
-        let metadata = self.extract_metadata(&doc, filename);
+        Ok(PdfParseResult::new(metadata, pages))
+    }
 
-        // Extract pages
-        let pages = self.extract_pages(&doc)?;
+    /// Extract text from all pages using pdf-extract.
+    fn extract_pages(&self, bytes: &[u8]) -> Result<Vec<PdfPage>> {
+        let page_texts = pdf_extract::extract_text_from_mem_by_pages(bytes)
+            .map_err(|e| Error::Parse(format!("pdf-extract failed: {}", e)))?;
 
-        Ok(PdfParseResult::new(metadata, pages))
+        let mut pages = Vec::new();
+        for (i, text) in page_texts.iter().enumerate() {
+            if self.config.max_pages > 0 && i >= self.config.max_pages {
+                break;
+            }
+            let page_num = i + 1; // 1-based
+            if !text.trim().is_empty() {
+                pages.push(PdfPage::new(page_num, text.clone()));
+            }
+        }
+
+        Ok(pages)
     }
 
-    /// Extract metadata from PDF document.
+    /// Extract metadata from PDF Info dictionary via lopdf.
     fn extract_metadata(&self, doc: &LopdfDocument, filename: Option<&str>) -> PdfMetadata {
         let mut metadata = PdfMetadata {
             title: filename.unwrap_or("Document").to_string(),
@@ -85,26 +112,22 @@ impl PdfParser {
             ..Default::default()
         };
 
-        // Try to extract metadata from Info dictionary
         if let Ok(info) = doc.trailer.get(b"Info") {
             if let Ok(info_ref) = info.as_reference() {
                 if let Ok(info_obj) = doc.get_object(info_ref) {
                     if let Ok(dict) = info_obj.as_dict() {
-                        // Title
                         if let Ok(title_obj) = dict.get(b"Title") {
                             if let Ok(title) = title_obj.as_str() {
                                 metadata.title = self.decode_pdf_string(title);
                             }
                         }
 
-                        // Author
                         if let Ok(author_obj) = dict.get(b"Author") {
                             if let Ok(author) = author_obj.as_str() {
                                 metadata.author = Some(self.decode_pdf_string(author));
                             }
                         }
 
-                        // Subject
                        if let Ok(subject_obj) = dict.get(b"Subject") {
                            if let Ok(subject) = subject_obj.as_str() {
                                metadata.subject = Some(self.decode_pdf_string(subject));
@@ -118,158 +141,9 @@ impl PdfParser {
         metadata
     }
 
-    /// Extract text from all pages.
-    fn extract_pages(&self, doc: &LopdfDocument) -> Result<Vec<PdfPage>> {
-        let page_map = doc.get_pages();
-        let mut pages = Vec::new();
-
-        for (i, (page_num, object_id)) in page_map.iter().enumerate() {
-            // Check max pages limit
-            if self.config.max_pages > 0 && i >= self.config.max_pages {
-                break;
-            }
-
-            let text = self.extract_page_text(doc, *object_id, *page_num as usize);
-
-            // Skip empty pages
-            if !text.trim().is_empty() {
-                pages.push(PdfPage::new(*page_num as usize, text));
-            }
-        }
-
-        Ok(pages)
-    }
-
-    /// Extract text from a single page.
-    fn extract_page_text(
-        &self,
-        doc: &LopdfDocument,
-        object_id: lopdf::ObjectId,
-        _page_num: usize,
-    ) -> String {
-        let mut text = String::new();
-
-        if let Ok(page_obj) = doc.get_object(object_id) {
-            if let Ok(page_dict) = page_obj.as_dict() {
-                if let Ok(contents) = page_dict.get(b"Contents") {
-                    match contents {
-                        lopdf::Object::Reference(ref_id) => {
-                            if let Ok(content_obj) = doc.get_object(*ref_id) {
-                                if let Ok(stream) = content_obj.as_stream() {
-                                    text = self.decode_stream_content(stream);
-                                }
-                            }
-                        }
-                        lopdf::Object::Array(arr) => {
-                            for obj in arr {
-                                if let Ok(ref_id) = obj.as_reference() {
-                                    if let Ok(content_obj) = doc.get_object(ref_id) {
-                                        if let Ok(stream) = content_obj.as_stream() {
-                                            let content = self.decode_stream_content(stream);
-                                            if !text.is_empty() {
-                                                text.push('\n');
-                                            }
-                                            text.push_str(&content);
-                                        }
-                                    }
-                                }
-                            }
-                        }
-                        _ => {}
-                    }
-                }
-            }
-        }
-
-        // Post-process text
-        self.post_process_text(&text)
-    }
-
-    /// Decode stream content to text.
-    fn decode_stream_content(&self, stream: &lopdf::Stream) -> String {
-        // Try to decode the stream
-        if let Ok(content) = stream.decompressed_content() {
-            self.extract_text_from_content(&content)
-        } else {
-            self.extract_text_from_content(&stream.content)
-        }
-    }
-
-    /// Extract text from PDF content stream (simplified).
-    fn extract_text_from_content(&self, content: &[u8]) -> String {
-        let content_str = String::from_utf8_lossy(content);
-        let mut text = String::new();
-
-        for line in content_str.lines() {
-            let line = line.trim();
-
-            // Tj operator: (text) Tj
-            if line.ends_with("Tj") {
-                if let Some(text_part) = self.extract_parentheses_text(line) {
-                    text.push_str(&text_part);
-                }
-            }
-            // TJ operator: [(text) ...] TJ
-            else if line.ends_with("TJ") {
-                if let Some(text_parts) = self.extract_array_text(line) {
-                    text.push_str(&text_parts);
-                }
-            }
-        }
-
-        text
-    }
-
-    /// Extract text from parentheses in Tj operator.
-    fn extract_parentheses_text(&self, line: &str) -> Option<String> {
-        let start = line.find('(')?;
-        let end = line.rfind(')')?;
-        if end > start {
-            let raw = &line[start + 1..end];
-            Some(self.decode_pdf_string(raw.as_bytes()))
-        } else {
-            None
-        }
-    }
-
-    /// Extract text from array in TJ operator.
-    fn extract_array_text(&self, line: &str) -> Option<String> {
-        let start = line.find('[')?;
-        let end = line.rfind(']')?;
-        if end > start {
-            let content = &line[start + 1..end];
-            let mut text = String::new();
-
-            let mut in_parens = false;
-            let mut current = String::new();
-
-            for ch in content.chars() {
-                match ch {
-                    '(' => {
-                        in_parens = true;
-                        current.clear();
-                    }
-                    ')' => {
-                        if in_parens {
-                            text.push_str(&self.decode_pdf_string(current.as_bytes()));
-                        }
-                        in_parens = false;
-                    }
-                    _ => {
-                        if in_parens {
-                            current.push(ch);
-                        }
-                    }
-                }
-            }
-
-            Some(text)
-        } else {
-            None
-        }
-    }
-
-    /// Decode PDF string (handle escape sequences).
+    /// Decode PDF string literal (handles escape sequences).
+    ///
+    /// Used only for metadata field values extracted via lopdf.
     fn decode_pdf_string(&self, bytes: &[u8]) -> String {
         let mut result = String::new();
         let mut i = 0;
@@ -299,26 +173,6 @@ impl PdfParser {
         result
     }
 
-    /// Post-process extracted text.
-    fn post_process_text(&self, text: &str) -> String {
-        let mut result = String::new();
-        let mut prev_space = false;
-
-        for ch in text.chars() {
-            if ch.is_whitespace() {
-                if !prev_space {
-                    result.push(' ');
-                    prev_space = true;
-                }
-            } else {
-                result.push(ch);
-                prev_space = false;
-            }
-        }
-
-        result.trim().to_string()
-    }
-
     /// Convert TOC entries to RawNodes.
     fn toc_entries_to_raw_nodes(
         &self,

     ) -> Vec<RawNode> {
         let mut nodes = Vec::new();
 
         for entry in entries {
-            // Get content from the page range
             let content = self.get_content_for_entry(entry, pages);
 
             let mut node = RawNode::new(&entry.title)
@@ -353,12 +206,10 @@ impl PdfParser {
     ) -> String {
         let start_page = entry.physical_page.unwrap_or(1);
 
-        // Find content on this page
         pages
             .iter()
             .find(|p| p.number == start_page)
             .map(|p| {
-                // Try to find the title position and extract content after it
                 let text = &p.text;
                 if let Some(pos) = text.find(&entry.title) {
                     text[pos + entry.title.len()..].trim().to_string()
@@ -445,7 +296,6 @@ impl PdfParser {
             self.pages_to_raw_nodes(&result.pages)
         };
 
-        // Build metadata
         let meta = DocumentMeta {
             name: result.metadata.title,
             format: DocumentFormat::Pdf,
@@ -486,15 +336,4 @@ mod tests {
         let decoded = parser.decode_pdf_string(b"Hello\\nWorld");
         assert_eq!(decoded, "Hello\nWorld");
     }
-
-    #[test]
-    fn test_post_process_text() {
-        let parser = PdfParser::new();
-
-        let processed = parser.post_process_text("Hello World");
-        assert_eq!(processed, "Hello World");
-
-        let processed = parser.post_process_text("  Hello   World  ");
-        assert_eq!(processed, "Hello World");
-    }
 }
diff --git a/rust/src/index/parse/toc/mod.rs b/rust/src/index/parse/toc/mod.rs
index a540cd1a..beac24d7 100644
--- a/rust/src/index/parse/toc/mod.rs
+++ b/rust/src/index/parse/toc/mod.rs
@@ -17,6 +17,7 @@ mod detector;
 mod parser;
 mod processor;
 mod repairer;
+mod structure_extractor;
 mod types;
 mod verifier;
 
diff --git a/rust/src/index/parse/toc/processor.rs b/rust/src/index/parse/toc/processor.rs
index 79ef9a15..978ba4e3 100644
--- a/rust/src/index/parse/toc/processor.rs
+++ b/rust/src/index/parse/toc/processor.rs
@@ -2,6 +2,10 @@
 // SPDX-License-Identifier: Apache-2.0
 
 //! TOC processor - integrates all TOC processing components.
+//!
+//! The processor orchestrates a multi-mode extraction pipeline with automatic
+//! degradation: if one mode fails verification, it falls back to a lower-quality
+//! but more reliable mode.
 
 use tracing::{debug, info, warn};
 
@@ -12,7 +16,8 @@ use super::assigner::{PageAssigner, PageAssignerConfig};
 use super::detector::{TocDetector, TocDetectorConfig};
 use super::parser::{TocParser, TocParserConfig};
 use super::repairer::{IndexRepairer, RepairerConfig};
-use super::types::{TocEntry, VerificationReport};
+use super::structure_extractor::{StructureExtractor, StructureExtractorConfig};
+use super::types::{ProcessingMode, TocEntry, VerificationReport};
 use super::verifier::{IndexVerifier, VerifierConfig};
 
 /// TOC processor configuration.
@@ -33,11 +38,17 @@ pub struct TocProcessorConfig {
     /// Repairer configuration.
     pub repairer: RepairerConfig,
 
-    /// Accuracy threshold for acceptance.
+    /// Accuracy threshold for acceptance (0.0 - 1.0).
     pub accuracy_threshold: f32,
 
-    /// Maximum repair attempts.
+    /// Maximum repair attempts per verification cycle.
     pub max_repair_attempts: usize,
+
+    /// Maximum page span for a single entry before recursive refinement.
+    pub max_pages_per_entry: usize,
+
+    /// Maximum estimated tokens for a single entry before recursive refinement.
+    pub max_tokens_per_entry: usize,
 }
 
 impl Default for TocProcessorConfig {
@@ -50,6 +61,8 @@ impl Default for TocProcessorConfig {
             repairer: RepairerConfig::default(),
             accuracy_threshold: 0.6,
             max_repair_attempts: 3,
+            max_pages_per_entry: 30,
+            max_tokens_per_entry: 20000,
         }
     }
 }
@@ -64,6 +77,18 @@ impl Default for TocProcessorConfig {
 /// 4. **Assign** - Map TOC pages to physical pages
 /// 5. **Verify** - Sample verification of page assignments
 /// 6. **Repair** - Fix incorrect assignments (if needed)
+/// 7. **Refine** - Sub-divide oversized entries (if needed)
+///
+/// # Degradation Strategy
+///
+/// The pipeline tries three modes in order of quality:
+///
+/// 1. `TocWithPageNumbers` - TOC found with page numbers (offset calculation)
+/// 2. `TocWithoutPageNumbers` - TOC found without page numbers (LLM positioning)
+/// 3. `NoToc` - no TOC available (LLM structure extraction from content)
+///
+/// If a mode fails verification (accuracy < threshold), it automatically
+/// degrades to the next mode.
 ///
 /// # Example
 ///
@@ -73,11 +98,9 @@ impl Default for TocProcessorConfig {
 ///
 /// # #[tokio::main]
 /// # async fn main() -> vectorless::Result<()> {
-/// // Parse PDF
 /// let pdf_parser = PdfParser::new();
-/// let result = pdf_parser.parse_file("document.pdf".as_ref())?;
+/// let result = pdf_parser.parse_file("document.pdf".as_ref()).await?;
 ///
-/// // Extract TOC
 /// let processor = TocProcessor::new();
 /// let entries = processor.process(&result.pages).await?;
 ///
@@ -114,9 +137,10 @@ impl TocProcessor {
         }
     }
 
-    /// Process PDF pages and extract TOC.
+    /// Process PDF pages and extract hierarchical structure.
     ///
-    /// This is the main entry point for TOC extraction.
+    /// This is the main entry point. It detects the TOC, selects the best
+    /// processing mode, and automatically degrades if needed.
     pub async fn process(&self, pages: &[PdfPage]) -> Result<Vec<TocEntry>> {
         if pages.is_empty() {
             return Ok(Vec::new());
         }
@@ -126,45 +150,185 @@ impl TocProcessor {
         // Step 1: Detect TOC
         let detection = self.detector.detect(pages).await?;
-
-        if !detection.found {
+
+        // Step 2: Determine initial mode based on detection result
+        let initial_mode = if !detection.found {
             info!("No TOC found in document");
-            return self.process_without_toc(pages).await;
+            ProcessingMode::NoToc
+        } else if detection.has_page_numbers {
+            info!(
+                "TOC found on pages {:?}, has page numbers",
+                detection.pages
+            );
+            ProcessingMode::TocWithPageNumbers
+        } else {
+            info!(
+                "TOC found on pages {:?}, no page numbers",
+                detection.pages
+            );
+            ProcessingMode::TocWithoutPageNumbers
+        };
+
+        // Step 3: Process with degradation
+        let entries = self
+            .process_with_degradation(initial_mode, &detection, pages)
+            .await?;
+
+        // Step 4: Refine oversized entries
+        self.refine_large_entries(entries, pages).await
+    }
+
+    /// Process with automatic mode degradation.
+    ///
+    /// Tries the given mode, verifies the result, and degrades to a
+    /// lower-quality mode if accuracy is below threshold.
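+    ///
+    /// A sketch of the chain, assuming the default accuracy threshold of 0.6:
+    ///
+    /// ```text
+    /// TocWithPageNumbers -> TocWithoutPageNumbers -> NoToc
+    ///   (degrade when accuracy < 0.6 or no entries; NoToc never degrades)
+    /// ```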
+    async fn process_with_degradation(
+        &self,
+        initial_mode: ProcessingMode,
+        detection: &super::types::TocDetection,
+        pages: &[PdfPage],
+    ) -> Result<Vec<TocEntry>> {
+        let mut mode = initial_mode;
+
+        loop {
+            info!("Attempting extraction with mode {:?}", mode);
+
+            let result = match mode {
+                ProcessingMode::TocWithPageNumbers => {
+                    self.process_toc_with_page_numbers(detection, pages).await
+                }
+                ProcessingMode::TocWithoutPageNumbers => {
+                    self.process_toc_without_page_numbers(detection, pages).await
+                }
+                ProcessingMode::NoToc => {
+                    // NoToc always succeeds (produces some structure)
+                    return self.process_without_toc(pages).await;
+                }
+            };
+
+            match result {
+                Ok(entries) if !entries.is_empty() => {
+                    // Verify the entries
+                    let mut mutable_entries = entries;
+                    let report = self
+                        .verify_and_repair(&mut mutable_entries, pages)
+                        .await?;
+
+                    if report.accuracy >= self.config.accuracy_threshold {
+                        info!(
+                            "Mode {:?} succeeded: {} entries, accuracy {:.1}%",
+                            mode,
+                            mutable_entries.len(),
+                            report.accuracy * 100.0
+                        );
+                        return Ok(mutable_entries);
+                    }
+
+                    // Accuracy too low, try degrading
+                    warn!(
+                        "Mode {:?} accuracy {:.1}% below threshold {:.1}%",
+                        mode,
+                        report.accuracy * 100.0,
+                        self.config.accuracy_threshold * 100.0
+                    );
+
+                    match mode.degrade() {
+                        Some(next) => {
+                            info!("Degrading from {:?} to {:?}", mode, next);
+                            mode = next;
+                            // Continue loop with degraded mode
+                        }
+                        None => {
+                            warn!("No further degradation possible, returning best effort");
+                            return Ok(mutable_entries);
+                        }
+                    }
+                }
+                Ok(_) => {
+                    // Empty entries, degrade
+                    warn!("Mode {:?} produced no entries", mode);
+                    match mode.degrade() {
+                        Some(next) => {
+                            mode = next;
+                        }
+                        None => return Ok(Vec::new()),
+                    }
+                }
+                Err(e) => {
+                    warn!("Mode {:?} failed: {}", mode, e);
+                    match mode.degrade() {
+                        Some(next) => {
+                            mode = next;
+                        }
+                        None => return Err(e),
+                    }
+                }
+            }
+        }
+    }
 
-        info!(
-            "TOC found on pages {:?}, has_page_numbers: {}",
-            detection.pages, detection.has_page_numbers
-        );
+    /// Mode 1: TOC with page numbers.
+    ///
+    /// Parse the TOC, calculate physical-page offset from anchor entries,
+    /// and apply the offset to all entries.
+    async fn process_toc_with_page_numbers(
+        &self,
+        detection: &super::types::TocDetection,
+        pages: &[PdfPage],
+    ) -> Result<Vec<TocEntry>> {
+        let toc_text = self.extract_toc_text(pages, &detection.pages);
+        if toc_text.trim().is_empty() {
+            return Ok(Vec::new());
+        }
 
-        // Step 2: Extract TOC text
+        let mut entries = self.parser.parse(&toc_text).await?;
+        if entries.is_empty() {
+            return Ok(Vec::new());
+        }
+
+        // Assign physical pages using offset calculation
+        self.assigner.assign(&mut entries, pages).await?;
+
+        Ok(entries)
+    }
+
+    /// Mode 2: TOC without page numbers.
+    ///
+    /// Parse the TOC, then use LLM to locate each entry in the document.
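+    ///
+    /// Entries keep their parsed titles, but any page numbers picked up by
+    /// the TOC parser are cleared before assignment, since this mode assumes
+    /// they are absent or unreliable.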
+    async fn process_toc_without_page_numbers(
+        &self,
+        detection: &super::types::TocDetection,
+        pages: &[PdfPage],
+    ) -> Result<Vec<TocEntry>> {
         let toc_text = self.extract_toc_text(pages, &detection.pages);
         if toc_text.trim().is_empty() {
-            warn!("TOC text is empty, falling back to structure extraction");
-            return self.process_without_toc(pages).await;
+            return Ok(Vec::new());
         }
 
-        // Step 3: Parse TOC
         let mut entries = self.parser.parse(&toc_text).await?;
         if entries.is_empty() {
-            warn!("No entries parsed from TOC");
             return Ok(Vec::new());
         }
 
-        info!("Parsed {} TOC entries", entries.len());
+        // Clear any TOC page numbers (they're unreliable in this mode)
+        for entry in &mut entries {
+            entry.toc_page = None;
+        }
 
-        // Step 4: Assign physical pages
+        // Assign physical pages using LLM positioning
         self.assigner.assign(&mut entries, pages).await?;
 
-        // Step 5: Verify and repair
-        let report = self.verify_and_repair(&mut entries, pages).await?;
+        Ok(entries)
+    }
 
-        info!(
-            "TOC processing complete: {} entries, accuracy {:.1}%",
-            entries.len(),
-            report.accuracy * 100.0
-        );
+    /// Mode 3: No TOC available.
+    ///
+    /// Extract document structure directly from page content using LLM.
+    async fn process_without_toc(&self, pages: &[PdfPage]) -> Result<Vec<TocEntry>> {
+        info!("Extracting structure from page content (no TOC available)");
 
-        Ok(entries)
+        let extractor = StructureExtractor::new(StructureExtractorConfig::default());
+        extractor.extract(pages).await
     }
 
     /// Extract TOC text from pages.
@@ -177,37 +341,6 @@ impl TocProcessor {
             .join("\n\n")
     }
 
-    /// Process document without TOC (structure extraction).
-    async fn process_without_toc(&self, pages: &[PdfPage]) -> Result<Vec<TocEntry>> {
-        warn!("Processing without TOC - this is a placeholder implementation");
-
-        // TODO: Implement structure extraction for documents without TOC
-        // For now, return a simple structure based on page count
-
-        let mut entries = Vec::new();
-
-        // Group pages into chunks
-        let chunk_size = 10;
-        for chunk in pages.chunks(chunk_size) {
-            let start_page = chunk.first().map(|p| p.number).unwrap_or(1);
-            let end_page = chunk.last().map(|p| p.number).unwrap_or(1);
-
-            let title = if chunk.len() == 1 {
-                format!("Page {}", start_page)
-            } else {
-                format!("Pages {}-{}", start_page, end_page)
-            };
-
-            entries.push(
-                TocEntry::new(title, 1)
-                    .with_physical_page(start_page)
-                    .with_confidence(0.5),
-            );
-        }
-
-        Ok(entries)
-    }
-
     /// Verify entries and repair if needed.
     async fn verify_and_repair(
         &self,
@@ -217,7 +350,6 @@ impl TocProcessor {
         let mut attempts = 0;
 
         while attempts < self.config.max_repair_attempts {
-            // Verify
             let report = self.verifier.verify(entries, pages).await?;
 
             if report.accuracy >= self.config.accuracy_threshold {

                 return Ok(report);
             }

-            // Repair
             let repaired = self.repairer.repair(entries, &report.errors, pages).await?;
 
             if repaired == 0 {

             attempts += 1;
             debug!("Repair attempt {} complete", attempts);
         }
 
-        // Final verification
         self.verifier.verify(entries, pages).await
     }
+
+    /// Refine oversized entries by extracting sub-structure.
+    ///
+    /// Entries that span too many pages or tokens are broken down using
+    /// the same structure extraction approach used for no-TOC documents.
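+    ///
+    /// An entry is refined only when it exceeds *both* limits: with the
+    /// defaults (30 pages, 20k tokens), a 45-page, ~35k-token chapter is
+    /// split, while a 45-page, ~8k-token chapter is kept whole.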
+    async fn refine_large_entries(
+        &self,
+        entries: Vec<TocEntry>,
+        pages: &[PdfPage],
+    ) -> Result<Vec<TocEntry>> {
+        if entries.is_empty() {
+            return Ok(entries);
+        }
+
+        let page_count = pages.len();
+
+        // Pre-compute next-entry page numbers before consuming entries
+        let next_pages: Vec<Option<usize>> = entries
+            .iter()
+            .enumerate()
+            .map(|(i, _)| {
+                entries.get(i + 1).and_then(|e| e.physical_page)
+            })
+            .collect();
+
+        let mut refined = Vec::with_capacity(entries.len());
+
+        for (i, entry) in entries.into_iter().enumerate() {
+            let span = entry_page_span(&entry, next_pages[i], page_count);
+            let tokens = entry_token_count(&entry, pages);
+
+            if span > self.config.max_pages_per_entry
+                && tokens > self.config.max_tokens_per_entry
+            {
+                debug!(
+                    "Refining oversized entry '{}' ({} pages, ~{} tokens)",
+                    entry.title, span, tokens
+                );
+
+                // Extract sub-pages covered by this entry
+                let start = entry.physical_page.unwrap_or(1);
+                let end = next_pages[i].unwrap_or(page_count);
+                let sub_pages: Vec<PdfPage> = pages
+                    .iter()
+                    .filter(|p| p.number >= start && p.number <= end)
+                    .cloned()
+                    .collect();
+
+                if sub_pages.is_empty() {
+                    refined.push(entry);
+                } else {
+                    // Run structure extraction on the sub-pages
+                    let extractor =
+                        StructureExtractor::new(StructureExtractorConfig::default());
+                    match extractor.extract(&sub_pages).await {
+                        Ok(sub_entries) if !sub_entries.is_empty() => {
+                            // If the first sub-entry has the same title as the
+                            // parent, skip it — the parent already represents
+                            // that content's starting point.
+                            let skip = if sub_entries
+                                .first()
+                                .map(|e| e.title.trim() == entry.title.trim())
+                                .unwrap_or(false)
+                            {
+                                1
+                            } else {
+                                0
+                            };
+
+                            for sub in &sub_entries[skip..] {
+                                let level_offset = entry.level;
+                                refined.push(
+                                    TocEntry::new(&sub.title, sub.level + level_offset)
+                                        .with_physical_page(sub.physical_page.unwrap_or(start))
+                                        .with_confidence(sub.confidence * 0.9),
+                                );
+                            }
+
+                            info!(
+                                "Refined '{}' into {} sub-entries",
+                                entry.title,
+                                sub_entries.len() - skip
+                            );
+                        }
+                        Ok(_) => {
+                            debug!("Sub-extraction produced no entries, keeping original");
+                            refined.push(entry);
+                        }
+                        Err(e) => {
+                            warn!("Sub-extraction failed for '{}': {}", entry.title, e);
+                            refined.push(entry);
+                        }
+                    }
+                }
+            } else {
+                refined.push(entry);
+            }
+        }
+
+        Ok(refined)
+    }
 }
 
 impl Default for TocProcessor {
@@ -255,6 +486,26 @@ impl Default for TocProcessor {
     }
 }
 
+/// Calculate how many pages an entry spans.
+///
+/// From its physical_page to the next entry's physical_page (or document end).
+fn entry_page_span(entry: &TocEntry, next_physical_page: Option<usize>, total_pages: usize) -> usize {
+    let start = entry.physical_page.unwrap_or(1);
+    let end = next_physical_page.unwrap_or(total_pages);
+    end.saturating_sub(start)
+}
+
+/// Estimate total tokens for the content covered by an entry.
+fn entry_token_count(entry: &TocEntry, pages: &[PdfPage]) -> usize {
+    let start = entry.physical_page.unwrap_or(1);
+    pages
+        .iter()
+        .filter(|p| p.number >= start)
+        .take(30) // cap at max_pages_per_entry default
+        .map(|p| p.token_count)
+        .sum()
+}
+
 #[cfg(test)]
 mod tests {
     use super::*;
diff --git a/rust/src/index/parse/toc/structure_extractor.rs b/rust/src/index/parse/toc/structure_extractor.rs
new file mode 100644
index 00000000..a6dd807d
--- /dev/null
+++ b/rust/src/index/parse/toc/structure_extractor.rs
@@ -0,0 +1,362 @@
+// Copyright (c) 2026 vectorless developers
+// SPDX-License-Identifier: Apache-2.0
+
+//! Structure extraction from documents without a Table of Contents.
+//!
+//! When a PDF has no TOC (or all TOC-based extraction modes failed), this
+//! module uses LLM to analyse page content and extract the document's
+//! hierarchical structure directly.
+
+use tracing::{debug, info, warn};
+
+use crate::config::LlmConfig;
+use crate::error::Result;
+use crate::index::parse::pdf::PdfPage;
+
+use super::types::TocEntry;
+use crate::llm::LlmClient;
+
+/// Configuration for structure extraction.
+#[derive(Debug, Clone)]
+pub struct StructureExtractorConfig {
+    /// Maximum estimated tokens per page group sent to LLM.
+    pub max_tokens_per_group: usize,
+
+    /// Number of overlap pages between consecutive groups.
+    pub overlap_pages: usize,
+
+    /// LLM configuration.
+    pub llm_config: LlmConfig,
+}
+
+impl Default for StructureExtractorConfig {
+    fn default() -> Self {
+        Self {
+            max_tokens_per_group: 20_000,
+            overlap_pages: 1,
+            llm_config: LlmConfig::default(),
+        }
+    }
+}
+
+/// A group of consecutive pages with their combined text.
+struct PageGroup {
+    /// Combined text with page markers: `<page_N>\n...\n</page_N>`.
+    text: String,
+    /// Start page number (1-based).
+    start_page: usize,
+    /// End page number (1-based, inclusive).
+    end_page: usize,
+}
+
+/// Extracts document structure from page content using LLM.
+///
+/// Used when a document has no Table of Contents, or when TOC-based extraction
+/// failed. Pages are grouped by token count and analysed sequentially: the
+/// first group generates an initial structure, subsequent groups append to it.
+pub struct StructureExtractor {
+    config: StructureExtractorConfig,
+    client: LlmClient,
+}
+
+impl StructureExtractor {
+    /// Create a new structure extractor.
+    pub fn new(config: StructureExtractorConfig) -> Self {
+        let client = LlmClient::new(config.llm_config.clone().into());
+        Self { config, client }
+    }
+
+    /// Create an extractor with default configuration.
+    pub fn with_defaults() -> Self {
+        Self::new(StructureExtractorConfig::default())
+    }
+
+    /// Extract hierarchical structure from all pages.
+    pub async fn extract(&self, pages: &[PdfPage]) -> Result<Vec<TocEntry>> {
+        if pages.is_empty() {
+            return Ok(Vec::new());
+        }
+
+        let groups = self.group_pages(pages);
+        info!(
+            "Extracting structure from {} pages in {} groups",
+            pages.len(),
+            groups.len()
+        );
+
+        let mut all_entries = Vec::new();
+        let page_count = pages.len();
+
+        for (i, group) in groups.iter().enumerate() {
+            let group_entries = if i == 0 {
+                self.generate_initial(group).await?
+            } else {
+                self.generate_continuation(group, &all_entries).await?
+            };
+
+            debug!(
+                "Group {}/{} (pages {}-{}): extracted {} entries",
+                i + 1,
+                groups.len(),
+                group.start_page,
+                group.end_page,
+                group_entries.len()
+            );
+
+            all_entries.extend(group_entries);
+        }
+
+        // Truncate physical_page values that exceed document length
+        for entry in &mut all_entries {
+            if let Some(p) = entry.physical_page {
+                if p > page_count {
+                    warn!(
+                        "Truncating out-of-range page {} for '{}'",
+                        p, entry.title
+                    );
+                    entry.physical_page = Some(page_count);
+                }
+            }
+        }
+
+        info!("Structure extraction complete: {} entries", all_entries.len());
+        Ok(all_entries)
+    }
+
+    /// Group pages by estimated token count.
+    ///
+    /// Each group stays under `max_tokens_per_group`. Consecutive groups
+    /// overlap by `overlap_pages` pages to avoid splitting content at
+    /// section boundaries.
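+    ///
+    /// For example, with the default 20k-token budget and 1-page overlap,
+    /// ten pages of ~3k tokens each become two groups: pages 1-6 and
+    /// pages 6-10, with page 6 shared between them.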
+    fn group_pages(&self, pages: &[PdfPage]) -> Vec<PageGroup> {
+        let mut groups = Vec::new();
+        let mut group_tokens = 0usize;
+        let mut group_pages_buf = Vec::new();
+
+        for (i, page) in pages.iter().enumerate() {
+            let new_tokens = group_tokens + page.token_count;
+
+            if new_tokens > self.config.max_tokens_per_group && !group_pages_buf.is_empty() {
+                // Finalise current group
+                let text = format_group_text(&group_pages_buf);
+                groups.push(PageGroup {
+                    text,
+                    start_page: group_pages_buf.first().unwrap().number,
+                    end_page: group_pages_buf.last().unwrap().number,
+                });
+
+                // Start new group with overlap
+                let overlap_start = i.saturating_sub(self.config.overlap_pages);
+                group_pages_buf = pages[overlap_start..=i].to_vec();
+                group_tokens = group_pages_buf.iter().map(|p| p.token_count).sum();
+            } else {
+                group_tokens = new_tokens;
+                group_pages_buf.push(page.clone());
+            }
+        }
+
+        // Final group
+        if !group_pages_buf.is_empty() {
+            let text = format_group_text(&group_pages_buf);
+            groups.push(PageGroup {
+                text,
+                start_page: group_pages_buf.first().unwrap().number,
+                end_page: group_pages_buf.last().unwrap().number,
+            });
+        }
+
+        groups
+    }
+
+    /// Generate initial structure from the first page group.
+    async fn generate_initial(&self, group: &PageGroup) -> Result<Vec<TocEntry>> {
+        let system = STRUCTURE_EXTRACTION_SYSTEM_PROMPT;
+        let user = format!(
+            r#"Analyze this document content and extract its hierarchical structure.
+
+Document content:
+{}
+
+Return a JSON array:
+[
+  {{"title": "Section Title", "level": 1, "physical_page": 1}},
+  {{"title": "Subsection", "level": 2, "physical_page": 3}},
+  ...
+]
+
+Rules:
+- "level" reflects the hierarchy (1 = chapter/top, 2 = section, 3 = subsection)
+- "physical_page" is the page number where the section begins
+- Preserve original titles as closely as possible
+- Only output the JSON array, no other text"#,
+            group.text
+        );
+
+        let sections: Vec<ExtractedSection> = self.client.complete_json(system, &user).await?;
+
+        Ok(sections
+            .into_iter()
+            .map(|s| {
+                TocEntry::new(s.title, s.level)
+                    .with_physical_page(s.physical_page)
+                    .with_confidence(0.7)
+            })
+            .collect())
+    }
+
+    /// Continue structure extraction for a subsequent group.
+    ///
+    /// Passes previously extracted entries as context so the LLM can
+    /// continue the structure rather than restart.
+    async fn generate_continuation(
+        &self,
+        group: &PageGroup,
+        previous: &[TocEntry],
+    ) -> Result<Vec<TocEntry>> {
+        let system = STRUCTURE_EXTRACTION_SYSTEM_PROMPT;
+
+        // Summarise previous entries as context
+        let prev_summary = previous
+            .iter()
+            .rev()
+            .take(10)
+            .rev()
+            .map(|e| {
+                format!(
+                    "  {{\"title\": \"{}\", \"level\": {}, \"physical_page\": {}}}",
+                    e.title,
+                    e.level,
+                    e.physical_page.unwrap_or(0)
+                )
+            })
+            .collect::<Vec<_>>()
+            .join(",\n");
+
+        let user = format!(
+            r#"Previously extracted structure:
+[
+{}
+]
+
+Continue extracting structure from these pages:
+{}
+
+Return ONLY the NEW entries (do not repeat previous ones):
+[
+  {{"title": "...", "level": N, "physical_page": M}},
+  ...
+]
+
+If no new structural elements are found, return: []"#,
+            prev_summary, group.text
+        );
+
+        let sections: Vec<ExtractedSection> = self.client.complete_json(system, &user).await?;
+
+        Ok(sections
+            .into_iter()
+            .map(|s| {
+                TocEntry::new(s.title, s.level)
+                    .with_physical_page(s.physical_page)
+                    .with_confidence(0.7)
+            })
+            .collect())
+    }
+}
+
+/// Format pages into tagged text for LLM consumption.
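+///
+/// E.g. two short pages come out as:
+///
+/// ```text
+/// <page_1>
+/// Hello
+/// </page_1>
+///
+/// <page_2>
+/// World
+/// </page_2>
+/// ```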
+fn format_group_text(pages: &[PdfPage]) -> String {
+    pages
+        .iter()
+        .map(|p| {
+            // Truncate individual page text if very long
+            let text = if p.text.len() > 3000 {
+                &p.text[..3000]
+            } else {
+                &p.text
+            };
+            format!("<page_{}>\n{}\n</page_{}>", p.number, text, p.number)
+        })
+        .collect::<Vec<_>>()
+        .join("\n\n")
+}
+
+const STRUCTURE_EXTRACTION_SYSTEM_PROMPT: &str = r#"You are a document structure extraction expert. Your task is to analyze document content and extract its hierarchical structure (chapters, sections, subsections).
+
+For each structural element you find, provide:
+- title: The section title exactly as it appears
+- level: The hierarchy level (1 = chapter/top level, 2 = section, 3 = subsection)
+- physical_page: The page number where this section begins
+
+Important:
+- Focus on genuine structural elements (chapters, sections), not paragraph topics
+- Do NOT include the abstract, summary, or bibliography as structural elements unless they are major sections
+- Be conservative: fewer high-quality entries are better than many low-quality ones"#;
+
+/// LLM response type for structure extraction.
+#[derive(serde::Deserialize)]
+struct ExtractedSection {
+    title: String,
+    level: usize,
+    physical_page: usize,
+}
+
+#[cfg(test)]
+mod tests {
+    use super::*;
+
+    #[test]
+    fn test_default_config() {
+        let config = StructureExtractorConfig::default();
+        assert_eq!(config.max_tokens_per_group, 20_000);
+        assert_eq!(config.overlap_pages, 1);
+    }
+
+    #[test]
+    fn test_group_pages_single_group() {
+        let extractor = StructureExtractor::with_defaults();
+
+        let pages: Vec<PdfPage> = (1..=5)
+            .map(|i| PdfPage::new(i, format!("Page {} content", i)))
+            .collect();
+
+        let groups = extractor.group_pages(&pages);
+        assert_eq!(groups.len(), 1);
+        assert_eq!(groups[0].start_page, 1);
+        assert_eq!(groups[0].end_page, 5);
+    }
+
+    #[test]
+    fn test_group_pages_multiple_groups() {
+        let config = StructureExtractorConfig {
+            max_tokens_per_group: 50,
+            overlap_pages: 1,
+            ..Default::default()
+        };
+        let extractor = StructureExtractor::new(config);
+
+        // Create pages with enough text to span multiple groups
+        let pages: Vec<PdfPage> = (1..=10)
+            .map(|i| {
+                let text = format!("Page {} content. This is a longer text to use more tokens. ", i).repeat(10);
+                PdfPage::new(i, text)
+            })
+            .collect();
+
+        let groups = extractor.group_pages(&pages);
+        assert!(groups.len() > 1, "Expected multiple groups, got {}", groups.len());
+    }
+
+    #[test]
+    fn test_format_group_text() {
+        let pages = vec![
+            PdfPage::new(1, "Hello"),
+            PdfPage::new(2, "World"),
+        ];
+        let text = format_group_text(&pages);
+        assert!(text.contains("<page_1>"));
+        assert!(text.contains("</page_2>"));
+        assert!(text.contains("Hello"));
+        assert!(text.contains("World"));
+    }
+}
diff --git a/rust/src/index/parse/toc/types.rs b/rust/src/index/parse/toc/types.rs
index 9465311b..0438c0d3 100644
--- a/rust/src/index/parse/toc/types.rs
+++ b/rust/src/index/parse/toc/types.rs
@@ -266,6 +266,33 @@ impl VerificationReport {
     }
 }
 
+/// Processing mode for the TOC extraction pipeline.
+///
+/// Modes are ordered by quality: higher modes produce more accurate results
+/// when they succeed, but can degrade to lower modes on failure.
+#[derive(Debug, Clone, Copy, PartialEq, Eq)]
+pub enum ProcessingMode {
+    /// TOC found with page numbers. Highest quality path.
+    TocWithPageNumbers,
+    /// TOC found without page numbers, or page-number accuracy was too low.
+    TocWithoutPageNumbers,
+    /// No TOC, or all TOC-based modes failed. LLM-driven structure extraction.
+    NoToc,
+}
+
+impl ProcessingMode {
+    /// Degrade to the next lower quality mode.
+    ///
+    /// Returns `None` if already at the lowest mode (`NoToc`).
+    pub fn degrade(self) -> Option<Self> {
+        match self {
+            Self::TocWithPageNumbers => Some(Self::TocWithoutPageNumbers),
+            Self::TocWithoutPageNumbers => Some(Self::NoToc),
+            Self::NoToc => None,
+        }
+    }
+}
+
 #[cfg(test)]
 mod tests {
     use super::*;
@@ -307,4 +334,17 @@ mod tests {
             "Title not found on page"
         );
     }
+
+    #[test]
+    fn test_processing_mode_degrade() {
+        assert_eq!(
+            ProcessingMode::TocWithPageNumbers.degrade(),
+            Some(ProcessingMode::TocWithoutPageNumbers)
+        );
+        assert_eq!(
+            ProcessingMode::TocWithoutPageNumbers.degrade(),
+            Some(ProcessingMode::NoToc)
+        );
+        assert_eq!(ProcessingMode::NoToc.degrade(), None);
+    }
 }

From 6ff5abd7986993493db93f9dc3427a7c8f74a2e3 Mon Sep 17 00:00:00 2001
From: zTgx <747674262@qq.com>
Date: Mon, 13 Apr 2026 13:44:38 +0800
Subject: [PATCH 2/5] feat(toc): implement concurrent LLM verification for TOC
 entries

- Replace sequential LLM calls with concurrent processing using
  futures::join_all for better performance
- Add concurrent page assignment verification in PageAssigner
- Implement concurrent TOC entry verification in IndexVerifier
- Add concurrent index repair functionality in IndexRepairer
- Refactor methods to static versions for concurrent use
- Improve performance of oversized entry refinement in TocProcessor
---
 rust/src/index/parse/toc/assigner.rs  | 154 ++++++++++++++----------
 rust/src/index/parse/toc/processor.rs | 104 +++++++++--------
 rust/src/index/parse/toc/repairer.rs  | 102 +++++++++++------
 rust/src/index/parse/toc/verifier.rs  |  95 +++++++++-------
 4 files changed, 280 insertions(+), 175 deletions(-)

diff --git a/rust/src/index/parse/toc/assigner.rs b/rust/src/index/parse/toc/assigner.rs
index fc97c420..eefa3769 100644
--- a/rust/src/index/parse/toc/assigner.rs
+++ b/rust/src/index/parse/toc/assigner.rs
@@ -4,6 +4,7 @@
 //! Page assigner - assigns physical page numbers to TOC entries.
 
 use std::collections::HashMap;
+use futures::future::join_all;
 use tracing::{debug, info};
 
 use crate::config::LlmConfig;
@@ -121,7 +122,7 @@ impl PageAssigner {
             .collect()
     }
 
-    /// Calculate page offset by verifying anchors.
+    /// Calculate page offset by verifying anchors concurrently.
     async fn calculate_offset(
         &self,
         anchors: Vec<&TocEntry>,
@@ -132,26 +133,41 @@ impl PageAssigner {
         }
 
         let anchor_count = anchors.len();
-        let mut verified_offsets: Vec<(i32, bool)> = Vec::new();
-
-        for anchor in anchors {
-            let toc_page = anchor.toc_page.unwrap();
-
-            // Find the physical page where this title appears
-            if let Some(physical) = self
-                .locate_title_in_range(anchor.title.as_str(), pages, toc_page)
-                .await?
- { - let offset = physical as i32 - toc_page as i32; - verified_offsets.push((offset, true)); - debug!( - "Anchor '{}' found: toc={}, physical={}, offset={}", - anchor.title, toc_page, physical, offset - ); - } else { - verified_offsets.push((0, false)); - } - } + + // Verify all anchors concurrently + let client = self.client.clone(); + let pages_owned = pages.to_vec(); + let futures: Vec<_> = anchors + .into_iter() + .map(|anchor| { + let title = anchor.title.clone(); + let toc_page = anchor.toc_page.unwrap(); + let client = client.clone(); + let pages = pages_owned.clone(); + + async move { + let range_pages = Self::pages_around(&pages, toc_page, 3); + if range_pages.is_empty() { + return (0, false); + } + + let content = Self::format_range_pages(&range_pages); + match Self::locate_with_client(&client, &title, &content).await { + Ok(Some(physical)) => { + let offset = physical as i32 - toc_page as i32; + debug!( + "Anchor '{}' found: toc={}, physical={}, offset={}", + title, toc_page, physical, offset + ); + (offset, true) + } + _ => (0, false), + } + } + }) + .collect(); + + let verified_offsets = join_all(futures).await; // Calculate the mode (most common offset) let successful: Vec<_> = verified_offsets @@ -164,7 +180,7 @@ impl PageAssigner { return Ok(PageOffset::new(0, 0, 0.0)); } - let mode = self.calculate_mode(&successful); + let mode = Self::calculate_mode_static(&successful); let sample_count = successful.len(); let confidence = sample_count as f32 / anchor_count as f32; @@ -173,6 +189,11 @@ impl PageAssigner { /// Calculate mode of offset values. fn calculate_mode(&self, values: &[i32]) -> i32 { + Self::calculate_mode_static(values) + } + + /// Static version for use in concurrent contexts. + fn calculate_mode_static(values: &[i32]) -> i32 { let mut counts: HashMap = HashMap::new(); for &v in values { *counts.entry(v).or_insert(0) += 1; @@ -184,25 +205,18 @@ impl PageAssigner { .unwrap_or(0) } - /// Locate a title in a range of pages using LLM. - async fn locate_title_in_range( - &self, - title: &str, - pages: &[PdfPage], - near_page: usize, - ) -> Result> { - // Search in a range around the expected page - let start = (near_page.saturating_sub(3)).max(1); - let end = (near_page + 3).min(pages.len()); - - let range_pages: Vec<_> = (start..=end).filter_map(|i| pages.get(i - 1)).collect(); - - if range_pages.is_empty() { - return Ok(None); - } + /// Collect pages around a center page number. + fn pages_around(pages: &[PdfPage], center: usize, range: usize) -> Vec { + let start = center.saturating_sub(range).max(1); + let end = (center + range).min(pages.len()); + (start..=end) + .filter_map(|i| pages.get(i - 1).cloned()) + .collect() + } - // Use LLM to find the exact page - let content = range_pages + /// Format pages into tagged text for LLM. + fn format_range_pages(pages: &[PdfPage]) -> String { + pages .iter() .map(|p| { format!( @@ -213,8 +227,15 @@ impl PageAssigner { ) }) .collect::>() - .join("\n\n"); + .join("\n\n") + } + /// Locate a title in pre-formatted content using LLM (static, for concurrent use). + async fn locate_with_client( + client: &LlmClient, + title: &str, + content: &str, + ) -> Result> { let system = "You are a document analysis assistant. 
         let user = format!(
             r#"Find which page contains the section titled: "{}"

{}

Reply in JSON format:
{{"page": <page number, or null if not found>}}"#,
             title, content
         );

         #[derive(serde::Deserialize)]
         struct LocateResult {
             page: Option<usize>,
         }

-        let result: LocateResult = self.client.complete_json(system, &user).await?;
+        let result: LocateResult = client.complete_json(system, &user).await?;
 
         Ok(result.page)
     }
 
-    /// Assign pages using LLM for each entry.
+    /// Assign pages using LLM for each entry (concurrently).
     async fn assign_with_llm(&self, entries: &mut [TocEntry], pages: &[PdfPage]) -> Result<()> {
         info!("Assigning pages using LLM positioning");
 
-        // Group pages for efficient processing
-        let page_groups = self.group_pages(pages, 5);
+        let client = self.client.clone();
+        let pages_owned = pages.to_vec();
 
-        for entry in entries.iter_mut() {
-            let physical = self
-                .locate_title_in_groups(entry.title.as_str(), &page_groups)
-                .await?;
+        // Launch all entry searches concurrently
+        let futures: Vec<_> = entries
+            .iter()
+            .map(|entry| {
+                let title = entry.title.clone();
+                let client = client.clone();
+                let pages = pages_owned.clone();
+
+                async move {
+                    let groups = Self::group_pages_owned(&pages, 5);
+                    Self::locate_title_in_groups_static(&client, &title, &groups).await
+                }
+            })
+            .collect();
+
+        let results = join_all(futures).await;
+
+        // Write results back
+        for (entry, result) in entries.iter_mut().zip(results.into_iter()) {
+            let physical = result?;
             entry.physical_page = physical;
             entry.confidence = if physical.is_some() { 0.8 } else { 0.3 };
         }
 
         Ok(())
     }
 
-    /// Group pages for batch processing.
-    fn group_pages<'a>(&self, pages: &'a [PdfPage], group_size: usize) -> Vec<Vec<&'a PdfPage>> {
+    /// Group owned pages for batch processing.
+    fn group_pages_owned(pages: &[PdfPage], group_size: usize) -> Vec<Vec<PdfPage>> {
         pages
             .chunks(group_size)
-            .map(|chunk| chunk.iter().collect())
+            .map(|chunk| chunk.to_vec())
             .collect()
     }
 
-    /// Locate a title across page groups.
-    async fn locate_title_in_groups(
-        &self,
+    /// Locate a title across page groups (static, for concurrent use).
+    ///
+    /// Searches groups sequentially (early return on first match),
+    /// but multiple title searches can run concurrently.
+    async fn locate_title_in_groups_static(
+        client: &LlmClient,
         title: &str,
-        groups: &[Vec<&PdfPage>],
+        groups: &[Vec<PdfPage>],
     ) -> Result<Option<usize>> {
         let system = "You are a document analysis assistant. Find which page contains a specific section title.";

             page: Option<usize>,
         }
 
-        let result: SearchResult = self.client.complete_json(system, &user).await?;
+        let result: SearchResult = client.complete_json(system, &user).await?;
 
         if result.found {
             return Ok(result.page);
diff --git a/rust/src/index/parse/toc/processor.rs b/rust/src/index/parse/toc/processor.rs
index 978ba4e3..b2dbc1cd 100644
--- a/rust/src/index/parse/toc/processor.rs
+++ b/rust/src/index/parse/toc/processor.rs
@@ -7,6 +7,7 @@
 //! degradation: if one mode fails verification, it falls back to a lower-quality
 //! but more reliable mode.
+use futures::future::join_all;
 use tracing::{debug, info, warn};
 
 use crate::error::Result;
@@ -393,30 +394,24 @@ impl TocProcessor {
 
         let page_count = pages.len();
 
-        // Pre-compute next-entry page numbers before consuming entries
+        // Pre-compute next-entry page numbers and classify entries
         let next_pages: Vec<Option<usize>> = entries
             .iter()
             .enumerate()
-            .map(|(i, _)| {
-                entries.get(i + 1).and_then(|e| e.physical_page)
-            })
+            .map(|(i, _)| entries.get(i + 1).and_then(|e| e.physical_page))
             .collect();
 
-        let mut refined = Vec::with_capacity(entries.len());
-
-        for (i, entry) in entries.into_iter().enumerate() {
-            let span = entry_page_span(&entry, next_pages[i], page_count);
-            let tokens = entry_token_count(&entry, pages);
-
-            if span > self.config.max_pages_per_entry
-                && tokens > self.config.max_tokens_per_entry
-            {
-                debug!(
-                    "Refining oversized entry '{}' ({} pages, ~{} tokens)",
-                    entry.title, span, tokens
-                );
-
-                // Extract sub-pages covered by this entry
+        // Identify oversized entries and launch extractions concurrently
+        let oversized_futures: Vec<_> = entries
+            .iter()
+            .enumerate()
+            .filter(|(i, entry)| {
+                let span = entry_page_span(entry, next_pages[*i], page_count);
+                let tokens = entry_token_count(entry, pages);
+                span > self.config.max_pages_per_entry
+                    && tokens > self.config.max_tokens_per_entry
+            })
+            .map(|(i, entry)| {
                 let start = entry.physical_page.unwrap_or(1);
                 let end = next_pages[i].unwrap_or(page_count);
                 let sub_pages: Vec<PdfPage> = pages
                     .iter()
                     .filter(|p| p.number >= start && p.number <= end)
                     .cloned()
                     .collect();
 
-                if sub_pages.is_empty() {
-                    refined.push(entry);
-                } else {
-                    // Run structure extraction on the sub-pages
+                let entry_title = entry.title.clone();
+                let entry_level = entry.level;
+
+                async move {
+                    if sub_pages.is_empty() {
+                        return (i, Vec::new());
+                    }
+                    debug!(
+                        "Refining oversized entry '{}' (pages {}-{})",
+                        entry_title, start, end
+                    );
                     let extractor =
                         StructureExtractor::new(StructureExtractorConfig::default());
                     match extractor.extract(&sub_pages).await {
-                        Ok(sub_entries) if !sub_entries.is_empty() => {
-                            // If the first sub-entry has the same title as the
-                            // parent, skip it — the parent already represents
-                            // that content's starting point.
+                        Ok(sub_entries) => {
                             let skip = if sub_entries
                                 .first()
-                                .map(|e| e.title.trim() == entry.title.trim())
+                                .map(|e| e.title.trim() == entry_title.trim())
                                 .unwrap_or(false)
                             {
                                 1
                             } else {
                                 0
                             };
 
-                            for sub in &sub_entries[skip..] {
-                                let level_offset = entry.level;
-                                refined.push(
-                                    TocEntry::new(&sub.title, sub.level + level_offset)
+                            let refined: Vec<TocEntry> = sub_entries[skip..]
+                                .iter()
+                                .map(|sub| {
+                                    TocEntry::new(&sub.title, sub.level + entry_level)
                                         .with_physical_page(sub.physical_page.unwrap_or(start))
-                                        .with_confidence(sub.confidence * 0.9),
-                                );
-                            }
+                                        .with_confidence(sub.confidence * 0.9)
+                                })
+                                .collect();
 
                             info!(
                                 "Refined '{}' into {} sub-entries",
-                                entry.title,
-                                sub_entries.len() - skip
+                                entry_title,
+                                refined.len()
                             );
-                        }
-                        Ok(_) => {
-                            debug!("Sub-extraction produced no entries, keeping original");
-                            refined.push(entry);
+                            (i, refined)
                         }
                         Err(e) => {
-                            warn!("Sub-extraction failed for '{}': {}", entry.title, e);
-                            refined.push(entry);
+                            warn!("Sub-extraction failed for '{}': {}", entry_title, e);
+                            (i, Vec::new())
                         }
                     }
                 }
-            } else {
-                refined.push(entry);
+            })
+            .collect();
+
+        let extraction_results = join_all(oversized_futures).await;
+
+        // Build a lookup from index → refined sub-entries
+        let mut refined_map = std::collections::HashMap::new();
+        for (idx, sub_entries) in extraction_results {
+            if !sub_entries.is_empty() {
+                refined_map.insert(idx, sub_entries);
+            }
+        }
+
+        // Assemble final output
+        let mut result = Vec::with_capacity(entries.len() * 2);
+        for (i, entry) in entries.into_iter().enumerate() {
+            if let Some(sub_entries) = refined_map.remove(&i) {
+                result.extend(sub_entries);
+            } else {
+                result.push(entry);
             }
         }
 
-        Ok(refined)
+        Ok(result)
     }
 }
diff --git a/rust/src/index/parse/toc/repairer.rs b/rust/src/index/parse/toc/repairer.rs
index 4062f215..70498782 100644
--- a/rust/src/index/parse/toc/repairer.rs
+++ b/rust/src/index/parse/toc/repairer.rs
@@ -3,6 +3,7 @@
 
 //! Index repairer - fixes incorrect TOC entry page assignments.
 
+use futures::future::join_all;
 use tracing::{debug, info};
 
 use crate::config::LlmConfig;
@@ -54,7 +55,7 @@ impl IndexRepairer {
         Self::new(RepairerConfig::default())
     }
 
-    /// Repair incorrect entries.
+    /// Repair incorrect entries concurrently.
     pub async fn repair(
         &self,
         entries: &mut [TocEntry],
@@ -66,38 +67,67 @@ impl IndexRepairer {
         }
 
         info!("Repairing {} incorrect entries", errors.len());
-        let mut repaired_count = 0;
-
-        for error in errors {
-            if error.index >= entries.len() {
-                continue;
-            }
-            let entry = &mut entries[error.index];
-            let expected_page = error.expected_page;
-
-            // Search around the expected page
-            let start = expected_page
-                .saturating_sub(self.config.search_range)
-                .max(1);
-            let end = (expected_page + self.config.search_range).min(pages.len());
-
-            if let Some(correct_page) = self
-                .find_correct_page(&entry.title, pages, start..=end)
-                .await?
- { - debug!( - "Repaired '{}' : page {} → {}", - entry.title, expected_page, correct_page - ); - entry.physical_page = Some(correct_page); - entry.confidence = 0.9; - repaired_count += 1; - } else { - debug!( - "Could not repair '{}' (searched pages {}-{})", - entry.title, start, end - ); + // Collect repair tasks (don't borrow entries mutably yet) + let client = self.client.clone(); + let pages_owned = pages.to_vec(); + let search_range = self.config.search_range; + + let tasks: Vec<_> = errors + .iter() + .filter(|error| error.index < entries.len()) + .map(|error| { + let title = entries[error.index].title.clone(); + let expected_page = error.expected_page; + let client = client.clone(); + let pages = pages_owned.clone(); + + async move { + let start = expected_page.saturating_sub(search_range).max(1); + let end = (expected_page + search_range).min(pages.len()); + + let result = Self::find_correct_page_static( + &client, + &title, + &pages, + start..=end, + ) + .await; + + (title, expected_page, result) + } + }) + .collect(); + + let results = join_all(tasks).await; + + // Apply repairs + let mut repaired_count = 0; + for (title, expected_page, result) in results { + match result { + Ok(Some(correct_page)) => { + // Find the corresponding error entry and fix it + if let Some(error) = errors.iter().find(|e| e.title == title) { + if error.index < entries.len() { + debug!( + "Repaired '{}' : page {} → {}", + title, expected_page, correct_page + ); + entries[error.index].physical_page = Some(correct_page); + entries[error.index].confidence = 0.9; + repaired_count += 1; + } + } + } + Ok(None) => { + debug!( + "Could not repair '{}' (searched around page {})", + title, expected_page + ); + } + Err(e) => { + debug!("Repair failed for '{}': {}", title, e); + } } } @@ -105,9 +135,9 @@ impl IndexRepairer { Ok(repaired_count) } - /// Find the correct page for a title within a range. - async fn find_correct_page( - &self, + /// Find the correct page for a title within a range (static, for concurrent use). + async fn find_correct_page_static( + client: &LlmClient, title: &str, pages: &[PdfPage], range: std::ops::RangeInclusive, @@ -152,7 +182,7 @@ Reply in JSON format: page: Option, } - let result: FindResult = self.client.complete_json(system, &user).await?; + let result: FindResult = client.complete_json(system, &user).await?; if result.found { Ok(result.page) diff --git a/rust/src/index/parse/toc/verifier.rs b/rust/src/index/parse/toc/verifier.rs index d0c3883e..42186a09 100644 --- a/rust/src/index/parse/toc/verifier.rs +++ b/rust/src/index/parse/toc/verifier.rs @@ -3,6 +3,7 @@ //! Index verifier - verifies TOC entry page assignments. +use futures::future::join_all; use rand::seq::SliceRandom; use tracing::{debug, info}; @@ -55,6 +56,8 @@ impl IndexVerifier { } /// Verify TOC entries against PDF pages. + /// + /// All sample entries are verified concurrently via LLM calls. 
     pub async fn verify(
         &self,
         entries: &[TocEntry],
@@ -64,38 +67,58 @@ impl IndexVerifier {
             return Ok(VerificationReport::all_correct(0));
         }
 
-        // Select sample
         let sample = self.select_sample(entries);
 
-        // Verify each sample entry
+        // Launch all verification checks concurrently
+        let client = self.client.clone();
+        let futures: Vec<_> = sample
+            .iter()
+            .map(|(index, entry)| {
+                let index = *index;
+                let title = entry.title.clone();
+                let physical_page = entry.physical_page;
+                let client = client.clone();
+                let pages = pages.to_vec();
+
+                async move {
+                    match physical_page {
+                        Some(page) => {
+                            let result =
+                                Self::verify_entry_with_client(&client, &title, page, &pages).await;
+                            (index, title, page, result)
+                        }
+                        None => (
+                            index,
+                            title,
+                            0,
+                            Ok(Err(ErrorType::PageOutOfRange)),
+                        ),
+                    }
+                }
+            })
+            .collect();
+
+        let results = join_all(futures).await;
+
+        // Aggregate results
+        let total = results.len();
         let mut errors = Vec::new();
         let mut correct = 0;
 
-        for (index, entry) in &sample {
-            if let Some(physical_page) = entry.physical_page {
-                match self.verify_entry(entry, physical_page, pages).await? {
-                    Ok(()) => correct += 1,
-                    Err(error_type) => {
-                        errors.push(VerificationError::new(
-                            *index,
-                            entry.title.clone(),
-                            physical_page,
-                            error_type,
-                        ));
-                    }
+        for (index, title, page, result) in results {
+            match result {
+                Ok(Ok(())) => correct += 1,
+                Ok(Err(error_type)) => {
+                    errors.push(VerificationError::new(index, title, page, error_type));
+                }
+                Err(e) => {
+                    debug!("Verification LLM call failed: {}", e);
+                    errors.push(VerificationError::new(index, title, page, ErrorType::TitleNotFound));
                 }
-            } else {
-                // No physical page assigned
-                errors.push(VerificationError::new(
-                    *index,
-                    entry.title.clone(),
-                    0,
-                    ErrorType::PageOutOfRange,
-                ));
             }
         }
 
-        let report = VerificationReport::new(sample.len(), correct, errors);
+        let report = VerificationReport::new(total, correct, errors);
         info!(
             "Verification complete: {}/{} correct ({:.1}% accuracy)",
             report.correct,
@@ -126,28 +149,23 @@ impl IndexVerifier {
         }
     }
 
-    /// Verify a single entry.
-    async fn verify_entry(
-        &self,
-        entry: &TocEntry,
+    /// Verify a single entry using a cloned client (for concurrent use).
+    async fn verify_entry_with_client(
+        client: &LlmClient,
+        title: &str,
         physical_page: usize,
         pages: &[PdfPage],
     ) -> Result<Result<(), ErrorType>> {
-        // Check page bounds
         if physical_page == 0 || physical_page > pages.len() {
             return Ok(Err(ErrorType::PageOutOfRange));
         }
 
         let page = &pages[physical_page - 1];
 
-        // Use LLM to check if title appears on this page
-        let found = self.check_title_on_page(&entry.title, &page.text).await?;
+        let found = Self::check_title_on_page_with_client(client, title, &page.text).await?;
 
         if !found {
-            debug!(
-                "Title '{}' not found on page {}",
-                entry.title, physical_page
-            );
+            debug!("Title '{}' not found on page {}", title, physical_page);
             return Ok(Err(ErrorType::TitleNotFound));
         }
 
     }
 
     /// Check if a title appears on a page using LLM.
-    async fn check_title_on_page(&self, title: &str, page_text: &str) -> Result<bool> {
+    async fn check_title_on_page_with_client(
+        client: &LlmClient,
+        title: &str,
+        page_text: &str,
+    ) -> Result<bool> {
         let system = "You are a document analysis assistant. Determine if a section title appears in the given text.";
-        // Truncate page text if too long
         let text = if page_text.len() > 1000 {
             &page_text[..1000]
         } else {
             page_text
         };

             found: bool,
         }
 
-        let result: CheckResult = self.client.complete_json(system, &user).await?;
+        let result: CheckResult = client.complete_json(system, &user).await?;
 
         Ok(result.found)
     }

From e8c10b0b9d41d41bd666a867359de6fb7f97db26 Mon Sep 17 00:00:00 2001
From: zTgx <747674262@qq.com>
Date: Mon, 13 Apr 2026 14:09:00 +0800
Subject: [PATCH 3/5] feat: add PDF indexing example with comprehensive
 documentation

- Create index_pdf.rs example demonstrating PDF indexing capabilities
- Implement automatic PDF format detection and hierarchical document parsing
- Add support for environment variable configuration for LLM settings
- Include detailed usage instructions with command-line examples
- Integrate error handling and process exit codes for invalid inputs
- Provide comprehensive metrics output including timing and processing stats
- Add automatic workspace cleanup after indexing operations
---
 rust/examples/index_pdf.rs     | 100 +++++++++++++++++++++++++++++++++
 samples/Docker_Cheat_Sheet.pdf | Bin 0 -> 25326 bytes
 2 files changed, 100 insertions(+)
 create mode 100644 rust/examples/index_pdf.rs
 create mode 100755 samples/Docker_Cheat_Sheet.pdf

diff --git a/rust/examples/index_pdf.rs b/rust/examples/index_pdf.rs
new file mode 100644
index 00000000..244ca6a2
--- /dev/null
+++ b/rust/examples/index_pdf.rs
@@ -0,0 +1,100 @@
+// Copyright (c) 2026 vectorless developers
+// SPDX-License-Identifier: Apache-2.0
+
+//! PDF indexing example — index a PDF document via the vectorless engine.
+//!
+//! ```bash
+//! # Using environment variables for LLM config:
+//! LLM_API_KEY=sk-xxx LLM_MODEL=google/gemini-3-flash-preview \
+//! cargo run --example index_pdf -- ../samples/Docker_Cheat_Sheet.pdf
+//!
+//! # Or with defaults (edit the code to set your key/endpoint):
+//! cargo run --example index_pdf -- ../samples/Docker_Cheat_Sheet.pdf
+//! ```
+
+use std::path::Path;
+
+use vectorless::{EngineBuilder, IndexContext};
+
+#[tokio::main]
+async fn main() -> vectorless::Result<()> {
+    let args: Vec<String> = std::env::args().collect();
+
+    let pdf_path = args.get(1).map(|s| s.as_str()).unwrap_or_else(|| {
+        eprintln!("Usage: cargo run --example index_pdf -- <path-to-pdf>");
+        std::process::exit(1);
+    });
+
+    if !Path::new(pdf_path).exists() {
+        eprintln!("Error: file not found: {}", pdf_path);
+        std::process::exit(1);
+    }
+
+    println!("=== Indexing PDF: {} ===\n", pdf_path);
+
+    // Build engine with LLM configuration from environment or defaults.
+    // Adjust the defaults below to match your setup.
+    let api_key = std::env::var("LLM_API_KEY")
+        .unwrap_or_else(|_| "sk-or-v1-...".to_string());
+    let model = std::env::var("LLM_MODEL")
+        .unwrap_or_else(|_| "google/gemini-3-flash-preview".to_string());
+    let endpoint = std::env::var("LLM_ENDPOINT")
+        .unwrap_or_else(|_| "http://localhost:4000/api/v1".to_string());
+
+    let engine = EngineBuilder::new()
+        .with_workspace("./workspace_pdf_example")
+        .with_key(&api_key)
+        .with_model(&model)
+        .with_endpoint(&endpoint)
+        .build()
+        .await
+        .map_err(|e| vectorless::Error::Config(e.to_string()))?;
+
+    // Index the PDF — format is auto-detected from the .pdf extension.
+    // The engine will:
+    //   1. Extract text from every page
+    //   2. Detect and parse the Table of Contents
+    //   3. Build a hierarchical document tree
+    //   4. Generate summaries for each node (LLM)
+    //   5. Build a reasoning index for retrieval
Build a reasoning index for retrieval + let result = engine + .index(IndexContext::from_path(pdf_path)) + .await?; + + println!( + "Indexed: {}, Failed: {}", + result.items.len(), + result.failed.len() + ); + + for item in &result.items { + println!("\n--- {} ---", item.name); + println!("doc_id: {}", item.doc_id); + println!("format: {:?}", item.format); + + if let Some(metrics) = &item.metrics { + println!("\nMetrics:"); + println!(" total time: {}ms", metrics.total_time_ms()); + println!(" parse: {}ms", metrics.parse_time_ms); + println!(" build: {}ms", metrics.build_time_ms); + println!(" enhance: {}ms", metrics.enhance_time_ms); + println!(" nodes: {}", metrics.nodes_processed); + println!(" summaries: {}", metrics.summaries_generated); + println!(" llm calls: {}", metrics.llm_calls); + println!(" tokens: {}", metrics.total_tokens_generated); + println!(" topics: {}", metrics.topics_indexed); + println!(" keywords: {}", metrics.keywords_indexed); + } + } + + for fail in &result.failed { + eprintln!("FAILED: {} — {}", fail.source, fail.error); + } + + // Cleanup workspace + for doc in engine.list().await? { + engine.remove(&doc.id).await?; + } + + Ok(()) +} diff --git a/samples/Docker_Cheat_Sheet.pdf b/samples/Docker_Cheat_Sheet.pdf new file mode 100755 index 0000000000000000000000000000000000000000..0768f1c3eb59a04f87de07c384f32795f187f210 GIT binary patch literal 25326
[25,326 bytes of base85-encoded binary patch data omitted]
From: zTgx <747674262@qq.com> Date: Mon, 13 Apr 2026 14:25:23 +0800 Subject: [PATCH 4/5] feat(examples): add environment variable support for LLM configuration Add support for configuring LLM settings through environment variables (LLM_API_KEY, LLM_MODEL, LLM_ENDPOINT) that override config file values. Update all examples to demonstrate both the environment-variable and the default config-file approach, with updated documentation. All example files now share a consistent configuration method, so settings can be changed at runtime without editing source code or config files. Related fixes to workspace cleanup and metrics display formatting are included as part of the refactoring.
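The override pattern each example adopts is small enough to sketch on its own. The helper below is illustrative rather than part of this patch; it assumes only the `EngineBuilder` setters (`with_key`, `with_model`, `with_endpoint`) that the examples already call:

```rust
use vectorless::EngineBuilder;

// Illustrative helper (not in this patch): apply LLM_* environment
// overrides to an already-configured builder. Because these setters run
// after the config file (or defaults) have been applied, values from the
// environment always win.
fn apply_env_overrides(mut builder: EngineBuilder) -> EngineBuilder {
    if let Ok(key) = std::env::var("LLM_API_KEY") {
        builder = builder.with_key(&key);
    }
    if let Ok(model) = std::env::var("LLM_MODEL") {
        builder = builder.with_model(&model);
    }
    if let Ok(endpoint) = std::env::var("LLM_ENDPOINT") {
        builder = builder.with_endpoint(&endpoint);
    }
    builder
}
```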
--- rust/examples/advanced.rs | 25 +++++++++++++----- rust/examples/events.rs | 19 ++++++++++++-- rust/examples/flow.rs | 30 +++++++++++++++------- rust/examples/graph.rs | 18 ++++++++++--- rust/examples/index_incremental.rs | 22 +++++++++++++--- rust/examples/index_single.rs | 41 ++++++++++++++++-------------- rust/examples/indexing.rs | 22 +++++++++++++--- 7 files changed, 130 insertions(+), 47 deletions(-) diff --git a/rust/examples/advanced.rs b/rust/examples/advanced.rs index a5c367b4..602fa435 100644 --- a/rust/examples/advanced.rs +++ b/rust/examples/advanced.rs @@ -9,10 +9,10 @@ //! # Usage //! //! ```bash -//! # First, copy the example config and edit it -//! cp config.toml ./my_vectorless.toml -//! # Edit my_vectorless.toml to customize settings +//! # Using environment variables for LLM config (overrides config file): +//! LLM_API_KEY=sk-xxx LLM_MODEL=gpt-4o cargo run --example advanced //! +//! # Or with defaults (using config file): //! cargo run --example advanced //! ``` @@ -24,8 +24,21 @@ async fn main() -> vectorless::Result<()> { // Load all settings from the specified config file. // The config file must include api_key and model. - let client = EngineBuilder::new() - .with_config_path("./config.toml") + // If environment variables are set, they override the config file values. + let mut builder = EngineBuilder::new().with_config_path("./config.toml"); + + // Override config with env vars if present + if let Ok(api_key) = std::env::var("LLM_API_KEY") { + builder = builder.with_key(&api_key); + } + if let Ok(model) = std::env::var("LLM_MODEL") { + builder = builder.with_model(&model); + } + if let Ok(endpoint) = std::env::var("LLM_ENDPOINT") { + builder = builder.with_endpoint(&endpoint); + } + + let client = builder .build() .await .map_err(|e: vectorless::BuildError| vectorless::Error::Config(e.to_string()))?; @@ -56,4 +69,4 @@ async fn main() -> vectorless::Result<()> { println!("\n=== Done ==="); Ok(()) -} +} \ No newline at end of file diff --git a/rust/examples/events.rs b/rust/examples/events.rs index 65176751..59d8e3d2 100644 --- a/rust/examples/events.rs +++ b/rust/examples/events.rs @@ -11,6 +11,11 @@ //! # Usage //! //! ```bash +//! # Using environment variables for LLM config: +//! LLM_API_KEY=sk-xxx LLM_MODEL=gpt-4o \ +//! LLM_ENDPOINT=https://api.openai.com/v1 cargo run --example events +//! +//! # Or with defaults (edit the code to set your key/endpoint): //! cargo run --example events //! ``` @@ -90,12 +95,22 @@ async fn main() -> Result<(), Box> { println!(" ✓ Event handlers configured\n"); + // Build engine with LLM configuration from environment or defaults. + // Adjust the defaults below to match your setup. + let api_key = std::env::var("LLM_API_KEY") + .unwrap_or_else(|_| "sk-...".to_string()); + let model = std::env::var("LLM_MODEL") + .unwrap_or_else(|_| "gpt-4o".to_string()); + let endpoint = std::env::var("LLM_ENDPOINT") + .unwrap_or_else(|_| "https://api.openai.com/v1".to_string()); + // 2. Create engine with events println!("Step 2: Creating engine with event emitter..."); let engine = EngineBuilder::new() .with_workspace("./workspace_events_example") - .with_key("sk-...") - .with_model("gpt-4o") + .with_key(&api_key) + .with_model(&model) + .with_endpoint(&endpoint) .with_events(events) .build() .await diff --git a/rust/examples/flow.rs b/rust/examples/flow.rs index ff1b6ca7..4778bd44 100644 --- a/rust/examples/flow.rs +++ b/rust/examples/flow.rs @@ -12,6 +12,11 @@ //! # Usage //! //! ```bash +//! 
# Using environment variables for LLM config: +//! LLM_API_KEY=sk-xxx LLM_MODEL=gpt-4o \ +//! LLM_ENDPOINT=https://api.openai.com/v1 cargo run --example flow +//! +//! # Or with defaults (edit the code to set your key/endpoint): //! cargo run --example flow //! ``` @@ -54,14 +59,23 @@ async fn main() -> vectorless::Result<()> { println!("=== Vectorless Flow Example ===\n"); + // Build engine with LLM configuration from environment or defaults. + // Adjust the defaults below to match your setup. + let api_key = std::env::var("LLM_API_KEY") + .unwrap_or_else(|_| "sk-...".to_string()); + let model = std::env::var("LLM_MODEL") + .unwrap_or_else(|_| "gpt-4o".to_string()); + let endpoint = std::env::var("LLM_ENDPOINT") + .unwrap_or_else(|_| "https://api".to_string()); + // Step 1: Create a Vectorless client println!("Step 1: Creating Vectorless client..."); let engine = EngineBuilder::new() .with_workspace("./workspace_flow_example") - .with_key("sk...") - .with_model("gpt-4o") - .with_endpoint("https://api") + .with_key(&api_key) + .with_model(&model) + .with_endpoint(&endpoint) .build() .await .map_err(|e| vectorless::Error::Config(e.to_string()))?; @@ -130,12 +144,10 @@ async fn main() -> vectorless::Result<()> { println!(); } - // Step 5: Cleanup - println!("Step 5: Cleanup..."); - - // engine.remove(&doc_id).await?; - // println!(" - Document removed") + // Cleanup + for doc in engine.list().await? { + engine.remove(&doc.id).await?; + } - println!("\n=== Example Complete ==="); Ok(()) } diff --git a/rust/examples/graph.rs index cdefb451..61033da1 100644 --- a/rust/examples/graph.rs +++ b/rust/examples/graph.rs @@ -10,6 +10,11 @@ //! # Usage //! //! ```bash +//! # Using environment variables for LLM config: +//! LLM_API_KEY=sk-xxx LLM_MODEL=gpt-4o \ +//! cargo run --example graph +//! +//! # Or with defaults (edit the code to set your key/endpoint): //! cargo run --example graph //! ``` @@ -19,11 +24,18 @@ use vectorless::{EngineBuilder, IndexContext}; async fn main() -> vectorless::Result<()> { println!("=== Document Graph Example ===\n"); + // Build engine with LLM configuration from environment or defaults. + // Adjust the defaults below to match your setup. + let api_key = std::env::var("LLM_API_KEY") + .unwrap_or_else(|_| "sk-...".to_string()); + let model = std::env::var("LLM_MODEL") + .unwrap_or_else(|_| "gpt-4o".to_string()); + // 1. Create engine let engine = EngineBuilder::new() .with_workspace("./workspace_graph_example") - .with_key("sk-...") - .with_model("gpt-4o") + .with_key(&api_key) + .with_model(&model) .build() .await .map_err(|e: vectorless::BuildError| vectorless::Error::Config(e.to_string()))?; @@ -91,4 +103,4 @@ async fn main() -> vectorless::Result<()> { println!("\n=== Done ==="); Ok(()) -} +} \ No newline at end of file diff --git a/rust/examples/index_incremental.rs index 6b710a93..078a5ed0 100644 --- a/rust/examples/index_incremental.rs +++ b/rust/examples/index_incremental.rs @@ -4,6 +4,11 @@ //! Incremental indexing example — re-index with change detection. //! //! ```bash +//! # Using environment variables for LLM config: +//! LLM_API_KEY=sk-xxx LLM_MODEL=google/gemini-3-flash-preview \ +//! LLM_ENDPOINT=http://localhost:4000/api/v1 cargo run --example index_incremental +//! +//! # Or with defaults (edit the code to set your key/endpoint): //! cargo run --example index_incremental //!
``` @@ -11,11 +16,20 @@ use vectorless::{DocumentFormat, EngineBuilder, IndexContext, IndexMode}; #[tokio::main] async fn main() -> vectorless::Result<()> { + // Build engine with LLM configuration from environment or defaults. + // Adjust the defaults below to match your setup. + let api_key = std::env::var("LLM_API_KEY") + .unwrap_or_else(|_| "sk-or-v1-...".to_string()); + let model = std::env::var("LLM_MODEL") + .unwrap_or_else(|_| "google/gemini-3-flash-preview".to_string()); + let endpoint = std::env::var("LLM_ENDPOINT") + .unwrap_or_else(|_| "http://localhost:4000/api/v1".to_string()); + let engine = EngineBuilder::new() .with_workspace("./workspace_incremental_example") - .with_key("sk-or-v1-...") - .with_model("google/gemini-3-flash-preview") - .with_endpoint("http://localhost:4000/api/v1") + .with_key(&api_key) + .with_model(&model) + .with_endpoint(&endpoint) .build() .await .map_err(|e| vectorless::Error::Config(e.to_string()))?; @@ -93,4 +107,4 @@ Deletes a user by their unique identifier. } Ok(()) -} +} \ No newline at end of file diff --git a/rust/examples/index_single.rs b/rust/examples/index_single.rs index 3a5632f0..4fe0a522 100644 --- a/rust/examples/index_single.rs +++ b/rust/examples/index_single.rs @@ -4,6 +4,11 @@ //! Single document indexing example — index one document from content. //! //! ```bash +//! # Using environment variables for LLM config: +//! LLM_API_KEY=sk-xxx LLM_MODEL=google/gemini-3-flash-preview \ +//! LLM_ENDPOINT=http://localhost:4000/api/v1 cargo run --example index_single +//! +//! # Or with defaults (edit the code to set your key/endpoint): //! cargo run --example index_single //! ``` @@ -11,11 +16,20 @@ use vectorless::{DocumentFormat, EngineBuilder, IndexContext}; #[tokio::main] async fn main() -> vectorless::Result<()> { + // Build engine with LLM configuration from environment or defaults. + // Adjust the defaults below to match your setup. 
+ let api_key = std::env::var("LLM_API_KEY") + .unwrap_or_else(|_| "sk-or-v1-...".to_string()); + let model = std::env::var("LLM_MODEL") + .unwrap_or_else(|_| "google/gemini-3-flash-preview".to_string()); + let endpoint = std::env::var("LLM_ENDPOINT") + .unwrap_or_else(|_| "http://localhost:4000/api/v1".to_string()); + let engine = EngineBuilder::new() .with_workspace("./workspace_single_example") - .with_key("sk-or-v1-...") - .with_model("google/gemini-3-flash-preview") - .with_endpoint("http://localhost:4000/api/v1") + .with_key(&api_key) + .with_model(&model) + .with_endpoint(&endpoint) .build() .await .map_err(|e| vectorless::Error::Config(e.to_string()))?; @@ -69,21 +83,10 @@ Monitoring is implemented using a Prometheus and Grafana stack, with custom metr println!("name: {}", item.name); println!("format: {:?}", item.format); - if let Some(metrics) = &item.metrics { - println!(" metrics:"); - println!(" total time: {}ms", metrics.total_time_ms()); - println!(" parse: {}ms", metrics.parse_time_ms); - println!(" build: {}ms", metrics.build_time_ms); - println!(" enhance: {}ms", metrics.enhance_time_ms); - println!(" enrich: {}ms", metrics.enrich_time_ms); - println!(" optimize: {}ms", metrics.optimize_time_ms); - println!(" reasoning: {}ms", metrics.reasoning_index_time_ms); - println!(" nodes: {}", metrics.nodes_processed); - println!(" summaries: {}", metrics.summaries_generated); - println!(" llm calls: {}", metrics.llm_calls); - println!(" tokens: {}", metrics.total_tokens_generated); - println!(" topics: {}", metrics.topics_indexed); - println!(" keywords: {}", metrics.keywords_indexed); + if let Some(ref metrics) = item.metrics { + println!("time: {}ms", metrics.total_time_ms()); + println!("nodes: {}", metrics.nodes_processed); + println!("tokens: {}", metrics.total_tokens_generated); } } @@ -93,4 +96,4 @@ Monitoring is implemented using a Prometheus and Grafana stack, with custom metr } Ok(()) -} +} \ No newline at end of file diff --git a/rust/examples/indexing.rs b/rust/examples/indexing.rs index 53d8fe92..ecc0eb83 100644 --- a/rust/examples/indexing.rs +++ b/rust/examples/indexing.rs @@ -4,6 +4,11 @@ //! Batch indexing example — index multiple documents at once. //! //! ```bash +//! # Using environment variables for LLM config: +//! LLM_API_KEY=sk-xxx LLM_MODEL=google/gemini-3-flash-preview \ +//! LLM_ENDPOINT=http://localhost:4000/api/v1 cargo run --example indexing +//! +//! # Or with defaults (edit the code to set your key/endpoint): //! cargo run --example indexing //! ``` @@ -11,11 +16,20 @@ use vectorless::{EngineBuilder, IndexContext}; #[tokio::main] async fn main() -> vectorless::Result<()> { + // Build engine with LLM configuration from environment or defaults. + // Adjust the defaults below to match your setup. 
+ let api_key = std::env::var("LLM_API_KEY") + .unwrap_or_else(|_| "sk-or-v1-...".to_string()); + let model = std::env::var("LLM_MODEL") + .unwrap_or_else(|_| "google/gemini-3-flash-preview".to_string()); + let endpoint = std::env::var("LLM_ENDPOINT") + .unwrap_or_else(|_| "http://localhost:4000/api/v1".to_string()); + let engine = EngineBuilder::new() .with_workspace("./workspace_batch_example") - .with_key("sk-or-v1-...") - .with_model("google/gemini-3-flash-preview") - .with_endpoint("http://localhost:4000/api/v1") + .with_key(&api_key) + .with_model(&model) + .with_endpoint(&endpoint) .build() .await .map_err(|e| vectorless::Error::Config(e.to_string()))?; @@ -43,4 +57,4 @@ async fn main() -> vectorless::Result<()> { } Ok(()) -} +} \ No newline at end of file From 524c3423d8aa0beea8e4ef4ab94f62fae044bb10 Mon Sep 17 00:00:00 2001 From: zTgx <747674262@qq.com> Date: Mon, 13 Apr 2026 15:56:20 +0800 Subject: [PATCH 5/5] feat: add tracing initialization to examples and enhance PDF parsing - Add tracing_subscriber::fmt::init() to all examples for debug output - Modify parse functions to accept optional LLM client for enhanced PDF processing - Update PDF parser to use external LLM client for TOC extraction and structure analysis - Add with_llm_client constructors to TOC processing components - Improve error handling in event example by removing redundant error mapping - Update examples to use cleaner output formatting and better documentation --- rust/examples/advanced.rs | 3 + rust/examples/events.rs | 93 ++++++------------- rust/examples/graph.rs | 3 + rust/examples/index_incremental.rs | 3 + rust/examples/index_pdf.rs | 28 ++++-- rust/examples/index_single.rs | 3 + rust/examples/indexing.rs | 26 +++--- rust/src/index/parse/mod.rs | 29 +++++- rust/src/index/parse/pdf/parser.rs | 39 ++++++-- rust/src/index/parse/toc/assigner.rs | 8 ++ rust/src/index/parse/toc/detector.rs | 14 +++ rust/src/index/parse/toc/parser.rs | 8 ++ rust/src/index/parse/toc/processor.rs | 41 +++++++- rust/src/index/parse/toc/repairer.rs | 8 ++ .../index/parse/toc/structure_extractor.rs | 5 + rust/src/index/parse/toc/verifier.rs | 8 ++ rust/src/index/pipeline/executor.rs | 3 +- rust/src/index/stages/enhance.rs | 6 ++ rust/src/index/stages/parse.rs | 24 ++++- 19 files changed, 247 insertions(+), 105 deletions(-) diff --git a/rust/examples/advanced.rs index 602fa435..a75608d1 100644 --- a/rust/examples/advanced.rs +++ b/rust/examples/advanced.rs @@ -20,6 +20,9 @@ use vectorless::{EngineBuilder, IndexContext, QueryContext}; #[tokio::main] async fn main() -> vectorless::Result<()> { + // Initialize tracing for debug output (set RUST_LOG=debug to see more) + tracing_subscriber::fmt::init(); + println!("=== Vectorless Advanced Example (Config File) ===\n"); // Load all settings from the specified config file. diff --git a/rust/examples/events.rs index 59d8e3d2..b0433dc7 100644 --- a/rust/examples/events.rs +++ b/rust/examples/events.rs @@ -27,6 +27,9 @@ use vectorless::events::{EventEmitter, IndexEvent, QueryEvent}; #[tokio::main] async fn main() -> Result<(), Box<dyn std::error::Error>> { + // Initialize tracing for debug output (set RUST_LOG=debug to see more) + tracing_subscriber::fmt::init(); + println!("=== Event Callbacks Example ===\n"); // 1.
Create event emitter with handlers @@ -113,79 +116,43 @@ async fn main() -> Result<(), Box> { .with_endpoint(&endpoint) .with_events(events) .build() - .await - .map_err(|e: vectorless::BuildError| vectorless::Error::Config(e.to_string()))?; + .await?; println!(" ✓ Engine created\n"); - // 3. Index a document (events will fire) - println!("Step 3: Indexing document (watch events)...\n"); - - let temp_dir = tempfile::tempdir()?; - let doc_content = r#"# Example Document - -## Introduction - -This is an example document for demonstrating event callbacks. - -## Features - -- Event monitoring for indexing -- Event monitoring for queries -- Progress tracking - -## Architecture - -The event system uses handlers that can be attached to the engine builder. -"#; - - let doc_path = temp_dir.path().join("example.md"); - tokio::fs::write(&doc_path, doc_content).await?; - - let index_result = engine.index(IndexContext::from_path(&doc_path)).await?; - let doc_id = index_result.doc_id().unwrap().to_string(); - println!(); - - // 4. Query the document (events will fire) - println!("Step 4: Querying document (watch events)...\n"); - + // 3. Index a document with events + println!("Step 3: Indexing document (with events)..."); let result = engine - .query(QueryContext::new("What features are available?").with_doc_id(&doc_id)) + .index(IndexContext::from_path("../README.md")) .await?; - println!(); + let doc_id = result.doc_id().unwrap().to_string(); + println!(" ✓ Indexed: {doc_id}\n"); - // 5. Show results - println!("Step 5: Query result:"); + // 4. Query with events + println!("Step 4: Querying (with events)..."); + let result = engine + .query( + QueryContext::new("What is vectorless?") + .with_doc_id(&doc_id) + ) + .await?; if let Some(item) = result.single() { - println!(" - Score: {:.2}", item.score); - println!(" - Nodes: {}", item.node_ids.len()); + println!(" ✓ Found result ({} chars)", item.content.len()); if !item.content.is_empty() { - let preview: String = item.content.chars().take(100).collect(); - println!(" - Content: {}...", preview); + let preview: String = item.content.chars().take(200).collect(); + println!(" Preview: {}...", preview); } } - println!(); - - // 6. Show statistics - println!("Step 6: Event statistics:"); - println!( - " - Index events fired: {}", - index_count.load(Ordering::SeqCst) - ); - println!( - " - Query events fired: {}", - query_count.load(Ordering::SeqCst) - ); - println!( - " - Nodes visited: {}", - nodes_visited.load(Ordering::SeqCst) - ); - println!(); - - // 7. Cleanup - println!("Step 7: Cleanup..."); + + // 5. Stats + println!("\n--- Stats ---"); + println!(" Documents indexed: {}", index_count.load(Ordering::SeqCst)); + println!(" Queries executed: {}", query_count.load(Ordering::SeqCst)); + println!(" Nodes visited: {}", nodes_visited.load(Ordering::SeqCst)); + + // Cleanup engine.remove(&doc_id).await?; - println!(" ✓ Document removed\n"); + println!("\n Cleaned up"); - println!("=== Example Complete ==="); + println!("\n=== Done ==="); Ok(()) } diff --git a/rust/examples/graph.rs b/rust/examples/graph.rs index 61033da1..ac87a673 100644 --- a/rust/examples/graph.rs +++ b/rust/examples/graph.rs @@ -22,6 +22,9 @@ use vectorless::{EngineBuilder, IndexContext}; #[tokio::main] async fn main() -> vectorless::Result<()> { + // Initialize tracing for debug output (set RUST_LOG=debug to see more) + tracing_subscriber::fmt::init(); + println!("=== Document Graph Example ===\n"); // Build engine with LLM configuration from environment or defaults. 
diff --git a/rust/examples/index_incremental.rs index 078a5ed0..32254d7d 100644 --- a/rust/examples/index_incremental.rs +++ b/rust/examples/index_incremental.rs @@ -16,6 +16,9 @@ use vectorless::{DocumentFormat, EngineBuilder, IndexContext, IndexMode}; #[tokio::main] async fn main() -> vectorless::Result<()> { + // Initialize tracing for debug output (set RUST_LOG=debug to see more) + tracing_subscriber::fmt::init(); + // Build engine with LLM configuration from environment or defaults. // Adjust the defaults below to match your setup. let api_key = std::env::var("LLM_API_KEY") diff --git a/rust/examples/index_pdf.rs index 244ca6a2..c7840e14 100644 --- a/rust/examples/index_pdf.rs +++ b/rust/examples/index_pdf.rs @@ -18,6 +18,10 @@ use vectorless::{EngineBuilder, IndexContext}; #[tokio::main] async fn main() -> vectorless::Result<()> { + // Initialize tracing so we can see pipeline logs. + // Set RUST_LOG=info or RUST_LOG=debug for more detail. + tracing_subscriber::fmt::init(); + let args: Vec<String> = std::env::args().collect(); let pdf_path = args.get(1).map(|s| s.as_str()).unwrap_or_else(|| { @@ -33,7 +37,6 @@ async fn main() -> vectorless::Result<()> { println!("=== Indexing PDF: {} ===\n", pdf_path); // Build engine with LLM configuration from environment or defaults. - // Adjust the defaults below to match your setup. let api_key = std::env::var("LLM_API_KEY") .unwrap_or_else(|_| "sk-or-v1-...".to_string()); let model = std::env::var("LLM_MODEL") @@ -41,6 +44,13 @@ async fn main() -> vectorless::Result<()> { let endpoint = std::env::var("LLM_ENDPOINT") .unwrap_or_else(|_| "http://localhost:4000/api/v1".to_string()); + tracing::info!( + "LLM config — key: {}..., model: {}, endpoint: {}", + &api_key[..api_key.len().min(8)], + model, + endpoint + ); + let engine = EngineBuilder::new() .with_workspace("./workspace_pdf_example") .with_key(&api_key) @@ -50,13 +60,6 @@ async fn main() -> vectorless::Result<()> { .await .map_err(|e| vectorless::Error::Config(e.to_string()))?; - // Index the PDF — format is auto-detected from the .pdf extension. - // The engine will: - // 1. Extract text from every page - // 2. Detect and parse the Table of Contents - // 3. Build a hierarchical document tree - // 4. Generate summaries for each node (LLM) - // 5. Build a reasoning index for retrieval let result = engine .index(IndexContext::from_path(pdf_path)) .await?; @@ -84,6 +87,13 @@ async fn main() -> vectorless::Result<()> { println!(" tokens: {}", metrics.total_tokens_generated); println!(" topics: {}", metrics.topics_indexed); println!(" keywords: {}", metrics.keywords_indexed); + + if metrics.llm_calls == 0 { + println!("\n *** WARNING: No LLM calls were made. ***"); + println!(" Set RUST_LOG=info to see pipeline logs:"); + println!(" RUST_LOG=info cargo run --example index_pdf -- <pdf-path>"); + println!(" Check LLM_API_KEY, LLM_MODEL, and LLM_ENDPOINT are valid."); + } } } @@ -91,7 +101,7 @@ async fn main() -> vectorless::Result<()> { eprintln!("FAILED: {} — {}", fail.source, fail.error); } - // Cleanup workspace + // Cleanup workspace after the run (remove this loop to keep the index) for doc in engine.list().await?
{ engine.remove(&doc.id).await?; } diff --git a/rust/examples/index_single.rs b/rust/examples/index_single.rs index 4fe0a522..55ec52d5 100644 --- a/rust/examples/index_single.rs +++ b/rust/examples/index_single.rs @@ -16,6 +16,9 @@ use vectorless::{DocumentFormat, EngineBuilder, IndexContext}; #[tokio::main] async fn main() -> vectorless::Result<()> { + // Initialize tracing for debug output (set RUST_LOG=debug to see more) + tracing_subscriber::fmt::init(); + // Build engine with LLM configuration from environment or defaults. // Adjust the defaults below to match your setup. let api_key = std::env::var("LLM_API_KEY") diff --git a/rust/examples/indexing.rs b/rust/examples/indexing.rs index ecc0eb83..e4489d29 100644 --- a/rust/examples/indexing.rs +++ b/rust/examples/indexing.rs @@ -1,7 +1,7 @@ // Copyright (c) 2026 vectorless developers // SPDX-License-Identifier: Apache-2.0 -//! Batch indexing example — index multiple documents at once. +//! Batch indexing example — index multiple documents via the vectorless engine. //! //! ```bash //! # Using environment variables for LLM config: @@ -16,6 +16,9 @@ use vectorless::{EngineBuilder, IndexContext}; #[tokio::main] async fn main() -> vectorless::Result<()> { + // Initialize tracing for debug output (set RUST_LOG=debug to see more) + tracing_subscriber::fmt::init(); + // Build engine with LLM configuration from environment or defaults. // Adjust the defaults below to match your setup. let api_key = std::env::var("LLM_API_KEY") @@ -34,21 +37,20 @@ async fn main() -> vectorless::Result<()> { .await .map_err(|e| vectorless::Error::Config(e.to_string()))?; - // Index multiple files from different paths + // Index multiple documents in a single call. + // Paths are resolved relative to the workspace directory. let result = engine - .index(IndexContext::from_paths(&[ - "../README.md", - "../CLAUDE.md", - "../LICENSE", - ])) + .index( + IndexContext::from_paths(&["../README.md", "../CLAUDE.md"])) .await?; - println!("indexed: {}, failed: {}", result.items.len(), result.failed.len()); + println!("Indexed {} document(s)", result.items.len()); for item in &result.items { - println!(" {} — doc_id: {}", item.name, item.doc_id); - } - for fail in &result.failed { - println!(" FAILED: {} — {}", fail.source, fail.error); + println!(" - {} ({})", item.name, item.doc_id); + if let Some(metrics) = &item.metrics { + println!(" Time: {}ms", metrics.total_time_ms()); + println!(" Nodes: {}", metrics.nodes_processed); + } } // Cleanup diff --git a/rust/src/index/parse/mod.rs b/rust/src/index/parse/mod.rs index 9fd5a042..0bcba9f4 100644 --- a/rust/src/index/parse/mod.rs +++ b/rust/src/index/parse/mod.rs @@ -27,9 +27,14 @@ use std::path::Path; use crate::error::Result; use crate::index::parse::markdown::MarkdownParser; +use crate::llm::LlmClient; /// Parse a string content document. 
-pub async fn parse_content(content: &str, format: DocumentFormat) -> Result<ParseResult> { +pub async fn parse_content( + content: &str, + format: DocumentFormat, + _llm_client: Option<LlmClient>, +) -> Result<ParseResult> { match format { DocumentFormat::Markdown => { let parser = MarkdownParser::new(); @@ -42,21 +47,32 @@ pub async fn parse_content(content: &str, format: DocumentFormat) -> Result<ParseResult> { /// Parse a file. -pub async fn parse_file(path: &Path, format: DocumentFormat) -> Result<ParseResult> { +pub async fn parse_file( + path: &Path, + format: DocumentFormat, + llm_client: Option<LlmClient>, +) -> Result<ParseResult> { match format { DocumentFormat::Markdown => { let parser = MarkdownParser::new(); parser.parse_file(path).await } DocumentFormat::Pdf => { - let parser = pdf::PdfParser::new(); + let parser = match llm_client { + Some(client) => pdf::PdfParser::with_llm_client(client), + None => pdf::PdfParser::new(), + }; parser.parse_file(path).await } } } /// Parse binary data. -pub async fn parse_bytes(bytes: &[u8], format: DocumentFormat) -> Result<ParseResult> { +pub async fn parse_bytes( + bytes: &[u8], + format: DocumentFormat, + llm_client: Option<LlmClient>, +) -> Result<ParseResult> { match format { DocumentFormat::Markdown => { let content = std::str::from_utf8(bytes) @@ -65,7 +81,10 @@ pub async fn parse_bytes(bytes: &[u8], format: DocumentFormat) -> Result<ParseResult> DocumentFormat::Pdf => { - let parser = pdf::PdfParser::new(); + let parser = match llm_client { + Some(client) => pdf::PdfParser::with_llm_client(client), + None => pdf::PdfParser::new(), + }; parser.parse_bytes_async(bytes, None).await } } diff --git a/rust/src/index/parse/pdf/parser.rs index b2ae6b5d..7702872b 100644 --- a/rust/src/index/parse/pdf/parser.rs +++ b/rust/src/index/parse/pdf/parser.rs @@ -15,14 +15,16 @@ use tracing::{info, warn}; use crate::Error; use crate::error::Result; use crate::index::parse::toc::TocProcessor; +use crate::llm::LlmClient; use super::types::{PdfMetadata, PdfPage, PdfParseResult}; use crate::index::parse::{DocumentFormat, DocumentMeta, ParseResult, RawNode}; /// PDF document parser. -#[derive(Debug, Clone)] pub struct PdfParser { config: PdfParserConfig, + /// Optional LLM client for TOC extraction and structure analysis. + llm_client: Option<LlmClient>, } /// PDF parser configuration. @@ -50,17 +52,31 @@ impl PdfParser { Self::default() } + /// Create a PDF parser with an externally provided LLM client. + pub fn with_llm_client(client: LlmClient) -> Self { + Self { + config: PdfParserConfig::default(), + llm_client: Some(client), + } + } + /// Create a parser with custom configuration. pub fn with_config(config: PdfParserConfig) -> Self { - Self { config } + Self { + config, + llm_client: None, + } } /// Create a parser without TOC extraction. pub fn without_toc() -> Self { - Self::with_config(PdfParserConfig { - extract_toc: false, - ..Default::default() - }) + Self { + config: PdfParserConfig { + extract_toc: false, + ..Default::default() + }, + llm_client: None, + } } /// Parse PDF from bytes and return raw pages.
@@ -274,7 +290,16 @@ impl PdfParser { let nodes = if self.config.extract_toc { info!("Extracting TOC from PDF with {} pages", page_count); - let processor = TocProcessor::new(); + let processor = match &self.llm_client { + Some(client) => { + info!("PdfParser: creating TocProcessor with LLM client"); + TocProcessor::with_llm_client(client.clone()) + } + None => { + info!("PdfParser: creating TocProcessor without LLM client (no key configured)"); + TocProcessor::new() + } + }; match processor.process(&result.pages).await { Ok(entries) if !entries.is_empty() => { info!("Extracted {} TOC entries", entries.len()); diff --git a/rust/src/index/parse/toc/assigner.rs index eefa3769..beff3021 100644 --- a/rust/src/index/parse/toc/assigner.rs +++ b/rust/src/index/parse/toc/assigner.rs @@ -50,6 +50,14 @@ impl PageAssigner { Self { config, client } } + /// Create an assigner with an externally provided LLM client. + pub fn with_client(client: LlmClient) -> Self { + Self { + config: PageAssignerConfig::default(), + client, + } + } + /// Create an assigner with default configuration. pub fn with_defaults() -> Self { Self::new(PageAssignerConfig::default()) } diff --git a/rust/src/index/parse/toc/detector.rs index f179c507..032a18af 100644 --- a/rust/src/index/parse/toc/detector.rs +++ b/rust/src/index/parse/toc/detector.rs @@ -74,6 +74,20 @@ impl TocDetector { } } + /// Create a detector with an externally provided LLM client. + pub fn with_client(config: TocDetectorConfig, client: LlmClient) -> Self { + let use_llm = config.use_llm_fallback; + Self { + config, + llm_client: if use_llm { + Some(client) + } else { + None + }, + patterns: Self::build_patterns(), + } + } + /// Create a detector with default configuration. pub fn with_defaults() -> Self { Self::new(TocDetectorConfig::default()) } diff --git a/rust/src/index/parse/toc/parser.rs index 20b61af2..06aaade3 100644 --- a/rust/src/index/parse/toc/parser.rs +++ b/rust/src/index/parse/toc/parser.rs @@ -47,6 +47,14 @@ impl TocParser { Self { config, client } } + /// Create a parser with an externally provided LLM client. + pub fn with_client(client: LlmClient) -> Self { + Self { + config: TocParserConfig::default(), + client, + } + } + /// Create a parser with default configuration. pub fn with_defaults() -> Self { Self::new(TocParserConfig::default()) } diff --git a/rust/src/index/parse/toc/processor.rs index b2dbc1cd..9ed2c95b 100644 --- a/rust/src/index/parse/toc/processor.rs +++ b/rust/src/index/parse/toc/processor.rs @@ -12,6 +12,7 @@ use tracing::{debug, info, warn}; use crate::error::Result; use crate::index::parse::pdf::PdfPage; +use crate::llm::LlmClient; use super::assigner::{PageAssigner, PageAssignerConfig}; use super::detector::{TocDetector, TocDetectorConfig}; @@ -118,6 +119,8 @@ pub struct TocProcessor { assigner: PageAssigner, verifier: IndexVerifier, repairer: IndexRepairer, + /// Optional LLM client for StructureExtractor (no-TOC mode and refinement). + llm_client: Option<LlmClient>, } impl TocProcessor { @@ -126,14 +129,34 @@ Self::with_config(TocProcessorConfig::default()) } + /// Create a TOC processor with an externally provided LLM client. + /// + /// All sub-components (detector, parser, assigner, verifier, repairer) + /// will use this client instead of creating their own from default config.
+ pub fn with_llm_client(client: LlmClient) -> Self { + info!("TocProcessor: created with external LLM client"); + let config = TocProcessorConfig::default(); + Self { + detector: TocDetector::with_client(config.detector.clone(), client.clone()), + parser: TocParser::with_client(client.clone()), + assigner: PageAssigner::with_client(client.clone()), + verifier: IndexVerifier::with_client(client.clone()), + repairer: IndexRepairer::with_client(client.clone()), + llm_client: Some(client), + config, + } + } + /// Create a TOC processor with custom configuration. pub fn with_config(config: TocProcessorConfig) -> Self { + info!("TocProcessor: created with config (no external LLM client)"); Self { detector: TocDetector::new(config.detector.clone()), parser: TocParser::new(config.parser.clone()), assigner: PageAssigner::new(config.assigner.clone()), verifier: IndexVerifier::new(config.verifier.clone()), repairer: IndexRepairer::new(config.repairer.clone()), + llm_client: None, config, } } @@ -328,7 +351,12 @@ impl TocProcessor { async fn process_without_toc(&self, pages: &[PdfPage]) -> Result> { info!("Extracting structure from page content (no TOC available)"); - let extractor = StructureExtractor::new(StructureExtractorConfig::default()); + let extractor = match &self.llm_client { + Some(client) => { + StructureExtractor::with_client(StructureExtractorConfig::default(), client.clone()) + } + None => StructureExtractor::new(StructureExtractorConfig::default()), + }; extractor.extract(pages).await } @@ -402,6 +430,7 @@ impl TocProcessor { .collect(); // Identify oversized entries and launch extractions concurrently + let llm_client = self.llm_client.clone(); let oversized_futures: Vec<_> = entries .iter() .enumerate() @@ -422,6 +451,7 @@ impl TocProcessor { let entry_title = entry.title.clone(); let entry_level = entry.level; + let llm_client = llm_client.clone(); async move { if sub_pages.is_empty() { @@ -431,8 +461,13 @@ impl TocProcessor { "Refining oversized entry '{}' (pages {}-{})", entry_title, start, end ); - let extractor = - StructureExtractor::new(StructureExtractorConfig::default()); + let extractor = match &llm_client { + Some(client) => StructureExtractor::with_client( + StructureExtractorConfig::default(), + client.clone(), + ), + None => StructureExtractor::new(StructureExtractorConfig::default()), + }; match extractor.extract(&sub_pages).await { Ok(sub_entries) => { let skip = if sub_entries diff --git a/rust/src/index/parse/toc/repairer.rs b/rust/src/index/parse/toc/repairer.rs index 70498782..51931674 100644 --- a/rust/src/index/parse/toc/repairer.rs +++ b/rust/src/index/parse/toc/repairer.rs @@ -50,6 +50,14 @@ impl IndexRepairer { Self { config, client } } + /// Create a repairer with an externally provided LLM client. + pub fn with_client(client: LlmClient) -> Self { + Self { + config: RepairerConfig::default(), + client, + } + } + /// Create a repairer with default configuration. pub fn with_defaults() -> Self { Self::new(RepairerConfig::default()) diff --git a/rust/src/index/parse/toc/structure_extractor.rs b/rust/src/index/parse/toc/structure_extractor.rs index a6dd807d..17511b36 100644 --- a/rust/src/index/parse/toc/structure_extractor.rs +++ b/rust/src/index/parse/toc/structure_extractor.rs @@ -66,6 +66,11 @@ impl StructureExtractor { Self { config, client } } + /// Create a structure extractor with an externally provided LLM client. 
+ pub fn with_client(config: StructureExtractorConfig, client: LlmClient) -> Self { + Self { config, client } + } + /// Create an extractor with default configuration. pub fn with_defaults() -> Self { Self::new(StructureExtractorConfig::default()) } diff --git a/rust/src/index/parse/toc/verifier.rs index 42186a09..09b28059 100644 --- a/rust/src/index/parse/toc/verifier.rs +++ b/rust/src/index/parse/toc/verifier.rs @@ -50,6 +50,14 @@ impl IndexVerifier { Self { config, client } } + /// Create a verifier with an externally provided LLM client. + pub fn with_client(client: LlmClient) -> Self { + Self { + config: VerifierConfig::default(), + client, + } + } + /// Create a verifier with default configuration. pub fn with_defaults() -> Self { Self::new(VerifierConfig::default()) } diff --git a/rust/src/index/pipeline/executor.rs index a80cf176..1538c7b3 100644 --- a/rust/src/index/pipeline/executor.rs +++ b/rust/src/index/pipeline/executor.rs @@ -81,8 +81,9 @@ impl PipelineExecutor { /// 7. `reasoning_index` - Build pre-computed reasoning index /// 8. `optimize` - Optimize tree pub fn with_llm(client: LlmClient) -> Self { + tracing::info!("PipelineExecutor::with_llm — cloning client to ParseStage + EnhanceStage"); let orchestrator = PipelineOrchestrator::new() - .stage_with_priority(ParseStage::new(), 10) + .stage_with_priority(ParseStage::with_llm_client(client.clone()), 10) .stage_with_priority(BuildStage::new(), 20) .stage_with_priority(ValidateStage::new(), 22) .stage_with_priority(SplitStage::new(), 25) diff --git a/rust/src/index/stages/enhance.rs index 452089c0..5550de45 100644 --- a/rust/src/index/stages/enhance.rs +++ b/rust/src/index/stages/enhance.rs @@ -109,6 +109,12 @@ impl IndexStage for EnhanceStage { async fn execute(&mut self, ctx: &mut IndexContext) -> Result { let start = Instant::now(); + info!( + "EnhanceStage: llm_client={}, strategy={:?}", + self.llm_client.is_some(), + ctx.options.summary_strategy + ); + // Check if we need summaries if !self.needs_summaries(ctx) { info!( diff --git a/rust/src/index/stages/parse.rs index 98ef911b..6c8166b6 100644 --- a/rust/src/index/stages/parse.rs +++ b/rust/src/index/stages/parse.rs @@ -15,12 +15,22 @@ use crate::index::IndexMode; use crate::index::pipeline::{IndexContext, IndexInput}; /// Parse stage - extracts raw nodes from documents. -pub struct ParseStage; +pub struct ParseStage { + /// Optional LLM client for PDF structure extraction. + llm_client: Option<crate::llm::LlmClient>, +} impl ParseStage { /// Create a new parse stage. pub fn new() -> Self { - Self + Self { llm_client: None } + } + + /// Create a parse stage with an LLM client. + pub fn with_llm_client(client: crate::llm::LlmClient) -> Self { + Self { + llm_client: Some(client), + } + } /// Detect document format from path and options. @@ -61,6 +71,10 @@ impl IndexStage for ParseStage { ctx.format = format; info!("Parsing document with format: {:?}", format); + info!( + "ParseStage llm_client present: {}", + self.llm_client.is_some() + ); // Parse based on input type let result = match &ctx.input { @@ -77,7 +91,7 @@ .to_string(); // Parse directly - crate::index::parse::parse_file(&path, format).await? + crate::index::parse::parse_file(&path, format, self.llm_client.clone()).await?
} IndexInput::Content { content, @@ -88,14 +102,14 @@ impl IndexStage for ParseStage { ctx.name = name.clone(); // Parse content directly - crate::index::parse::parse_content(content, *format).await? + crate::index::parse::parse_content(content, *format, self.llm_client.clone()).await? } IndexInput::Bytes { data, name, format } => { // Set name ctx.name = name.clone(); // Parse bytes - crate::index::parse::parse_bytes(data, *format).await? + crate::index::parse::parse_bytes(data, *format, self.llm_client.clone()).await? } };
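Taken together, the new `parse_*` entry points thread a single optional `LlmClient` from the pipeline down to the PDF parser. A minimal in-crate caller would look like the sketch below; it assumes only the signatures introduced in this patch, and that a client has already been built elsewhere (for example by `PipelineExecutor::with_llm`):

```rust
use std::path::Path;

use crate::error::Result;
use crate::index::parse::{parse_file, DocumentFormat, ParseResult};
use crate::llm::LlmClient;

// Sketch: opting a PDF into LLM-backed TOC/structure extraction.
// Passing `None` keeps the heuristic-only path, exactly as before this patch.
async fn parse_pdf(path: &Path, client: Option<LlmClient>) -> Result<ParseResult> {
    parse_file(path, DocumentFormat::Pdf, client).await
}
```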