diff --git a/Cargo.toml b/Cargo.toml
index 0357d8d4..09307720 100644
--- a/Cargo.toml
+++ b/Cargo.toml
@@ -77,19 +77,12 @@ libc = "0.2"
 pdf-extract = "0.10.0"
 lopdf = "0.34"
 
-# DOCX processing
-zip = "2.2"
-roxmltree = "0.20"
-
 # Random number generation
 rand = "0.8"
 
 # BM25 scoring
 bm25 = { version = "2.3.2", features = ["parallelism"] }
 
-# HTML parsing
-scraper = "0.22"
-
 # Python bindings
 pyo3 = { version = "0.22", features = ["extension-module"] }
 
diff --git a/examples/rust/graph.rs b/examples/rust/graph.rs
index d4e3e3a3..9159326a 100644
--- a/examples/rust/graph.rs
+++ b/examples/rust/graph.rs
@@ -53,7 +53,11 @@ async fn main() -> vectorless::Result<()> {
             "  Node: {} — {} keyword(s), top: {:?}",
             node.title,
             node.top_keywords.len(),
-            node.top_keywords.iter().take(3).map(|kw| &kw.keyword).collect::<Vec<_>>()
+            node.top_keywords
+                .iter()
+                .take(3)
+                .map(|kw| &kw.keyword)
+                .collect::<Vec<_>>()
         );
 
         // Show edges (connected documents)
diff --git a/examples/rust/indexing.rs b/examples/rust/indexing.rs
index d0a56595..c6e03abc 100644
--- a/examples/rust/indexing.rs
+++ b/examples/rust/indexing.rs
@@ -28,9 +28,7 @@ async fn main() -> vectorless::Result<()> {
 
     // 2. Index a single document with default options
     println!("--- Single document (default mode) ---");
-    let result = engine
-        .index(IndexContext::from_path("./README.md"))
-        .await?;
+    let result = engine.index(IndexContext::from_path("./README.md")).await?;
 
     for item in &result.items {
         println!("  doc_id: {}", item.doc_id);
@@ -60,10 +58,7 @@ async fn main() -> vectorless::Result<()> {
     // 3. Re-index with incremental mode — should detect no change
     println!("\n--- Re-index (incremental, unchanged) ---");
     let result2 = engine
-        .index(
-            IndexContext::from_path("./README.md")
-                .with_mode(IndexMode::Incremental),
-        )
+        .index(IndexContext::from_path("./README.md").with_mode(IndexMode::Incremental))
         .await?;
 
     for item in &result2.items {
@@ -86,8 +81,16 @@ async fn main() -> vectorless::Result<()> {
         batch.failed.len()
     );
     for item in &batch.items {
-        let time = item.metrics.as_ref().map(|m| m.total_time_ms()).unwrap_or(0);
-        let nodes = item.metrics.as_ref().map(|m| m.nodes_processed).unwrap_or(0);
+        let time = item
+            .metrics
+            .as_ref()
+            .map(|m| m.total_time_ms())
+            .unwrap_or(0);
+        let nodes = item
+            .metrics
+            .as_ref()
+            .map(|m| m.nodes_processed)
+            .unwrap_or(0);
         println!("  {} — {}ms, {} nodes", item.name, time, nodes);
     }
 
diff --git a/examples/rust/markdownflow.rs b/examples/rust/markdownflow.rs
index 4efbb1cc..28cb9bbe 100644
--- a/examples/rust/markdownflow.rs
+++ b/examples/rust/markdownflow.rs
@@ -86,7 +86,10 @@ async fn main() -> Result<(), Box<dyn std::error::Error>> {
     for query in queries {
         println!("  Query: \"{}\"", query);
 
-        match client.query(QueryContext::new(query).with_doc_id(&doc_id)).await {
+        match client
+            .query(QueryContext::new(query).with_doc_id(&doc_id))
+            .await
+        {
             Ok(result) => {
                 if let Some(item) = result.single() {
                     if item.content.is_empty() {
diff --git a/python/src/lib.rs b/python/src/lib.rs
index 70a43f85..f240c2c1 100644
--- a/python/src/lib.rs
+++ b/python/src/lib.rs
@@ -171,10 +171,8 @@ fn parse_format(format: &str) -> PyResult<DocumentFormat> {
     match format.to_lowercase().as_str() {
         "markdown" | "md" => Ok(DocumentFormat::Markdown),
         "pdf" => Ok(DocumentFormat::Pdf),
-        "docx" | "doc" => Ok(DocumentFormat::Docx),
-        "html" | "htm" => Ok(DocumentFormat::Html),
         _ => Err(PyErr::from(VectorlessError::new(
-            format!("Unknown format: {}. Supported: markdown, pdf, docx, html", format),
+            format!("Unknown format: {}. Supported: markdown, pdf", format),
             "config",
         ))),
     }
diff --git a/rust/Cargo.toml b/rust/Cargo.toml
index 40d3fc9c..3fac2aec 100644
--- a/rust/Cargo.toml
+++ b/rust/Cargo.toml
@@ -22,10 +22,6 @@ path = "../examples/rust/basic.rs"
 name = "advanced"
 path = "../examples/rust/advanced.rs"
 
-[[example]]
-name = "cli_tool"
-path = "../examples/rust/cli_tool.rs"
-
 [[example]]
 name = "custom_config"
 path = "../examples/rust/custom_config.rs"
@@ -113,19 +109,12 @@ libc = { workspace = true }
 pdf-extract = { workspace = true }
 lopdf = { workspace = true }
 
-# DOCX processing
-zip = { workspace = true }
-roxmltree = { workspace = true }
-
 # Random number generation
 rand = { workspace = true }
 
 # BM25 scoring
 bm25 = { workspace = true }
 
-# HTML parsing
-scraper = { workspace = true }
-
 [dev-dependencies]
 tempfile = { workspace = true }
 tokio-test = { workspace = true }
diff --git a/rust/src/client/engine.rs b/rust/src/client/engine.rs
index bcf551b2..079a186e 100644
--- a/rust/src/client/engine.rs
+++ b/rust/src/client/engine.rs
@@ -47,7 +47,6 @@ use crate::index::PipelineOptions;
 use crate::index::incremental::{self, IndexAction};
 use crate::retrieval::{PipelineRetriever, RetrieveEventReceiver};
 use crate::storage::{PersistedDocument, Workspace};
-use crate::utils::fingerprint::Fingerprint;
 use crate::{DocumentTree, Error};
 
 use super::events::EventEmitter;
@@ -55,7 +54,7 @@ use super::index_context::{IndexContext, IndexSource};
 use super::indexer::IndexerClient;
 use super::query_context::{QueryContext, QueryScope};
 use super::retriever::RetrieverClient;
-use super::types::{DocumentInfo, FailedItem, IndexItem, IndexMode, IndexResult, QueryResult, QueryResultItem};
+use super::types::{DocumentInfo, FailedItem, IndexItem, IndexMode, IndexResult, QueryResult};
 use super::workspace::WorkspaceClient;
 
 /// The main Engine client.
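+///
+/// A sketch of typical use, mirroring `examples/rust/indexing.rs`
+/// (engine construction elided):
+///
+/// ```rust,ignore
+/// let result = engine.index(IndexContext::from_path("./README.md")).await?;
+/// for item in &result.items {
+///     println!("indexed doc_id: {}", item.doc_id);
+/// }
+/// ```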
@@ -160,7 +159,9 @@ impl Engine { // Single source: no need for concurrency overhead if ctx.sources.len() == 1 { let source = &ctx.sources[0]; - let (items, failed) = self.process_source(source, &ctx.options, ctx.name.as_deref()).await; + let (items, failed) = self + .process_source(source, &ctx.options, ctx.name.as_deref()) + .await; if items.is_empty() && !failed.is_empty() { return Err(Error::Config(format!( "All {} source(s) failed to index", @@ -176,21 +177,26 @@ impl Engine { } // Multiple sources: parallel indexing - let concurrency = self.config.concurrency.max_concurrent_requests.min(ctx.sources.len()); - - let results: Vec<(Vec, Vec)> = - futures::stream::iter(&ctx.sources) - .map(|source| { - let options = ctx.options.clone(); - let name = ctx.name.clone(); - let engine = self.clone(); - async move { - engine.process_source(source, &options, name.as_deref()).await - } - }) - .buffer_unordered(concurrency) - .collect() - .await; + let concurrency = self + .config + .concurrency + .max_concurrent_requests + .min(ctx.sources.len()); + + let results: Vec<(Vec, Vec)> = futures::stream::iter(&ctx.sources) + .map(|source| { + let options = ctx.options.clone(); + let name = ctx.name.clone(); + let engine = self.clone(); + async move { + engine + .process_source(source, &options, name.as_deref()) + .await + } + }) + .buffer_unordered(concurrency) + .collect() + .await; let mut items = Vec::new(); let mut failed = Vec::new(); @@ -252,12 +258,18 @@ impl Engine { doc.format.clone(), doc.description.clone(), doc.page_count, - ).with_metrics_opt(metrics); - let persisted = self.indexer.to_persisted_with_options(doc, &pipeline_options); + ) + .with_metrics_opt(metrics); + let persisted = self + .indexer + .to_persisted_with_options(doc, &pipeline_options); if let Some(ref workspace) = self.workspace { if let Err(e) = workspace.save(&persisted).await { - return (Vec::new(), vec![FailedItem::new(&source_label, e.to_string())]); + return ( + Vec::new(), + vec![FailedItem::new(&source_label, e.to_string())], + ); } // Clean up old document after successful save (atomic: save-first, then remove old) if let Some(old_id) = &existing_id { @@ -270,11 +282,17 @@ impl Engine { } Err(e) => { tracing::warn!("Failed to index {}: {}", source_label, e); - (Vec::new(), vec![FailedItem::new(&source_label, e.to_string())]) + ( + Vec::new(), + vec![FailedItem::new(&source_label, e.to_string())], + ) } } } - Ok(IndexAction::IncrementalUpdate { old_tree, existing_id }) => { + Ok(IndexAction::IncrementalUpdate { + old_tree, + existing_id, + }) => { info!("Incremental update for: {}", source_label); match self .indexer @@ -291,13 +309,19 @@ impl Engine { doc.format.clone(), doc.description.clone(), doc.page_count, - ).with_metrics_opt(metrics); - let persisted = self.indexer.to_persisted_with_options(doc, &pipeline_options); + ) + .with_metrics_opt(metrics); + let persisted = self + .indexer + .to_persisted_with_options(doc, &pipeline_options); if let Some(ref workspace) = self.workspace { // save() is atomic (write-lock + put), no need to remove first if let Err(e) = workspace.save(&persisted).await { - return (Vec::new(), vec![FailedItem::new(&source_label, e.to_string())]); + return ( + Vec::new(), + vec![FailedItem::new(&source_label, e.to_string())], + ); } } @@ -306,13 +330,19 @@ impl Engine { } Err(e) => { tracing::warn!("Incremental update failed for {}: {}", source_label, e); - (Vec::new(), vec![FailedItem::new(&source_label, e.to_string())]) + ( + Vec::new(), + vec![FailedItem::new(&source_label, 
e.to_string())], + ) } } } Err(e) => { tracing::warn!("Failed to resolve action for {}: {}", source_label, e); - (Vec::new(), vec![FailedItem::new(&source_label, e.to_string())]) + ( + Vec::new(), + vec![FailedItem::new(&source_label, e.to_string())], + ) } } } @@ -416,13 +446,20 @@ impl Engine { pub async fn query_stream(&self, ctx: QueryContext) -> Result { let doc_id = match &ctx.scope { QueryScope::Single(id) => id.clone(), - _ => return Err(Error::Config("query_stream requires a single doc_id".to_string())), + _ => { + return Err(Error::Config( + "query_stream requires a single doc_id".to_string(), + )); + } }; let tree = self.get_structure(&doc_id).await?; let options = ctx.to_retrieve_options(&self.config); - let rx = self.retriever.query_stream(&tree, &ctx.query, &options).await?; + let rx = self + .retriever + .query_stream(&tree, &ctx.query, &options) + .await?; Ok(rx) } @@ -524,15 +561,13 @@ impl Engine { fn build_pipeline_options( &self, options: &super::types::IndexOptions, - format: crate::parser::DocumentFormat, + format: crate::index::parse::DocumentFormat, ) -> PipelineOptions { use crate::index::SummaryStrategy; PipelineOptions { mode: match format { - crate::parser::DocumentFormat::Markdown => crate::index::IndexMode::Markdown, - crate::parser::DocumentFormat::Pdf => crate::index::IndexMode::Pdf, - crate::parser::DocumentFormat::Html => crate::index::IndexMode::Html, - crate::parser::DocumentFormat::Docx => crate::index::IndexMode::Docx, + crate::index::parse::DocumentFormat::Markdown => crate::index::IndexMode::Markdown, + crate::index::parse::DocumentFormat::Pdf => crate::index::IndexMode::Pdf, }, generate_ids: options.generate_ids, summary_strategy: if options.generate_summaries { @@ -628,8 +663,8 @@ impl Engine { return Ok(IndexAction::Skip(incremental::SkipInfo { doc_id: existing_id, name, - format: crate::parser::DocumentFormat::from_extension(&format_str) - .unwrap_or(crate::parser::DocumentFormat::Markdown), + format: crate::index::parse::DocumentFormat::from_extension(&format_str) + .unwrap_or(crate::index::parse::DocumentFormat::Markdown), description: desc, page_count: pages, })); @@ -646,17 +681,13 @@ impl Engine { None => return Ok(IndexAction::FullIndex { existing_id: None }), }; - let format = crate::parser::DocumentFormat::from_extension(&stored_doc.meta.format) - .unwrap_or(crate::parser::DocumentFormat::Markdown); + let format = crate::index::parse::DocumentFormat::from_extension(&stored_doc.meta.format) + .unwrap_or(crate::index::parse::DocumentFormat::Markdown); let pipeline_options = self.build_pipeline_options(options, format); // If logic fingerprint changed, remove old doc before full reprocess - let action = incremental::resolve_action( - ¤t_bytes, - &stored_doc, - &pipeline_options, - format, - ); + let action = + incremental::resolve_action(¤t_bytes, &stored_doc, &pipeline_options, format); // Note: if FullIndex, old doc cleanup happens in process_source() // after successful save (save-first, then remove old). diff --git a/rust/src/client/index_context.rs b/rust/src/client/index_context.rs index 0f551bc8..aca0270e 100644 --- a/rust/src/client/index_context.rs +++ b/rust/src/client/index_context.rs @@ -34,7 +34,7 @@ use std::path::PathBuf; -use crate::parser::DocumentFormat; +use crate::index::parse::DocumentFormat; use super::types::{IndexMode, IndexOptions}; @@ -149,10 +149,10 @@ impl IndexContext { /// Create from a directory path. /// /// Indexes all supported files in the directory (non-recursive). 
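+    ///
+    /// A sketch, assuming an `engine` built elsewhere:
+    ///
+    /// ```rust,ignore
+    /// let result = engine.index(IndexContext::from_dir("./docs")).await?;
+    /// println!("indexed {} file(s)", result.items.len());
+    /// ```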
- /// Supported extensions: `.md`, `.pdf`, `.docx`, `.html`, `.txt`. + /// Supported extensions: `.md`, `.pdf`, `.txt`. pub fn from_dir(dir: impl Into) -> Self { let dir = dir.into(); - let supported_extensions = ["md", "markdown", "pdf", "docx", "html", "htm", "txt"]; + let supported_extensions = ["md", "markdown", "pdf", "txt"]; let mut sources = Vec::new(); if let Ok(entries) = std::fs::read_dir(&dir) { diff --git a/rust/src/client/indexer.rs b/rust/src/client/indexer.rs index bc10c210..f0e43890 100644 --- a/rust/src/client/indexer.rs +++ b/rust/src/client/indexer.rs @@ -20,20 +20,20 @@ //! println!("Indexed: {} ({} nodes)", result.id, result.tree.as_ref().map(|t| t.node_count()).unwrap_or(0)); //! ``` -use std::path::{Path, PathBuf}; +use std::path::Path; use std::sync::Arc; use tracing::info; use uuid::Uuid; use crate::error::{Error, Result}; +use crate::index::parse::DocumentFormat; use crate::index::{IndexInput, IndexMode, PipelineExecutor, PipelineOptions, SummaryStrategy}; use crate::llm::LlmClient; -use crate::parser::DocumentFormat; use crate::storage::{DocumentMeta, PersistedDocument}; use super::events::{EventEmitter, IndexEvent}; -use super::index_context::{IndexContext, IndexSource}; +use super::index_context::IndexSource; use super::types::{IndexOptions, IndexedDocument}; /// Document indexing client. @@ -121,7 +121,12 @@ impl IndexerClient { } /// Index a document from an index context. - pub async fn index(&self, source: &IndexSource, name: Option<&str>, options: &IndexOptions) -> Result { + pub async fn index( + &self, + source: &IndexSource, + name: Option<&str>, + options: &IndexOptions, + ) -> Result { self.index_with_existing(source, name, options, None).await } @@ -134,18 +139,29 @@ impl IndexerClient { existing_tree: Option<&crate::DocumentTree>, ) -> Result { match source { - IndexSource::Path(path) => self.index_from_path(path, name, options, existing_tree).await, + IndexSource::Path(path) => { + self.index_from_path(path, name, options, existing_tree) + .await + } IndexSource::Content { data, format } => { - self.index_from_content(data, *format, name, options, existing_tree).await + self.index_from_content(data, *format, name, options, existing_tree) + .await } IndexSource::Bytes { data, format } => { - self.index_from_bytes(data, *format, name, options, existing_tree).await + self.index_from_bytes(data, *format, name, options, existing_tree) + .await } } } /// Index from a file path. 
- async fn index_from_path(&self, path: &Path, name: Option<&str>, options: &IndexOptions, existing_tree: Option<&crate::DocumentTree>) -> Result { + async fn index_from_path( + &self, + path: &Path, + name: Option<&str>, + options: &IndexOptions, + existing_tree: Option<&crate::DocumentTree>, + ) -> Result { let path = path.canonicalize().unwrap_or_else(|_| path.to_path_buf()); if !path.exists() { @@ -168,11 +184,8 @@ impl IndexerClient { info!("Indexing {:?} document: {}", format, path.display()); // Build pipeline options - let pipeline_options = self.build_pipeline_options_with_existing( - options, - format, - existing_tree.cloned(), - ); + let pipeline_options = + self.build_pipeline_options_with_existing(options, format, existing_tree.cloned()); // Create pipeline input and execute let input = IndexInput::file(&path); @@ -201,11 +214,8 @@ impl IndexerClient { info!("Indexing {:?} document from content", format); - let pipeline_options = self.build_pipeline_options_with_existing( - options, - format, - existing_tree.cloned(), - ); + let pipeline_options = + self.build_pipeline_options_with_existing(options, format, existing_tree.cloned()); let input = IndexInput::content(content); let mut executor = (self.executor_factory)(); @@ -237,11 +247,8 @@ impl IndexerClient { bytes.len() ); - let pipeline_options = self.build_pipeline_options_with_existing( - options, - format, - existing_tree.cloned(), - ); + let pipeline_options = + self.build_pipeline_options_with_existing(options, format, existing_tree.cloned()); let input = IndexInput::bytes(bytes); let mut executor = (self.executor_factory)(); @@ -270,8 +277,6 @@ impl IndexerClient { mode: match format { DocumentFormat::Markdown => IndexMode::Markdown, DocumentFormat::Pdf => IndexMode::Pdf, - DocumentFormat::Html => IndexMode::Html, - DocumentFormat::Docx => IndexMode::Docx, }, generate_ids: options.generate_ids, summary_strategy: if options.generate_summaries { @@ -397,7 +402,11 @@ impl IndexerClient { } /// Convert IndexedDocument to PersistedDocument, storing fingerprints from pipeline options. 
- pub fn to_persisted_with_options(&self, doc: IndexedDocument, pipeline_options: &PipelineOptions) -> PersistedDocument { + pub fn to_persisted_with_options( + &self, + doc: IndexedDocument, + pipeline_options: &PipelineOptions, + ) -> PersistedDocument { let mut meta = DocumentMeta::new(&doc.id, &doc.name, doc.format.extension()) .with_source_path( doc.source_path diff --git a/rust/src/client/mod.rs b/rust/src/client/mod.rs index d2852efe..286e0511 100644 --- a/rust/src/client/mod.rs +++ b/rust/src/client/mod.rs @@ -99,19 +99,12 @@ pub use events::EventEmitter; // ============================================================ pub use types::{ - ClientError, - DocumentInfo, - FailedItem, - IndexItem, - IndexMode, - IndexOptions, - IndexResult, - QueryResult, - QueryResultItem, + ClientError, DocumentInfo, FailedItem, IndexItem, IndexMode, IndexOptions, IndexResult, + QueryResult, QueryResultItem, }; // ============================================================ // Parser Types (needed for IndexContext::from_content) // ============================================================ -pub use crate::parser::DocumentFormat; +pub use crate::index::parse::DocumentFormat; diff --git a/rust/src/client/retriever.rs b/rust/src/client/retriever.rs index 23e9a051..a5b8676e 100644 --- a/rust/src/client/retriever.rs +++ b/rust/src/client/retriever.rs @@ -21,17 +21,14 @@ use std::sync::Arc; use tracing::info; +use super::events::{EventEmitter, QueryEvent}; +use super::types::QueryResultItem; use crate::config::Config; use crate::document::{DocumentTree, NodeId}; use crate::error::{Error, Result}; use crate::retrieval::content::ContentAggregatorConfig; -use crate::retrieval::stream::{RetrieveEvent, RetrieveEventReceiver}; -use crate::retrieval::{ - QueryComplexity, RetrievalResult, RetrieveOptions, RetrieveResponse, Retriever, - SufficiencyLevel, -}; -use super::events::{EventEmitter, QueryEvent}; -use super::types::QueryResultItem; +use crate::retrieval::stream::RetrieveEventReceiver; +use crate::retrieval::{RetrievalResult, RetrieveOptions, RetrieveResponse, Retriever}; /// Document retrieval client. /// diff --git a/rust/src/client/types.rs b/rust/src/client/types.rs index 099a9987..61201a25 100644 --- a/rust/src/client/types.rs +++ b/rust/src/client/types.rs @@ -9,8 +9,8 @@ use serde::{Deserialize, Serialize}; use std::path::PathBuf; use crate::document::DocumentTree; +use crate::index::parse::DocumentFormat; use crate::metrics::IndexMetrics; -use crate::parser::DocumentFormat; // ============================================================ // Document Types @@ -592,7 +592,13 @@ mod tests { #[test] fn test_partial_success() { - let items = vec![IndexItem::new("doc-1", "A", DocumentFormat::Markdown, None, None)]; + let items = vec![IndexItem::new( + "doc-1", + "A", + DocumentFormat::Markdown, + None, + None, + )]; let failed = vec![FailedItem::new("missing.pdf", "File not found")]; let result = IndexResult::with_partial(items, failed); diff --git a/rust/src/client/workspace.rs b/rust/src/client/workspace.rs index a4f7b59e..d23c37bb 100644 --- a/rust/src/client/workspace.rs +++ b/rust/src/client/workspace.rs @@ -25,7 +25,7 @@ use std::sync::Arc; -use tracing::{debug, info, warn}; +use tracing::{debug, info}; use crate::error::Result; use crate::storage::{PersistedDocument, Workspace}; @@ -317,28 +317,3 @@ pub(crate) struct WorkspaceStats { /// Number of documents in the workspace. 
pub document_count: usize, } - -#[cfg(test)] -mod tests { - use super::*; - use crate::storage::backend::MemoryBackend; - use std::sync::Arc as StdArc; - - #[tokio::test] - async fn test_workspace_client_creation() { - let backend = StdArc::new(MemoryBackend::new()); - let workspace = Workspace::with_backend(backend).await.unwrap(); - let client = WorkspaceClient::new(workspace).await; - assert!(client.is_empty().await); - } - - #[tokio::test] - async fn test_workspace_stats() { - let backend = StdArc::new(MemoryBackend::new()); - let workspace = Workspace::with_backend(backend).await.unwrap(); - let client = WorkspaceClient::new(workspace).await; - - let stats = client.stats().await.unwrap(); - assert_eq!(stats.document_count, 0); - } -} diff --git a/rust/src/config/mod.rs b/rust/src/config/mod.rs index b73d21c2..5ab66b55 100644 --- a/rust/src/config/mod.rs +++ b/rust/src/config/mod.rs @@ -11,12 +11,10 @@ mod merge; mod types; mod validator; -pub(crate) use loader::{ConfigError, ConfigLoader}; +pub(crate) use loader::ConfigLoader; pub(crate) use types::{ - CacheConfig, CompressionAlgorithm, CompressionConfig, ConcurrencyConfig, Config, - ConfigValidationError, ContentAggregatorConfig, FallbackBehavior, FallbackConfig, - IndexerConfig, LlmClientConfig, LlmConfig, LlmFallbackBehavior, LlmFallbackConfig, - LlmMetricsConfig, LlmPoolConfig, MetricsConfig, OnAllFailedBehavior, PilotMetricsConfig, - RetrievalConfig, RetrievalMetricsConfig, RetryConfig, SearchConfig, Severity, StorageConfig, - StrategyConfig, SufficiencyConfig, SummaryConfig, ThrottleConfig, ValidationError, + CacheConfig, CompressionAlgorithm, ConcurrencyConfig, Config, FallbackBehavior, FallbackConfig, + IndexerConfig, LlmConfig, LlmMetricsConfig, MetricsConfig, OnAllFailedBehavior, + PilotMetricsConfig, RetrievalConfig, RetrievalMetricsConfig, StrategyConfig, SufficiencyConfig, + SummaryConfig, }; diff --git a/rust/src/config/types/llm_pool.rs b/rust/src/config/types/llm_pool.rs index 18793400..c7b0a18c 100644 --- a/rust/src/config/types/llm_pool.rs +++ b/rust/src/config/types/llm_pool.rs @@ -263,8 +263,7 @@ impl RetryConfig { /// Calculate delay for a given attempt (0-indexed). 
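+    ///
+    /// For example, with `initial_delay_ms = 100`, `multiplier = 2.0`, and
+    /// `max_delay_ms = 1000`, attempts 0..=4 wait 100, 200, 400, 800, and
+    /// 1000 ms (the last capped by `max_delay_ms`).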
pub fn delay_for_attempt(&self, attempt: usize) -> std::time::Duration { - let delay_ms = - (self.initial_delay_ms as f64) * self.multiplier.powi(attempt as i32); + let delay_ms = (self.initial_delay_ms as f64) * self.multiplier.powi(attempt as i32); let delay_ms = delay_ms.min(self.max_delay_ms as f64); std::time::Duration::from_millis(delay_ms as u64) } diff --git a/rust/src/config/types/mod.rs b/rust/src/config/types/mod.rs index ea6cedbd..e6735072 100644 --- a/rust/src/config/types/mod.rs +++ b/rust/src/config/types/mod.rs @@ -23,18 +23,13 @@ pub(crate) use content::ContentAggregatorConfig; pub(crate) use fallback::{FallbackBehavior, FallbackConfig, OnAllFailedBehavior}; pub(crate) use indexer::IndexerConfig; pub(crate) use llm::{LlmConfig, SummaryConfig}; -pub(crate) use llm_pool::{ - FallbackBehavior as LlmFallbackBehavior, FallbackConfig as LlmFallbackConfig, - LlmClientConfig, LlmPoolConfig, OnAllFailedBehavior as LlmOnAllFailedBehavior, RetryConfig, - ThrottleConfig, -}; +pub(crate) use llm_pool::LlmPoolConfig; pub(crate) use metrics::{ LlmMetricsConfig, MetricsConfig, PilotMetricsConfig, RetrievalMetricsConfig, }; pub(crate) use retrieval::{RetrievalConfig, SearchConfig}; pub(crate) use storage::{ - CacheConfig, CompressionAlgorithm, CompressionConfig, StorageConfig, StrategyConfig, - SufficiencyConfig, + CacheConfig, CompressionAlgorithm, StorageConfig, StrategyConfig, SufficiencyConfig, }; /// Main configuration for vectorless. diff --git a/rust/src/config/types/storage.rs b/rust/src/config/types/storage.rs index ac8bd2cd..b50e86e6 100644 --- a/rust/src/config/types/storage.rs +++ b/rust/src/config/types/storage.rs @@ -387,14 +387,30 @@ pub struct HybridStrategyConfig { pub llm_weight: f32, } -fn default_true() -> bool { true } -fn default_pre_filter_ratio() -> f32 { 0.3 } -fn default_min_candidates() -> usize { 2 } -fn default_max_candidates() -> usize { 5 } -fn default_auto_accept_threshold() -> f32 { 0.85 } -fn default_auto_reject_threshold() -> f32 { 0.15 } -fn default_bm25_weight() -> f32 { 0.4 } -fn default_llm_weight() -> f32 { 0.6 } +fn default_true() -> bool { + true +} +fn default_pre_filter_ratio() -> f32 { + 0.3 +} +fn default_min_candidates() -> usize { + 2 +} +fn default_max_candidates() -> usize { + 5 +} +fn default_auto_accept_threshold() -> f32 { + 0.85 +} +fn default_auto_reject_threshold() -> f32 { + 0.15 +} +fn default_bm25_weight() -> f32 { + 0.4 +} +fn default_llm_weight() -> f32 { + 0.6 +} impl Default for HybridStrategyConfig { fn default() -> Self { @@ -443,11 +459,21 @@ pub struct CrossDocumentStrategyConfig { pub parallel_search: bool, } -fn default_max_documents() -> usize { 10 } -fn default_max_results_per_doc() -> usize { 3 } -fn default_max_total_results() -> usize { 10 } -fn default_min_score() -> f32 { 0.3 } -fn default_merge_strategy() -> String { "TopK".to_string() } +fn default_max_documents() -> usize { + 10 +} +fn default_max_results_per_doc() -> usize { + 3 +} +fn default_max_total_results() -> usize { + 10 +} +fn default_min_score() -> f32 { + 0.3 +} +fn default_merge_strategy() -> String { + "TopK".to_string() +} impl Default for CrossDocumentStrategyConfig { fn default() -> Self { @@ -483,7 +509,9 @@ pub struct PageRangeStrategyConfig { pub min_overlap_ratio: f32, } -fn default_min_overlap_ratio() -> f32 { 0.1 } +fn default_min_overlap_ratio() -> f32 { + 0.1 +} impl Default for PageRangeStrategyConfig { fn default() -> Self { diff --git a/rust/src/document/mod.rs b/rust/src/document/mod.rs index d2abf53f..cc7c22e8 100644 --- 
a/rust/src/document/mod.rs +++ b/rust/src/document/mod.rs @@ -33,9 +33,7 @@ pub use reasoning::{ HotNodeEntry, ReasoningIndex, ReasoningIndexBuilder, ReasoningIndexConfig, SectionSummary, SummaryShortcut, TopicEntry, }; -pub use reference::{ - NodeReference, RefType, ReferenceExtractor, ReferenceResolver, -}; +pub use reference::{NodeReference, RefType, ReferenceExtractor, ReferenceResolver}; pub use structure::{DocumentStructure, StructureNode}; pub use toc::{TocConfig, TocEntry, TocNode, TocView}; pub use tree::{DocumentTree, RetrievalIndex}; diff --git a/rust/src/document/reasoning.rs b/rust/src/document/reasoning.rs index 5aaae8bd..6146763e 100644 --- a/rust/src/document/reasoning.rs +++ b/rust/src/document/reasoning.rs @@ -67,7 +67,10 @@ impl ReasoningIndex { /// Check if a node is marked as hot. pub fn is_hot(&self, node_id: NodeId) -> bool { - self.hot_nodes.get(&node_id).map(|e| e.is_hot).unwrap_or(false) + self.hot_nodes + .get(&node_id) + .map(|e| e.is_hot) + .unwrap_or(false) } /// Get the hot node entry for a node. @@ -160,7 +163,8 @@ impl ReasoningIndexBuilder { /// Add a section mapping. pub fn add_section(&mut self, title: impl Into, node_id: NodeId) { - self.section_map.insert(title.into().to_lowercase(), node_id); + self.section_map + .insert(title.into().to_lowercase(), node_id); } /// Set the config hash for cache invalidation. diff --git a/rust/src/document/reference.rs b/rust/src/document/reference.rs index 4a3a1a53..10d08e42 100644 --- a/rust/src/document/reference.rs +++ b/rust/src/document/reference.rs @@ -339,7 +339,8 @@ impl ReferenceExtractor { for node_id in tree.traverse() { if let Some(node) = tree.get(node_id) { let title_lower = node.title.to_lowercase(); - if title_lower.starts_with(&format!("appendix {}", r#ref.target_id.to_lowercase())) + if title_lower + .starts_with(&format!("appendix {}", r#ref.target_id.to_lowercase())) || title_lower == format!("appendix {}", r#ref.target_id.to_lowercase()) { return Some(node_id); @@ -438,10 +439,16 @@ mod tests { // Debug: print what was extracted for r in &refs { - eprintln!("Extracted: {:?} '{}' -> '{}'", r.ref_type, r.ref_text, r.target_id); + eprintln!( + "Extracted: {:?} '{}' -> '{}'", + r.ref_type, r.ref_text, r.target_id + ); } - assert!(refs.iter().any(|r| r.ref_type == RefType::Section && r.target_id == "2.1")); + assert!( + refs.iter() + .any(|r| r.ref_type == RefType::Section && r.target_id == "2.1") + ); // Note: The regex may not capture all multi-level section numbers correctly // in a single pass, so we check for the presence of section references assert!(refs.iter().any(|r| r.ref_type == RefType::Section)); @@ -452,7 +459,10 @@ mod tests { let text = "See Appendix G for more information."; let refs = ReferenceExtractor::extract(text); - assert!(refs.iter().any(|r| r.ref_type == RefType::Appendix && r.target_id == "G")); + assert!( + refs.iter() + .any(|r| r.ref_type == RefType::Appendix && r.target_id == "G") + ); } #[test] @@ -462,12 +472,22 @@ mod tests { // Debug output for r in &refs { - eprintln!("Extracted: {:?} '{}' -> '{}'", r.ref_type, r.ref_text, r.target_id); + eprintln!( + "Extracted: {:?} '{}' -> '{}'", + r.ref_type, r.ref_text, r.target_id + ); } - assert!(refs.iter().any(|r| r.ref_type == RefType::Table && r.target_id == "5.3")); + assert!( + refs.iter() + .any(|r| r.ref_type == RefType::Table && r.target_id == "5.3") + ); // The trailing period may be included, so check for either "1" or "1." 
- assert!(refs.iter().any(|r| r.ref_type == RefType::Table && (r.target_id == "1" || r.target_id == "1."))); + assert!( + refs.iter().any( + |r| r.ref_type == RefType::Table && (r.target_id == "1" || r.target_id == "1.") + ) + ); } #[test] @@ -477,12 +497,22 @@ mod tests { // Debug output for r in &refs { - eprintln!("Extracted: {:?} '{}' -> '{}'", r.ref_type, r.ref_text, r.target_id); + eprintln!( + "Extracted: {:?} '{}' -> '{}'", + r.ref_type, r.ref_text, r.target_id + ); } - assert!(refs.iter().any(|r| r.ref_type == RefType::Figure && r.target_id == "2.1")); + assert!( + refs.iter() + .any(|r| r.ref_type == RefType::Figure && r.target_id == "2.1") + ); // The trailing period may be included, so check for either "3" or "3." - assert!(refs.iter().any(|r| r.ref_type == RefType::Figure && (r.target_id == "3" || r.target_id == "3."))); + assert!( + refs.iter() + .any(|r| r.ref_type == RefType::Figure + && (r.target_id == "3" || r.target_id == "3.")) + ); } #[test] @@ -490,7 +520,10 @@ mod tests { let text = "See page 42 for details."; let refs = ReferenceExtractor::extract(text); - assert!(refs.iter().any(|r| r.ref_type == RefType::Page && r.target_id == "42")); + assert!( + refs.iter() + .any(|r| r.ref_type == RefType::Page && r.target_id == "42") + ); } #[test] diff --git a/rust/src/document/structure.rs b/rust/src/document/structure.rs index 6fa93b35..455b25cb 100644 --- a/rust/src/document/structure.rs +++ b/rust/src/document/structure.rs @@ -3,8 +3,6 @@ //! Document structure types for JSON export. //! -//! These types define the JSON format for exporting document trees, -//! compatible with PageIndex format. use serde::{Deserialize, Serialize}; diff --git a/rust/src/document/tree.rs b/rust/src/document/tree.rs index 5521a5c2..24dacb26 100644 --- a/rust/src/document/tree.rs +++ b/rust/src/document/tree.rs @@ -602,7 +602,7 @@ impl DocumentTree { } } - /// Export the tree structure to JSON format (PageIndex compatible). + /// Export the tree structure to JSON format. 
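+    ///
+    /// A sketch (assumes the usual serde derives on `DocumentStructure`):
+    ///
+    /// ```rust,ignore
+    /// let structure = tree.to_structure_json("my-doc");
+    /// let json = serde_json::to_string_pretty(&structure)?;
+    /// ```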
pub fn to_structure_json(&self, doc_name: &str) -> DocumentStructure { let structure = self.build_structure_nodes(self.root_id); DocumentStructure { diff --git a/rust/src/events/emitter.rs b/rust/src/events/emitter.rs index a281d18b..42753b5b 100644 --- a/rust/src/events/emitter.rs +++ b/rust/src/events/emitter.rs @@ -171,12 +171,18 @@ impl EventEmitter { pub fn merge(self, other: EventEmitter) -> Self { let mut other_inner = other.inner.write(); let mut inner = self.inner.write(); - inner.index_handlers.extend(other_inner.index_handlers.drain(..)); - inner.query_handlers.extend(other_inner.query_handlers.drain(..)); + inner + .index_handlers + .extend(other_inner.index_handlers.drain(..)); + inner + .query_handlers + .extend(other_inner.query_handlers.drain(..)); inner .workspace_handlers .extend(other_inner.workspace_handlers.drain(..)); - inner.async_handlers.extend(other_inner.async_handlers.drain(..)); + inner + .async_handlers + .extend(other_inner.async_handlers.drain(..)); drop(inner); drop(other_inner); self diff --git a/rust/src/events/mod.rs b/rust/src/events/mod.rs index 8e2b1526..7e390219 100644 --- a/rust/src/events/mod.rs +++ b/rust/src/events/mod.rs @@ -29,6 +29,3 @@ mod types; pub use emitter::EventEmitter; pub use types::{Event, IndexEvent, QueryEvent, WorkspaceEvent}; - -// Re-export handler traits for internal use -pub(crate) use emitter::{AsyncEventHandler, EventHandler}; diff --git a/rust/src/events/types.rs b/rust/src/events/types.rs index 7c8e58ce..2d5c22f7 100644 --- a/rust/src/events/types.rs +++ b/rust/src/events/types.rs @@ -6,7 +6,7 @@ //! Provides enums for indexing, query, and workspace events //! that can be observed via [`EventEmitter`](super::EventEmitter). -use crate::parser::DocumentFormat; +use crate::index::parse::DocumentFormat; use crate::retrieval::SufficiencyLevel; /// Top-level event types for client operations. diff --git a/rust/src/graph/builder.rs b/rust/src/graph/builder.rs index 62ee87a5..6cdf388b 100644 --- a/rust/src/graph/builder.rs +++ b/rust/src/graph/builder.rs @@ -250,10 +250,7 @@ mod tests { use super::*; fn make_keywords(pairs: &[(&str, f32)]) -> HashMap { - pairs - .iter() - .map(|&(k, w)| (k.to_string(), w)) - .collect() + pairs.iter().map(|&(k, w)| (k.to_string(), w)).collect() } #[test] @@ -359,13 +356,7 @@ mod tests { "A", "md", 5, - make_keywords(&[ - ("a", 0.9), - ("b", 0.8), - ("c", 0.7), - ("d", 0.6), - ("e", 0.5), - ]), + make_keywords(&[("a", 0.9), ("b", 0.8), ("c", 0.7), ("d", 0.6), ("e", 0.5)]), ); builder.add_document( "doc2", diff --git a/rust/src/graph/types.rs b/rust/src/graph/types.rs index 3a8577a6..08f8d00a 100644 --- a/rust/src/graph/types.rs +++ b/rust/src/graph/types.rs @@ -81,10 +81,7 @@ impl DocumentGraph { /// Add a directed edge from `source` to `target`. pub fn add_edge(&mut self, source: &str, edge: GraphEdge) { - self.edges - .entry(source.to_string()) - .or_default() - .push(edge); + self.edges.entry(source.to_string()).or_default().push(edge); self.metadata.edge_count = self.edges.values().map(|v| v.len()).sum(); } @@ -100,9 +97,7 @@ impl DocumentGraph { /// Find documents containing a keyword. pub fn find_by_keyword(&self, keyword: &str) -> &[KeywordDocEntry] { - self.keyword_index - .get(keyword) - .map_or(&[], Vec::as_slice) + self.keyword_index.get(keyword).map_or(&[], Vec::as_slice) } /// Get the number of documents in the graph. @@ -144,7 +139,7 @@ pub struct DocumentGraphNode { pub doc_id: String, /// Document title/name. pub title: String, - /// Document format (md, pdf, docx). 
+ /// Document format (md, pdf). pub format: String, /// Top-N representative keywords extracted from the document's /// ReasoningIndex topic_paths, sorted by aggregate weight. @@ -233,8 +228,14 @@ mod tests { title: "Test Doc".to_string(), format: "md".to_string(), top_keywords: vec![ - WeightedKeyword { keyword: "rust".to_string(), weight: 0.9 }, - WeightedKeyword { keyword: "async".to_string(), weight: 0.7 }, + WeightedKeyword { + keyword: "rust".to_string(), + weight: 0.9, + }, + WeightedKeyword { + keyword: "async".to_string(), + weight: 0.7, + }, ], node_count: 10, }); @@ -264,19 +265,22 @@ mod tests { node_count: 8, }); - graph.add_edge("doc1", GraphEdge { - target_doc_id: "doc2".to_string(), - weight: 0.5, - evidence: EdgeEvidence { - shared_keywords: vec![SharedKeyword { - keyword: "rust".to_string(), - source_weight: 0.9, - target_weight: 0.8, - }], - shared_keyword_count: 1, - keyword_jaccard: 0.3, + graph.add_edge( + "doc1", + GraphEdge { + target_doc_id: "doc2".to_string(), + weight: 0.5, + evidence: EdgeEvidence { + shared_keywords: vec![SharedKeyword { + keyword: "rust".to_string(), + source_weight: 0.9, + target_weight: 0.8, + }], + shared_keyword_count: 1, + keyword_jaccard: 0.3, + }, }, - }); + ); assert_eq!(graph.edge_count(), 1); assert_eq!(graph.get_neighbors("doc1").len(), 1); @@ -291,7 +295,10 @@ mod tests { doc_id: "doc1".to_string(), title: "Test".to_string(), format: "md".to_string(), - top_keywords: vec![WeightedKeyword { keyword: "test".to_string(), weight: 1.0 }], + top_keywords: vec![WeightedKeyword { + keyword: "test".to_string(), + weight: 1.0, + }], node_count: 3, }); diff --git a/rust/src/index/config.rs b/rust/src/index/config.rs index c6fa74ea..f06fa22c 100644 --- a/rust/src/index/config.rs +++ b/rust/src/index/config.rs @@ -14,6 +14,8 @@ use crate::config::{ConcurrencyConfig, IndexerConfig}; use crate::document::{DocumentTree, ReasoningIndexConfig}; use crate::utils::fingerprint::{Fingerprint, Fingerprinter}; +use std::path::PathBuf; + /// Index mode for document processing. #[derive(Debug, Clone, Copy, PartialEq, Eq)] pub enum IndexMode { @@ -23,10 +25,6 @@ pub enum IndexMode { Markdown, /// Force PDF format. Pdf, - /// Force DOCX format. - Docx, - /// Force HTML format. - Html, } impl Default for IndexMode { @@ -97,6 +95,11 @@ pub struct ThinningConfig { /// Token threshold for merging. pub threshold: usize, + + /// Whether to merge child content into the parent when removing children. + /// When true, nodes below threshold absorb their children's text before removal. + /// When false, small nodes are simply discarded. + pub merge_content: bool, } impl Default for ThinningConfig { @@ -104,6 +107,7 @@ impl Default for ThinningConfig { Self { enabled: false, threshold: 500, + merge_content: true, } } } @@ -119,6 +123,7 @@ impl ThinningConfig { Self { enabled: true, threshold, + merge_content: true, } } @@ -127,6 +132,58 @@ impl ThinningConfig { self.threshold = threshold; self } + + /// Set whether to merge content. + pub fn with_merge_content(mut self, merge: bool) -> Self { + self.merge_content = merge; + self + } +} + +/// Configuration for large node splitting. +#[derive(Debug, Clone)] +pub struct SplitConfig { + /// Whether splitting is enabled. + pub enabled: bool, + + /// Maximum tokens per leaf node. Nodes exceeding this are split. + pub max_tokens_per_node: usize, + + /// Whether to use pattern-based splitting (headings, paragraphs). + /// When false, splits at approximate byte boundaries. 
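+    ///
+    /// For example, a 20,000-token leaf with `max_tokens_per_node = 8000`
+    /// splits into three leaves, cut at heading or paragraph boundaries
+    /// when this flag is on.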
+ pub pattern_split: bool, +} + +impl Default for SplitConfig { + fn default() -> Self { + Self { + enabled: true, + max_tokens_per_node: 8000, + pattern_split: true, + } + } +} + +impl SplitConfig { + /// Create disabled config. + pub fn disabled() -> Self { + Self { + enabled: false, + ..Self::default() + } + } + + /// Create enabled config with custom token limit. + pub fn with_max_tokens(mut self, max: usize) -> Self { + self.max_tokens_per_node = max; + self + } + + /// Set whether to use pattern-based splitting. + pub fn with_pattern_split(mut self, pattern: bool) -> Self { + self.pattern_split = pattern; + self + } } /// Pipeline options for index execution. @@ -147,6 +204,9 @@ pub struct PipelineOptions { /// Optimization configuration. pub optimization: OptimizationConfig, + /// Split configuration. + pub split: SplitConfig, + /// Whether to generate document description. pub generate_description: bool, @@ -166,6 +226,12 @@ pub struct PipelineOptions { /// Current processing version. Bumped when indexing algorithm changes /// to force reprocessing of existing documents. pub processing_version: u32, + + /// Directory for pipeline checkpoints. + /// When set, the pipeline saves state after each stage group + /// and can resume from the last completed stage on restart. + /// When `None`, checkpointing is disabled. + pub checkpoint_dir: Option, } impl Default for PipelineOptions { @@ -176,12 +242,14 @@ impl Default for PipelineOptions { summary_strategy: SummaryStrategy::full(), thinning: ThinningConfig::default(), optimization: OptimizationConfig::default(), + split: SplitConfig::default(), generate_description: true, concurrency: ConcurrencyConfig::default(), indexer: IndexerConfig::default(), reasoning_index: ReasoningIndexConfig::default(), existing_tree: None, processing_version: 1, + checkpoint_dir: None, } } } @@ -222,6 +290,12 @@ impl PipelineOptions { self } + /// Set the split configuration. + pub fn with_split(mut self, split: SplitConfig) -> Self { + self.split = split; + self + } + /// Set whether to generate document description. pub fn with_generate_description(mut self, generate: bool) -> Self { self.generate_description = generate; @@ -246,6 +320,15 @@ impl PipelineOptions { self } + /// Set the checkpoint directory. + /// + /// When set, the pipeline saves state after each stage group + /// and can resume from the last completed stage on restart. + pub fn with_checkpoint_dir(mut self, dir: impl Into) -> Self { + self.checkpoint_dir = Some(dir.into()); + self + } + /// Compute a fingerprint of the pipeline configuration. /// /// If this fingerprint changes between runs, all documents need full reprocessing diff --git a/rust/src/index/incremental/detector.rs b/rust/src/index/incremental/detector.rs index 73c018b2..c69e653e 100644 --- a/rust/src/index/incremental/detector.rs +++ b/rust/src/index/incremental/detector.rs @@ -291,10 +291,8 @@ impl ChangeDetector { } // Record processing version - self.processing_versions.insert( - doc_id.to_string(), - self.current_processing_version, - ); + self.processing_versions + .insert(doc_id.to_string(), self.current_processing_version); } /// Record document from ChangeInfo. @@ -316,11 +314,7 @@ impl ChangeDetector { } /// Detect changes between two trees using fingerprints. 
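+    ///
+    /// ```rust,ignore
+    /// let changes = detector.detect_changes(&old_tree, &new_tree);
+    /// println!(
+    ///     "modified: {}, removed: {}",
+    ///     changes.modified.len(),
+    ///     changes.removed.len()
+    /// );
+    /// ```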
- pub fn detect_changes( - &self, - old_tree: &DocumentTree, - new_tree: &DocumentTree, - ) -> ChangeSet { + pub fn detect_changes(&self, old_tree: &DocumentTree, new_tree: &DocumentTree) -> ChangeSet { let mut changes = ChangeSet::new(); // Collect fingerprints from both trees @@ -333,7 +327,10 @@ impl ChangeDetector { let mut map = HashMap::new(); for node_id in old_tree.traverse() { if let Some(node) = old_tree.get(node_id) { - let key = node.node_id.clone().unwrap_or_else(|| format!("node_{:?}", node_id.0)); + let key = node + .node_id + .clone() + .unwrap_or_else(|| format!("node_{:?}", node_id.0)); if let Some(fp) = old_fps.get(&key) { map.insert(node.title.clone(), (key, fp.clone())); } @@ -346,7 +343,10 @@ impl ChangeDetector { let mut map = HashMap::new(); for node_id in new_tree.traverse() { if let Some(node) = new_tree.get(node_id) { - let key = node.node_id.clone().unwrap_or_else(|| format!("node_{:?}", node_id.0)); + let key = node + .node_id + .clone() + .unwrap_or_else(|| format!("node_{:?}", node_id.0)); if let Some(fp) = new_fps.get(&key) { map.insert(node.title.clone(), (key, fp.clone())); } @@ -369,12 +369,8 @@ impl ChangeDetector { for (title, (node_key, fp)) in &old_by_title { if !new_by_title.contains_key(title) { changes.removed.push( - NodeChange::new( - Some(node_key.clone()), - title.clone(), - ChangeType::Removed, - ) - .with_fingerprint(fp.clone()), + NodeChange::new(Some(node_key.clone()), title.clone(), ChangeType::Removed) + .with_fingerprint(fp.clone()), ); } } @@ -384,12 +380,8 @@ impl ChangeDetector { if let Some((_old_key, old_fp)) = old_by_title.get(title) { if new_fp.content_changed(old_fp) { changes.modified.push( - NodeChange::new( - Some(new_key.clone()), - title.clone(), - ChangeType::Modified, - ) - .with_fingerprint(new_fp.clone()), + NodeChange::new(Some(new_key.clone()), title.clone(), ChangeType::Modified) + .with_fingerprint(new_fp.clone()), ); } else if new_fp.subtree_changed(old_fp) { changes.restructured.push( @@ -562,7 +554,10 @@ pub fn compute_all_node_fingerprints(tree: &DocumentTree) -> HashMap = changes.modified + let changed_titles: std::collections::HashSet = changes + .modified .iter() .chain(changes.restructured.iter()) .chain(changes.added.iter()) diff --git a/rust/src/index/incremental/resolver.rs b/rust/src/index/incremental/resolver.rs index 4d3d7031..a8087fd4 100644 --- a/rust/src/index/incremental/resolver.rs +++ b/rust/src/index/incremental/resolver.rs @@ -11,10 +11,10 @@ use tracing::info; use crate::document::DocumentTree; +use crate::index::config::PipelineOptions; +use crate::index::parse::DocumentFormat; use crate::storage::PersistedDocument; use crate::utils::fingerprint::Fingerprint; -use crate::index::config::PipelineOptions; -use crate::parser::DocumentFormat; /// Action to take for a source during indexing. 
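+///
+/// A sketch of how callers branch on the resolved action:
+///
+/// ```rust,ignore
+/// match resolve_action(&file_bytes, &stored_doc, &pipeline_options, format) {
+///     IndexAction::Skip(info) => println!("unchanged: {}", info.doc_id),
+///     IndexAction::IncrementalUpdate { .. } => println!("partial update"),
+///     IndexAction::FullIndex { .. } => println!("full reprocess"),
+/// }
+/// ```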
pub enum IndexAction { @@ -71,7 +71,10 @@ pub fn resolve_action( let current_fp = Fingerprint::from_bytes(file_bytes); // Layer 1: File-level content fingerprint - if !stored_doc.meta.needs_reprocessing(¤t_fp, pipeline_options.processing_version) { + if !stored_doc + .meta + .needs_reprocessing(¤t_fp, pipeline_options.processing_version) + { info!("File fingerprint unchanged, skipping"); return IndexAction::Skip(SkipInfo { doc_id: stored_doc.meta.id.clone(), diff --git a/rust/src/index/incremental/updater.rs b/rust/src/index/incremental/updater.rs index fd1575df..a9220acf 100644 --- a/rust/src/index/incremental/updater.rs +++ b/rust/src/index/incremental/updater.rs @@ -7,7 +7,7 @@ use tracing::info; use crate::document::{DocumentTree, NodeId}; use crate::error::Result; -use crate::parser::RawNode; +use crate::index::parse::RawNode; use super::detector::ChangeDetector; diff --git a/rust/src/index/mod.rs b/rust/src/index/mod.rs index 6a4c5dd1..269ad362 100644 --- a/rust/src/index/mod.rs +++ b/rust/src/index/mod.rs @@ -8,18 +8,35 @@ //! # Architecture //! //! ```text -//! ┌─────────────┐ ┌─────────────┐ ┌─────────────┐ ┌─────────────┐ -//! │ Parse │───►│ Build │───►│ Enhance │───►│ Enrich │ -//! │ (Document) │ │ (Tree) │ │ (LLM Boost)│ │ (Metadata) │ -//! └─────────────┘ └─────────────┘ └─────────────┘ └─────────────┘ -//! │ -//! ▼ -//! ┌─────────────┐ ┌─────────────┐ ┌─────────────┐ ┌─────────────┐ -//! │ Output │◄───│ Persist │◄───│ Optimize │◄───│ Enrich │ -//! │ (Indexed) │ │ (Storage) │ │ (Tree) │ │ │ -//! └─────────────┘ └─────────────┘ └─────────────┘ └─────────────┘ +//! Priority 10: ┌──────────┐ +//! │ Parse │ Parse document into raw nodes +//! └────┬─────┘ +//! Priority 20: ┌────▼─────┐ +//! │ Build │ Construct tree + thinning (with content merge) +//! └────┬─────┘ +//! Priority 22: ┌────▼─────┐ +//! │ Validate │ Tree integrity checks (optional) +//! └────┬─────┘ +//! Priority 25: ┌────▼─────┐ +//! │ Split │ Split oversized leaf nodes (optional) +//! └────┬─────┘ +//! Priority 30: ┌────▼─────┐ +//! │ Enhance │ LLM summaries (when client available) +//! └────┬─────┘ +//! Priority 40: ┌────▼─────┐ +//! │ Enrich │ Metadata + cross-references +//! └────┬─────┘ +//! Priority 45: ┌────▼──────────┐ +//! │ Reasoning Idx │ Pre-computed reasoning index +//! └────┬──────────┘ +//! Priority 60: ┌────▼──────┐ +//! │ Optimize │ Final tree optimization +//! └───────────┘ //! ``` //! +//! Checkpointing is available when `PipelineOptions::checkpoint_dir` is set. +//! State is saved after each stage group and resumed on restart. +//! //! # Usage //! //! 
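+//! Enabling checkpointing (a sketch; the directory path is illustrative):
+//!
+//! ```rust,ignore
+//! use vectorless::index::PipelineOptions;
+//!
+//! let options = PipelineOptions::default()
+//!     .with_checkpoint_dir("/tmp/vectorless-checkpoints");
+//! // On the next run, completed stage groups are skipped.
+//! ```
+//!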
```rust,ignore @@ -37,29 +54,16 @@ pub mod config; pub mod incremental; +pub mod parse; pub mod pipeline; pub mod stages; pub mod summary; // Re-export main types from pipeline -pub use pipeline::{ - ExecutionGroup, FailurePolicy, IndexContext, IndexInput, IndexMetrics, PipelineResult, - PipelineExecutor, PipelineOrchestrator, StageResult, StageRetryConfig, -}; +pub use pipeline::{IndexInput, IndexMetrics, PipelineExecutor, PipelineResult}; // Re-export config types -pub use config::{IndexMode, OptimizationConfig, PipelineOptions, ThinningConfig}; - -// Re-export stages -pub use stages::IndexStage; +pub use config::{IndexMode, PipelineOptions, ThinningConfig}; // Re-export summary -pub use summary::{ - FullStrategy, LazyStrategy, LlmSummaryGenerator, SelectiveStrategy, SummaryGenerator, - SummaryStrategy, SummaryStrategyConfig, -}; - -// Re-export incremental -pub use incremental::{ChangeDetector, ChangeSet, PartialUpdater}; - -pub(crate) use crate::config::{ConcurrencyConfig, IndexerConfig}; +pub use summary::SummaryStrategy; diff --git a/rust/src/parser/markdown/config.rs b/rust/src/index/parse/markdown/config.rs similarity index 100% rename from rust/src/parser/markdown/config.rs rename to rust/src/index/parse/markdown/config.rs diff --git a/rust/src/parser/markdown/frontmatter.rs b/rust/src/index/parse/markdown/frontmatter.rs similarity index 100% rename from rust/src/parser/markdown/frontmatter.rs rename to rust/src/index/parse/markdown/frontmatter.rs diff --git a/rust/src/parser/markdown/mod.rs b/rust/src/index/parse/markdown/mod.rs similarity index 96% rename from rust/src/parser/markdown/mod.rs rename to rust/src/index/parse/markdown/mod.rs index 69ab2ed5..168f3645 100644 --- a/rust/src/parser/markdown/mod.rs +++ b/rust/src/index/parse/markdown/mod.rs @@ -27,5 +27,4 @@ mod config; mod frontmatter; mod parser; -pub use config::MarkdownConfig; pub use parser::MarkdownParser; diff --git a/rust/src/parser/markdown/parser.rs b/rust/src/index/parse/markdown/parser.rs similarity index 97% rename from rust/src/parser/markdown/parser.rs rename to rust/src/index/parse/markdown/parser.rs index bb0fe3cb..5bdf6a71 100644 --- a/rust/src/parser/markdown/parser.rs +++ b/rust/src/index/parse/markdown/parser.rs @@ -3,12 +3,11 @@ //! Main Markdown parser implementation. -use async_trait::async_trait; use pulldown_cmark::Options; use std::path::Path; use crate::error::Result; -use crate::parser::{DocumentFormat, DocumentMeta, DocumentParser, ParseResult, RawNode}; +use crate::index::parse::{DocumentFormat, DocumentMeta, ParseResult, RawNode}; use crate::utils::estimate_tokens; use super::config::MarkdownConfig; @@ -363,13 +362,9 @@ fn finish_current_node( None } -#[async_trait] -impl DocumentParser for MarkdownParser { - fn format(&self) -> DocumentFormat { - DocumentFormat::Markdown - } - - async fn parse(&self, content: &str) -> Result { +impl MarkdownParser { + /// Parse Markdown content and return result. + pub async fn parse(&self, content: &str) -> Result { let line_count = content.lines().count(); let (nodes, fm_fields) = self.extract_nodes(content); @@ -396,7 +391,8 @@ impl DocumentParser for MarkdownParser { Ok(ParseResult::new(meta, nodes)) } - async fn parse_file(&self, path: &Path) -> Result { + /// Parse a Markdown file. 
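+    ///
+    /// ```rust,ignore
+    /// let parser = MarkdownParser::new();
+    /// let result = parser.parse_file(Path::new("./README.md")).await?;
+    /// ```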
+    pub async fn parse_file(&self, path: &Path) -> Result<ParseResult> {
         let content = tokio::fs::read_to_string(path)
             .await
             .map_err(|e| crate::Error::Parse(format!("Failed to read file: {}", e)))?;
diff --git a/rust/src/index/parse/mod.rs b/rust/src/index/parse/mod.rs
new file mode 100644
index 00000000..9fd5a042
--- /dev/null
+++ b/rust/src/index/parse/mod.rs
@@ -0,0 +1,77 @@
+// Copyright (c) 2026 vectorless developers
+// SPDX-License-Identifier: Apache-2.0
+
+//! Document parsing for the index pipeline.
+//!
+//! Supports Markdown and PDF formats. Parsing is dispatched directly
+//! via `match` — no trait objects or registry needed.
+//!
+//! # Quick parse
+//!
+//! ```rust,ignore
+//! use vectorless::index::parse::{parse_content, parse_bytes, DocumentFormat};
+//!
+//! let result = parse_content("# Title\nContent", DocumentFormat::Markdown).await?;
+//! let result = parse_bytes(&pdf_bytes, DocumentFormat::Pdf).await?;
+//! ```
+
+pub mod markdown;
+pub mod pdf;
+pub mod toc;
+pub mod types;
+
+// Re-export core types at module level
+pub use types::{DocumentFormat, DocumentMeta, ParseResult, RawNode};
+
+use std::path::Path;
+
+use crate::error::Result;
+use crate::index::parse::markdown::MarkdownParser;
+
+/// Parse a string content document.
+pub async fn parse_content(content: &str, format: DocumentFormat) -> Result<ParseResult> {
+    match format {
+        DocumentFormat::Markdown => {
+            let parser = MarkdownParser::new();
+            parser.parse(content).await
+        }
+        DocumentFormat::Pdf => Err(crate::Error::Parse(
+            "PDF requires bytes, not string content".to_string(),
+        )),
+    }
+}
+
+/// Parse a file.
+pub async fn parse_file(path: &Path, format: DocumentFormat) -> Result<ParseResult> {
+    match format {
+        DocumentFormat::Markdown => {
+            let parser = MarkdownParser::new();
+            parser.parse_file(path).await
+        }
+        DocumentFormat::Pdf => {
+            let parser = pdf::PdfParser::new();
+            parser.parse_file(path).await
+        }
+    }
+}
+
+/// Parse binary data.
+pub async fn parse_bytes(bytes: &[u8], format: DocumentFormat) -> Result<ParseResult> {
+    match format {
+        DocumentFormat::Markdown => {
+            let content = std::str::from_utf8(bytes)
+                .map_err(|e| crate::Error::Parse(format!("Invalid UTF-8 content: {}", e)))?;
+            let parser = MarkdownParser::new();
+            parser.parse(content).await
+        }
+        DocumentFormat::Pdf => {
+            let parser = pdf::PdfParser::new();
+            parser.parse_bytes_async(bytes, None).await
+        }
+    }
+}
+
+/// Detect document format from a file extension.
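+///
+/// ```rust,ignore
+/// assert_eq!(format_from_extension("md"), Some(DocumentFormat::Markdown));
+/// assert_eq!(format_from_extension("pdf"), Some(DocumentFormat::Pdf));
+/// assert_eq!(format_from_extension("docx"), None); // DOCX support removed
+/// ```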
+pub fn format_from_extension(ext: &str) -> Option { + DocumentFormat::from_extension(ext) +} diff --git a/rust/src/parser/pdf/mod.rs b/rust/src/index/parse/pdf/mod.rs similarity index 88% rename from rust/src/parser/pdf/mod.rs rename to rust/src/index/parse/pdf/mod.rs index 880e8025..dc92da86 100644 --- a/rust/src/parser/pdf/mod.rs +++ b/rust/src/index/parse/pdf/mod.rs @@ -28,5 +28,5 @@ mod parser; mod types; -pub use parser::{PdfParser, PdfParserConfig}; -pub use types::{PdfMetadata, PdfPage, PdfParseResult}; +pub use parser::PdfParser; +pub use types::PdfPage; diff --git a/rust/src/parser/pdf/parser.rs b/rust/src/index/parse/pdf/parser.rs similarity index 89% rename from rust/src/parser/pdf/parser.rs rename to rust/src/index/parse/pdf/parser.rs index 10fe053e..4684ae1a 100644 --- a/rust/src/parser/pdf/parser.rs +++ b/rust/src/index/parse/pdf/parser.rs @@ -10,11 +10,10 @@ use tracing::{info, warn}; use crate::Error; use crate::error::Result; -use crate::parser::DocumentParser; -use crate::parser::toc::TocProcessor; +use crate::index::parse::toc::TocProcessor; use super::types::{PdfMetadata, PdfPage, PdfParseResult}; -use crate::parser::{DocumentFormat, DocumentMeta, ParseResult, RawNode}; +use crate::index::parse::{DocumentFormat, DocumentMeta, ParseResult, RawNode}; /// PDF document parser. #[derive(Debug, Clone)] @@ -60,16 +59,12 @@ impl PdfParser { }) } - /// Parse a PDF file and return detailed result. - pub fn parse_file(&self, path: &Path) -> Result { - let bytes = std::fs::read(path) - .map_err(|e| Error::Parse(format!("Failed to read PDF file: {}", e)))?; - - self.parse_bytes(&bytes, path.file_stem().and_then(|s| s.to_str())) - } - - /// Parse PDF from bytes. - pub fn parse_bytes(&self, bytes: &[u8], filename: Option<&str>) -> Result { + /// Parse PDF from bytes and return raw pages. + pub async fn parse_bytes_raw( + &self, + bytes: &[u8], + filename: Option<&str>, + ) -> Result { let doc = LopdfDocument::load_mem(bytes) .map_err(|e| Error::Parse(format!("Failed to parse PDF: {}", e)))?; @@ -327,7 +322,7 @@ impl PdfParser { /// Convert TOC entries to RawNodes. fn toc_entries_to_raw_nodes( &self, - entries: &[crate::parser::toc::TocEntry], + entries: &[crate::index::parse::toc::TocEntry], pages: &[PdfPage], ) -> Vec { let mut nodes = Vec::new(); @@ -353,7 +348,7 @@ impl PdfParser { /// Get content for a TOC entry from pages. fn get_content_for_entry( &self, - entry: &crate::parser::toc::TocEntry, + entry: &crate::index::parse::toc::TocEntry, pages: &[PdfPage], ) -> String { let start_page = entry.physical_page.unwrap_or(1); @@ -394,27 +389,34 @@ impl Default for PdfParser { } } -#[async_trait::async_trait] -impl DocumentParser for PdfParser { - fn format(&self) -> DocumentFormat { - DocumentFormat::Pdf - } - - async fn parse(&self, content: &str) -> Result { - // For PDF, content is the file path - let path = Path::new(content); - self.parse_path(path).await +impl PdfParser { + /// Parse a PDF file into raw nodes for the index pipeline. + pub async fn parse_file(&self, path: &Path) -> Result { + let bytes = tokio::fs::read(path) + .await + .map_err(|e| Error::Parse(format!("Failed to read PDF file: {}", e)))?; + let filename = path.file_stem().and_then(|s| s.to_str()); + self.parse_bytes_to_result(&bytes, filename, Some(path)) + .await } - async fn parse_file(&self, path: &Path) -> Result { - self.parse_path(path).await + /// Parse PDF bytes into raw nodes for the index pipeline. 
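+    ///
+    /// A sketch, assuming `pdf_bytes` holds a PDF file's contents:
+    ///
+    /// ```rust,ignore
+    /// let parser = PdfParser::new();
+    /// let result = parser.parse_bytes_async(&pdf_bytes, Some("report")).await?;
+    /// println!("pages: {:?}", result.meta.page_count);
+    /// ```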
+ pub async fn parse_bytes_async( + &self, + bytes: &[u8], + filename: Option<&str>, + ) -> Result { + self.parse_bytes_to_result(bytes, filename, None).await } -} -impl PdfParser { - /// Internal async method to parse PDF from path. - async fn parse_path(&self, path: &Path) -> Result { - let result = self.parse_pdf_file(path)?; + /// Core async parsing logic shared by parse_file and parse_bytes_async. + async fn parse_bytes_to_result( + &self, + bytes: &[u8], + filename: Option<&str>, + source_path: Option<&Path>, + ) -> Result { + let result = self.parse_bytes_raw(bytes, filename).await?; let page_count = result.pages.len(); // Try TOC extraction if enabled @@ -449,20 +451,12 @@ impl PdfParser { format: DocumentFormat::Pdf, page_count: Some(page_count), line_count: 0, - source_path: Some(path.to_string_lossy().to_string()), + source_path: source_path.map(|p| p.to_string_lossy().to_string()), description: result.metadata.subject, }; Ok(ParseResult::new(meta, nodes)) } - - /// Parse a PDF file and return detailed result. - fn parse_pdf_file(&self, path: &Path) -> Result { - let bytes = std::fs::read(path) - .map_err(|e| Error::Parse(format!("Failed to read PDF file: {}", e)))?; - - self.parse_bytes(&bytes, path.file_stem().and_then(|s| s.to_str())) - } } #[cfg(test)] diff --git a/rust/src/parser/pdf/types.rs b/rust/src/index/parse/pdf/types.rs similarity index 100% rename from rust/src/parser/pdf/types.rs rename to rust/src/index/parse/pdf/types.rs diff --git a/rust/src/parser/toc/assigner.rs b/rust/src/index/parse/toc/assigner.rs similarity index 99% rename from rust/src/parser/toc/assigner.rs rename to rust/src/index/parse/toc/assigner.rs index a62e6486..fc97c420 100644 --- a/rust/src/parser/toc/assigner.rs +++ b/rust/src/index/parse/toc/assigner.rs @@ -8,7 +8,7 @@ use tracing::{debug, info}; use crate::config::LlmConfig; use crate::error::Result; -use crate::parser::pdf::PdfPage; +use crate::index::parse::pdf::PdfPage; use super::types::{PageOffset, TocEntry}; use crate::llm::LlmClient; diff --git a/rust/src/parser/toc/detector.rs b/rust/src/index/parse/toc/detector.rs similarity index 99% rename from rust/src/parser/toc/detector.rs rename to rust/src/index/parse/toc/detector.rs index 6688adfc..f179c507 100644 --- a/rust/src/parser/toc/detector.rs +++ b/rust/src/index/parse/toc/detector.rs @@ -10,8 +10,8 @@ use crate::config::LlmConfig; use crate::error::Result; use super::types::TocDetection; +use crate::index::parse::pdf::PdfPage; use crate::llm::LlmClient; -use crate::parser::pdf::PdfPage; /// TOC detector configuration. #[derive(Debug, Clone)] diff --git a/rust/src/index/parse/toc/mod.rs b/rust/src/index/parse/toc/mod.rs new file mode 100644 index 00000000..a540cd1a --- /dev/null +++ b/rust/src/index/parse/toc/mod.rs @@ -0,0 +1,27 @@ +// Copyright (c) 2026 vectorless developers +// SPDX-License-Identifier: Apache-2.0 + +//! Table of Contents (TOC) processing module. +//! +//! This module provides functionality to extract and verify document structure +//! from PDF Table of Contents: +//! +//! - **Detection** — Find TOC in document (regex + LLM fallback) +//! - **Parsing** — Convert TOC text to structured entries (LLM) +//! - **Assignment** — Map TOC pages to physical pages +//! - **Verification** — Sample verification of page assignments +//! 
+//! - **Repair** — Fix incorrect assignments
+
+mod assigner;
+mod detector;
+mod parser;
+mod processor;
+mod repairer;
+mod types;
+mod verifier;
+
+// Re-export main types
+pub use types::TocEntry;
+
+// Re-export components
+pub use processor::TocProcessor;
diff --git a/rust/src/parser/toc/parser.rs b/rust/src/index/parse/toc/parser.rs
similarity index 100%
rename from rust/src/parser/toc/parser.rs
rename to rust/src/index/parse/toc/parser.rs
diff --git a/rust/src/parser/toc/processor.rs b/rust/src/index/parse/toc/processor.rs
similarity index 99%
rename from rust/src/parser/toc/processor.rs
rename to rust/src/index/parse/toc/processor.rs
index 1d26f9a6..79ef9a15 100644
--- a/rust/src/parser/toc/processor.rs
+++ b/rust/src/index/parse/toc/processor.rs
@@ -6,7 +6,7 @@
 use tracing::{debug, info, warn};
 
 use crate::error::Result;
-use crate::parser::pdf::PdfPage;
+use crate::index::parse::pdf::PdfPage;
 
 use super::assigner::{PageAssigner, PageAssignerConfig};
 use super::detector::{TocDetector, TocDetectorConfig};
diff --git a/rust/src/parser/toc/repairer.rs b/rust/src/index/parse/toc/repairer.rs
similarity index 99%
rename from rust/src/parser/toc/repairer.rs
rename to rust/src/index/parse/toc/repairer.rs
index 8a26b8cd..4062f215 100644
--- a/rust/src/parser/toc/repairer.rs
+++ b/rust/src/index/parse/toc/repairer.rs
@@ -7,7 +7,7 @@
 use tracing::{debug, info};
 
 use crate::config::LlmConfig;
 use crate::error::Result;
-use crate::parser::pdf::PdfPage;
+use crate::index::parse::pdf::PdfPage;
 
 use super::types::{TocEntry, VerificationError, VerificationReport};
 use super::verifier::IndexVerifier;
diff --git a/rust/src/parser/toc/types.rs b/rust/src/index/parse/toc/types.rs
similarity index 100%
rename from rust/src/parser/toc/types.rs
rename to rust/src/index/parse/toc/types.rs
diff --git a/rust/src/parser/toc/verifier.rs b/rust/src/index/parse/toc/verifier.rs
similarity index 99%
rename from rust/src/parser/toc/verifier.rs
rename to rust/src/index/parse/toc/verifier.rs
index a0243bc1..d0c3883e 100644
--- a/rust/src/parser/toc/verifier.rs
+++ b/rust/src/index/parse/toc/verifier.rs
@@ -8,7 +8,7 @@
 use tracing::{debug, info};
 
 use crate::config::LlmConfig;
 use crate::error::Result;
-use crate::parser::pdf::PdfPage;
+use crate::index::parse::pdf::PdfPage;
 
 use super::types::{ErrorType, TocEntry, VerificationError, VerificationReport};
 use crate::llm::LlmClient;
diff --git a/rust/src/parser/types.rs b/rust/src/index/parse/types.rs
similarity index 95%
rename from rust/src/parser/types.rs
rename to rust/src/index/parse/types.rs
index 2ccac2f7..6a7fa07b 100644
--- a/rust/src/parser/types.rs
+++ b/rust/src/index/parse/types.rs
@@ -17,10 +17,6 @@ pub enum DocumentFormat {
     Markdown,
     /// PDF files (.pdf)
     Pdf,
-    /// HTML files (.html, .htm)
-    Html,
-    /// Word documents (.docx)
-    Docx,
 }
 
 impl DocumentFormat {
@@ -29,8 +25,6 @@
         match ext.to_lowercase().as_str() {
             "md" | "markdown" => Some(Self::Markdown),
             "pdf" => Some(Self::Pdf),
-            "html" | "htm" => Some(Self::Html),
-            "docx" => Some(Self::Docx),
             _ => None,
         }
     }
@@ -40,8 +34,6 @@
         match self {
             Self::Markdown => "md",
             Self::Pdf => "pdf",
-            Self::Html => "html",
-            Self::Docx => "docx",
         }
     }
 }
diff --git a/rust/src/index/pipeline/checkpoint.rs b/rust/src/index/pipeline/checkpoint.rs
new file mode 100644
index 00000000..4ba1f01a
--- /dev/null
+++ b/rust/src/index/pipeline/checkpoint.rs
@@ -0,0 +1,329 @@
+// Copyright (c) 2026 vectorless developers
+// SPDX-License-Identifier: Apache-2.0
+
+//! Pipeline checkpoint support for resume-after-interruption.
+//!
+//! Saves pipeline state after each stage group completes.
+//! On restart, completed stages are skipped and the pipeline resumes
+//! from the first incomplete stage.
+
+use std::path::PathBuf;
+
+use chrono::{DateTime, Utc};
+use serde::{Deserialize, Serialize};
+use tracing::{info, warn};
+
+use crate::document::DocumentTree;
+use crate::index::parse::RawNode;
+
+use super::metrics::IndexMetrics;
+
+/// Serializable checkpoint capturing pipeline state at a point in time.
+#[derive(Debug, Clone, Serialize, Deserialize)]
+pub struct PipelineCheckpoint {
+    /// Document ID being indexed.
+    pub doc_id: String,
+
+    /// SHA-256 hash of the source content.
+    pub source_hash: String,
+
+    /// Processing version at the time of checkpoint.
+    pub processing_version: u32,
+
+    /// Fingerprint of pipeline configuration.
+    pub config_fingerprint: String,
+
+    /// Names of stages that completed successfully.
+    pub completed_stages: Vec<String>,
+
+    /// Serialized context data that stages need for resume.
+    pub context_data: CheckpointContextData,
+
+    /// When this checkpoint was created.
+    pub timestamp: DateTime<Utc>,
+}
+
+/// Context data that can be serialized for checkpoint persistence.
+#[derive(Debug, Clone, Serialize, Deserialize)]
+pub struct CheckpointContextData {
+    /// Raw nodes from parsing (if parse stage completed).
+    pub raw_nodes: Vec<RawNode>,
+
+    /// Built document tree (if build stage completed).
+    pub tree: Option<DocumentTree>,
+
+    /// Metrics collected so far.
+    pub metrics: IndexMetrics,
+
+    /// Page count (for PDFs).
+    pub page_count: Option<usize>,
+
+    /// Line count.
+    pub line_count: Option<usize>,
+
+    /// Document description.
+    pub description: Option<String>,
+}
+
+/// Manages checkpoint persistence on disk.
+pub struct CheckpointManager {
+    /// Directory where checkpoints are stored.
+    checkpoint_dir: PathBuf,
+}
+
+impl CheckpointManager {
+    /// Create a new checkpoint manager.
+    ///
+    /// The directory will be created on first save if it doesn't exist.
+    pub fn new(checkpoint_dir: impl Into<PathBuf>) -> Self {
+        Self {
+            checkpoint_dir: checkpoint_dir.into(),
+        }
+    }
+
+    /// Save a checkpoint for the given document.
+    pub fn save(&self, doc_id: &str, checkpoint: &PipelineCheckpoint) -> std::io::Result<()> {
+        // Ensure directory exists
+        std::fs::create_dir_all(&self.checkpoint_dir)?;
+
+        let path = self.checkpoint_path(doc_id);
+        let json = serde_json::to_string(checkpoint)
+            .map_err(|e| std::io::Error::new(std::io::ErrorKind::Other, e))?;
+
+        // Write atomically: write to temp file, then rename
+        let temp_path = path.with_extension("tmp");
+        std::fs::write(&temp_path, json)?;
+        std::fs::rename(&temp_path, &path)?;
+
+        Ok(())
+    }
+
+    /// Load a checkpoint for the given document.
+    ///
+    /// Returns `None` if no checkpoint exists.
+    pub fn load(&self, doc_id: &str) -> Option<PipelineCheckpoint> {
+        let path = self.checkpoint_path(doc_id);
+        if !path.exists() {
+            return None;
+        }
+
+        let data = std::fs::read(&path).ok()?;
+        match serde_json::from_slice(&data) {
+            Ok(checkpoint) => Some(checkpoint),
+            Err(e) => {
+                warn!("Failed to deserialize checkpoint for {}: {}", doc_id, e);
+                None
+            }
+        }
+    }
+
+    /// Remove a checkpoint after successful completion.
+    pub fn clear(&self, doc_id: &str) -> std::io::Result<()> {
+        let path = self.checkpoint_path(doc_id);
+        if path.exists() {
+            std::fs::remove_file(path)?;
+            info!("Cleared checkpoint for document {}", doc_id);
+        }
+        Ok(())
+    }
+
+    /// Check if a checkpoint exists for the given document.
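+    ///
+    /// Lifecycle sketch (hypothetical directory and doc id):
+    ///
+    /// ```ignore
+    /// let manager = CheckpointManager::new("/tmp/vectorless-checkpoints");
+    /// manager.save("doc-1", &checkpoint)?;
+    /// assert!(manager.exists("doc-1"));
+    /// manager.clear("doc-1")?;
+    /// assert!(!manager.exists("doc-1"));
+    /// ```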
+    pub fn exists(&self, doc_id: &str) -> bool {
+        self.checkpoint_path(doc_id).exists()
+    }
+
+    /// Get the checkpoint file path for a document.
+    fn checkpoint_path(&self, doc_id: &str) -> PathBuf {
+        // Use a sanitized version of doc_id for the filename
+        let safe_name = doc_id.replace(['/', '\\', ':', '*', '?', '"', '<', '>', '|'], "_");
+        self.checkpoint_dir
+            .join(format!("{}.checkpoint.json", safe_name))
+    }
+
+    /// Check if a checkpoint is valid for resuming.
+    ///
+    /// A checkpoint is valid if:
+    /// - Source hash matches (content hasn't changed)
+    /// - Processing version matches (algorithm hasn't changed)
+    /// - Config fingerprint matches (options haven't changed)
+    pub fn is_valid_for_resume(
+        checkpoint: &PipelineCheckpoint,
+        source_hash: &str,
+        processing_version: u32,
+        config_fingerprint: &str,
+    ) -> bool {
+        checkpoint.source_hash == source_hash
+            && checkpoint.processing_version == processing_version
+            && checkpoint.config_fingerprint == config_fingerprint
+    }
+
+    /// List all checkpoint files in the directory.
+    pub fn list_checkpoints(&self) -> Vec<String> {
+        let mut result = Vec::new();
+        if let Ok(entries) = std::fs::read_dir(&self.checkpoint_dir) {
+            for entry in entries.flatten() {
+                let path = entry.path();
+                if path.extension().map_or(false, |e| e == "json") {
+                    if let Some(name) = path.file_stem().and_then(|n| n.to_str()) {
+                        // Strip .checkpoint suffix
+                        if let Some(doc_id) = name.strip_suffix(".checkpoint") {
+                            result.push(doc_id.to_string());
+                        }
+                    }
+                }
+            }
+        }
+        result
+    }
+}
+
+#[cfg(test)]
+mod tests {
+    use super::*;
+    use tempfile::TempDir;
+
+    fn make_checkpoint() -> PipelineCheckpoint {
+        PipelineCheckpoint {
+            doc_id: "test-doc-123".to_string(),
+            source_hash: "abc123".to_string(),
+            processing_version: 1,
+            config_fingerprint: "cfg-fp".to_string(),
+            completed_stages: vec!["parse".to_string(), "build".to_string()],
+            context_data: CheckpointContextData {
+                raw_nodes: Vec::new(),
+                tree: Some(DocumentTree::new("Test", "content")),
+                metrics: IndexMetrics::default(),
+                page_count: None,
+                line_count: Some(10),
+                description: None,
+            },
+            timestamp: Utc::now(),
+        }
+    }
+
+    #[test]
+    fn test_save_and_load() {
+        let dir = TempDir::new().unwrap();
+        let manager = CheckpointManager::new(dir.path());
+
+        let checkpoint = make_checkpoint();
+        manager.save("test-doc-123", &checkpoint).unwrap();
+
+        let loaded = manager.load("test-doc-123").unwrap();
+        assert_eq!(loaded.doc_id, "test-doc-123");
+        assert_eq!(loaded.completed_stages, vec!["parse", "build"]);
+        assert_eq!(loaded.context_data.line_count, Some(10));
+    }
+
+    #[test]
+    fn test_load_nonexistent() {
+        let dir = TempDir::new().unwrap();
+        let manager = CheckpointManager::new(dir.path());
+
+        assert!(manager.load("nonexistent").is_none());
+    }
+
+    #[test]
+    fn test_clear() {
+        let dir = TempDir::new().unwrap();
+        let manager = CheckpointManager::new(dir.path());
+
+        let checkpoint = make_checkpoint();
+        manager.save("test-doc-123", &checkpoint).unwrap();
+        assert!(manager.exists("test-doc-123"));
+
+        manager.clear("test-doc-123").unwrap();
+        assert!(!manager.exists("test-doc-123"));
+    }
+
+    #[test]
+    fn test_is_valid_for_resume() {
+        let checkpoint = make_checkpoint();
+
+        // Matching — valid
+        assert!(CheckpointManager::is_valid_for_resume(
+            &checkpoint,
+            "abc123",
+            1,
+            "cfg-fp"
+        ));
+
+        // Different source hash — invalid
+        assert!(!CheckpointManager::is_valid_for_resume(
+            &checkpoint,
+            "different",
+            1,
+            "cfg-fp"
+        ));
+
+        // Different processing version — invalid
+        assert!(!CheckpointManager::is_valid_for_resume(
+            &checkpoint,
+            "abc123",
+            2,
+            "cfg-fp"
+        ));
+
+        // Different config fingerprint — invalid
+        assert!(!CheckpointManager::is_valid_for_resume(
+            &checkpoint,
+            "abc123",
+            1,
+            "different"
+        ));
+    }
+
+    #[test]
+    fn test_list_checkpoints() {
+        let dir = TempDir::new().unwrap();
+        let manager = CheckpointManager::new(dir.path());
+
+        let mut cp = make_checkpoint();
+        cp.doc_id = "doc-a".to_string();
+        manager.save("doc-a", &cp).unwrap();
+
+        cp.doc_id = "doc-b".to_string();
+        manager.save("doc-b", &cp).unwrap();
+
+        let list = manager.list_checkpoints();
+        assert_eq!(list.len(), 2);
+        assert!(list.contains(&"doc-a".to_string()));
+        assert!(list.contains(&"doc-b".to_string()));
+    }
+
+    #[test]
+    fn test_roundtrip_preserves_tree() {
+        let dir = TempDir::new().unwrap();
+        let manager = CheckpointManager::new(dir.path());
+
+        let mut tree = DocumentTree::new("Root", "");
+        let child = tree.add_child(tree.root(), "Section 1", "Content");
+        tree.set_token_count(child, 42);
+
+        let checkpoint = PipelineCheckpoint {
+            doc_id: "tree-test".to_string(),
+            source_hash: "hash".to_string(),
+            processing_version: 1,
+            config_fingerprint: "fp".to_string(),
+            completed_stages: vec!["build".to_string()],
+            context_data: CheckpointContextData {
+                raw_nodes: Vec::new(),
+                tree: Some(tree),
+                metrics: IndexMetrics::default(),
+                page_count: None,
+                line_count: None,
+                description: None,
+            },
+            timestamp: Utc::now(),
+        };
+
+        manager.save("tree-test", &checkpoint).unwrap();
+        let loaded = manager.load("tree-test").unwrap();
+
+        let tree = loaded.context_data.tree.unwrap();
+        assert_eq!(tree.node_count(), 2); // root + 1 child
+        let child_id = tree.children(tree.root())[0];
+        assert_eq!(tree.get(child_id).unwrap().title, "Section 1");
+        assert_eq!(tree.get(child_id).unwrap().token_count, Some(42));
+    }
+}
diff --git a/rust/src/index/pipeline/context.rs b/rust/src/index/pipeline/context.rs
index 264b966e..21e61ddb 100644
--- a/rust/src/index/pipeline/context.rs
+++ b/rust/src/index/pipeline/context.rs
@@ -7,8 +7,8 @@
 use std::collections::HashMap;
 use std::path::PathBuf;
 
 use crate::document::{DocumentTree, NodeId, ReasoningIndex};
+use crate::index::parse::{DocumentFormat, RawNode};
 use crate::llm::LlmClient;
-use crate::parser::{DocumentFormat, RawNode};
 
 use super::super::{PipelineOptions, SummaryStrategy};
 use super::metrics::IndexMetrics;
@@ -417,6 +417,8 @@ impl PipelineResult {
     pub fn total_time_ms(&self) -> u64 {
         self.metrics.parse_time_ms
             + self.metrics.build_time_ms
+            + self.metrics.validate_time_ms
+            + self.metrics.split_time_ms
             + self.metrics.enhance_time_ms
             + self.metrics.enrich_time_ms
             + self.metrics.reasoning_index_time_ms
diff --git a/rust/src/index/pipeline/executor.rs b/rust/src/index/pipeline/executor.rs
index 91ddcd99..a80cf176 100644
--- a/rust/src/index/pipeline/executor.rs
+++ b/rust/src/index/pipeline/executor.rs
@@ -14,7 +14,7 @@
 use crate::llm::LlmClient;
 
 use super::super::PipelineOptions;
 use super::super::stages::{
     BuildStage, EnhanceStage, EnrichStage, IndexStage, OptimizeStage, ParseStage,
-    ReasoningIndexStage,
+    ReasoningIndexStage, SplitStage, ValidateStage,
 };
 use super::context::{IndexInput, PipelineResult};
 use super::orchestrator::PipelineOrchestrator;
@@ -51,13 +51,17 @@
     /// Default stages (in order):
     /// 1. `parse` - Parse document into raw nodes
     /// 2. `build` - Build tree structure
-    /// 3. `enrich` - Add metadata and cross-references
-    /// 4. `reasoning_index` - Build pre-computed reasoning index
-    /// 5. `optimize` - Optimize tree structure
+    /// 3. `validate` - Verify tree integrity (optional)
+    /// 4. `split` - Split oversized leaf nodes (optional)
+    /// 5. `enrich` - Add metadata and cross-references
+    /// 6. `reasoning_index` - Build pre-computed reasoning index
+    /// 7. `optimize` - Optimize tree structure
     pub fn new() -> Self {
         let orchestrator = PipelineOrchestrator::new()
             .stage_with_priority(ParseStage::new(), 10)
             .stage_with_priority(BuildStage::new(), 20)
+            .stage_with_priority(ValidateStage::new(), 22)
+            .stage_with_priority(SplitStage::new(), 25)
             .stage_with_priority(EnrichStage::new(), 40)
             .stage_with_priority(ReasoningIndexStage::new(), 45)
             .stage_with_priority(OptimizeStage::new(), 60);
@@ -70,14 +74,18 @@
     /// Stages (in order):
     /// 1. `parse` - Parse document
     /// 2. `build` - Build tree
-    /// 3. `enhance` - LLM-based enhancement (summaries)
-    /// 4. `enrich` - Add metadata
-    /// 5. `reasoning_index` - Build pre-computed reasoning index
-    /// 6. `optimize` - Optimize tree
+    /// 3. `validate` - Verify tree integrity (optional)
+    /// 4. `split` - Split oversized leaf nodes (optional)
+    /// 5. `enhance` - LLM-based enhancement (summaries)
+    /// 6. `enrich` - Add metadata
+    /// 7. `reasoning_index` - Build pre-computed reasoning index
+    /// 8. `optimize` - Optimize tree
     pub fn with_llm(client: LlmClient) -> Self {
         let orchestrator = PipelineOrchestrator::new()
             .stage_with_priority(ParseStage::new(), 10)
             .stage_with_priority(BuildStage::new(), 20)
+            .stage_with_priority(ValidateStage::new(), 22)
+            .stage_with_priority(SplitStage::new(), 25)
             .stage_with_priority(EnhanceStage::with_llm_client(client), 30)
             .stage_with_priority(EnrichStage::new(), 40)
             .stage_with_priority(ReasoningIndexStage::new(), 45)
diff --git a/rust/src/index/pipeline/mod.rs b/rust/src/index/pipeline/mod.rs
index 2d221a8a..e6e3752d 100644
--- a/rust/src/index/pipeline/mod.rs
+++ b/rust/src/index/pipeline/mod.rs
@@ -11,6 +11,7 @@
 //! - [`FailurePolicy`] - Configurable failure handling for stages
 //! - [`StageRetryConfig`] - Retry configuration for stages
 
+mod checkpoint;
 mod context;
 mod executor;
 mod metrics;
@@ -20,5 +21,4 @@
 pub use context::{IndexContext, IndexInput, PipelineResult, StageResult};
 pub use executor::PipelineExecutor;
 pub use metrics::IndexMetrics;
-pub use orchestrator::{CustomStageBuilder, ExecutionGroup, PipelineOrchestrator};
 pub use policy::{FailurePolicy, StageRetryConfig};
diff --git a/rust/src/index/pipeline/orchestrator.rs b/rust/src/index/pipeline/orchestrator.rs
index 2f0b1def..892497f2 100644
--- a/rust/src/index/pipeline/orchestrator.rs
+++ b/rust/src/index/pipeline/orchestrator.rs
@@ -30,7 +30,7 @@
 use tracing::{error, info, warn};
 
 use crate::error::Result;
 
 use super::super::PipelineOptions;
-use super::super::stages::{AccessPattern, IndexStage};
+use super::super::stages::IndexStage;
 use super::context::{IndexContext, IndexInput, PipelineResult, StageResult};
 use super::policy::FailurePolicy;
@@ -498,14 +498,10 @@
         let existing_tree_snapshot = ctx.existing_tree.clone();
 
         // Take both stages out to avoid double &mut self
-        let mut stage_writer = std::mem::replace(
-            &mut self.stages[writer_idx].stage,
-            Box::new(NopStage),
-        );
-        let mut stage_reader = std::mem::replace(
-            &mut self.stages[reader_idx].stage,
-            Box::new(NopStage),
-        );
+        let mut stage_writer =
+            std::mem::replace(&mut self.stages[writer_idx].stage, Box::new(NopStage));
+        let mut stage_reader =
+            std::mem::replace(&mut self.stages[reader_idx].stage, Box::new(NopStage));
 
         let writer_name = stage_writer.name().to_string();
         let reader_name = stage_reader.name().to_string();
@@ -560,7 +556,8 @@
             );
         }
         if reader_ctx.metrics.optimize_time_ms > 0 {
-            ctx.metrics.record_optimize(reader_ctx.metrics.optimize_time_ms);
+            ctx.metrics
+                .record_optimize(reader_ctx.metrics.optimize_time_ms);
         }
         ctx.metrics.nodes_merged += reader_ctx.metrics.nodes_merged;
         ctx.metrics.nodes_skipped += reader_ctx.metrics.nodes_skipped;
diff --git a/rust/src/index/stages/build.rs b/rust/src/index/stages/build.rs
index 389bf25e..aee0b51c 100644
--- a/rust/src/index/stages/build.rs
+++ b/rust/src/index/stages/build.rs
@@ -9,7 +9,7 @@
 use tracing::info;
 
 use crate::document::{DocumentTree, NodeId};
 use crate::error::Result;
-use crate::parser::RawNode;
+use crate::index::parse::RawNode;
 use crate::utils::estimate_tokens;
 
 use super::{IndexStage, StageResult};
@@ -84,19 +84,47 @@
     }
 
     /// Apply thinning to raw nodes before tree construction.
-    fn apply_thinning(nodes: &[RawNode], config: &ThinningConfig) -> Vec<bool> {
+    ///
+    /// When `merge_content` is true: small nodes are merged into their parent
+    /// by concatenating child content into the parent, then marking children for removal.
+    /// When `merge_content` is false: small nodes are simply marked for removal.
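+    ///
+    /// Behavior sketch (hypothetical three-node layout: index 0 is the parent,
+    /// 1 and 2 are its small children, all under the threshold, merging on):
+    ///
+    /// ```ignore
+    /// let keep = BuildStage::apply_thinning(&mut nodes, &config);
+    /// assert_eq!(keep, vec![true, false, false]); // children merged into parent
+    /// ```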
+    fn apply_thinning(nodes: &mut [RawNode], config: &ThinningConfig) -> Vec<bool> {
         if !config.enabled || nodes.is_empty() {
             return vec![true; nodes.len()];
         }
 
         let mut keep = vec![true; nodes.len()];
 
-        // Process from leaves to root
+        // Process from leaves to root (bottom-up)
         for i in (0..nodes.len()).rev() {
+            if !keep[i] {
+                continue;
+            }
             let total_tokens = nodes[i].total_token_count.unwrap_or(0);
             if total_tokens < config.threshold {
-                keep[i] = false;
+                // Find all children of this node
+                let children_indices = Self::find_all_children_indices(i, nodes);
+
+                if !children_indices.is_empty() && config.merge_content {
+                    // Merge children content into this node
+                    let mut merged_content = nodes[i].content.clone();
+                    for &child_idx in &children_indices {
+                        if !nodes[child_idx].content.trim().is_empty() {
+                            if !merged_content.is_empty() {
+                                merged_content.push_str("\n\n");
+                            }
+                            merged_content.push_str(&nodes[child_idx].content);
+                        }
+                    }
+                    nodes[i].content = merged_content;
+                    nodes[i].token_count = Some(nodes[i].token_count.unwrap_or(0));
+                }
+
+                // Mark children for removal
+                for &child_idx in &children_indices {
+                    keep[child_idx] = false;
+                }
             }
         }
 
@@ -241,7 +269,7 @@ impl IndexStage for BuildStage {
         // Step 2: Apply thinning if enabled
         let _original_count = raw_nodes.len();
-        let keep = Self::apply_thinning(&mut raw_nodes, &ctx.options.thinning);
+        let keep = Self::apply_thinning(&mut raw_nodes, &ctx.options.thinning);
 
         let nodes_before_merge = raw_nodes.len();
         raw_nodes = raw_nodes
diff --git a/rust/src/index/stages/enhance.rs b/rust/src/index/stages/enhance.rs
index 92e5136d..3dc0ad81 100644
--- a/rust/src/index/stages/enhance.rs
+++ b/rust/src/index/stages/enhance.rs
@@ -9,12 +9,12 @@
 use std::sync::Arc;
 use std::time::{Duration, Instant};
 use tracing::{debug, info, warn};
 
-use crate::document::{DocumentTree, NodeId, TreeNode};
+use crate::document::NodeId;
 use crate::error::Result;
 use crate::index::incremental;
-use crate::utils::fingerprint::Fingerprint;
 use crate::llm::LlmClient;
-use crate::memo::{MemoKey, MemoStore, MemoValue};
+use crate::memo::{MemoKey, MemoStore};
+use crate::utils::fingerprint::Fingerprint;
 
 use super::{IndexStage, StageResult};
 use crate::index::pipeline::{FailurePolicy, IndexContext, StageRetryConfig};
@@ -25,6 +25,7 @@
 struct PendingNode {
     node_id: NodeId,
     title: String,
     content: String,
+    is_leaf: bool,
 }
 
 /// Enhance stage - generates summaries using LLM.
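+///
+/// Sketch of the shortcut decision added in this change, with the default
+/// threshold of 200 tokens (see `SummaryStrategyConfig::shortcut_threshold`):
+///
+/// ```ignore
+/// let tokens = node.token_count.unwrap_or_else(|| estimate_tokens(&node.content));
+/// if tokens <= 200 {
+///     tree.set_summary(node_id, &node.content); // reuse content, skip the LLM call
+/// }
+/// ```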
@@ -174,6 +175,8 @@ impl IndexStage for EnhanceStage { let mut generated = 0; let mut skipped_no_content = 0; let mut skipped_tokens = 0; + let mut shortcut_used = 0; + let shortcut_threshold = strategy.shortcut_threshold(); for node_id in node_ids { let node = match tree.get(node_id) { @@ -201,13 +204,19 @@ impl IndexStage for EnhanceStage { // Check memo store (fast path — apply immediately) if let Some(store) = self.memo_store.as_deref() { - let content_fp = - Fingerprint::from_str(&format!("{}|{}", node.title, node.content)); + let content_fp = Fingerprint::from_str(&format!("{}|{}", node.title, node.content)); let memo_key = MemoKey::summary(&content_fp); - if let Some(cached) = store.get(&memo_key).and_then(|c| c.as_summary().map(|s| s.to_string())) { + if let Some(cached) = store + .get(&memo_key) + .and_then(|c| c.as_summary().map(|s| s.to_string())) + { if !cached.is_empty() { tree.set_summary(node_id, &cached); - debug!("Using cached summary for node: {} ({} chars)", node.title, cached.len()); + debug!( + "Using cached summary for node: {} ({} chars)", + node.title, + cached.len() + ); ctx.metrics.increment_summaries(); generated += 1; continue; @@ -215,11 +224,29 @@ impl IndexStage for EnhanceStage { } } + // Shortcut: use original content as summary for short nodes (Borrow A) + let token_count = node + .token_count + .unwrap_or_else(|| crate::utils::estimate_tokens(&node.content)); + if shortcut_threshold > 0 && token_count > 0 && token_count <= shortcut_threshold { + tree.set_summary(node_id, &node.content); + debug!( + "Shortcut: using original content as summary for '{}' ({} tokens)", + node.title, token_count + ); + ctx.metrics.increment_summaries(); + generated += 1; + shortcut_used += 1; + continue; + } + // Needs LLM call + let is_leaf = tree.is_leaf(node_id); pending_llm.push(PendingNode { node_id, title: node.title, content: node.content, + is_leaf, }); } @@ -230,7 +257,8 @@ impl IndexStage for EnhanceStage { if !pending_llm.is_empty() { info!( "Generating summaries for {} nodes (concurrency: {})", - pending_llm.len(), concurrency + pending_llm.len(), + concurrency ); // Collect results: (NodeId, Result) @@ -239,7 +267,13 @@ impl IndexStage for EnhanceStage { .map(|pending| { let generator = Arc::clone(&generator); async move { - let result = generator.generate(&pending.title, &pending.content).await; + let result = generator + .generate_for_node( + &pending.title, + &pending.content, + pending.is_leaf, + ) + .await; (pending.node_id, result.map_err(|e| e.to_string())) } }) @@ -272,8 +306,8 @@ impl IndexStage for EnhanceStage { ctx.metrics.record_enhance(duration); info!( - "Generated {} summaries ({} failed, {} skipped no content, {} skipped tokens) in {}ms", - generated, failed, skipped_no_content, skipped_tokens, duration + "Generated {} summaries ({} shortcut, {} failed, {} skipped no content, {} skipped tokens) in {}ms", + generated, shortcut_used, failed, skipped_no_content, skipped_tokens, duration ); let mut stage_result = StageResult::success("enhance"); diff --git a/rust/src/index/stages/enrich.rs b/rust/src/index/stages/enrich.rs index 0ff3cb55..ff758ddd 100644 --- a/rust/src/index/stages/enrich.rs +++ b/rust/src/index/stages/enrich.rs @@ -114,7 +114,7 @@ impl IndexStage for EnrichStage { fn access_pattern(&self) -> AccessPattern { AccessPattern { reads_tree: true, - writes_tree: true, // sets page_boundaries + writes_tree: true, // sets page_boundaries writes_description: true, ..Default::default() } diff --git a/rust/src/index/stages/mod.rs 
b/rust/src/index/stages/mod.rs index 5b1847c4..f320a525 100644 --- a/rust/src/index/stages/mod.rs +++ b/rust/src/index/stages/mod.rs @@ -9,6 +9,8 @@ mod enrich; mod optimize; mod parse; mod reasoning; +mod split; +mod validate; pub use build::BuildStage; pub use enhance::EnhanceStage; @@ -16,6 +18,8 @@ pub use enrich::EnrichStage; pub use optimize::OptimizeStage; pub use parse::ParseStage; pub use reasoning::ReasoningIndexStage; +pub use split::SplitStage; +pub use validate::ValidateStage; use super::pipeline::{FailurePolicy, IndexContext, StageResult}; use crate::error::Result; diff --git a/rust/src/index/stages/optimize.rs b/rust/src/index/stages/optimize.rs index b9c90948..9eca0b8f 100644 --- a/rust/src/index/stages/optimize.rs +++ b/rust/src/index/stages/optimize.rs @@ -146,7 +146,7 @@ impl IndexStage for OptimizeStage { fn access_pattern(&self) -> AccessPattern { AccessPattern { reads_tree: true, - writes_tree: true, // merges small leaf nodes + writes_tree: true, // merges small leaf nodes ..Default::default() } } diff --git a/rust/src/index/stages/parse.rs b/rust/src/index/stages/parse.rs index d5f0ba56..98ef911b 100644 --- a/rust/src/index/stages/parse.rs +++ b/rust/src/index/stages/parse.rs @@ -8,24 +8,19 @@ use std::time::Instant; use tracing::info; use crate::error::Result; -use crate::parser::DocumentFormat; -use crate::parser::ParserRegistry; +use crate::index::parse::DocumentFormat; use super::{IndexStage, StageResult}; use crate::index::IndexMode; use crate::index::pipeline::{IndexContext, IndexInput}; /// Parse stage - extracts raw nodes from documents. -pub struct ParseStage { - parser_registry: ParserRegistry, -} +pub struct ParseStage; impl ParseStage { /// Create a new parse stage. pub fn new() -> Self { - Self { - parser_registry: ParserRegistry::with_defaults(), - } + Self } /// Detect document format from path and options. @@ -42,8 +37,6 @@ impl ParseStage { }, IndexMode::Markdown => Ok(DocumentFormat::Markdown), IndexMode::Pdf => Ok(DocumentFormat::Pdf), - IndexMode::Docx => Ok(DocumentFormat::Docx), - IndexMode::Html => Ok(DocumentFormat::Html), } } } @@ -83,8 +76,8 @@ impl IndexStage for ParseStage { .unwrap_or("document") .to_string(); - // Parse using registry - self.parser_registry.parse_file(&path).await? + // Parse directly + crate::index::parse::parse_file(&path, format).await? } IndexInput::Content { content, @@ -95,14 +88,14 @@ impl IndexStage for ParseStage { ctx.name = name.clone(); // Parse content directly - self.parser_registry.parse(content, *format).await? + crate::index::parse::parse_content(content, *format).await? } IndexInput::Bytes { data, name, format } => { // Set name ctx.name = name.clone(); // Parse bytes - self.parser_registry.parse_bytes(data, *format).await? + crate::index::parse::parse_bytes(data, *format).await? 
             }
         };
diff --git a/rust/src/index/stages/reasoning.rs b/rust/src/index/stages/reasoning.rs
index 9fd98b42..7133dc2a 100644
--- a/rust/src/index/stages/reasoning.rs
+++ b/rust/src/index/stages/reasoning.rs
@@ -11,8 +11,8 @@
 use std::time::Instant;
 use tracing::info;
 
 use crate::document::{
-    NodeId, ReasoningIndex, ReasoningIndexBuilder, ReasoningIndexConfig, SectionSummary,
-    SummaryShortcut, TopicEntry,
+    NodeId, ReasoningIndexBuilder, ReasoningIndexConfig, SectionSummary, SummaryShortcut,
+    TopicEntry,
 };
 use crate::error::Result;
 use crate::retrieval::search::extract_keywords;
@@ -63,8 +63,10 @@
         // Walk all nodes and extract keywords from title + summary
         for node_id in tree.traverse() {
             if let Some(node) = tree.get(node_id) {
-                let title_keywords = Self::extract_node_keywords(&node.title, config.min_keyword_length);
-                let summary_keywords = Self::extract_node_keywords(&node.summary, config.min_keyword_length);
+                let title_keywords =
+                    Self::extract_node_keywords(&node.title, config.min_keyword_length);
+                let summary_keywords =
+                    Self::extract_node_keywords(&node.summary, config.min_keyword_length);
                 let content_keywords = if node.summary.is_empty() {
                     // Fallback: extract from content if no summary
                     let content_sample: String = node.content.chars().take(500).collect();
@@ -117,7 +119,11 @@
         // Normalize weights to 0.0-1.0 range
         let max_weight = merged.values().map(|(w, _)| *w).fold(0.0_f32, f32::max);
-        let scale = if max_weight > 0.0 { 1.0 / max_weight } else { 1.0 };
+        let scale = if max_weight > 0.0 {
+            1.0 / max_weight
+        } else {
+            1.0
+        };
 
         let mut topic_entries: Vec<TopicEntry> = merged
             .into_iter()
@@ -142,7 +148,9 @@
     }
 
     /// Build section map from depth-1 nodes.
-    fn build_section_map(tree: &crate::document::DocumentTree) -> std::collections::HashMap<NodeId, SectionSummary> {
+    fn build_section_map(
+        tree: &crate::document::DocumentTree,
+    ) -> std::collections::HashMap<NodeId, SectionSummary> {
         let mut section_map = std::collections::HashMap::new();
         let root = tree.root();
         for child_id in tree.children(root) {
@@ -158,9 +166,7 @@
     }
 
     /// Build summary shortcut from root and depth-1 nodes.
-    fn build_summary_shortcut(
-        tree: &crate::document::DocumentTree,
-    ) -> Option<SummaryShortcut> {
+    fn build_summary_shortcut(tree: &crate::document::DocumentTree) -> Option<SummaryShortcut> {
         let root = tree.root();
         let root_node = tree.get(root)?;
@@ -244,10 +250,7 @@
         let tree = match ctx.tree.as_ref() {
             Some(t) => t,
             None => {
-                return Ok(StageResult::failure(
-                    "reasoning_index",
-                    "Tree not built",
-                ));
+                return Ok(StageResult::failure("reasoning_index", "Tree not built"));
             }
         };
@@ -313,10 +316,9 @@
             "keywords_indexed".to_string(),
             serde_json::json!(keyword_count),
         );
-        stage_result.metadata.insert(
-            "topics_indexed".to_string(),
-            serde_json::json!(topic_count),
-        );
+        stage_result
+            .metadata
+            .insert("topics_indexed".to_string(), serde_json::json!(topic_count));
 
         Ok(stage_result)
     }
@@ -328,7 +330,8 @@ mod tests {
 
     #[test]
     fn test_extract_node_keywords() {
-        let keywords = ReasoningIndexStage::extract_node_keywords("Introduction to Machine Learning", 2);
+        let keywords =
+            ReasoningIndexStage::extract_node_keywords("Introduction to Machine Learning", 2);
         assert!(keywords.contains(&"introduction".to_string()));
         assert!(keywords.contains(&"machine".to_string()));
         assert!(keywords.contains(&"learning".to_string()));
diff --git a/rust/src/index/stages/split.rs b/rust/src/index/stages/split.rs
new file mode 100644
index 00000000..2bb54d4a
--- /dev/null
+++ b/rust/src/index/stages/split.rs
@@ -0,0 +1,338 @@
+// Copyright (c) 2026 vectorless developers
+// SPDX-License-Identifier: Apache-2.0
+
+//! Split stage - Break large leaf nodes into smaller ones.
+
+use std::time::Instant;
+use tracing::info;
+
+use crate::document::{DocumentTree, NodeId};
+use crate::error::Result;
+use crate::utils::estimate_tokens;
+
+use super::{AccessPattern, IndexStage, StageResult, async_trait};
+use crate::index::config::SplitConfig;
+use crate::index::pipeline::IndexContext;
+
+/// Split stage — breaks oversized leaf nodes into smaller children.
+///
+/// When a leaf node exceeds the token limit, the stage searches for natural
+/// split points (headings `\n#`, paragraph boundaries `\n\n`) and creates
+/// child nodes from the resulting chunks.
+///
+/// This stage runs after validate (priority 22) at priority 25.
+pub struct SplitStage;
+
+impl SplitStage {
+    /// Create a new split stage.
+    pub fn new() -> Self {
+        Self
+    }
+
+    /// Find natural split points in content.
+    ///
+    /// Returns byte offsets where the content can be split.
+    /// Prioritizes heading boundaries (`\n#`), then paragraph breaks (`\n\n`).
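+    ///
+    /// Expectation sketch (hypothetical inputs, mirroring the tests below):
+    ///
+    /// ```ignore
+    /// assert!(SplitStage::find_split_points("short text", 8000).is_empty());
+    /// // `long_markdown` stands for an oversized document with `##` headings
+    /// assert!(!SplitStage::find_split_points(&long_markdown, 200).is_empty());
+    /// ```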
+    fn find_split_points(content: &str, max_tokens: usize) -> Vec<usize> {
+        let total_tokens = estimate_tokens(content);
+        if total_tokens <= max_tokens {
+            return Vec::new();
+        }
+
+        // Estimate how many parts we need
+        let estimated_parts = (total_tokens + max_tokens - 1) / max_tokens;
+        let target_size = content.len() / estimated_parts.max(1);
+
+        let mut points = Vec::new();
+
+        // First pass: find heading boundaries
+        let mut last_split = 0;
+        for (i, line) in content.lines().enumerate() {
+            let byte_offset = line.as_ptr() as usize - content.as_ptr() as usize;
+            if i > 0 && line.starts_with('#') && byte_offset > last_split {
+                let chunk_tokens = estimate_tokens(&content[last_split..byte_offset]);
+                if chunk_tokens >= max_tokens / 2 {
+                    points.push(byte_offset);
+                    last_split = byte_offset;
+                }
+            }
+        }
+
+        // If heading splits are sufficient, return them
+        if !points.is_empty() {
+            let approx_size = content.len() / (points.len() + 1);
+            if approx_size <= target_size * 2 {
+                return points;
+            }
+        }
+
+        // Second pass: use paragraph boundaries
+        points.clear();
+        let mut pos = 0;
+        for paragraph in content.split("\n\n") {
+            let para_end = pos + paragraph.len();
+            if para_end > 0 && pos > 0 {
+                let chunk_tokens =
+                    estimate_tokens(&content[points.last().copied().unwrap_or(0)..pos]);
+                if chunk_tokens >= max_tokens / 2 {
+                    points.push(pos);
+                }
+            }
+            pos = para_end + 2; // skip "\n\n"
+        }
+
+        // If still not enough split points, use approximate byte boundaries
+        if points.is_empty() {
+            let bytes_per_token = content.len().max(1) / total_tokens.max(1);
+            let target_bytes = max_tokens * bytes_per_token;
+
+            let mut offset = target_bytes;
+            while offset < content.len() {
+                // Find the nearest newline
+                if let Some(nl_pos) = content[offset..].find('\n') {
+                    points.push(offset + nl_pos);
+                } else {
+                    break;
+                }
+                offset += target_bytes;
+            }
+        }
+
+        points
+    }
+
+    /// Split a single leaf node into children.
+    ///
+    /// Returns the number of new children created.
+    fn split_leaf(tree: &mut DocumentTree, leaf_id: NodeId, max_tokens: usize) -> usize {
+        let content = match tree.get(leaf_id) {
+            Some(node) => node.content.clone(),
+            None => return 0,
+        };
+
+        let split_points = Self::find_split_points(&content, max_tokens);
+        if split_points.is_empty() {
+            return 0;
+        }
+
+        // Extract title for child naming
+        let parent_title = tree
+            .get(leaf_id)
+            .map(|n| n.title.clone())
+            .unwrap_or_default();
+
+        // Create chunks from split points
+        let mut chunks: Vec<&str> = Vec::new();
+        let mut prev = 0;
+        for &point in &split_points {
+            if point > prev {
+                chunks.push(&content[prev..point]);
+            }
+            prev = point;
+        }
+        if prev < content.len() {
+            chunks.push(&content[prev..]);
+        }
+
+        let child_count = chunks.len();
+        for (i, chunk) in chunks.into_iter().enumerate() {
+            let chunk_trimmed = chunk.trim();
+            if chunk_trimmed.is_empty() {
+                continue;
+            }
+
+            // Try to extract a title from the first line
+            let title = if chunk_trimmed.starts_with('#') {
+                chunk_trimmed
+                    .lines()
+                    .next()
+                    .unwrap_or("")
+                    .trim_start_matches('#')
+                    .trim()
+                    .to_string()
+            } else {
+                format!("{} (part {})", parent_title, i + 1)
+            };
+
+            let child_id = tree.add_child(leaf_id, &title, chunk_trimmed);
+            let token_count = estimate_tokens(chunk_trimmed);
+            tree.set_token_count(child_id, token_count);
+        }
+
+        // Clear parent's content (moved to children)
+        tree.set_content(leaf_id, "");
+        tree.set_token_count(leaf_id, 0);
+
+        child_count
+    }
+
+    /// Process all oversized leaf nodes in the tree.
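+    ///
+    /// Usage sketch (hypothetical tree; `config` is assumed to be an enabled
+    /// `SplitConfig` with a `max_tokens_per_node` budget below 15,000):
+    ///
+    /// ```ignore
+    /// let mut tree = DocumentTree::new("Root", "");
+    /// let leaf = tree.add_child(tree.root(), "Big", &huge_content);
+    /// tree.set_token_count(leaf, 15_000);
+    /// let split = SplitStage::split_tree(&mut tree, &config);
+    /// assert_eq!(split, 1); // the one oversized leaf was split into children
+    /// ```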
+    fn split_tree(tree: &mut DocumentTree, config: &SplitConfig) -> usize {
+        if !config.enabled {
+            return 0;
+        }
+
+        // Collect leaves first to avoid borrow issues
+        let leaves: Vec<NodeId> = tree.leaves();
+        let mut total_split = 0;
+
+        for leaf_id in leaves {
+            // Check if this leaf exceeds the token limit
+            let token_count = tree.get(leaf_id).and_then(|n| n.token_count).unwrap_or(0);
+
+            // Use estimated tokens if no count set
+            let tokens = if token_count > 0 {
+                token_count
+            } else {
+                tree.get(leaf_id)
+                    .map(|n| estimate_tokens(&n.content))
+                    .unwrap_or(0)
+            };
+
+            if tokens > config.max_tokens_per_node {
+                let split_count = Self::split_leaf(tree, leaf_id, config.max_tokens_per_node);
+                if split_count > 0 {
+                    total_split += 1;
+                }
+            }
+        }
+
+        total_split
+    }
+}
+
+impl Default for SplitStage {
+    fn default() -> Self {
+        Self::new()
+    }
+}
+
+#[async_trait]
+impl IndexStage for SplitStage {
+    fn name(&self) -> &'static str {
+        "split"
+    }
+
+    fn depends_on(&self) -> Vec<&'static str> {
+        vec!["build"]
+    }
+
+    fn is_optional(&self) -> bool {
+        true
+    }
+
+    fn access_pattern(&self) -> AccessPattern {
+        AccessPattern {
+            reads_tree: true,
+            writes_tree: true,
+            writes_reasoning_index: false,
+            writes_description: false,
+        }
+    }
+
+    async fn execute(&mut self, ctx: &mut IndexContext) -> Result<StageResult> {
+        let start = Instant::now();
+
+        let tree = match ctx.tree.as_mut() {
+            Some(t) => t,
+            None => {
+                return Ok(StageResult::success("split"));
+            }
+        };
+
+        let config = &ctx.options.split;
+        if !config.enabled {
+            return Ok(StageResult::success("split"));
+        }
+
+        let node_count_before = tree.node_count();
+        let split_count = Self::split_tree(tree, config);
+        let node_count_after = tree.node_count();
+
+        let duration = start.elapsed().as_millis() as u64;
+        ctx.metrics.record_split(duration);
+        ctx.metrics.nodes_merged += split_count;
+
+        info!(
+            "Split {} oversized nodes ({} → {} total nodes) in {}ms",
+            split_count, node_count_before, node_count_after, duration
+        );
+
+        let mut stage_result = StageResult::success("split");
+        stage_result.duration_ms = duration;
+        stage_result
+            .metadata
+            .insert("nodes_split".to_string(), serde_json::json!(split_count));
+        stage_result.metadata.insert(
+            "node_count_before".to_string(),
+            serde_json::json!(node_count_before),
+        );
+        stage_result.metadata.insert(
+            "node_count_after".to_string(),
+            serde_json::json!(node_count_after),
+        );
+
+        Ok(stage_result)
+    }
+}
+
+#[cfg(test)]
+mod tests {
+    use super::*;
+
+    #[test]
+    fn test_find_split_points_small_content() {
+        let content = "Hello world";
+        let points = SplitStage::find_split_points(content, 8000);
+        assert!(points.is_empty());
+    }
+
+    #[test]
+    fn test_find_split_points_heading_boundaries() {
+        let mut content = String::from("Introduction text that is long enough. ");
+        // Pad to exceed token limit
+        for _ in 0..500 {
+            content.push_str("This is some content. ");
+        }
+        content.push_str("\n## Section One\n");
+        for _ in 0..500 {
+            content.push_str("More content here. ");
+        }
+        content.push_str("\n## Section Two\n");
+        for _ in 0..500 {
+            content.push_str("Final content. ");
+        }
+
+        let points = SplitStage::find_split_points(&content, 200);
+        assert!(!points.is_empty());
+    }
+
+    #[test]
+    fn test_find_split_points_paragraph_boundaries() {
+        let mut content = String::new();
+        for i in 0..10 {
+            for _ in 0..100 {
+                content.push_str(&format!("Paragraph {} content. ", i));
", i)); + } + content.push_str("\n\n"); + } + + let points = SplitStage::find_split_points(&content, 200); + assert!(!points.is_empty()); + } + + #[test] + fn test_split_tree_disabled() { + let mut tree = DocumentTree::new("Root", ""); + let child = tree.add_child( + tree.root(), + "Big", + "Very long content here with lots of text that would normally exceed limits", + ); + tree.set_token_count(child, 15000); + + let config = SplitConfig::disabled(); + let count = SplitStage::split_tree(&mut tree, &config); + assert_eq!(count, 0); + } +} diff --git a/rust/src/index/stages/validate.rs b/rust/src/index/stages/validate.rs new file mode 100644 index 00000000..f07c32c2 --- /dev/null +++ b/rust/src/index/stages/validate.rs @@ -0,0 +1,356 @@ +// Copyright (c) 2026 vectorless developers +// SPDX-License-Identifier: Apache-2.0 + +//! Validate stage - Verify tree integrity after build. + +use std::collections::HashSet; +use std::time::Instant; +use tracing::{info, warn}; + +use crate::error::Result; + +use super::{AccessPattern, IndexStage, StageResult, async_trait}; +use crate::index::pipeline::IndexContext; + +/// Maximum allowed tree depth. +const MAX_DEPTH: usize = 20; + +/// Minimum token count ratio for parent vs children consistency check. +/// A parent's token count should be at least `ratio` of the sum of its children. +const MIN_PARENT_TOKEN_RATIO: f32 = 0.8; + +/// Minimum content similarity threshold to flag potential duplicates. +/// Content is considered duplicate if normalized equality matches. +const DUPLICATE_MIN_LENGTH: usize = 50; + +/// Validation issue severity. +#[derive(Debug, Clone, Copy, PartialEq, Eq)] +enum Severity { + /// Warning — tree is usable but may have quality issues. + Warning, + /// Error — tree has structural problems. + Error, +} + +/// A single validation issue found during tree inspection. +#[derive(Debug, Clone)] +struct ValidationIssue { + /// Severity level. + severity: Severity, + /// Human-readable description. + message: String, +} + +/// Validate stage — checks tree integrity after build. +/// +/// Validates: +/// 1. Tree structural integrity (all nodes reachable from root) +/// 2. Depth sanity (max depth < 20) +/// 3. Empty title detection on leaf nodes +/// 4. Token count consistency (parent >= sum of children) +/// 5. Content duplication detection +pub struct ValidateStage; + +impl ValidateStage { + /// Create a new validate stage. + pub fn new() -> Self { + Self + } + + /// Run all validation checks and collect issues. + fn validate_tree(&self, ctx: &IndexContext) -> Vec { + let tree = match ctx.tree.as_ref() { + Some(t) => t, + None => { + return vec![ValidationIssue { + severity: Severity::Error, + message: "No tree available for validation".to_string(), + }]; + } + }; + + let mut issues = Vec::new(); + + Self::check_depth(tree, &mut issues); + Self::check_empty_titles(tree, &mut issues); + Self::check_token_consistency(tree, &mut issues); + Self::check_content_duplication(tree, &mut issues); + + issues + } + + /// Check that tree depth is reasonable. + fn check_depth(tree: &crate::document::DocumentTree, issues: &mut Vec) { + let all_nodes = tree.traverse(); + let max_depth = all_nodes + .iter() + .map(|&id| tree.depth(id)) + .max() + .unwrap_or(0); + + if max_depth > MAX_DEPTH { + issues.push(ValidationIssue { + severity: Severity::Warning, + message: format!( + "Tree depth ({}) exceeds recommended maximum ({})", + max_depth, MAX_DEPTH + ), + }); + } + } + + /// Check for leaf nodes with empty titles. 
+    fn check_empty_titles(tree: &crate::document::DocumentTree, issues: &mut Vec<ValidationIssue>) {
+        let leaves = tree.leaves();
+        let mut empty_count = 0;
+
+        for &leaf_id in &leaves {
+            if let Some(node) = tree.get(leaf_id) {
+                if node.title.trim().is_empty() {
+                    empty_count += 1;
+                }
+            }
+        }
+
+        if empty_count > 0 {
+            issues.push(ValidationIssue {
+                severity: Severity::Warning,
+                message: format!("Found {} leaf nodes with empty titles", empty_count),
+            });
+        }
+    }
+
+    /// Check token count consistency: parent's tokens should be >= sum of children's.
+    fn check_token_consistency(
+        tree: &crate::document::DocumentTree,
+        issues: &mut Vec<ValidationIssue>,
+    ) {
+        let all_nodes = tree.traverse();
+        let mut inconsistent = 0;
+
+        for &node_id in &all_nodes {
+            let children: Vec<_> = tree.children(node_id);
+            if children.is_empty() {
+                continue;
+            }
+
+            let parent_tokens = tree.get(node_id).and_then(|n| n.token_count).unwrap_or(0);
+
+            let children_sum: usize = children
+                .iter()
+                .map(|&c| tree.get(c).and_then(|n| n.token_count).unwrap_or(0))
+                .sum();
+
+            // Parent should have at least some proportion of children's tokens
+            // (parent has its own content plus children, but after thinning this may vary)
+            if parent_tokens > 0
+                && children_sum > 0
+                && (parent_tokens as f32 / children_sum as f32) < MIN_PARENT_TOKEN_RATIO
+            {
+                // Only flag if both are non-trivial
+                if children_sum >= 100 {
+                    inconsistent += 1;
+                }
+            }
+        }
+
+        if inconsistent > 0 {
+            issues.push(ValidationIssue {
+                severity: Severity::Warning,
+                message: format!(
+                    "Found {} nodes with token counts significantly less than their children's sum",
+                    inconsistent
+                ),
+            });
+        }
+    }
+
+    /// Check for content duplication across leaf nodes.
+    fn check_content_duplication(
+        tree: &crate::document::DocumentTree,
+        issues: &mut Vec<ValidationIssue>,
+    ) {
+        let leaves = tree.leaves();
+        let mut seen: HashSet<u64> = HashSet::new();
+        let mut duplicate_count = 0;
+
+        for &leaf_id in &leaves {
+            if let Some(node) = tree.get(leaf_id) {
+                let content = node.content.trim();
+                if content.len() < DUPLICATE_MIN_LENGTH {
+                    continue;
+                }
+
+                // Simple hash of normalized content for duplicate detection
+                let hash = Self::simple_hash(content);
+                if !seen.insert(hash) {
+                    duplicate_count += 1;
+                }
+            }
+        }
+
+        if duplicate_count > 0 {
+            issues.push(ValidationIssue {
+                severity: Severity::Warning,
+                message: format!(
+                    "Found {} leaf nodes with duplicate content",
+                    duplicate_count
+                ),
+            });
+        }
+    }
+
+    /// Simple FNV-1a-like hash for duplicate detection.
+    /// Not cryptographic — just for grouping identical content.
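+    ///
+    /// Property sketch: equal inputs collide by construction, different
+    /// inputs almost never do:
+    ///
+    /// ```ignore
+    /// assert_eq!(ValidateStage::simple_hash("abc"), ValidateStage::simple_hash("abc"));
+    /// assert_ne!(ValidateStage::simple_hash("abc"), ValidateStage::simple_hash("abd"));
+    /// ```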
+    fn simple_hash(s: &str) -> u64 {
+        let mut hash: u64 = 0xcbf29ce484222325;
+        for byte in s.bytes() {
+            hash ^= byte as u64;
+            hash = hash.wrapping_mul(0x100000001b3);
+        }
+        hash
+    }
+}
+
+impl Default for ValidateStage {
+    fn default() -> Self {
+        Self::new()
+    }
+}
+
+#[async_trait]
+impl IndexStage for ValidateStage {
+    fn name(&self) -> &'static str {
+        "validate"
+    }
+
+    fn depends_on(&self) -> Vec<&'static str> {
+        vec!["build"]
+    }
+
+    fn is_optional(&self) -> bool {
+        true
+    }
+
+    fn access_pattern(&self) -> AccessPattern {
+        AccessPattern {
+            reads_tree: true,
+            writes_tree: false,
+            writes_reasoning_index: false,
+            writes_description: false,
+        }
+    }
+
+    async fn execute(&mut self, ctx: &mut IndexContext) -> Result<StageResult> {
+        let start = Instant::now();
+
+        let issues = self.validate_tree(ctx);
+
+        let warnings = issues
+            .iter()
+            .filter(|i| i.severity == Severity::Warning)
+            .count();
+        let errors = issues
+            .iter()
+            .filter(|i| i.severity == Severity::Error)
+            .count();
+
+        // Log all issues
+        for issue in &issues {
+            match issue.severity {
+                Severity::Warning => warn!("[validate] {}", issue.message),
+                Severity::Error => warn!("[validate] ERROR: {}", issue.message),
+            }
+        }
+
+        let duration = start.elapsed().as_millis() as u64;
+        ctx.metrics.record_validate(duration);
+
+        info!(
+            "Validated tree: {} warnings, {} errors in {}ms",
+            warnings, errors, duration
+        );
+
+        let mut stage_result = StageResult::success("validate");
+        stage_result.duration_ms = duration;
+        stage_result
+            .metadata
+            .insert("warnings".to_string(), serde_json::json!(warnings));
+        stage_result
+            .metadata
+            .insert("errors".to_string(), serde_json::json!(errors));
+
+        Ok(stage_result)
+    }
+}
+
+#[cfg(test)]
+mod tests {
+    use super::*;
+    use crate::document::DocumentTree;
+
+    fn make_context_with_tree(tree: DocumentTree) -> IndexContext {
+        let input = crate::index::IndexInput::content("test");
+        let options = crate::index::config::PipelineOptions::default();
+        let mut ctx = IndexContext::new(input, options);
+        ctx.tree = Some(tree);
+        ctx
+    }
+
+    #[test]
+    fn test_validate_empty_tree() {
+        let tree = DocumentTree::new("Root", "");
+        let ctx = make_context_with_tree(tree);
+
+        let stage = ValidateStage::new();
+        let issues = stage.validate_tree(&ctx);
+
+        // Single root node is valid — no issues expected
+        assert!(issues.is_empty());
+    }
+
+    #[test]
+    fn test_validate_simple_tree() {
+        let mut tree = DocumentTree::new("Root", "");
+        let child = tree.add_child(tree.root(), "Section 1", "Content of section 1");
+        tree.set_token_count(child, 100);
+
+        let ctx = make_context_with_tree(tree);
+
+        let stage = ValidateStage::new();
+        let issues = stage.validate_tree(&ctx);
+
+        assert!(issues.is_empty());
+    }
+
+    #[test]
+    fn test_validate_empty_title_warning() {
+        let mut tree = DocumentTree::new("Root", "");
+        let child = tree.add_child(tree.root(), "", "Some content here");
+        tree.set_token_count(child, 50);
+
+        let ctx = make_context_with_tree(tree);
+
+        let stage = ValidateStage::new();
+        let issues = stage.validate_tree(&ctx);
+
+        let warning_count = issues
+            .iter()
+            .filter(|i| i.message.contains("empty titles"))
+            .count();
+        assert_eq!(warning_count, 1);
+    }
+
+    #[test]
+    fn test_validate_no_tree_error() {
+        let input = crate::index::IndexInput::content("test");
+        let options = crate::index::config::PipelineOptions::default();
+        let ctx = IndexContext::new(input, options);
+
+        let stage = ValidateStage::new();
+        let issues = stage.validate_tree(&ctx);
+
+        assert_eq!(issues.len(), 1);
+        assert_eq!(issues[0].severity, Severity::Error);
+    }
+}
diff --git a/rust/src/index/summary/mod.rs b/rust/src/index/summary/mod.rs
index 7a600482..f87593d0 100644
--- a/rust/src/index/summary/mod.rs
+++ b/rust/src/index/summary/mod.rs
@@ -21,7 +21,4 @@
 mod lazy;
 mod selective;
 mod strategy;
 
-pub use full::FullStrategy;
-pub use lazy::LazyStrategy;
-pub use selective::SelectiveStrategy;
 pub use strategy::{LlmSummaryGenerator, SummaryGenerator, SummaryStrategy, SummaryStrategyConfig};
diff --git a/rust/src/index/summary/strategy.rs b/rust/src/index/summary/strategy.rs
index 91c0ff29..faa024bb 100644
--- a/rust/src/index/summary/strategy.rs
+++ b/rust/src/index/summary/strategy.rs
@@ -6,9 +6,9 @@
 use async_trait::async_trait;
 
 use crate::document::{DocumentTree, NodeId};
-use crate::utils::fingerprint::Fingerprint;
 use crate::llm::{LlmClient, LlmResult};
 use crate::memo::{MemoKey, MemoStore, MemoValue};
+use crate::utils::fingerprint::Fingerprint;
 
 /// Configuration for summary strategies.
 #[derive(Debug, Clone)]
@@ -21,6 +21,11 @@ pub struct SummaryStrategyConfig {
 
     /// Whether to persist lazy-generated summaries.
     pub persist_lazy: bool,
+
+    /// Token threshold below which the original content is used as summary
+    /// instead of calling LLM. Saves API cost for short, self-contained nodes.
+    /// Set to 0 to always call LLM.
+    pub shortcut_threshold: usize,
 }
 
 impl Default for SummaryStrategyConfig {
@@ -29,6 +34,7 @@
             max_tokens: 200,
             min_content_tokens: 50,
             persist_lazy: false,
+            shortcut_threshold: 200,
         }
     }
 }
@@ -149,6 +155,11 @@
             Self::Lazy { config, .. } => config.clone(),
         }
     }
+
+    /// Get the shortcut threshold (tokens below which content is used as-is).
+    pub fn shortcut_threshold(&self) -> usize {
+        self.config().shortcut_threshold
+    }
 }
 
 /// Summary generator trait.
@@ -156,6 +167,19 @@
 pub trait SummaryGenerator: Send + Sync {
     /// Generate a summary for the given content.
     async fn generate(&self, title: &str, content: &str) -> LlmResult<String>;
+
+    /// Generate a summary with leaf/non-leaf context.
+    /// Non-leaf nodes get a navigation-oriented prompt ("what does this section cover"),
+    /// leaf nodes get a content-oriented prompt ("what does this section say").
+    async fn generate_for_node(
+        &self,
+        title: &str,
+        content: &str,
+        is_leaf: bool,
+    ) -> LlmResult<String> {
+        let _ = is_leaf;
+        self.generate(title, content).await
+    }
 }
 
 /// LLM-based summary generator.
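+///
+/// Call-shape sketch for the leaf-aware entry point (the construction of
+/// `generator` is left out here and is hypothetical, not part of this change):
+///
+/// ```ignore
+/// // Branch nodes get a navigation-oriented prompt, leaves a content-oriented one.
+/// let overview = generator.generate_for_node("Chapter 1", &content, false).await?;
+/// let summary = generator.generate_for_node("1.2 Details", &content, true).await?;
+/// ```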
@@ -214,7 +238,8 @@ impl SummaryGenerator for LlmSummaryGenerator {
 
         let user_prompt = format!("Title: {}\n\nContent:\n{}", title, content);
 
-        let summary = self.client
+        let summary = self
+            .client
             .complete_with_max_tokens(&system_prompt, &user_prompt, self.max_tokens as u16)
             .await?;
 
@@ -232,4 +257,60 @@
         Ok(summary)
     }
+
+    async fn generate_for_node(
+        &self,
+        title: &str,
+        content: &str,
+        is_leaf: bool,
+    ) -> LlmResult<String> {
+        // Compute content fingerprint for cache key (include leaf flag)
+        let content_fp = Fingerprint::from_str(&format!("{}|{}|leaf={}", title, content, is_leaf));
+        let memo_key = MemoKey::summary(&content_fp);
+
+        // Check memo store first
+        if let Some(ref store) = self.memo_store {
+            if let Some(cached) = store.get(&memo_key) {
+                if let Some(summary) = cached.as_summary() {
+                    tracing::debug!("Memo cache hit for summary: {}", title);
+                    return Ok(summary.to_string());
+                }
+            }
+        }
+
+        // Choose prompt based on node type
+        let system_prompt = if is_leaf {
+            // Leaf nodes: content-oriented — "what does this section say"
+            "You are a document summarization assistant. \
+             Generate a concise summary (2-3 sentences) of the given section's content. \
+             Focus on the key information and facts presented. \
+             Respond with only the summary, no additional text."
+        } else {
+            // Non-leaf (branch) nodes: navigation-oriented — "what does this section cover"
+            "You are a document summarization assistant. \
+             Generate a concise overview (2-3 sentences) describing what topics and subtopics \
+             this section covers. This summary will be used as a navigation guide. \
+             Respond with only the summary, no additional text."
+        };
+
+        let user_prompt = format!("Title: {}\n\nContent:\n{}", title, content);
+
+        let summary = self
+            .client
+            .complete_with_max_tokens(&system_prompt, &user_prompt, self.max_tokens as u16)
+            .await?;
+
+        // Cache the result
+        if let Some(ref store) = self.memo_store {
+            let tokens_saved = (title.len() + content.len() + summary.len()) / 4;
+            store.put_with_tokens(
+                memo_key,
+                MemoValue::Summary(summary.clone()),
+                tokens_saved as u64,
+            );
+            tracing::debug!("Memo cache stored for summary: {}", title);
+        }
+
+        Ok(summary)
+    }
 }
diff --git a/rust/src/lib.rs b/rust/src/lib.rs
index 3e13344e..b34019c4 100644
--- a/rust/src/lib.rs
+++ b/rust/src/lib.rs
@@ -7,7 +7,6 @@
 #![allow(clippy::all)]
 #![allow(dead_code)]
 #![allow(unused_variables)]
-#![allow(unused_imports)]
 #![allow(clippy::iter_over_hash_type)]
 #![allow(clippy::large_enum_variant)]
 #![allow(clippy::manual_unwrap_or_default)]
@@ -43,14 +42,6 @@
 //! Ok(())
 //! }
 //! ```
-//!
-//! ## Modules
-//!
-//! | Module | Description |
-//! |--------|-------------|
-//! | [`client`] | High-level API (`Engine`, `EngineBuilder`, `IndexContext`, `QueryContext`) |
-//! | [`document`] | Core domain types (`DocumentTree`, `TreeNode`, `NodeId`) |
-//! | [`error`] | Error types |
 
 pub mod client;
 mod config;
@@ -62,7 +53,6 @@
 mod llm;
 mod memo;
 mod metrics;
-mod parser;
 mod retrieval;
 mod storage;
 mod throttle;
@@ -70,8 +60,8 @@
 mod utils;
 
 // Client API
 pub use client::{
-    BuildError, ClientError, DocumentFormat, DocumentInfo, Engine, EngineBuilder,
-    IndexContext, IndexItem, IndexMode, IndexOptions, IndexResult, QueryContext, QueryResult,
+    BuildError, ClientError, DocumentFormat, DocumentInfo, Engine, EngineBuilder, IndexContext,
+    IndexItem, IndexMode, IndexOptions, IndexResult, QueryContext, QueryResult,
 };
 
 // Error types
@@ -91,4 +81,3 @@
 pub use events::{EventEmitter, IndexEvent, QueryEvent, WorkspaceEvent};
 
 // Index metrics
 pub use metrics::IndexMetrics;
-
diff --git a/rust/src/llm/client.rs b/rust/src/llm/client.rs
index 40386094..0c01bbdc 100644
--- a/rust/src/llm/client.rs
+++ b/rust/src/llm/client.rs
@@ -347,14 +347,4 @@ mod tests {
 
         assert!(client.concurrency().is_some());
     }
-
-    #[test]
-    fn test_client_with_fallback() {
-        use crate::llm::FallbackConfig;
-
-        let fallback = FallbackChain::new(FallbackConfig::default());
-        let client = LlmClient::for_model("gpt-4o").with_fallback(fallback);
-
-        assert!(client.fallback().is_some());
-    }
 }
diff --git a/rust/src/llm/mod.rs b/rust/src/llm/mod.rs
index 50cd8557..c19b60e1 100644
--- a/rust/src/llm/mod.rs
+++ b/rust/src/llm/mod.rs
@@ -72,8 +72,6 @@
 mod pool;
 mod retry;
 
 pub use client::LlmClient;
-pub use config::{LlmConfig, LlmConfigs, RetryConfig};
-pub use error::{LlmError, LlmResult};
+pub use config::LlmConfig;
+pub use error::LlmResult;
 pub use executor::LlmExecutor;
-pub use fallback::{FallbackChain, FallbackConfig, FallbackResult, FallbackStep};
-pub use pool::LlmPool;
diff --git a/rust/src/memo/mod.rs b/rust/src/memo/mod.rs
index 75d754c2..50523c16 100644
--- a/rust/src/memo/mod.rs
+++ b/rust/src/memo/mod.rs
@@ -32,4 +32,4 @@
 mod store;
 mod types;
 
 pub use store::MemoStore;
-pub use types::{MemoEntry, MemoKey, MemoOpType, MemoStats, MemoValue, PilotDecisionValue};
+pub use types::{MemoKey, MemoValue, PilotDecisionValue};
diff --git a/rust/src/memo/store.rs b/rust/src/memo/store.rs
index 913392fa..85860937 100644
--- a/rust/src/memo/store.rs
+++ b/rust/src/memo/store.rs
@@ -7,17 +7,16 @@
 
 use std::collections::HashMap;
 use std::future::Future;
-use std::num::NonZeroUsize;
 use std::path::Path;
-use std::sync::atomic::{AtomicU64, Ordering};
 use std::sync::Arc;
+use std::sync::atomic::{AtomicU64, Ordering};
 
-use chrono::{Duration, Utc};
+use chrono::Duration;
 use lru::LruCache;
 use parking_lot::RwLock;
 use serde::{Deserialize, Serialize};
 use tokio::sync::RwLock as AsyncRwLock;
-use tracing::{debug, info, warn};
+use tracing::{debug, info};
 
 use super::types::{MemoEntry, MemoKey, MemoOpType, MemoStats, MemoValue};
 use crate::error::Result;
@@ -150,7 +149,8 @@
     pub fn with_capacity(capacity: usize) -> Self {
         Self {
             cache: Arc::new(RwLock::new(LruCache::new(
-                std::num::NonZeroUsize::new(capacity).unwrap_or(std::num::NonZeroUsize::new(1000).unwrap()),
+                std::num::NonZeroUsize::new(capacity)
+                    .unwrap_or(std::num::NonZeroUsize::new(1000).unwrap()),
             ))),
             stats: Arc::new(AsyncRwLock::new(MemoStats::default())),
             ttl: DEFAULT_TTL,
@@ -219,11 +219,7 @@
     /// This is the primary method for using the memo store.
     /// It will return the cached value if present, or call the
    /// provided compute function and cache the result.
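+    ///
+    /// Shape sketch (hypothetical compute closure; the tuple pairs the value
+    /// to cache with the token count a future cache hit would save):
+    ///
+    /// ```ignore
+    /// let value = store
+    ///     .get_or_compute(key, || async {
+    ///         let summary = expensive_llm_call().await?;
+    ///         Ok((MemoValue::Summary(summary), 120)) // (value, tokens)
+    ///     })
+    ///     .await?;
+    /// ```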
- pub async fn get_or_compute( - &self, - key: MemoKey, - compute: F, - ) -> Result + pub async fn get_or_compute(&self, key: MemoKey, compute: F) -> Result where F: FnOnce() -> Fut, Fut: Future>, // (value, tokens) @@ -376,14 +372,19 @@ impl MemoStore { // Since the key is a fingerprint, we need to check model_id from entries // For now, we'll clear all entries if prefix matches our model_id // A better approach would be to store model_id in entry metadata - let should_clear = self.model_id.as_ref() + let should_clear = self + .model_id + .as_ref() .map(|m| m.starts_with(prefix)) .unwrap_or(false); if should_clear { cache.clear(); let removed = before; - debug!("Invalidated all {} entries (model prefix '{}')", removed, prefix); + debug!( + "Invalidated all {} entries (model prefix '{}')", + removed, prefix + ); return removed; } @@ -419,10 +420,8 @@ impl MemoStore { let cache = self.cache.read(); let stats = self.stats.read().await; - let entries: HashMap = cache - .iter() - .map(|(k, v)| (k.clone(), v.clone())) - .collect(); + let entries: HashMap = + cache.iter().map(|(k, v)| (k.clone(), v.clone())).collect(); let data = MemoStoreData { version: 1, @@ -430,9 +429,9 @@ impl MemoStore { stats: stats.clone(), }; - let parent = path.parent().ok_or_else(|| { - crate::Error::Parse("Invalid path for memo store".to_string()) - })?; + let parent = path + .parent() + .ok_or_else(|| crate::Error::Parse("Invalid path for memo store".to_string()))?; tokio::fs::create_dir_all(parent).await?; let temp_path = path.with_extension("tmp"); @@ -441,7 +440,11 @@ impl MemoStore { tokio::fs::write(&temp_path, &json).await?; tokio::fs::rename(&temp_path, path).await?; - info!("Saved memo store with {} entries to {:?}", data.entries.len(), path); + info!( + "Saved memo store with {} entries to {:?}", + data.entries.len(), + path + ); Ok(()) } @@ -471,7 +474,11 @@ impl MemoStore { stats.tokens_saved = data.stats.tokens_saved; stats.cost_saved = data.stats.cost_saved; - info!("Loaded memo store with {} entries from {:?}", cache.len(), path); + info!( + "Loaded memo store with {} entries from {:?}", + cache.len(), + path + ); Ok(()) } @@ -660,7 +667,11 @@ mod tests { let store = MemoStore::new(); let key = make_test_key(); - store.put_with_tokens(key.clone(), MemoValue::Summary("Test summary".to_string()), 100); + store.put_with_tokens( + key.clone(), + MemoValue::Summary("Test summary".to_string()), + 100, + ); // Save store.save(&path).await.unwrap(); diff --git a/rust/src/metrics/hub.rs b/rust/src/metrics/hub.rs index 2088e25c..ee6e14af 100644 --- a/rust/src/metrics/hub.rs +++ b/rust/src/metrics/hub.rs @@ -97,8 +97,13 @@ impl MetricsHub { if !self.config.enabled || !self.config.llm.track_tokens { return; } - self.llm - .record_call(input_tokens, output_tokens, latency_ms, success, &self.config.llm); + self.llm.record_call( + input_tokens, + output_tokens, + latency_ms, + success, + &self.config.llm, + ); } /// Record an LLM rate limit error. @@ -193,17 +198,16 @@ impl MetricsHub { // ======================================================================== /// Record a retrieval query. 
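///
/// # Example
///
/// A sketch of recording one finished query; the numbers are illustrative
/// and the `hub` value is assumed to come from the engine's metrics setup.
///
/// ```rust,ignore
/// // 3 navigation iterations, 42 nodes visited, 180 ms end-to-end
/// hub.record_retrieval_query(3, 42, 180);
/// ```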
- pub fn record_retrieval_query( - &self, - iterations: u64, - nodes_visited: u64, - latency_ms: u64, - ) { + pub fn record_retrieval_query(&self, iterations: u64, nodes_visited: u64, latency_ms: u64) { if !self.config.enabled { return; } - self.retrieval - .record_query(iterations, nodes_visited, latency_ms, &self.config.retrieval); + self.retrieval.record_query( + iterations, + nodes_visited, + latency_ms, + &self.config.retrieval, + ); } /// Record a found path. diff --git a/rust/src/metrics/index.rs b/rust/src/metrics/index.rs index e23769d2..58054661 100644 --- a/rust/src/metrics/index.rs +++ b/rust/src/metrics/index.rs @@ -28,6 +28,14 @@ pub struct IndexMetrics { #[serde(default)] pub optimize_time_ms: u64, + /// Validate stage duration (ms). + #[serde(default)] + pub validate_time_ms: u64, + + /// Split stage duration (ms). + #[serde(default)] + pub split_time_ms: u64, + /// Reasoning index build duration (ms). #[serde(default)] pub reasoning_index_time_ms: u64, @@ -96,6 +104,16 @@ impl IndexMetrics { self.optimize_time_ms = duration_ms; } + /// Record validate stage time. + pub fn record_validate(&mut self, duration_ms: u64) { + self.validate_time_ms = duration_ms; + } + + /// Record split stage time. + pub fn record_split(&mut self, duration_ms: u64) { + self.split_time_ms = duration_ms; + } + /// Record reasoning index build time. pub fn record_reasoning_index(&mut self, duration_ms: u64, topics: usize, keywords: usize) { self.reasoning_index_time_ms = duration_ms; @@ -137,6 +155,8 @@ impl IndexMetrics { pub fn total_time_ms(&self) -> u64 { self.parse_time_ms + self.build_time_ms + + self.validate_time_ms + + self.split_time_ms + self.enhance_time_ms + self.enrich_time_ms + self.reasoning_index_time_ms diff --git a/rust/src/metrics/llm.rs b/rust/src/metrics/llm.rs index c8dc30f1..257747ae 100644 --- a/rust/src/metrics/llm.rs +++ b/rust/src/metrics/llm.rs @@ -131,7 +131,8 @@ impl LlmMetrics { 0.0 }, total_latency_ms: total_latency, - estimated_cost_usd: self.estimated_cost_micros.load(Ordering::Relaxed) as f64 / 1_000_000.0, + estimated_cost_usd: self.estimated_cost_micros.load(Ordering::Relaxed) as f64 + / 1_000_000.0, rate_limit_errors: self.rate_limit_errors.load(Ordering::Relaxed), timeout_errors: self.timeout_errors.load(Ordering::Relaxed), fallback_triggers: self.fallback_triggers.load(Ordering::Relaxed), diff --git a/rust/src/metrics/mod.rs b/rust/src/metrics/mod.rs index b311efea..258f235d 100644 --- a/rust/src/metrics/mod.rs +++ b/rust/src/metrics/mod.rs @@ -55,10 +55,4 @@ mod llm; mod pilot; mod retrieval; -pub use hub::MetricsHub; pub use index::IndexMetrics; -pub use llm::{LlmMetrics, LlmMetricsReport}; -pub use pilot::{InterventionPoint, PilotMetrics, PilotMetricsReport}; -pub use retrieval::{RetrievalMetrics, RetrievalMetricsReport}; - -pub(crate) use crate::config::MetricsConfig; diff --git a/rust/src/metrics/pilot.rs b/rust/src/metrics/pilot.rs index ccc2ea5e..fee0e011 100644 --- a/rust/src/metrics/pilot.rs +++ b/rust/src/metrics/pilot.rs @@ -71,7 +71,12 @@ impl PilotMetrics { } /// Record a Pilot decision. 
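///
/// # Example
///
/// A sketch; `point` stands for whichever `InterventionPoint` variant fired,
/// and the confidence is a value in `[0.0, 1.0]` (scaled to an integer below
/// so it can be accumulated atomically).
///
/// ```rust,ignore
/// pilot_metrics.record_decision(0.87, point, &config);
/// ```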
- pub fn record_decision(&self, confidence: f64, point: InterventionPoint, config: &PilotMetricsConfig) { + pub fn record_decision( + &self, + confidence: f64, + point: InterventionPoint, + config: &PilotMetricsConfig, + ) { if !config.track_decisions { return; } @@ -95,7 +100,8 @@ impl PilotMetrics { // Update average confidence (store as scaled integer for atomic operations) let scaled_confidence = (confidence * 1_000_000.0) as u64; - self.confidence_sum_scaled.fetch_add(scaled_confidence, Ordering::Relaxed); + self.confidence_sum_scaled + .fetch_add(scaled_confidence, Ordering::Relaxed); self.confidence_count.fetch_add(1, Ordering::Relaxed); } diff --git a/rust/src/metrics/retrieval.rs b/rust/src/metrics/retrieval.rs index 56ecb140..682250e9 100644 --- a/rust/src/metrics/retrieval.rs +++ b/rust/src/metrics/retrieval.rs @@ -49,11 +49,18 @@ impl RetrievalMetrics { } /// Record a query. - pub fn record_query(&self, iterations: u64, nodes: u64, latency_ms: u64, config: &RetrievalMetricsConfig) { + pub fn record_query( + &self, + iterations: u64, + nodes: u64, + latency_ms: u64, + config: &RetrievalMetricsConfig, + ) { self.total_queries.fetch_add(1, Ordering::Relaxed); if config.track_iterations { - self.total_iterations.fetch_add(iterations, Ordering::Relaxed); + self.total_iterations + .fetch_add(iterations, Ordering::Relaxed); self.iterations_sum.fetch_add(iterations, Ordering::Relaxed); } @@ -61,7 +68,8 @@ impl RetrievalMetrics { self.nodes_visited.fetch_add(nodes, Ordering::Relaxed); } - self.total_latency_ms.fetch_add(latency_ms, Ordering::Relaxed); + self.total_latency_ms + .fetch_add(latency_ms, Ordering::Relaxed); } /// Record a found path. @@ -75,7 +83,8 @@ impl RetrievalMetrics { if config.track_scores { let scaled_score = (score * 1_000_000.0) as u64; - self.path_score_sum_scaled.fetch_add(scaled_score, Ordering::Relaxed); + self.path_score_sum_scaled + .fetch_add(scaled_score, Ordering::Relaxed); if score >= 0.5 { self.high_score_paths.fetch_add(1, Ordering::Relaxed); @@ -156,7 +165,8 @@ impl RetrievalMetrics { 0.0 }, avg_path_score: if paths_found > 0 { - (self.path_score_sum_scaled.load(Ordering::Relaxed) as f64 / 1_000_000.0) / paths_found as f64 + (self.path_score_sum_scaled.load(Ordering::Relaxed) as f64 / 1_000_000.0) + / paths_found as f64 } else { 0.0 }, diff --git a/rust/src/parser/docx/mod.rs b/rust/src/parser/docx/mod.rs deleted file mode 100644 index b5bf602e..00000000 --- a/rust/src/parser/docx/mod.rs +++ /dev/null @@ -1,47 +0,0 @@ -// Copyright (c) 2026 vectorless developers -// SPDX-License-Identifier: Apache-2.0 - -//! DOCX document parsing module. -//! -//! This module provides functionality to parse DOCX (Microsoft Word) documents: -//! - **DocxParser** — Extract structured content from DOCX files -//! - **StyleResolver** — Resolve heading styles from style definitions -//! -//! # DOCX Structure -//! -//! A DOCX file is a ZIP archive containing XML files: -//! -//! ```text -//! document.docx -//! ├── word/ -//! │ ├── document.xml # Main content -//! │ └── styles.xml # Style definitions (optional) -//! ``` -//! -//! # Example -//! -//! ```rust,no_run -//! use vectorless::parser::docx::DocxParser; -//! use vectorless::DocumentParser; -//! use std::path::Path; -//! -//! # #[tokio::main] -//! # async fn main() -> vectorless::Result<()> { -//! let parser = DocxParser::new(); -//! let result = parser.parse_file(Path::new("document.docx")).await?; -//! -//! println!("Extracted {} nodes", result.node_count()); -//! for node in &result.nodes { -//! 
println!(" - {} (level {})", node.title, node.level); -//! } -//! # Ok(()) -//! # } -//! ``` - -mod parser; -mod styles; -mod types; - -pub use parser::DocxParser; -pub use styles::StyleResolver; -pub use types::{DocxParagraph, DocxStyle}; diff --git a/rust/src/parser/docx/parser.rs b/rust/src/parser/docx/parser.rs deleted file mode 100644 index c32a047f..00000000 --- a/rust/src/parser/docx/parser.rs +++ /dev/null @@ -1,387 +0,0 @@ -// Copyright (c) 2026 vectorless developers -// SPDX-License-Identifier: Apache-2.0 - -//! DOCX document parser. -//! -//! This module provides functionality to parse DOCX (Microsoft Word) documents: -//! - **DocxParser** — Extract structured content from DOCX files -//! -//! # Example -//! -//! ```rust,no_run -//! use vectorless::parser::docx::DocxParser; -//! use vectorless::DocumentParser; -//! use std::path::Path; -//! -//! # #[tokio::main] -//! # async fn main() -> vectorless::Result<()> { -//! let parser = DocxParser::new(); -//! let result = parser.parse_file(Path::new("document.docx")).await?; -//! -//! println!("Extracted {} nodes", result.node_count()); -//! for node in &result.nodes { -//! println!(" - {} (level {})", node.title, node.level); -//! } -//! # Ok(()) -//! # } -//! ``` - -use std::io::{Cursor, Read}; -use std::path::Path; - -use async_trait::async_trait; -use zip::ZipArchive; - -use crate::Error; -use crate::error::Result; -use crate::parser::{DocumentFormat, DocumentMeta, DocumentParser, ParseResult, RawNode}; - -use super::styles::StyleResolver; -use super::types::DocxParagraph; - -/// DOCX document parser. -#[derive(Debug, Clone, Default)] -pub struct DocxParser; - -impl DocxParser { - /// Create a new DOCX parser. - pub fn new() -> Self { - Self::default() - } - - /// Parse a DOCX file and return raw nodes. - pub fn parse_file_sync(&self, path: &Path) -> Result { - let bytes = std::fs::read(path) - .map_err(|e| Error::Parse(format!("Failed to read DOCX file: {}", e)))?; - - self.parse_bytes(&bytes, path.file_stem().and_then(|s| s.to_str())) - } - - /// Parse DOCX from bytes. - pub fn parse_bytes(&self, bytes: &[u8], filename: Option<&str>) -> Result { - // Create ZIP archive from bytes - let cursor = Cursor::new(bytes); - let mut archive = ZipArchive::new(cursor) - .map_err(|e| Error::Parse(format!("Failed to open DOCX archive: {}", e)))?; - - // Read styles.xml (optional) - let style_resolver = self.read_styles(&mut archive)?; - - // Read document.xml (required) - let document_xml = self.read_xml_file(&mut archive, "word/document.xml")?; - - // Parse paragraphs from document - let paragraphs = self.parse_paragraphs(&document_xml, &style_resolver)?; - - // Convert paragraphs to raw nodes - let nodes = self.build_raw_nodes(paragraphs)?; - - // Create metadata - let meta = DocumentMeta { - name: filename.unwrap_or("Document").to_string(), - format: DocumentFormat::Docx, - page_count: None, - line_count: nodes.len(), - source_path: None, - description: None, - }; - - Ok(ParseResult::new(meta, nodes)) - } - - /// Read styles.xml and create a style resolver. - fn read_styles(&self, archive: &mut ZipArchive>) -> Result { - match self.read_xml_file(archive, "word/styles.xml") { - Ok(xml) => Ok(StyleResolver::from_xml(&xml)), - Err(_) => { - // styles.xml is optional, use default resolver with built-in styles - Ok(StyleResolver::from_xml("")) - } - } - } - - /// Read an XML file from the archive. 
- fn read_xml_file(&self, archive: &mut ZipArchive>, path: &str) -> Result { - let mut file = archive - .by_name(path) - .map_err(|e| Error::Parse(format!("Failed to read {} from DOCX: {}", path, e)))?; - - let mut content = String::new(); - file.read_to_string(&mut content) - .map_err(|e| Error::Parse(format!("Failed to read {} content: {}", path, e)))?; - - Ok(content) - } - - /// Parse paragraphs from document.xml. - /// Word namespace URI. - const WORD_NS: &'static str = "http://schemas.openxmlformats.org/wordprocessingml/2006/main"; - - fn parse_paragraphs( - &self, - document_xml: &str, - style_resolver: &StyleResolver, - ) -> Result> { - let doc = roxmltree::Document::parse(document_xml) - .map_err(|e| Error::Parse(format!("Failed to parse document.xml: {}", e)))?; - - let mut paragraphs = Vec::new(); - - // Find all w:p elements (paragraphs) - for para_elem in doc - .descendants() - .filter(|n| n.has_tag_name((Self::WORD_NS, "p"))) - { - if let Some(para) = self.parse_paragraph(¶_elem, style_resolver) { - paragraphs.push(para); - } - } - - Ok(paragraphs) - } - - /// Parse a single paragraph element. - fn parse_paragraph( - &self, - elem: &roxmltree::Node, - style_resolver: &StyleResolver, - ) -> Option { - // Extract text from all w:t elements - let text = self.extract_text(elem); - - if text.trim().is_empty() { - return None; - } - - let mut para = DocxParagraph::new(text); - - // Get style from w:pPr/w:pStyle - for child in elem.children() { - if child.has_tag_name((Self::WORD_NS, "pPr")) { - for ppr_child in child.children() { - if ppr_child.has_tag_name((Self::WORD_NS, "pStyle")) { - if let Some(style_id) = ppr_child.attribute((Self::WORD_NS, "val")) { - para.style_id = Some(style_id.to_string()); - para.heading_level = style_resolver.get_heading_level(¶.style_id); - } - } - } - } - } - - // If no style found, try heuristics - if para.heading_level.is_none() { - para.heading_level = style_resolver.detect_heading_by_heuristics(¶.text); - } - - Some(para) - } - - /// Extract text from a paragraph element. - fn extract_text(&self, elem: &roxmltree::Node) -> String { - let mut text = String::new(); - - // Find all w:t elements (text runs) - for text_elem in elem - .descendants() - .filter(|n| n.has_tag_name((Self::WORD_NS, "t"))) - { - if let Some(t) = text_elem.text() { - text.push_str(t); - } - } - - text - } - - /// Build raw nodes from parsed paragraphs. 
- fn build_raw_nodes(&self, paragraphs: Vec) -> Result> { - let mut nodes: Vec = Vec::new(); - let mut current_sections: Vec<(u8, RawNode)> = Vec::new(); // (level, node) - let mut has_headings = false; - let mut unassigned_text: Vec = Vec::new(); - - for para in paragraphs { - if !para.has_content() { - continue; - } - - if let Some(level) = para.heading_level { - // This is a heading - create a new section - has_headings = true; - - // If there was unassigned text before this heading, add it to the previous section - if !unassigned_text.is_empty() { - if let Some((_, node)) = current_sections.last_mut() { - if !node.content.is_empty() { - node.content.push('\n'); - } - node.content.push_str(&unassigned_text.join("\n")); - } - unassigned_text.clear(); - } - - // Save content to previous section at the same or deeper level - self.finalize_deeper_sections(&mut current_sections, level); - - // Create new section - let node = RawNode::new(¶.text).with_level(level as usize); - - current_sections.push((level, node)); - } else { - // This is body text - if current_sections.is_empty() { - // No sections yet, collect text for later - unassigned_text.push(para.text); - } else { - // Append to the deepest current section - if let Some((_, node)) = current_sections.last_mut() { - if !node.content.is_empty() { - node.content.push('\n'); - } - node.content.push_str(¶.text); - } - } - } - } - - // Finalize remaining sections - while let Some((_level, node)) = current_sections.pop() { - // The tree builder will handle proper hierarchy - nodes.insert(0, node); - } - - // If no headings found, create a single node with all content - if !has_headings { - let combined = unassigned_text.join("\n"); - let node = RawNode::new("Document") - .with_content(combined) - .with_level(1); - return Ok(vec![node]); - } - - Ok(nodes) - } - - /// Finalize sections that are deeper than the given level. - fn finalize_deeper_sections(&self, sections: &mut Vec<(u8, RawNode)>, new_level: u8) { - // Pop sections that are at the same level or deeper - while let Some((level, _)) = sections.last() { - if *level >= new_level { - // This section will be replaced by the new one - sections.pop(); - } else { - break; - } - } - } -} - -#[async_trait] -impl DocumentParser for DocxParser { - fn format(&self) -> DocumentFormat { - DocumentFormat::Docx - } - - async fn parse(&self, content: &str) -> Result { - // For DOCX, content should be a file path - let path = Path::new(content); - self.parse_file(path).await - } - - async fn parse_file(&self, path: &Path) -> Result { - // Run sync parsing in a blocking task - let path = path.to_path_buf(); - tokio::task::spawn_blocking(move || { - let parser = DocxParser::new(); - parser.parse_file_sync(&path) - }) - .await - .map_err(|e| Error::Parse(format!("DOCX parsing task failed: {}", e)))? 
- } -} - -#[cfg(test)] -mod tests { - use super::*; - - #[test] - fn test_parser_creation() { - let parser = DocxParser::new(); - assert_eq!(parser.format(), DocumentFormat::Docx); - } - - #[test] - fn test_extract_text() { - let parser = DocxParser::new(); - - // Include namespace declaration for w: prefix - let xml = r#" - - - Hello - - - World - - - "#; - - let doc = roxmltree::Document::parse(xml).unwrap(); - let elem = doc.root().first_child().unwrap(); - let text = parser.extract_text(&elem); - - assert_eq!(text, "Hello World"); - } - - #[test] - fn test_build_raw_nodes_no_headings() { - let parser = DocxParser::new(); - - let paragraphs = vec![ - DocxParagraph::new("First paragraph"), - DocxParagraph::new("Second paragraph"), - ]; - - let nodes = parser.build_raw_nodes(paragraphs).unwrap(); - - assert_eq!(nodes.len(), 1, "Should have exactly one node"); - assert_eq!( - nodes[0].title, "Document", - "Node title should be 'Document'" - ); - assert!( - nodes[0].content.contains("First paragraph"), - "Content should contain 'First paragraph', got: {:?}", - nodes[0].content - ); - assert!( - nodes[0].content.contains("Second paragraph"), - "Content should contain 'Second paragraph', got: {:?}", - nodes[0].content - ); - } - - #[test] - fn test_build_raw_nodes_with_headings() { - let parser = DocxParser::new(); - - let mut para1 = DocxParagraph::new("Introduction"); - para1.heading_level = Some(1); - - let para2 = DocxParagraph::new("This is the intro content."); - - let mut para3 = DocxParagraph::new("Details"); - para3.heading_level = Some(2); - - let para4 = DocxParagraph::new("More details here."); - - let paragraphs = vec![para1, para2, para3, para4]; - - let nodes = parser.build_raw_nodes(paragraphs).unwrap(); - - assert!(nodes.len() >= 2); - assert!(nodes.iter().any(|n| n.title == "Introduction")); - assert!(nodes.iter().any(|n| n.title == "Details")); - } -} diff --git a/rust/src/parser/docx/styles.rs b/rust/src/parser/docx/styles.rs deleted file mode 100644 index 8414410a..00000000 --- a/rust/src/parser/docx/styles.rs +++ /dev/null @@ -1,264 +0,0 @@ -// Copyright (c) 2026 vectorless developers -// SPDX-License-Identifier: Apache-2.0 - -//! Style resolution for DOCX documents. -//! -//! This module handles the detection of heading styles from DOCX documents. - -use std::collections::HashMap; - -use super::types::DocxStyle; - -/// Word namespace URI. -const WORD_NS: &str = "http://schemas.openxmlformats.org/wordprocessingml/2006/main"; - -/// Style resolver for mapping style IDs to heading levels. -#[derive(Debug, Clone, Default)] -pub struct StyleResolver { - /// Map from style_id to resolved style info. - styles: HashMap, -} - -impl StyleResolver { - /// Create a new empty style resolver. - pub fn new() -> Self { - Self::default() - } - - /// Create a style resolver from styles.xml content. - pub fn from_xml(styles_xml: &str) -> Self { - let mut resolver = Self::new(); - - // Add built-in styles first - resolver.add_builtin_styles(); - - // Parse styles.xml if available - if !styles_xml.is_empty() { - resolver.parse_styles_xml(styles_xml); - } - - resolver - } - - /// Add built-in heading styles. 
- fn add_builtin_styles(&mut self) { - // Standard Word heading styles - for level in 1..=6 { - let style_id = format!("Heading{}", level); - self.styles - .insert(style_id.clone(), DocxStyle::heading(&style_id, level)); - } - - // Some documents use lowercase or different casing - for level in 1..=6 { - let style_id = format!("heading{}", level); - self.styles - .insert(style_id.clone(), DocxStyle::heading(&style_id, level)); - } - - // Title style (treat as H1) - self.styles - .insert("Title".to_string(), DocxStyle::heading("Title", 1)); - } - - /// Parse styles.xml content. - fn parse_styles_xml(&mut self, styles_xml: &str) { - let doc = match roxmltree::Document::parse(styles_xml) { - Ok(doc) => doc, - Err(_) => return, - }; - - // Find all w:style elements - for style_elem in doc - .descendants() - .filter(|n| n.has_tag_name((WORD_NS, "style"))) - { - if let Some(style) = self.parse_style_element(&style_elem) { - self.styles.insert(style.style_id.clone(), style); - } - } - } - - /// Parse a single w:style element. - fn parse_style_element(&self, elem: &roxmltree::Node) -> Option { - // Get style ID - let style_id = elem.attribute((WORD_NS, "styleId"))?.to_string(); - - let mut style = DocxStyle::new(&style_id); - - // Get style name - for child in elem.children() { - if child.has_tag_name((WORD_NS, "name")) { - if let Some(name) = child.attribute((WORD_NS, "val")) { - style.name = Some(name.to_string()); - - // Check if name indicates a heading - let name_lower = name.to_lowercase(); - if name_lower.starts_with("heading") { - style.is_heading = true; - // Extract heading level from name - if let Some(level) = self.extract_heading_level(&name_lower) { - style.heading_level = Some(level); - } - } - } - } - - // Check for outline level (indicates heading) - if child.has_tag_name((WORD_NS, "pPr")) { - for ppr_child in child.children() { - if ppr_child.has_tag_name((WORD_NS, "outlineLvl")) { - if let Some(level_str) = ppr_child.attribute((WORD_NS, "val")) { - if let Ok(level) = level_str.parse::() { - style.is_heading = true; - // outlineLvl is 0-indexed, heading level is 1-indexed - style.heading_level = Some(level + 1); - } - } - } - } - } - } - - Some(style) - } - - /// Extract heading level from a style name. - fn extract_heading_level(&self, name: &str) -> Option { - // Try to extract number from "heading N" or "headingN" - let digits: String = name.chars().filter(|c| c.is_ascii_digit()).collect(); - digits.parse().ok().filter(|&l| l >= 1 && l <= 6) - } - - /// Get heading level for a style ID. - pub fn get_heading_level(&self, style_id: &Option) -> Option { - style_id - .as_ref() - .and_then(|id| self.styles.get(id).and_then(|s| s.heading_level)) - } - - /// Check if a style is a heading. - pub fn is_heading(&self, style_id: &Option) -> bool { - style_id - .as_ref() - .is_some_and(|id| self.styles.get(id).is_some_and(|s| s.is_heading)) - } - - /// Try to detect heading level from text content heuristics. - /// - /// This is used when no style information is available. 
- pub fn detect_heading_by_heuristics(&self, text: &str) -> Option { - let text = text.trim(); - - // Skip very long texts (unlikely to be headings) - if text.len() > 100 { - return None; - } - - // Check for common heading patterns - // Pattern: "Chapter X" or "Section X" - let text_lower = text.to_lowercase(); - if text_lower.starts_with("chapter ") || text_lower.starts_with("section ") { - return Some(1); - } - - // Pattern: numbered sections like "1.", "1.1", "1.1.1" - let numbered_level = self.detect_numbered_heading(text); - if numbered_level.is_some() { - return numbered_level; - } - - None - } - - /// Detect heading level from numbered patterns. - fn detect_numbered_heading(&self, text: &str) -> Option { - // Match patterns like "1.", "1.1", "1.1.1", etc. - let mut depth = 0u8; - let mut prev_was_digit = false; - let mut has_digit = false; - - for ch in text.chars() { - if ch.is_ascii_digit() { - prev_was_digit = true; - has_digit = true; - } else if ch == '.' && prev_was_digit { - depth += 1; - prev_was_digit = false; - } else if ch == ' ' && prev_was_digit { - // End of number sequence - depth += 1; - break; - } else if !ch.is_whitespace() && has_digit { - // Non-digit, non-dot, non-space after digits - break; - } - } - - if depth > 0 && depth <= 6 { - Some(depth) - } else { - None - } - } -} - -#[cfg(test)] -mod tests { - use super::*; - - #[test] - fn test_builtin_styles() { - let resolver = StyleResolver::new(); - let resolver = { - let mut r = resolver; - r.add_builtin_styles(); - r - }; - - assert_eq!( - resolver.get_heading_level(&Some("Heading1".to_string())), - Some(1) - ); - assert_eq!( - resolver.get_heading_level(&Some("Heading2".to_string())), - Some(2) - ); - assert_eq!( - resolver.get_heading_level(&Some("Normal".to_string())), - None - ); - } - - #[test] - fn test_detect_numbered_heading() { - let resolver = StyleResolver::new(); - - assert_eq!(resolver.detect_numbered_heading("1. Introduction"), Some(1)); - assert_eq!(resolver.detect_numbered_heading("1.1 Background"), Some(2)); - assert_eq!(resolver.detect_numbered_heading("1.1.1 Details"), Some(3)); - assert_eq!(resolver.detect_numbered_heading("Introduction"), None); - } - - #[test] - fn test_detect_heading_by_heuristics() { - let resolver = StyleResolver::new(); - - assert_eq!(resolver.detect_heading_by_heuristics("Chapter 1"), Some(1)); - assert_eq!(resolver.detect_heading_by_heuristics("Section 2"), Some(1)); - assert_eq!( - resolver.detect_heading_by_heuristics("1. Introduction"), - Some(1) - ); - assert_eq!( - resolver.detect_heading_by_heuristics("1.1 Background"), - Some(2) - ); - assert_eq!( - resolver.detect_heading_by_heuristics( - "This is a very long piece of text that is unlikely to be a heading" - ), - None - ); - } -} diff --git a/rust/src/parser/docx/types.rs b/rust/src/parser/docx/types.rs deleted file mode 100644 index d406660e..00000000 --- a/rust/src/parser/docx/types.rs +++ /dev/null @@ -1,71 +0,0 @@ -// Copyright (c) 2026 vectorless developers -// SPDX-License-Identifier: Apache-2.0 - -//! DOCX-specific type definitions. - -/// Parsed DOCX paragraph. -#[derive(Debug, Clone)] -pub struct DocxParagraph { - /// Text content. - pub text: String, - /// Style ID (e.g., "Heading1", "Normal"). - pub style_id: Option, - /// Detected heading level (1-6), None for body text. - pub heading_level: Option, -} - -impl DocxParagraph { - /// Create a new paragraph. 
- pub fn new(text: impl Into) -> Self { - Self { - text: text.into(), - style_id: None, - heading_level: None, - } - } - - /// Check if this paragraph has content. - pub fn has_content(&self) -> bool { - !self.text.trim().is_empty() - } - - /// Check if this is a heading. - pub fn is_heading(&self) -> bool { - self.heading_level.is_some() - } -} - -/// Parsed style definition. -#[derive(Debug, Clone)] -pub struct DocxStyle { - /// Style ID (e.g., "Heading1"). - pub style_id: String, - /// Style name (e.g., "heading 1"). - pub name: Option, - /// Whether this style is a heading. - pub is_heading: bool, - /// Heading level (1-6), if this is a heading. - pub heading_level: Option, -} - -impl DocxStyle { - /// Create a new style. - pub fn new(style_id: impl Into) -> Self { - Self { - style_id: style_id.into(), - name: None, - is_heading: false, - heading_level: None, - } - } - - /// Create a heading style. - pub fn heading(style_id: impl Into, level: u8) -> Self { - Self { - style_id: style_id.into(), - name: Some(format!("heading {}", level)), - is_heading: true, - heading_level: Some(level), - } - } -} diff --git a/rust/src/parser/html/config.rs b/rust/src/parser/html/config.rs deleted file mode 100644 index f3b6c05c..00000000 --- a/rust/src/parser/html/config.rs +++ /dev/null @@ -1,166 +0,0 @@ -// Copyright (c) 2026 vectorless developers -// SPDX-License-Identifier: Apache-2.0 - -//! Configuration for HTML parsing. - -use serde::{Deserialize, Serialize}; - -/// Configuration for HTML parsing. -#[derive(Debug, Clone, Serialize, Deserialize)] -pub struct HtmlConfig { - /// Default title for nodes without headings. - #[serde(default = "default_title")] - pub default_title: String, - - /// Minimum content length to keep a node. - #[serde(default = "default_min_content_length")] - pub min_content_length: usize, - - /// Whether to include code blocks. - #[serde(default = "default_include_code_blocks")] - pub include_code_blocks: bool, - - /// Whether to merge small consecutive nodes. - #[serde(default = "default_merge_small_nodes")] - pub merge_small_nodes: bool, - - /// Maximum heading level to process (1-6). - #[serde(default = "default_max_heading_level")] - pub max_heading_level: usize, -} - -fn default_title() -> String { - "Introduction".to_string() -} - -fn default_min_content_length() -> usize { - 50 -} - -fn default_include_code_blocks() -> bool { - true -} - -fn default_merge_small_nodes() -> bool { - true -} - -fn default_max_heading_level() -> usize { - 6 -} - -impl Default for HtmlConfig { - fn default() -> Self { - Self { - default_title: default_title(), - min_content_length: default_min_content_length(), - include_code_blocks: default_include_code_blocks(), - merge_small_nodes: default_merge_small_nodes(), - max_heading_level: default_max_heading_level(), - } - } -} - -impl HtmlConfig { - /// Create a new config with default values. - pub fn new() -> Self { - Self::default() - } - - /// Set the default title for nodes without headings. - #[must_use] - pub fn with_default_title(mut self, title: impl Into) -> Self { - self.default_title = title.into(); - self - } - - /// Set minimum content length to keep a node. - #[must_use] - pub fn with_min_content_length(mut self, len: usize) -> Self { - self.min_content_length = len; - self - } - - /// Enable or disable code blocks. - #[must_use] - pub fn with_code_blocks(mut self, include: bool) -> Self { - self.include_code_blocks = include; - self - } - - /// Enable or disable merging of small consecutive nodes. 
- #[must_use] - pub fn with_merge_small_nodes(mut self, merge: bool) -> Self { - self.merge_small_nodes = merge; - self - } - - /// Set maximum heading level to process (1-6). - #[must_use] - pub fn with_max_heading_level(mut self, level: usize) -> Self { - self.max_heading_level = level.clamp(1, 6); - self - } - - /// Create a config that excludes code blocks. - #[must_use] - pub fn no_code_blocks() -> Self { - Self::new().with_code_blocks(false) - } - - /// Create a config for simple documents (no merging). - #[must_use] - pub fn simple() -> Self { - Self::new() - .with_merge_small_nodes(false) - .with_min_content_length(0) - } -} - -#[cfg(test)] -mod tests { - use super::*; - - #[test] - fn test_default_config() { - let config = HtmlConfig::default(); - assert_eq!(config.default_title, "Introduction"); - assert_eq!(config.min_content_length, 50); - assert!(config.include_code_blocks); - assert!(config.merge_small_nodes); - assert_eq!(config.max_heading_level, 6); - } - - #[test] - fn test_builder_pattern() { - let config = HtmlConfig::new() - .with_default_title("Overview") - .with_min_content_length(100) - .with_code_blocks(false) - .with_max_heading_level(3); - - assert_eq!(config.default_title, "Overview"); - assert_eq!(config.min_content_length, 100); - assert!(!config.include_code_blocks); - assert_eq!(config.max_heading_level, 3); - } - - #[test] - fn test_max_heading_level_clamp() { - let config = HtmlConfig::new().with_max_heading_level(10); - assert_eq!(config.max_heading_level, 6); - - let config = HtmlConfig::new().with_max_heading_level(0); - assert_eq!(config.max_heading_level, 1); - } - - #[test] - fn test_preset_configs() { - let config = HtmlConfig::no_code_blocks(); - assert!(!config.include_code_blocks); - - let config = HtmlConfig::simple(); - assert!(!config.merge_small_nodes); - assert_eq!(config.min_content_length, 0); - } -} diff --git a/rust/src/parser/html/mod.rs b/rust/src/parser/html/mod.rs deleted file mode 100644 index b920b4a5..00000000 --- a/rust/src/parser/html/mod.rs +++ /dev/null @@ -1,45 +0,0 @@ -// Copyright (c) 2026 vectorless developers -// SPDX-License-Identifier: Apache-2.0 - -//! HTML document parser. -//! -//! This module provides an HTML parser that extracts hierarchical structure -//! from HTML documents using heading tags (`
<h1>`-`<h6>`) as section markers. -//! -//! # Features -//! -//! - Parses HTML5 documents using `scraper` -//! - Extracts heading hierarchy (`<h1>`-`<h6>`) -//! - Extracts content from paragraphs, lists, tables, etc. -//! - Preserves document structure -//! -//! # Example -//! -//! ```rust -//! use vectorless::parser::html::HtmlParser; -//! use vectorless::parser::DocumentParser; -//! -//! # #[tokio::main] -//! # async fn main() -> vectorless::Result<()> { -//! let parser = HtmlParser::new(); -//! let html = r#" -//! <html> -//! <body> -//! <h1>Title</h1> -//! <p>Introduction paragraph.</p> -//! <h2>Section 1</h2> -//! <p>Content for section 1.</p> -//! </body> -//! </html>
-//! -//! -//! "#; -//! let result = parser.parse(html).await?; -//! println!("Found {} nodes", result.node_count()); -//! # Ok(()) -//! # } -//! ``` - -mod config; -mod parser; - -pub use config::HtmlConfig; -pub use parser::HtmlParser; diff --git a/rust/src/parser/html/parser.rs b/rust/src/parser/html/parser.rs deleted file mode 100644 index a0c81e5c..00000000 --- a/rust/src/parser/html/parser.rs +++ /dev/null @@ -1,540 +0,0 @@ -// Copyright (c) 2026 vectorless developers -// SPDX-License-Identifier: Apache-2.0 - -//! HTML parser implementation using scraper. - -use async_trait::async_trait; -use scraper::{ElementRef, Html, Selector}; -use std::path::Path; - -use crate::error::Result; -use crate::parser::{DocumentFormat, DocumentMeta, DocumentParser, ParseResult, RawNode}; -use crate::utils::estimate_tokens; - -use super::config::HtmlConfig; - -/// Metadata extracted from HTML. -struct HtmlMetadata { - title: String, - description: Option, - author: Option, - keywords: Option, -} - -impl Default for HtmlMetadata { - fn default() -> Self { - Self { - title: String::new(), - description: None, - author: None, - keywords: None, - } - } -} - -/// HTML parser that extracts hierarchical structure from HTML documents. -/// -/// Uses `scraper` for HTML5-compliant parsing. Extracts heading hierarchy -/// and content from various HTML elements. -#[derive(Debug, Clone)] -pub struct HtmlParser { - /// Configuration options. - config: HtmlConfig, -} - -impl Default for HtmlParser { - fn default() -> Self { - Self::new() - } -} - -impl HtmlParser { - /// Create a new HTML parser with default configuration. - #[must_use] - pub fn new() -> Self { - Self::with_config(HtmlConfig::default()) - } - - /// Create a parser with custom configuration. - #[must_use] - pub fn with_config(config: HtmlConfig) -> Self { - Self { config } - } - - /// Parse HTML content and extract nodes. - fn extract_nodes(&self, content: &str) -> (Vec, HtmlMetadata) { - let document = Html::parse_document(content); - - // Extract metadata from - let metadata = self.extract_metadata(&document); - - // Extract nodes from - let nodes = self.extract_nodes_from_document(&document); - - (nodes, metadata) - } - - /// Extract metadata from the document head. 
- fn extract_metadata(&self, document: &Html) -> HtmlMetadata { - let mut meta = HtmlMetadata::default(); - - // Extract title - if let Ok(selector) = Selector::parse("title") { - if let Some(title_elem) = document.select(&selector).next() { - meta.title = title_elem.text().collect::(); - } - } - - // Extract meta description - if let Ok(selector) = Selector::parse("meta[name=\"description\"]") { - if let Some(desc_elem) = document.select(&selector).next() { - if let Some(content) = desc_elem.value().attr("content") { - meta.description = Some(content.to_string()); - } - } - } - - // Extract meta author - if let Ok(selector) = Selector::parse("meta[name=\"author\"]") { - if let Some(author_elem) = document.select(&selector).next() { - if let Some(content) = author_elem.value().attr("content") { - meta.author = Some(content.to_string()); - } - } - } - - // Extract meta keywords - if let Ok(selector) = Selector::parse("meta[name=\"keywords\"]") { - if let Some(keywords_elem) = document.select(&selector).next() { - if let Some(content) = keywords_elem.value().attr("content") { - meta.keywords = Some(content.to_string()); - } - } - } - - // Also try Open Graph description - if meta.description.is_none() { - if let Ok(selector) = Selector::parse("meta[property=\"og:description\"]") { - if let Some(og_elem) = document.select(&selector).next() { - if let Some(content) = og_elem.value().attr("content") { - meta.description = Some(content.to_string()); - } - } - } - } - - meta - } - - /// Extract nodes from the document. - fn extract_nodes_from_document(&self, document: &Html) -> Vec { - let mut nodes = Vec::new(); - - // Parse body selector - let body_selector = match Selector::parse("body") { - Ok(s) => s, - Err(_) => return nodes, - }; - - let body = match document.select(&body_selector).next() { - Some(b) => b, - None => return nodes, - }; - - // Collect all headings in order - let heading_selector = Selector::parse("h1, h2, h3, h4, h5, h6").unwrap(); - - let mut headings: Vec<(usize, String, usize)> = Vec::new(); // (index, title, level) - - for (idx, heading) in body.select(&heading_selector).enumerate() { - let level = self.get_heading_level(heading.value().name()); - if let Some(lvl) = level { - if lvl <= self.config.max_heading_level { - let title: String = heading.text().collect(); - if !title.trim().is_empty() { - headings.push((idx, title.trim().to_string(), lvl)); - } - } - } - } - - // If no headings found, try to extract content anyway - if headings.is_empty() { - let content = self.extract_body_content(body); - if !content.trim().is_empty() { - nodes.push(RawNode { - title: self.config.default_title.clone(), - content: content.trim().to_string(), - level: 0, - line_start: 1, - line_end: 1, - page: None, - token_count: Some(estimate_tokens(&content)), - total_token_count: None, - }); - } - return nodes; - } - - // Extract content between headings - for (i, (_, title, level)) in headings.iter().enumerate() { - let content = self.extract_content_after_heading(body, &headings, i); - - if !title.is_empty() || !content.trim().is_empty() { - nodes.push(RawNode { - title: title.clone(), - content: content.trim().to_string(), - level: *level, - line_start: 1, - line_end: 1, - page: None, - token_count: Some(estimate_tokens(&content)), - total_token_count: None, - }); - } - } - - // Post-process nodes - self.finalize_nodes(nodes) - } - - /// Get heading level from tag name (h1-h6). 
- fn get_heading_level(&self, tag: &str) -> Option { - match tag { - "h1" => Some(1), - "h2" => Some(2), - "h3" => Some(3), - "h4" => Some(4), - "h5" => Some(5), - "h6" => Some(6), - _ => None, - } - } - - /// Extract body content (for documents without headings). - fn extract_body_content(&self, body: ElementRef) -> String { - let mut content = String::new(); - - // Extract paragraphs - if let Ok(selector) = Selector::parse("p") { - for p in body.select(&selector) { - let text: String = p.text().collect(); - if !text.trim().is_empty() { - if !content.is_empty() { - content.push_str("\n\n"); - } - content.push_str(text.trim()); - } - } - } - - content - } - - /// Extract content after a heading until the next heading. - fn extract_content_after_heading( - &self, - body: ElementRef, - headings: &[(usize, String, usize)], - heading_index: usize, - ) -> String { - let mut content = String::new(); - - // Get all content elements - let content_selector = Selector::parse("p, ul, ol, table, pre, blockquote, div.content, article, section") - .unwrap(); - - // This is a simplified approach - extract content from sibling elements - // In a more sophisticated implementation, we would track DOM positions - for elem in body.select(&content_selector) { - let text = self.extract_element_content(elem); - if !text.is_empty() { - if !content.is_empty() { - content.push_str("\n\n"); - } - content.push_str(&text); - } - } - - content - } - - /// Extract content from a single element. - fn extract_element_content(&self, elem: ElementRef) -> String { - let tag = elem.value().name(); - - match tag { - "p" | "div" | "article" | "section" => { - let text: String = elem.text().collect(); - text.trim().to_string() - } - "ul" => self.extract_list(elem, false), - "ol" => self.extract_list(elem, true), - "table" => self.extract_table(elem), - "pre" | "code" if self.config.include_code_blocks => { - let text: String = elem.text().collect(); - if !text.trim().is_empty() { - format!("```\n{}\n```", text.trim()) - } else { - String::new() - } - } - "blockquote" => { - let text: String = elem.text().collect(); - if !text.trim().is_empty() { - text - .lines() - .map(|line| format!("> {}", line)) - .collect::>() - .join("\n") - } else { - String::new() - } - } - _ => String::new(), - } - } - - /// Extract list content. - fn extract_list(&self, element: ElementRef, ordered: bool) -> String { - let mut result = String::new(); - let li_selector = Selector::parse("li").unwrap(); - let mut counter = 1; - - for li in element.select(&li_selector) { - let text: String = li.text().collect(); - if !text.trim().is_empty() { - if !result.is_empty() { - result.push('\n'); - } - if ordered { - result.push_str(&format!("{}. {}", counter, text.trim())); - counter += 1; - } else { - result.push_str(&format!("• {}", text.trim())); - } - } - } - - result - } - - /// Extract table content. - fn extract_table(&self, element: ElementRef) -> String { - let mut result = String::new(); - let tr_selector = Selector::parse("tr").unwrap(); - - for tr in element.select(&tr_selector) { - let mut cells = Vec::new(); - let td_selector = Selector::parse("td, th").unwrap(); - - for cell in tr.select(&td_selector) { - let text: String = cell.text().collect(); - cells.push(text.trim().to_string()); - } - - if !cells.is_empty() { - if !result.is_empty() { - result.push('\n'); - } - result.push_str(&cells.join(" | ")); - } - } - - result - } - - /// Finalize nodes after extraction. 
- fn finalize_nodes(&self, mut nodes: Vec) -> Vec { - // Remove empty nodes - nodes.retain(|n| !n.title.is_empty() || !n.content.trim().is_empty()); - - // Merge small consecutive nodes if configured - if self.config.merge_small_nodes { - nodes = self.merge_small_nodes(nodes); - } - - nodes - } - - /// Merge small consecutive nodes. - fn merge_small_nodes(&self, nodes: Vec) -> Vec { - let mut result: Vec = Vec::new(); - - for node in nodes { - if let Some(last) = result.last_mut() { - // Merge if same level and content is small - if last.level == node.level && last.content.len() < self.config.min_content_length - { - if !last.content.is_empty() { - last.content.push_str("\n\n"); - } - last.content.push_str(&node.content); - continue; - } - } - result.push(node); - } - - result - } -} - -#[async_trait] -impl DocumentParser for HtmlParser { - fn format(&self) -> DocumentFormat { - DocumentFormat::Html - } - - async fn parse(&self, content: &str) -> Result { - let line_count = content.lines().count(); - let (nodes, html_meta) = self.extract_nodes(content); - - let meta = DocumentMeta { - name: html_meta.title, - format: DocumentFormat::Html, - page_count: None, - line_count, - source_path: None, - description: html_meta.description, - }; - - Ok(ParseResult::new(meta, nodes)) - } - - async fn parse_file(&self, path: &Path) -> Result { - let content = tokio::fs::read_to_string(path) - .await - .map_err(|e| crate::Error::Parse(format!("Failed to read file: {}", e)))?; - - let mut result = self.parse(&content).await?; - - // Extract document name from filename (if not set by meta) - if result.meta.name.is_empty() { - if let Some(stem) = path.file_stem() { - result.meta.name = stem.to_string_lossy().to_string(); - } - } - result.meta.source_path = Some(path.to_string_lossy().to_string()); - - Ok(result) - } -} - -#[cfg(test)] -mod tests { - use super::*; - - #[tokio::test] - async fn test_parse_simple_html() { - let parser = HtmlParser::new(); - let html = r#" - Test Document - -
<h1>Main Title</h1> - <p>This is a paragraph.</p> - <h2>Section 1</h2> - <p>Section content.</p> - </body> - </html> - "#; - - let result = parser.parse(html).await.unwrap(); - - assert_eq!(result.meta.name, "Test Document"); - assert!(!result.nodes.is_empty()); - } - - #[tokio::test] - async fn test_parse_headings() { - let parser = HtmlParser::new(); - let html = r#" -
<h1>H1 Title</h1> - <p>Content 1</p> - <h2>H2 Title</h2> - <p>Content 2</p> - <h3>H3 Title</h3> - <p>Content 3</p>
- "#; - - let result = parser.parse(html).await.unwrap(); - - let heading_nodes: Vec<_> = result.nodes.iter().filter(|n| n.level > 0).collect(); - assert!(heading_nodes.len() >= 3); - } - - #[tokio::test] - async fn test_parse_metadata() { - let parser = HtmlParser::new(); - let html = r#" - - My Page - - - -

Content

- "#; - - let result = parser.parse(html).await.unwrap(); - - assert_eq!(result.meta.name, "My Page"); - assert_eq!(result.meta.description, Some("A test page".to_string())); - } - - #[tokio::test] - async fn test_parse_list() { - let parser = HtmlParser::new(); - let html = r#" -
<h1>List Example</h1> - <ul> -   <li>Item 1</li> -   <li>Item 2</li> -   <li>Item 3</li> - </ul>
- "#; - - let result = parser.parse(html).await.unwrap(); - - let list_node = result.nodes.iter().find(|n| n.title == "List Example"); - assert!(list_node.is_some()); - } - - #[tokio::test] - async fn test_parse_table() { - let parser = HtmlParser::new(); - let html = r#" -
<h1>Table Example</h1> - <table> - <tr><th>Name</th><th>Age</th></tr> - <tr><td>Alice</td><td>30</td></tr> - </table>
- "#; - - let result = parser.parse(html).await.unwrap(); - - let table_node = result.nodes.iter().find(|n| n.title == "Table Example"); - assert!(table_node.is_some()); - } - - #[tokio::test] - async fn test_empty_document() { - let parser = HtmlParser::new(); - let result = parser.parse("").await.unwrap(); - - assert!(result.nodes.is_empty()); - } - - #[tokio::test] - async fn test_no_headings() { - let parser = HtmlParser::new(); - let html = r#" -
<p>Just some text.</p> - <p>More text.</p>
- "#; - - let result = parser.parse(html).await.unwrap(); - - // Should create a default node - assert_eq!(result.nodes.len(), 1); - assert_eq!(result.nodes[0].title, "Introduction"); - } -} diff --git a/rust/src/parser/mod.rs b/rust/src/parser/mod.rs deleted file mode 100644 index 7bb952d7..00000000 --- a/rust/src/parser/mod.rs +++ /dev/null @@ -1,71 +0,0 @@ -// Copyright (c) 2026 vectorless developers -// SPDX-License-Identifier: Apache-2.0 - -//! Document parsing module. -//! -//! This module provides parsers for different document formats. -//! Each parser extracts [`RawNode`]s from documents that can then be -//! organized into a [`DocumentTree`]. -//! -//! # Supported Formats -//! -//! - **Markdown** - Full support via [`MarkdownParser`] -//! - **PDF** - Full support via [`PdfParser`] with TOC extraction -//! - **DOCX** - Full support via [`DocxParser`] with heading detection -//! - **HTML** - Full support via [`HtmlParser`] with heading hierarchy -//! -//! # Example -//! -//! ```rust,no_run -//! use vectorless::parser::{DocumentParser, MarkdownParser, DocumentFormat}; -//! -//! # #[tokio::main] -//! # async fn main() -> vectorless::Result<()> { -//! // Create a parser -//! let parser = MarkdownParser::new(); -//! -//! // Parse content -//! let content = "# Title\n\nContent here."; -//! let result = parser.parse(content).await?; -//! -//! println!("Extracted {} nodes", result.node_count()); -//! for node in &result.nodes { -//! println!(" - {} (level {})", node.title, node.level); -//! } -//! # Ok(()) -//! # } -//! ``` - -mod registry; -mod traits; -mod types; - -// Markdown parsing module -pub mod markdown; - -// PDF parsing module -pub mod pdf; - -// HTML parsing module -pub mod html; - -// TOC processing module -pub mod toc; - -// DOCX parsing module -pub mod docx; - -// Re-export main types -pub use types::{DocumentFormat, DocumentMeta, ParseResult, RawNode}; - -// Re-export parser trait -pub use traits::DocumentParser; - -// Re-export registry and convenience functions -pub use registry::{ParserRegistry, get_parser, get_parser_for_file, parse_content, parse_file}; - -// Re-export concrete parsers -pub use docx::DocxParser; -pub use html::{HtmlConfig, HtmlParser}; -pub use markdown::{MarkdownConfig, MarkdownParser}; -pub use pdf::PdfParser; diff --git a/rust/src/parser/registry.rs b/rust/src/parser/registry.rs deleted file mode 100644 index 4f3d0e8c..00000000 --- a/rust/src/parser/registry.rs +++ /dev/null @@ -1,299 +0,0 @@ -// Copyright (c) 2026 vectorless developers -// SPDX-License-Identifier: Apache-2.0 - -//! Parser registry for managing document parsers. -//! -//! This module provides: -//! - [`ParserRegistry`] - A registry for document parsers with dynamic registration -//! - Module-level functions for quick parsing without registry setup - -use std::collections::HashMap; -use std::path::Path; -use std::sync::{Arc, RwLock}; - -use crate::Error; -use crate::error::Result; -use crate::parser::{ - DocumentFormat, DocumentParser, HtmlParser, MarkdownParser, ParseResult, PdfParser, -}; - -/// Type alias for parser factory functions. -type ParserFactory = Box Box + Send + Sync>; - -/// Registry for document parsers. -/// -/// Parsers can be registered by format and retrieved at runtime. 
-///
-/// # Example
-///
-/// ```rust
-/// use vectorless::parser::ParserRegistry;
-///
-/// // Create with default parsers
-/// let registry = ParserRegistry::with_defaults();
-///
-/// // Or create empty and register custom parsers
-/// let registry = ParserRegistry::new();
-/// ```
-pub struct ParserRegistry {
-    /// Registered parser factories by format.
-    factories: Arc<RwLock<HashMap<DocumentFormat, Box<dyn Fn() -> Box<dyn DocumentParser> + Send + Sync>>>>,
-}
-
-impl std::fmt::Debug for ParserRegistry {
-    fn fmt(&self, f: &mut std::fmt::Formatter<'_>) -> std::fmt::Result {
-        let factories = self.factories.read().unwrap();
-        let formats: Vec<_> = factories.keys().collect();
-        f.debug_struct("ParserRegistry")
-            .field("formats", &formats)
-            .finish()
-    }
-}
-
-impl ParserRegistry {
-    /// Create a new empty registry.
-    pub fn new() -> Self {
-        Self {
-            factories: Arc::new(RwLock::new(HashMap::new())),
-        }
-    }
-
-    /// Create a registry with default parsers.
-    pub fn with_defaults() -> Self {
-        let registry = Self::new();
-        registry.register_defaults();
-        registry
-    }
-
-    /// Register default parsers (Markdown, PDF, HTML, DOCX).
-    pub fn register_defaults(&self) {
-        self.register("markdown", || Box::new(MarkdownParser::new()));
-        self.register("pdf", || Box::new(PdfParser::new()));
-        self.register("html", || Box::new(HtmlParser::new()));
-        self.register("docx", || Box::new(super::docx::DocxParser::new()));
-    }
-
-    /// Register a parser factory by name.
-    pub fn register<F>(&self, name: &str, factory: F)
-    where
-        F: Fn() -> Box<dyn DocumentParser> + Send + Sync + 'static,
-    {
-        // Create a temporary parser to get the format
-        let parser = factory();
-        let format = parser.format();
-
-        let mut factories = self.factories.write().unwrap();
-        factories.insert(format, Box::new(factory));
-
-        let _ = name; // Name is for documentation purposes
-    }
-
-    /// Get a parser by format.
-    pub fn get(&self, format: DocumentFormat) -> Option<Box<dyn DocumentParser>> {
-        let factories = self.factories.read().unwrap();
-        factories.get(&format).map(|f| f())
-    }
-
-    /// Check if a format is supported.
-    pub fn supports(&self, format: DocumentFormat) -> bool {
-        let factories = self.factories.read().unwrap();
-        factories.contains_key(&format)
-    }
-
-    /// List supported formats.
-    pub fn supported_formats(&self) -> Vec<DocumentFormat> {
-        let factories = self.factories.read().unwrap();
-        factories.keys().copied().collect()
-    }
-
-    /// Parse content using the appropriate parser.
-    pub async fn parse(&self, content: &str, format: DocumentFormat) -> Result<ParseResult> {
-        let parser = self
-            .get(format)
-            .ok_or_else(|| Error::Parse(format!("Unsupported format: {:?}", format)))?;
-        parser.parse(content).await
-    }
-
-    /// Parse a file using the appropriate parser.
-    pub async fn parse_file(&self, path: &Path) -> Result<ParseResult> {
-        let ext = path
-            .extension()
-            .and_then(|e| e.to_str())
-            .ok_or_else(|| Error::Parse("Could not determine file extension".to_string()))?;
-
-        let format = DocumentFormat::from_extension(ext)
-            .ok_or_else(|| Error::Parse(format!("Unknown format: {}", ext)))?;
-
-        self.parse_file_as(path, format).await
-    }
-
-    /// Parse a file with a specific format.
-    pub async fn parse_file_as(&self, path: &Path, format: DocumentFormat) -> Result<ParseResult> {
-        let parser = self
-            .get(format)
-            .ok_or_else(|| Error::Parse(format!("Unsupported format: {:?}", format)))?;
-        parser.parse_file(path).await
-    }
-
-    /// Parse binary data using the appropriate parser.
-    ///
-    /// For text-based formats, the bytes are converted to UTF-8 string first.
-    /// For binary formats (PDF, DOCX), the parser handles the bytes directly.
-    pub async fn parse_bytes(&self, bytes: &[u8], format: DocumentFormat) -> Result<ParseResult> {
-        match format {
-            DocumentFormat::Markdown | DocumentFormat::Html => {
-                // Text formats - convert to string first
-                let content = std::str::from_utf8(bytes)
-                    .map_err(|e| Error::Parse(format!("Invalid UTF-8 content: {}", e)))?;
-                self.parse(content, format).await
-            }
-            DocumentFormat::Pdf | DocumentFormat::Docx => {
-                // Binary formats - write to temp file and parse
-                // This is a temporary solution until parsers support bytes directly
-                let temp_dir = std::env::temp_dir();
-                let ext = format.extension();
-                let temp_file =
-                    temp_dir.join(format!("vectorless_temp_{}.{}", uuid::Uuid::new_v4(), ext));
-
-                std::fs::write(&temp_file, bytes)
-                    .map_err(|e| Error::Parse(format!("Failed to write temp file: {}", e)))?;
-
-                let result = self.parse_file_as(&temp_file, format).await;
-
-                // Clean up temp file
-                let _ = std::fs::remove_file(&temp_file);
-
-                result
-            }
-        }
-    }
-}
-
-impl Default for ParserRegistry {
-    fn default() -> Self {
-        Self::with_defaults()
-    }
-}
-
-// =============================================================================
-// Module-level convenience functions
-// =============================================================================
-
-/// Get a parser for the given format.
-///
-/// Returns `None` if the format is not supported.
-pub fn get_parser(format: DocumentFormat) -> Option<Box<dyn DocumentParser>> {
-    match format {
-        DocumentFormat::Markdown => Some(Box::new(MarkdownParser::new())),
-        DocumentFormat::Pdf => Some(Box::new(PdfParser::new())),
-        DocumentFormat::Html => Some(Box::new(HtmlParser::new())),
-        DocumentFormat::Docx => Some(Box::new(super::docx::DocxParser::new())),
-    }
-}
-
-/// Get a parser for a file based on its extension.
-///
-/// Returns `None` if the extension is not recognized or not supported.
-pub fn get_parser_for_file(path: &Path) -> Option<Box<dyn DocumentParser>> {
-    let ext = path.extension()?.to_str()?;
-    let format = DocumentFormat::from_extension(ext)?;
-    get_parser(format)
-}
-
-/// Parse a document from content using the appropriate parser.
-///
-/// # Arguments
-///
-/// * `content` - The document content
-/// * `format` - The document format
-///
-/// # Returns
-///
-/// A [`ParseResult`] containing the extracted nodes.
-pub async fn parse_content(content: &str, format: DocumentFormat) -> Result<ParseResult> {
-    let parser = get_parser(format)
-        .ok_or_else(|| Error::Parse(format!("Unsupported format: {:?}", format)))?;
-    parser.parse(content).await
-}
-
-/// Parse a document from a file.
-///
-/// # Arguments
-///
-/// * `path` - Path to the file
-///
-/// # Returns
-///
-/// A [`ParseResult`] containing the extracted nodes.
-pub async fn parse_file(path: &Path) -> Result<ParseResult> {
-    let parser = get_parser_for_file(path)
-        .ok_or_else(|| Error::Parse(format!("Unsupported file: {:?}", path)))?;
-    parser.parse_file(path).await
-}
-
-#[cfg(test)]
-mod tests {
-    use super::*;
-
-    #[test]
-    fn test_registry_defaults() {
-        let registry = ParserRegistry::with_defaults();
-        assert!(registry.supports(DocumentFormat::Markdown));
-    }
-
-    #[test]
-    fn test_supported_formats() {
-        let registry = ParserRegistry::with_defaults();
-        let formats = registry.supported_formats();
-        assert!(formats.contains(&DocumentFormat::Markdown));
-        assert!(formats.contains(&DocumentFormat::Html));
-    }
-
-    #[test]
-    fn test_get_parser() {
-        let registry = ParserRegistry::with_defaults();
-        let parser = registry.get(DocumentFormat::Markdown);
-        assert!(parser.is_some());
-    }
-
-    #[test]
-    fn test_unsupported_format() {
-        let registry = ParserRegistry::new(); // Empty registry
-        let parser = registry.get(DocumentFormat::Pdf);
-        assert!(parser.is_none());
-    }
-
-    #[test]
-    fn test_pdf_parser_registered() {
-        let registry = ParserRegistry::with_defaults();
-        assert!(registry.supports(DocumentFormat::Pdf));
-        let parser = registry.get(DocumentFormat::Pdf);
-        assert!(parser.is_some());
-    }
-
-    #[test]
-    fn test_html_parser_registered() {
-        let registry = ParserRegistry::with_defaults();
-        assert!(registry.supports(DocumentFormat::Html));
-        let parser = registry.get(DocumentFormat::Html);
-        assert!(parser.is_some());
-    }
-
-    #[test]
-    fn test_get_parser_function() {
-        let parser = get_parser(DocumentFormat::Markdown);
-        assert!(parser.is_some());
-    }
-
-    #[test]
-    fn test_get_parser_for_file() {
-        let parser = get_parser_for_file(Path::new("test.md"));
-        assert!(parser.is_some());
-    }
-
-    #[test]
-    fn test_get_html_parser_for_file() {
-        let parser = get_parser_for_file(Path::new("test.html"));
-        assert!(parser.is_some());
-    }
-}
diff --git a/rust/src/parser/toc/mod.rs b/rust/src/parser/toc/mod.rs
deleted file mode 100644
index 99e4861d..00000000
--- a/rust/src/parser/toc/mod.rs
+++ /dev/null
@@ -1,86 +0,0 @@
-// Copyright (c) 2026 vectorless developers
-// SPDX-License-Identifier: Apache-2.0
-
-//! Table of Contents (TOC) processing module.
-//!
-//! This module provides functionality to extract and verify document structure
-//! from PDF Table of Contents:
-//!
-//! - **Detection** — Find TOC in document (regex + LLM fallback)
-//! - **Parsing** — Convert TOC text to structured entries (LLM)
-//! - **Assignment** — Map TOC pages to physical pages
-//! - **Verification** — Sample verification of page assignments
-//! - **Repair** — Fix incorrect assignments
-//!
-//! # Architecture
-//!
-//! ```text
-//! PDF Pages
-//!     │
-//!     ▼
-//! ┌─────────────────────────────────────────────────┐
-//! │                  TocProcessor                   │
-//! │                                                 │
-//! │  ┌─────────┐  ┌─────────┐  ┌─────────┐          │
-//! │  │Detector │─▶│ Parser  │─▶│Assigner │          │
-//! │  └─────────┘  └─────────┘  └────┬────┘          │
-//! │                                 │               │
-//! │                                 ▼               │
-//! │                          ┌─────────────┐        │
-//! │                          │  Verifier   │        │
-//! │                          └──────┬──────┘        │
-//! │                                 │               │
-//! │                                 ▼               │
-//! │                          ┌─────────────┐        │
-//! │                          │  Repairer   │        │
-//! │                          └─────────────┘        │
-//! └─────────────────────────────────────────────────┘
-//!     │
-//!     ▼
-//! Vec<TocEntry>
-//! ```
-//!
-//! # Example
-//!
-//! ```rust,no_run
-//! use vectorless::parser::toc::TocProcessor;
-//! use vectorless::parser::pdf::{PdfParser, PdfPage};
-//!
-//! # #[tokio::main]
-//! # async fn main() -> vectorless::Result<()> {
-//! // Parse PDF
-//! let pdf_parser = PdfParser::new();
-//! let result = pdf_parser.parse_file("document.pdf".as_ref()).await?;
-//!
-//! // Extract TOC
-//! let processor = TocProcessor::new();
-//! let entries = processor.process(&result.pages).await?;
-//!
-//! // Use entries
-//! for entry in &entries {
-//!     println!("{} - Page {:?}", entry.title, entry.physical_page);
-//! }
-//! # Ok(())
-//! # }
-//! ```
-
-mod assigner;
-mod detector;
-mod parser;
-mod processor;
-mod repairer;
-mod types;
-mod verifier;
-
-// Re-export main types
-pub use types::{
-    ErrorType, PageOffset, TocDetection, TocEntry, VerificationError, VerificationReport,
-};
-
-// Re-export components
-pub use assigner::{PageAssigner, PageAssignerConfig};
-pub use detector::{TocDetector, TocDetectorConfig};
-pub use parser::{TocParser, TocParserConfig};
-pub use processor::{TocProcessor, TocProcessorConfig};
-pub use repairer::{IndexRepairer, RepairerConfig};
-pub use verifier::{IndexVerifier, VerifierConfig};
diff --git a/rust/src/parser/traits.rs b/rust/src/parser/traits.rs
deleted file mode 100644
index e93cef70..00000000
--- a/rust/src/parser/traits.rs
+++ /dev/null
@@ -1,61 +0,0 @@
-// Copyright (c) 2026 vectorless developers
-// SPDX-License-Identifier: Apache-2.0
-
-//! Parser trait definition.
-
-use async_trait::async_trait;
-use std::path::Path;
-
-use super::{DocumentFormat, ParseResult};
-use crate::error::Result;
-
-/// A parser for extracting content from documents.
-///
-/// Implementations parse different document formats and produce
-/// a sequence of raw nodes that can be organized into a tree.
-///
-/// # Example
-///
-/// ```rust
-/// use vectorless::parser::{DocumentParser, MarkdownParser};
-///
-/// # #[tokio::main]
-/// # async fn main() -> vectorless::Result<()> {
-/// let parser = MarkdownParser::new();
-/// let content = "# Title\n\nContent here.";
-/// let result = parser.parse(content).await?;
-/// println!("Found {} nodes", result.node_count());
-/// # Ok(())
-/// # }
-/// ```
-#[async_trait]
-pub trait DocumentParser: Send + Sync {
-    /// Get the document format this parser handles.
-    fn format(&self) -> DocumentFormat;
-
-    /// Parse content from a string.
-    ///
-    /// # Arguments
-    ///
-    /// * `content` - The document content as a string
-    ///
-    /// # Returns
-    ///
-    /// A [`ParseResult`] containing extracted nodes and metadata.
-    async fn parse(&self, content: &str) -> Result<ParseResult>;
-
-    /// Parse content from a file.
-    ///
-    /// Default implementation reads the file and calls [`parse`](Self::parse).
-    ///
-    /// # Arguments
-    ///
-    /// * `path` - Path to the file
-    async fn parse_file(&self, path: &Path) -> Result<ParseResult> {
-        let content = tokio::fs::read_to_string(path)
-            .await
-            .map_err(|e| crate::Error::Parse(format!("Failed to read file: {}", e)))?;
-
-        self.parse(&content).await
-    }
-}
diff --git a/rust/src/retrieval/cache/hot_tracker.rs b/rust/src/retrieval/cache/hot_tracker.rs
index bad19bdd..a284e065 100644
--- a/rust/src/retrieval/cache/hot_tracker.rs
+++ b/rust/src/retrieval/cache/hot_tracker.rs
@@ -10,8 +10,8 @@
 use std::collections::HashMap;
 use std::sync::RwLock;
 
-use crate::document::NodeId;
 use crate::document::HotNodeEntry;
+use crate::document::NodeId;
 
 /// Thread-safe tracker for hot (frequently retrieved) nodes.
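Aside: since `DocumentParser` is deleted above with no replacement visible in this diff, here is a minimal sketch of what implementing the removed trait looked like. `PlainTextParser` and its body are hypothetical; only the trait surface (`format`, `parse`, the `parse_file` default) comes from the deleted file.

```rust
use async_trait::async_trait;

// Hypothetical example against the *removed* trait; `DocumentParser`,
// `DocumentFormat`, `ParseResult`, and `Result` are the types from the
// deleted parser module above.
struct PlainTextParser;

#[async_trait]
impl DocumentParser for PlainTextParser {
    fn format(&self) -> DocumentFormat {
        // No plain-text variant existed; Markdown stands in for the sketch.
        DocumentFormat::Markdown
    }

    async fn parse(&self, _content: &str) -> Result<ParseResult> {
        // A real implementation would split `_content` into raw nodes here.
        todo!("turn the content into a ParseResult")
    }

    // `parse_file` is inherited from the trait's default implementation,
    // which reads the file as UTF-8 and delegates to `parse`.
}
```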
pub struct HotNodeTracker { @@ -60,9 +60,7 @@ impl HotNodeTracker { pub fn is_hot(&self, node_id: NodeId) -> bool { self.inner .read() - .map(|inner| { - inner.hits.get(&node_id).copied().unwrap_or(0) >= self.hot_threshold - }) + .map(|inner| inner.hits.get(&node_id).copied().unwrap_or(0) >= self.hot_threshold) .unwrap_or(false) } @@ -174,11 +172,7 @@ mod tests { let (node_a, node_b, node_c) = make_node_ids(); - let hits = vec![ - (node_a, 0.9), - (node_b, 0.8), - (node_c, 0.7), - ]; + let hits = vec![(node_a, 0.9), (node_b, 0.8), (node_c, 0.7)]; tracker.record_hits(&hits); assert!(tracker.is_hot(node_a)); diff --git a/rust/src/retrieval/cache/mod.rs b/rust/src/retrieval/cache/mod.rs index 34202fd8..60eabb56 100644 --- a/rust/src/retrieval/cache/mod.rs +++ b/rust/src/retrieval/cache/mod.rs @@ -15,7 +15,4 @@ mod path_cache; mod reasoning_cache; pub use hot_tracker::HotNodeTracker; -pub use path_cache::PathCache; -pub use reasoning_cache::{ - CachedCandidate, ReasoningCache, ReasoningCacheConfig, ReasoningCacheStats, -}; +pub use reasoning_cache::{CachedCandidate, ReasoningCache}; diff --git a/rust/src/retrieval/cache/reasoning_cache.rs b/rust/src/retrieval/cache/reasoning_cache.rs index 6dc87f87..f4d98397 100644 --- a/rust/src/retrieval/cache/reasoning_cache.rs +++ b/rust/src/retrieval/cache/reasoning_cache.rs @@ -21,7 +21,6 @@ use std::sync::RwLock; use std::time::Instant; use crate::document::NodeId; -use crate::retrieval::pipeline::CandidateNode; use crate::utils::fingerprint::Fingerprint; /// A tiered reasoning cache for the retrieval pipeline. @@ -155,11 +154,7 @@ impl ReasoningCache { /// /// Returns cached candidates if the same query was executed before /// on the same document scope. - pub fn l1_get( - &self, - query: &str, - scope_fp: &Fingerprint, - ) -> Option> { + pub fn l1_get(&self, query: &str, scope_fp: &Fingerprint) -> Option> { let query_fp = Fingerprint::from_str(query); let l1 = self.l1.read().ok()?; let entry = l1.entries.get(&query_fp)?; @@ -271,12 +266,7 @@ impl ReasoningCache { } /// Store a strategy score for a node. - pub fn l3_store( - &self, - node_content_fp: Fingerprint, - score: f32, - strategy: String, - ) { + pub fn l3_store(&self, node_content_fp: Fingerprint, score: f32, strategy: String) { if let Ok(mut l3) = self.l3.write() { if l3.entries.len() >= self.config.l3_max { Self::evict_lru_fingerprint_l3(&mut l3); diff --git a/rust/src/retrieval/content/aggregator.rs b/rust/src/retrieval/content/aggregator.rs index 04464930..be7028ae 100644 --- a/rust/src/retrieval/content/aggregator.rs +++ b/rust/src/retrieval/content/aggregator.rs @@ -13,12 +13,10 @@ use tracing::{debug, info}; use crate::document::{DocumentTree, NodeId}; use crate::utils::estimate_tokens; -use super::budget::{AllocationResult, AllocationStrategy, BudgetAllocator, SelectedContent}; -use super::builder::{ContentMetadata, StructureBuilder, StructuredContent}; -use super::config::{ContentAggregatorConfig, OutputFormatConfig, ScoringStrategyConfig}; -use super::scorer::{ - ContentChunk, ContentRelevance, RelevanceScorer, ScoreComponents, ScoringContext, -}; +use super::budget::{AllocationStrategy, BudgetAllocator}; +use super::builder::{ContentMetadata, StructureBuilder}; +use super::config::ContentAggregatorConfig; +use super::scorer::{ContentChunk, RelevanceScorer, ScoringContext}; /// Candidate node from retrieval. 
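Aside: after this pruning the public cache surface is essentially `HotNodeTracker`, `ReasoningCache`, and `CachedCandidate`. A sketch of how the retained tiers are driven, based only on the `l1_get` and `l3_store` signatures reshaped above; the import paths are assumptions.

```rust
use vectorless::retrieval::cache::ReasoningCache; // path assumed public
use vectorless::utils::fingerprint::Fingerprint; // path as used in the diff's imports

fn probe(cache: &ReasoningCache, query: &str, scope_fp: &Fingerprint) {
    // L1: whole-query reuse, keyed by the query plus the document scope.
    if let Some(candidates) = cache.l1_get(query, scope_fp) {
        println!("L1 hit: {} cached candidates", candidates.len());
        return;
    }

    // L3: per-node strategy scores, keyed by a node-content fingerprint.
    let node_fp = Fingerprint::from_str("node content goes here");
    cache.l3_store(node_fp, 0.82, "keyword".to_string());
}
```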
#[derive(Debug, Clone)] diff --git a/rust/src/retrieval/content/budget.rs b/rust/src/retrieval/content/budget.rs index 830a7685..82831603 100644 --- a/rust/src/retrieval/content/budget.rs +++ b/rust/src/retrieval/content/budget.rs @@ -522,107 +522,3 @@ impl Default for BudgetAllocator { Self::new(4000) } } - -#[cfg(test)] -mod tests { - use super::*; - use crate::retrieval::content::{ContentChunk, ScoreComponents}; - use indextree::Arena; - - fn make_test_node_id() -> NodeId { - let mut arena = Arena::new(); - let node = crate::document::TreeNode { - title: "Test".to_string(), - structure: String::new(), - content: String::new(), - summary: String::new(), - depth: 0, - start_index: 0, - end_index: 0, - start_page: None, - end_page: None, - node_id: None, - physical_index: None, - token_count: None, - references: Vec::new(), - }; - NodeId(arena.new_node(node)) - } - - fn make_relevance(content: &str, score: f32, depth: usize) -> ContentRelevance { - let chunk = ContentChunk::new( - make_test_node_id(), - "Test".to_string(), - content.to_string(), - depth, - ); - ContentRelevance::new(chunk, score, ScoreComponents::default()) - } - - #[test] - fn test_allocator_creation() { - let allocator = BudgetAllocator::new(1000); - assert_eq!(allocator.total_budget, 1000); - } - - #[test] - fn test_greedy_allocation() { - let allocator = BudgetAllocator::new(100).with_strategy(AllocationStrategy::Greedy); - - let content = vec![ - make_relevance("High score content with enough text", 0.9, 0), - make_relevance("Low score content", 0.3, 0), - ]; - - let result = allocator.allocate(content, 1); - assert!(!result.is_empty()); - assert!(result.tokens_used <= 100); - } - - #[test] - fn test_min_score_filter() { - let allocator = BudgetAllocator::new(1000).with_min_score(0.5); - - let content = vec![ - make_relevance("Good content", 0.8, 0), - make_relevance("Bad content", 0.2, 0), - ]; - - let result = allocator.allocate(content, 1); - assert_eq!(result.selected.len(), 1); - } - - #[test] - fn test_truncation() { - let allocator = BudgetAllocator::new(50); - let truncated = allocator.truncate_content( - "This is a very long piece of content. It has multiple sentences. 
We want to test truncation at sentence boundary.", - 25, // Need at least 20 tokens for truncation - ); - - assert!(truncated.is_some()); - let text = truncated.unwrap(); - // Should truncate and add ellipsis - assert!(text.len() < 200); // Should be truncated - } - - #[test] - fn test_hierarchical_allocation() { - let allocator = BudgetAllocator::new(200) - .with_strategy(AllocationStrategy::Hierarchical { min_per_level: 0.2 }); - - let content = vec![ - make_relevance("Depth 0 content", 0.9, 0), - make_relevance("Depth 1 content A", 0.7, 1), - make_relevance("Depth 1 content B", 0.6, 1), - make_relevance("Depth 2 content", 0.8, 2), - ]; - - let result = allocator.allocate(content, 2); - - // Should have content from multiple depths - let depths: std::collections::HashSet = - result.selected.iter().map(|s| s.depth).collect(); - assert!(depths.len() >= 2); - } -} diff --git a/rust/src/retrieval/content/mod.rs b/rust/src/retrieval/content/mod.rs index 5b02588b..f339f182 100644 --- a/rust/src/retrieval/content/mod.rs +++ b/rust/src/retrieval/content/mod.rs @@ -37,10 +37,5 @@ mod builder; mod config; mod scorer; -pub use aggregator::{AggregationResult, CandidateNode, ContentAggregator}; -pub use budget::{AllocationResult, AllocationStrategy, BudgetAllocator, SelectedContent}; -pub use builder::{ContentTree, OutputFormat, StructureBuilder, StructuredContent}; +pub use aggregator::{CandidateNode, ContentAggregator}; pub use config::{ContentAggregatorConfig, OutputFormatConfig, ScoringStrategyConfig}; -pub use scorer::{ - ContentChunk, ContentRelevance, RelevanceScorer, ScoreComponents, ScoringContext, -}; diff --git a/rust/src/retrieval/content/scorer.rs b/rust/src/retrieval/content/scorer.rs index 6e3c3abb..389225df 100644 --- a/rust/src/retrieval/content/scorer.rs +++ b/rust/src/retrieval/content/scorer.rs @@ -9,7 +9,7 @@ use std::collections::HashMap; use crate::document::NodeId; -use crate::retrieval::search::{extract_keywords, Bm25Params, STOPWORDS}; +use crate::retrieval::search::{Bm25Params, STOPWORDS, extract_keywords}; use crate::utils::estimate_tokens; use super::config::ScoringStrategyConfig; diff --git a/rust/src/retrieval/decompose.rs b/rust/src/retrieval/decompose.rs index 9c547ef8..da928ab1 100644 --- a/rust/src/retrieval/decompose.rs +++ b/rust/src/retrieval/decompose.rs @@ -44,8 +44,6 @@ //! } //! ``` -use std::collections::HashMap; - use serde::{Deserialize, Serialize}; use tracing::{debug, info}; @@ -253,7 +251,10 @@ impl QueryDecomposer { match self.llm_decompose(query).await { Ok(result) => return Ok(result), Err(e) => { - debug!("LLM decomposition failed, falling back to rule-based: {}", e); + debug!( + "LLM decomposition failed, falling back to rule-based: {}", + e + ); } } } @@ -352,7 +353,10 @@ impl QueryDecomposer { if sub_queries.is_empty() { for (pattern, _) in &patterns { if query_lower.contains(pattern) { - let parts: Vec<&str> = query.split(pattern).filter(|s| !s.trim().is_empty()).collect(); + let parts: Vec<&str> = query + .split(pattern) + .filter(|s| !s.trim().is_empty()) + .collect(); if parts.len() > 1 { for (i, part) in parts.iter().enumerate() { sub_queries.push(SubQuery { @@ -406,13 +410,15 @@ If the query is simple enough, return just one sub-query."#; let user = format!("Decompose this query: {}", query); let response = if let Some(ref executor) = self.llm_executor { - executor.complete(system, &user).await.map_err(|e| { - crate::error::Error::Llm(format!("LLM executor error: {}", e)) - })? 
+ executor + .complete(system, &user) + .await + .map_err(|e| crate::error::Error::Llm(format!("LLM executor error: {}", e)))? } else if let Some(ref client) = self.llm_client { - client.complete(system, &user).await.map_err(|e| { - crate::error::Error::Llm(format!("LLM client error: {}", e)) - })? + client + .complete(system, &user) + .await + .map_err(|e| crate::error::Error::Llm(format!("LLM client error: {}", e)))? } else { return Err(crate::error::Error::Config( "No LLM client or executor configured".to_string(), @@ -426,8 +432,8 @@ If the query is simple enough, return just one sub-query."#; reason: String, } - let parsed: DecompositionResponse = - serde_json::from_str(&extract_json(&response)).map_err(|e| { + let parsed: DecompositionResponse = serde_json::from_str(&extract_json(&response)) + .map_err(|e| { crate::error::Error::Llm(format!("Failed to parse decomposition: {}", e)) })?; @@ -584,7 +590,11 @@ impl ResultAggregator { let order = decomposition.execution_order(); let sorted_results: Vec<_> = order .iter() - .filter_map(|&i| results.iter().find(|r| r.query.text == decomposition.sub_queries[i].text)) + .filter_map(|&i| { + results + .iter() + .find(|r| r.query.text == decomposition.sub_queries[i].text) + }) .collect(); // Combine results with section headers @@ -592,11 +602,7 @@ impl ResultAggregator { let mut total_tokens = 0; for result in sorted_results { - let section = format!( - "\n### {}\n\n{}\n", - result.query.text, - result.content - ); + let section = format!("\n### {}\n\n{}\n", result.query.text, result.content); let section_tokens = section.len() / 4; // Rough estimate if total_tokens + section_tokens > self.max_tokens { @@ -644,9 +650,9 @@ mod tests { fn test_rule_based_decomposition() { let decomposer = QueryDecomposer::default(); - let result = decomposer.rule_based_decompose( - "What is the architecture? How does caching work?", - ).unwrap(); + let result = decomposer + .rule_based_decompose("What is the architecture? 
How does caching work?") + .unwrap(); assert!(result.was_decomposed); assert_eq!(result.sub_queries.len(), 2); @@ -654,10 +660,7 @@ mod tests { #[test] fn test_no_decomposition() { - let result = DecompositionResult::no_decomposition( - "What is this?", - "Query is simple", - ); + let result = DecompositionResult::no_decomposition("What is this?", "Query is simple"); assert!(!result.was_decomposed); assert!(!result.is_multi_turn()); diff --git a/rust/src/retrieval/mod.rs b/rust/src/retrieval/mod.rs index d5d65e22..35b4508d 100644 --- a/rust/src/retrieval/mod.rs +++ b/rust/src/retrieval/mod.rs @@ -65,77 +65,13 @@ pub mod stages; pub mod strategy; pub mod sufficiency; -pub use context::{ - ContextBuilder, PruningStrategy, TokenEstimation, format_for_llm, format_for_llm_async, - format_tree_for_llm, format_tree_for_llm_async, -}; +pub use context::{PruningStrategy, TokenEstimation}; pub use pipeline_retriever::PipelineRetriever; -pub use retriever::{RetrievalContext, Retriever, RetrieverError, RetrieverResult}; +pub use retriever::{RetrievalContext, Retriever}; pub use types::*; -pub use types::{LlmCallSummary, ReasoningCandidate, ReasoningChain, ReasoningStep, StageName}; - -// Re-export StrategyPreference as Strategy for convenience -pub use types::StrategyPreference as Strategy; - -// Pipeline exports -pub use pipeline::{ - CandidateNode, ExecutionGroup, FailurePolicy, PipelineContext, RetrievalBudgetController, - RetrievalMetrics, RetrievalOrchestrator, RetrievalStage, SearchAlgorithm, SearchConfig, - StageOutcome, BudgetStatus, -}; - -// Re-export PipelineContext as RetrievalContext for stages (alias for clarity) -pub use pipeline::PipelineContext as StageContext; - -// Stage exports -pub use stages::{AnalyzeStage, EvaluateStage, PlanStage, SearchStage}; - -// Strategy exports -pub use strategy::{ - CrossDocumentConfig, CrossDocumentStrategy, DocumentEntry, DocumentId, DocumentResult, - HybridConfig, HybridStrategy, KeywordStrategy, LlmStrategy, MergeStrategy, - PageRange, PageRangeConfig, PageRangeStrategy, RetrievalStrategy, SemanticStrategy, - StrategyCapabilities, StrategyCost, -}; - -// Search exports -pub use search::{BeamSearch, GreedySearch, SearchConfig as SearchAlgConfig, SearchResult}; // Sufficiency exports -pub use sufficiency::{SufficiencyChecker, SufficiencyLevel, ThresholdChecker}; - -// Complexity exports -pub use complexity::ComplexityDetector; - -// Cache exports -pub use cache::PathCache; -pub use cache::{CachedCandidate, ReasoningCache, ReasoningCacheConfig, ReasoningCacheStats}; - -// Content aggregation exports -pub use content::{ - AggregationResult, AllocationResult, AllocationStrategy, BudgetAllocator, ContentAggregator, - ContentAggregatorConfig, ContentChunk, ContentRelevance, OutputFormat, RelevanceScorer, - ScoreComponents, ScoringStrategyConfig, SelectedContent, StructureBuilder, StructuredContent, -}; - -// Pilot exports -pub use pilot::NoopPilot; -pub use pilot::{ - BudgetConfig, InterventionConfig, InterventionPoint, Pilot, PilotConfig, PilotDecision, - PilotMode, RankedCandidate, SearchDirection, SearchState, -}; - -// Decompose exports (multi-turn retrieval) -pub use decompose::{ - DecompositionConfig, DecompositionResult, QueryDecomposer, ResultAggregator, SubQuery, - SubQueryComplexity, SubQueryResult, SubQueryType, -}; - -// Reference following exports -pub use reference::{ - expand_with_references, FollowedReference, ReferenceConfig, ReferenceExpansion, - ReferenceFollower, -}; +pub use sufficiency::SufficiencyLevel; // Streaming exports -pub use 
stream::{RetrieveEvent, RetrieveEventReceiver, DEFAULT_STREAM_BOUND}; +pub use stream::RetrieveEventReceiver; diff --git a/rust/src/retrieval/pilot/builder.rs b/rust/src/retrieval/pilot/builder.rs index 4b30c7ff..0096cde4 100644 --- a/rust/src/retrieval/pilot/builder.rs +++ b/rust/src/retrieval/pilot/builder.rs @@ -354,7 +354,8 @@ impl ContextBuilder { /// Get the effective max candidates (mode default or override). fn effective_max_candidates(&self) -> usize { - self.max_candidates.unwrap_or_else(|| self.mode.max_candidates()) + self.max_candidates + .unwrap_or_else(|| self.mode.max_candidates()) } /// Get the effective max path depth (mode default or override). @@ -364,17 +365,20 @@ impl ContextBuilder { /// Get the effective include summaries setting (mode default or override). fn effective_include_summaries(&self) -> bool { - self.include_summaries.unwrap_or_else(|| self.mode.include_summaries()) + self.include_summaries + .unwrap_or_else(|| self.mode.include_summaries()) } /// Get the effective max TOC depth (mode default or override). fn effective_max_toc_depth(&self) -> usize { - self.max_toc_depth.unwrap_or_else(|| self.mode.max_toc_depth()) + self.max_toc_depth + .unwrap_or_else(|| self.mode.max_toc_depth()) } /// Get the effective summary truncation length (mode default or override). fn effective_summary_truncation(&self) -> usize { - self.summary_truncation.unwrap_or_else(|| self.mode.summary_truncation()) + self.summary_truncation + .unwrap_or_else(|| self.mode.summary_truncation()) } /// Get the current mode. diff --git a/rust/src/retrieval/pilot/feedback.rs b/rust/src/retrieval/pilot/feedback.rs index 0d2efbdd..495fae34 100644 --- a/rust/src/retrieval/pilot/feedback.rs +++ b/rust/src/retrieval/pilot/feedback.rs @@ -42,12 +42,11 @@ use std::collections::HashMap; use std::path::Path; -use std::sync::atomic::{AtomicU64, Ordering}; use std::sync::Arc; -use std::time::{Duration, Instant}; +use std::sync::atomic::{AtomicU64, Ordering}; use serde::{Deserialize, Serialize}; -use tracing::{debug, info, warn}; +use tracing::{debug, info}; use super::decision::InterventionPoint; @@ -148,16 +147,14 @@ impl ContextStats { if was_correct { self.correct += 1; // Running average - self.avg_confidence_correct = (self.avg_confidence_correct - * (self.correct - 1) as f64 + self.avg_confidence_correct = (self.avg_confidence_correct * (self.correct - 1) as f64 + confidence) / self.correct as f64; } else { let incorrect = self.total - self.correct; - self.avg_confidence_incorrect = (self.avg_confidence_incorrect - * (incorrect - 1) as f64 - + confidence) - / incorrect as f64; + self.avg_confidence_incorrect = + (self.avg_confidence_incorrect * (incorrect - 1) as f64 + confidence) + / incorrect as f64; } } } @@ -333,10 +330,8 @@ impl FeedbackStore { /// Get overall accuracy across all feedback. 
pub fn overall_accuracy(&self) -> f64 { let stats = self.intervention_stats.read().unwrap(); - let total = stats.start.total - + stats.fork.total - + stats.backtrack.total - + stats.evaluate.total; + let total = + stats.start.total + stats.fork.total + stats.backtrack.total + stats.evaluate.total; let correct = stats.start.correct + stats.fork.correct + stats.backtrack.correct @@ -555,9 +550,10 @@ impl PilotLearner { } // Clamp confidence delta - adjustment.confidence_delta = adjustment - .confidence_delta - .clamp(-self.config.max_confidence_delta, self.config.max_confidence_delta); + adjustment.confidence_delta = adjustment.confidence_delta.clamp( + -self.config.max_confidence_delta, + self.config.max_confidence_delta, + ); adjustment } @@ -575,10 +571,8 @@ impl PilotLearner { /// Check if enough feedback has been collected. pub fn has_sufficient_data(&self) -> bool { let stats = self.store.intervention_stats(); - let total = stats.start.total - + stats.fork.total - + stats.backtrack.total - + stats.evaluate.total; + let total = + stats.start.total + stats.fork.total + stats.backtrack.total + stats.evaluate.total; total >= self.config.min_samples } } diff --git a/rust/src/retrieval/pilot/llm_pilot.rs b/rust/src/retrieval/pilot/llm_pilot.rs index de41c052..2e96120f 100644 --- a/rust/src/retrieval/pilot/llm_pilot.rs +++ b/rust/src/retrieval/pilot/llm_pilot.rs @@ -12,15 +12,14 @@ use tracing::{debug, info, warn}; use crate::document::DocumentTree; use crate::llm::{LlmClient, LlmExecutor}; -use crate::memo::{MemoKey, MemoOpType, MemoStore, MemoValue}; -use crate::throttle::ConcurrencyController; +use crate::memo::{MemoKey, MemoStore, MemoValue}; use crate::utils::fingerprint::Fingerprint; use super::budget::BudgetController; use super::builder::ContextBuilder; use super::config::PilotConfig; use super::decision::{InterventionPoint, PilotDecision}; -use super::feedback::{DecisionAdjustment, FeedbackRecord, FeedbackStore, PilotLearner}; +use super::feedback::{FeedbackRecord, FeedbackStore, PilotLearner}; use super::parser::ResponseParser; use super::prompts::PromptBuilder; use super::r#trait::{Pilot, SearchState}; @@ -244,7 +243,11 @@ impl LlmPilot { } /// Compute a cache key for a pilot decision. 
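Aside: the `ContextStats` hunk above maintains its per-outcome confidence averages incrementally. The update is the standard running mean, where `n` already counts the new sample:

```rust
/// new_mean = (old_mean * (n - 1) + x) / n — as in `ContextStats::record` above.
fn running_mean(old_mean: f64, n: u64, x: f64) -> f64 {
    (old_mean * (n - 1) as f64 + x) / n as f64
}

fn main() {
    // Two correct decisions averaged 0.8 confidence; a third arrives at 0.9.
    let updated = running_mean(0.8, 3, 0.9);
    assert!((updated - 2.5 / 3.0).abs() < 1e-12); // ≈ 0.8333
}
```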
- fn compute_cache_key(&self, context: &super::builder::PilotContext, point: InterventionPoint) -> Option { + fn compute_cache_key( + &self, + context: &super::builder::PilotContext, + point: InterventionPoint, + ) -> Option { let store = self.memo_store.as_ref()?; // Build a fingerprint from the context using available methods @@ -295,7 +298,8 @@ impl LlmPilot { if let MemoValue::PilotDecision(decision_value) = cached { debug!("Memo cache hit for pilot decision at {:?}", point); // Convert cached value back to PilotDecision - let decision = self.cached_value_to_decision(decision_value, candidates, point); + let decision = + self.cached_value_to_decision(decision_value, candidates, point); return decision; } } @@ -331,10 +335,22 @@ impl LlmPilot { } } - println!("[DEBUG] LlmPilot::call_llm() - point={:?}, estimated_tokens={}", point, prompt.estimated_tokens); - println!("[DEBUG] LlmPilot::call_llm() - SYSTEM PROMPT:\n{}", prompt.system); - println!("[DEBUG] LlmPilot::call_llm() - USER PROMPT:\n{}", prompt.user); - println!("[DEBUG] LlmPilot::call_llm() - candidates count: {}", candidates.len()); + println!( + "[DEBUG] LlmPilot::call_llm() - point={:?}, estimated_tokens={}", + point, prompt.estimated_tokens + ); + println!( + "[DEBUG] LlmPilot::call_llm() - SYSTEM PROMPT:\n{}", + prompt.system + ); + println!( + "[DEBUG] LlmPilot::call_llm() - USER PROMPT:\n{}", + prompt.user + ); + println!( + "[DEBUG] LlmPilot::call_llm() - candidates count: {}", + candidates.len() + ); debug!( "Calling LLM for {:?} point (estimated: {} tokens)", point, prompt.estimated_tokens @@ -353,7 +369,10 @@ impl LlmPilot { match result { Ok(response) => { - println!("[DEBUG] LlmPilot::call_llm() - RAW LLM RESPONSE:\n{}", response); + println!( + "[DEBUG] LlmPilot::call_llm() - RAW LLM RESPONSE:\n{}", + response + ); // Record usage (estimate output tokens) let output_tokens = self.estimate_tokens(&response); self.budget @@ -361,10 +380,13 @@ impl LlmPilot { // Parse response let mut decision = self.response_parser.parse(&response, candidates, point); - println!("[DEBUG] LlmPilot::call_llm() - PARSED DECISION: confidence={:.2}, ranked={}, direction={:?}, reasoning={}", - decision.confidence, decision.ranked_candidates.len(), + println!( + "[DEBUG] LlmPilot::call_llm() - PARSED DECISION: confidence={:.2}, ranked={}, direction={:?}, reasoning={}", + decision.confidence, + decision.ranked_candidates.len(), std::mem::discriminant(&decision.direction), - decision.reasoning.chars().take(100).collect::()); + decision.reasoning.chars().take(100).collect::() + ); // Apply learner adjustment if available if let Some(ref adj) = adjustment { @@ -388,7 +410,11 @@ impl LlmPilot { if let Some(cache_key) = self.compute_cache_key(context, point) { let decision_value = self.decision_to_cached_value(&decision); let tokens_saved = prompt.estimated_tokens as u64 + output_tokens as u64; - store.put_with_tokens(cache_key, MemoValue::PilotDecision(decision_value), tokens_saved); + store.put_with_tokens( + cache_key, + MemoValue::PilotDecision(decision_value), + tokens_saved, + ); debug!("Memo cache stored for pilot decision at {:?}", point); } } @@ -403,9 +429,14 @@ impl LlmPilot { } /// Convert a PilotDecision to a cacheable value. 
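Aside: `cached_value_to_decision`, reflowed in the next hunk below, rehydrates a memoized decision by giving the remembered selection a full score and decaying every other position harmonically. The scoring rule in isolation:

```rust
// Score rule from `cached_value_to_decision`: 1.0 for the cached pick,
// 0.5 / (i + 1) for every other candidate position i.
fn rehydrated_scores(n: usize, selected_idx: usize) -> Vec<f32> {
    (0..n)
        .map(|i| if i == selected_idx { 1.0 } else { 0.5 / (i + 1) as f32 })
        .collect()
}

fn main() {
    // Five candidates, cached selection at index 2.
    assert_eq!(rehydrated_scores(5, 2), vec![0.5, 0.25, 1.0, 0.125, 0.1]);
}
```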
- fn decision_to_cached_value(&self, decision: &PilotDecision) -> crate::memo::PilotDecisionValue { + fn decision_to_cached_value( + &self, + decision: &PilotDecision, + ) -> crate::memo::PilotDecisionValue { crate::memo::PilotDecisionValue { - selected_idx: decision.ranked_candidates.first() + selected_idx: decision + .ranked_candidates + .first() .map(|c| c.node_id.0.into()) .unwrap_or(0), confidence: decision.confidence, @@ -425,7 +456,11 @@ impl LlmPilot { .enumerate() .map(|(i, c)| super::decision::RankedCandidate { node_id: c.node_id, - score: if i == value.selected_idx { 1.0 } else { 0.5 / (i + 1) as f32 }, + score: if i == value.selected_idx { + 1.0 + } else { + 0.5 / (i + 1) as f32 + }, reason: None, }) .collect(); @@ -505,8 +540,11 @@ impl Pilot for LlmPilot { // Condition 1: Fork point with enough candidates if state.candidates.len() > intervention.fork_threshold { - println!("[DEBUG] LlmPilot::should_intervene() - YES: fork point with {} candidates (threshold={})", - state.candidates.len(), intervention.fork_threshold); + println!( + "[DEBUG] LlmPilot::should_intervene() - YES: fork point with {} candidates (threshold={})", + state.candidates.len(), + intervention.fork_threshold + ); debug!( "Intervening: fork point with {} candidates", state.candidates.len() @@ -516,15 +554,20 @@ impl Pilot for LlmPilot { // Condition 2: Scores are too close (algorithm uncertain) if self.scores_are_close(state) { - println!("[DEBUG] LlmPilot::should_intervene() - YES: scores are close (best={:.2})", state.best_score); + println!( + "[DEBUG] LlmPilot::should_intervene() - YES: scores are close (best={:.2})", + state.best_score + ); debug!("Intervening: scores are close"); return true; } // Condition 3: Low confidence (best score too low) if intervention.is_low_confidence(state.best_score) { - println!("[DEBUG] LlmPilot::should_intervene() - YES: low confidence (best_score={:.2}, threshold={:.2})", - state.best_score, intervention.low_score_threshold); + println!( + "[DEBUG] LlmPilot::should_intervene() - YES: low confidence (best_score={:.2}, threshold={:.2})", + state.best_score, intervention.low_score_threshold + ); debug!( "Intervening: low confidence (best_score={:.2})", state.best_score @@ -539,37 +582,51 @@ impl Pilot for LlmPilot { return true; } - println!("[DEBUG] LlmPilot::should_intervene() - NO: candidates={}, best_score={:.2}", - state.candidates.len(), state.best_score); + println!( + "[DEBUG] LlmPilot::should_intervene() - NO: candidates={}, best_score={:.2}", + state.candidates.len(), + state.best_score + ); false } async fn decide(&self, state: &SearchState<'_>) -> PilotDecision { let point = self.get_intervention_point(state); - println!("[DEBUG] LlmPilot::decide() - intervention_point={:?}, candidates={}", - point, state.candidates.len()); + println!( + "[DEBUG] LlmPilot::decide() - intervention_point={:?}, candidates={}", + point, + state.candidates.len() + ); // Build context let context = self.context_builder.build(state); // Build candidate info with titles - let candidate_info: Vec = state.candidates + let candidate_info: Vec = state + .candidates .iter() .enumerate() .filter_map(|(i, &node_id)| { - state.tree.get(node_id).map(|node| super::parser::CandidateInfo { - node_id, - title: node.title.clone(), - index: i, - }) + state + .tree + .get(node_id) + .map(|node| super::parser::CandidateInfo { + node_id, + title: node.title.clone(), + index: i, + }) }) .collect(); // Make LLM call let decision = self.call_llm(point, &context, &candidate_info).await; - println!("[DEBUG] 
LlmPilot::decide() - result: confidence={:.2}, direction={:?}, ranked={}", - decision.confidence, std::mem::discriminant(&decision.direction), decision.ranked_candidates.len()); + println!( + "[DEBUG] LlmPilot::decide() - result: confidence={:.2}, direction={:?}, ranked={}", + decision.confidence, + std::mem::discriminant(&decision.direction), + decision.ranked_candidates.len() + ); decision } @@ -595,7 +652,10 @@ impl Pilot for LlmPilot { // Get root's children as candidates let node_ids = tree.children(tree.root()); - println!("[DEBUG] LlmPilot::guide_start() - {} root children candidates", node_ids.len()); + println!( + "[DEBUG] LlmPilot::guide_start() - {} root children candidates", + node_ids.len() + ); // Build CandidateInfo with titles let candidates: Vec = node_ids @@ -616,16 +676,20 @@ impl Pilot for LlmPilot { .call_llm(InterventionPoint::Start, &context, &candidates) .await; - println!("[DEBUG] LlmPilot::guide_start() - LLM returned: confidence={:.2}, ranked_candidates={}, reasoning='{}'", + println!( + "[DEBUG] LlmPilot::guide_start() - LLM returned: confidence={:.2}, ranked_candidates={}, reasoning='{}'", decision.confidence, decision.ranked_candidates.len(), - decision.reasoning.chars().take(100).collect::()); + decision.reasoning.chars().take(100).collect::() + ); // Debug: show top ranked candidates for (i, rc) in decision.ranked_candidates.iter().enumerate().take(3) { if let Some(node) = tree.get(rc.node_id) { - println!("[DEBUG] Ranked {}: node_id={:?}, score={:.3}, title='{}'", - i, rc.node_id, rc.score, node.title); + println!( + "[DEBUG] Ranked {}: node_id={:?}, score={:.3}, title='{}'", + i, rc.node_id, rc.score, node.title + ); } } @@ -655,15 +719,19 @@ impl Pilot for LlmPilot { .build_backtrack_context(state, state.path); // Build CandidateInfo - let candidates: Vec = state.candidates + let candidates: Vec = state + .candidates .iter() .enumerate() .filter_map(|(i, &node_id)| { - state.tree.get(node_id).map(|node| super::parser::CandidateInfo { - node_id, - title: node.title.clone(), - index: i, - }) + state + .tree + .get(node_id) + .map(|node| super::parser::CandidateInfo { + node_id, + title: node.title.clone(), + index: i, + }) }) .collect(); diff --git a/rust/src/retrieval/pilot/metrics.rs b/rust/src/retrieval/pilot/metrics.rs index dba2d9b5..b97a1977 100644 --- a/rust/src/retrieval/pilot/metrics.rs +++ b/rust/src/retrieval/pilot/metrics.rs @@ -10,7 +10,7 @@ //! - Decision quality metrics use std::sync::atomic::{AtomicU64, AtomicUsize, Ordering}; -use std::time::{Duration, Instant}; +use std::time::Duration; use super::decision::InterventionPoint; diff --git a/rust/src/retrieval/pilot/mod.rs b/rust/src/retrieval/pilot/mod.rs index d462dd28..5af9cead 100644 --- a/rust/src/retrieval/pilot/mod.rs +++ b/rust/src/retrieval/pilot/mod.rs @@ -15,27 +15,6 @@ //! 3. Intervention at key decision points - not every step, only when needed //! 4. Layered fallback - algorithm takes over when LLM fails, Pilot rescues when algorithm fails //! -//! # Architecture -//! -//! ```text -//! ┌─────────────────────────────────────────────────────────────────────────┐ -//! │ Pilot Architecture │ -//! ├─────────────────────────────────────────────────────────────────────────┤ -//! │ │ -//! │ ┌─────────────┐ ┌─────────────┐ ┌─────────────┐ │ -//! │ │ Query │ │ Context │ │ Decision │ │ -//! │ │ Analyzer │──▶│ Builder │──▶│ Engine │ │ -//! │ └─────────────┘ └─────────────┘ └──────┬──────┘ │ -//! │ │ │ -//! │ ▼ │ -//! │ ┌─────────────┐ ┌─────────────┐ ┌─────────────┐ │ -//! 
│ │ Response │◀──│ LLM │◀──│ Prompt │ │ -//! │ │ Parser │ │ Client │ │ Builder │ │ -//! │ └─────────────┘ └─────────────┘ └─────────────┘ │ -//! │ │ -//! │ Supporting: BudgetController, FallbackManager, MetricsCollector │ -//! └─────────────────────────────────────────────────────────────────────────┘ -//! ``` //! //! # Quick Start //! @@ -64,18 +43,8 @@ mod parser; mod prompts; mod r#trait; -pub use budget::{BudgetController, BudgetUsage}; -pub use builder::{ContextBuilder, ContextMode, PilotContext, TokenBudget}; -pub use config::{BudgetConfig, InterventionConfig, PilotConfig, PilotMode}; -pub use decision::{InterventionPoint, PilotDecision, RankedCandidate, SearchDirection}; -pub use fallback::{FallbackAction, FallbackConfig, FallbackError, FallbackLevel, FallbackManager}; -pub use feedback::{ - ContextStats, DecisionAdjustment, DecisionId, FeedbackId, FeedbackRecord, FeedbackStore, - FeedbackStoreConfig, InterventionStats, LearnerConfig, PilotLearner, -}; +pub use config::PilotConfig; +pub use decision::{InterventionPoint, PilotDecision}; + pub use llm_pilot::LlmPilot; -pub use metrics::{CallRecord, MetricsCollector, PilotMetrics}; -pub use noop::NoopPilot; -pub use parser::ResponseParser; -pub use prompts::PromptBuilder; -pub use r#trait::{Pilot, PilotExt, SearchState}; +pub use r#trait::{Pilot, SearchState}; diff --git a/rust/src/retrieval/pilot/parser.rs b/rust/src/retrieval/pilot/parser.rs index 651b1c3c..f7c2fe85 100644 --- a/rust/src/retrieval/pilot/parser.rs +++ b/rust/src/retrieval/pilot/parser.rs @@ -44,7 +44,10 @@ pub struct LlmResponse { #[serde(default)] pub direction: DirectionResponse, /// Confidence level (0.0 - 1.0 or "high"/"medium"/"low"). - #[serde(default = "default_confidence", deserialize_with = "deserialize_confidence")] + #[serde( + default = "default_confidence", + deserialize_with = "deserialize_confidence" + )] pub confidence: f32, /// Reasoning for the decision. 
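Aside: the `deserialize_confidence` helper wired up in the `#[serde(...)]` attribute above lets the LLM answer with either a JSON number or a coarse label. A self-contained version of that shape; the label values (0.9/0.6/0.3) are illustrative, since only the `Number` arm is visible in this excerpt.

```rust
use serde::{Deserialize, Deserializer};

fn deserialize_confidence<'de, D>(deserializer: D) -> Result<f32, D::Error>
where
    D: Deserializer<'de>,
{
    // Accept any JSON value, then coerce it to a confidence in [0, 1].
    let value = serde_json::Value::deserialize(deserializer)?;
    Ok(match value {
        serde_json::Value::Number(n) => n.as_f64().unwrap_or(0.5) as f32,
        serde_json::Value::String(s) => match s.to_lowercase().as_str() {
            "high" => 0.9,   // mapping values assumed
            "medium" => 0.6,
            "low" => 0.3,
            _ => 0.5,
        },
        _ => 0.5, // fall back to a neutral default
    })
}
```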
#[serde(default)] @@ -56,8 +59,6 @@ fn deserialize_confidence<'de, D>(deserializer: D) -> Result where D: serde::Deserializer<'de>, { - use serde::de::Error; - let value = serde_json::Value::deserialize(deserializer)?; match value { serde_json::Value::Number(n) => { @@ -227,18 +228,27 @@ impl ResponseParser { candidates: &[CandidateInfo], point: InterventionPoint, ) -> PilotDecision { - println!("[DEBUG] ResponseParser::parse() - candidates.len()={}", candidates.len()); + println!( + "[DEBUG] ResponseParser::parse() - candidates.len()={}", + candidates.len() + ); // Try JSON parse first if let Some(decision) = self.try_json_parse(response, candidates, point) { - println!("[DEBUG] ResponseParser::parse() - JSON parse succeeded, ranked={}", decision.ranked_candidates.len()); + println!( + "[DEBUG] ResponseParser::parse() - JSON parse succeeded, ranked={}", + decision.ranked_candidates.len() + ); return decision; } println!("[DEBUG] ResponseParser::parse() - JSON parse failed, trying regex..."); // Try regex extraction if let Some(decision) = self.try_regex_parse(response, candidates, point) { - println!("[DEBUG] ResponseParser::parse() - Regex parse succeeded, ranked={}", decision.ranked_candidates.len()); + println!( + "[DEBUG] ResponseParser::parse() - Regex parse succeeded, ranked={}", + decision.ranked_candidates.len() + ); return decision; } println!("[DEBUG] ResponseParser::parse() - Regex parse failed, using default decision"); @@ -268,17 +278,26 @@ impl ResponseParser { extracted }; - println!("[DEBUG] ResponseParser::try_json_parse() - Extracted JSON:\n{}", json_str); + println!( + "[DEBUG] ResponseParser::try_json_parse() - Extracted JSON:\n{}", + json_str + ); // Parse JSON let llm_response: LlmResponse = match serde_json::from_str::(&json_str) { Ok(r) => { println!("[DEBUG] ResponseParser::try_json_parse() - JSON parsed successfully"); - println!("[DEBUG] ResponseParser::try_json_parse() - ranked_candidates count: {}", r.ranked_candidates.len()); + println!( + "[DEBUG] ResponseParser::try_json_parse() - ranked_candidates count: {}", + r.ranked_candidates.len() + ); r - }, + } Err(e) => { - println!("[DEBUG] ResponseParser::try_json_parse() - JSON parse FAILED: {}", e); + println!( + "[DEBUG] ResponseParser::try_json_parse() - JSON parse FAILED: {}", + e + ); warn!("Failed to parse LLM response as JSON: {}", e); return None; } @@ -416,13 +435,37 @@ impl ResponseParser { candidates: &[CandidateInfo], point: InterventionPoint, ) -> PilotDecision { - println!("[DEBUG] ResponseParser::llm_response_to_decision() - point={:?}", point); - println!("[DEBUG] ResponseParser::llm_response_to_decision() - ranked_candidates.len()={}", llm_response.ranked_candidates.len()); - println!("[DEBUG] ResponseParser::llm_response_to_decision() - best_entry_points.len()={}", llm_response.best_entry_points.len()); - println!("[DEBUG] ResponseParser::llm_response_to_decision() - entry_points.len()={}", llm_response.entry_points.len()); - println!("[DEBUG] ResponseParser::llm_response_to_decision() - selected_nodes.len()={}", llm_response.selected_nodes.len()); - println!("[DEBUG] ResponseParser::llm_response_to_decision() - selected_node={:?}", llm_response.selected_node); - println!("[DEBUG] ResponseParser::llm_response_to_decision() - analysis={:?}", llm_response.analysis.as_ref().map(|a| (&a.selected_node, &a.selected_nodes))); + println!( + "[DEBUG] ResponseParser::llm_response_to_decision() - point={:?}", + point + ); + println!( + "[DEBUG] ResponseParser::llm_response_to_decision() - 
ranked_candidates.len()={}", + llm_response.ranked_candidates.len() + ); + println!( + "[DEBUG] ResponseParser::llm_response_to_decision() - best_entry_points.len()={}", + llm_response.best_entry_points.len() + ); + println!( + "[DEBUG] ResponseParser::llm_response_to_decision() - entry_points.len()={}", + llm_response.entry_points.len() + ); + println!( + "[DEBUG] ResponseParser::llm_response_to_decision() - selected_nodes.len()={}", + llm_response.selected_nodes.len() + ); + println!( + "[DEBUG] ResponseParser::llm_response_to_decision() - selected_node={:?}", + llm_response.selected_node + ); + println!( + "[DEBUG] ResponseParser::llm_response_to_decision() - analysis={:?}", + llm_response + .analysis + .as_ref() + .map(|a| (&a.selected_node, &a.selected_nodes)) + ); // Convert candidate scores to RankedCandidate let mut ranked_candidates: Vec = llm_response @@ -456,17 +499,16 @@ impl ResponseParser { }; if idx < candidates.len() { - let score = entry.relevance_score - .or(entry.score) - .unwrap_or(0.5) - / 5.0; // Normalize 1-5 scale to 0.0-1.0 + let score = entry.relevance_score.or(entry.score).unwrap_or(0.5) / 5.0; // Normalize 1-5 scale to 0.0-1.0 ranked_candidates.push(RankedCandidate { node_id: candidates[idx].node_id, score: score.clamp(0.0, 1.0), reason: entry.title.clone(), }); - println!("[DEBUG] ResponseParser - converted best_entry_point[{}] to ranked_candidate (idx={}, score={:.2})", - idx, idx, score); + println!( + "[DEBUG] ResponseParser - converted best_entry_point[{}] to ranked_candidate (idx={}, score={:.2})", + idx, idx, score + ); } } @@ -480,8 +522,10 @@ impl ResponseParser { score: 0.9, // High score for title match reason: Some(format!("Title match: {}", selected_title)), }); - println!("[DEBUG] ResponseParser - matched selected_node '{}' to candidate '{}' (index={})", - selected_title, candidate.title, candidate.index); + println!( + "[DEBUG] ResponseParser - matched selected_node '{}' to candidate '{}' (index={})", + selected_title, candidate.title, candidate.index + ); break; // Only match once per selected_node } } @@ -491,14 +535,19 @@ impl ResponseParser { if let Some(ref single_node) = llm_response.selected_node { for candidate in candidates { if Self::titles_match(single_node, &candidate.title) { - if !ranked_candidates.iter().any(|rc| rc.node_id == candidate.node_id) { + if !ranked_candidates + .iter() + .any(|rc| rc.node_id == candidate.node_id) + { ranked_candidates.push(RankedCandidate { node_id: candidate.node_id, score: 0.9, reason: Some(format!("Title match (singular): {}", single_node)), }); - println!("[DEBUG] ResponseParser - matched selected_node (singular) '{}' to candidate '{}' (index={})", - single_node, candidate.title, candidate.index); + println!( + "[DEBUG] ResponseParser - matched selected_node (singular) '{}' to candidate '{}' (index={})", + single_node, candidate.title, candidate.index + ); } break; } @@ -509,14 +558,19 @@ impl ResponseParser { if let Some(ref recommended) = llm_response.recommended_node { for candidate in candidates { if Self::titles_match(recommended, &candidate.title) { - if !ranked_candidates.iter().any(|rc| rc.node_id == candidate.node_id) { + if !ranked_candidates + .iter() + .any(|rc| rc.node_id == candidate.node_id) + { ranked_candidates.push(RankedCandidate { node_id: candidate.node_id, score: 0.85, reason: Some(format!("Recommended node: {}", recommended)), }); - println!("[DEBUG] ResponseParser - matched recommended_node '{}' to candidate '{}' (index={})", - recommended, candidate.title, candidate.index); + 
println!( + "[DEBUG] ResponseParser - matched recommended_node '{}' to candidate '{}' (index={})", + recommended, candidate.title, candidate.index + ); } break; } @@ -529,14 +583,22 @@ impl ResponseParser { for selected_title in &analysis.selected_nodes { for candidate in candidates { if Self::titles_match(selected_title, &candidate.title) { - if !ranked_candidates.iter().any(|rc| rc.node_id == candidate.node_id) { + if !ranked_candidates + .iter() + .any(|rc| rc.node_id == candidate.node_id) + { ranked_candidates.push(RankedCandidate { node_id: candidate.node_id, score: 0.85, - reason: Some(format!("Analysis selected_nodes: {}", selected_title)), + reason: Some(format!( + "Analysis selected_nodes: {}", + selected_title + )), }); - println!("[DEBUG] ResponseParser - matched analysis.selected_nodes '{}' to candidate '{}' (index={})", - selected_title, candidate.title, candidate.index); + println!( + "[DEBUG] ResponseParser - matched analysis.selected_nodes '{}' to candidate '{}' (index={})", + selected_title, candidate.title, candidate.index + ); } break; } @@ -547,14 +609,22 @@ impl ResponseParser { if let Some(ref single_node) = analysis.selected_node { for candidate in candidates { if Self::titles_match(single_node, &candidate.title) { - if !ranked_candidates.iter().any(|rc| rc.node_id == candidate.node_id) { + if !ranked_candidates + .iter() + .any(|rc| rc.node_id == candidate.node_id) + { ranked_candidates.push(RankedCandidate { node_id: candidate.node_id, score: 0.85, - reason: Some(format!("Analysis selected_node: {}", single_node)), + reason: Some(format!( + "Analysis selected_node: {}", + single_node + )), }); - println!("[DEBUG] ResponseParser - matched analysis.selected_node (singular) '{}' to candidate '{}' (index={})", - single_node, candidate.title, candidate.index); + println!( + "[DEBUG] ResponseParser - matched analysis.selected_node (singular) '{}' to candidate '{}' (index={})", + single_node, candidate.title, candidate.index + ); } break; } @@ -574,14 +644,19 @@ impl ResponseParser { for candidate in candidates { if Self::titles_match(entry_title, &candidate.title) { // Check if already added - if !ranked_candidates.iter().any(|rc| rc.node_id == candidate.node_id) { + if !ranked_candidates + .iter() + .any(|rc| rc.node_id == candidate.node_id) + { ranked_candidates.push(RankedCandidate { node_id: candidate.node_id, score: 0.8, // Slightly lower score for entry_points reason: Some(format!("Entry point: {}", entry_title)), }); - println!("[DEBUG] ResponseParser - matched entry_point '{}' to candidate '{}' (index={})", - entry_title, candidate.title, candidate.index); + println!( + "[DEBUG] ResponseParser - matched entry_point '{}' to candidate '{}' (index={})", + entry_title, candidate.title, candidate.index + ); } break; } @@ -614,7 +689,10 @@ impl ResponseParser { }, }; - println!("[DEBUG] ResponseParser::llm_response_to_decision() - final ranked_candidates.len()={}", ranked_candidates.len()); + println!( + "[DEBUG] ResponseParser::llm_response_to_decision() - final ranked_candidates.len()={}", + ranked_candidates.len() + ); PilotDecision { ranked_candidates, @@ -642,7 +720,8 @@ impl ResponseParser { // Word overlap match (at least 50% of words match) let llm_words: std::collections::HashSet<&str> = llm_lower.split_whitespace().collect(); - let candidate_words: std::collections::HashSet<&str> = candidate_lower.split_whitespace().collect(); + let candidate_words: std::collections::HashSet<&str> = + candidate_lower.split_whitespace().collect(); let overlap = 
llm_words.intersection(&candidate_words).count(); let min_words = llm_words.len().min(candidate_words.len()); if min_words > 0 && overlap as f32 / min_words as f32 >= 0.5 { @@ -653,7 +732,11 @@ impl ResponseParser { } /// Create a default decision when parsing fails. - fn default_decision(&self, candidates: &[CandidateInfo], point: InterventionPoint) -> PilotDecision { + fn default_decision( + &self, + candidates: &[CandidateInfo], + point: InterventionPoint, + ) -> PilotDecision { // Score candidates uniformly let ranked: Vec = candidates .iter() diff --git a/rust/src/retrieval/pilot/prompts/mod.rs b/rust/src/retrieval/pilot/prompts/mod.rs index 0bb336b2..aeee13c4 100644 --- a/rust/src/retrieval/pilot/prompts/mod.rs +++ b/rust/src/retrieval/pilot/prompts/mod.rs @@ -13,4 +13,3 @@ mod builder; mod templates; pub use builder::PromptBuilder; -pub use templates::{BacktrackPrompt, EvaluatePrompt, ForkPrompt, PromptTemplate, StartPrompt}; diff --git a/rust/src/retrieval/pilot/prompts/templates.rs b/rust/src/retrieval/pilot/prompts/templates.rs index 50f4c3cc..6829ade8 100644 --- a/rust/src/retrieval/pilot/prompts/templates.rs +++ b/rust/src/retrieval/pilot/prompts/templates.rs @@ -245,8 +245,6 @@ impl PromptTemplate for EvaluatePrompt { /// Fallback templates when file loading fails. pub mod fallback { - use super::*; - pub fn system_start() -> String { r#"You are a document navigation assistant. Help identify the best entry points for searching a hierarchical document. @@ -273,7 +271,8 @@ Respond with ONLY the JSON object (no markdown, no explanation): "entry_points": ["list of node titles as strings"], "reasoning": "your reasoning here", "confidence": 0.85 -}"#.to_string() +}"# + .to_string() } pub fn system_fork() -> String { @@ -309,7 +308,7 @@ Respond with ONLY the JSON object: "confidence": 0.85, "reasoning": "overall explanation" }"# - .to_string() + .to_string() } pub fn system_backtrack() -> String { @@ -339,7 +338,8 @@ Respond with ONLY the JSON object: "direction": "backtrack", "confidence": 0.85, "reasoning": "why original path failed" -}"#.to_string() +}"# + .to_string() } pub fn system_evaluate() -> String { @@ -368,7 +368,7 @@ Respond with ONLY the JSON object: "confidence": 0.85, "reasoning": "explanation" }"# - .to_string() + .to_string() } pub fn system_locate_top3() -> String { @@ -412,7 +412,7 @@ Respond with ONLY the JSON object: {"node_id": 3, "relevance_score": 0.65, "reason": "explanation"} ] }"# - .to_string() + .to_string() } } diff --git a/rust/src/retrieval/pipeline/budget.rs b/rust/src/retrieval/pipeline/budget.rs index 3fe69d76..91a77b2d 100644 --- a/rust/src/retrieval/pipeline/budget.rs +++ b/rust/src/retrieval/pipeline/budget.rs @@ -24,7 +24,6 @@ //! ``` use std::sync::atomic::{AtomicBool, AtomicUsize, Ordering}; -use std::sync::Arc; /// Status of the budget for stage-level decision making. #[derive(Debug, Clone, Copy, PartialEq, Eq)] @@ -89,9 +88,7 @@ impl Clone for RetrievalBudgetController { Self { total_budget: self.total_budget, consumed: AtomicUsize::new(self.consumed.load(Ordering::Relaxed)), - exhaustion_signaled: AtomicBool::new( - self.exhaustion_signaled.load(Ordering::Relaxed), - ), + exhaustion_signaled: AtomicBool::new(self.exhaustion_signaled.load(Ordering::Relaxed)), constrain_threshold: self.constrain_threshold, } } @@ -148,7 +145,8 @@ impl RetrievalBudgetController { /// Get remaining token budget. 
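Aside: the `titles_match` fallback reflowed above treats two titles as matching when at least half the words of the shorter title overlap. The heuristic in isolation (the exact- and substring-match fast paths that precede it in the real code are omitted here):

```rust
use std::collections::HashSet;

fn words_overlap(a: &str, b: &str) -> bool {
    let a_lower = a.to_lowercase();
    let b_lower = b.to_lowercase();
    let a_words: HashSet<&str> = a_lower.split_whitespace().collect();
    let b_words: HashSet<&str> = b_lower.split_whitespace().collect();
    let overlap = a_words.intersection(&b_words).count();
    let min_words = a_words.len().min(b_words.len());
    // Require >= 50% of the shorter title's words to appear in the other.
    min_words > 0 && overlap as f32 / min_words as f32 >= 0.5
}

fn main() {
    assert!(words_overlap("Cache Architecture", "architecture of the cache"));
    assert!(!words_overlap("Introduction", "Query Decomposition"));
}
```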
 pub fn remaining(&self) -> usize {
-        self.total_budget.saturating_sub(self.consumed.load(Ordering::Relaxed))
+        self.total_budget
+            .saturating_sub(self.consumed.load(Ordering::Relaxed))
     }
 
     /// Get total budget.
@@ -193,7 +191,11 @@
             }
             BudgetStatus::Constrained => {
                 // Reduce beam to save tokens
-                let reduced = if iteration <= 1 { current_beam } else { (current_beam / 2).max(1) };
+                let reduced = if iteration <= 1 {
+                    current_beam
+                } else {
+                    (current_beam / 2).max(1)
+                };
                 reduced
             }
             BudgetStatus::Exhausted => {
diff --git a/rust/src/retrieval/pipeline/context.rs b/rust/src/retrieval/pipeline/context.rs
index 45530ac2..7990ce92 100644
--- a/rust/src/retrieval/pipeline/context.rs
+++ b/rust/src/retrieval/pipeline/context.rs
@@ -13,8 +13,8 @@ use std::time::Instant;
 use crate::document::{DocumentTree, NodeId, ReasoningIndex, RetrievalIndex};
 use crate::graph::DocumentGraph;
 use crate::retrieval::cache::{HotNodeTracker, ReasoningCache};
-use crate::retrieval::pipeline::budget::RetrievalBudgetController;
 use crate::retrieval::pilot::Pilot;
+use crate::retrieval::pipeline::budget::RetrievalBudgetController;
 use crate::retrieval::types::{
     NavigationDecision, QueryComplexity, ReasoningChain, ReasoningStep, RetrieveOptions,
     RetrieveResponse, SearchPath, StageName, StrategyPreference, SufficiencyLevel,
diff --git a/rust/src/retrieval/pipeline/mod.rs b/rust/src/retrieval/pipeline/mod.rs
index 88b47de5..6726a8ce 100644
--- a/rust/src/retrieval/pipeline/mod.rs
+++ b/rust/src/retrieval/pipeline/mod.rs
@@ -9,16 +9,6 @@
 //! - [`StageOutcome`] - Controls pipeline flow (continue, backtrack, etc.)
 //! - [`RetrievalOrchestrator`] - Manages stage execution
 //!
-//! # Architecture
-//!
-//! The retrieval pipeline consists of four stages:
-//!
-//! ```text
-//! ┌─────────┐    ┌─────────┐    ┌─────────┐    ┌──────────┐
-//! │ Analyze │───►│  Plan   │───►│ Search  │───►│ Evaluate │
-//! │ (分析)  │    │ (规划)  │    │ (搜索)  │    │ (判断)   │
-//! └─────────┘    └─────────┘    └─────────┘    └──────────┘
-//! ```
 //!
 //! # Flow Control
 //!
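Aside: the `Constrained` arm above encodes the whole beam-width budget policy in one expression: keep the full beam for the first iteration, then halve it, never dropping below 1. As a standalone rule:

```rust
// Beam adjustment as in `RetrievalBudgetController` above: under a
// constrained budget, halve the beam after the first iteration (floor 1).
fn adjusted_beam(current_beam: usize, iteration: usize, constrained: bool) -> usize {
    if !constrained || iteration <= 1 {
        current_beam
    } else {
        (current_beam / 2).max(1)
    }
}

fn main() {
    assert_eq!(adjusted_beam(8, 3, true), 4);  // constrained: halved
    assert_eq!(adjusted_beam(8, 1, true), 8);  // first iteration: untouched
    assert_eq!(adjusted_beam(1, 5, true), 1);  // never below 1
}
```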
@@ -50,11 +40,9 @@ mod orchestrator; mod outcome; mod stage; -pub use budget::{BudgetStatus, RetrievalBudgetController}; -pub use context::{ - CandidateNode, PipelineContext, RetrievalMetrics, SearchAlgorithm, SearchConfig, StageResult, -}; -pub use orchestrator::{ExecutionGroup, RetrievalOrchestrator}; +pub use budget::BudgetStatus; +pub use context::{CandidateNode, PipelineContext, SearchAlgorithm, SearchConfig}; +pub use orchestrator::RetrievalOrchestrator; pub use outcome::StageOutcome; pub use stage::RetrievalStage; diff --git a/rust/src/retrieval/pipeline/orchestrator.rs b/rust/src/retrieval/pipeline/orchestrator.rs index 50976988..704629e3 100644 --- a/rust/src/retrieval/pipeline/orchestrator.rs +++ b/rust/src/retrieval/pipeline/orchestrator.rs @@ -21,7 +21,7 @@ use crate::error::Result; use crate::retrieval::pilot::{Pilot, SearchState}; // FailurePolicy is re-exported for stages use crate::retrieval::stream::{ - RetrieveEvent, RetrieveEventReceiver, RetrieveEventSender, DEFAULT_STREAM_BOUND, + DEFAULT_STREAM_BOUND, RetrieveEvent, RetrieveEventReceiver, RetrieveEventSender, }; use crate::retrieval::types::{RetrieveOptions, RetrieveResponse}; @@ -860,10 +860,7 @@ impl RetrievalOrchestrator { tree: Arc, query: &str, options: RetrieveOptions, - ) -> ( - tokio::task::JoinHandle<()>, - RetrieveEventReceiver, - ) { + ) -> (tokio::task::JoinHandle<()>, RetrieveEventReceiver) { let (tx, rx) = tokio::sync::mpsc::channel(DEFAULT_STREAM_BOUND); let query_owned = query.to_string(); @@ -956,9 +953,7 @@ impl RetrievalOrchestrator { total_start.elapsed().as_millis() as u64; info!("Retrieval completed by stage: {}", stage_name); let response = ctx.finalize(); - let _ = tx - .send(RetrieveEvent::Completed { response }) - .await; + let _ = tx.send(RetrieveEvent::Completed { response }).await; return Ok(()); } StageOutcome::NeedMoreData { @@ -1129,9 +1124,7 @@ impl RetrievalOrchestrator { ctx.metrics.total_time_ms = total_start.elapsed().as_millis() as u64; let response = ctx.finalize(); - let _ = tx - .send(RetrieveEvent::Completed { response }) - .await; + let _ = tx.send(RetrieveEvent::Completed { response }).await; return Ok(()); } } diff --git a/rust/src/retrieval/pipeline_retriever.rs b/rust/src/retrieval/pipeline_retriever.rs index e2faa499..2b5c25ec 100644 --- a/rust/src/retrieval/pipeline_retriever.rs +++ b/rust/src/retrieval/pipeline_retriever.rs @@ -13,11 +13,9 @@ use super::content::ContentAggregatorConfig; use super::pipeline::RetrievalOrchestrator; use super::retriever::{CostEstimate, Retriever, RetrieverError, RetrieverResult}; use super::stages::{AnalyzeStage, EvaluateStage, PlanStage, SearchStage}; -use super::stream::{RetrieveEvent, RetrieveEventReceiver}; -use super::strategy::LlmStrategy; +use super::stream::RetrieveEventReceiver; use super::types::{RetrieveOptions, RetrieveResponse}; use crate::document::DocumentTree; -use crate::error::Result; use crate::llm::LlmClient; use crate::memo::MemoStore; use crate::retrieval::pilot::{LlmPilot, PilotConfig}; diff --git a/rust/src/retrieval/reference.rs b/rust/src/retrieval/reference.rs index dcdcadd6..523fd042 100644 --- a/rust/src/retrieval/reference.rs +++ b/rust/src/retrieval/reference.rs @@ -52,7 +52,7 @@ //! } //! ``` -use std::collections::{HashMap, HashSet}; +use std::collections::HashSet; use crate::document::{ DocumentTree, NodeId, NodeReference, RefType, ReferenceExtractor, RetrievalIndex, @@ -147,9 +147,6 @@ impl FollowedReference { } /// Reference follower for expanding content via cross-references. 
-/// -/// This implements the PageIndex paper's reference following capability, -/// allowing the retrieval system to follow "see Appendix G" style references. #[derive(Debug, Clone)] pub struct ReferenceFollower { config: ReferenceConfig, @@ -256,7 +253,14 @@ impl ReferenceFollower { if let Some(target_id) = r#ref.target_node { if !visited.contains(&target_id) { visited.insert(target_id); - self.follow_from_node_inner(tree, index, target_id, depth + 1, visited, results); + self.follow_from_node_inner( + tree, + index, + target_id, + depth + 1, + visited, + results, + ); } } } diff --git a/rust/src/retrieval/search/beam.rs b/rust/src/retrieval/search/beam.rs index e0d3c7dc..f285cb30 100644 --- a/rust/src/retrieval/search/beam.rs +++ b/rust/src/retrieval/search/beam.rs @@ -329,7 +329,8 @@ impl SearchTree for BeamSearch { config: &SearchConfig, pilot: Option<&dyn Pilot>, ) -> SearchResult { - self.search_impl(tree, context, config, pilot, tree.root()).await + self.search_impl(tree, context, config, pilot, tree.root()) + .await } async fn search_from( @@ -340,7 +341,8 @@ impl SearchTree for BeamSearch { pilot: Option<&dyn Pilot>, start_node: NodeId, ) -> SearchResult { - self.search_impl(tree, context, config, pilot, start_node).await + self.search_impl(tree, context, config, pilot, start_node) + .await } fn name(&self) -> &'static str { diff --git a/rust/src/retrieval/search/bm25.rs b/rust/src/retrieval/search/bm25.rs index 26c311f5..8bc20085 100644 --- a/rust/src/retrieval/search/bm25.rs +++ b/rust/src/retrieval/search/bm25.rs @@ -9,11 +9,8 @@ //! - IDF caching for efficient scoring //! - Query expansion support -use std::collections::HashMap; - use bm25::{ - Embedder, EmbedderBuilder, Embedding, Language, Scorer, ScoredDocument, - DefaultTokenizer, Tokenizer, + DefaultTokenizer, Embedder, EmbedderBuilder, Language, ScoredDocument, Scorer, Tokenizer, }; /// Field weights for BM25 scoring. @@ -84,7 +81,12 @@ pub struct FieldDocument { impl FieldDocument { /// Create a new field document. pub fn new(id: K, title: String, summary: String, content: String) -> Self { - Self { id, title, summary, content } + Self { + id, + title, + summary, + content, + } } /// Get combined text for embedding. @@ -161,13 +163,10 @@ impl Bm25Engine { /// This calculates the true average document length from the corpus. pub fn fit_to_corpus(documents: &[FieldDocument]) -> Self { // Collect owned strings first - let corpus: Vec = documents.iter() - .map(|d| d.combined_text()) - .collect(); + let corpus: Vec = documents.iter().map(|d| d.combined_text()).collect(); let corpus_refs: Vec<&str> = corpus.iter().map(|s| s.as_str()).collect(); - let embedder = EmbedderBuilder::with_fit_to_corpus(Language::English, &corpus_refs) - .build(); + let embedder = EmbedderBuilder::with_fit_to_corpus(Language::English, &corpus_refs).build(); let mut engine = Self { embedder, @@ -268,7 +267,8 @@ impl Bm25Engine { let total_weight = self.weights.title + self.weights.summary + self.weights.content; let weighted_score = (title_score * self.weights.title + summary_score * self.weights.summary - + content_score * self.weights.content) / total_weight; + + content_score * self.weights.content) + / total_weight; Some(weighted_score) } @@ -278,7 +278,11 @@ impl Bm25Engine { /// Returns documents sorted by score (descending). 
pub fn search(&self, query: &str, limit: usize) -> Vec> { let query_emb = self.embedder.embed(query); - self.scorer.matches(&query_emb).into_iter().take(limit).collect() + self.scorer + .matches(&query_emb) + .into_iter() + .take(limit) + .collect() } /// Search with per-field weighting. @@ -303,7 +307,8 @@ impl Bm25Engine { let total_weight = self.weights.title + self.weights.summary + self.weights.content; let weighted_score = (title_score * self.weights.title + summary_score * self.weights.summary - + content_score * self.weights.content) / total_weight; + + content_score * self.weights.content) + / total_weight; Some((id, weighted_score)) }) @@ -357,7 +362,11 @@ impl ExpandedQuery { /// Create a new expanded query. pub fn new(original: String, expansions: Vec) -> Self { let combined = format!("{} {}", original, expansions.join(" ")); - Self { original, expansions, combined } + Self { + original, + expansions, + combined, + } } } @@ -370,21 +379,128 @@ pub trait QueryExpander: Send + Sync { /// Common English stop words for keyword filtering. pub const STOPWORDS: &[&str] = &[ - "a", "an", "the", "is", "are", "was", "were", "be", "been", "being", - "have", "has", "had", "do", "does", "did", "will", "would", "could", - "should", "may", "might", "must", "shall", "can", "need", "dare", - "ought", "used", "to", "of", "in", "for", "on", "with", "at", "by", - "from", "as", "into", "through", "during", "before", "after", "above", - "below", "between", "under", "again", "further", "then", "once", - "here", "there", "when", "where", "why", "how", "all", "each", "few", - "more", "most", "other", "some", "such", "no", "nor", "not", "only", - "own", "same", "so", "than", "too", "very", "just", "and", "but", - "if", "or", "because", "until", "while", "about", "what", "which", - "who", "whom", "this", "that", "these", "those", "i", "me", "my", - "myself", "we", "our", "ours", "ourselves", "you", "your", "yours", - "yourself", "yourselves", "he", "him", "his", "himself", "she", "her", - "hers", "herself", "it", "its", "itself", "they", "them", "their", - "theirs", "themselves", + "a", + "an", + "the", + "is", + "are", + "was", + "were", + "be", + "been", + "being", + "have", + "has", + "had", + "do", + "does", + "did", + "will", + "would", + "could", + "should", + "may", + "might", + "must", + "shall", + "can", + "need", + "dare", + "ought", + "used", + "to", + "of", + "in", + "for", + "on", + "with", + "at", + "by", + "from", + "as", + "into", + "through", + "during", + "before", + "after", + "above", + "below", + "between", + "under", + "again", + "further", + "then", + "once", + "here", + "there", + "when", + "where", + "why", + "how", + "all", + "each", + "few", + "more", + "most", + "other", + "some", + "such", + "no", + "nor", + "not", + "only", + "own", + "same", + "so", + "than", + "too", + "very", + "just", + "and", + "but", + "if", + "or", + "because", + "until", + "while", + "about", + "what", + "which", + "who", + "whom", + "this", + "that", + "these", + "those", + "i", + "me", + "my", + "myself", + "we", + "our", + "ours", + "ourselves", + "you", + "your", + "yours", + "yourself", + "yourselves", + "he", + "him", + "his", + "himself", + "she", + "her", + "hers", + "herself", + "it", + "its", + "itself", + "they", + "them", + "their", + "theirs", + "themselves", ]; /// Extract keywords from a query string, filtering stop words. 
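Both reformatted scoring hunks in `bm25.rs` compute the same blend: each field is scored separately and the results are averaged with weight normalization, so raising the `title` weight changes the ranking but never the score scale. A standalone sketch of just that combination follows; the per-field scores here are stand-in constants rather than real BM25 output.

```rust
/// Relative weights for the three scored fields, as in `FieldWeights`.
struct FieldWeights {
    title: f32,
    summary: f32,
    content: f32,
}

/// Weight-normalized average used by both `score` and `search_weighted`:
/// dividing by the total weight keeps the result comparable to a
/// single-field BM25 score, whatever the weights are.
fn combine(w: &FieldWeights, title: f32, summary: f32, content: f32) -> f32 {
    let total = w.title + w.summary + w.content;
    (title * w.title + summary * w.summary + content * w.content) / total
}

fn main() {
    // Title matches count 3x content matches, as in the weighted-search test.
    let w = FieldWeights { title: 3.0, summary: 2.0, content: 1.0 };
    let score = combine(&w, 0.9, 0.2, 0.1);
    // (0.9*3 + 0.2*2 + 0.1*1) / 6 = 3.2 / 6 ≈ 0.533
    assert!((score - 3.2 / 6.0).abs() < 1e-6);
    println!("weighted score = {score:.3}");
}
```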
@@ -421,8 +537,18 @@ mod tests { #[test] fn test_bm25_engine_fit_to_corpus() { let docs = vec![ - FieldDocument::new(1u32, "Rust Programming".to_string(), "About Rust".to_string(), "Rust is a systems programming language.".to_string()), - FieldDocument::new(2u32, "Python Guide".to_string(), "About Python".to_string(), "Python is a scripting language.".to_string()), + FieldDocument::new( + 1u32, + "Rust Programming".to_string(), + "About Rust".to_string(), + "Rust is a systems programming language.".to_string(), + ), + FieldDocument::new( + 2u32, + "Python Guide".to_string(), + "About Python".to_string(), + "Python is a scripting language.".to_string(), + ), ]; let engine = Bm25Engine::fit_to_corpus(&docs); @@ -433,9 +559,24 @@ mod tests { #[test] fn test_bm25_search() { let docs = vec![ - FieldDocument::new(1u32, "Rust Programming".to_string(), "About Rust".to_string(), "Rust is a systems programming language with memory safety.".to_string()), - FieldDocument::new(2u32, "Python Guide".to_string(), "About Python".to_string(), "Python is a scripting language for data science.".to_string()), - FieldDocument::new(3u32, "Rust Memory Safety".to_string(), "Memory in Rust".to_string(), "Rust provides guaranteed memory safety without garbage collection.".to_string()), + FieldDocument::new( + 1u32, + "Rust Programming".to_string(), + "About Rust".to_string(), + "Rust is a systems programming language with memory safety.".to_string(), + ), + FieldDocument::new( + 2u32, + "Python Guide".to_string(), + "About Python".to_string(), + "Python is a scripting language for data science.".to_string(), + ), + FieldDocument::new( + 3u32, + "Rust Memory Safety".to_string(), + "Memory in Rust".to_string(), + "Rust provides guaranteed memory safety without garbage collection.".to_string(), + ), ]; let engine = Bm25Engine::fit_to_corpus(&docs); @@ -449,16 +590,25 @@ mod tests { #[test] fn test_bm25_weighted_search() { let docs = vec![ - FieldDocument::new(1u32, "Rust Programming".to_string(), "About memory safety".to_string(), "Content about other things.".to_string()), - FieldDocument::new(2u32, "Other Language".to_string(), "About other things".to_string(), "Rust memory safety is important.".to_string()), + FieldDocument::new( + 1u32, + "Rust Programming".to_string(), + "About memory safety".to_string(), + "Content about other things.".to_string(), + ), + FieldDocument::new( + 2u32, + "Other Language".to_string(), + "About other things".to_string(), + "Rust memory safety is important.".to_string(), + ), ]; - let engine = Bm25Engine::fit_to_corpus(&docs) - .with_weights(FieldWeights { - title: 3.0, - summary: 2.0, - content: 1.0, - }); + let engine = Bm25Engine::fit_to_corpus(&docs).with_weights(FieldWeights { + title: 3.0, + summary: 2.0, + content: 1.0, + }); let results = engine.search_weighted("rust", 10); @@ -468,9 +618,12 @@ mod tests { #[test] fn test_bm25_score() { - let docs = vec![ - FieldDocument::new(1u32, "Rust Programming".to_string(), "About Rust".to_string(), "Rust is a systems programming language.".to_string()), - ]; + let docs = vec![FieldDocument::new( + 1u32, + "Rust Programming".to_string(), + "About Rust".to_string(), + "Rust is a systems programming language.".to_string(), + )]; let engine = Bm25Engine::fit_to_corpus(&docs); let score = engine.score(&1u32, "rust programming"); @@ -493,9 +646,12 @@ mod tests { #[test] fn test_bm25_remove() { - let docs = vec![ - FieldDocument::new(1u32, "Rust".to_string(), "About Rust".to_string(), "Rust content.".to_string()), - ]; + let docs = 
vec![FieldDocument::new( + 1u32, + "Rust".to_string(), + "About Rust".to_string(), + "Rust content.".to_string(), + )]; let mut engine = Bm25Engine::fit_to_corpus(&docs); assert_eq!(engine.len(), 1); diff --git a/rust/src/retrieval/search/greedy.rs b/rust/src/retrieval/search/greedy.rs index 1126b0d4..812cf5be 100644 --- a/rust/src/retrieval/search/greedy.rs +++ b/rust/src/retrieval/search/greedy.rs @@ -229,7 +229,8 @@ impl SearchTree for GreedySearch { config: &SearchConfig, pilot: Option<&dyn Pilot>, ) -> SearchResult { - self.search_impl(tree, context, config, pilot, tree.root()).await + self.search_impl(tree, context, config, pilot, tree.root()) + .await } async fn search_from( @@ -240,7 +241,8 @@ impl SearchTree for GreedySearch { pilot: Option<&dyn Pilot>, start_node: NodeId, ) -> SearchResult { - self.search_impl(tree, context, config, pilot, start_node).await + self.search_impl(tree, context, config, pilot, start_node) + .await } fn name(&self) -> &'static str { diff --git a/rust/src/retrieval/search/mod.rs b/rust/src/retrieval/search/mod.rs index 00aa31ef..39e78cc3 100644 --- a/rust/src/retrieval/search/mod.rs +++ b/rust/src/retrieval/search/mod.rs @@ -12,12 +12,7 @@ mod toc_navigator; mod r#trait; pub use beam::BeamSearch; -pub use bm25::{ - extract_keywords, Bm25Engine, Bm25Params, ExpandedQuery, FieldDocument, FieldWeights, - QueryExpander, STOPWORDS, -}; +pub use bm25::{Bm25Engine, Bm25Params, FieldDocument, STOPWORDS, extract_keywords}; pub use greedy::GreedySearch; -pub use mcts::MctsSearch; -pub use scorer::{NodeScorer, ScoringContext}; pub use toc_navigator::{SearchCue, ToCNavigator}; pub use r#trait::{SearchConfig, SearchResult, SearchTree}; diff --git a/rust/src/retrieval/search/scorer.rs b/rust/src/retrieval/search/scorer.rs index 5dbb9209..65af4713 100644 --- a/rust/src/retrieval/search/scorer.rs +++ b/rust/src/retrieval/search/scorer.rs @@ -10,7 +10,7 @@ use std::collections::HashMap; use crate::document::{DocumentTree, NodeId}; -use super::bm25::{Bm25Engine, Bm25Params, FieldDocument, FieldWeights}; +use super::bm25::Bm25Params; // Re-export extract_keywords for other modules to use pub use super::bm25::extract_keywords; diff --git a/rust/src/retrieval/search/toc_navigator.rs b/rust/src/retrieval/search/toc_navigator.rs index 10c13075..6e56a406 100644 --- a/rust/src/retrieval/search/toc_navigator.rs +++ b/rust/src/retrieval/search/toc_navigator.rs @@ -116,9 +116,7 @@ impl ToCNavigator { "Top BM25 score {:.3} below threshold {:.3}, attempting LLM refinement", best_score, self.llm_threshold ); - return self - .llm_refine(query, tree, top_level_nodes, client) - .await; + return self.llm_refine(query, tree, top_level_nodes, client).await; } } diff --git a/rust/src/retrieval/stages/analyze.rs b/rust/src/retrieval/stages/analyze.rs index 1748d440..c3928574 100644 --- a/rust/src/retrieval/stages/analyze.rs +++ b/rust/src/retrieval/stages/analyze.rs @@ -17,7 +17,6 @@ use crate::retrieval::complexity::ComplexityDetector; use crate::retrieval::decompose::{DecompositionConfig, QueryDecomposer}; use crate::retrieval::pipeline::{FailurePolicy, PipelineContext, RetrievalStage, StageOutcome}; use crate::retrieval::types::{NavigationDecision, StageName}; -use crate::llm::LlmClient; /// Analyze Stage - analyzes queries for retrieval planning. 
/// @@ -41,8 +40,16 @@ fn chinese_num_to_int(s: &str) -> Option { } let map = |c: char| -> usize { match c { - '一' => 1, '二' => 2, '三' => 3, '四' => 4, '五' => 5, - '六' => 6, '七' => 7, '八' => 8, '九' => 9, '十' => 10, + '一' => 1, + '二' => 2, + '三' => 3, + '四' => 4, + '五' => 5, + '六' => 6, + '七' => 7, + '八' => 8, + '九' => 9, + '十' => 10, '百' => 100, _ => 0, } @@ -140,12 +147,11 @@ impl AnalyzeStage { /// Enable query decomposition with LLM client. pub fn with_llm_client(mut self, client: crate::llm::LlmClient) -> Self { if self.query_decomposer.is_none() { - self.query_decomposer = Some( - QueryDecomposer::new(DecompositionConfig::default()) - .with_llm_client(client), - ); + self.query_decomposer = + Some(QueryDecomposer::new(DecompositionConfig::default()).with_llm_client(client)); } else if let Some(ref mut decomposer) = self.query_decomposer { - *decomposer = QueryDecomposer::new(DecompositionConfig::default()).with_llm_client(client); + *decomposer = + QueryDecomposer::new(DecompositionConfig::default()).with_llm_client(client); } self.enable_decomposition = true; self @@ -370,14 +376,18 @@ impl RetrievalStage for AnalyzeStage { info!( "Resolved {} structure hints: {:?}", ctx.resolved_path_hints.len(), - ctx.resolved_path_hints.iter().map(|(s, _)| s).collect::>() + ctx.resolved_path_hints + .iter() + .map(|(s, _)| s) + .collect::>() ); } // 4. Decompose query if enabled and complex enough if self.enable_decomposition { if let Some(ref decomposer) = self.query_decomposer { - let complexity_score = ctx.complexity + let complexity_score = ctx + .complexity .as_ref() .map(|c| match c { crate::retrieval::types::QueryComplexity::Simple => 0.3, @@ -396,13 +406,19 @@ impl RetrievalStage for AnalyzeStage { result.sub_queries.len() ); for (i, sq) in result.sub_queries.iter().enumerate() { - info!(" Sub-query {}: {} (priority: {})", i, sq.text, sq.priority); + info!( + " Sub-query {}: {} (priority: {})", + i, sq.text, sq.priority + ); } } ctx.decomposition = Some(result); } Err(e) => { - info!("Query decomposition failed: {}, continuing with original query", e); + info!( + "Query decomposition failed: {}, continuing with original query", + e + ); } } } @@ -424,7 +440,10 @@ impl RetrievalStage for AnalyzeStage { if !ctx.resolved_path_hints.is_empty() { reasoning_parts.push(format!( "Structure hints: {:?}", - ctx.resolved_path_hints.iter().map(|(s, _)| s).collect::>() + ctx.resolved_path_hints + .iter() + .map(|(s, _)| s) + .collect::>() )); } if let Some(ref decomp) = ctx.decomposition { diff --git a/rust/src/retrieval/stages/evaluate.rs b/rust/src/retrieval/stages/evaluate.rs index 11a95713..32f9945c 100644 --- a/rust/src/retrieval/stages/evaluate.rs +++ b/rust/src/retrieval/stages/evaluate.rs @@ -12,9 +12,11 @@ use tracing::{info, warn}; use crate::llm::LlmClient; use crate::retrieval::content::{ContentAggregator, ContentAggregatorConfig}; -use crate::retrieval::pipeline::{BudgetStatus, FailurePolicy, PipelineContext, RetrievalStage, StageOutcome}; +use crate::retrieval::pipeline::{FailurePolicy, PipelineContext, RetrievalStage, StageOutcome}; use crate::retrieval::sufficiency::{LlmJudge, SufficiencyChecker, ThresholdChecker}; -use crate::retrieval::types::{NavigationDecision, ReasoningChain, RetrievalResult, RetrieveResponse, StageName, SufficiencyLevel}; +use crate::retrieval::types::{ + NavigationDecision, RetrievalResult, RetrieveResponse, StageName, SufficiencyLevel, +}; use crate::utils::estimate_tokens; /// Evaluate Stage - evaluates retrieval sufficiency. 
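The `chinese_num_to_int` hunk above only re-wraps the digit map, but it shows why the Analyze stage carries one: structure hints written with Chinese numerals (e.g. "第三章", chapter three) must resolve to ordinals before they can match ToC entries. Here is a minimal sketch of the composition step for values up to 99, consistent with that digit map; handling for '百' and larger units is omitted, and the crate's full parser may differ.

```rust
/// Compose simple Chinese numerals (1 through 99) into integers.
fn chinese_num_to_int(s: &str) -> Option<usize> {
    let digit = |c: char| -> Option<usize> {
        Some(match c {
            '一' => 1,
            '二' => 2,
            '三' => 3,
            '四' => 4,
            '五' => 5,
            '六' => 6,
            '七' => 7,
            '八' => 8,
            '九' => 9,
            _ => return None,
        })
    };
    let chars: Vec<char> = s.chars().collect();
    match chars.as_slice() {
        // "五" => 5
        [c] if *c != '十' => digit(*c),
        // "十" => 10
        ['十'] => Some(10),
        // "十五" => 10 + 5
        ['十', c] => Some(10 + digit(*c)?),
        // "二十" => 2 * 10
        [c, '十'] => Some(digit(*c)? * 10),
        // "二十三" => 2 * 10 + 3
        [a, '十', b] => Some(digit(*a)? * 10 + digit(*b)?),
        _ => None,
    }
}

fn main() {
    assert_eq!(chinese_num_to_int("七"), Some(7));
    assert_eq!(chinese_num_to_int("十"), Some(10));
    assert_eq!(chinese_num_to_int("十五"), Some(15));
    assert_eq!(chinese_num_to_int("二十三"), Some(23));
    assert_eq!(chinese_num_to_int("abc"), None);
}
```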
@@ -298,8 +300,10 @@ impl EvaluateStage { }; let confidence = avg_score * sufficiency_factor; - println!("[DEBUG] calculate_confidence: avg_score={:.3}, sufficiency={:?}, factor={:.1}, confidence={:.3}", - avg_score, ctx.sufficiency, sufficiency_factor, confidence); + println!( + "[DEBUG] calculate_confidence: avg_score={:.3}, sufficiency={:?}, factor={:.1}, confidence={:.3}", + avg_score, ctx.sufficiency, sufficiency_factor, confidence + ); confidence } } @@ -329,9 +333,12 @@ impl RetrievalStage for EvaluateStage { async fn execute(&self, ctx: &mut PipelineContext) -> crate::error::Result { let start = std::time::Instant::now(); - println!("[DEBUG] EvaluateStage: {} candidates, iteration {}", - ctx.candidates.len(), ctx.search_iterations); - + println!( + "[DEBUG] EvaluateStage: {} candidates, iteration {}", + ctx.candidates.len(), + ctx.search_iterations + ); + info!( "Judging sufficiency: {} candidates, iteration {}", ctx.candidates.len(), @@ -388,7 +395,8 @@ impl RetrievalStage for EvaluateStage { if let Some(node) = ctx.tree.get(candidate.node_id) { let path = format!("{}", node.depth); // Use the node title as path identifier for L2 - ctx.reasoning_cache.l2_record(&doc_key, &node.title, candidate.score); + ctx.reasoning_cache + .l2_record(&doc_key, &node.title, candidate.score); } } } diff --git a/rust/src/retrieval/stages/plan.rs b/rust/src/retrieval/stages/plan.rs index 865f070f..1f3b9cba 100644 --- a/rust/src/retrieval/stages/plan.rs +++ b/rust/src/retrieval/stages/plan.rs @@ -80,7 +80,9 @@ impl PlanStage { } QueryComplexity::Medium => { if budget_status == BudgetStatus::Constrained { - info!("Complexity is Medium but budget constrained, selecting Keyword strategy"); + info!( + "Complexity is Medium but budget constrained, selecting Keyword strategy" + ); StrategyPreference::ForceKeyword } else if self.llm_client.is_some() { info!("Complexity is Medium, selecting LLM strategy"); @@ -92,7 +94,9 @@ impl PlanStage { } QueryComplexity::Complex => { if budget_status == BudgetStatus::Constrained { - info!("Complexity is Complex but budget constrained, selecting Hybrid strategy"); + info!( + "Complexity is Complex but budget constrained, selecting Hybrid strategy" + ); if self.llm_client.is_some() { StrategyPreference::ForceHybrid } else { diff --git a/rust/src/retrieval/stages/search.rs b/rust/src/retrieval/stages/search.rs index 929dad76..ad522634 100644 --- a/rust/src/retrieval/stages/search.rs +++ b/rust/src/retrieval/stages/search.rs @@ -16,21 +16,21 @@ use crate::document::DocumentTree; use crate::document::ReasoningIndex; use crate::llm::LlmClient; use crate::retrieval::RetrievalContext; -use crate::retrieval::pilot::Pilot; use crate::retrieval::cache::CachedCandidate; +use crate::retrieval::pilot::Pilot; use crate::retrieval::pipeline::{ - BudgetStatus, CandidateNode, FailurePolicy, PipelineContext, RetrievalStage, SearchAlgorithm, - StageOutcome, + CandidateNode, FailurePolicy, PipelineContext, RetrievalStage, SearchAlgorithm, StageOutcome, }; +use crate::retrieval::search::extract_keywords; use crate::retrieval::search::{ - BeamSearch, GreedySearch, SearchConfig as SearchAlgConfig, SearchCue, SearchTree, - ToCNavigator, + BeamSearch, GreedySearch, SearchConfig as SearchAlgConfig, SearchCue, SearchTree, ToCNavigator, }; -use crate::retrieval::search::extract_keywords; use crate::retrieval::strategy::{ HybridConfig, HybridStrategy, KeywordStrategy, LlmStrategy, RetrievalStrategy, }; -use crate::retrieval::types::{NavigationDecision, ReasoningCandidate, ReasoningStep, StageName, 
StrategyPreference}; +use crate::retrieval::types::{ + NavigationDecision, ReasoningCandidate, ReasoningStep, StageName, StrategyPreference, +}; /// Search Stage - executes tree search with optional Pilot guidance. /// @@ -115,9 +115,8 @@ impl SearchStage { pub fn with_hybrid_config(mut self, config: HybridConfig) -> Self { if let Some(ref llm) = self.llm_strategy { let llm_boxed: Box = Box::new((**llm).clone()); - self.hybrid_strategy = Some(Arc::new( - HybridStrategy::new(llm_boxed).with_config(config) - )); + self.hybrid_strategy = + Some(Arc::new(HybridStrategy::new(llm_boxed).with_config(config))); } self } @@ -163,7 +162,9 @@ impl SearchStage { let llm_boxed: Box = Box::new((**llm).clone()); Arc::new(HybridStrategy::new(llm_boxed)) } else { - warn!("Hybrid strategy requested but no LLM available, falling back to Keyword"); + warn!( + "Hybrid strategy requested but no LLM available, falling back to Keyword" + ); Arc::new(self.keyword_strategy.clone()) } } @@ -172,13 +173,14 @@ impl SearchStage { info!("Using Hybrid strategy as fallback for {:?})", preference); strategy.clone() } else { - warn!("{:?} requires special configuration, falling back to Keyword", preference); + warn!( + "{:?} requires special configuration, falling back to Keyword", + preference + ); Arc::new(self.keyword_strategy.clone()) } } - StrategyPreference::Auto => { - Arc::new(self.keyword_strategy.clone()) - } + StrategyPreference::Auto => Arc::new(self.keyword_strategy.clone()), } } @@ -250,11 +252,8 @@ impl SearchStage { let mut total_pilot_interventions = 0u64; for query in queries { - let legacy_ctx = RetrievalContext::new( - query, - ctx.options.max_tokens, - ctx.options.sufficiency_check, - ); + let legacy_ctx = + RetrievalContext::new(query, ctx.options.max_tokens, ctx.options.sufficiency_check); for cue in cues { debug!( @@ -265,18 +264,36 @@ impl SearchStage { let result = match algorithm { SearchAlgorithm::Greedy => { GreedySearch::new() - .search_from(&ctx.tree, &legacy_ctx, &search_config, pilot_ref, cue.root) + .search_from( + &ctx.tree, + &legacy_ctx, + &search_config, + pilot_ref, + cue.root, + ) .await } SearchAlgorithm::Beam => { BeamSearch::new() - .search_from(&ctx.tree, &legacy_ctx, &search_config, pilot_ref, cue.root) + .search_from( + &ctx.tree, + &legacy_ctx, + &search_config, + pilot_ref, + cue.root, + ) .await } // MCTS is not truly implemented — falls back to Beam behavior. 
SearchAlgorithm::Mcts => { BeamSearch::new() - .search_from(&ctx.tree, &legacy_ctx, &search_config, pilot_ref, cue.root) + .search_from( + &ctx.tree, + &legacy_ctx, + &search_config, + pilot_ref, + cue.root, + ) .await } }; @@ -467,7 +484,10 @@ impl RetrievalStage for SearchStage { // Reset Pilot state for new query if let Some(ref pilot) = self.pilot { pilot.reset(); - debug!("SearchStage: Pilot is available, is_active={}", pilot.is_active()); + debug!( + "SearchStage: Pilot is available, is_active={}", + pilot.is_active() + ); } // Apply budget-aware beam width adjustment @@ -480,21 +500,29 @@ impl RetrievalStage for SearchStage { algorithm, effective_beam, budget_status, - if self.has_pilot() { "enabled" } else { "disabled" } + if self.has_pilot() { + "enabled" + } else { + "disabled" + } ); ctx.increment_search_iteration(); // === L1 Cache check: return cached results if available === if ctx.options.enable_cache && ctx.search_iterations <= 1 { - let scope_fp = crate::utils::fingerprint::Fingerprint::from_str( - &format!("{:?}", ctx.tree.root()), - ); + let scope_fp = + crate::utils::fingerprint::Fingerprint::from_str(&format!("{:?}", ctx.tree.root())); if let Some(cached) = ctx.reasoning_cache.l1_get(&ctx.query, &scope_fp) { - info!("L1 cache hit for query, returning {} cached candidates", cached.len()); + info!( + "L1 cache hit for query, returning {} cached candidates", + cached.len() + ); ctx.candidates = cached .into_iter() - .map(|c| CandidateNode::new(c.node_id, c.score, c.depth, ctx.tree.is_leaf(c.node_id))) + .map(|c| { + CandidateNode::new(c.node_id, c.score, c.depth, ctx.tree.is_leaf(c.node_id)) + }) .collect(); ctx.metrics.cache_hits += 1; ctx.record_reasoning( @@ -599,7 +627,10 @@ impl RetrievalStage for SearchStage { ); for (i, c) in ctx.candidates.iter().enumerate().take(5) { if let Some(node) = ctx.tree.get(c.node_id) { - debug!("Candidate {}: score={:.3}, title='{}'", i, c.score, node.title); + debug!( + "Candidate {}: score={:.3}, title='{}'", + i, c.score, node.title + ); } } @@ -627,9 +658,8 @@ impl RetrievalStage for SearchStage { // Store results in L1 cache if ctx.options.enable_cache && ctx.search_iterations <= 1 && !ctx.candidates.is_empty() { - let scope_fp = crate::utils::fingerprint::Fingerprint::from_str( - &format!("{:?}", ctx.tree.root()), - ); + let scope_fp = + crate::utils::fingerprint::Fingerprint::from_str(&format!("{:?}", ctx.tree.root())); let cached: Vec = ctx .candidates .iter() @@ -662,7 +692,14 @@ impl RetrievalStage for SearchStage { .unwrap_or_else(|| "auto".to_string()); let search_iterations = ctx.search_iterations; - let reasoning_data: Vec<(String, Option, f32, usize, String, Vec)> = ctx + let reasoning_data: Vec<( + String, + Option, + f32, + usize, + String, + Vec, + )> = ctx .candidates .iter() .take(5) @@ -689,10 +726,20 @@ impl RetrievalStage for SearchStage { let reasoning = format!( "Candidate '{}' (score={:.3}) found via {} search, iteration {}", - title, candidate.score, algorithm.name(), search_iterations + title, + candidate.score, + algorithm.name(), + search_iterations ); - (format!("{:?}", candidate.node_id), Some(title), candidate.score, depth, reasoning, considered) + ( + format!("{:?}", candidate.node_id), + Some(title), + candidate.score, + depth, + reasoning, + considered, + ) }) .collect(); @@ -723,7 +770,6 @@ impl RetrievalStage for SearchStage { #[cfg(test)] mod tests { use super::*; - use crate::retrieval::pilot::NoopPilot; #[test] fn test_search_stage_creation() { @@ -738,13 +784,4 @@ mod tests { let stage = 
SearchStage::new(); assert_eq!(stage.depends_on(), vec!["plan"]); } - - #[test] - fn test_search_stage_with_noop_pilot() { - let pilot = Arc::new(NoopPilot::new()); - let stage = SearchStage::new().with_pilot(pilot); - - // NoopPilot is not active - assert!(!stage.has_pilot()); - } } diff --git a/rust/src/retrieval/strategy/cross_document.rs b/rust/src/retrieval/strategy/cross_document.rs index 4dfa1f4d..97a475d6 100644 --- a/rust/src/retrieval/strategy/cross_document.rs +++ b/rust/src/retrieval/strategy/cross_document.rs @@ -7,14 +7,13 @@ //! results into a unified response. use async_trait::async_trait; -use std::collections::HashMap; use std::sync::Arc; use super::r#trait::{NodeEvaluation, RetrievalStrategy, StrategyCapabilities}; use crate::document::{DocumentTree, NodeId}; use crate::graph::DocumentGraph; -use crate::retrieval::types::{NavigationDecision, QueryComplexity}; use crate::retrieval::RetrievalContext; +use crate::retrieval::types::QueryComplexity; /// Document identifier for cross-document retrieval. pub type DocumentId = String; @@ -156,7 +155,10 @@ impl CrossDocumentStrategy { /// Set documents to search. pub fn with_documents(mut self, documents: Vec) -> Self { - self.documents = documents.into_iter().take(self.config.max_documents).collect(); + self.documents = documents + .into_iter() + .take(self.config.max_documents) + .collect(); self } @@ -228,7 +230,10 @@ impl CrossDocumentStrategy { let children = doc.tree.children(root_id); // Evaluate top-level nodes to find entry points - let evaluations = self.inner.evaluate_nodes(&doc.tree, &children, context).await; + let evaluations = self + .inner + .evaluate_nodes(&doc.tree, &children, context) + .await; // Collect results with scores above threshold let mut scored_nodes: Vec<(NodeId, NodeEvaluation)> = children @@ -238,7 +243,11 @@ impl CrossDocumentStrategy { .collect(); // Sort by score descending - scored_nodes.sort_by(|a, b| b.1.score.partial_cmp(&a.1.score).unwrap_or(std::cmp::Ordering::Equal)); + scored_nodes.sort_by(|a, b| { + b.1.score + .partial_cmp(&a.1.score) + .unwrap_or(std::cmp::Ordering::Equal) + }); // Limit results per document scored_nodes.truncate(self.config.max_results_per_doc); @@ -254,20 +263,27 @@ impl CrossDocumentStrategy { } /// Merge results from all documents. 
- fn merge_results(&self, doc_results: Vec) -> Vec<(DocumentId, NodeId, NodeEvaluation)> { + fn merge_results( + &self, + doc_results: Vec, + ) -> Vec<(DocumentId, NodeId, NodeEvaluation)> { match self.config.merge_strategy { MergeStrategy::TopK => { // Collect all results and sort by score let mut all_results: Vec<_> = doc_results .into_iter() .flat_map(|doc| { - doc.evaluations.into_iter().map(move |(node_id, eval)| { - (doc.doc_id.clone(), node_id, eval) - }) + doc.evaluations + .into_iter() + .map(move |(node_id, eval)| (doc.doc_id.clone(), node_id, eval)) }) .collect(); - all_results.sort_by(|a, b| b.2.score.partial_cmp(&a.2.score).unwrap_or(std::cmp::Ordering::Equal)); + all_results.sort_by(|a, b| { + b.2.score + .partial_cmp(&a.2.score) + .unwrap_or(std::cmp::Ordering::Equal) + }); all_results.truncate(self.config.max_total_results); all_results } @@ -277,9 +293,10 @@ impl CrossDocumentStrategy { doc_results .into_iter() .filter_map(|doc| { - doc.evaluations.into_iter().next().map(|(node_id, eval)| { - (doc.doc_id, node_id, eval) - }) + doc.evaluations + .into_iter() + .next() + .map(|(node_id, eval)| (doc.doc_id, node_id, eval)) }) .take(self.config.max_total_results) .collect() @@ -307,7 +324,11 @@ impl CrossDocumentStrategy { }) .collect(); - all_results.sort_by(|a, b| b.2.score.partial_cmp(&a.2.score).unwrap_or(std::cmp::Ordering::Equal)); + all_results.sort_by(|a, b| { + b.2.score + .partial_cmp(&a.2.score) + .unwrap_or(std::cmp::Ordering::Equal) + }); all_results.truncate(self.config.max_total_results); all_results } @@ -317,13 +338,17 @@ impl CrossDocumentStrategy { let mut all_results: Vec<_> = doc_results .into_iter() .flat_map(|doc| { - doc.evaluations.into_iter().map(move |(node_id, eval)| { - (doc.doc_id.clone(), node_id, eval) - }) + doc.evaluations + .into_iter() + .map(move |(node_id, eval)| (doc.doc_id.clone(), node_id, eval)) }) .collect(); - all_results.sort_by(|a, b| b.2.score.partial_cmp(&a.2.score).unwrap_or(std::cmp::Ordering::Equal)); + all_results.sort_by(|a, b| { + b.2.score + .partial_cmp(&a.2.score) + .unwrap_or(std::cmp::Ordering::Equal) + }); // Apply graph-based boosting self.apply_graph_boost(&mut all_results, 0.15); diff --git a/rust/src/retrieval/strategy/hybrid.rs b/rust/src/retrieval/strategy/hybrid.rs index f301d97b..c60572e5 100644 --- a/rust/src/retrieval/strategy/hybrid.rs +++ b/rust/src/retrieval/strategy/hybrid.rs @@ -11,9 +11,9 @@ use async_trait::async_trait; use super::r#trait::{NodeEvaluation, RetrievalStrategy, StrategyCapabilities}; use crate::document::{DocumentTree, NodeId}; -use crate::retrieval::search::{Bm25Engine, FieldDocument, FieldWeights}; -use crate::retrieval::types::{NavigationDecision, QueryComplexity}; use crate::retrieval::RetrievalContext; +use crate::retrieval::search::{Bm25Engine, FieldDocument}; +use crate::retrieval::types::{NavigationDecision, QueryComplexity}; /// Configuration for hybrid retrieval. 
#[derive(Debug, Clone)] @@ -39,7 +39,7 @@ pub struct HybridConfig { impl Default for HybridConfig { fn default() -> Self { Self { - pre_filter_ratio: 0.3, // Keep top 30% + pre_filter_ratio: 0.3, // Keep top 30% min_candidates: 2, max_candidates: 5, auto_accept_threshold: 0.85, @@ -257,7 +257,9 @@ impl RetrievalStrategy for HybridStrategy { context: &RetrievalContext, ) -> NodeEvaluation { // Delegate to LLM strategy for single node - self.llm_strategy.evaluate_node(tree, node_id, context).await + self.llm_strategy + .evaluate_node(tree, node_id, context) + .await } async fn evaluate_nodes( @@ -275,7 +277,10 @@ impl RetrievalStrategy for HybridStrategy { // If no BM25 scores available, fall back to LLM only if bm25_scores.is_empty() { - return self.llm_strategy.evaluate_nodes(tree, node_ids, context).await; + return self + .llm_strategy + .evaluate_nodes(tree, node_ids, context) + .await; } // Create a score map for quick lookup @@ -341,7 +346,10 @@ impl RetrievalStrategy for HybridStrategy { // Call LLM for filtered candidates if !llm_nodes.is_empty() { - let llm_results = self.llm_strategy.evaluate_nodes(tree, &llm_nodes, context).await; + let llm_results = self + .llm_strategy + .evaluate_nodes(tree, &llm_nodes, context) + .await; // Map LLM results back with combined scores let mut llm_iter = llm_results.into_iter(); @@ -453,7 +461,8 @@ mod tests { #[test] fn test_combine_scores() { - let strategy = HybridStrategy::new(Box::new(crate::retrieval::strategy::KeywordStrategy::new())); + let strategy = + HybridStrategy::new(Box::new(crate::retrieval::strategy::KeywordStrategy::new())); let combined = strategy.combine_scores(0.8, 0.6); // 0.8 * 0.4 + 0.6 * 0.6 = 0.32 + 0.36 = 0.68 diff --git a/rust/src/retrieval/strategy/mod.rs b/rust/src/retrieval/strategy/mod.rs index 345e90f9..27e3a8c3 100644 --- a/rust/src/retrieval/strategy/mod.rs +++ b/rust/src/retrieval/strategy/mod.rs @@ -20,13 +20,7 @@ mod page_range; mod semantic; mod r#trait; -pub use cross_document::{ - CrossDocumentConfig, CrossDocumentStrategy, DocumentEntry, DocumentId, DocumentResult, - MergeStrategy, -}; pub use hybrid::{HybridConfig, HybridStrategy}; pub use keyword::KeywordStrategy; pub use llm::LlmStrategy; -pub use page_range::{PageRange, PageRangeConfig, PageRangeStrategy}; -pub use semantic::SemanticStrategy; -pub use r#trait::{NodeEvaluation, RetrievalStrategy, StrategyCapabilities, StrategyCost}; +pub use r#trait::RetrievalStrategy; diff --git a/rust/src/retrieval/strategy/page_range.rs b/rust/src/retrieval/strategy/page_range.rs index e362fb51..1dc26900 100644 --- a/rust/src/retrieval/strategy/page_range.rs +++ b/rust/src/retrieval/strategy/page_range.rs @@ -10,8 +10,8 @@ use async_trait::async_trait; use super::r#trait::{NodeEvaluation, RetrievalStrategy, StrategyCapabilities}; use crate::document::{DocumentTree, NodeId}; -use crate::retrieval::types::{NavigationDecision, QueryComplexity}; use crate::retrieval::RetrievalContext; +use crate::retrieval::types::{NavigationDecision, QueryComplexity}; /// A page range for filtering. #[derive(Debug, Clone, Copy, PartialEq, Eq)] @@ -30,12 +30,18 @@ impl PageRange { /// Create a range from a single page. pub fn single(page: usize) -> Self { - Self { start: page, end: page } + Self { + start: page, + end: page, + } } /// Create a range starting from a page to the end. pub fn from(start: usize) -> Self { - Self { start, end: usize::MAX } + Self { + start, + end: usize::MAX, + } } /// Create a range from the beginning to a page. 
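`PageRange` is only re-wrapped here, but together with the `expanded_range` hunk just below it implies a small, total API: ranges are inclusive, an open end is modeled as `usize::MAX`, and context expansion saturates instead of wrapping. A compact sketch under those assumptions follows; `contains` is invented for illustration, since the diff does not show the membership test.

```rust
/// Inclusive page range; an open end is modeled as `usize::MAX`.
#[derive(Debug, Clone, Copy, PartialEq, Eq)]
struct PageRange {
    start: usize,
    end: usize,
}

impl PageRange {
    fn single(page: usize) -> Self {
        Self { start: page, end: page }
    }

    fn from(start: usize) -> Self {
        Self { start, end: usize::MAX }
    }

    /// Membership test (assumed; not shown in the diff).
    fn contains(&self, page: usize) -> bool {
        page >= self.start && page <= self.end
    }

    /// Widen by `n` context pages on both sides. Saturating arithmetic
    /// keeps low starts from underflowing and open ends from wrapping.
    fn expand(&self, n: usize) -> Self {
        Self {
            start: self.start.saturating_sub(n),
            end: self.end.saturating_add(n),
        }
    }
}

fn main() {
    let range = PageRange::single(5).expand(2);
    assert!(range.contains(3) && range.contains(7) && !range.contains(8));

    let open = PageRange::from(10).expand(1);
    assert_eq!(open.end, usize::MAX); // saturating_add leaves open ranges open
}
```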
@@ -70,7 +76,10 @@ impl PageRange { impl Default for PageRange { fn default() -> Self { - Self { start: 1, end: usize::MAX } + Self { + start: 1, + end: usize::MAX, + } } } @@ -196,8 +205,16 @@ impl PageRangeStrategy { } PageRange { - start: self.config.range.start.saturating_sub(self.config.expand_context_pages), - end: self.config.range.end.saturating_add(self.config.expand_context_pages), + start: self + .config + .range + .start + .saturating_sub(self.config.expand_context_pages), + end: self + .config + .range + .end + .saturating_add(self.config.expand_context_pages), } } @@ -307,7 +324,10 @@ impl RetrievalStrategy for PageRangeStrategy { // Evaluate included nodes with inner strategy if !included.is_empty() { let included_ids: Vec = included.iter().map(|(_, id)| *id).collect(); - let inner_results = self.inner.evaluate_nodes(tree, &included_ids, context).await; + let inner_results = self + .inner + .evaluate_nodes(tree, &included_ids, context) + .await; // Map results back to original positions for ((orig_idx, _), eval) in included.into_iter().zip(inner_results.into_iter()) { diff --git a/rust/src/retrieval/types.rs b/rust/src/retrieval/types.rs index 649163d7..ec3e25dd 100644 --- a/rust/src/retrieval/types.rs +++ b/rust/src/retrieval/types.rs @@ -259,7 +259,10 @@ impl RetrieveOptions { /// Set the cross-document graph for graph-aware retrieval boosting. #[must_use] - pub fn with_document_graph(mut self, graph: std::sync::Arc) -> Self { + pub fn with_document_graph( + mut self, + graph: std::sync::Arc, + ) -> Self { self.document_graph = Some(graph); self } @@ -564,11 +567,11 @@ impl ReasoningChain { self.steps .iter() .map(|s| { - let node_info = s - .title - .as_deref() - .unwrap_or("(no node)"); - format!("[{}] {} (score={:.2}): {}", s.stage, node_info, s.score, s.reasoning) + let node_info = s.title.as_deref().unwrap_or("(no node)"); + format!( + "[{}] {} (score={:.2}): {}", + s.stage, node_info, s.score, s.reasoning + ) }) .collect::>() .join("\n") diff --git a/rust/src/storage/backend/file.rs b/rust/src/storage/backend/file.rs index 0eabc5c8..2933d806 100644 --- a/rust/src/storage/backend/file.rs +++ b/rust/src/storage/backend/file.rs @@ -7,7 +7,7 @@ use std::fs; use std::path::{Path, PathBuf}; use std::sync::RwLock; -use tracing::{debug, warn}; +use tracing::debug; use super::StorageBackend; use crate::Error; diff --git a/rust/src/storage/backend/mod.rs b/rust/src/storage/backend/mod.rs index b8d7ccef..a8bc8053 100644 --- a/rust/src/storage/backend/mod.rs +++ b/rust/src/storage/backend/mod.rs @@ -31,5 +31,4 @@ mod memory; mod trait_def; pub use file::FileBackend; -pub use memory::MemoryBackend; pub use trait_def::StorageBackend; diff --git a/rust/src/storage/lock.rs b/rust/src/storage/lock.rs index 6318e8a5..feb484ba 100644 --- a/rust/src/storage/lock.rs +++ b/rust/src/storage/lock.rs @@ -62,7 +62,6 @@ impl FileLock { // Try to acquire the lock #[cfg(unix)] { - use std::os::unix::fs::MetadataExt; let fd = std::os::unix::io::AsRawFd::as_raw_fd(&file); let result = if exclusive { diff --git a/rust/src/storage/mod.rs b/rust/src/storage/mod.rs index bf50d96e..b10019ce 100644 --- a/rust/src/storage/mod.rs +++ b/rust/src/storage/mod.rs @@ -42,15 +42,5 @@ mod persistence; pub mod workspace; // Re-export main types -pub use backend::{FileBackend, MemoryBackend, StorageBackend}; -pub use cache::DocumentCache; -pub use codec::{Codec, GzipCodec, IdentityCodec, codec_from_config}; -pub use lock::{FileLock, ScopedLock}; -pub use migration::{CURRENT_VERSION, Migration, MigrationContext, 
Migrator}; -pub use persistence::{ - DocumentMeta, PageContent, PersistedDocument, PersistenceOptions, load_document, - load_document_from_bytes, load_document_with_options, load_index, load_index_from_bytes, - load_index_with_options, save_document, save_document_to_bytes, save_document_with_options, - save_index, save_index_to_bytes, save_index_with_options, -}; -pub use workspace::{DocumentMetaEntry, Workspace, WorkspaceOptions}; +pub use persistence::{DocumentMeta, PersistedDocument}; +pub use workspace::Workspace; diff --git a/rust/src/storage/persistence.rs b/rust/src/storage/persistence.rs index fece82a8..a1c2d9e8 100644 --- a/rust/src/storage/persistence.rs +++ b/rust/src/storage/persistence.rs @@ -12,7 +12,7 @@ use serde::{Deserialize, Serialize}; use sha2::{Digest, Sha256}; use std::fs::File; -use std::io::{BufReader, BufWriter, Read, Write}; +use std::io::{BufReader, BufWriter, Write}; use std::path::{Path, PathBuf}; use crate::Error; @@ -53,14 +53,19 @@ pub struct DocumentMeta { pub modified_at: chrono::DateTime, // === Processing State (for incremental updates) === - /// Content fingerprint for change detection. - #[serde(default, skip_serializing_if = "crate::utils::fingerprint::Fingerprint::is_zero")] + #[serde( + default, + skip_serializing_if = "crate::utils::fingerprint::Fingerprint::is_zero" + )] pub content_fingerprint: crate::utils::fingerprint::Fingerprint, /// Logic fingerprint (hash of pipeline configuration used to produce this document). /// If the pipeline config changes, a full reprocess is needed even if content didn't change. - #[serde(default, skip_serializing_if = "crate::utils::fingerprint::Fingerprint::is_zero")] + #[serde( + default, + skip_serializing_if = "crate::utils::fingerprint::Fingerprint::is_zero" + )] pub logic_fingerprint: crate::utils::fingerprint::Fingerprint, /// Processing version (incremented when algorithm changes). @@ -171,7 +176,11 @@ impl DocumentMeta { } /// Check if the document needs reprocessing. - pub fn needs_reprocessing(&self, current_fp: &crate::utils::fingerprint::Fingerprint, current_version: u32) -> bool { + pub fn needs_reprocessing( + &self, + current_fp: &crate::utils::fingerprint::Fingerprint, + current_version: u32, + ) -> bool { // Never processed if self.processing_version == 0 { return true; diff --git a/rust/src/storage/workspace.rs b/rust/src/storage/workspace.rs index 974f0e68..052e9b75 100644 --- a/rust/src/storage/workspace.rs +++ b/rust/src/storage/workspace.rs @@ -33,7 +33,7 @@ //! 
``` use std::collections::HashMap; -use std::path::{Path, PathBuf}; +use std::path::PathBuf; use std::sync::Arc; use serde::{Deserialize, Serialize}; @@ -462,7 +462,11 @@ impl Workspace { })?; inner.backend.put(Self::GRAPH_KEY, &bytes)?; inner.document_graph = Some(graph.clone()); - info!("Persisted document graph ({} nodes, {} edges)", graph.node_count(), graph.edge_count()); + info!( + "Persisted document graph ({} nodes, {} edges)", + graph.node_count(), + graph.edge_count() + ); Ok(()) } @@ -562,121 +566,4 @@ mod tests { let tree = DocumentTree::new("Root", "Content"); PersistedDocument::new(meta, tree) } - - #[tokio::test] - async fn test_async_workspace_create() { - let backend = Arc::new(super::super::backend::MemoryBackend::new()); - let workspace = Workspace::with_backend(backend).await.unwrap(); - - assert!(workspace.is_empty().await); - assert_eq!(workspace.len().await, 0); - } - - #[tokio::test] - async fn test_async_workspace_add_and_load() { - let backend = Arc::new(super::super::backend::MemoryBackend::new()); - let workspace = Workspace::with_backend(backend).await.unwrap(); - - let doc = create_test_doc("doc-1"); - workspace.add(&doc).await.unwrap(); - - assert_eq!(workspace.len().await, 1); - assert!(workspace.contains("doc-1").await); - - let loaded = workspace.load("doc-1").await.unwrap(); - assert!(loaded.is_some()); - assert_eq!(loaded.unwrap().meta.id, "doc-1"); - } - - #[tokio::test] - async fn test_async_workspace_remove() { - let backend = Arc::new(super::super::backend::MemoryBackend::new()); - let workspace = Workspace::with_backend(backend).await.unwrap(); - - let doc = create_test_doc("doc-1"); - workspace.add(&doc).await.unwrap(); - - let removed = workspace.remove("doc-1").await.unwrap(); - assert!(removed); - assert!(workspace.is_empty().await); - - let removed_again = workspace.remove("doc-1").await.unwrap(); - assert!(!removed_again); - } - - #[tokio::test] - async fn test_async_workspace_cache() { - let backend = Arc::new(super::super::backend::MemoryBackend::new()); - let workspace = Workspace::with_backend(backend).await.unwrap(); - - let doc = create_test_doc("doc-1"); - workspace.add(&doc).await.unwrap(); - - // First load with caching - let _ = workspace.load_and_cache("doc-1").await.unwrap(); - let stats = workspace.cache_stats().await; - assert_eq!(stats.misses, 1); - - // Second load should hit cache - let _ = workspace.load_and_cache("doc-1").await.unwrap(); - let stats = workspace.cache_stats().await; - assert_eq!(stats.hits, 1); - } - - #[tokio::test] - async fn test_async_workspace_list_documents() { - let backend = Arc::new(super::super::backend::MemoryBackend::new()); - let workspace = Workspace::with_backend(backend).await.unwrap(); - - workspace.add(&create_test_doc("doc-1")).await.unwrap(); - workspace.add(&create_test_doc("doc-2")).await.unwrap(); - workspace.add(&create_test_doc("doc-3")).await.unwrap(); - - let docs = workspace.list_documents().await; - assert_eq!(docs.len(), 3); - } - - #[tokio::test] - async fn test_async_workspace_get_meta() { - let backend = Arc::new(super::super::backend::MemoryBackend::new()); - let workspace = Workspace::with_backend(backend).await.unwrap(); - - let doc = create_test_doc("doc-1"); - workspace.add(&doc).await.unwrap(); - - let meta = workspace.get_meta("doc-1").await; - assert!(meta.is_some()); - let meta = meta.unwrap(); - assert_eq!(meta.id, "doc-1"); - assert_eq!(meta.doc_name, "Test Doc"); - assert_eq!(meta.doc_type, "md"); - } - - #[tokio::test] - async fn 
test_async_workspace_concurrent_access() { - let backend = Arc::new(super::super::backend::MemoryBackend::new()); - let workspace = Arc::new(Workspace::with_backend(backend).await.unwrap()); - - // Spawn multiple concurrent tasks - let mut handles = vec![]; - - for i in 0..10 { - let ws = workspace.clone(); - let handle = tokio::spawn(async move { - let id = format!("doc-{}", i); - let doc = create_test_doc(&id); - ws.add(&doc).await.unwrap(); - let loaded = ws.load(&id).await.unwrap(); - assert!(loaded.is_some()); - }); - handles.push(handle); - } - - // Wait for all tasks - for handle in handles { - handle.await.unwrap(); - } - - assert_eq!(workspace.len().await, 10); - } } diff --git a/rust/src/throttle/mod.rs b/rust/src/throttle/mod.rs index 0e07c258..3bf6467f 100644 --- a/rust/src/throttle/mod.rs +++ b/rust/src/throttle/mod.rs @@ -9,26 +9,6 @@ //! - **Rate Limiter** — Token bucket algorithm to limit requests per time period //! - **Concurrency Controller** — Combined semaphore + rate limiter //! -//! # Architecture -//! -//! ```text -//! ┌─────────────────────────────────────────────────────────────────┐ -//! │ LlmClient │ -//! │ │ -//! │ complete() ──▶ [Rate Limiter] ──▶ [Semaphore] ──▶ API Call │ -//! │ │ │ │ -//! │ 令牌桶限制 并发数限制 │ -//! │ │ -//! │ ┌─────────────────────────────────────────────────────────┐ │ -//! │ │ ConcurrencyController │ │ -//! │ │ │ │ -//! │ │ ┌─────────────┐ ┌─────────────┐ │ │ -//! │ │ │RateLimiter │ │ Semaphore │ │ │ -//! │ │ │(governor) │ │(tokio) │ │ │ -//! │ │ └─────────────┘ └─────────────┘ │ │ -//! │ └─────────────────────────────────────────────────────────┘ │ -//! └─────────────────────────────────────────────────────────────────┘ -//! ``` //! //! # Example //! @@ -60,4 +40,3 @@ mod rate_limiter; pub use config::ConcurrencyConfig; pub use controller::ConcurrencyController; -pub use rate_limiter::RateLimiter; diff --git a/rust/src/utils/mod.rs b/rust/src/utils/mod.rs index 2a7f4198..c6fd9b17 100644 --- a/rust/src/utils/mod.rs +++ b/rust/src/utils/mod.rs @@ -9,16 +9,9 @@ //! - **Timing** — Performance measurement utilities //! - **Format** — Text and number formatting utilities +pub mod fingerprint; mod format; mod timing; mod token; -pub mod fingerprint; -pub use format::{ - clean_whitespace, format_bytes, format_number, format_percent, indent, line_count, truncate, - truncate_words, word_count, -}; -pub use timing::{Timer, format_duration, format_duration_compact}; -pub use token::{estimate_tokens, estimate_tokens_batch, estimate_tokens_fast}; -// Fingerprint -pub use fingerprint::{Fingerprint, Fingerprinter, NodeFingerprint}; \ No newline at end of file +pub use token::estimate_tokens;
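The slimmed-down `throttle` module still composes the two primitives its remaining docs name: a token-bucket rate limiter in front of a semaphore that caps in-flight requests (the removed diagram labeled these the token-bucket limit and the concurrency limit, and credited them to `governor` and tokio respectively). Below is a self-contained sketch of that composition using only tokio; the field names, refill loop, and polling interval are assumptions, and the crate itself re-exports `ConcurrencyController` rather than this code.

```rust
use std::sync::Arc;
use tokio::sync::{Mutex, OwnedSemaphorePermit, Semaphore};
use tokio::time::{sleep, Duration, Instant};

/// Sketch of a concurrency controller: a token bucket gates request *rate*,
/// a semaphore gates request *concurrency*.
struct ConcurrencyController {
    semaphore: Arc<Semaphore>,
    bucket: Mutex<(f64, Instant)>, // (available tokens, last refill time)
    rate_per_sec: f64,
    burst: f64,
}

impl ConcurrencyController {
    fn new(max_concurrent: usize, rate_per_sec: f64, burst: f64) -> Self {
        Self {
            semaphore: Arc::new(Semaphore::new(max_concurrent)),
            bucket: Mutex::new((burst, Instant::now())),
            rate_per_sec,
            burst,
        }
    }

    /// Wait for a rate token, then for a concurrency permit, matching the
    /// order in the removed diagram. Dropping the returned permit frees
    /// the concurrency slot.
    async fn acquire(&self) -> OwnedSemaphorePermit {
        loop {
            let mut bucket = self.bucket.lock().await;
            let (tokens, last) = &mut *bucket;
            // Refill proportionally to elapsed time, capped at burst size.
            *tokens = (*tokens + last.elapsed().as_secs_f64() * self.rate_per_sec)
                .min(self.burst);
            *last = Instant::now();
            if *tokens >= 1.0 {
                *tokens -= 1.0;
                break;
            }
            drop(bucket);
            sleep(Duration::from_millis(20)).await;
        }
        self.semaphore
            .clone()
            .acquire_owned()
            .await
            .expect("semaphore closed")
    }
}

#[tokio::main]
async fn main() {
    let ctl = Arc::new(ConcurrencyController::new(4, 10.0, 10.0));
    let _permit = ctl.acquire().await;
    println!("permit acquired: at most 4 calls in flight, roughly 10/s");
}
```

Taking the rate token before the permit means a queued task never burns a token while it cannot run anyway only in the opposite ordering; here a task may hold a token while waiting on the semaphore, which is the simpler trade-off the diagram depicts.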