diff --git a/examples/python/advanced/main.py b/examples/python/advanced/main.py
index d223ad02..001dde57 100644
--- a/examples/python/advanced/main.py
+++ b/examples/python/advanced/main.py
@@ -91,8 +91,9 @@ def main():
     for q in questions:
         result = engine.query(doc_id, q)
         print(f"Q: {q}")
-        print(f"A: {result.content[:150]}...")
-        print(f" Score: {result.score:.2f}\n")
+        if item := result.single():
+            print(f"A: {item.content[:150]}...")
+            print(f" Score: {item.score:.2f}\n")

     # Cleanup
     engine.remove(doc_id)
diff --git a/examples/python/basic/main.py b/examples/python/basic/main.py
index 4ae34b42..9ee5f1c5 100644
--- a/examples/python/basic/main.py
+++ b/examples/python/basic/main.py
@@ -59,8 +59,9 @@ def main():
     # Query
     result = engine.query(doc_id, "How do I install vectorless?")
     print("Query: How do I install vectorless?")
-    print(f"Score: {result.score:.2f}")
-    print(f"Result: {result.content[:200]}...\n")
+    if item := result.single():
+        print(f"Score: {item.score:.2f}")
+        print(f"Result: {item.content[:200]}...\n")

     # Cleanup
     engine.remove(doc_id)
diff --git a/examples/python/custom_config/main.py b/examples/python/custom_config/main.py
index d6e0bda4..eda1095c 100644
--- a/examples/python/custom_config/main.py
+++ b/examples/python/custom_config/main.py
@@ -94,14 +94,16 @@ def main():
     # Query
     result = engine.query(doc_id, "How do I install the product?")
     print("Query: How do I install the product?")
-    print(f"Score: {result.score:.2f}")
-    print(f"Result: {result.content}\n")
+    if item := result.single():
+        print(f"Score: {item.score:.2f}")
+        print(f"Result: {item.content}\n")

     # Another query
     result = engine.query(doc_id, "What features are available?")
     print("Query: What features are available?")
-    print(f"Score: {result.score:.2f}")
-    print(f"Result: {result.content}\n")
+    if item := result.single():
+        print(f"Score: {item.score:.2f}")
+        print(f"Result: {item.content}\n")

     # Cleanup
     engine.remove(doc_id)
diff --git a/examples/rust/advanced.rs b/examples/rust/advanced.rs
index bc89d756..56a3213a 100644
--- a/examples/rust/advanced.rs
+++ b/examples/rust/advanced.rs
@@ -42,10 +42,12 @@ async fn main() -> vectorless::Result<()> {
         .query(QueryContext::new("What features does Vectorless provide?").with_doc_id(&doc_id))
         .await?;
     println!("Query: What features does Vectorless provide?");
-    println!("Score: {:.2}", result.score);
-    if !result.content.is_empty() {
-        let preview: String = result.content.chars().take(200).collect();
-        println!("Result: {}...\n", preview);
+    if let Some(item) = result.single() {
+        println!("Score: {:.2}", item.score);
+        if !item.content.is_empty() {
+            let preview: String = item.content.chars().take(200).collect();
+            println!("Result: {}...\n", preview);
+        }
     }

     // Cleanup
diff --git a/examples/rust/basic.rs b/examples/rust/basic.rs
index 5d5df2bd..bcc86d3c 100644
--- a/examples/rust/basic.rs
+++ b/examples/rust/basic.rs
@@ -42,10 +42,12 @@ async fn main() -> vectorless::Result<()> {
         .await
     {
         Ok(result) => {
-            println!("Score: {:.2}", result.score);
-            if !result.content.is_empty() {
-                let preview: String = result.content.chars().take(150).collect();
-                println!("Result: {}...", preview);
+            if let Some(item) = result.single() {
+                println!("Score: {:.2}", item.score);
+                if !item.content.is_empty() {
+                    let preview: String = item.content.chars().take(150).collect();
+                    println!("Result: {}...", preview);
+                }
             }
         }
         Err(e) => println!("Query: {}", e),
diff --git a/examples/rust/custom_config.rs b/examples/rust/custom_config.rs
index b916143b..14c33b11 100644
--- a/examples/rust/custom_config.rs
+++ b/examples/rust/custom_config.rs
@@ -53,10 +53,12 @@ async fn main() -> vectorless::Result<()> {
         .query(QueryContext::new("What is Vectorless?").with_doc_id(&doc_id))
         .await?;
     println!("Query: What is Vectorless?");
-    println!("Score: {:.2}", result.score);
-    if !result.content.is_empty() {
-        let preview: String = result.content.chars().take(200).collect();
-        println!("Result: {}...\n", preview);
+    if let Some(item) = result.single() {
+        println!("Score: {:.2}", item.score);
+        if !item.content.is_empty() {
+            let preview: String = item.content.chars().take(200).collect();
+            println!("Result: {}...\n", preview);
+        }
     }

     // Cleanup
diff --git a/examples/rust/events.rs b/examples/rust/events.rs
index 7d5c99c2..4dc558d1 100644
--- a/examples/rust/events.rs
+++ b/examples/rust/events.rs
@@ -138,11 +138,13 @@ The event system uses handlers that can be attached to the engine builder.
     // 5. Show results
     println!("Step 5: Query result:");
-    println!("  - Score: {:.2}", result.score);
-    println!("  - Nodes: {}", result.node_ids.len());
-    if !result.content.is_empty() {
-        let preview: String = result.content.chars().take(100).collect();
-        println!("  - Content: {}...", preview);
+    if let Some(item) = result.single() {
+        println!("  - Score: {:.2}", item.score);
+        println!("  - Nodes: {}", item.node_ids.len());
+        if !item.content.is_empty() {
+            let preview: String = item.content.chars().take(100).collect();
+            println!("  - Content: {}...", preview);
+        }
     }
     println!();

diff --git a/examples/rust/graph.rs b/examples/rust/graph.rs
new file mode 100644
index 00000000..d4e3e3a3
--- /dev/null
+++ b/examples/rust/graph.rs
@@ -0,0 +1,88 @@
+// Copyright (c) 2026 vectorless developers
+// SPDX-License-Identifier: Apache-2.0
+
+//! Document graph example for Vectorless.
+//!
+//! Demonstrates how to retrieve the cross-document relationship graph
+//! after indexing. The graph is automatically rebuilt after each index call,
+//! connecting documents that share keywords via Jaccard similarity.
+//!
+//! # Usage
+//!
+//! ```bash
+//! cargo run --example graph
+//! ```
+
+use vectorless::{EngineBuilder, IndexContext};
+
+#[tokio::main]
+async fn main() -> vectorless::Result<()> {
+    println!("=== Document Graph Example ===\n");
+
+    // 1. Create engine
+    let engine = EngineBuilder::new()
+        .with_workspace("./workspace_graph_example")
+        .build()
+        .await
+        .map_err(|e: vectorless::BuildError| vectorless::Error::Config(e.to_string()))?;
+
+    // 2. Index documents — graph is rebuilt automatically
+    let result = engine
+        .index(IndexContext::from_paths(&["./README.md", "./CLAUDE.md"]))
+        .await?;
+
+    println!("Indexed {} document(s)", result.items.len());
+    for item in &result.items {
+        println!("  - {} ({})", item.name, item.doc_id);
+    }
+    println!();
+
+    // 3. Get the document graph
+    match engine.get_graph().await? {
+        Some(graph) => {
+            println!(
+                "Document graph: {} nodes, {} edges",
+                graph.node_count(),
+                graph.edge_count()
+            );
+
+            // Show document nodes
+            for doc_id in graph.doc_ids() {
+                if let Some(node) = graph.get_node(doc_id) {
+                    println!(
+                        "  Node: {} — {} keyword(s), top: {:?}",
+                        node.title,
+                        node.top_keywords.len(),
+                        node.top_keywords.iter().take(3).map(|kw| &kw.keyword).collect::<Vec<_>>()
+                    );
+
+                    // Show edges (connected documents)
+                    let neighbors = graph.get_neighbors(doc_id);
+                    if !neighbors.is_empty() {
+                        for edge in neighbors {
+                            println!(
+                                "    → {} (weight={:.2}, jaccard={:.2}, shared={})",
+                                edge.target_doc_id,
+                                edge.weight,
+                                edge.evidence.keyword_jaccard,
+                                edge.evidence.shared_keyword_count,
+                            );
+                        }
+                    } else {
+                        println!("    (no connections)");
+                    }
+                }
+            }
+        }
+        None => println!("No graph available (no documents with reasoning index)"),
+    }
+
+    // 4. Cleanup
+    let docs = engine.list().await?;
+    for doc in &docs {
+        engine.remove(&doc.id).await?;
+    }
+
+    println!("\n=== Done ===");
+    Ok(())
+}
diff --git a/examples/rust/markdownflow.rs b/examples/rust/markdownflow.rs
index 7d7988c0..4efbb1cc 100644
--- a/examples/rust/markdownflow.rs
+++ b/examples/rust/markdownflow.rs
@@ -88,19 +88,22 @@
     match client.query(QueryContext::new(query).with_doc_id(&doc_id)).await {
         Ok(result) => {
-            if result.content.is_empty() {
-                println!("  - No relevant content found");
-            } else {
-                println!("  - Found relevant content:");
-                // Print first 200 chars
-                let preview = if result.content.len() > 200 {
-                    format!("{}...", &result.content[..200])
+            if let Some(item) = result.single() {
+                if item.content.is_empty() {
+                    println!("  - No relevant content found");
                 } else {
-                    result.content.clone()
-                };
-                for line in preview.lines().take(5) {
-                    println!("    {}", line);
+                    println!("  - Found relevant content:");
+                    let preview = if item.content.len() > 200 {
+                        format!("{}...", &item.content[..200])
+                    } else {
+                        item.content.clone()
+                    };
+                    for line in preview.lines().take(5) {
+                        println!("    {}", line);
+                    }
                 }
+            } else {
+                println!("  - No results");
             }
         }
         Err(e) => {
diff --git a/python/src/lib.rs b/python/src/lib.rs
index 53cde317..70a43f85 100644
--- a/python/src/lib.rs
+++ b/python/src/lib.rs
@@ -11,7 +11,7 @@ use std::sync::Arc;
 use tokio::runtime::Runtime;

 // Use ::vectorless to avoid conflict with the #[pymodule] named vectorless
-use ::vectorless::client::{Engine, EngineBuilder, IndexContext, IndexItem, IndexResult, QueryContext, QueryResult, DocumentInfo};
+use ::vectorless::client::{Engine, EngineBuilder, IndexContext, IndexItem, IndexResult, QueryContext, QueryResult, QueryResultItem, DocumentInfo, FailedItem};
 use ::vectorless::client::DocumentFormat;
 use ::vectorless::error::Error as RustError;

@@ -181,17 +181,17 @@ fn parse_format(format: &str) -> PyResult<DocumentFormat> {
 }

 // ============================================================
-// QueryResult
+// QueryResultItem
 // ============================================================

-/// Result of a document query.
-#[pyclass(name = "QueryResult")]
-pub struct PyQueryResult {
-    inner: QueryResult,
+/// A single document's query result.
+#[pyclass(name = "QueryResultItem")]
+pub struct PyQueryResultItem {
+    inner: QueryResultItem,
 }

 #[pymethods]
-impl PyQueryResult {
+impl PyQueryResultItem {
     /// The document ID.
     #[getter]
     fn doc_id(&self) -> &str {
@@ -218,7 +218,7 @@
     fn __repr__(&self) -> String {
         format!(
-            "QueryResult(doc_id='{}', score={:.2}, content_len={})",
+            "QueryResultItem(doc_id='{}', score={:.2}, content_len={})",
             self.inner.doc_id,
             self.inner.score,
             self.inner.content.len()
@@ -226,6 +226,91 @@
     }
 }

+// ============================================================
+// FailedItem
+// ============================================================
+
+/// A failed item in a batch operation.
+#[pyclass(name = "FailedItem")]
+pub struct PyFailedItem {
+    inner: FailedItem,
+}
+
+#[pymethods]
+impl PyFailedItem {
+    /// Source description.
+    #[getter]
+    fn source(&self) -> &str {
+        &self.inner.source
+    }
+
+    /// Error message.
+    #[getter]
+    fn error(&self) -> &str {
+        &self.inner.error
+    }
+
+    fn __repr__(&self) -> String {
+        format!("FailedItem(source='{}', error='{}')", self.inner.source, self.inner.error)
+    }
+}
+
+// ============================================================
+// QueryResult
+// ============================================================
+
+/// Result of a document query (may contain results from multiple documents).
+#[pyclass(name = "QueryResult")]
+pub struct PyQueryResult {
+    inner: QueryResult,
+}
+
+#[pymethods]
+impl PyQueryResult {
+    /// Result items (one per document).
+    #[getter]
+    fn items(&self) -> Vec<PyQueryResultItem> {
+        self.inner
+            .items
+            .iter()
+            .map(|i| PyQueryResultItem {
+                inner: i.clone(),
+            })
+            .collect()
+    }
+
+    /// Get the first (single-doc) result item.
+    fn single(&self) -> Option<PyQueryResultItem> {
+        self.inner.single().map(|i| PyQueryResultItem {
+            inner: i.clone(),
+        })
+    }
+
+    /// Number of result items.
+    fn len(&self) -> usize {
+        self.inner.len()
+    }
+
+    /// Whether any documents failed.
+    fn has_failures(&self) -> bool {
+        self.inner.has_failures()
+    }
+
+    /// Failed items.
+    #[getter]
+    fn failed(&self) -> Vec<PyFailedItem> {
+        self.inner
+            .failed
+            .iter()
+            .map(|f| PyFailedItem { inner: f.clone() })
+            .collect()
+    }
+
+    fn __repr__(&self) -> String {
+        format!("QueryResult(items={}, failed={})", self.inner.len(), self.inner.failed.len())
+    }
+}
+
 // ============================================================
 // IndexResult
 // ============================================================
@@ -254,11 +339,27 @@
             .collect()
     }

+    /// Failed items.
+    #[getter]
+    fn failed(&self) -> Vec<PyFailedItem> {
+        self.inner
+            .failed
+            .iter()
+            .map(|f| PyFailedItem { inner: f.clone() })
+            .collect()
+    }
+
+    /// Whether any items failed.
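+    /// (Mirrors `IndexResult::has_failures` on the Rust side.)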
+    fn has_failures(&self) -> bool {
+        self.inner.has_failures()
+    }
+
     fn __repr__(&self) -> String {
         format!(
-            "IndexResult(doc_id={:?}, count={})",
+            "IndexResult(doc_id={:?}, count={}, failed={})",
             self.inner.doc_id(),
-            self.inner.items.len()
+            self.inner.items.len(),
+            self.inner.failed.len()
         )
     }
 }
@@ -606,7 +707,9 @@ fn vectorless(_py: Python<'_>, m: &Bound<'_, PyModule>) -> PyResult<()> {
     m.add_class::<PyEngine>()?;
     m.add_class::<PyEngineBuilder>()?;
     m.add_class::<PyIndexResult>()?;
+    m.add_class::<PyQueryResultItem>()?;
     m.add_class::<PyQueryResult>()?;
+    m.add_class::<PyFailedItem>()?;
     m.add_class::<PyIndexItem>()?;
     m.add_class::<PyDocumentInfo>()?;

diff --git a/rust/Cargo.toml b/rust/Cargo.toml
index fae28620..392fa018 100644
--- a/rust/Cargo.toml
+++ b/rust/Cargo.toml
@@ -38,6 +38,10 @@ path = "../examples/rust/events.rs"
 name = "markdownflow"
 path = "../examples/rust/markdownflow.rs"

+[[example]]
+name = "graph"
+path = "../examples/rust/graph.rs"
+
 [dependencies]
 # Async runtime
 tokio = { workspace = true }
diff --git a/rust/src/client/builder.rs b/rust/src/client/builder.rs
index 8a71cc0d..91095da2 100644
--- a/rust/src/client/builder.rs
+++ b/rust/src/client/builder.rs
@@ -420,7 +420,9 @@ impl EngineBuilder {
         }
         // Also set legacy config for backwards compatibility
         if config.summary.api_key.is_none() {
-            config.summary.api_key = Some(std::env::var("OPENAI_API_KEY").unwrap());
+            if let Ok(api_key) = std::env::var("OPENAI_API_KEY") {
+                config.summary.api_key = Some(api_key);
+            }
         }
     }

@@ -542,9 +544,8 @@ impl EngineBuilder {
             .await
             .map_err(|e| BuildError::Workspace(e.to_string()))?;

-        // Create pipeline executor with LLM client if API key is available
-        let executor = if let Some(api_key) = config.summary.api_key.clone() {
-            // Create LlmConfig from SummaryConfig
+        // Create indexer client with LLM-enabled factory if API key is available
+        let indexer = if let Some(api_key) = config.summary.api_key.clone() {
             let llm_config = crate::llm::LlmConfig::new(&config.summary.model)
                 .with_endpoint(config.summary.endpoint.clone())
                 .with_api_key(api_key)
                 .with_temperature(config.summary.temperature);
             let llm_client = crate::llm::LlmClient::new(llm_config);
-            crate::index::PipelineExecutor::with_llm(llm_client)
+            crate::client::indexer::IndexerClient::with_llm(llm_client)
         } else {
-            crate::index::PipelineExecutor::new()
+            crate::client::indexer::IndexerClient::new(crate::index::PipelineExecutor::new())
         };

         // Create pipeline retriever with config
@@ -595,7 +596,7 @@ impl EngineBuilder {
         }

         // Build engine
-        Engine::with_components(config, workspace, retriever, executor)
+        Engine::with_components(config, workspace, retriever, indexer)
             .await
             .map_err(|e| BuildError::Other(e.to_string()))
     }
diff --git a/rust/src/client/engine.rs b/rust/src/client/engine.rs
index e075623a..3989481b 100644
--- a/rust/src/client/engine.rs
+++ b/rust/src/client/engine.rs
@@ -35,23 +35,27 @@
 //! # }
 //! ```
+use std::collections::HashMap;
 use std::sync::Arc;

+use futures::StreamExt;
 use tracing::info;

 use crate::config::Config;
 use crate::error::Result;
-use crate::index::PipelineExecutor;
+use crate::index::PipelineOptions;
+use crate::index::incremental::{self, IndexAction};
 use crate::retrieval::{PipelineRetriever, RetrieveEventReceiver};
-use crate::storage::Workspace;
+use crate::storage::{PersistedDocument, Workspace};
+use crate::utils::fingerprint::Fingerprint;
 use crate::{DocumentTree, Error};

 use super::events::EventEmitter;
-use super::index_context::IndexContext;
+use super::index_context::{IndexContext, IndexSource};
 use super::indexer::IndexerClient;
-use super::query_context::QueryContext;
+use super::query_context::{QueryContext, QueryScope};
 use super::retriever::RetrieverClient;
-use super::types::{DocumentInfo, IndexItem, IndexResult, QueryResult};
+use super::types::{DocumentInfo, FailedItem, IndexItem, IndexMode, IndexResult, QueryResult, QueryResultItem};
 use super::workspace::WorkspaceClient;

 /// The main Engine client.
@@ -94,13 +98,13 @@ impl Engine {
         config: Config,
         workspace: Workspace,
         retriever: PipelineRetriever,
-        executor: PipelineExecutor,
+        indexer: IndexerClient,
     ) -> Result<Self> {
         let config = Arc::new(config);
         let events = EventEmitter::new();

-        // Create indexer client
-        let indexer = IndexerClient::new(executor).with_events(events.clone());
+        // Attach event emitter to indexer
+        let indexer = indexer.with_events(events.clone());

         // Create retriever client
         let retriever =
@@ -149,34 +153,183 @@ impl Engine {
     /// # }
     /// ```
     pub async fn index(&self, ctx: IndexContext) -> Result<IndexResult> {
-        let doc = self.indexer.index(ctx).await?;
+        if ctx.is_empty() {
+            return Err(Error::Config("No document sources provided".to_string()));
+        }
+
+        // Single source: no need for concurrency overhead
+        if ctx.sources.len() == 1 {
+            let source = &ctx.sources[0];
+            let (items, failed) = self.process_source(source, &ctx.options, ctx.name.as_deref()).await;
+            if items.is_empty() && !failed.is_empty() {
+                return Err(Error::Config(format!(
+                    "All {} source(s) failed to index",
+                    failed.len()
+                )));
+            }
+            if !items.is_empty() {
+                if let Err(e) = self.rebuild_graph().await {
+                    tracing::warn!("Graph rebuild failed: {}", e);
+                }
+            }
+            return Ok(IndexResult::with_partial(items, failed));
+        }

-        let item = IndexItem::new(doc.id.clone(), doc.name.clone(), doc.format.clone());
+        // Multiple sources: parallel indexing
+        let concurrency = self.config.concurrency.max_concurrent_requests.min(ctx.sources.len());
+
+        let results: Vec<(Vec<IndexItem>, Vec<FailedItem>)> =
+            futures::stream::iter(&ctx.sources)
+                .map(|source| {
+                    let options = ctx.options.clone();
+                    let name = ctx.name.clone();
+                    let engine = self.clone();
+                    async move {
+                        engine.process_source(source, &options, name.as_deref()).await
+                    }
+                })
+                .buffer_unordered(concurrency)
+                .collect()
+                .await;
+
+        let mut items = Vec::new();
+        let mut failed = Vec::new();
+        for (ok, err) in results {
+            items.extend(ok);
+            failed.extend(err);
+        }

-        let persisted = self.indexer.to_persisted(doc);
+        if items.is_empty() && !failed.is_empty() {
+            return Err(Error::Config(format!(
+                "All {} source(s) failed to index",
+                failed.len()
+            )));
+        }

-        // Save to workspace if configured
-        if let Some(ref workspace) = self.workspace {
-            workspace.save(&persisted).await?;
+        // Rebuild document graph after successful batch index
+        if !items.is_empty() {
+            if let Err(e) = self.rebuild_graph().await {
+                tracing::warn!("Graph rebuild failed: {}", e);
+            }
         }

-        info!("Indexed document: {}", item.doc_id);
-        Ok(IndexResult::new(vec![item]))
+        Ok(IndexResult::with_partial(items, failed))
+    }
+
+    /// Process a single source — resolve action and index.
+    ///
+    /// Returns `(items, failed)`.
+    async fn process_source(
+        &self,
+        source: &IndexSource,
+        options: &super::types::IndexOptions,
+        name: Option<&str>,
+    ) -> (Vec<IndexItem>, Vec<FailedItem>) {
+        let source_label = source.to_string();
+
+        match self.resolve_index_action(source, options).await {
+            Ok(IndexAction::Skip(skip_info)) => {
+                info!("Skipped (unchanged): {}", source_label);
+                (
+                    vec![IndexItem::new(
+                        skip_info.doc_id,
+                        skip_info.name,
+                        skip_info.format,
+                        skip_info.description,
+                        skip_info.page_count,
+                    )],
+                    Vec::new(),
+                )
+            }
+            Ok(IndexAction::FullIndex { existing_id }) => {
+                match self.indexer.index(source, name, options).await {
+                    Ok(doc) => {
+                        let pipeline_options = self.build_pipeline_options(options, doc.format);
+                        let metrics = doc.metrics.clone();
+                        let item = IndexItem::new(
+                            doc.id.clone(),
+                            doc.name.clone(),
+                            doc.format.clone(),
+                            doc.description.clone(),
+                            doc.page_count,
+                        ).with_metrics_opt(metrics);
+                        let persisted = self.indexer.to_persisted_with_options(doc, &pipeline_options);
+
+                        if let Some(ref workspace) = self.workspace {
+                            if let Err(e) = workspace.save(&persisted).await {
+                                return (Vec::new(), vec![FailedItem::new(&source_label, e.to_string())]);
+                            }
+                            // Clean up old document after successful save (atomic: save-first, then remove old)
+                            if let Some(old_id) = &existing_id {
+                                let _ = workspace.remove(old_id).await;
+                            }
+                        }
+
+                        info!("Indexed document: {}", item.doc_id);
+                        (vec![item], Vec::new())
+                    }
+                    Err(e) => {
+                        tracing::warn!("Failed to index {}: {}", source_label, e);
+                        (Vec::new(), vec![FailedItem::new(&source_label, e.to_string())])
+                    }
+                }
+            }
+            Ok(IndexAction::IncrementalUpdate { old_tree, existing_id }) => {
+                info!("Incremental update for: {}", source_label);
+                match self
+                    .indexer
+                    .index_with_existing(source, name, options, Some(&old_tree))
+                    .await
+                {
+                    Ok(mut doc) => {
+                        doc.id = existing_id.clone();
+                        let pipeline_options = self.build_pipeline_options(options, doc.format);
+                        let metrics = doc.metrics.clone();
+                        let item = IndexItem::new(
+                            doc.id.clone(),
+                            doc.name.clone(),
+                            doc.format.clone(),
+                            doc.description.clone(),
+                            doc.page_count,
+                        ).with_metrics_opt(metrics);
+                        let persisted = self.indexer.to_persisted_with_options(doc, &pipeline_options);
+
+                        if let Some(ref workspace) = self.workspace {
+                            // save() is atomic (write-lock + put), no need to remove first
+                            if let Err(e) = workspace.save(&persisted).await {
+                                return (Vec::new(), vec![FailedItem::new(&source_label, e.to_string())]);
+                            }
+                        }
+
+                        info!("Incrementally updated: {}", item.doc_id);
+                        (vec![item], Vec::new())
+                    }
+                    Err(e) => {
+                        tracing::warn!("Incremental update failed for {}: {}", source_label, e);
+                        (Vec::new(), vec![FailedItem::new(&source_label, e.to_string())])
+                    }
+                }
+            }
+            Err(e) => {
+                tracing::warn!("Failed to resolve action for {}: {}", source_label, e);
+                (Vec::new(), vec![FailedItem::new(&source_label, e.to_string())])
+            }
+        }
     }

     // ============================================================
     // Document Querying
     // ============================================================

-    /// Query a document.
+    /// Query documents.
     ///
-    /// Accepts a [`QueryContext`] that specifies the query text, target document,
-    /// and optional retrieval parameters.
+    /// Accepts a [`QueryContext`] that specifies the query text and scope
+    /// (single document, multiple documents, or entire workspace).
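+    ///
+    /// Per-document failures are collected into the result; the call only
+    /// errors if every targeted document fails.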
     ///
     /// # Example
     ///
     /// ```rust,no_run
-    /// use vectorless::client::{EngineBuilder, IndexContext, QueryContext};
+    /// use vectorless::client::{EngineBuilder, QueryContext};
     ///
     /// # #[tokio::main]
     /// # async fn main() -> Result<(), Box<dyn std::error::Error>> {
@@ -185,39 +338,88 @@ impl Engine {
     ///     .build()
     ///     .await?;
     ///
+    /// // Single document
     /// let result = engine.query(
     ///     QueryContext::new("What is the total revenue?")
     ///         .with_doc_id("doc-123")
     /// ).await?;
     ///
-    /// println!("Answer: {}", result.content);
+    /// if let Some(item) = result.single() {
+    ///     println!("Answer: {}", item.content);
+    /// }
+    ///
+    /// // Entire workspace
+    /// let result = engine.query(
+    ///     QueryContext::new("Summarize all documents")
+    /// ).await?;
+    /// for item in &result.items {
+    ///     println!("{}: score={}", item.doc_id, item.score);
+    /// }
     /// # Ok(())
     /// # }
     /// ```
     pub async fn query(&self, ctx: QueryContext) -> Result<QueryResult> {
-        let doc_id = ctx.doc_id.as_deref().ok_or_else(|| {
-            Error::Config("doc_id is required for query".to_string())
-        })?;
+        let doc_ids = self.resolve_scope(&ctx.scope).await?;
+        let mut options = ctx.to_retrieve_options(&self.config);
+
+        // Load document graph for graph-aware retrieval (if enabled)
+        if self.config.graph.enabled {
+            if let Some(ref workspace) = self.workspace {
+                if let Ok(Some(graph)) = workspace.get_graph().await {
+                    options = options.with_document_graph(Arc::new(graph));
+                }
+            }
+        }

-        let tree = self.get_structure(doc_id).await?;
-        let options = ctx.to_retrieve_options(&self.config);
+        let mut items = Vec::with_capacity(doc_ids.len());
+        let mut failed = Vec::new();
+
+        for doc_id in doc_ids {
+            let tree = match self.get_structure(&doc_id).await {
+                Ok(t) => t,
+                Err(e) => {
+                    tracing::warn!("Skipping document {}: {}", doc_id, e);
+                    failed.push(FailedItem::new(&doc_id, e.to_string()));
+                    continue;
+                }
+            };
+
+            match self.retriever.query(&tree, &ctx.query, &options).await {
+                Ok(mut result) => {
+                    result.doc_id = doc_id;
+                    items.push(result);
+                }
+                Err(e) => {
+                    tracing::warn!("Query failed for {}: {}", doc_id, e);
+                    failed.push(FailedItem::new(&doc_id, e.to_string()));
+                }
+            }
+        }

-        let mut result = self.retriever.query(&tree, &ctx.query, &options).await?;
-        result.doc_id = doc_id.to_string();
+        // If everything failed, return error
+        if items.is_empty() && !failed.is_empty() {
+            return Err(Error::Config(format!(
+                "Query failed for all {} document(s)",
+                failed.len()
+            )));
+        }

-        Ok(result)
+        Ok(QueryResult::with_partial(items, failed))
     }

     /// Query a document with streaming results.
     ///
     /// Returns a [`RetrieveEventReceiver`] that yields [`RetrieveEvent`](crate::retrieval::RetrieveEvent)s
     /// as the retrieval pipeline progresses through each stage.
+    ///
+    /// Only supports single-document scope (via `with_doc_id`).
     pub async fn query_stream(&self, ctx: QueryContext) -> Result<RetrieveEventReceiver> {
-        let doc_id = ctx.doc_id.as_deref().ok_or_else(|| {
-            Error::Config("doc_id is required for query".to_string())
-        })?;
+        let doc_id = match &ctx.scope {
+            QueryScope::Single(id) => id.clone(),
+            _ => return Err(Error::Config("query_stream requires a single doc_id".to_string())),
+        };

-        let tree = self.get_structure(doc_id).await?;
+        let tree = self.get_structure(&doc_id).await?;
         let options = ctx.to_retrieve_options(&self.config);

         let rx = self.retriever.query_stream(&tree, &ctx.query, &options).await?;
@@ -271,6 +473,19 @@ impl Engine {
         workspace.clear().await
     }

+    /// Get the cross-document relationship graph.
+    ///
+    /// The graph is automatically rebuilt after indexing documents.
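+    /// Nodes carry per-document keyword profiles; edge weights come from
+    /// Jaccard overlap between those profiles.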
+    /// Returns `None` if no graph has been built yet.
+    pub async fn get_graph(&self) -> Result<Option<crate::graph::DocumentGraph>> {
+        let workspace = self
+            .workspace
+            .as_ref()
+            .ok_or_else(|| Error::Config("No workspace configured".to_string()))?;
+
+        workspace.get_graph().await
+    }
+
     // ============================================================
     // Internal
     // ============================================================
@@ -289,6 +504,165 @@ impl Engine {

         Ok(doc.tree)
     }
+
+    /// Resolve QueryScope into a list of document IDs.
+    async fn resolve_scope(&self, scope: &QueryScope) -> Result<Vec<String>> {
+        match scope {
+            QueryScope::Single(id) => Ok(vec![id.clone()]),
+            QueryScope::Multiple(ids) => Ok(ids.clone()),
+            QueryScope::Workspace => {
+                let docs = self.list().await?;
+                if docs.is_empty() {
+                    return Err(Error::Config("Workspace is empty".to_string()));
+                }
+                Ok(docs.into_iter().map(|d| d.id).collect())
+            }
+        }
+    }
+
+    /// Build pipeline options from client IndexOptions and detected format.
+    fn build_pipeline_options(
+        &self,
+        options: &super::types::IndexOptions,
+        format: crate::parser::DocumentFormat,
+    ) -> PipelineOptions {
+        use crate::index::SummaryStrategy;
+        PipelineOptions {
+            mode: match format {
+                crate::parser::DocumentFormat::Markdown => crate::index::IndexMode::Markdown,
+                crate::parser::DocumentFormat::Pdf => crate::index::IndexMode::Pdf,
+                crate::parser::DocumentFormat::Html => crate::index::IndexMode::Html,
+                crate::parser::DocumentFormat::Docx => crate::index::IndexMode::Docx,
+            },
+            generate_ids: options.generate_ids,
+            summary_strategy: if options.generate_summaries {
+                SummaryStrategy::full()
+            } else {
+                SummaryStrategy::none()
+            },
+            generate_description: options.generate_description,
+            ..Default::default()
+        }
+    }
+
+    /// Rebuild the document graph after indexing, if graph is enabled.
+    async fn rebuild_graph(&self) -> Result<()> {
+        if !self.config.graph.enabled {
+            return Ok(());
+        }
+        let workspace = match self.workspace {
+            Some(ref ws) => ws,
+            None => return Ok(()),
+        };
+
+        // Load all documents and extract keyword profiles
+        let doc_ids = workspace.inner().list_documents().await;
+        let mut builder = crate::graph::DocumentGraphBuilder::new(self.config.graph.clone());
+
+        for doc_id in &doc_ids {
+            if let Some(doc) = workspace.load(doc_id).await? {
+                let keywords = Self::extract_keywords_from_doc(&doc);
+                builder.add_document(
+                    &doc.meta.id,
+                    &doc.meta.name,
+                    &doc.meta.format,
+                    doc.meta.node_count,
+                    keywords,
+                );
+            }
+        }
+
+        let graph = builder.build();
+        workspace.set_graph(&graph).await?;
+        Ok(())
+    }
+
+    /// Extract keyword → weight map from a persisted document's ReasoningIndex.
+    fn extract_keywords_from_doc(doc: &PersistedDocument) -> HashMap<String, f32> {
+        let mut keywords = HashMap::new();
+        if let Some(ref ri) = doc.reasoning_index {
+            for (kw, entries) in ri.all_topic_entries() {
+                let weight: f32 =
+                    entries.iter().map(|e| e.weight).sum::<f32>() / entries.len().max(1) as f32;
+                keywords.insert(kw.clone(), weight);
+            }
+        }
+        keywords
+    }
+
+    /// Resolve what action to take for a source.
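+    ///
+    /// Rough decision order (see the body below): `Force` mode, non-path
+    /// sources, and unknown files get a full index; `Default` mode skips
+    /// files that are already indexed; `Incremental` mode compares stored
+    /// fingerprints via `incremental::resolve_action`.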
+    async fn resolve_index_action(
+        &self,
+        source: &IndexSource,
+        options: &super::types::IndexOptions,
+    ) -> Result<IndexAction> {
+        let workspace = match self.workspace {
+            Some(ref ws) => ws,
+            None => return Ok(IndexAction::FullIndex { existing_id: None }),
+        };
+
+        // Force mode always re-indexes from scratch
+        if options.mode == IndexMode::Force {
+            return Ok(IndexAction::FullIndex { existing_id: None });
+        }
+
+        // Only path sources support incremental indexing
+        let path = match source {
+            IndexSource::Path(p) => p,
+            _ => return Ok(IndexAction::FullIndex { existing_id: None }),
+        };
+
+        // Find if this file has already been indexed
+        let existing_id = match workspace.find_by_source_path(path).await {
+            Some(id) => id,
+            None => return Ok(IndexAction::FullIndex { existing_id: None }), // New file
+        };
+
+        // Default mode: skip if already indexed (no content check)
+        if options.mode == IndexMode::Default {
+            let info = workspace.get_document_info(&existing_id).await?;
+            let (name, format_str, desc, pages) = match info {
+                Some(i) => (i.name, i.format, i.description, i.page_count),
+                None => (String::new(), String::new(), None, None),
+            };
+            return Ok(IndexAction::Skip(incremental::SkipInfo {
+                doc_id: existing_id,
+                name,
+                format: crate::parser::DocumentFormat::from_extension(&format_str)
+                    .unwrap_or(crate::parser::DocumentFormat::Markdown),
+                description: desc,
+                page_count: pages,
+            }));
+        }
+
+        // Incremental mode: load stored document and delegate to resolver
+        let current_bytes = match std::fs::read(path) {
+            Ok(b) => b,
+            Err(_) => return Ok(IndexAction::FullIndex { existing_id: None }),
+        };
+
+        let stored_doc = match workspace.load(&existing_id).await? {
+            Some(d) => d,
+            None => return Ok(IndexAction::FullIndex { existing_id: None }),
+        };
+
+        let format = crate::parser::DocumentFormat::from_extension(&stored_doc.meta.format)
+            .unwrap_or(crate::parser::DocumentFormat::Markdown);
+        let pipeline_options = self.build_pipeline_options(options, format);
+
+        // If logic fingerprint changed, remove old doc before full reprocess
+        let action = incremental::resolve_action(
+            &current_bytes,
+            &stored_doc,
+            &pipeline_options,
+            format,
+        );
+
+        // Note: if FullIndex, old doc cleanup happens in process_source()
+        // after successful save (save-first, then remove old).
+
+        Ok(action)
+    }
 }

 impl Clone for Engine {
diff --git a/rust/src/client/index_context.rs b/rust/src/client/index_context.rs
index 6c038eac..0f551bc8 100644
--- a/rust/src/client/index_context.rs
+++ b/rust/src/client/index_context.rs
@@ -3,19 +3,12 @@
 //! Index context for document indexing operations.
 //!
-//! This module provides [`IndexContext`], a unified type for specifying
-//! document input sources for the [`Engine::index`](super::Engine::index) method.
+//! [`IndexContext`] supports single or multiple document sources:
+//! - **File path** — Load and parse a file from disk
+//! - **Content string** — Parse content directly (HTML, Markdown, text)
+//! - **Byte data** — Parse binary data (PDF, DOCX)
 //!
-//! # Overview
-//!
-//! `IndexContext` supports three input types:
-//! - **File path** - Load and parse a file from disk
-//! - **Content string** - Parse content directly (for HTML, Markdown, text)
-//! - **Byte data** - Parse binary data (for PDF, DOCX)
-//!
-//! # Examples
-//!
-//! ## From file path
+//! # Single document
 //!
 //! ```rust,no_run
 //! use vectorless::client::IndexContext;
 //!
 //! let ctx = IndexContext::from_path("./document.md");
 //! ```
 //!
-//! ## From content string
+//! # Multiple documents
 //!
-//! ```rust
-//! use vectorless::client::IndexContext;
-//! use vectorless::parser::DocumentFormat;
-//!
-//! let html = "<html><body><h1>Title</h1><p>Content</p></body></html>";
-//! let ctx = IndexContext::from_content(html, DocumentFormat::Html)
-//!     .with_name("webpage");
-//! ```
-//!
-//! ## From bytes
-//!
-//! ```rust
+//! ```rust,no_run
 //! use vectorless::client::IndexContext;
-//! use vectorless::parser::DocumentFormat;
 //!
-//! let pdf_bytes = vec![/* PDF binary data */];
-//! let ctx = IndexContext::from_bytes(pdf_bytes, DocumentFormat::Pdf);
+//! let ctx = IndexContext::from_paths(vec!["./doc1.md", "./doc2.pdf"]);
 //! ```
 //!
-//! ## With options
+//! # From directory
 //!
 //! ```rust,no_run
-//! use vectorless::client::{IndexContext, IndexMode};
+//! use vectorless::client::IndexContext;
 //!
-//! let ctx = IndexContext::from_path("./document.pdf")
-//!     .with_mode(IndexMode::Force);
+//! let ctx = IndexContext::from_dir("./documents");
 //! ```

 use std::path::PathBuf;
@@ -64,51 +43,25 @@ use super::types::{IndexMode, IndexOptions};
 // ============================================================

 /// The source of document content for indexing.
-///
-/// This enum represents the different ways a document can be provided
-/// to the indexing pipeline.
 #[derive(Debug, Clone)]
 pub(crate) enum IndexSource {
     /// Load document from a file path.
-    ///
-    /// The format is detected from the file extension.
     Path(PathBuf),

     /// Parse document from a string.
-    ///
-    /// Used for text-based formats like HTML and Markdown.
-    /// The format must be explicitly specified.
     Content {
-        /// The document content as a UTF-8 string.
         data: String,
-        /// The document format.
         format: DocumentFormat,
     },

     /// Parse document from binary data.
-    ///
-    /// Used for binary formats like PDF and DOCX.
-    /// The format must be explicitly specified.
     Bytes {
-        /// The document content as raw bytes.
         data: Vec<u8>,
-        /// The document format.
         format: DocumentFormat,
     },
 }

 impl IndexSource {
-    /// Get the format of this source, if known.
-    ///
-    /// Returns `None` for `Path` sources (format detected from extension).
-    pub fn format(&self) -> Option<DocumentFormat> {
-        match self {
-            IndexSource::Path(_) => None,
-            IndexSource::Content { format, .. } => Some(*format),
-            IndexSource::Bytes { format, .. } => Some(*format),
-        }
-    }
-
     /// Check if this is a path source.
     pub fn is_path(&self) -> bool {
         matches!(self, IndexSource::Path(_))
@@ -131,63 +84,38 @@ impl IndexSource {

 /// Context for document indexing operations.
 ///
-/// `IndexContext` provides a unified interface for specifying document
-/// input sources. It supports files, content strings, and binary data.
-///
-/// # Type Parameters
-///
-/// The context is constructed using one of:
-/// - [`IndexContext::from_path`] - Load from file
-/// - [`IndexContext::from_content`] - Parse string content
-/// - [`IndexContext::from_bytes`] - Parse binary data
-///
-/// Additional configuration can be chained:
-/// - [`with_name`](IndexContext::with_name) - Set document name
-/// - [`with_options`](IndexContext::with_options) - Set indexing options
-/// - [`with_mode`](IndexContext::with_mode) - Set indexing mode
+/// Supports single or multiple document sources. When multiple sources
+/// are provided, each is indexed independently and the results are
+/// collected into [`IndexResult`](super::IndexResult).
 ///
 /// # Examples
 ///
 /// ```rust,no_run
-/// use vectorless::client::{EngineBuilder, IndexContext, IndexMode};
-/// use vectorless::parser::DocumentFormat;
+/// use vectorless::client::IndexContext;
+/// use vectorless::client::DocumentFormat;
 ///
 /// # #[tokio::main]
 /// # async fn main() -> Result<(), Box<dyn std::error::Error>> {
-/// let engine = EngineBuilder::new()
-///     .with_workspace("./data")
-///     .build()
-///     .await?;
+/// # let engine = vectorless::EngineBuilder::new().build().await?;
+/// // Single file
+/// let result = engine.index(IndexContext::from_path("./doc.md")).await?;
 ///
-/// // Index from file
-/// let id1 = engine.index(IndexContext::from_path("./doc.md")).await?;
-///
-/// // Index HTML content
-/// let html = "<html><body><h1>Title</h1><p>Content</p></body></html>";
-/// let id2 = engine.index(
-///     IndexContext::from_content(html, DocumentFormat::Html)
-///         .with_name("webpage")
-/// ).await?;
-///
-/// // Index with force mode
-/// let id3 = engine.index(
-///     IndexContext::from_path("./doc.pdf")
-///         .with_mode(IndexMode::Force)
+/// // Multiple files
+/// let result = engine.index(
+///     IndexContext::from_paths(vec!["./doc1.md", "./doc2.pdf"])
 /// ).await?;
 ///
+/// // Entire directory
+/// let result = engine.index(IndexContext::from_dir("./docs")).await?;
 /// # Ok(())
 /// # }
 /// ```
 #[derive(Debug, Clone)]
 pub struct IndexContext {
-    /// The document source.
-    pub(crate) source: IndexSource,
+    /// Document sources (supports multiple).
+    pub(crate) sources: Vec<IndexSource>,

-    /// Optional document name for metadata.
-    ///
-    /// If not set, the name is derived from:
-    /// - File name (for path sources)
-    /// - "untitled" (for content/bytes sources)
+    /// Optional document name for metadata (single-source only).
     pub(crate) name: Option<String>,

     /// Indexing options.
@@ -195,145 +123,108 @@ pub struct IndexContext {
 }

 impl IndexContext {
-    /// Create an index context from a file path.
+    /// Create from a single file path.
     ///
     /// The document format is automatically detected from the file extension.
-    ///
-    /// # Supported Extensions
-    ///
-    /// - `.md`, `.markdown` → Markdown
-    /// - `.pdf` → PDF
-    /// - `.docx` → DOCX
-    /// - `.html`, `.htm` → HTML
-    /// - `.txt` → Plain text
-    ///
-    /// # Example
-    ///
-    /// ```rust
-    /// use vectorless::client::IndexContext;
-    ///
-    /// let ctx = IndexContext::from_path("./documents/report.pdf");
-    /// ```
     pub fn from_path(path: impl Into<PathBuf>) -> Self {
         Self {
-            source: IndexSource::Path(path.into()),
+            sources: vec![IndexSource::Path(path.into())],
             name: None,
             options: IndexOptions::default(),
         }
     }

-    /// Create an index context from a content string.
-    ///
-    /// Use this for text-based formats where you have the content
-    /// as a string. The format must be explicitly specified.
-    ///
-    /// # Example
-    ///
-    /// ```rust
-    /// use vectorless::client::IndexContext;
-    /// use vectorless::parser::DocumentFormat;
-    ///
-    /// let markdown = "# Title\n\nContent here.";
-    /// let ctx = IndexContext::from_content(markdown, DocumentFormat::Markdown);
-    /// ```
+    /// Create from multiple file paths.
+    pub fn from_paths(paths: impl IntoIterator<Item = impl Into<PathBuf>>) -> Self {
+        Self {
+            sources: paths
+                .into_iter()
+                .map(|p| IndexSource::Path(p.into()))
+                .collect(),
+            name: None,
+            options: IndexOptions::default(),
+        }
+    }
+
+    /// Create from a directory path.
+    ///
+    /// Indexes all supported files in the directory (non-recursive).
+    /// Supported extensions: `.md`, `.pdf`, `.docx`, `.html`, `.txt`.
+    pub fn from_dir(dir: impl Into<PathBuf>) -> Self {
+        let dir = dir.into();
+        let supported_extensions = ["md", "markdown", "pdf", "docx", "html", "htm", "txt"];
+
+        let mut sources = Vec::new();
+        if let Ok(entries) = std::fs::read_dir(&dir) {
+            for entry in entries.flatten() {
+                let path = entry.path();
+                if let Some(ext) = path.extension().and_then(|e| e.to_str()) {
+                    if supported_extensions.contains(&ext.to_lowercase().as_str()) {
+                        sources.push(IndexSource::Path(path));
+                    }
+                }
+            }
+        }
+
+        Self {
+            sources,
+            name: None,
+            options: IndexOptions::default(),
+        }
+    }
+
+    /// Create from a content string.
     pub fn from_content(content: impl Into<String>, format: DocumentFormat) -> Self {
         Self {
-            source: IndexSource::Content {
+            sources: vec![IndexSource::Content {
                 data: content.into(),
                 format,
-            },
+            }],
             name: None,
             options: IndexOptions::default(),
         }
     }

-    /// Create an index context from binary data.
-    ///
-    /// Use this for binary formats like PDF and DOCX where you
-    /// have the raw bytes. The format must be explicitly specified.
-    ///
-    /// # Example
-    ///
-    /// ```rust
-    /// use vectorless::client::IndexContext;
-    /// use vectorless::parser::DocumentFormat;
-    ///
-    /// let pdf_bytes: Vec<u8> = vec![/* PDF binary data */];
-    /// let ctx = IndexContext::from_bytes(pdf_bytes, DocumentFormat::Pdf);
-    /// ```
+    /// Create from binary data.
     pub fn from_bytes(bytes: Vec<u8>, format: DocumentFormat) -> Self {
         Self {
-            source: IndexSource::Bytes {
+            sources: vec![IndexSource::Bytes {
                 data: bytes,
                 format,
-            },
+            }],
             name: None,
             options: IndexOptions::default(),
         }
     }

-    /// Set the document name.
-    ///
-    /// The name is used in document metadata and listings.
-    /// If not set, it's derived from the source.
-    ///
-    /// # Example
-    ///
-    /// ```rust
-    /// use vectorless::client::IndexContext;
-    /// use vectorless::parser::DocumentFormat;
-    ///
-    /// let ctx = IndexContext::from_content("...", DocumentFormat::Html)
-    ///     .with_name("homepage");
-    /// ```
+    /// Set the document name (single-source only).
     pub fn with_name(mut self, name: impl Into<String>) -> Self {
         self.name = Some(name.into());
         self
     }

     /// Set the indexing options.
-    ///
-    /// # Example
-    ///
-    /// ```rust
-    /// use vectorless::client::{IndexContext, IndexOptions, IndexMode};
-    ///
-    /// let options = IndexOptions {
-    ///     mode: IndexMode::Force,
-    ///     ..Default::default()
-    /// };
-    ///
-    /// let ctx = IndexContext::from_path("./doc.md")
-    ///     .with_options(options);
-    /// ```
     pub fn with_options(mut self, options: IndexOptions) -> Self {
         self.options = options;
         self
     }

     /// Set the indexing mode.
-    ///
-    /// This is a convenience method for setting just the mode.
-    ///
-    /// # Modes
-    ///
-    /// - [`IndexMode::Default`] - Skip if already indexed (default)
-    /// - [`IndexMode::Force`] - Always re-index
-    /// - [`IndexMode::Incremental`] - Only re-index changed files
-    ///
-    /// # Example
-    ///
-    /// ```rust
-    /// use vectorless::client::{IndexContext, IndexMode};
-    ///
-    /// let ctx = IndexContext::from_path("./doc.md")
-    ///     .with_mode(IndexMode::Force);
-    /// ```
     pub fn with_mode(mut self, mode: IndexMode) -> Self {
         self.options.mode = mode;
         self
     }

+    /// Number of document sources.
+    pub fn len(&self) -> usize {
+        self.sources.len()
+    }
+
+    /// Check if there are no sources.
+    pub fn is_empty(&self) -> bool {
+        self.sources.is_empty()
+    }
+
     /// Get the document name, if set.
     pub fn name(&self) -> Option<&str> {
         self.name.as_deref()
@@ -386,63 +277,43 @@ mod tests {
     #[test]
     fn test_from_path() {
         let ctx = IndexContext::from_path("./test.md");
-        assert!(ctx.source.is_path());
+        assert_eq!(ctx.len(), 1);
         assert!(ctx.name.is_none());
     }

+    #[test]
+    fn test_from_paths() {
+        let ctx = IndexContext::from_paths(vec!["./a.md", "./b.pdf"]);
+        assert_eq!(ctx.len(), 2);
+    }
+
     #[test]
     fn test_from_content() {
         let ctx = IndexContext::from_content("# Title", DocumentFormat::Markdown);
-        assert!(ctx.source.is_content());
-        assert!(ctx.name.is_none());
+        assert_eq!(ctx.len(), 1);
     }

     #[test]
     fn test_from_bytes() {
         let ctx = IndexContext::from_bytes(vec![1, 2, 3], DocumentFormat::Pdf);
-        assert!(ctx.source.is_bytes());
+        assert_eq!(ctx.len(), 1);
     }

     #[test]
     fn test_with_name() {
         let ctx = IndexContext::from_path("./test.md").with_name("My Document");
-
         assert_eq!(ctx.name(), Some("My Document"));
     }

     #[test]
     fn test_with_mode() {
         let ctx = IndexContext::from_path("./test.md").with_mode(IndexMode::Force);
-
-        assert_eq!(ctx.options.mode, IndexMode::Force);
-    }
-
-    #[test]
-    fn test_chaining() {
-        let ctx = IndexContext::from_content("", DocumentFormat::Html)
-            .with_name("page")
-            .with_mode(IndexMode::Force);
-
-        assert!(ctx.source.is_content());
-        assert_eq!(ctx.name(), Some("page"));
         assert_eq!(ctx.options.mode, IndexMode::Force);
     }

     #[test]
     fn test_from_path_trait() {
         let ctx = IndexContext::from(PathBuf::from("./test.md"));
-        assert!(ctx.source.is_path());
-    }
-
-    #[test]
-    fn test_source_format() {
-        let content_source = IndexSource::Content {
-            data: "test".to_string(),
-            format: DocumentFormat::Html,
-        };
-        assert_eq!(content_source.format(), Some(DocumentFormat::Html));
-
-        let path_source = IndexSource::Path(PathBuf::from("./test.md"));
-        assert_eq!(path_source.format(), None);
+        assert_eq!(ctx.len(), 1);
     }
 }
diff --git a/rust/src/client/indexer.rs b/rust/src/client/indexer.rs
index cb87a0c1..bc10c210 100644
--- a/rust/src/client/indexer.rs
+++ b/rust/src/client/indexer.rs
@@ -21,13 +21,14 @@
 //! ```

 use std::path::{Path, PathBuf};
-use std::sync::{Arc, Mutex};
+use std::sync::Arc;

 use tracing::info;
 use uuid::Uuid;

 use crate::error::{Error, Result};
 use crate::index::{IndexInput, IndexMode, PipelineExecutor, PipelineOptions, SummaryStrategy};
+use crate::llm::LlmClient;
 use crate::parser::DocumentFormat;
 use crate::storage::{DocumentMeta, PersistedDocument};

@@ -38,9 +39,11 @@ use super::types::{IndexOptions, IndexedDocument};
 /// Document indexing client.
 ///
 /// Provides operations for parsing and indexing documents.
+/// Each index operation creates a fresh pipeline executor, enabling
+/// true parallel document indexing without mutex contention.
 pub(crate) struct IndexerClient {
-    /// Pipeline executor.
-    executor: Arc<Mutex<PipelineExecutor>>,
+    /// Factory for creating pipeline executors (one per index operation).
+    executor_factory: Arc<dyn Fn() -> PipelineExecutor + Send + Sync>,

     /// Event emitter.
     events: EventEmitter,
@@ -73,10 +76,20 @@ impl Default for IndexerConfig {
 }

 impl IndexerClient {
-    /// Create a new indexer client.
-    pub fn new(executor: PipelineExecutor) -> Self {
+    /// Create a new indexer client with a default pipeline executor.
+    pub fn new(_executor: PipelineExecutor) -> Self {
         Self {
-            executor: Arc::new(Mutex::new(executor)),
+            executor_factory: Arc::new(PipelineExecutor::new),
             events: EventEmitter::new(),
             config: IndexerConfig::default(),
         }
     }
+
+    /// Create with an LLM-enabled pipeline.
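+    ///
+    /// The shared `LlmClient` is cloned into each executor the factory
+    /// produces, so concurrent index operations never contend on a lock.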
+    pub fn with_llm(client: LlmClient) -> Self {
+        let client = Arc::new(client);
+        Self {
+            executor_factory: Arc::new(move || PipelineExecutor::with_llm((*client).clone())),
+            events: EventEmitter::new(),
+            config: IndexerConfig::default(),
+        }
+    }
@@ -94,59 +107,45 @@ impl IndexerClient {
         self
     }

-    /// Create from an existing executor Arc.
-    pub(crate) fn from_arc(
-        executor: Arc<Mutex<PipelineExecutor>>,
+    /// Create from an executor factory function.
+    pub(crate) fn from_factory(
+        factory: Arc<dyn Fn() -> PipelineExecutor + Send + Sync>,
         events: EventEmitter,
         config: IndexerConfig,
     ) -> Self {
         Self {
-            executor,
+            executor_factory: factory,
             events,
             config,
         }
     }

     /// Index a document from an index context.
-    ///
-    /// This is the main entry point for indexing documents. The context
-    /// specifies the source (path, content, or bytes) and options.
-    ///
-    /// # Errors
-    ///
-    /// Returns an error if:
-    /// - The file does not exist (for path sources)
-    /// - The file format is not supported
-    /// - The pipeline execution fails
-    ///
-    /// # Example
-    ///
-    /// ```rust,ignore
-    /// use vectorless::client::{IndexerClient, IndexContext};
-    /// use vectorless::parser::DocumentFormat;
-    ///
-    /// // From file path
-    /// let doc = indexer.index(IndexContext::from_path("./doc.md")).await?;
-    ///
-    /// // From HTML content
-    /// let html = "<html><body><h1>Title</h1></body></html>";
-    /// let doc = indexer.index(
-    ///     IndexContext::from_content(html, DocumentFormat::Html)
-    ///         .with_name("webpage")
-    /// ).await?;
-    /// ```
-    pub async fn index(&self, ctx: IndexContext) -> Result<IndexedDocument> {
-        match &ctx.source {
-            IndexSource::Path(path) => self.index_from_path(path, &ctx).await,
+    pub async fn index(&self, source: &IndexSource, name: Option<&str>, options: &IndexOptions) -> Result<IndexedDocument> {
+        self.index_with_existing(source, name, options, None).await
+    }
+
+    /// Index a document, optionally reusing an existing tree for incremental updates.
+    pub async fn index_with_existing(
+        &self,
+        source: &IndexSource,
+        name: Option<&str>,
+        options: &IndexOptions,
+        existing_tree: Option<&crate::DocumentTree>,
+    ) -> Result<IndexedDocument> {
+        match source {
+            IndexSource::Path(path) => self.index_from_path(path, name, options, existing_tree).await,
             IndexSource::Content { data, format } => {
-                self.index_from_content(data, *format, &ctx).await
+                self.index_from_content(data, *format, name, options, existing_tree).await
+            }
+            IndexSource::Bytes { data, format } => {
+                self.index_from_bytes(data, *format, name, options, existing_tree).await
             }
-            IndexSource::Bytes { data, format } => self.index_from_bytes(data, *format, &ctx).await,
         }
     }

     /// Index from a file path.
-    async fn index_from_path(&self, path: &Path, ctx: &IndexContext) -> Result<IndexedDocument> {
+    async fn index_from_path(&self, path: &Path, name: Option<&str>, options: &IndexOptions, existing_tree: Option<&crate::DocumentTree>) -> Result<IndexedDocument> {
         let path = path.canonicalize().unwrap_or_else(|_| path.to_path_buf());

         if !path.exists() {
@@ -169,19 +168,18 @@ impl IndexerClient {
         info!("Indexing {:?} document: {}", format, path.display());

         // Build pipeline options
-        let pipeline_options = self.build_pipeline_options(&ctx.options, format);
+        let pipeline_options = self.build_pipeline_options_with_existing(
+            options,
+            format,
+            existing_tree.cloned(),
+        );

         // Create pipeline input and execute
         let input = IndexInput::file(&path);
-        let result = {
-            let mut executor = self
-                .executor
-                .lock()
-                .map_err(|_| Error::Other("Pipeline executor lock poisoned".to_string()))?;
-            executor.execute(input, pipeline_options).await?
-        };
-
-        self.build_indexed_document(doc_id, result, format, ctx.name.as_deref(), Some(&path))
+        let mut executor = (self.executor_factory)();
+        let result = executor.execute(input, pipeline_options).await?;
+
+        self.build_indexed_document(doc_id, result, format, name, Some(&path))
     }

     /// Index from content string.
@@ -189,11 +187,12 @@ impl IndexerClient {
         &self,
         content: &str,
         format: DocumentFormat,
-        ctx: &IndexContext,
+        name: Option<&str>,
+        options: &IndexOptions,
+        existing_tree: Option<&crate::DocumentTree>,
     ) -> Result<IndexedDocument> {
-        // Emit start event
         self.events.emit_index(IndexEvent::Started {
-            path: ctx.name.clone().unwrap_or_else(|| "content".to_string()),
+            path: name.unwrap_or("content").to_string(),
         });

         let doc_id = Uuid::new_v4().to_string();

         info!("Indexing {:?} document from content", format);

-        let pipeline_options = self.build_pipeline_options(&ctx.options, format);
+        let pipeline_options = self.build_pipeline_options_with_existing(
+            options,
+            format,
+            existing_tree.cloned(),
+        );

         let input = IndexInput::content(content);
-        let result = {
-            let mut executor = self
-                .executor
-                .lock()
-                .map_err(|_| Error::Other("Pipeline executor lock poisoned".to_string()))?;
-            executor.execute(input, pipeline_options).await?
-        };
-
-        self.build_indexed_document(doc_id, result, format, ctx.name.as_deref(), None)
+        let mut executor = (self.executor_factory)();
+        let result = executor.execute(input, pipeline_options).await?;
+
+        self.build_indexed_document(doc_id, result, format, name, None)
     }

     /// Index from binary data.
@@ -221,11 +219,12 @@ impl IndexerClient {
         &self,
         bytes: &[u8],
         format: DocumentFormat,
-        ctx: &IndexContext,
+        name: Option<&str>,
+        options: &IndexOptions,
+        existing_tree: Option<&crate::DocumentTree>,
     ) -> Result<IndexedDocument> {
-        // Emit start event
         self.events.emit_index(IndexEvent::Started {
-            path: ctx.name.clone().unwrap_or_else(|| "bytes".to_string()),
+            path: name.unwrap_or("bytes").to_string(),
         });

         let doc_id = Uuid::new_v4().to_string();

         info!(
             "Indexing {:?} document from {} bytes",
             format,
             bytes.len()
         );

-        let pipeline_options = self.build_pipeline_options(&ctx.options, format);
+        let pipeline_options = self.build_pipeline_options_with_existing(
+            options,
+            format,
+            existing_tree.cloned(),
+        );

         let input = IndexInput::bytes(bytes);
-        let result = {
-            let mut executor = self
-                .executor
-                .lock()
-                .map_err(|_| Error::Other("Pipeline executor lock poisoned".to_string()))?;
-            executor.execute(input, pipeline_options).await?
-        };
-
-        self.build_indexed_document(doc_id, result, format, ctx.name.as_deref(), None)
+        let mut executor = (self.executor_factory)();
+        let result = executor.execute(input, pipeline_options).await?;
+
+        self.build_indexed_document(doc_id, result, format, name, None)
     }

     /// Build pipeline options from client options.
@@ -258,8 +256,16 @@ impl IndexerClient {
         options: &IndexOptions,
         format: DocumentFormat,
     ) -> PipelineOptions {
-        println!("[DEBUG] Building pipeline options for format: {:?} with options: {:?}", format, options);
+        self.build_pipeline_options_with_existing(options, format, None)
+    }

+    /// Build pipeline options with optional existing tree for incremental updates.
+    fn build_pipeline_options_with_existing(
+        &self,
+        options: &IndexOptions,
+        format: DocumentFormat,
+        existing_tree: Option<crate::DocumentTree>,
+    ) -> PipelineOptions {
         PipelineOptions {
             mode: match format {
                 DocumentFormat::Markdown => IndexMode::Markdown,
                 DocumentFormat::Pdf => IndexMode::Pdf,
                 DocumentFormat::Html => IndexMode::Html,
                 DocumentFormat::Docx => IndexMode::Docx,
             },
             generate_ids: options.generate_ids,
             summary_strategy: if options.generate_summaries {
-                // SummaryStrategy::selective(self.config.min_summary_tokens, false)
                 SummaryStrategy::full()
             } else {
                 SummaryStrategy::none()
             },
             generate_description: options.generate_description,
+            existing_tree,
             ..Default::default()
         }
     }

@@ -283,7 +289,7 @@ impl IndexerClient {
     fn build_indexed_document(
         &self,
         doc_id: String,
-        result: crate::index::IndexResult,
+        result: crate::index::PipelineResult,
         format: DocumentFormat,
         name: Option<&str>,
         path: Option<&Path>,
@@ -305,7 +311,8 @@ impl IndexerClient {
         let mut doc = IndexedDocument::new(&doc_id, format)
             .with_name(&doc_name)
-            .with_tree(tree);
+            .with_tree(tree)
+            .with_metrics(result.metrics);

         if let Some(p) = path {
             doc = doc.with_source_path(p);
@@ -386,7 +393,12 @@ impl IndexerClient {
     /// Convert IndexedDocument to PersistedDocument for storage.
     pub fn to_persisted(&self, doc: IndexedDocument) -> PersistedDocument {
-        let meta = DocumentMeta::new(&doc.id, &doc.name, doc.format.extension())
+        self.to_persisted_with_options(doc, &PipelineOptions::default())
+    }
+
+    /// Convert IndexedDocument to PersistedDocument, storing fingerprints from pipeline options.
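+    ///
+    /// The content fingerprint detects changed file bytes; the logic
+    /// fingerprint detects changed pipeline options. Either mismatch makes a
+    /// later `Incremental` run re-index instead of skipping.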
+    pub fn to_persisted_with_options(&self, doc: IndexedDocument, pipeline_options: &PipelineOptions) -> PersistedDocument {
+        let mut meta = DocumentMeta::new(&doc.id, &doc.name, doc.format.extension())
             .with_source_path(
                 doc.source_path
                     .as_ref()
                     .map(|p| p.display().to_string())
                     .unwrap_or_default(),
             )
             .with_description(doc.description.clone().unwrap_or_default());

+        // Compute content fingerprint for incremental indexing
+        if let Some(ref path) = doc.source_path {
+            if let Ok(bytes) = std::fs::read(path) {
+                let fp = crate::utils::fingerprint::Fingerprint::from_bytes(&bytes);
+                meta = meta.with_fingerprint(fp);
+            }
+        }
+
+        // Store logic fingerprint (pipeline configuration hash)
+        let logic_fp = pipeline_options.logic_fingerprint();
+        meta = meta.with_logic_fingerprint(logic_fp);
+
         let mut persisted =
             PersistedDocument::new(meta, doc.tree.expect("IndexedDocument must have a tree"));

@@ -404,17 +428,12 @@ impl IndexerClient {
         persisted
     }
-
-    /// Get the underlying executor Arc (for advanced use).
-    pub(crate) fn inner(&self) -> Arc<Mutex<PipelineExecutor>> {
-        Arc::clone(&self.executor)
-    }
 }

 impl Clone for IndexerClient {
     fn clone(&self) -> Self {
         Self {
-            executor: Arc::clone(&self.executor),
+            executor_factory: Arc::clone(&self.executor_factory),
             events: self.events.clone(),
             config: self.config.clone(),
         }
diff --git a/rust/src/client/mod.rs b/rust/src/client/mod.rs
index f3ba5a49..d2852efe 100644
--- a/rust/src/client/mod.rs
+++ b/rust/src/client/mod.rs
@@ -30,7 +30,9 @@
 //! let result = client.query(
 //!     QueryContext::new("What is this?").with_doc_id(doc_id)
 //! ).await?;
-//! println!("{}", result.content);
+//! if let Some(item) = result.single() {
+//!     println!("{}", item.content);
+//! }
 //!
 //! // List all documents
 //! for doc in client.list().await? {
@@ -99,11 +101,13 @@ pub use events::EventEmitter;
 pub use types::{
     ClientError,
     DocumentInfo,
+    FailedItem,
     IndexItem,
     IndexMode,
     IndexOptions,
     IndexResult,
     QueryResult,
+    QueryResultItem,
 };

 // ============================================================
diff --git a/rust/src/client/query_context.rs b/rust/src/client/query_context.rs
index 32b15378..991acf4d 100644
--- a/rust/src/client/query_context.rs
+++ b/rust/src/client/query_context.rs
@@ -4,33 +4,45 @@
 //! Query context for the Engine API.
 //!
 //! [`QueryContext`] encapsulates all parameters for a query operation,
-//! providing a builder pattern for configuration.
+//! supporting single document, multiple documents, or entire workspace queries.
 //!
 //! # Example
 //!
 //! ```rust
 //! use vectorless::client::QueryContext;
 //!
-//! // Simple query
-//! let ctx = QueryContext::new("What is the total revenue?");
+//! // Query a single document
+//! let ctx = QueryContext::new("What is the total revenue?")
+//!     .with_doc_id("doc-abc123");
 //!
-//! // With document scope
+//! // Query multiple documents
 //! let ctx = QueryContext::new("What is the architecture?")
-//!     .with_doc_id("doc-abc123");
+//!     .with_doc_ids(vec!["doc-1", "doc-2"]);
 //!
-//! // With options
-//! let ctx = QueryContext::new("Explain the algorithm")
-//!     .with_doc_id("doc-abc123")
-//!     .with_max_tokens(4000);
+//! // Query entire workspace
+//! let ctx = QueryContext::new("Explain the algorithm");
 //! ```

 use crate::config::Config;
 use crate::retrieval::{RetrieveOptions, StrategyPreference};

+/// Query scope — determines which documents to search.
+#[derive(Debug, Clone)]
+pub(crate) enum QueryScope {
+    /// Query a single document.
+    Single(String),
+    /// Query multiple specific documents.
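+    ///
+    /// Results preserve the order of the given IDs; documents that fail
+    /// are reported separately rather than aborting the query.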
+
+/// Query scope — determines which documents to search.
+#[derive(Debug, Clone)]
+pub(crate) enum QueryScope {
+    /// Query a single document.
+    Single(String),
+    /// Query multiple specific documents.
+    Multiple(Vec<String>),
+    /// Query all documents in the workspace.
+    Workspace,
+}
+
 /// Context for a query operation.
 ///
-/// Encapsulates the query text, target document, and retrieval options.
-/// Use builder methods to configure.
+/// Supports three scopes:
+/// - **Single document** — via `with_doc_id()`
+/// - **Multiple documents** — via `with_doc_ids()`
+/// - **Entire workspace** — default when no scope is set
 ///
 /// # Convenience
 ///
@@ -45,8 +57,8 @@ use crate::retrieval::{RetrieveOptions, StrategyPreference};
 pub struct QueryContext {
     /// The query text.
     pub(crate) query: String,
-    /// Target document ID. None means query all (not yet supported).
-    pub(crate) doc_id: Option<String>,
+    /// Target scope.
+    pub(crate) scope: QueryScope,
     /// Maximum tokens for the result content.
     pub(crate) max_tokens: Option<usize>,
     /// Retrieval strategy override.
@@ -58,11 +70,11 @@ pub struct QueryContext {
 }
 
 impl QueryContext {
-    /// Create a new query context with the given query text.
+    /// Create a new query context (defaults to workspace scope).
     pub fn new(query: impl Into<String>) -> Self {
         Self {
             query: query.into(),
-            doc_id: None,
+            scope: QueryScope::Workspace,
             max_tokens: None,
             strategy: None,
             include_reasoning: true,
@@ -70,9 +82,21 @@ impl QueryContext {
         }
     }
 
-    /// Set the target document ID.
+    /// Set scope to a single document.
     pub fn with_doc_id(mut self, doc_id: impl Into<String>) -> Self {
-        self.doc_id = Some(doc_id.into());
+        self.scope = QueryScope::Single(doc_id.into());
+        self
+    }
+
+    /// Set scope to multiple documents.
+    pub fn with_doc_ids(mut self, doc_ids: Vec<String>) -> Self {
+        self.scope = QueryScope::Multiple(doc_ids);
+        self
+    }
+
+    /// Set scope to entire workspace.
+    pub fn with_workspace(mut self) -> Self {
+        self.scope = QueryScope::Workspace;
         self
     }
 
@@ -139,7 +163,6 @@ mod tests {
     fn test_query_context_new() {
         let ctx = QueryContext::new("What is this?");
         assert_eq!(ctx.query, "What is this?");
-        assert!(ctx.doc_id.is_none());
         assert!(ctx.include_reasoning);
     }
 
@@ -156,14 +179,31 @@ mod tests {
     }
 
     #[test]
-    fn test_query_context_builder() {
+    fn test_single_doc_scope() {
+        let ctx = QueryContext::new("test").with_doc_id("doc-1");
+        assert!(matches!(ctx.scope, QueryScope::Single(ref id) if id == "doc-1"));
+    }
+
+    #[test]
+    fn test_multi_doc_scope() {
+        let ctx = QueryContext::new("test").with_doc_ids(vec!["a".into(), "b".into()]);
+        assert!(matches!(ctx.scope, QueryScope::Multiple(ref ids) if ids.len() == 2));
+    }
+
+    #[test]
+    fn test_workspace_scope() {
+        let ctx = QueryContext::new("test");
+        assert!(matches!(ctx.scope, QueryScope::Workspace));
+    }
+
+    #[test]
+    fn test_builder_options() {
         let ctx = QueryContext::new("test")
             .with_doc_id("doc-1")
             .with_max_tokens(4000)
             .with_include_reasoning(false)
             .with_depth_limit(5);
 
-        assert_eq!(ctx.doc_id, Some("doc-1".to_string()));
         assert_eq!(ctx.max_tokens, Some(4000));
         assert!(!ctx.include_reasoning);
         assert_eq!(ctx.depth_limit, Some(5));
diff --git a/rust/src/client/retriever.rs b/rust/src/client/retriever.rs
index c5bd1aa1..23e9a051 100644
--- a/rust/src/client/retriever.rs
+++ b/rust/src/client/retriever.rs
@@ -31,7 +31,7 @@ use crate::retrieval::{
     SufficiencyLevel,
 };
 use super::events::{EventEmitter, QueryEvent};
-use super::types::QueryResult;
+use super::types::QueryResultItem;
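A sketch of how a query entry point can fan out over the three scopes introduced above. The engine's actual resolution logic is not part of this diff, so `resolve_doc_ids` is a hypothetical helper:

```rust
enum QueryScope {
    Single(String),
    Multiple(Vec<String>),
    Workspace,
}

// Hypothetical helper: expand a scope into the concrete doc IDs to search.
fn resolve_doc_ids(scope: &QueryScope, all_docs: &[String]) -> Vec<String> {
    match scope {
        QueryScope::Single(id) => vec![id.clone()],
        QueryScope::Multiple(ids) => ids.clone(),
        // Workspace scope fans out to every indexed document.
        QueryScope::Workspace => all_docs.to_vec(),
    }
}

fn main() {
    let all = vec!["doc-1".to_string(), "doc-2".to_string()];
    assert_eq!(resolve_doc_ids(&QueryScope::Single("doc-1".into()), &all), vec!["doc-1"]);
    assert_eq!(resolve_doc_ids(&QueryScope::Workspace, &all).len(), 2);
}
```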
 
 /// Document retrieval client.
 ///
@@ -127,7 +127,7 @@ impl RetrieverClient {
         tree: &DocumentTree,
         question: &str,
         options: &RetrieveOptions,
-    ) -> Result<QueryResult> {
+    ) -> Result<QueryResultItem> {
         self.events.emit_query(QueryEvent::Started {
             query: question.to_string(),
         });
@@ -221,8 +221,8 @@ impl RetrieverClient {
         Ok(rx)
     }
 
-    /// Build QueryResult from RetrieveResponse.
-    fn build_query_result(&self, response: &RetrieveResponse) -> QueryResult {
+    /// Build QueryResultItem from RetrieveResponse.
+    fn build_query_result(&self, response: &RetrieveResponse) -> QueryResultItem {
         // Extract node IDs
         let node_ids: Vec<String> = response
             .results
@@ -249,7 +249,7 @@ impl RetrieverClient {
             content_parts.join("\n\n---\n\n")
         };
 
-        QueryResult {
+        QueryResultItem {
             doc_id: String::new(), // Will be set by caller
             node_ids,
             content,
diff --git a/rust/src/client/types.rs b/rust/src/client/types.rs
index 06a8f287..099a9987 100644
--- a/rust/src/client/types.rs
+++ b/rust/src/client/types.rs
@@ -9,6 +9,7 @@ use serde::{Deserialize, Serialize};
 use std::path::PathBuf;
 
 use crate::document::DocumentTree;
+use crate::metrics::IndexMetrics;
 use crate::parser::DocumentFormat;
 
 // ============================================================
@@ -44,6 +45,9 @@ pub struct IndexedDocument {
 
     /// Per-page content (for PDFs).
     pub pages: Vec<PageContent>,
+
+    /// Indexing pipeline metrics.
+    pub metrics: Option<IndexMetrics>,
 }
 
 impl IndexedDocument {
@@ -59,6 +63,7 @@ impl IndexedDocument {
             line_count: None,
             tree: None,
             pages: Vec::new(),
+            metrics: None,
         }
     }
 
@@ -98,6 +103,12 @@ impl IndexedDocument {
         self
     }
 
+    /// Set the indexing metrics.
+    pub fn with_metrics(mut self, metrics: IndexMetrics) -> Self {
+        self.metrics = Some(metrics);
+        self
+    }
+
     /// Add a page content.
     pub fn add_page(&mut self, page: usize, content: impl Into<String>) {
         self.pages.push(PageContent {
@@ -122,6 +133,29 @@ pub struct PageContent {
     pub content: String,
 }
 
+// ============================================================
+// Partial Success
+// ============================================================
+
+/// A failed item in a batch operation.
+#[derive(Debug, Clone)]
+pub struct FailedItem {
+    /// Source description (file path, content name, or doc ID).
+    pub source: String,
+    /// Error message.
+    pub error: String,
+}
+
+impl FailedItem {
+    /// Create a new failed item.
+    pub fn new(source: impl Into<String>, error: impl Into<String>) -> Self {
+        Self {
+            source: source.into(),
+            error: error.into(),
+        }
+    }
+}
+
 // ============================================================
 // Index Types
 // ============================================================
@@ -220,14 +254,25 @@ impl IndexOptions {
 /// Result of a document indexing operation.
 #[derive(Debug, Clone)]
 pub struct IndexResult {
-    /// Indexed items.
+    /// Successfully indexed items.
     pub items: Vec<IndexItem>,
+
+    /// Items that failed to index (partial success).
+    pub failed: Vec<FailedItem>,
 }
 
 impl IndexResult {
     /// Create a new index result.
     pub fn new(items: Vec<IndexItem>) -> Self {
-        Self { items }
+        Self {
+            items,
+            failed: Vec::new(),
+        }
+    }
+
+    /// Create with both successes and failures.
+    pub fn with_partial(items: Vec<IndexItem>, failed: Vec<FailedItem>) -> Self {
+        Self { items, failed }
     }
 
     /// Get the single document ID (convenience for single-document indexing).
@@ -248,6 +293,16 @@ impl IndexResult {
     pub fn len(&self) -> usize {
         self.items.len()
     }
+
+    /// Whether any items failed.
+    pub fn has_failures(&self) -> bool {
+        !self.failed.is_empty()
+    }
+
+    /// Total number of sources (success + failed).
+    pub fn total(&self) -> usize {
+        self.items.len() + self.failed.len()
+    }
 }
 
 /// A single indexed document item.
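Note that `build_query_result` above leaves `doc_id` empty: the caller that knows which document it queried is expected to stamp it in. A sketch of that aggregation step for multi-document queries, using local mirrors of the diff's types (the engine-side loop itself is an assumption):

```rust
struct QueryResultItem { doc_id: String, content: String, score: f32 }
struct FailedItem { source: String, error: String }
struct QueryResult { items: Vec<QueryResultItem>, failed: Vec<FailedItem> }

fn aggregate(per_doc: Vec<(String, Result<QueryResultItem, String>)>) -> QueryResult {
    let mut items = Vec::new();
    let mut failed = Vec::new();
    for (doc_id, outcome) in per_doc {
        match outcome {
            Ok(mut item) => {
                item.doc_id = doc_id; // the retriever leaves doc_id empty
                items.push(item);
            }
            // A failing document becomes a FailedItem instead of aborting the query.
            Err(error) => failed.push(FailedItem { source: doc_id, error }),
        }
    }
    QueryResult { items, failed }
}

fn main() {
    let out = aggregate(vec![
        ("doc-1".into(), Ok(QueryResultItem { doc_id: String::new(), content: "...".into(), score: 0.8 })),
        ("doc-2".into(), Err("tree not found".into())),
    ]);
    assert_eq!(out.items.len(), 1);
    assert_eq!(out.failed.len(), 1);
    assert_eq!(out.items[0].doc_id, "doc-1");
}
```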
@@ -259,6 +314,12 @@ pub struct IndexItem {
     pub name: String,
     /// The document format.
     pub format: DocumentFormat,
+    /// Document description (from root summary).
+    pub description: Option<String>,
+    /// Page count (for PDFs).
+    pub page_count: Option<usize>,
+    /// Indexing pipeline metrics (timing, LLM usage, node stats).
+    pub metrics: Option<IndexMetrics>,
 }
 
 impl IndexItem {
@@ -267,22 +328,39 @@ impl IndexItem {
         doc_id: impl Into<String>,
         name: impl Into<String>,
         format: DocumentFormat,
+        description: Option<String>,
+        page_count: Option<usize>,
     ) -> Self {
         Self {
             doc_id: doc_id.into(),
             name: name.into(),
             format,
+            description,
+            page_count,
+            metrics: None,
         }
     }
+
+    /// Set the indexing metrics.
+    pub fn with_metrics(mut self, metrics: IndexMetrics) -> Self {
+        self.metrics = Some(metrics);
+        self
+    }
+
+    /// Set the indexing metrics (optional).
+    pub fn with_metrics_opt(mut self, metrics: Option<IndexMetrics>) -> Self {
+        self.metrics = metrics;
+        self
+    }
 }
 
 // ============================================================
 // Query Types
 // ============================================================
 
-/// Result of a document query.
+/// A single document's query result.
 #[derive(Debug, Clone)]
-pub struct QueryResult {
+pub struct QueryResultItem {
     /// The document ID.
     pub doc_id: String,
 
@@ -296,25 +374,66 @@ pub struct QueryResultItem {
     pub score: f32,
 }
 
+/// Result of a document query.
+///
+/// Contains results from one or more documents. For single-document queries,
+/// `items` has one entry. For multi-document or workspace queries, it has
+/// one entry per document that matched.
+#[derive(Debug, Clone)]
+pub struct QueryResult {
+    /// Query results per document.
+    pub items: Vec<QueryResultItem>,
+
+    /// Documents that failed during multi-doc query.
+    pub failed: Vec<FailedItem>,
+}
+
 impl QueryResult {
-    /// Create a new query result.
-    pub fn new(doc_id: impl Into<String>) -> Self {
+    /// Create a new query result (empty).
+    pub fn new() -> Self {
         Self {
-            doc_id: doc_id.into(),
-            node_ids: Vec::new(),
-            content: String::new(),
-            score: 0.0,
+            items: Vec::new(),
+            failed: Vec::new(),
         }
     }
 
+    /// Create a query result with a single item.
+    pub fn from_single(item: QueryResultItem) -> Self {
+        Self {
+            items: vec![item],
+            failed: Vec::new(),
+        }
+    }
+
+    /// Create with both successes and failures.
+    pub fn with_partial(items: Vec<QueryResultItem>, failed: Vec<FailedItem>) -> Self {
+        Self { items, failed }
+    }
+
     /// Check if the result is empty.
     pub fn is_empty(&self) -> bool {
-        self.node_ids.is_empty()
+        self.items.is_empty()
     }
 
-    /// Get the number of results.
+    /// Get the number of result items.
     pub fn len(&self) -> usize {
-        self.node_ids.len()
+        self.items.len()
     }
+
+    /// Get the first (single-doc) result item, if any.
+    pub fn single(&self) -> Option<&QueryResultItem> {
+        self.items.first()
+    }
+
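Downstream code consumes the reworked `QueryResult` exactly as the updated examples do: `single()` for the one-document path, `items` for fan-out queries, `failed` for partial failures. A usage sketch against the types in this diff, imported via the `client` re-exports shown earlier:

```rust
use vectorless::client::QueryResult;

fn report(result: &QueryResult) {
    // Single-document convenience, as the updated examples use:
    if let Some(item) = result.single() {
        println!("best: {} (score {:.2})", item.doc_id, item.score);
    }
    // Multi-document / workspace queries yield one item per matching document:
    for item in &result.items {
        println!("{}: {:.2}", item.doc_id, item.score);
    }
    // Partial failures no longer abort the whole query:
    if result.has_failures() {
        for f in &result.failed {
            eprintln!("failed: {} ({})", f.source, f.error);
        }
    }
}
```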
+    /// Whether any documents failed.
+    pub fn has_failures(&self) -> bool {
+        !self.failed.is_empty()
+    }
+}
+
+impl Default for QueryResult {
+    fn default() -> Self {
+        Self::new()
+    }
+}
 
@@ -415,11 +534,26 @@ mod tests {
 
     #[test]
     fn test_query_result() {
-        let result = QueryResult::new("doc-1");
+        let result = QueryResult::new();
         assert!(result.is_empty());
         assert_eq!(result.len(), 0);
     }
 
+    #[test]
+    fn test_query_result_single() {
+        let item = QueryResultItem {
+            doc_id: "doc-1".into(),
+            node_ids: vec!["n1".into()],
+            content: "content".into(),
+            score: 0.9,
+        };
+        let result = QueryResult::from_single(item);
+        assert!(!result.is_empty());
+        assert_eq!(result.len(), 1);
+        assert!(result.single().is_some());
+        assert_eq!(result.single().unwrap().doc_id, "doc-1");
+    }
+
     #[test]
     fn test_document_info() {
         let info = DocumentInfo::new("doc-1", "Test").with_format("markdown");
@@ -430,7 +564,7 @@ mod tests {
 
     #[test]
     fn test_index_result() {
-        let item = IndexItem::new("doc-1", "Test", DocumentFormat::Markdown);
+        let item = IndexItem::new("doc-1", "Test", DocumentFormat::Markdown, None, None);
         let result = IndexResult::new(vec![item]);
 
         assert_eq!(result.doc_id(), Some("doc-1"));
@@ -448,11 +582,23 @@ mod tests {
     #[test]
     fn test_index_result_multiple() {
         let items = vec![
-            IndexItem::new("doc-1", "A", DocumentFormat::Markdown),
-            IndexItem::new("doc-2", "B", DocumentFormat::Pdf),
+            IndexItem::new("doc-1", "A", DocumentFormat::Markdown, None, None),
+            IndexItem::new("doc-2", "B", DocumentFormat::Pdf, None, None),
        ];
         let result = IndexResult::new(items);
 
         assert_eq!(result.len(), 2);
         assert_eq!(result.doc_id(), None);
     }
+
+    #[test]
+    fn test_partial_success() {
+        let items = vec![IndexItem::new("doc-1", "A", DocumentFormat::Markdown, None, None)];
+        let failed = vec![FailedItem::new("missing.pdf", "File not found")];
+        let result = IndexResult::with_partial(items, failed);
+
+        assert_eq!(result.len(), 1);
+        assert!(result.has_failures());
+        assert_eq!(result.total(), 2);
+        assert_eq!(result.failed[0].source, "missing.pdf");
+    }
 }
diff --git a/rust/src/client/workspace.rs b/rust/src/client/workspace.rs
index 7e9cc6a4..a4f7b59e 100644
--- a/rust/src/client/workspace.rs
+++ b/rust/src/client/workspace.rs
@@ -292,6 +292,23 @@ impl WorkspaceClient {
     pub(crate) fn inner(&self) -> Arc<Workspace> {
         Arc::clone(&self.workspace)
     }
+
+    /// Find a document ID by its source file path.
+    ///
+    /// Used for incremental indexing to check if a file has already been indexed.
+    pub async fn find_by_source_path(&self, path: &std::path::Path) -> Option<String> {
+        self.workspace.find_by_source_path(path).await
+    }
+
+    /// Get the document graph, loading from backend if not cached.
+    pub async fn get_graph(&self) -> Result<Option<crate::graph::DocumentGraph>> {
+        self.workspace.get_graph().await
+    }
+
+    /// Persist the document graph to the backend.
+    pub async fn set_graph(&self, graph: &crate::graph::DocumentGraph) -> Result<()> {
+        self.workspace.set_graph(graph).await
+    }
 }
 
 /// Workspace statistics.
diff --git a/rust/src/config/docs.rs b/rust/src/config/docs.rs
deleted file mode 100644
index 0a1447df..00000000
--- a/rust/src/config/docs.rs
+++ /dev/null
@@ -1,1139 +0,0 @@
-// Copyright (c) 2026 vectorless developers
-// SPDX-License-Identifier: Apache-2.0
-
-//! Configuration documentation generation.
-//!
-//! This module provides utilities for generating documentation
-//! from configuration types, including markdown reference and
-//! example TOML files.
-
-use super::types::Config;
-
-/// Configuration documentation generator.
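The `get_graph` hook above promises "load from backend if not cached". A sketch of one way that contract could be met with a tokio `RwLock` cache; this is an assumption about the implementation, only the observable behavior comes from the doc comment:

```rust
use std::sync::Arc;
use tokio::sync::RwLock;

#[derive(Debug)]
struct DocumentGraph; // stand-in for crate::graph::DocumentGraph

struct GraphCache {
    cached: RwLock<Option<Arc<DocumentGraph>>>,
}

impl GraphCache {
    async fn get_or_load(&self, load_from_backend: impl FnOnce() -> DocumentGraph) -> Arc<DocumentGraph> {
        if let Some(g) = self.cached.read().await.as_ref() {
            return Arc::clone(g); // cache hit: no backend round-trip
        }
        let g = Arc::new(load_from_backend()); // miss: one backend read
        *self.cached.write().await = Some(Arc::clone(&g));
        g
    }
}

#[tokio::main]
async fn main() {
    let cache = GraphCache { cached: RwLock::new(None) };
    let g1 = cache.get_or_load(|| DocumentGraph).await;
    let g2 = cache.get_or_load(|| unreachable!("second call must hit the cache")).await;
    assert!(Arc::ptr_eq(&g1, &g2));
}
```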
-#[derive(Debug, Clone)] -pub struct ConfigDocs { - config: Config, -} - -impl ConfigDocs { - /// Create a new documentation generator. - pub fn new(config: Config) -> Self { - Self { config } - } - - /// Create with default configuration. - pub fn with_defaults() -> Self { - Self::new(Config::default()) - } - - /// Generate markdown documentation for the configuration. - pub fn to_markdown(&self) -> String { - let mut md = String::new(); - - md.push_str("# Configuration Reference\n\n"); - md.push_str("This document describes all configuration options for vectorless.\n\n"); - md.push_str("## Configuration File\n\n"); - md.push_str("Configuration is loaded from a TOML file. Default locations:\n"); - md.push_str("- `./vectorless.toml`\n"); - md.push_str("- `./config.toml`\n"); - md.push_str("- `./.vectorless.toml`\n\n"); - - // LLM section (unified) - md.push_str("## `[llm]`\n\n"); - md.push_str("Unified LLM configuration for all LLM operations.\n\n"); - md.push_str("| Option | Type | Default | Description |\n"); - md.push_str("|--------|------|---------|-------------|\n"); - self.add_row( - &mut md, - "api_key", - "string?", - "null", - "Default API key (used by all clients unless overridden)", - ); - md.push_str("\n"); - - // LLM.summary section - md.push_str("## `[llm.summary]`\n\n"); - md.push_str("Summary client - generates document summaries during indexing.\n\n"); - md.push_str("| Option | Type | Default | Description |\n"); - md.push_str("|--------|------|---------|-------------|\n"); - self.add_row( - &mut md, - "model", - "string", - "gpt-4o-mini", - "Model for summarization (fast, cheap model recommended)", - ); - self.add_row( - &mut md, - "endpoint", - "string", - "https://api.openai.com/v1", - "API endpoint", - ); - self.add_row( - &mut md, - "api_key", - "string?", - "null", - "API key (optional, uses default if not set)", - ); - self.add_row(&mut md, "max_tokens", "usize", "200", "Maximum tokens for summary"); - self.add_row(&mut md, "temperature", "f32", "0.0", "Temperature for generation"); - md.push_str("\n"); - - // LLM.retrieval section - md.push_str("## `[llm.retrieval]`\n\n"); - md.push_str("Retrieval client - used for retrieval decisions and content evaluation.\n\n"); - md.push_str("| Option | Type | Default | Description |\n"); - md.push_str("|--------|------|---------|-------------|\n"); - self.add_row( - &mut md, - "model", - "string", - "gpt-4o", - "Model for retrieval (more capable model recommended)", - ); - self.add_row( - &mut md, - "endpoint", - "string", - "https://api.openai.com/v1", - "API endpoint", - ); - self.add_row( - &mut md, - "api_key", - "string?", - "null", - "API key (optional, uses default if not set)", - ); - self.add_row(&mut md, "max_tokens", "usize", "100", "Maximum tokens for response"); - self.add_row(&mut md, "temperature", "f32", "0.0", "Temperature for generation"); - md.push_str("\n"); - - // LLM.pilot section - md.push_str("## `[llm.pilot]`\n\n"); - md.push_str("Pilot client - used for intelligent navigation guidance.\n\n"); - md.push_str("| Option | Type | Default | Description |\n"); - md.push_str("|--------|------|---------|-------------|\n"); - self.add_row( - &mut md, - "model", - "string", - "gpt-4o-mini", - "Model for pilot navigation (fast model recommended)", - ); - self.add_row( - &mut md, - "endpoint", - "string", - "https://api.openai.com/v1", - "API endpoint", - ); - self.add_row( - &mut md, - "api_key", - "string?", - "null", - "API key (optional, uses default if not set)", - ); - self.add_row(&mut md, "max_tokens", 
"usize", "300", "Maximum tokens for response"); - self.add_row(&mut md, "temperature", "f32", "0.0", "Temperature for generation"); - md.push_str("\n"); - - // LLM.retry section - md.push_str("## `[llm.retry]`\n\n"); - md.push_str("Retry configuration for all LLM calls.\n\n"); - md.push_str("| Option | Type | Default | Description |\n"); - md.push_str("|--------|------|---------|-------------|\n"); - self.add_row(&mut md, "max_attempts", "usize", "3", "Maximum retry attempts"); - self.add_row( - &mut md, - "initial_delay_ms", - "u64", - "500", - "Initial delay before first retry (ms)", - ); - self.add_row( - &mut md, - "max_delay_ms", - "u64", - "30000", - "Maximum delay between retries (ms)", - ); - self.add_row( - &mut md, - "multiplier", - "f64", - "2.0", - "Multiplier for exponential backoff", - ); - self.add_row( - &mut md, - "retry_on_rate_limit", - "bool", - "true", - "Whether to retry on rate limit errors", - ); - md.push_str("\n"); - - // LLM.throttle section - md.push_str("## `[llm.throttle]`\n\n"); - md.push_str("Throttle/rate limiting configuration for all LLM calls.\n\n"); - md.push_str("| Option | Type | Default | Description |\n"); - md.push_str("|--------|------|---------|-------------|\n"); - self.add_row( - &mut md, - "max_concurrent_requests", - "usize", - "10", - "Maximum concurrent LLM API calls", - ); - self.add_row( - &mut md, - "requests_per_minute", - "usize", - "500", - "Rate limit: requests per minute", - ); - self.add_row(&mut md, "enabled", "bool", "true", "Enable rate limiting"); - self.add_row( - &mut md, - "semaphore_enabled", - "bool", - "true", - "Enable semaphore-based concurrency", - ); - md.push_str("\n"); - - // LLM.fallback section - md.push_str("## `[llm.fallback]`\n\n"); - md.push_str("Fallback configuration for all LLM calls.\n\n"); - md.push_str("| Option | Type | Default | Description |\n"); - md.push_str("|--------|------|---------|-------------|\n"); - self.add_row( - &mut md, - "enabled", - "bool", - "true", - "Enable fallback mechanism", - ); - self.add_row( - &mut md, - "models", - "[string]", - "[\"gpt-4o-mini\", \"glm-4-flash\"]", - "Fallback models in priority order", - ); - self.add_row( - &mut md, - "endpoints", - "[string]", - "[]", - "Fallback endpoints in priority order", - ); - self.add_row( - &mut md, - "on_rate_limit", - "string", - "retry_then_fallback", - "Behavior on rate limit (retry, fallback, retry_then_fallback, fail)", - ); - self.add_row( - &mut md, - "on_timeout", - "string", - "retry_then_fallback", - "Behavior on timeout", - ); - self.add_row( - &mut md, - "on_all_failed", - "string", - "return_error", - "Behavior when all attempts fail (return_error, return_cache)", - ); - md.push_str("\n"); - - // Metrics section - md.push_str("## `[metrics]`\n\n"); - md.push_str("Unified metrics configuration for observability.\n\n"); - md.push_str("| Option | Type | Default | Description |\n"); - md.push_str("|--------|------|---------|-------------|\n"); - self.add_row(&mut md, "enabled", "bool", "true", "Enable metrics collection"); - self.add_row( - &mut md, - "storage_path", - "string", - "./workspace/metrics", - "Storage path for persisted metrics", - ); - self.add_row( - &mut md, - "retention_days", - "usize", - "30", - "Retention period in days", - ); - md.push_str("\n"); - - // Metrics.llm section - md.push_str("## `[metrics.llm]`\n\n"); - md.push_str("LLM-specific metrics configuration.\n\n"); - md.push_str("| Option | Type | Default | Description |\n"); - md.push_str("|--------|------|---------|-------------|\n"); - 
self.add_row(&mut md, "track_tokens", "bool", "true", "Track token usage"); - self.add_row(&mut md, "track_latency", "bool", "true", "Track latency"); - self.add_row(&mut md, "track_cost", "bool", "true", "Track estimated cost"); - self.add_row( - &mut md, - "cost_per_1k_input_tokens", - "f64", - "0.00015", - "Cost per 1K input tokens (gpt-4o-mini)", - ); - self.add_row( - &mut md, - "cost_per_1k_output_tokens", - "f64", - "0.0006", - "Cost per 1K output tokens (gpt-4o-mini)", - ); - md.push_str("\n"); - - // Metrics.pilot section - md.push_str("## `[metrics.pilot]`\n\n"); - md.push_str("Pilot-specific metrics configuration.\n\n"); - md.push_str("| Option | Type | Default | Description |\n"); - md.push_str("|--------|------|---------|-------------|\n"); - self.add_row(&mut md, "track_decisions", "bool", "true", "Track Pilot decisions"); - self.add_row( - &mut md, - "track_accuracy", - "bool", - "true", - "Track decision accuracy (requires feedback)", - ); - self.add_row(&mut md, "track_feedback", "bool", "true", "Track user feedback"); - md.push_str("\n"); - - // Metrics.retrieval section - md.push_str("## `[metrics.retrieval]`\n\n"); - md.push_str("Retrieval-specific metrics configuration.\n\n"); - md.push_str("| Option | Type | Default | Description |\n"); - md.push_str("|--------|------|---------|-------------|\n"); - self.add_row(&mut md, "track_paths", "bool", "true", "Track search paths"); - self.add_row(&mut md, "track_scores", "bool", "true", "Track relevance scores"); - self.add_row(&mut md, "track_iterations", "bool", "true", "Track iterations"); - self.add_row(&mut md, "track_cache", "bool", "true", "Track cache hits/misses"); - md.push_str("\n"); - - // Pilot section - md.push_str("## `[pilot]`\n\n"); - md.push_str("Pilot navigation configuration.\n\n"); - md.push_str("| Option | Type | Default | Description |\n"); - md.push_str("|--------|------|---------|-------------|\n"); - self.add_row( - &mut md, - "mode", - "string", - "Balanced", - "Operation mode (Aggressive, Balanced, Conservative, AlgorithmOnly)", - ); - self.add_row( - &mut md, - "guide_at_start", - "bool", - "true", - "Whether to provide guidance at search start", - ); - self.add_row( - &mut md, - "guide_at_backtrack", - "bool", - "true", - "Whether to provide guidance during backtracking", - ); - md.push_str("\n"); - - // Pilot.budget section - md.push_str("## `[pilot.budget]`\n\n"); - md.push_str("Token and call budget constraints.\n\n"); - md.push_str("| Option | Type | Default | Description |\n"); - md.push_str("|--------|------|---------|-------------|\n"); - self.add_row( - &mut md, - "max_tokens_per_query", - "usize", - "2000", - "Maximum total tokens per query", - ); - self.add_row( - &mut md, - "max_tokens_per_call", - "usize", - "500", - "Maximum tokens per single LLM call", - ); - self.add_row( - &mut md, - "max_calls_per_query", - "usize", - "5", - "Maximum number of LLM calls per query", - ); - self.add_row( - &mut md, - "max_calls_per_level", - "usize", - "2", - "Maximum number of LLM calls per tree level", - ); - self.add_row( - &mut md, - "hard_limit", - "bool", - "true", - "Whether to enforce hard limits (true) or soft limits (false)", - ); - md.push_str("\n"); - - // Pilot.intervention section - md.push_str("## `[pilot.intervention]`\n\n"); - md.push_str("Intervention threshold settings.\n\n"); - md.push_str("| Option | Type | Default | Description |\n"); - md.push_str("|--------|------|---------|-------------|\n"); - self.add_row( - &mut md, - "fork_threshold", - "usize", - "3", - "Minimum 
candidates to trigger fork intervention", - ); - self.add_row( - &mut md, - "score_gap_threshold", - "f32", - "0.15", - "Score gap threshold (intervene when scores are close)", - ); - self.add_row( - &mut md, - "low_score_threshold", - "f32", - "0.3", - "Low score threshold (intervene when best score is below this)", - ); - self.add_row( - &mut md, - "max_interventions_per_level", - "usize", - "2", - "Maximum interventions allowed per tree level", - ); - md.push_str("\n"); - - // Pilot.feedback section - md.push_str("## `[pilot.feedback]`\n\n"); - md.push_str("Feedback and learning configuration.\n\n"); - md.push_str("| Option | Type | Default | Description |\n"); - md.push_str("|--------|------|---------|-------------|\n"); - self.add_row(&mut md, "enabled", "bool", "true", "Enable feedback collection"); - self.add_row( - &mut md, - "storage_path", - "string", - "./workspace/feedback", - "Storage path for feedback data", - ); - self.add_row( - &mut md, - "learning_rate", - "f32", - "0.1", - "Learning rate for feedback-based improvements", - ); - self.add_row( - &mut md, - "min_samples_for_learning", - "usize", - "10", - "Minimum samples before applying learning", - ); - md.push_str("\n"); - - // Retrieval section - md.push_str("## `[retrieval]`\n\n"); - md.push_str("Retrieval model and behavior configuration.\n\n"); - md.push_str("| Option | Type | Default | Description |\n"); - md.push_str("|--------|------|---------|-------------|\n"); - self.add_row( - &mut md, - "model", - "string", - "gpt-4o", - "Model for retrieval navigation", - ); - self.add_row( - &mut md, - "endpoint", - "string", - "https://api.openai.com/v1", - "API endpoint", - ); - self.add_row(&mut md, "top_k", "usize", "3", "Number of top results to return"); - self.add_row( - &mut md, - "max_tokens", - "usize", - "1000", - "Maximum tokens for retrieval context", - ); - self.add_row(&mut md, "temperature", "f32", "0.0", "Temperature for retrieval"); - md.push_str("\n"); - - // Retrieval.search section - md.push_str("## `[retrieval.search]`\n\n"); - md.push_str("Search algorithm configuration.\n\n"); - md.push_str("| Option | Type | Default | Description |\n"); - md.push_str("|--------|------|---------|-------------|\n"); - self.add_row(&mut md, "top_k", "usize", "5", "Number of top-k results to return"); - self.add_row( - &mut md, - "beam_width", - "usize", - "3", - "Beam width for multi-path search", - ); - self.add_row( - &mut md, - "max_iterations", - "usize", - "10", - "Maximum iterations for search algorithms", - ); - self.add_row( - &mut md, - "min_score", - "f32", - "0.1", - "Minimum score to include a path", - ); - md.push_str("\n"); - - // Retrieval.sufficiency section - md.push_str("## `[retrieval.sufficiency]`\n\n"); - md.push_str("Sufficiency checker configuration.\n\n"); - md.push_str("| Option | Type | Default | Description |\n"); - md.push_str("|--------|------|---------|-------------|\n"); - self.add_row( - &mut md, - "min_tokens", - "usize", - "500", - "Minimum tokens for sufficiency", - ); - self.add_row( - &mut md, - "target_tokens", - "usize", - "2000", - "Target tokens for full sufficiency", - ); - self.add_row( - &mut md, - "max_tokens", - "usize", - "4000", - "Maximum tokens before stopping", - ); - self.add_row( - &mut md, - "min_content_length", - "usize", - "200", - "Minimum content length (characters)", - ); - self.add_row( - &mut md, - "confidence_threshold", - "f32", - "0.7", - "Confidence threshold for LLM judge", - ); - md.push_str("\n"); - - // Retrieval.cache section - md.push_str("## 
`[retrieval.cache]`\n\n"); - md.push_str("Cache configuration.\n\n"); - md.push_str("| Option | Type | Default | Description |\n"); - md.push_str("|--------|------|---------|-------------|\n"); - self.add_row(&mut md, "max_entries", "usize", "1000", "Maximum cache entries"); - self.add_row(&mut md, "ttl_secs", "u64", "3600", "Time-to-live in seconds"); - md.push_str("\n"); - - // Retrieval.strategy section - md.push_str("## `[retrieval.strategy]`\n\n"); - md.push_str("Strategy-specific configuration.\n\n"); - md.push_str("| Option | Type | Default | Description |\n"); - md.push_str("|--------|------|---------|-------------|\n"); - self.add_row( - &mut md, - "exploration_weight", - "f32", - "1.414", - "MCTS exploration weight (√2)", - ); - self.add_row( - &mut md, - "similarity_threshold", - "f32", - "0.5", - "Semantic similarity threshold", - ); - self.add_row( - &mut md, - "high_similarity_threshold", - "f32", - "0.8", - "High similarity for 'answer' decision", - ); - self.add_row( - &mut md, - "low_similarity_threshold", - "f32", - "0.3", - "Low similarity for 'explore' decision", - ); - md.push_str("\n"); - - // Retrieval.content section - md.push_str("## `[retrieval.content]`\n\n"); - md.push_str("Content aggregator configuration.\n\n"); - md.push_str("| Option | Type | Default | Description |\n"); - md.push_str("|--------|------|---------|-------------|\n"); - self.add_row( - &mut md, - "enabled", - "bool", - "true", - "Enable content aggregator", - ); - self.add_row( - &mut md, - "token_budget", - "usize", - "4000", - "Maximum tokens for aggregated content", - ); - self.add_row( - &mut md, - "min_relevance_score", - "f32", - "0.2", - "Minimum relevance score threshold (0.0-1.0)", - ); - self.add_row( - &mut md, - "scoring_strategy", - "string", - "hybrid", - "Scoring strategy (keyword, bm25, hybrid)", - ); - self.add_row( - &mut md, - "output_format", - "string", - "markdown", - "Output format (markdown, json, tree, flat)", - ); - self.add_row( - &mut md, - "include_scores", - "bool", - "false", - "Include relevance scores in output", - ); - self.add_row( - &mut md, - "hierarchical_min_per_level", - "f32", - "0.1", - "Minimum budget allocation per depth level", - ); - self.add_row( - &mut md, - "deduplicate", - "bool", - "true", - "Enable content deduplication", - ); - self.add_row( - &mut md, - "dedup_threshold", - "f32", - "0.9", - "Similarity threshold for deduplication", - ); - md.push_str("\n"); - - // Retrieval.multiturn section - md.push_str("## `[retrieval.multiturn]`\n\n"); - md.push_str("Multi-turn retrieval configuration.\n\n"); - md.push_str("| Option | Type | Default | Description |\n"); - md.push_str("|--------|------|---------|-------------|\n"); - self.add_row( - &mut md, - "enabled", - "bool", - "true", - "Enable multi-turn retrieval", - ); - self.add_row( - &mut md, - "max_sub_queries", - "usize", - "3", - "Maximum sub-queries per query", - ); - self.add_row( - &mut md, - "decomposition_model", - "string", - "gpt-4o-mini", - "Model for query decomposition", - ); - self.add_row( - &mut md, - "aggregation_strategy", - "string", - "merge", - "Aggregation strategy (merge, rank, synthesize)", - ); - md.push_str("\n"); - - // Retrieval.reference section - md.push_str("## `[retrieval.reference]`\n\n"); - md.push_str("Reference following configuration.\n\n"); - md.push_str("| Option | Type | Default | Description |\n"); - md.push_str("|--------|------|---------|-------------|\n"); - self.add_row( - &mut md, - "enabled", - "bool", - "true", - "Enable reference following", - 
); - self.add_row(&mut md, "max_depth", "usize", "3", "Maximum reference depth"); - self.add_row( - &mut md, - "max_references", - "usize", - "10", - "Maximum references to follow", - ); - self.add_row( - &mut md, - "follow_pages", - "bool", - "true", - "Follow page references", - ); - self.add_row( - &mut md, - "follow_tables_figures", - "bool", - "true", - "Follow table/figure references", - ); - self.add_row( - &mut md, - "min_confidence", - "f32", - "0.5", - "Minimum confidence to follow reference", - ); - md.push_str("\n"); - - // Storage section - md.push_str("## `[storage]`\n\n"); - md.push_str("Storage configuration.\n\n"); - md.push_str("| Option | Type | Default | Description |\n"); - md.push_str("|--------|------|---------|-------------|\n"); - self.add_row( - &mut md, - "workspace_dir", - "string", - "./workspace", - "Workspace directory for persisted documents", - ); - self.add_row(&mut md, "cache_size", "usize", "100", "Cache size"); - self.add_row( - &mut md, - "atomic_writes", - "bool", - "true", - "Enable atomic file writes", - ); - self.add_row(&mut md, "file_lock", "bool", "true", "Enable file locking"); - self.add_row( - &mut md, - "checksum_enabled", - "bool", - "true", - "Enable checksum verification", - ); - md.push_str("\n"); - - // Storage.compression section - md.push_str("## `[storage.compression]`\n\n"); - md.push_str("Compression configuration.\n\n"); - md.push_str("| Option | Type | Default | Description |\n"); - md.push_str("|--------|------|---------|-------------|\n"); - self.add_row(&mut md, "enabled", "bool", "false", "Enable compression"); - self.add_row( - &mut md, - "algorithm", - "string", - "gzip", - "Compression algorithm (gzip, zstd, lz4)", - ); - self.add_row(&mut md, "level", "u32", "6", "Compression level"); - md.push_str("\n"); - - // Indexer section - md.push_str("## `[indexer]`\n\n"); - md.push_str("Controls document indexing behavior.\n\n"); - md.push_str("| Option | Type | Default | Description |\n"); - md.push_str("|--------|------|---------|-------------|\n"); - self.add_row( - &mut md, - "subsection_threshold", - "usize", - "300", - "Word count threshold for splitting sections into subsections", - ); - self.add_row( - &mut md, - "max_segment_tokens", - "usize", - "3000", - "Maximum tokens to send in a single segmentation request", - ); - self.add_row( - &mut md, - "max_summary_tokens", - "usize", - "200", - "Maximum tokens for each summary", - ); - self.add_row( - &mut md, - "min_summary_tokens", - "usize", - "20", - "Minimum content tokens required to generate a summary", - ); - md.push_str("\n"); - - md - } - - fn add_row(&self, md: &mut String, name: &str, ty: &str, default: &str, desc: &str) { - md.push_str(&format!( - "| `{}` | {} | {} | {} |\n", - name, ty, default, desc - )); - } - - /// Generate an example TOML file with all options. - pub fn to_example_toml(&self) -> String { - toml::to_string_pretty(&self.config).unwrap_or_else(|e| { - format!( - "# Error generating TOML: {}\n\n# Using default config\n{}", - e, - Self::fallback_toml() - ) - }) - } - - fn fallback_toml() -> String { - r#"# Vectorless Configuration Example -# Copy this file to vectorless.toml and fill in your API keys -# -# All configuration is loaded from this file only. -# No environment variables are used - this ensures explicit, traceable configuration. 
- -# ============================================================================ -# LLM Configuration (Unified) -# ============================================================================ -# -# The LLM pool allows configuring different models for different purposes: -# - summary: Used for generating document summaries during indexing -# - retrieval: Used for retrieval decisions and content evaluation -# - pilot: Used for intelligent navigation guidance -# -# Each client can have its own model, endpoint, and settings. - -[llm] -# Default API key (used by all clients unless overridden per-client) -api_key = "sk-your-api-key-here" - -# Summary client - generates document summaries during indexing -# Use a fast, cheap model for bulk processing -[llm.summary] -model = "gpt-4o-mini" -endpoint = "https://api.openai.com/v1" -max_tokens = 200 -temperature = 0.0 -# api_key = "sk-specific-key-for-summary" # Optional: override default - -# Retrieval client - used for retrieval decisions and content evaluation -# Can use a more capable model for better decisions -[llm.retrieval] -model = "gpt-4o" -endpoint = "https://api.openai.com/v1" -max_tokens = 100 -temperature = 0.0 -# api_key = "sk-specific-key-for-retrieval" # Optional: override default - -# Pilot client - used for intelligent navigation guidance -# Use a fast model for quick navigation decisions -[llm.pilot] -model = "gpt-4o-mini" -endpoint = "https://api.openai.com/v1" -max_tokens = 300 -temperature = 0.0 -# api_key = "sk-specific-key-for-pilot" # Optional: override default - -# Retry configuration (applies to all LLM calls) -[llm.retry] -max_attempts = 3 -initial_delay_ms = 500 -max_delay_ms = 30000 -multiplier = 2.0 -retry_on_rate_limit = true - -# Throttle/rate limiting configuration (applies to all LLM calls) -[llm.throttle] -max_concurrent_requests = 10 -requests_per_minute = 500 -enabled = true -semaphore_enabled = true - -# Fallback configuration (applies to all LLM calls) -[llm.fallback] -enabled = true -models = ["gpt-4o-mini", "glm-4-flash"] -on_rate_limit = "retry_then_fallback" -on_timeout = "retry_then_fallback" -on_all_failed = "return_error" - -# ============================================================================ -# Metrics Configuration (Unified) -# ============================================================================ - -[metrics] -enabled = true -storage_path = "./workspace/metrics" -retention_days = 30 - -[metrics.llm] -track_tokens = true -track_latency = true -track_cost = true -cost_per_1k_input_tokens = 0.00015 # gpt-4o-mini pricing -cost_per_1k_output_tokens = 0.0006 - -[metrics.pilot] -track_decisions = true -track_accuracy = true -track_feedback = true - -[metrics.retrieval] -track_paths = true -track_scores = true -track_iterations = true -track_cache = true - -# ============================================================================ -# Pilot Configuration -# ============================================================================ - -[pilot] -mode = "Balanced" # Aggressive | Balanced | Conservative | AlgorithmOnly -guide_at_start = true -guide_at_backtrack = true - -[pilot.budget] -max_tokens_per_query = 2000 -max_tokens_per_call = 500 -max_calls_per_query = 5 -max_calls_per_level = 2 -hard_limit = true - -[pilot.intervention] -fork_threshold = 3 -score_gap_threshold = 0.15 -low_score_threshold = 0.3 -max_interventions_per_level = 2 - -[pilot.feedback] -enabled = true -storage_path = "./workspace/feedback" -learning_rate = 0.1 -min_samples_for_learning = 10 - -# 
============================================================================ -# Retrieval Configuration -# ============================================================================ - -[retrieval] -model = "gpt-4o" -endpoint = "https://api.openai.com/v1" -top_k = 3 -max_tokens = 1000 -temperature = 0.0 - -[retrieval.search] -top_k = 5 -beam_width = 3 -max_iterations = 10 -min_score = 0.1 - -[retrieval.sufficiency] -min_tokens = 500 -target_tokens = 2000 -max_tokens = 4000 -min_content_length = 200 -confidence_threshold = 0.7 - -[retrieval.cache] -max_entries = 1000 -ttl_secs = 3600 - -[retrieval.strategy] -exploration_weight = 1.414 -similarity_threshold = 0.5 -high_similarity_threshold = 0.8 -low_similarity_threshold = 0.3 - -[retrieval.content] -enabled = true -token_budget = 4000 -min_relevance_score = 0.2 -scoring_strategy = "hybrid" # keyword | bm25 | hybrid -output_format = "markdown" -include_scores = false -hierarchical_min_per_level = 0.1 -deduplicate = true -dedup_threshold = 0.9 - -# ============================================================================ -# Multi-turn Retrieval Configuration -# ============================================================================ - -[retrieval.multiturn] -enabled = true -max_sub_queries = 3 -decomposition_model = "gpt-4o-mini" -aggregation_strategy = "merge" # merge | rank | synthesize - -# ============================================================================ -# Reference Following Configuration -# ============================================================================ - -[retrieval.reference] -enabled = true -max_depth = 3 -max_references = 10 -follow_pages = true -follow_tables_figures = true -min_confidence = 0.5 - -# ============================================================================ -# Storage Configuration -# ============================================================================ - -[storage] -workspace_dir = "./workspace" -cache_size = 100 -atomic_writes = true -file_lock = true -checksum_enabled = true - -[storage.compression] -enabled = false -algorithm = "gzip" -level = 6 - -# ============================================================================ -# Indexer Configuration -# ============================================================================ - -[indexer] -subsection_threshold = 300 -max_segment_tokens = 3000 -max_summary_tokens = 200 -min_summary_tokens = 20 -"# - .to_string() - } - - /// Generate a minimal example TOML file. 
- pub fn to_minimal_toml(&self) -> String { - r#"# Minimal Vectorless Configuration -# Most options have sensible defaults - -[llm] -api_key = "your-api-key-here" - -[retrieval] -top_k = 5 -"# - .to_string() - } -} - -impl Default for ConfigDocs { - fn default() -> Self { - Self::with_defaults() - } -} - -#[cfg(test)] -mod tests { - use super::*; - - #[test] - fn test_config_docs_markdown() { - let docs = ConfigDocs::with_defaults(); - let md = docs.to_markdown(); - - assert!(md.contains("# Configuration Reference")); - assert!(md.contains("## `[llm]`")); - assert!(md.contains("## `[llm.summary]`")); - assert!(md.contains("## `[metrics]`")); - assert!(md.contains("## `[pilot]`")); - assert!(md.contains("## `[retrieval]`")); - assert!(md.contains("## `[retrieval.content]`")); - } - - #[test] - fn test_config_docs_toml() { - let docs = ConfigDocs::with_defaults(); - let toml = docs.to_example_toml(); - - assert!(toml.contains("[llm]") || toml.contains("[indexer]")); - } - - #[test] - fn test_config_docs_minimal_toml() { - let docs = ConfigDocs::with_defaults(); - let toml = docs.to_minimal_toml(); - - assert!(toml.contains("[llm]")); - assert!(toml.len() < 200); // Should be minimal - } -} diff --git a/rust/src/config/loader.rs b/rust/src/config/loader.rs index e436a315..4fad51eb 100644 --- a/rust/src/config/loader.rs +++ b/rust/src/config/loader.rs @@ -238,40 +238,6 @@ impl ConfigLoader { } } -/// Default configuration file names to search for. -pub const CONFIG_FILE_NAMES: &[&str] = &["vectorless.toml", "config.toml", ".vectorless.toml"]; - -/// Find a configuration file in current or parent directories. -pub fn find_config_file() -> Option { - let current_dir = std::env::current_dir().ok()?; - - // Search in current directory first - for name in CONFIG_FILE_NAMES { - let path = current_dir.join(name); - if path.exists() { - return Some(path); - } - } - - // Search in parent directories (up to 3 levels) - let mut dir = current_dir.as_path(); - for _ in 0..3 { - if let Some(parent) = dir.parent() { - for name in CONFIG_FILE_NAMES { - let path = parent.join(name); - if path.exists() { - return Some(path); - } - } - dir = parent; - } else { - break; - } - } - - None -} - #[cfg(test)] mod tests { use super::*; diff --git a/rust/src/config/mod.rs b/rust/src/config/mod.rs index 4fa305a3..b73d21c2 100644 --- a/rust/src/config/mod.rs +++ b/rust/src/config/mod.rs @@ -6,7 +6,6 @@ //! Users configure vectorless via [`EngineBuilder`](crate::client::EngineBuilder) methods, //! not by directly interacting with this module. -mod docs; mod loader; mod merge; mod types; diff --git a/rust/src/config/types/mod.rs b/rust/src/config/types/mod.rs index c60763ef..ea6cedbd 100644 --- a/rust/src/config/types/mod.rs +++ b/rust/src/config/types/mod.rs @@ -68,6 +68,10 @@ pub struct Config { #[serde(default)] pub concurrency: ConcurrencyConfig, + /// Document graph configuration. + #[serde(default)] + pub graph: crate::graph::DocumentGraphConfig, + /// Fallback/error recovery configuration (legacy, prefer llm.fallback). #[serde(default)] pub fallback: FallbackConfig, @@ -83,6 +87,7 @@ impl Default for Config { retrieval: RetrievalConfig::default(), storage: StorageConfig::default(), concurrency: ConcurrencyConfig::default(), + graph: crate::graph::DocumentGraphConfig::default(), fallback: FallbackConfig::default(), } } @@ -136,6 +141,12 @@ impl Config { self } + /// Set the document graph configuration. 
+ pub fn with_graph(mut self, graph: crate::graph::DocumentGraphConfig) -> Self { + self.graph = graph; + self + } + /// Set the fallback configuration. pub fn with_fallback(mut self, fallback: FallbackConfig) -> Self { self.fallback = fallback; @@ -209,6 +220,20 @@ impl Config { )); } + // Validate graph + if self.graph.min_keyword_jaccard < 0.0 || self.graph.min_keyword_jaccard > 1.0 { + errors.push(ValidationError::error( + "graph.min_keyword_jaccard", + "Must be between 0.0 and 1.0", + )); + } + if self.graph.max_edges_per_node == 0 { + errors.push(ValidationError::error( + "graph.max_edges_per_node", + "Must be greater than 0", + )); + } + // Validate fallback if self.fallback.enabled && self.fallback.models.is_empty() { errors.push(ValidationError::warning( diff --git a/rust/src/document/graph.rs b/rust/src/document/graph.rs index 988c5e8f..2a4cade8 100644 --- a/rust/src/document/graph.rs +++ b/rust/src/document/graph.rs @@ -1,358 +1,12 @@ // Copyright (c) 2026 vectorless developers // SPDX-License-Identifier: Apache-2.0 -//! Document Graph — cross-document relationship graph. +//! Re-export all graph types from the standalone `graph` module. //! -//! A workspace-scoped, weighted graph connecting documents by shared -//! concepts, keywords, and references. Built from each document's -//! [`ReasoningIndex`] data, it enables graph-aware retrieval ranking. +//! This shim preserves backward compatibility for code importing +//! from `crate::document::DocumentGraph`. -use std::collections::HashMap; - -use serde::{Deserialize, Serialize}; - -/// A workspace-scoped document relationship graph. -/// -/// Nodes represent documents, edges represent relationships (shared keywords, -/// references). The graph is immutable after construction and can be shared -/// across threads via `Arc`. -#[derive(Debug, Clone, Serialize, Deserialize)] -pub struct DocumentGraph { - /// All document nodes, indexed by doc_id. - nodes: HashMap, - - /// Adjacency list: doc_id → outgoing edges. - edges: HashMap>, - - /// Inverted index: keyword → documents containing this keyword. - keyword_index: HashMap>, - - /// Graph-level metadata. - metadata: GraphMetadata, -} - -/// Expose edges field for graph builder trimming. -impl DocumentGraph { - /// Take all edges out, leaving an empty map in their place. - pub(crate) fn take_edges(&mut self) -> HashMap> { - std::mem::take(&mut self.edges) - } - - /// Set edges directly (used by builder after trimming). - pub(crate) fn set_edges(&mut self, edges: HashMap>) { - self.metadata.edge_count = edges.values().map(|v| v.len()).sum(); - self.edges = edges; - } - - /// Get a clone of the keyword index (used by builder for edge computation). - pub(crate) fn keyword_index_clone(&self) -> HashMap> { - self.keyword_index.clone() - } -} - -impl DocumentGraph { - /// Create a new empty document graph. - pub fn new() -> Self { - Self { - nodes: HashMap::new(), - edges: HashMap::new(), - keyword_index: HashMap::new(), - metadata: GraphMetadata { - document_count: 0, - edge_count: 0, - }, - } - } - - /// Add a document node to the graph. 
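The new `[graph]` section plugs into `Config` through `with_graph` and is bounds-checked by `validate()`. A short in-crate sketch (written as crate-internal code, since the config module's docs say it is not driven directly by users; the import paths are assumptions):

```rust
use crate::config::Config;                 // path assumed
use crate::graph::DocumentGraphConfig;     // path assumed

fn tuned() -> Config {
    Config::default().with_graph(DocumentGraphConfig {
        enabled: true,
        min_keyword_jaccard: 0.2,  // validate(): must lie in [0.0, 1.0]
        min_shared_keywords: 3,
        max_keywords_per_doc: 50,
        max_edges_per_node: 10,    // validate(): must be > 0
        retrieval_boost_factor: 0.15,
    })
}
```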
- pub fn add_node(&mut self, node: DocumentGraphNode) { - // Populate keyword index from the node's top keywords - for kw in &node.top_keywords { - self.keyword_index - .entry(kw.keyword.clone()) - .or_default() - .push(KeywordDocEntry { - doc_id: node.doc_id.clone(), - weight: kw.weight, - }); - } - let doc_id = node.doc_id.clone(); - self.nodes.insert(doc_id, node); - self.metadata.document_count = self.nodes.len(); - } - - /// Add a directed edge from `source` to `target`. - pub fn add_edge(&mut self, source: &str, edge: GraphEdge) { - self.edges - .entry(source.to_string()) - .or_default() - .push(edge); - self.metadata.edge_count = self.edges.values().map(|v| v.len()).sum(); - } - - /// Get a document node by ID. - pub fn get_node(&self, doc_id: &str) -> Option<&DocumentGraphNode> { - self.nodes.get(doc_id) - } - - /// Get all edges outgoing from a document. - pub fn get_neighbors(&self, doc_id: &str) -> &[GraphEdge] { - self.edges.get(doc_id).map_or(&[], Vec::as_slice) - } - - /// Find documents containing a keyword. - pub fn find_by_keyword(&self, keyword: &str) -> &[KeywordDocEntry] { - self.keyword_index - .get(keyword) - .map_or(&[], Vec::as_slice) - } - - /// Get the number of documents in the graph. - pub fn node_count(&self) -> usize { - self.nodes.len() - } - - /// Get the number of edges in the graph. - pub fn edge_count(&self) -> usize { - self.edges.values().map(|v| v.len()).sum() - } - - /// Get all document IDs in the graph. - pub fn doc_ids(&self) -> impl Iterator { - self.nodes.keys().map(|s| s.as_str()) - } - - /// Get graph metadata. - pub fn metadata(&self) -> &GraphMetadata { - &self.metadata - } - - /// Check if the graph is empty. - pub fn is_empty(&self) -> bool { - self.nodes.is_empty() - } -} - -impl Default for DocumentGraph { - fn default() -> Self { - Self::new() - } -} - -/// A document node in the graph. -#[derive(Debug, Clone, Serialize, Deserialize)] -pub struct DocumentGraphNode { - /// Document ID (matches `PersistedDocument.meta.id`). - pub doc_id: String, - /// Document title/name. - pub title: String, - /// Document format (md, pdf, docx). - pub format: String, - /// Top-N representative keywords extracted from the document's - /// ReasoningIndex topic_paths, sorted by aggregate weight. - pub top_keywords: Vec, - /// Number of nodes in the document tree. - pub node_count: usize, -} - -/// A keyword with its aggregate weight across the document. -#[derive(Debug, Clone, Serialize, Deserialize)] -pub struct WeightedKeyword { - /// The keyword string (lowercased). - pub keyword: String, - /// Aggregate weight across all TopicEntry instances (0.0 - 1.0). - pub weight: f32, -} - -/// An edge connecting two documents. -#[derive(Debug, Clone, Serialize, Deserialize)] -pub struct GraphEdge { - /// Target document ID. - pub target_doc_id: String, - /// Edge weight (0.0 - 1.0). Higher = stronger relationship. - pub weight: f32, - /// Evidence for why these documents are connected. - pub evidence: EdgeEvidence, -} - -/// Evidence for why two documents are connected. -#[derive(Debug, Clone, Serialize, Deserialize)] -pub struct EdgeEvidence { - /// Keywords shared between the two documents. - pub shared_keywords: Vec, - /// Number of shared keywords. - pub shared_keyword_count: usize, - /// Jaccard similarity of keyword sets. - pub keyword_jaccard: f32, -} - -/// A keyword shared between two documents. -#[derive(Debug, Clone, Serialize, Deserialize)] -pub struct SharedKeyword { - /// The shared keyword. - pub keyword: String, - /// Weight in source document. 
- pub source_weight: f32, - /// Weight in target document. - pub target_weight: f32, -} - -/// Entry in the keyword inverted index. -#[derive(Debug, Clone, Serialize, Deserialize)] -pub struct KeywordDocEntry { - /// Document ID containing this keyword. - pub doc_id: String, - /// Weight of this keyword in the document. - pub weight: f32, -} - -/// Graph-level metadata. -#[derive(Debug, Clone, Serialize, Deserialize)] -pub struct GraphMetadata { - /// Number of documents in the graph. - pub document_count: usize, - /// Number of edges in the graph. - pub edge_count: usize, -} - -/// Configuration for building the document graph. -#[derive(Debug, Clone, Serialize, Deserialize)] -pub struct DocumentGraphConfig { - /// Whether graph building is enabled. - pub enabled: bool, - /// Minimum Jaccard similarity for creating an edge. - pub min_keyword_jaccard: f32, - /// Minimum shared keywords to create an edge. - pub min_shared_keywords: usize, - /// Maximum top keywords per document node. - pub max_keywords_per_doc: usize, - /// Maximum edges per document node. - pub max_edges_per_node: usize, - /// Boost factor applied to graph-connected documents during retrieval. - pub retrieval_boost_factor: f32, -} - -impl Default for DocumentGraphConfig { - fn default() -> Self { - Self { - enabled: true, - min_keyword_jaccard: 0.1, - min_shared_keywords: 2, - max_keywords_per_doc: 50, - max_edges_per_node: 20, - retrieval_boost_factor: 0.15, - } - } -} - -impl DocumentGraphConfig { - /// Create a new config with defaults. - pub fn new() -> Self { - Self::default() - } - - /// Create a disabled config. - pub fn disabled() -> Self { - Self { - enabled: false, - ..Self::default() - } - } -} - -#[cfg(test)] -mod tests { - use super::*; - - #[test] - fn test_empty_graph() { - let graph = DocumentGraph::new(); - assert!(graph.is_empty()); - assert_eq!(graph.node_count(), 0); - assert_eq!(graph.edge_count(), 0); - } - - #[test] - fn test_add_node() { - let mut graph = DocumentGraph::new(); - graph.add_node(DocumentGraphNode { - doc_id: "doc1".to_string(), - title: "Test Doc".to_string(), - format: "md".to_string(), - top_keywords: vec![ - WeightedKeyword { keyword: "rust".to_string(), weight: 0.9 }, - WeightedKeyword { keyword: "async".to_string(), weight: 0.7 }, - ], - node_count: 10, - }); - - assert_eq!(graph.node_count(), 1); - assert!(graph.get_node("doc1").is_some()); - assert_eq!(graph.find_by_keyword("rust").len(), 1); - assert_eq!(graph.find_by_keyword("async").len(), 1); - assert_eq!(graph.find_by_keyword("missing").len(), 0); - } - - #[test] - fn test_add_edge() { - let mut graph = DocumentGraph::new(); - graph.add_node(DocumentGraphNode { - doc_id: "doc1".to_string(), - title: "A".to_string(), - format: "md".to_string(), - top_keywords: vec![], - node_count: 5, - }); - graph.add_node(DocumentGraphNode { - doc_id: "doc2".to_string(), - title: "B".to_string(), - format: "md".to_string(), - top_keywords: vec![], - node_count: 8, - }); - - graph.add_edge("doc1", GraphEdge { - target_doc_id: "doc2".to_string(), - weight: 0.5, - evidence: EdgeEvidence { - shared_keywords: vec![SharedKeyword { - keyword: "rust".to_string(), - source_weight: 0.9, - target_weight: 0.8, - }], - shared_keyword_count: 1, - keyword_jaccard: 0.3, - }, - }); - - assert_eq!(graph.edge_count(), 1); - assert_eq!(graph.get_neighbors("doc1").len(), 1); - assert_eq!(graph.get_neighbors("doc1")[0].target_doc_id, "doc2"); - assert_eq!(graph.get_neighbors("doc2").len(), 0); - } - - #[test] - fn test_config_default() { - let config = 
DocumentGraphConfig::default();
-        assert!(config.enabled);
-        assert!((config.min_keyword_jaccard - 0.1).abs() < f32::EPSILON);
-        assert_eq!(config.min_shared_keywords, 2);
-    }
-
-    #[test]
-    fn test_serialization_roundtrip() {
-        let mut graph = DocumentGraph::new();
-        graph.add_node(DocumentGraphNode {
-            doc_id: "doc1".to_string(),
-            title: "Test".to_string(),
-            format: "md".to_string(),
-            top_keywords: vec![WeightedKeyword { keyword: "test".to_string(), weight: 1.0 }],
-            node_count: 3,
-        });
-
-        let json = serde_json::to_string(&graph).unwrap();
-        let deserialized: DocumentGraph = serde_json::from_str(&json).unwrap();
-        assert_eq!(deserialized.node_count(), 1);
-        assert_eq!(deserialized.get_node("doc1").unwrap().title, "Test");
-    }
-}
+pub use crate::graph::{
+    DocumentGraph, DocumentGraphConfig, DocumentGraphNode, EdgeEvidence, GraphEdge, GraphMetadata,
+    KeywordDocEntry, SharedKeyword, WeightedKeyword,
+};
diff --git a/rust/src/document/reasoning.rs b/rust/src/document/reasoning.rs
index 0beeb730..5aaae8bd 100644
--- a/rust/src/document/reasoning.rs
+++ b/rust/src/document/reasoning.rs
@@ -80,6 +80,11 @@ impl ReasoningIndex {
         self.section_map.get(&title.to_lowercase()).copied()
     }
 
+    /// Iterate over all keyword → topic entries (for graph building).
+    pub fn all_topic_entries(&self) -> impl Iterator<Item = (&String, &[TopicEntry])> {
+        self.topic_paths.iter().map(|(k, v)| (k, v.as_slice()))
+    }
+
     /// Get the number of topic keywords indexed.
     pub fn topic_count(&self) -> usize {
         self.topic_paths.len()
diff --git a/rust/src/index/graph_builder.rs b/rust/src/graph/builder.rs
similarity index 98%
rename from rust/src/index/graph_builder.rs
rename to rust/src/graph/builder.rs
index b749cc14..62ee87a5 100644
--- a/rust/src/index/graph_builder.rs
+++ b/rust/src/graph/builder.rs
@@ -10,9 +10,9 @@ use std::collections::HashMap;
 
 use tracing::info;
 
-use crate::document::{
-    DocumentGraph, DocumentGraphConfig, DocumentGraphNode, EdgeEvidence, GraphEdge, SharedKeyword,
-    WeightedKeyword,
+use super::config::DocumentGraphConfig;
+use super::types::{
+    DocumentGraph, DocumentGraphNode, EdgeEvidence, GraphEdge, SharedKeyword, WeightedKeyword,
 };
 
 /// Intermediate data collected per document during graph building.
diff --git a/rust/src/graph/config.rs b/rust/src/graph/config.rs
new file mode 100644
index 00000000..40b1d888
--- /dev/null
+++ b/rust/src/graph/config.rs
@@ -0,0 +1,51 @@
+// Copyright (c) 2026 vectorless developers
+// SPDX-License-Identifier: Apache-2.0
+
+//! Configuration for document graph building and retrieval.
+
+use serde::{Deserialize, Serialize};
+
+/// Configuration for building the document graph.
+#[derive(Debug, Clone, Serialize, Deserialize)]
+pub struct DocumentGraphConfig {
+    /// Whether graph building is enabled.
+    pub enabled: bool,
+    /// Minimum Jaccard similarity for creating an edge.
+    pub min_keyword_jaccard: f32,
+    /// Minimum shared keywords to create an edge.
+    pub min_shared_keywords: usize,
+    /// Maximum top keywords per document node.
+    pub max_keywords_per_doc: usize,
+    /// Maximum edges per document node.
+    pub max_edges_per_node: usize,
+    /// Boost factor applied to graph-connected documents during retrieval.
+    pub retrieval_boost_factor: f32,
+}
+
+impl Default for DocumentGraphConfig {
+    fn default() -> Self {
+        Self {
+            enabled: true,
+            min_keyword_jaccard: 0.1,
+            min_shared_keywords: 2,
+            max_keywords_per_doc: 50,
+            max_edges_per_node: 20,
+            retrieval_boost_factor: 0.15,
+        }
+    }
+}
+
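`all_topic_entries()` above is the feed for the graph builder. A sketch of how a builder might collapse those topic entries into the per-document keyword profile; the aggregation scheme (sum weights per lowercased keyword, keep the top N) is an assumption, only `all_topic_entries()`, `WeightedKeyword`, and `max_keywords_per_doc` come from this diff:

```rust
use std::collections::HashMap;

struct WeightedKeyword {
    keyword: String,
    weight: f32,
}

fn top_keywords(entries: &[(String, f32)], max_keywords_per_doc: usize) -> Vec<WeightedKeyword> {
    // Sum weights per (lowercased) keyword across all topic entries.
    let mut agg: HashMap<String, f32> = HashMap::new();
    for (kw, w) in entries {
        *agg.entry(kw.to_lowercase()).or_insert(0.0) += w;
    }
    let mut kws: Vec<WeightedKeyword> = agg
        .into_iter()
        .map(|(keyword, weight)| WeightedKeyword { keyword, weight })
        .collect();
    // Keep only the strongest keywords, mirroring max_keywords_per_doc.
    kws.sort_by(|a, b| b.weight.total_cmp(&a.weight));
    kws.truncate(max_keywords_per_doc);
    kws
}

fn main() {
    let entries = vec![("Rust".into(), 0.6), ("rust".into(), 0.3), ("async".into(), 0.7)];
    let top = top_keywords(&entries, 1);
    assert_eq!(top[0].keyword, "rust"); // 0.6 + 0.3 beats async's 0.7
}
```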
+ pub fn new() -> Self { + Self::default() + } + + /// Create a disabled config. + pub fn disabled() -> Self { + Self { + enabled: false, + ..Self::default() + } + } +} diff --git a/rust/src/graph/mod.rs b/rust/src/graph/mod.rs new file mode 100644 index 00000000..6c084e22 --- /dev/null +++ b/rust/src/graph/mod.rs @@ -0,0 +1,41 @@ +// Copyright (c) 2026 vectorless developers +// SPDX-License-Identifier: Apache-2.0 + +//! Document graph module — workspace-level cross-document relationship graph. +//! +//! This module provides: +//! - [`DocumentGraph`] — the graph data structure connecting documents by shared concepts +//! - [`DocumentGraphBuilder`] — constructs the graph from document keyword profiles +//! - [`DocumentGraphConfig`] — configuration for graph building and retrieval boosting +//! +//! The document graph is a workspace-scoped, weighted graph built from each document's +//! [`ReasoningIndex`](crate::document::ReasoningIndex) keyword data. It enables +//! graph-aware retrieval ranking where connected documents receive a relevance boost. +//! +//! # Data Flow +//! +//! ```text +//! Document Indexing → ReasoningIndex (topic_paths) +//! ↓ +//! DocumentGraphBuilder::add_document() +//! ↓ +//! DocumentGraph +//! ↓ +//! Workspace::set_graph() +//! ↓ +//! Engine::query() loads graph +//! ↓ +//! CrossDocumentStrategy (graph boosting) +//! ``` + +mod builder; +mod config; +mod types; + +// Re-export public API +pub use builder::DocumentGraphBuilder; +pub use config::DocumentGraphConfig; +pub use types::{ + DocumentGraph, DocumentGraphNode, EdgeEvidence, GraphEdge, GraphMetadata, KeywordDocEntry, + SharedKeyword, WeightedKeyword, +}; diff --git a/rust/src/graph/types.rs b/rust/src/graph/types.rs new file mode 100644 index 00000000..3a8577a6 --- /dev/null +++ b/rust/src/graph/types.rs @@ -0,0 +1,303 @@ +// Copyright (c) 2026 vectorless developers +// SPDX-License-Identifier: Apache-2.0 + +//! Document Graph data types. +//! +//! Core data structures for the workspace-scoped, weighted document relationship graph. + +use std::collections::HashMap; + +use serde::{Deserialize, Serialize}; + +/// A workspace-scoped document relationship graph. +/// +/// Nodes represent documents, edges represent relationships (shared keywords, +/// references). The graph is immutable after construction and can be shared +/// across threads via `Arc`. +#[derive(Debug, Clone, Serialize, Deserialize)] +pub struct DocumentGraph { + /// All document nodes, indexed by doc_id. + nodes: HashMap<String, DocumentGraphNode>, + + /// Adjacency list: doc_id → outgoing edges. + edges: HashMap<String, Vec<GraphEdge>>, + + /// Inverted index: keyword → documents containing this keyword. + keyword_index: HashMap<String, Vec<KeywordDocEntry>>, + + /// Graph-level metadata. + metadata: GraphMetadata, +} + +/// Expose fields for graph builder (same module). +impl DocumentGraph { + /// Take all edges out, leaving an empty map in their place. + pub(crate) fn take_edges(&mut self) -> HashMap<String, Vec<GraphEdge>> { + std::mem::take(&mut self.edges) + } + + /// Set edges directly (used by builder after trimming). + pub(crate) fn set_edges(&mut self, edges: HashMap<String, Vec<GraphEdge>>) { + self.metadata.edge_count = edges.values().map(|v| v.len()).sum(); + self.edges = edges; + } + + /// Get a clone of the keyword index (used by builder for edge computation). + pub(crate) fn keyword_index_clone(&self) -> HashMap<String, Vec<KeywordDocEntry>> { + self.keyword_index.clone() + } +} + +impl DocumentGraph { + /// Create a new empty document graph.
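+    ///
+    /// Minimal sketch (mirrors the unit tests at the bottom of this file):
+    ///
+    /// ```ignore
+    /// let mut graph = DocumentGraph::new();
+    /// assert!(graph.is_empty());
+    /// assert_eq!(graph.node_count(), 0);
+    /// ```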
+ pub fn new() -> Self { + Self { + nodes: HashMap::new(), + edges: HashMap::new(), + keyword_index: HashMap::new(), + metadata: GraphMetadata { + document_count: 0, + edge_count: 0, + }, + } + } + + /// Add a document node to the graph. + pub fn add_node(&mut self, node: DocumentGraphNode) { + // Populate keyword index from the node's top keywords + for kw in &node.top_keywords { + self.keyword_index + .entry(kw.keyword.clone()) + .or_default() + .push(KeywordDocEntry { + doc_id: node.doc_id.clone(), + weight: kw.weight, + }); + } + let doc_id = node.doc_id.clone(); + self.nodes.insert(doc_id, node); + self.metadata.document_count = self.nodes.len(); + } + + /// Add a directed edge from `source` to `target`. + pub fn add_edge(&mut self, source: &str, edge: GraphEdge) { + self.edges + .entry(source.to_string()) + .or_default() + .push(edge); + self.metadata.edge_count = self.edges.values().map(|v| v.len()).sum(); + } + + /// Get a document node by ID. + pub fn get_node(&self, doc_id: &str) -> Option<&DocumentGraphNode> { + self.nodes.get(doc_id) + } + + /// Get all edges outgoing from a document. + pub fn get_neighbors(&self, doc_id: &str) -> &[GraphEdge] { + self.edges.get(doc_id).map_or(&[], Vec::as_slice) + } + + /// Find documents containing a keyword. + pub fn find_by_keyword(&self, keyword: &str) -> &[KeywordDocEntry] { + self.keyword_index + .get(keyword) + .map_or(&[], Vec::as_slice) + } + + /// Get the number of documents in the graph. + pub fn node_count(&self) -> usize { + self.nodes.len() + } + + /// Get the number of edges in the graph. + pub fn edge_count(&self) -> usize { + self.edges.values().map(|v| v.len()).sum() + } + + /// Get all document IDs in the graph. + pub fn doc_ids(&self) -> impl Iterator<Item = &str> { + self.nodes.keys().map(|s| s.as_str()) + } + + /// Get graph metadata. + pub fn metadata(&self) -> &GraphMetadata { + &self.metadata + } + + /// Check if the graph is empty. + pub fn is_empty(&self) -> bool { + self.nodes.is_empty() + } +} + +impl Default for DocumentGraph { + fn default() -> Self { + Self::new() + } +} + +/// A document node in the graph. +#[derive(Debug, Clone, Serialize, Deserialize)] +pub struct DocumentGraphNode { + /// Document ID (matches `PersistedDocument.meta.id`). + pub doc_id: String, + /// Document title/name. + pub title: String, + /// Document format (md, pdf, docx). + pub format: String, + /// Top-N representative keywords extracted from the document's + /// ReasoningIndex topic_paths, sorted by aggregate weight. + pub top_keywords: Vec<WeightedKeyword>, + /// Number of nodes in the document tree. + pub node_count: usize, +} + +/// A keyword with its aggregate weight across the document. +#[derive(Debug, Clone, Serialize, Deserialize)] +pub struct WeightedKeyword { + /// The keyword string (lowercased). + pub keyword: String, + /// Aggregate weight across all TopicEntry instances (0.0 - 1.0). + pub weight: f32, +} + +/// An edge connecting two documents. +#[derive(Debug, Clone, Serialize, Deserialize)] +pub struct GraphEdge { + /// Target document ID. + pub target_doc_id: String, + /// Edge weight (0.0 - 1.0). Higher = stronger relationship. + pub weight: f32, + /// Evidence for why these documents are connected. + pub evidence: EdgeEvidence, +} + +/// Evidence for why two documents are connected. +#[derive(Debug, Clone, Serialize, Deserialize)] +pub struct EdgeEvidence { + /// Keywords shared between the two documents. + pub shared_keywords: Vec<SharedKeyword>, + /// Number of shared keywords. + pub shared_keyword_count: usize, + /// Jaccard similarity of keyword sets.
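+    /// For example, two documents with 10 keywords each that share 4 of them
+    /// have a Jaccard similarity of 4 / (10 + 10 - 4) = 0.25.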
+ pub keyword_jaccard: f32, +} + +/// A keyword shared between two documents. +#[derive(Debug, Clone, Serialize, Deserialize)] +pub struct SharedKeyword { + /// The shared keyword. + pub keyword: String, + /// Weight in source document. + pub source_weight: f32, + /// Weight in target document. + pub target_weight: f32, +} + +/// Entry in the keyword inverted index. +#[derive(Debug, Clone, Serialize, Deserialize)] +pub struct KeywordDocEntry { + /// Document ID containing this keyword. + pub doc_id: String, + /// Weight of this keyword in the document. + pub weight: f32, +} + +/// Graph-level metadata. +#[derive(Debug, Clone, Serialize, Deserialize)] +pub struct GraphMetadata { + /// Number of documents in the graph. + pub document_count: usize, + /// Number of edges in the graph. + pub edge_count: usize, +} + +#[cfg(test)] +mod tests { + use super::*; + + #[test] + fn test_empty_graph() { + let graph = DocumentGraph::new(); + assert!(graph.is_empty()); + assert_eq!(graph.node_count(), 0); + assert_eq!(graph.edge_count(), 0); + } + + #[test] + fn test_add_node() { + let mut graph = DocumentGraph::new(); + graph.add_node(DocumentGraphNode { + doc_id: "doc1".to_string(), + title: "Test Doc".to_string(), + format: "md".to_string(), + top_keywords: vec![ + WeightedKeyword { keyword: "rust".to_string(), weight: 0.9 }, + WeightedKeyword { keyword: "async".to_string(), weight: 0.7 }, + ], + node_count: 10, + }); + + assert_eq!(graph.node_count(), 1); + assert!(graph.get_node("doc1").is_some()); + assert_eq!(graph.find_by_keyword("rust").len(), 1); + assert_eq!(graph.find_by_keyword("async").len(), 1); + assert_eq!(graph.find_by_keyword("missing").len(), 0); + } + + #[test] + fn test_add_edge() { + let mut graph = DocumentGraph::new(); + graph.add_node(DocumentGraphNode { + doc_id: "doc1".to_string(), + title: "A".to_string(), + format: "md".to_string(), + top_keywords: vec![], + node_count: 5, + }); + graph.add_node(DocumentGraphNode { + doc_id: "doc2".to_string(), + title: "B".to_string(), + format: "md".to_string(), + top_keywords: vec![], + node_count: 8, + }); + + graph.add_edge("doc1", GraphEdge { + target_doc_id: "doc2".to_string(), + weight: 0.5, + evidence: EdgeEvidence { + shared_keywords: vec![SharedKeyword { + keyword: "rust".to_string(), + source_weight: 0.9, + target_weight: 0.8, + }], + shared_keyword_count: 1, + keyword_jaccard: 0.3, + }, + }); + + assert_eq!(graph.edge_count(), 1); + assert_eq!(graph.get_neighbors("doc1").len(), 1); + assert_eq!(graph.get_neighbors("doc1")[0].target_doc_id, "doc2"); + assert_eq!(graph.get_neighbors("doc2").len(), 0); + } + + #[test] + fn test_serialization_roundtrip() { + let mut graph = DocumentGraph::new(); + graph.add_node(DocumentGraphNode { + doc_id: "doc1".to_string(), + title: "Test".to_string(), + format: "md".to_string(), + top_keywords: vec![WeightedKeyword { keyword: "test".to_string(), weight: 1.0 }], + node_count: 3, + }); + + let json = serde_json::to_string(&graph).unwrap(); + let deserialized: DocumentGraph = serde_json::from_str(&json).unwrap(); + assert_eq!(deserialized.node_count(), 1); + assert_eq!(deserialized.get_node("doc1").unwrap().title, "Test"); + } +} diff --git a/rust/src/index/config.rs b/rust/src/index/config.rs index 5d982183..c6fa74ea 100644 --- a/rust/src/index/config.rs +++ b/rust/src/index/config.rs @@ -11,7 +11,8 @@ use super::summary::SummaryStrategy; use crate::config::{ConcurrencyConfig, IndexerConfig}; -use crate::document::ReasoningIndexConfig; +use crate::document::{DocumentTree, ReasoningIndexConfig}; 
+use crate::utils::fingerprint::{Fingerprint, Fingerprinter}; /// Index mode for document processing. #[derive(Debug, Clone, Copy, PartialEq, Eq)] @@ -157,6 +158,14 @@ pub struct PipelineOptions { /// Reasoning index configuration. pub reasoning_index: ReasoningIndexConfig, + + /// Existing tree from a previous index (for incremental updates). + /// Stages (enhance, reasoning) can reuse data from unchanged nodes. + pub existing_tree: Option<DocumentTree>, + + /// Current processing version. Bumped when indexing algorithm changes + /// to force reprocessing of existing documents. + pub processing_version: u32, } impl Default for PipelineOptions { @@ -171,6 +180,8 @@ concurrency: ConcurrencyConfig::default(), indexer: IndexerConfig::default(), reasoning_index: ReasoningIndexConfig::default(), + existing_tree: None, + processing_version: 1, } } } @@ -234,6 +245,21 @@ impl PipelineOptions { self.reasoning_index = config; self } + + /// Compute a fingerprint of the pipeline configuration. + /// + /// If this fingerprint changes between runs, all documents need full reprocessing + /// even if their content hasn't changed (because the processing logic is different). + pub fn logic_fingerprint(&self) -> Fingerprint { + Fingerprinter::new() + .with_str(&format!("{:?}", self.mode)) + .with_bool(self.generate_ids) + .with_str(&format!("{:?}", self.summary_strategy)) + .with_bool(self.generate_description) + .with_bool(self.optimization.enabled) + .with_str(&format!("{:?}", self.reasoning_index)) + .into_fingerprint() + } } #[cfg(test)] diff --git a/rust/src/index/incremental/mod.rs b/rust/src/index/incremental/mod.rs index 741f4ae1..2933d198 100644 --- a/rust/src/index/incremental/mod.rs +++ b/rust/src/index/incremental/mod.rs @@ -15,10 +15,71 @@ //! - **Partial updates**: Only reprocess changed nodes mod detector; +mod resolver; mod updater; pub use detector::{ ChangeDetector, ChangeDetectorState, ChangeSet, ChangeType, DocumentChangeInfo, NodeChange, compute_all_node_fingerprints, compute_tree_fingerprint, }; +pub use resolver::{IndexAction, SkipInfo, resolve_action}; pub use updater::PartialUpdater; + +use std::collections::HashMap; +use crate::document::DocumentTree; + +/// Reuse summaries from old tree for unchanged nodes in the new tree. +/// +/// Uses `ChangeDetector` to find which nodes changed, then copies +/// summaries from old tree nodes with matching titles that are unchanged. +/// +/// Returns a map of `title -> summary` for reusable summaries. +pub fn compute_reusable_summaries( + old_tree: &DocumentTree, + new_tree: &DocumentTree, +) -> HashMap<String, String> { + let detector = ChangeDetector::new(); + let changes = detector.detect_changes(old_tree, new_tree); + + let changed_titles: std::collections::HashSet<String> = changes.modified + .iter() + .chain(changes.restructured.iter()) + .chain(changes.added.iter()) + .chain(changes.removed.iter()) + .map(|c| c.title.clone()) + .collect(); + + let mut reusable = HashMap::new(); + for node_id in old_tree.traverse() { + if let Some(node) = old_tree.get(node_id) { + if !changed_titles.contains(&node.title) && !node.summary.is_empty() { + reusable.insert(node.title.clone(), node.summary.clone()); + } + } + } + reusable +} + +/// Apply reusable summaries to a new tree. +/// +/// For each node in `new_tree` whose title matches a key in `summaries`, +/// sets the node's summary from the map. +/// +/// Returns the number of summaries applied.
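+///
+/// Illustrative sketch of combining the two helpers (the trees are assumed to
+/// come from a previous index and a fresh parse of the same source):
+///
+/// ```ignore
+/// let reusable = compute_reusable_summaries(&old_tree, &new_tree);
+/// let applied = apply_reusable_summaries(&mut new_tree, &reusable);
+/// tracing::info!("reused {} summaries", applied);
+/// ```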
+pub fn apply_reusable_summaries( + new_tree: &mut DocumentTree, + summaries: &HashMap<String, String>, +) -> usize { + let mut applied = 0; + for node_id in new_tree.traverse() { + if let Some(node) = new_tree.get(node_id) { + if node.summary.is_empty() { + if let Some(summary) = summaries.get(&node.title) { + new_tree.set_summary(node_id, summary); + applied += 1; + } + } + } + } + applied +} diff --git a/rust/src/index/incremental/resolver.rs b/rust/src/index/incremental/resolver.rs new file mode 100644 index 00000000..4d3d7031 --- /dev/null +++ b/rust/src/index/incremental/resolver.rs @@ -0,0 +1,102 @@ +// Copyright (c) 2026 vectorless developers +// SPDX-License-Identifier: Apache-2.0 + +//! Incremental indexing resolver — decides what action to take for a source. +//! +//! Three-layer change detection: +//! 1. **File-level**: content fingerprint → skip if unchanged +//! 2. **Logic-level**: pipeline config fingerprint → full reprocess if changed +//! 3. **Node-level**: Merkle subtree diff → incremental update + +use tracing::info; + +use crate::document::DocumentTree; +use crate::storage::PersistedDocument; +use crate::utils::fingerprint::Fingerprint; +use crate::index::config::PipelineOptions; +use crate::parser::DocumentFormat; + +/// Action to take for a source during indexing. +pub enum IndexAction { + /// Skip entirely — content unchanged. + Skip(SkipInfo), + /// Full index from scratch — new file, logic changed, or force mode. + /// If replacing an existing document, `existing_id` contains the old doc ID + /// to clean up after the new document is successfully saved. + FullIndex { + /// Old document ID to remove after successful re-index (if replacing). + existing_id: Option<String>, + }, + /// Incremental update — content changed, pipeline unchanged. + IncrementalUpdate { + /// The old tree to reuse data from. + old_tree: DocumentTree, + /// The existing document ID (preserved across updates). + existing_id: String, + }, } + +/// Info returned when a source is skipped. +pub struct SkipInfo { + /// Existing document ID. + pub doc_id: String, + /// Document name. + pub name: String, + /// Document format. + pub format: DocumentFormat, + /// Document description. + pub description: Option<String>, + /// Page count. + pub page_count: Option<usize>, +} + +/// Resolve what action to take for a source file. +/// +/// This is the core three-layer incremental decision: +/// +/// 1. **File fingerprint**: Compare file bytes hash with stored `content_fingerprint`. +/// If equal → `Skip` (nothing changed). +/// +/// 2. **Logic fingerprint**: Compare pipeline config hash with stored `logic_fingerprint`. +/// If different → `FullIndex` (processing logic changed, must reprocess everything). +/// +/// 3. **Incremental**: Content changed but pipeline unchanged → `IncrementalUpdate` +/// with the old tree for partial reprocessing. +pub fn resolve_action( + file_bytes: &[u8], + stored_doc: &PersistedDocument, + pipeline_options: &PipelineOptions, + format: DocumentFormat, +) -> IndexAction { + let current_fp = Fingerprint::from_bytes(file_bytes); + + // Layer 1: File-level content fingerprint + if !stored_doc.meta.needs_reprocessing(&current_fp, pipeline_options.processing_version) { + info!("File fingerprint unchanged, skipping"); + return IndexAction::Skip(SkipInfo { + doc_id: stored_doc.meta.id.clone(), + name: stored_doc.meta.name.clone(), + format, + description: stored_doc.meta.description.clone(), + page_count: stored_doc.meta.page_count, + }); + } + + // Layer 2: Logic fingerprint (pipeline config changed?)
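+    // For example, switching the summary strategy or toggling optimization
+    // changes `PipelineOptions::logic_fingerprint()`, so a stored document
+    // takes the FullIndex path on its next index call even though its bytes
+    // are unchanged.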
+ let current_logic_fp = pipeline_options.logic_fingerprint(); + if stored_doc.meta.logic_fingerprint != current_logic_fp + && !stored_doc.meta.logic_fingerprint.is_zero() + { + info!("Logic fingerprint changed, full reprocess required"); + return IndexAction::FullIndex { + existing_id: Some(stored_doc.meta.id.clone()), + }; + } + + // Layer 3: Content changed, pipeline unchanged → incremental update + info!("Content changed, pipeline unchanged → incremental update"); + IndexAction::IncrementalUpdate { + old_tree: stored_doc.tree.clone(), + existing_id: stored_doc.meta.id.clone(), + } +} diff --git a/rust/src/index/mod.rs b/rust/src/index/mod.rs index 395325fe..6a4c5dd1 100644 --- a/rust/src/index/mod.rs +++ b/rust/src/index/mod.rs @@ -36,7 +36,6 @@ //! ``` pub mod config; -pub mod graph_builder; pub mod incremental; pub mod pipeline; pub mod stages; @@ -44,7 +43,7 @@ pub mod summary; // Re-export main types from pipeline pub use pipeline::{ - ExecutionGroup, FailurePolicy, IndexContext, IndexInput, IndexMetrics, IndexResult, + ExecutionGroup, FailurePolicy, IndexContext, IndexInput, IndexMetrics, PipelineResult, PipelineExecutor, PipelineOrchestrator, StageResult, StageRetryConfig, }; diff --git a/rust/src/index/pipeline/context.rs b/rust/src/index/pipeline/context.rs index 9fffdcf0..264b966e 100644 --- a/rust/src/index/pipeline/context.rs +++ b/rust/src/index/pipeline/context.rs @@ -245,6 +245,10 @@ pub struct IndexContext { /// Pre-computed reasoning index (built by ReasoningIndexStage). pub reasoning_index: Option<ReasoningIndex>, + /// Existing tree from previous indexing (for incremental updates). + /// When set, the enhance and reasoning stages can reuse data from unchanged nodes. + pub existing_tree: Option<DocumentTree>, + /// Stage execution results. pub stage_results: HashMap<String, StageResult>, @@ -276,6 +280,7 @@ impl IndexContext { llm_client: None, summary_cache: SummaryCache::default(), reasoning_index: None, + existing_tree: None, stage_results: HashMap::new(), metrics: IndexMetrics::default(), description: None, @@ -314,6 +319,12 @@ impl IndexContext { self } + /// Set the existing tree for incremental updates. + pub fn with_existing_tree(mut self, tree: DocumentTree) -> Self { + self.existing_tree = Some(tree); + self + } + /// Initialize summary cache based on strategy. pub fn init_summary_cache(&mut self) { if let SummaryStrategy::Lazy { persist, .. } = self.options.summary_strategy { @@ -337,8 +348,8 @@ } /// Finalize and build the result. - pub fn finalize(self) -> IndexResult { - IndexResult { + pub fn finalize(self) -> PipelineResult { + PipelineResult { doc_id: self.doc_id, name: self.name, format: self.format, @@ -356,7 +367,7 @@ /// Final result from the index pipeline. #[derive(Debug)] -pub struct IndexResult { +pub struct PipelineResult { /// Document ID. pub doc_id: String, @@ -391,7 +402,7 @@ pub reasoning_index: Option<ReasoningIndex>, } -impl IndexResult { +impl PipelineResult { /// Check if the result has a tree.
pub fn has_tree(&self) -> bool { self.tree.is_some() @@ -410,6 +421,5 @@ + self.metrics.enrich_time_ms + self.metrics.reasoning_index_time_ms + self.metrics.optimize_time_ms - + self.metrics.persist_time_ms } } diff --git a/rust/src/index/pipeline/executor.rs b/rust/src/index/pipeline/executor.rs index 09f548e1..91ddcd99 100644 --- a/rust/src/index/pipeline/executor.rs +++ b/rust/src/index/pipeline/executor.rs @@ -13,10 +13,10 @@ use crate::llm::LlmClient; use super::super::PipelineOptions; use super::super::stages::{ - BuildStage, EnhanceStage, EnrichStage, IndexStage, OptimizeStage, ParseStage, PersistStage, + BuildStage, EnhanceStage, EnrichStage, IndexStage, OptimizeStage, ParseStage, ReasoningIndexStage, }; -use super::context::{IndexInput, IndexResult}; +use super::context::{IndexInput, PipelineResult}; use super::orchestrator::PipelineOrchestrator; /// Pipeline executor for document indexing. @@ -140,14 +140,6 @@ impl PipelineExecutor { self } - /// Add persistence stage with async workspace. - pub fn with_persistence(mut self, workspace: crate::storage::Workspace) -> Self { - self.orchestrator = self - .orchestrator - .stage_with_priority(PersistStage::with_workspace(workspace), 80); - self - } - /// Get the list of stage names in execution order. pub fn stage_names(&self) -> Result<Vec<&'static str>> { self.orchestrator.stage_names() @@ -165,7 +157,7 @@ &mut self, input: IndexInput, options: PipelineOptions, - ) -> Result<IndexResult> { + ) -> Result<PipelineResult> { info!( "Starting index pipeline with {} stages", self.orchestrator.stage_count() ) diff --git a/rust/src/index/pipeline/metrics.rs b/rust/src/index/pipeline/metrics.rs index e731e7a7..f25fe29f 100644 --- a/rust/src/index/pipeline/metrics.rs +++ b/rust/src/index/pipeline/metrics.rs @@ -1,155 +1,6 @@ // Copyright (c) 2026 vectorless developers // SPDX-License-Identifier: Apache-2.0 -//! Performance metrics for indexing. +//! Re-export IndexMetrics from the metrics module. -use serde::{Deserialize, Serialize}; - -/// Performance metrics for the indexing pipeline. -#[derive(Debug, Clone, Default, Serialize, Deserialize)] -pub struct IndexMetrics { - /// Parse stage duration (ms). - #[serde(default)] - pub parse_time_ms: u64, - - /// Build stage duration (ms). - #[serde(default)] - pub build_time_ms: u64, - - /// Enhance stage duration (ms). - #[serde(default)] - pub enhance_time_ms: u64, - - /// Enrich stage duration (ms). - #[serde(default)] - pub enrich_time_ms: u64, - - /// Optimize stage duration (ms). - #[serde(default)] - pub optimize_time_ms: u64, - - /// Persist stage duration (ms). - #[serde(default)] - pub persist_time_ms: u64, - - /// Reasoning index build duration (ms). - #[serde(default)] - pub reasoning_index_time_ms: u64, - - /// Number of topics indexed in reasoning index. - #[serde(default)] - pub topics_indexed: usize, - - /// Number of keywords indexed in reasoning index. - #[serde(default)] - pub keywords_indexed: usize, - - /// Total tokens generated (summaries). - #[serde(default)] - pub total_tokens_generated: usize, - - /// Number of LLM calls. - #[serde(default)] - pub llm_calls: usize, - - /// Number of nodes processed. - #[serde(default)] - pub nodes_processed: usize, - - /// Number of summaries generated. - #[serde(default)] - pub summaries_generated: usize, - - /// Number of nodes skipped (thinning). - #[serde(default)] - pub nodes_skipped: usize, - - /// Number of nodes merged. - #[serde(default)] - pub nodes_merged: usize, -} - -impl IndexMetrics { - /// Create new metrics with start time.
- pub fn new() -> Self { - Self::default() - } - - /// Record parse stage time. - pub fn record_parse(&mut self, duration_ms: u64) { - self.parse_time_ms = duration_ms; - } - - /// Record build stage time. - pub fn record_build(&mut self, duration_ms: u64) { - self.build_time_ms = duration_ms; - } - - /// Record enhance stage time. - pub fn record_enhance(&mut self, duration_ms: u64) { - self.enhance_time_ms = duration_ms; - } - - /// Record enrich stage time. - pub fn record_enrich(&mut self, duration_ms: u64) { - self.enhance_time_ms = duration_ms; - } - - /// Record optimize stage time. - pub fn record_optimize(&mut self, duration_ms: u64) { - self.optimize_time_ms = duration_ms; - } - - /// Record persist stage time. - pub fn record_persist(&mut self, duration_ms: u64) { - self.persist_time_ms = duration_ms; - } - - /// Record reasoning index build time. - pub fn record_reasoning_index(&mut self, duration_ms: u64, topics: usize, keywords: usize) { - self.reasoning_index_time_ms = duration_ms; - self.topics_indexed = topics; - self.keywords_indexed = keywords; - } - - /// Increment LLM calls. - pub fn increment_llm_calls(&mut self) { - self.llm_calls += 1; - } - - /// Add to tokens generated. - pub fn add_tokens_generated(&mut self, tokens: usize) { - self.total_tokens_generated += tokens; - } - - /// Set nodes processed. - pub fn set_nodes_processed(&mut self, count: usize) { - self.nodes_processed = count; - } - - /// Increment summaries generated. - pub fn increment_summaries(&mut self) { - self.summaries_generated += 1; - } - - /// Increment nodes skipped. - pub fn increment_nodes_skipped(&mut self) { - self.nodes_skipped += 1; - } - - /// Increment nodes merged. - pub fn increment_nodes_merged(&mut self) { - self.nodes_merged += 1; - } - - /// Get total time. - pub fn total_time_ms(&self) -> u64 { - self.parse_time_ms - + self.build_time_ms - + self.enhance_time_ms - + self.enrich_time_ms - + self.reasoning_index_time_ms - + self.optimize_time_ms - + self.persist_time_ms - } -} +pub use crate::metrics::IndexMetrics; diff --git a/rust/src/index/pipeline/mod.rs b/rust/src/index/pipeline/mod.rs index fdf22827..2d221a8a 100644 --- a/rust/src/index/pipeline/mod.rs +++ b/rust/src/index/pipeline/mod.rs @@ -17,7 +17,7 @@ mod metrics; mod orchestrator; mod policy; -pub use context::{IndexContext, IndexInput, IndexResult, StageResult}; +pub use context::{IndexContext, IndexInput, PipelineResult, StageResult}; pub use executor::PipelineExecutor; pub use metrics::IndexMetrics; pub use orchestrator::{CustomStageBuilder, ExecutionGroup, PipelineOrchestrator}; diff --git a/rust/src/index/pipeline/orchestrator.rs b/rust/src/index/pipeline/orchestrator.rs index 299317d8..2f0b1def 100644 --- a/rust/src/index/pipeline/orchestrator.rs +++ b/rust/src/index/pipeline/orchestrator.rs @@ -30,8 +30,8 @@ use tracing::{error, info, warn}; use crate::error::Result; use super::super::PipelineOptions; -use super::super::stages::IndexStage; -use super::context::{IndexContext, IndexInput, IndexResult, StageResult}; +use super::super::stages::{AccessPattern, IndexStage}; +use super::context::{IndexContext, IndexInput, PipelineResult, StageResult}; use super::policy::FailurePolicy; /// Stage entry with metadata for orchestration. 
@@ -260,10 +260,9 @@ impl PipelineOrchestrator { // Check for cycles if result.len() != n { - let remaining: Vec<&str> = result - .iter() - .filter(|&&i| !result.contains(&i)) - .map(|&i| self.stages[i].stage.name()) + let remaining: Vec<&str> = (0..n) + .filter(|i| !result.contains(i)) + .map(|i| self.stages[i].stage.name()) .collect(); return Err(crate::error::Error::Config(format!( "Circular dependency detected involving stages: {:?}", @@ -390,6 +389,37 @@ } } + /// Handle the result of a stage execution (shared between sequential and parallel paths). + fn handle_stage_result( + result: Result<StageResult>, + stage_name: &str, + policy: &FailurePolicy, + ctx: &mut IndexContext, + ) -> Result<()> { + match result { + Ok(result) => { + ctx.stage_results.insert(stage_name.to_string(), result); + Ok(()) + } + Err(e) => { + if policy.allows_continuation() { + warn!( + "Stage {} failed but policy allows continuation: {}", + stage_name, e + ); + ctx.stage_results.insert( + stage_name.to_string(), + StageResult::failure(stage_name, &e.to_string()), + ); + Ok(()) + } else { + error!("Stage {} failed, stopping pipeline: {}", stage_name, e); + Err(e) + } + } + } + } + /// Execute the pipeline. /// /// Stages are executed in dependency-resolved order. @@ -398,7 +428,7 @@ &mut self, input: IndexInput, options: PipelineOptions, - ) -> Result<IndexResult> { + ) -> Result<PipelineResult> { let total_start = Instant::now(); info!( "Starting orchestrated pipeline with {} stages", @@ -419,7 +449,12 @@ ); // Create context - let mut ctx = IndexContext::new(input, options); + let mut opts = options; + let existing_tree = opts.existing_tree.take(); + let mut ctx = IndexContext::new(input, opts); + if let Some(tree) = existing_tree { + ctx = ctx.with_existing_tree(tree); + } // Execute each group for (group_idx, group) in groups.iter().enumerate() { @@ -436,37 +471,129 @@ ); } - // Execute stages in this group - // Note: For true parallel execution, stages would need to declare - // that they don't modify shared context. Currently executed sequentially - // for safety, but grouped for future optimization. - for &idx in &group.stage_indices { - let entry = &mut self.stages[idx]; - let stage_name = entry.stage.name().to_string(); - let policy = entry.stage.failure_policy(); + if group.parallel && group.stage_indices.len() == 2 { + // === Parallel execution for 2-stage groups === + // One stage gets the main ctx (mutates tree), the other + // gets a cloned snapshot (read-only). Results are merged back. + let idx_a = group.stage_indices[0]; + let idx_b = group.stage_indices[1]; + + // Determine which stage reads tree (gets snapshot) vs writes tree (gets ctx) + // using AccessPattern instead of hardcoded name checks. + let (writer_idx, reader_idx) = { + let ap_a = self.stages[idx_a].stage.access_pattern(); + let ap_b = self.stages[idx_b].stage.access_pattern(); + // The stage that writes tree gets the main ctx; + // the other (read-only on tree) gets a clone.
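+                    // e.g. the enrich/optimize stages declare `writes_tree`,
+                    // while the reasoning stage only sets `writes_reasoning_index`,
+                    // so the reasoning stage can safely run on a cloned snapshot.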
+ if ap_b.writes_tree && !ap_a.writes_tree { + (idx_b, idx_a) // b writes tree, a is reader + } else { + (idx_a, idx_b) // a writes tree (or both/neither write), b is reader + } + }; - info!( - "Executing stage: {} (priority {})", - stage_name, entry.priority + // Clone tree snapshot for the reader stage + let tree_snapshot = ctx.tree.clone(); + let options_snapshot = ctx.options.clone(); + let existing_tree_snapshot = ctx.existing_tree.clone(); + + // Take both stages out to avoid double &mut self + let mut stage_writer = std::mem::replace( + &mut self.stages[writer_idx].stage, + Box::new(NopStage), + ); + let mut stage_reader = std::mem::replace( + &mut self.stages[reader_idx].stage, + Box::new(NopStage), ); - match Self::execute_stage_with_policy(&mut entry.stage, &mut ctx).await { - Ok(result) => { - ctx.stage_results.insert(stage_name.clone(), result); - } - Err(e) => { - if policy.allows_continuation() { - warn!( - "Stage {} failed but policy allows continuation: {}", - stage_name, e - ); - ctx.stage_results.insert( - stage_name.clone(), - StageResult::failure(&stage_name, &e.to_string()), - ); - } else { - error!("Stage {} failed, stopping pipeline: {}", stage_name, e); - return Err(e); + let writer_name = stage_writer.name().to_string(); + let reader_name = stage_reader.name().to_string(); + let writer_policy = stage_writer.failure_policy(); + let reader_policy = stage_reader.failure_policy(); + + info!("Parallel: executing {} ∥ {}", writer_name, reader_name); + + // Build a minimal context clone for the reader stage + let mut reader_ctx = IndexContext::new(IndexInput::content(""), options_snapshot); + reader_ctx.tree = tree_snapshot; + reader_ctx.existing_tree = existing_tree_snapshot; + reader_ctx.doc_id = ctx.doc_id.clone(); + reader_ctx.name = ctx.name.clone(); + reader_ctx.format = ctx.format; + reader_ctx.source_path = ctx.source_path.clone(); + + // Execute both stages concurrently + let (writer_result, reader_result) = tokio::join!( + Self::execute_stage_with_policy(&mut stage_writer, &mut ctx), + Self::execute_stage_with_policy(&mut stage_reader, &mut reader_ctx), + ); + + // Put stages back + self.stages[writer_idx].stage = stage_writer; + self.stages[reader_idx].stage = stage_reader; + + // Handle writer result + Self::handle_stage_result(writer_result, &writer_name, &writer_policy, &mut ctx)?; + + // Handle reader result + Self::handle_stage_result(reader_result, &reader_name, &reader_policy, &mut ctx)?; + + // Merge reader's outputs back based on its AccessPattern + let reader_ap = self.stages[reader_idx].stage.access_pattern(); + if reader_ap.writes_reasoning_index { + ctx.reasoning_index = reader_ctx.reasoning_index; + } + if reader_ap.writes_description { + ctx.description = reader_ctx.description; + } + // Merge additive metrics + ctx.metrics.llm_calls += reader_ctx.metrics.llm_calls; + ctx.metrics.summaries_generated += reader_ctx.metrics.summaries_generated; + ctx.metrics.total_tokens_generated += reader_ctx.metrics.total_tokens_generated; + ctx.metrics.nodes_processed += reader_ctx.metrics.nodes_processed; + if reader_ctx.metrics.reasoning_index_time_ms > 0 { + ctx.metrics.record_reasoning_index( + reader_ctx.metrics.reasoning_index_time_ms, + reader_ctx.metrics.topics_indexed, + reader_ctx.metrics.keywords_indexed, + ); + } + if reader_ctx.metrics.optimize_time_ms > 0 { + ctx.metrics.record_optimize(reader_ctx.metrics.optimize_time_ms); + } + ctx.metrics.nodes_merged += reader_ctx.metrics.nodes_merged; + ctx.metrics.nodes_skipped += 
reader_ctx.metrics.nodes_skipped; + } else { + // === Sequential execution (single stage or non-parallel group) === + for &idx in &group.stage_indices { + let entry = &mut self.stages[idx]; + let stage_name = entry.stage.name().to_string(); + let policy = entry.stage.failure_policy(); + + info!( + "Executing stage: {} (priority {})", + stage_name, entry.priority + ); + + match Self::execute_stage_with_policy(&mut entry.stage, &mut ctx).await { + Ok(result) => { + ctx.stage_results.insert(stage_name.clone(), result); + } + Err(e) => { + if policy.allows_continuation() { + warn!( + "Stage {} failed but policy allows continuation: {}", + stage_name, e + ); + ctx.stage_results.insert( + stage_name.clone(), + StageResult::failure(&stage_name, &e.to_string()), + ); + } else { + error!("Stage {} failed, stopping pipeline: {}", stage_name, e); + return Err(e); + } } } } @@ -498,6 +625,21 @@ } } +/// Placeholder stage used during parallel execution when the real stage +/// is temporarily swapped out via `std::mem::replace`. +struct NopStage; + +#[async_trait::async_trait] +impl IndexStage for NopStage { + fn name(&self) -> &'static str { + "_nop" + } + + async fn execute(&mut self, _ctx: &mut IndexContext) -> Result<StageResult> { + Ok(StageResult::success("_nop")) + } +} + /// Builder for creating custom stage configurations. /// /// This is a convenience type for configuring custom stages diff --git a/rust/src/index/stages/enhance.rs b/rust/src/index/stages/enhance.rs index e848e832..92e5136d 100644 --- a/rust/src/index/stages/enhance.rs +++ b/rust/src/index/stages/enhance.rs @@ -4,12 +4,14 @@ //! Enhance stage - Generate summaries using LLM. use super::async_trait; +use futures::StreamExt; use std::sync::Arc; use std::time::{Duration, Instant}; use tracing::{debug, info, warn}; use crate::document::{DocumentTree, NodeId, TreeNode}; use crate::error::Result; +use crate::index::incremental; use crate::utils::fingerprint::Fingerprint; use crate::llm::LlmClient; use crate::memo::{MemoKey, MemoStore, MemoValue}; @@ -18,6 +20,13 @@ use super::{IndexStage, StageResult}; use crate::index::pipeline::{FailurePolicy, IndexContext, StageRetryConfig}; use crate::index::summary::{LlmSummaryGenerator, SummaryGenerator, SummaryStrategy}; +/// A node that needs LLM summary generation. +struct PendingNode { + node_id: NodeId, + title: String, + content: String, +} + /// Enhance stage - generates summaries using LLM. pub struct EnhanceStage { /// LLM client for summary generation.
@@ -101,7 +110,6 @@ impl IndexStage for EnhanceStage { // Check if we need summaries if !self.needs_summaries(ctx) { - println!("[DEBUG] Summary generation skipped (strategy: {:?})", ctx.options.summary_strategy); info!( "Summary generation skipped (strategy: {:?})", ctx.options.summary_strategy @@ -113,7 +121,6 @@ let llm_client = match &self.llm_client { Some(client) => client, None => { - println!("[DEBUG] No LLM client configured, skipping summary generation"); warn!("No LLM client configured, skipping summary generation"); return Ok(StageResult::success("enhance")); } @@ -123,45 +130,56 @@ let tree = match ctx.tree.as_mut() { Some(t) => t, None => { - println!("[DEBUG] No tree built, skipping enhance stage"); warn!("No tree built, skipping enhance stage"); return Ok(StageResult::success("enhance")); } }; - println!("[DEBUG] Using summary strategy: {:?}", ctx.options.summary_strategy); info!("Using summary strategy: {:?}", ctx.options.summary_strategy); - // Create summary generator with optional memo store - let mut generator = LlmSummaryGenerator::new((*llm_client).as_ref().clone()) - .with_max_tokens(ctx.options.indexer.max_summary_tokens); - - // Attach memo store to generator if available - if let Some(store) = &self.memo_store { - generator = generator.with_memo_store((**store).clone()); - } + // Create summary generator (shared via Arc for concurrent use) + let generator = Arc::new( + LlmSummaryGenerator::new((*llm_client).as_ref().clone()) + .with_max_tokens(ctx.options.indexer.max_summary_tokens) + .with_memo_store( + self.memo_store + .as_ref() + .map(|s| (**s).clone()) + .unwrap_or_default(), + ), + ); // Get all nodes to process let node_ids: Vec<NodeId> = tree.traverse(); let total_nodes = node_ids.len(); - println!("[DEBUG] Processing {} nodes for summary generation", total_nodes); + // === Incremental: reuse summaries from existing tree for unchanged nodes === + if let Some(ref old_tree) = ctx.existing_tree { + let reusable = incremental::compute_reusable_summaries(old_tree, tree); + let applied = incremental::apply_reusable_summaries(tree, &reusable); + for _ in 0..applied { + ctx.metrics.increment_summaries(); + } + info!( + "Incremental: {} of {} nodes unchanged, reusing summaries", + applied, total_nodes, + ); + } + info!("Processing {} nodes for summary generation", total_nodes); - // Process nodes + // === Phase 1: Collect pending nodes (cache hits applied immediately) === + let strategy = ctx.options.summary_strategy.clone(); + let mut pending_llm: Vec<PendingNode> = Vec::new(); let mut generated = 0; - let mut failed = 0; let mut skipped_no_content = 0; let mut skipped_tokens = 0; - let strategy = ctx.options.summary_strategy.clone(); for node_id in node_ids { - // Get node data (need to clone to avoid borrow issues) let node = match tree.get(node_id) { Some(n) => n.clone(), None => continue, }; - println!("[DEBUG] Evaluating node for summary: {} {}", node.title, node.content); // Skip if no content if node.content.is_empty() { @@ -169,78 +187,93 @@ skipped_no_content += 1; continue; } - // Get token count and check if we should generate + // Skip if summary already set (incremental: reused from old tree) + if !node.summary.is_empty() { + continue; + } + + // Check if strategy says we should generate let token_count = node.token_count.unwrap_or(0); if !strategy.should_generate(tree, node_id, token_count) { skipped_tokens += 1; continue; } - // Check memo store first (additional check beyond generator) -
let cached_summary = if let Some(store) = self.memo_store.as_deref() { + // Check memo store (fast path — apply immediately) + if let Some(store) = self.memo_store.as_deref() { let content_fp = Fingerprint::from_str(&format!("{}|{}", node.title, node.content)); let memo_key = MemoKey::summary(&content_fp); - - store - .get(&memo_key) - .and_then(|cached| cached.as_summary().map(|s| s.to_string())) - } else { - None - }; - - if let Some(summary) = cached_summary { - if !summary.is_empty() { - tree.set_summary(node_id, &summary); - debug!( - "Using cached summary for node: {} ({} chars)", - node.title, - summary.len() - ); - ctx.metrics.increment_summaries(); - generated += 1; - continue; + if let Some(cached) = store.get(&memo_key).and_then(|c| c.as_summary().map(|s| s.to_string())) { + if !cached.is_empty() { + tree.set_summary(node_id, &cached); + debug!("Using cached summary for node: {} ({} chars)", node.title, cached.len()); + ctx.metrics.increment_summaries(); + generated += 1; + continue; + } } } - // Generate summary (generator also has memoization built-in) - println!("[DEBUG] Calling LLM to generate summary for node: {} ({} tokens)", node.title, token_count); - println!("[DEBUG] Node content: {}", node.content); + // Needs LLM call + pending_llm.push(PendingNode { + node_id, + title: node.title, + content: node.content, + }); + } + + // === Phase 2: Concurrent LLM calls with buffer_unordered === + let mut failed = 0; + let concurrency = ctx.options.concurrency.max_concurrent_requests; + + if !pending_llm.is_empty() { + info!( + "Generating summaries for {} nodes (concurrency: {})", + pending_llm.len(), concurrency + ); - match generator.generate(&node.title, &node.content).await { - Ok(summary) => { - if summary.is_empty() { - warn!("Empty summary returned for node '{}'", node.title); + // Collect results: (NodeId, Result<String, String>) + let results: Vec<(NodeId, std::result::Result<String, String>)> = + futures::stream::iter(pending_llm) + .map(|pending| { + let generator = Arc::clone(&generator); + async move { + let result = generator.generate(&pending.title, &pending.content).await; + (pending.node_id, result.map_err(|e| e.to_string())) + } + }) + .buffer_unordered(concurrency) + .collect() + .await; + + // Write results back to tree + for (node_id, result) in results { + ctx.metrics.increment_llm_calls(); + match result { + Ok(summary) => { + if summary.is_empty() { + failed += 1; + } else { + tree.set_summary(node_id, &summary); + generated += 1; + ctx.metrics.increment_summaries(); + } + } + Err(e) => { + warn!("Failed to generate summary: {}", e); failed += 1; - } else { - tree.set_summary(node_id, &summary); - debug!( - "Generated summary for node: {} ({} chars)", - node.title, - summary.len() - ); - ctx.metrics.increment_summaries(); - generated += 1; } } - Err(e) => { - warn!("Failed to generate summary for {}: {}", node.title, e); - failed += 1; - } } - - // Increment LLM calls metric - ctx.metrics.increment_llm_calls(); } let duration = start.elapsed().as_millis() as u64; ctx.metrics.record_enhance(duration); - println!("[DEBUG] Generated {} summaries ({} failed, {} skipped no content, {} skipped tokens) in {}ms", - generated, failed, skipped_no_content, skipped_tokens, duration); info!( - "Generated {} summaries ({} failed) in {}ms", - generated, failed, duration + "Generated {} summaries ({} failed, {} skipped no content, {} skipped tokens) in {}ms", + generated, failed, skipped_no_content, skipped_tokens, duration ); let mut stage_result = StageResult::success("enhance"); diff --git 
a/rust/src/index/stages/enrich.rs b/rust/src/index/stages/enrich.rs index 7b0c670d..0ff3cb55 100644 --- a/rust/src/index/stages/enrich.rs +++ b/rust/src/index/stages/enrich.rs @@ -10,7 +10,7 @@ use tracing::info; use crate::document::{DocumentTree, NodeId, TocView}; use crate::error::Result; -use super::{IndexStage, StageResult}; +use super::{AccessPattern, IndexStage, StageResult}; use crate::index::pipeline::IndexContext; /// Enrich stage - adds metadata to the tree. @@ -111,6 +111,15 @@ impl IndexStage for EnrichStage { vec!["build"] } + fn access_pattern(&self) -> AccessPattern { + AccessPattern { + reads_tree: true, + writes_tree: true, // sets page_boundaries + writes_description: true, + ..Default::default() + } + } + async fn execute(&mut self, ctx: &mut IndexContext) -> Result<StageResult> { let start = Instant::now(); diff --git a/rust/src/index/stages/mod.rs b/rust/src/index/stages/mod.rs index 2022ffae..5b1847c4 100644 --- a/rust/src/index/stages/mod.rs +++ b/rust/src/index/stages/mod.rs @@ -8,7 +8,6 @@ mod enhance; mod enrich; mod optimize; mod parse; -mod persist; mod reasoning; pub use build::BuildStage; @@ -16,13 +15,26 @@ pub use enhance::EnhanceStage; pub use enrich::EnrichStage; pub use optimize::OptimizeStage; pub use parse::ParseStage; -pub use persist::PersistStage; pub use reasoning::ReasoningIndexStage; use super::pipeline::{FailurePolicy, IndexContext, StageResult}; use crate::error::Result; pub use async_trait::async_trait; +/// Declares which context fields a stage reads/writes. +/// Used by the orchestrator to determine safe parallel execution. +#[derive(Debug, Clone, Default)] +pub struct AccessPattern { + /// Whether this stage reads the tree. + pub reads_tree: bool, + /// Whether this stage mutates the tree (summaries, structure, etc.). + pub writes_tree: bool, + /// Whether this stage writes to `reasoning_index`. + pub writes_reasoning_index: bool, + /// Whether this stage writes to `description`. + pub writes_description: bool, +} + /// Index pipeline stage. /// /// Each stage represents a discrete step in the document indexing process. @@ -106,4 +118,10 @@ pub trait IndexStage: Send + Sync { FailurePolicy::fail() } } + + /// Declare which context fields this stage accesses. + /// Used by the orchestrator for safe parallel execution. + fn access_pattern(&self) -> AccessPattern { + AccessPattern::default() + } } diff --git a/rust/src/index/stages/optimize.rs b/rust/src/index/stages/optimize.rs index 8ae8b44f..b9c90948 100644 --- a/rust/src/index/stages/optimize.rs +++ b/rust/src/index/stages/optimize.rs @@ -3,7 +3,7 @@ //! Optimize stage - Optimize tree structure. -use super::async_trait; +use super::{AccessPattern, async_trait}; use std::time::Instant; use tracing::info; @@ -143,6 +143,14 @@ impl IndexStage for OptimizeStage { vec!["enrich"] } + fn access_pattern(&self) -> AccessPattern { + AccessPattern { + reads_tree: true, + writes_tree: true, // merges small leaf nodes + ..Default::default() + } + } + async fn execute(&mut self, ctx: &mut IndexContext) -> Result<StageResult> { let start = Instant::now(); diff --git a/rust/src/index/stages/persist.rs b/rust/src/index/stages/persist.rs deleted file mode 100644 index 509bc874..00000000 --- a/rust/src/index/stages/persist.rs +++ /dev/null @@ -1,109 +0,0 @@ -// Copyright (c) 2026 vectorless developers -// SPDX-License-Identifier: Apache-2.0 - -//! Persist stage - Save indexed document to storage.
- -use super::async_trait; -use std::time::Instant; -use tracing::info; - -use crate::error::Result; -use crate::storage::{DocumentMeta as StorageMeta, PersistedDocument, Workspace}; - -use super::{IndexStage, StageResult}; -use crate::index::pipeline::IndexContext; - -/// Persist stage - saves indexed document to storage. -pub struct PersistStage { - /// Optional workspace for persistence. - workspace: Option<Workspace>, -} - -impl PersistStage { - /// Create a new persist stage without workspace (in-memory only). - pub fn new() -> Self { - Self { workspace: None } - } - - /// Create with workspace. - pub fn with_workspace(workspace: Workspace) -> Self { - Self { - workspace: Some(workspace), - } - } - - /// Save document to workspace. - async fn save_to_workspace(&self, ctx: &IndexContext) -> Result<()> { - let workspace = self - .workspace - .as_ref() - .ok_or_else(|| crate::Error::Config("No workspace configured".to_string()))?; - - let tree = ctx - .tree - .as_ref() - .ok_or_else(|| crate::Error::IndexBuild("Tree not built".to_string()))?; - - // Create metadata - let meta = StorageMeta::new(&ctx.doc_id, &ctx.name, ctx.format.extension()) - .with_source_path(ctx.source_path.clone().unwrap_or_default()) - .with_description(ctx.description.clone().unwrap_or_default()); - - let doc = PersistedDocument::new(meta, tree.clone()); - - // Note: pages would need to be stored in context during parse stage - - // Attach reasoning index if available - let mut doc = doc; - if let Some(ref reasoning_index) = ctx.reasoning_index { - doc.reasoning_index = Some(reasoning_index.clone()); - } - - workspace.add(&doc).await?; - info!("Saved document {} to workspace", ctx.doc_id); - - Ok(()) - } -} - -impl Default for PersistStage { - fn default() -> Self { - Self::new() - } -} - -#[async_trait] -impl IndexStage for PersistStage { - fn name(&self) -> &'static str { - "persist" - } - - fn is_optional(&self) -> bool { - true - } - - async fn execute(&mut self, ctx: &mut IndexContext) -> Result<StageResult> { - let start = Instant::now(); - - // Only persist if workspace is configured - if self.workspace.is_some() { - self.save_to_workspace(ctx).await?; - } else { - info!("No workspace configured, skipping persistence"); - } - - let duration = start.elapsed().as_millis() as u64; - ctx.metrics.record_persist(duration); - - info!("Persist stage completed in {}ms", duration); - - let mut stage_result = StageResult::success("persist"); - stage_result.duration_ms = duration; - stage_result.metadata.insert( - "persisted".to_string(), - serde_json::json!(self.workspace.is_some()), - ); - - Ok(stage_result) - } -} diff --git a/rust/src/index/stages/reasoning.rs b/rust/src/index/stages/reasoning.rs index 804dcb19..9fd98b42 100644 --- a/rust/src/index/stages/reasoning.rs +++ b/rust/src/index/stages/reasoning.rs @@ -18,7 +18,7 @@ use crate::error::Result; use crate::retrieval::search::extract_keywords; use super::async_trait; -use super::{IndexStage, StageResult}; +use super::{AccessPattern, IndexStage, StageResult}; use crate::index::pipeline::IndexContext; /// Reasoning Index Stage - builds a pre-computed reasoning index from the document tree.
@@ -221,6 +221,14 @@ impl IndexStage for ReasoningIndexStage { true } + fn access_pattern(&self) -> AccessPattern { + AccessPattern { + reads_tree: true, + writes_reasoning_index: true, + ..Default::default() + } + } + async fn execute(&mut self, ctx: &mut IndexContext) -> Result<StageResult> { let start = Instant::now(); diff --git a/rust/src/lib.rs b/rust/src/lib.rs index 34579e3e..950d64b5 100644 --- a/rust/src/lib.rs +++ b/rust/src/lib.rs @@ -56,6 +56,7 @@ pub mod client; mod config; pub mod document; pub mod error; +mod graph; mod index; mod llm; mod memo; @@ -81,3 +82,9 @@ pub use document::{ TreeNode, }; +// Graph types +pub use graph::DocumentGraph; + +// Index metrics +pub use metrics::IndexMetrics; + diff --git a/rust/src/metrics/index.rs b/rust/src/metrics/index.rs new file mode 100644 index 00000000..e23769d2 --- /dev/null +++ b/rust/src/metrics/index.rs @@ -0,0 +1,145 @@ +// Copyright (c) 2026 vectorless developers +// SPDX-License-Identifier: Apache-2.0 + +//! Indexing pipeline metrics. + +use serde::{Deserialize, Serialize}; + +/// Performance metrics for the indexing pipeline. +#[derive(Debug, Clone, Default, Serialize, Deserialize)] +pub struct IndexMetrics { + /// Parse stage duration (ms). + #[serde(default)] + pub parse_time_ms: u64, + + /// Build stage duration (ms). + #[serde(default)] + pub build_time_ms: u64, + + /// Enhance stage duration (ms). + #[serde(default)] + pub enhance_time_ms: u64, + + /// Enrich stage duration (ms). + #[serde(default)] + pub enrich_time_ms: u64, + + /// Optimize stage duration (ms). + #[serde(default)] + pub optimize_time_ms: u64, + + /// Reasoning index build duration (ms). + #[serde(default)] + pub reasoning_index_time_ms: u64, + + /// Number of topics indexed in reasoning index. + #[serde(default)] + pub topics_indexed: usize, + + /// Number of keywords indexed in reasoning index. + #[serde(default)] + pub keywords_indexed: usize, + + /// Total tokens generated (summaries). + #[serde(default)] + pub total_tokens_generated: usize, + + /// Number of LLM calls. + #[serde(default)] + pub llm_calls: usize, + + /// Number of nodes processed. + #[serde(default)] + pub nodes_processed: usize, + + /// Number of summaries generated. + #[serde(default)] + pub summaries_generated: usize, + + /// Number of nodes skipped (thinning). + #[serde(default)] + pub nodes_skipped: usize, + + /// Number of nodes merged. + #[serde(default)] + pub nodes_merged: usize, +} + +impl IndexMetrics { + /// Create new metrics with start time. + pub fn new() -> Self { + Self::default() + } + + /// Record parse stage time. + pub fn record_parse(&mut self, duration_ms: u64) { + self.parse_time_ms = duration_ms; + } + + /// Record build stage time. + pub fn record_build(&mut self, duration_ms: u64) { + self.build_time_ms = duration_ms; + } + + /// Record enhance stage time. + pub fn record_enhance(&mut self, duration_ms: u64) { + self.enhance_time_ms = duration_ms; + } + + /// Record enrich stage time. + pub fn record_enrich(&mut self, duration_ms: u64) { + self.enrich_time_ms = duration_ms; + } + + /// Record optimize stage time. + pub fn record_optimize(&mut self, duration_ms: u64) { + self.optimize_time_ms = duration_ms; + } + + /// Record reasoning index build time. + pub fn record_reasoning_index(&mut self, duration_ms: u64, topics: usize, keywords: usize) { + self.reasoning_index_time_ms = duration_ms; + self.topics_indexed = topics; + self.keywords_indexed = keywords; + } + + /// Increment LLM calls.
+ pub fn increment_llm_calls(&mut self) { + self.llm_calls += 1; + } + + /// Add to tokens generated. + pub fn add_tokens_generated(&mut self, tokens: usize) { + self.total_tokens_generated += tokens; + } + + /// Set nodes processed. + pub fn set_nodes_processed(&mut self, count: usize) { + self.nodes_processed = count; + } + + /// Increment summaries generated. + pub fn increment_summaries(&mut self) { + self.summaries_generated += 1; + } + + /// Increment nodes skipped. + pub fn increment_nodes_skipped(&mut self) { + self.nodes_skipped += 1; + } + + /// Increment nodes merged. + pub fn increment_nodes_merged(&mut self) { + self.nodes_merged += 1; + } + + /// Get total time. + pub fn total_time_ms(&self) -> u64 { + self.parse_time_ms + + self.build_time_ms + + self.enhance_time_ms + + self.enrich_time_ms + + self.reasoning_index_time_ms + + self.optimize_time_ms + } +} diff --git a/rust/src/metrics/mod.rs b/rust/src/metrics/mod.rs index e60a1103..b311efea 100644 --- a/rust/src/metrics/mod.rs +++ b/rust/src/metrics/mod.rs @@ -50,11 +50,13 @@ //! ``` mod hub; +mod index; mod llm; mod pilot; mod retrieval; pub use hub::MetricsHub; +pub use index::IndexMetrics; pub use llm::{LlmMetrics, LlmMetricsReport}; pub use pilot::{InterventionPoint, PilotMetrics, PilotMetricsReport}; pub use retrieval::{RetrievalMetrics, RetrievalMetricsReport}; diff --git a/rust/src/retrieval/pipeline/context.rs b/rust/src/retrieval/pipeline/context.rs index d5158ecb..45530ac2 100644 --- a/rust/src/retrieval/pipeline/context.rs +++ b/rust/src/retrieval/pipeline/context.rs @@ -10,7 +10,8 @@ use std::collections::HashMap; use std::sync::Arc; use std::time::Instant; -use crate::document::{DocumentGraph, DocumentTree, NodeId, ReasoningIndex, RetrievalIndex}; +use crate::document::{DocumentTree, NodeId, ReasoningIndex, RetrievalIndex}; +use crate::graph::DocumentGraph; use crate::retrieval::cache::{HotNodeTracker, ReasoningCache}; use crate::retrieval::pipeline::budget::RetrievalBudgetController; use crate::retrieval::pilot::Pilot; @@ -343,8 +344,8 @@ impl PipelineContext { } /// Set the document graph for graph-aware retrieval. 
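+    ///
+    /// Minimal sketch (an empty graph here; real callers pass the graph loaded
+    /// from the workspace):
+    ///
+    /// ```ignore
+    /// let graph = Arc::new(DocumentGraph::new());
+    /// let ctx = ctx.with_document_graph(graph);
+    /// ```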
- pub fn with_document_graph(mut self, graph: DocumentGraph) -> Self { - self.document_graph = Some(Arc::new(graph)); + pub fn with_document_graph(mut self, graph: Arc<DocumentGraph>) -> Self { + self.document_graph = Some(graph); self } diff --git a/rust/src/retrieval/pipeline/orchestrator.rs b/rust/src/retrieval/pipeline/orchestrator.rs index 6e53fbc3..50976988 100644 --- a/rust/src/retrieval/pipeline/orchestrator.rs +++ b/rust/src/retrieval/pipeline/orchestrator.rs @@ -322,7 +322,11 @@ impl RetrievalOrchestrator { ); // Create context with Pilot + let document_graph = options.document_graph.clone(); let mut ctx = PipelineContext::with_pilot(tree, query, options, self.pilot.clone()); + if let Some(graph) = document_graph { + ctx = ctx.with_document_graph(graph); + } // Track execution state let mut backtrack_count = 0; @@ -600,10 +604,14 @@ let groups = self.compute_execution_groups(&order); // Create context with Pilot and reasoning index + let document_graph = options.document_graph.clone(); let mut ctx = PipelineContext::with_pilot(tree, query, options, self.pilot.clone()); if let Some(ri) = reasoning_index { ctx = ctx.with_reasoning_index(ri); } + if let Some(graph) = document_graph { + ctx = ctx.with_document_graph(graph); + } let mut backtrack_count = 0; let mut total_iterations = 0; @@ -897,7 +905,11 @@ let order = self.resolve_order()?; let groups = self.compute_execution_groups(&order); + let document_graph = options.document_graph.clone(); let mut ctx = PipelineContext::with_pilot(tree, query, options, self.pilot.clone()); + if let Some(graph) = document_graph { + ctx = ctx.with_document_graph(graph); + } let mut backtrack_count = 0; let mut total_iterations = 0; diff --git a/rust/src/retrieval/strategy/cross_document.rs b/rust/src/retrieval/strategy/cross_document.rs index fe43f775..4dfa1f4d 100644 --- a/rust/src/retrieval/strategy/cross_document.rs +++ b/rust/src/retrieval/strategy/cross_document.rs @@ -11,7 +11,8 @@ use std::collections::HashMap; use std::sync::Arc; use super::r#trait::{NodeEvaluation, RetrievalStrategy, StrategyCapabilities}; -use crate::document::{DocumentGraph, DocumentTree, NodeId}; +use crate::document::{DocumentTree, NodeId}; +use crate::graph::DocumentGraph; use crate::retrieval::types::{NavigationDecision, QueryComplexity}; use crate::retrieval::RetrievalContext; diff --git a/rust/src/retrieval/types.rs b/rust/src/retrieval/types.rs index fa1b7e1c..649163d7 100644 --- a/rust/src/retrieval/types.rs +++ b/rust/src/retrieval/types.rs @@ -125,6 +125,9 @@ pub struct RetrieveOptions { /// `RetrieveEvent`s as each pipeline stage completes. When disabled /// (default), the standard `query()` returns a single final result. pub streaming: bool, + + /// Cross-document graph for graph-aware retrieval boosting. + pub document_graph: Option<std::sync::Arc<crate::graph::DocumentGraph>>, } impl Default for RetrieveOptions { @@ -144,6 +147,7 @@ token_estimation: super::TokenEstimation::default(), use_async_context: false, streaming: false, + document_graph: None, } } } @@ -252,6 +256,13 @@ impl RetrieveOptions { self.streaming = enable; self } + + /// Set the cross-document graph for graph-aware retrieval boosting. + #[must_use] + pub fn with_document_graph(mut self, graph: std::sync::Arc<crate::graph::DocumentGraph>) -> Self { + self.document_graph = Some(graph); + self + } } /// A single retrieval result.
diff --git a/rust/src/storage/persistence.rs b/rust/src/storage/persistence.rs
index 2e6c1f91..fece82a8 100644
--- a/rust/src/storage/persistence.rs
+++ b/rust/src/storage/persistence.rs
@@ -58,6 +58,11 @@ pub struct DocumentMeta {
     #[serde(default, skip_serializing_if = "crate::utils::fingerprint::Fingerprint::is_zero")]
     pub content_fingerprint: crate::utils::fingerprint::Fingerprint,
 
+    /// Logic fingerprint (hash of pipeline configuration used to produce this document).
+    /// If the pipeline config changes, a full reprocess is needed even if content didn't change.
+    #[serde(default, skip_serializing_if = "crate::utils::fingerprint::Fingerprint::is_zero")]
+    pub logic_fingerprint: crate::utils::fingerprint::Fingerprint,
+
     /// Processing version (incremented when algorithm changes).
     #[serde(default)]
     pub processing_version: u32,
@@ -94,6 +99,7 @@ impl DocumentMeta {
            created_at: now,
            modified_at: now,
            content_fingerprint: crate::utils::fingerprint::Fingerprint::zero(),
+           logic_fingerprint: crate::utils::fingerprint::Fingerprint::zero(),
            processing_version: 0,
            node_count: 0,
            total_summary_tokens: 0,
@@ -120,6 +126,12 @@ impl DocumentMeta {
         self
     }
 
+    /// Set the logic fingerprint.
+    pub fn with_logic_fingerprint(mut self, fp: crate::utils::fingerprint::Fingerprint) -> Self {
+        self.logic_fingerprint = fp;
+        self
+    }
+
     /// Set the processing version.
     pub fn with_processing_version(mut self, version: u32) -> Self {
         self.processing_version = version;
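Reviewer note: a sketch of the reprocess decision the new field enables. It assumes `Fingerprint` implements `PartialEq` and that the types are re-exported at the paths shown; only `zero()` and `is_zero` are visible in this diff.

```rust
use vectorless::storage::DocumentMeta;           // assumed re-export path
use vectorless::utils::fingerprint::Fingerprint; // assumed re-export path

/// Even when the content is byte-identical, a changed pipeline config
/// (logic fingerprint) must force a full reprocess.
/// Assumption: `Fingerprint: PartialEq`.
fn needs_reprocess(meta: &DocumentMeta, content_fp: &Fingerprint, logic_fp: &Fingerprint) -> bool {
    meta.content_fingerprint != *content_fp || meta.logic_fingerprint != *logic_fp
}
```

A freshly indexed document would then be stamped with the current config hash via the new builder, e.g. `meta.with_logic_fingerprint(logic_fp)`.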
diff --git a/rust/src/storage/workspace.rs b/rust/src/storage/workspace.rs
index c2192cfa..974f0e68 100644
--- a/rust/src/storage/workspace.rs
+++ b/rust/src/storage/workspace.rs
@@ -111,7 +111,7 @@ struct WorkspaceInner {
     /// LRU cache for loaded documents.
     cache: DocumentCache,
     /// Cross-document relationship graph (cached).
-    document_graph: Option<Arc<crate::document::DocumentGraph>>,
+    document_graph: Option<Arc<crate::graph::DocumentGraph>>,
 }
@@ -381,6 +381,21 @@ impl Workspace {
         inner.meta_index.is_empty()
     }
 
+    /// Find a document ID by its source path.
+    ///
+    /// Returns the first document whose `source_path` matches.
+    /// Used for incremental indexing to check if a file has already been indexed.
+    pub async fn find_by_source_path(&self, path: &std::path::Path) -> Option<String> {
+        let target = path.to_string_lossy().to_string();
+        let inner = self.inner.read().await;
+        for (_, entry) in &inner.meta_index {
+            if entry.path.as_deref() == Some(target.as_str()) {
+                return Some(entry.id.clone());
+            }
+        }
+        None
+    }
+
     /// Get the number of items currently in the LRU cache.
     pub async fn cache_len(&self) -> usize {
         let inner = self.inner.read().await;
@@ -415,7 +430,7 @@ impl Workspace {
     const GRAPH_KEY: &'static str = "_graph";
 
     /// Get the document graph, loading from backend if not cached.
-    pub async fn get_graph(&self) -> Result<Arc<crate::document::DocumentGraph>> {
+    pub async fn get_graph(&self) -> Result<Arc<crate::graph::DocumentGraph>> {
         // Check cache first
         {
             let inner = self.inner.read().await;
@@ -428,7 +443,7 @@
         let inner = self.inner.read().await;
         match inner.backend.get(Self::GRAPH_KEY)? {
             Some(bytes) => {
                let graph: crate::graph::DocumentGraph =
                    serde_json::from_slice(&bytes).map_err(|e| {
                        crate::Error::Serialization(format!("Failed to deserialize graph: {}", e))
                    })?;
@@ -440,7 +455,7 @@
     }
 
     /// Persist the document graph to the backend.
-    pub async fn set_graph(&self, graph: &crate::document::DocumentGraph) -> Result<()> {
+    pub async fn set_graph(&self, graph: &crate::graph::DocumentGraph) -> Result<()> {
         let mut inner = self.inner.write().await;
         let bytes = serde_json::to_vec(graph).map_err(|e| {
             crate::Error::Serialization(format!("Failed to serialize graph: {}", e))