diff --git a/.gitignore b/.gitignore index 553e4b7c..a05dac13 100644 --- a/.gitignore +++ b/.gitignore @@ -83,4 +83,7 @@ wheels/ .ruff_cache/ .venv/ venv/ -ENV/ \ No newline at end of file +ENV/ + +# Test workspace +workspace* \ No newline at end of file diff --git a/rust/examples/graph.rs b/rust/examples/graph.rs index 141b66df..cdefb451 100644 --- a/rust/examples/graph.rs +++ b/rust/examples/graph.rs @@ -30,7 +30,7 @@ async fn main() -> vectorless::Result<()> { // 2. Index documents — graph is rebuilt automatically let result = engine - .index(IndexContext::from_paths(&["./README.md", "./CLAUDE.md"])) + .index(IndexContext::from_paths(&["../README.md", "../CLAUDE.md"])) .await?; println!("Indexed {} document(s)", result.items.len()); diff --git a/rust/examples/index_incremental.rs b/rust/examples/index_incremental.rs new file mode 100644 index 00000000..6b710a93 --- /dev/null +++ b/rust/examples/index_incremental.rs @@ -0,0 +1,96 @@ +// Copyright (c) 2026 vectorless developers +// SPDX-License-Identifier: Apache-2.0 + +//! Incremental indexing example — re-index with change detection. +//! +//! ```bash +//! cargo run --example index_incremental +//! ``` + +use vectorless::{DocumentFormat, EngineBuilder, IndexContext, IndexMode}; + +#[tokio::main] +async fn main() -> vectorless::Result<()> { + let engine = EngineBuilder::new() + .with_workspace("./workspace_incremental_example") + .with_key("sk-or-v1-...") + .with_model("google/gemini-3-flash-preview") + .with_endpoint("http://localhost:4000/api/v1") + .build() + .await + .map_err(|e| vectorless::Error::Config(e.to_string()))?; + + let content_v1 = r#"# API Reference + +## GET /users + +Returns a list of all users in the system. + +## POST /users + +Creates a new user account. +"#; + + let content_v2 = r#"# API Reference + +## GET /users + +Returns a paginated list of users. Supports `?page=` and `?limit=` parameters. + +## POST /users + +Creates a new user account. Requires email and password fields. 
+ +## DELETE /users/:id + +Deletes a user by their unique identifier. +"#; + + // 1. Initial full index + println!("--- Initial index ---"); + let result = engine + .index(IndexContext::from_content(content_v1, DocumentFormat::Markdown)) + .await?; + + let doc_id = result.items[0].doc_id.clone(); + if let Some(m) = &result.items[0].metrics { + println!("indexed in {}ms, {} nodes", m.total_time_ms(), m.nodes_processed); + } + + // 2. Re-index unchanged content (incremental) — skips processing + println!("\n--- Re-index unchanged (incremental) ---"); + let result = engine + .index( + IndexContext::from_content(content_v1, DocumentFormat::Markdown) + .with_mode(IndexMode::Incremental), + ) + .await?; + + for item in &result.items { + println!("doc_id: {} (unchanged, skipped)", item.doc_id); + } + + // 3. Re-index with changes (incremental) — detects diff and updates + println!("\n--- Re-index with changes (incremental) ---"); + let result = engine + .index( + IndexContext::from_content(content_v2, DocumentFormat::Markdown) + .with_mode(IndexMode::Incremental), + ) + .await?; + + for item in &result.items { + if let Some(m) = &item.metrics { + println!("updated in {}ms, {} nodes", m.total_time_ms(), m.nodes_processed); + } + } + + println!("\ndoc_id: {doc_id}"); + + // Cleanup + for doc in engine.list().await? { + engine.remove(&doc.id).await?; + } + + Ok(()) +} diff --git a/rust/examples/index_single.rs b/rust/examples/index_single.rs new file mode 100644 index 00000000..8218cc00 --- /dev/null +++ b/rust/examples/index_single.rs @@ -0,0 +1,86 @@ +// Copyright (c) 2026 vectorless developers +// SPDX-License-Identifier: Apache-2.0 + +//! Single document indexing example — index one document from content. +//! +//! ```bash +//! cargo run --example index_single +//! 
``` + +use vectorless::{DocumentFormat, EngineBuilder, IndexContext}; + +#[tokio::main] +async fn main() -> vectorless::Result<()> { + let engine = EngineBuilder::new() + .with_workspace("./workspace_single_example") + .with_key("sk-or-v1-...") + .with_model("google/gemini-3-flash-preview") + .with_endpoint("http://localhost:4000/api/v1") + .build() + .await + .map_err(|e| vectorless::Error::Config(e.to_string()))?; + + let content = r#"# Project Overview + +## Introduction + +This document describes the architecture of a distributed system +designed for high-throughput data processing. + +## Components + +### API Gateway + +Handles authentication, rate limiting, and request routing. +Supports both REST and gRPC protocols. + +### Worker Pool + +Processes tasks from the message queue. Each worker handles +one task at a time with configurable timeout. + +## Performance + +Under load testing, the system achieves 50k requests/second +with p99 latency under 200ms. + +## Conclusion + +The modular design allows independent scaling of each component. 
+"#; + + // Index from content string + let result = engine + .index(IndexContext::from_content(content, DocumentFormat::Markdown)) + .await?; + + for item in &result.items { + println!("doc_id: {}", item.doc_id); + println!("name: {}", item.name); + println!("format: {:?}", item.format); + + if let Some(metrics) = &item.metrics { + println!(" metrics:"); + println!(" total time: {}ms", metrics.total_time_ms()); + println!(" parse: {}ms", metrics.parse_time_ms); + println!(" build: {}ms", metrics.build_time_ms); + println!(" enhance: {}ms", metrics.enhance_time_ms); + println!(" enrich: {}ms", metrics.enrich_time_ms); + println!(" optimize: {}ms", metrics.optimize_time_ms); + println!(" reasoning: {}ms", metrics.reasoning_index_time_ms); + println!(" nodes: {}", metrics.nodes_processed); + println!(" summaries: {}", metrics.summaries_generated); + println!(" llm calls: {}", metrics.llm_calls); + println!(" tokens: {}", metrics.total_tokens_generated); + println!(" topics: {}", metrics.topics_indexed); + println!(" keywords: {}", metrics.keywords_indexed); + } + } + + // Cleanup + for doc in engine.list().await? { + engine.remove(&doc.id).await?; + } + + Ok(()) +} diff --git a/rust/examples/indexing.rs b/rust/examples/indexing.rs index fc764835..53d8fe92 100644 --- a/rust/examples/indexing.rs +++ b/rust/examples/indexing.rs @@ -1,110 +1,46 @@ // Copyright (c) 2026 vectorless developers // SPDX-License-Identifier: Apache-2.0 -//! Index pipeline example for Vectorless. -//! -//! Demonstrates the full indexing flow: create engine → index document → inspect metrics. -//! -//! # Usage +//! Batch indexing example — index multiple documents at once. //! //! ```bash //! cargo run --example indexing //! ``` -use vectorless::{EngineBuilder, IndexContext, IndexMode}; +use vectorless::{EngineBuilder, IndexContext}; #[tokio::main] async fn main() -> vectorless::Result<()> { - println!("=== Index Pipeline Example ===\n"); - - // 1. 
Create engine let engine = EngineBuilder::new() - .with_workspace("./workspace_index_example") - .with_key("sk-...") - .with_model("gpt-4o") + .with_workspace("./workspace_batch_example") + .with_key("sk-or-v1-...") + .with_model("google/gemini-3-flash-preview") + .with_endpoint("http://localhost:4000/api/v1") .build() .await - .map_err(|e: vectorless::BuildError| vectorless::Error::Config(e.to_string()))?; - - println!("Engine created\n"); - - // 2. Index a single document with default options - println!("--- Single document (default mode) ---"); - let result = engine.index(IndexContext::from_path("./README.md")).await?; - + .map_err(|e| vectorless::Error::Config(e.to_string()))?; + + // Index multiple files from different paths + let result = engine + .index(IndexContext::from_paths(&[ + "../README.md", + "../CLAUDE.md", + "../LICENSE", + ])) + .await?; + + println!("indexed: {}, failed: {}", result.items.len(), result.failed.len()); for item in &result.items { - println!(" doc_id: {}", item.doc_id); - println!(" name: {}", item.name); - println!(" format: {:?}", item.format); - - if let Some(ref metrics) = item.metrics { - println!(" metrics:"); - println!(" total time: {}ms", metrics.total_time_ms()); - println!(" parse: {}ms", metrics.parse_time_ms); - println!(" build: {}ms", metrics.build_time_ms); - println!(" enhance: {}ms", metrics.enhance_time_ms); - println!(" enrich: {}ms", metrics.enrich_time_ms); - println!(" optimize: {}ms", metrics.optimize_time_ms); - println!(" reasoning: {}ms", metrics.reasoning_index_time_ms); - println!(" nodes: {}", metrics.nodes_processed); - println!(" summaries: {}", metrics.summaries_generated); - println!(" llm calls: {}", metrics.llm_calls); - println!(" tokens: {}", metrics.total_tokens_generated); - println!(" topics: {}", metrics.topics_indexed); - println!(" keywords: {}", metrics.keywords_indexed); - } - - // doc_id preserved across the loop for readability - let _doc_id = item.doc_id.clone(); - - // 3. 
Re-index with incremental mode — should detect no change - println!("\n--- Re-index (incremental, unchanged) ---"); - let result2 = engine - .index(IndexContext::from_path("./README.md").with_mode(IndexMode::Incremental)) - .await?; - - for item in &result2.items { - println!( - " {} (metrics present: {})", - item.doc_id, - item.metrics.is_some() - ); - } - - // 4. Index multiple documents at once - println!("\n--- Batch indexing ---"); - let batch = engine - .index(IndexContext::from_paths(&["./README.md", "./CLAUDE.md"])) - .await?; - - println!( - " indexed: {}, failed: {}", - batch.items.len(), - batch.failed.len() - ); - for item in &batch.items { - let time = item - .metrics - .as_ref() - .map(|m| m.total_time_ms()) - .unwrap_or(0); - let nodes = item - .metrics - .as_ref() - .map(|m| m.nodes_processed) - .unwrap_or(0); - println!(" {} — {}ms, {} nodes", item.name, time, nodes); - } + println!(" {} — doc_id: {}", item.name, item.doc_id); + } + for fail in &result.failed { + println!(" FAILED: {} — {}", fail.source, fail.error); + } - // 5. Cleanup - println!("\n--- Cleanup ---"); - let docs = engine.list().await?; - for doc in &docs { - engine.remove(&doc.id).await?; - } - println!(" removed {} document(s)", docs.len()); + // Cleanup + for doc in engine.list().await? 
{ + engine.remove(&doc.id).await?; } - println!("\n=== Done ==="); Ok(()) } diff --git a/rust/src/client/engine.rs b/rust/src/client/engine.rs index 683ac782..21f09c08 100644 --- a/rust/src/client/engine.rs +++ b/rust/src/client/engine.rs @@ -164,8 +164,9 @@ impl Engine { .await; if items.is_empty() && !failed.is_empty() { return Err(Error::Config(format!( - "All {} source(s) failed to index", - failed.len() + "All {} source(s) failed to index: {}", + failed.len(), + failed.iter().map(|f| format!("{} ({})", f.source, f.error)).collect::<Vec<_>>().join("; ") ))); } if !items.is_empty() { @@ -207,8 +208,9 @@ impl Engine { if items.is_empty() && !failed.is_empty() { return Err(Error::Config(format!( - "All {} source(s) failed to index", - failed.len() + "All {} source(s) failed to index: {}", + failed.len(), + failed.iter().map(|f| format!("{} ({})", f.source, f.error)).collect::<Vec<_>>().join("; ") ))); } diff --git a/rust/src/client/indexer.rs b/rust/src/client/indexer.rs index f0e43890..373be62c 100644 --- a/rust/src/client/indexer.rs +++ b/rust/src/client/indexer.rs @@ -428,13 +428,24 @@ impl IndexerClient { let logic_fp = pipeline_options.logic_fingerprint(); meta = meta.with_logic_fingerprint(logic_fp); - let mut persisted = - PersistedDocument::new(meta, doc.tree.expect("IndexedDocument must have a tree")); + let tree = doc.tree.expect("IndexedDocument must have a tree"); + + // Extract stats from metrics + let node_count = tree.node_count(); + let (summary_tokens, duration_ms) = if let Some(ref m) = doc.metrics { + (m.total_tokens_generated, m.total_time_ms()) + } else { + (0, 0) + }; + + let mut persisted = PersistedDocument::new(meta, tree); for page in doc.pages { persisted.add_page(page.page, &page.content); } + persisted.meta.update_processing_stats(node_count, summary_tokens, duration_ms); + persisted } } diff --git a/rust/src/index/config.rs b/rust/src/index/config.rs index f06fa22c..d43d7900 100644 --- a/rust/src/index/config.rs +++ b/rust/src/index/config.rs @@ 
-55,7 +55,7 @@ impl Default for OptimizationConfig { enabled: true, max_depth: None, max_children: None, - merge_leaf_threshold: 50, + merge_leaf_threshold: 0, } } } diff --git a/rust/src/index/stages/optimize.rs b/rust/src/index/stages/optimize.rs index 9eca0b8f..6b21688f 100644 --- a/rust/src/index/stages/optimize.rs +++ b/rust/src/index/stages/optimize.rs @@ -22,7 +22,11 @@ impl OptimizeStage { Self } - /// Merge adjacent small leaf nodes. + /// Merge adjacent small leaf nodes that are siblings under the same parent. + /// + /// Only merges nodes that are both **leaves** (no children of their own). + /// Non-leaf nodes (section headings with subsections) are never merged, + /// even if their own content is empty. fn merge_small_leaves( tree: &mut crate::document::DocumentTree, min_tokens: usize, @@ -30,7 +34,7 @@ impl OptimizeStage { ) -> usize { let mut merged_count = 0; - // Get all non-leaf nodes + // Get all non-leaf nodes (parents whose children may be candidates) let non_leaves: Vec<NodeId> = tree .traverse() .into_iter() @@ -43,27 +47,43 @@ impl OptimizeStage { continue; } - // Find pairs of adjacent small nodes + // Collect children info: only leaf nodes are merge candidates + let candidates: Vec<(NodeId, usize, bool)> = children + .iter() + .map(|&id| { + let tokens = tree.get(id).and_then(|n| n.token_count).unwrap_or(0); + let is_leaf = tree.is_leaf(id); + (id, tokens, is_leaf) + }) + .collect(); + + // Find pairs of adjacent small leaf siblings let mut i = 0; - while i < children.len() - 1 { - let curr_id = children[i]; - let next_id = children[i + 1]; - - let curr_tokens = tree.get(curr_id).and_then(|n| n.token_count).unwrap_or(0); - let next_tokens = tree.get(next_id).and_then(|n| n.token_count).unwrap_or(0); - - // If both are small, merge next into current - if curr_tokens < min_tokens && next_tokens < min_tokens { - // Merge content + while i < candidates.len() - 1 { + let (curr_id, curr_tokens, curr_is_leaf) = candidates[i]; + let (next_id, next_tokens, 
next_is_leaf) = candidates[i + 1]; + + // Both must be leaves with actual content, and both must be small + if curr_is_leaf + && next_is_leaf + && curr_tokens > 0 + && curr_tokens < min_tokens + && next_tokens > 0 + && next_tokens < min_tokens + { + // Merge next into current if let Some(next_node) = tree.get(next_id).cloned() { if let Some(curr) = tree.get_mut(curr_id) { if !next_node.content.is_empty() { if !curr.content.is_empty() { - curr.content.push('\n'); + curr.content.push_str("\n\n"); } - curr.content.push_str(&next_node.content); + // Prefix with heading to preserve boundary + curr.content + .push_str(&format!("## {}\n{}", next_node.title, next_node.content)); } - curr.token_count = Some(curr.token_count.unwrap_or(0) + next_tokens); + curr.token_count = + Some(curr.token_count.unwrap_or(0) + next_tokens); } } @@ -86,15 +106,20 @@ impl OptimizeStage { merged_count } - /// Remove empty intermediate nodes. + /// Remove empty intermediate nodes (skip root). fn remove_empty_nodes(tree: &mut crate::document::DocumentTree) -> usize { let mut removed_count = 0; + let root = tree.root(); - // Find nodes with no content and only one child + // Find non-root nodes with no content and only one child let candidates: Vec<NodeId> = tree .traverse() .into_iter() .filter(|id| { + // Skip root node + if *id == root { + return false; + } if tree.is_leaf(*id) { return false; }