From 9ceacbf3141f65cc03170aa8e3f63691c5824c4c Mon Sep 17 00:00:00 2001 From: zTgx <747674262@qq.com> Date: Sun, 12 Apr 2026 09:10:24 +0800 Subject: [PATCH 1/5] feat(client): enhance error messages with detailed failure information - Add specific error details showing which sources failed and their respective error messages - Replace generic "All X source(s) failed to index" message with detailed breakdown - Include source name and error description in the error output for better debugging --- rust/src/client/engine.rs | 10 ++++++---- 1 file changed, 6 insertions(+), 4 deletions(-) diff --git a/rust/src/client/engine.rs b/rust/src/client/engine.rs index 683ac782..21f09c08 100644 --- a/rust/src/client/engine.rs +++ b/rust/src/client/engine.rs @@ -164,8 +164,9 @@ impl Engine { .await; if items.is_empty() && !failed.is_empty() { return Err(Error::Config(format!( - "All {} source(s) failed to index", - failed.len() + "All {} source(s) failed to index: {}", + failed.len(), + failed.iter().map(|f| format!("{} ({})", f.source, f.error)).collect::<Vec<_>>().join("; ") ))); } if !items.is_empty() { @@ -207,8 +208,9 @@ impl Engine { if items.is_empty() && !failed.is_empty() { return Err(Error::Config(format!( - "All {} source(s) failed to index", - failed.len() + "All {} source(s) failed to index: {}", + failed.len(), + failed.iter().map(|f| format!("{} ({})", f.source, f.error)).collect::<Vec<_>>().join("; ") ))); } From d7e016c070f4c38fcc32ff0c95dce63286ea653d Mon Sep 17 00:00:00 2001 From: zTgx <747674262@qq.com> Date: Sun, 12 Apr 2026 09:21:32 +0800 Subject: [PATCH 2/5] feat(rust): add incremental and single document indexing examples - Add index_incremental.rs example demonstrating incremental indexing with change detection capabilities - Add index_single.rs example showing single document indexing workflow - Update existing indexing.rs example to focus on batch indexing with multiple document paths - Update example configurations to use google/gemini-3-flash-preview model and proper 
endpoint - Include comprehensive cleanup logic in all examples chore: update .gitignore with workspace patterns - Add workspace* pattern to ignore test workspace directories - Keep ENV/ directory in gitignore as was previously intended --- .gitignore | 5 +- rust/examples/index_incremental.rs | 96 ++++++++++++++++++++++++ rust/examples/index_single.rs | 72 ++++++++++++++++++ rust/examples/indexing.rs | 116 +++++++---------------------- 4 files changed, 198 insertions(+), 91 deletions(-) create mode 100644 rust/examples/index_incremental.rs create mode 100644 rust/examples/index_single.rs diff --git a/.gitignore b/.gitignore index 553e4b7c..a05dac13 100644 --- a/.gitignore +++ b/.gitignore @@ -83,4 +83,7 @@ wheels/ .ruff_cache/ .venv/ venv/ -ENV/ \ No newline at end of file +ENV/ + +# Test workspace +workspace* \ No newline at end of file diff --git a/rust/examples/index_incremental.rs b/rust/examples/index_incremental.rs new file mode 100644 index 00000000..6b710a93 --- /dev/null +++ b/rust/examples/index_incremental.rs @@ -0,0 +1,96 @@ +// Copyright (c) 2026 vectorless developers +// SPDX-License-Identifier: Apache-2.0 + +//! Incremental indexing example — re-index with change detection. +//! +//! ```bash +//! cargo run --example index_incremental +//! ``` + +use vectorless::{DocumentFormat, EngineBuilder, IndexContext, IndexMode}; + +#[tokio::main] +async fn main() -> vectorless::Result<()> { + let engine = EngineBuilder::new() + .with_workspace("./workspace_incremental_example") + .with_key("sk-or-v1-...") + .with_model("google/gemini-3-flash-preview") + .with_endpoint("http://localhost:4000/api/v1") + .build() + .await + .map_err(|e| vectorless::Error::Config(e.to_string()))?; + + let content_v1 = r#"# API Reference + +## GET /users + +Returns a list of all users in the system. + +## POST /users + +Creates a new user account. +"#; + + let content_v2 = r#"# API Reference + +## GET /users + +Returns a paginated list of users. 
Supports `?page=` and `?limit=` parameters. + +## POST /users + +Creates a new user account. Requires email and password fields. + +## DELETE /users/:id + +Deletes a user by their unique identifier. +"#; + + // 1. Initial full index + println!("--- Initial index ---"); + let result = engine + .index(IndexContext::from_content(content_v1, DocumentFormat::Markdown)) + .await?; + + let doc_id = result.items[0].doc_id.clone(); + if let Some(m) = &result.items[0].metrics { + println!("indexed in {}ms, {} nodes", m.total_time_ms(), m.nodes_processed); + } + + // 2. Re-index unchanged content (incremental) — skips processing + println!("\n--- Re-index unchanged (incremental) ---"); + let result = engine + .index( + IndexContext::from_content(content_v1, DocumentFormat::Markdown) + .with_mode(IndexMode::Incremental), + ) + .await?; + + for item in &result.items { + println!("doc_id: {} (unchanged, skipped)", item.doc_id); + } + + // 3. Re-index with changes (incremental) — detects diff and updates + println!("\n--- Re-index with changes (incremental) ---"); + let result = engine + .index( + IndexContext::from_content(content_v2, DocumentFormat::Markdown) + .with_mode(IndexMode::Incremental), + ) + .await?; + + for item in &result.items { + if let Some(m) = &item.metrics { + println!("updated in {}ms, {} nodes", m.total_time_ms(), m.nodes_processed); + } + } + + println!("\ndoc_id: {doc_id}"); + + // Cleanup + for doc in engine.list().await? { + engine.remove(&doc.id).await?; + } + + Ok(()) +} diff --git a/rust/examples/index_single.rs b/rust/examples/index_single.rs new file mode 100644 index 00000000..aa396395 --- /dev/null +++ b/rust/examples/index_single.rs @@ -0,0 +1,72 @@ +// Copyright (c) 2026 vectorless developers +// SPDX-License-Identifier: Apache-2.0 + +//! Single document indexing example — index one document from content. +//! +//! ```bash +//! cargo run --example index_single +//! 
``` + +use vectorless::{DocumentFormat, EngineBuilder, IndexContext}; + +#[tokio::main] +async fn main() -> vectorless::Result<()> { + let engine = EngineBuilder::new() + .with_workspace("./workspace_single_example") + .with_key("sk-or-v1-...") + .with_model("google/gemini-3-flash-preview") + .with_endpoint("http://localhost:4000/api/v1") + .build() + .await + .map_err(|e| vectorless::Error::Config(e.to_string()))?; + + let content = r#"# Project Overview + +## Introduction + +This document describes the architecture of a distributed system +designed for high-throughput data processing. + +## Components + +### API Gateway + +Handles authentication, rate limiting, and request routing. +Supports both REST and gRPC protocols. + +### Worker Pool + +Processes tasks from the message queue. Each worker handles +one task at a time with configurable timeout. + +## Performance + +Under load testing, the system achieves 50k requests/second +with p99 latency under 200ms. + +## Conclusion + +The modular design allows independent scaling of each component. +"#; + + // Index from content string + let result = engine + .index(IndexContext::from_content(content, DocumentFormat::Markdown)) + .await?; + + for item in &result.items { + println!("doc_id: {}", item.doc_id); + println!("name: {}", item.name); + println!("format: {:?}", item.format); + if let Some(m) = &item.metrics { + println!("time: {}ms, nodes: {}", m.total_time_ms(), m.nodes_processed); + } + } + + // Cleanup + for doc in engine.list().await? { + engine.remove(&doc.id).await?; + } + + Ok(()) +} diff --git a/rust/examples/indexing.rs b/rust/examples/indexing.rs index fc764835..53d8fe92 100644 --- a/rust/examples/indexing.rs +++ b/rust/examples/indexing.rs @@ -1,110 +1,46 @@ // Copyright (c) 2026 vectorless developers // SPDX-License-Identifier: Apache-2.0 -//! Index pipeline example for Vectorless. -//! -//! Demonstrates the full indexing flow: create engine → index document → inspect metrics. -//! -//! # Usage +//! 
Batch indexing example — index multiple documents at once. //! //! ```bash //! cargo run --example indexing //! ``` -use vectorless::{EngineBuilder, IndexContext, IndexMode}; +use vectorless::{EngineBuilder, IndexContext}; #[tokio::main] async fn main() -> vectorless::Result<()> { - println!("=== Index Pipeline Example ===\n"); - - // 1. Create engine let engine = EngineBuilder::new() - .with_workspace("./workspace_index_example") - .with_key("sk-...") - .with_model("gpt-4o") + .with_workspace("./workspace_batch_example") + .with_key("sk-or-v1-...") + .with_model("google/gemini-3-flash-preview") + .with_endpoint("http://localhost:4000/api/v1") .build() .await - .map_err(|e: vectorless::BuildError| vectorless::Error::Config(e.to_string()))?; - - println!("Engine created\n"); - - // 2. Index a single document with default options - println!("--- Single document (default mode) ---"); - let result = engine.index(IndexContext::from_path("./README.md")).await?; - + .map_err(|e| vectorless::Error::Config(e.to_string()))?; + + // Index multiple files from different paths + let result = engine + .index(IndexContext::from_paths(&[ + "../README.md", + "../CLAUDE.md", + "../LICENSE", + ])) + .await?; + + println!("indexed: {}, failed: {}", result.items.len(), result.failed.len()); for item in &result.items { - println!(" doc_id: {}", item.doc_id); - println!(" name: {}", item.name); - println!(" format: {:?}", item.format); - - if let Some(ref metrics) = item.metrics { - println!(" metrics:"); - println!(" total time: {}ms", metrics.total_time_ms()); - println!(" parse: {}ms", metrics.parse_time_ms); - println!(" build: {}ms", metrics.build_time_ms); - println!(" enhance: {}ms", metrics.enhance_time_ms); - println!(" enrich: {}ms", metrics.enrich_time_ms); - println!(" optimize: {}ms", metrics.optimize_time_ms); - println!(" reasoning: {}ms", metrics.reasoning_index_time_ms); - println!(" nodes: {}", metrics.nodes_processed); - println!(" summaries: {}", 
metrics.summaries_generated); - println!(" llm calls: {}", metrics.llm_calls); - println!(" tokens: {}", metrics.total_tokens_generated); - println!(" topics: {}", metrics.topics_indexed); - println!(" keywords: {}", metrics.keywords_indexed); - } - - // doc_id preserved across the loop for readability - let _doc_id = item.doc_id.clone(); - - // 3. Re-index with incremental mode — should detect no change - println!("\n--- Re-index (incremental, unchanged) ---"); - let result2 = engine - .index(IndexContext::from_path("./README.md").with_mode(IndexMode::Incremental)) - .await?; - - for item in &result2.items { - println!( - " {} (metrics present: {})", - item.doc_id, - item.metrics.is_some() - ); - } - - // 4. Index multiple documents at once - println!("\n--- Batch indexing ---"); - let batch = engine - .index(IndexContext::from_paths(&["./README.md", "./CLAUDE.md"])) - .await?; - - println!( - " indexed: {}, failed: {}", - batch.items.len(), - batch.failed.len() - ); - for item in &batch.items { - let time = item - .metrics - .as_ref() - .map(|m| m.total_time_ms()) - .unwrap_or(0); - let nodes = item - .metrics - .as_ref() - .map(|m| m.nodes_processed) - .unwrap_or(0); - println!(" {} — {}ms, {} nodes", item.name, time, nodes); - } + println!(" {} — doc_id: {}", item.name, item.doc_id); + } + for fail in &result.failed { + println!(" FAILED: {} — {}", fail.source, fail.error); + } - // 5. Cleanup - println!("\n--- Cleanup ---"); - let docs = engine.list().await?; - for doc in &docs { - engine.remove(&doc.id).await?; - } - println!(" removed {} document(s)", docs.len()); + // Cleanup + for doc in engine.list().await? 
{ + engine.remove(&doc.id).await?; } - println!("\n=== Done ==="); Ok(()) } From dc6cdd6ee1d5de752bc3dc7f401c8764b32457d4 Mon Sep 17 00:00:00 2001 From: zTgx <747674262@qq.com> Date: Sun, 12 Apr 2026 10:06:54 +0800 Subject: [PATCH 3/5] feat(indexer): add processing statistics to persisted documents Extract node count, token count, and processing duration from document metrics and update processing stats in the persisted document metadata. fix(optimize): prevent merging non-leaf nodes and improve merge logic Change merge_leaf_threshold default to 0 to disable unwanted merging. Improve merge_small_leaves function to only merge adjacent leaf nodes with actual content, preserving section boundaries by adding proper headings when merging content. docs(optimize): clarify merge behavior for leaf nodes only --- rust/src/client/indexer.rs | 15 +++++++- rust/src/index/config.rs | 2 +- rust/src/index/stages/optimize.rs | 61 ++++++++++++++++++++++--------- 3 files changed, 57 insertions(+), 21 deletions(-) diff --git a/rust/src/client/indexer.rs b/rust/src/client/indexer.rs index f0e43890..373be62c 100644 --- a/rust/src/client/indexer.rs +++ b/rust/src/client/indexer.rs @@ -428,13 +428,24 @@ impl IndexerClient { let logic_fp = pipeline_options.logic_fingerprint(); meta = meta.with_logic_fingerprint(logic_fp); - let mut persisted = - PersistedDocument::new(meta, doc.tree.expect("IndexedDocument must have a tree")); + let tree = doc.tree.expect("IndexedDocument must have a tree"); + + // Extract stats from metrics + let node_count = tree.node_count(); + let (summary_tokens, duration_ms) = if let Some(ref m) = doc.metrics { + (m.total_tokens_generated, m.total_time_ms()) + } else { + (0, 0) + }; + + let mut persisted = PersistedDocument::new(meta, tree); for page in doc.pages { persisted.add_page(page.page, &page.content); } + persisted.meta.update_processing_stats(node_count, summary_tokens, duration_ms); + persisted } } diff --git a/rust/src/index/config.rs 
b/rust/src/index/config.rs index f06fa22c..d43d7900 100644 --- a/rust/src/index/config.rs +++ b/rust/src/index/config.rs @@ -55,7 +55,7 @@ impl Default for OptimizationConfig { enabled: true, max_depth: None, max_children: None, - merge_leaf_threshold: 50, + merge_leaf_threshold: 0, } } } diff --git a/rust/src/index/stages/optimize.rs b/rust/src/index/stages/optimize.rs index 9eca0b8f..6b21688f 100644 --- a/rust/src/index/stages/optimize.rs +++ b/rust/src/index/stages/optimize.rs @@ -22,7 +22,11 @@ impl OptimizeStage { Self } - /// Merge adjacent small leaf nodes. + /// Merge adjacent small leaf nodes that are siblings under the same parent. + /// + /// Only merges nodes that are both **leaves** (no children of their own). + /// Non-leaf nodes (section headings with subsections) are never merged, + /// even if their own content is empty. fn merge_small_leaves( tree: &mut crate::document::DocumentTree, min_tokens: usize, ) -> usize { let mut merged_count = 0; - // Get all non-leaf nodes + // Get all non-leaf nodes (parents whose children may be candidates) let non_leaves: Vec<NodeId> = tree .traverse() .into_iter() @@ -43,27 +47,43 @@ continue; } - // Find pairs of adjacent small nodes + // Collect children info: only leaf nodes are merge candidates + let candidates: Vec<(NodeId, usize, bool)> = children + .iter() + .map(|&id| { + let tokens = tree.get(id).and_then(|n| n.token_count).unwrap_or(0); + let is_leaf = tree.is_leaf(id); + (id, tokens, is_leaf) + }) + .collect(); + + // Find pairs of adjacent small leaf siblings let mut i = 0; - while i < children.len() - 1 { - let curr_id = children[i]; - let next_id = children[i + 1]; - - let curr_tokens = tree.get(curr_id).and_then(|n| n.token_count).unwrap_or(0); - let next_tokens = tree.get(next_id).and_then(|n| n.token_count).unwrap_or(0); - - // If both are small, merge next into current - if curr_tokens < min_tokens && next_tokens < min_tokens { - // Merge content 
+ while i < candidates.len() - 1 { + let (curr_id, curr_tokens, curr_is_leaf) = candidates[i]; + let (next_id, next_tokens, next_is_leaf) = candidates[i + 1]; + + // Both must be leaves with actual content, and both must be small + if curr_is_leaf + && next_is_leaf + && curr_tokens > 0 + && curr_tokens < min_tokens + && next_tokens > 0 + && next_tokens < min_tokens + { + // Merge next into current if let Some(next_node) = tree.get(next_id).cloned() { if let Some(curr) = tree.get_mut(curr_id) { if !next_node.content.is_empty() { if !curr.content.is_empty() { - curr.content.push('\n'); + curr.content.push_str("\n\n"); } - curr.content.push_str(&next_node.content); + // Prefix with heading to preserve boundary + curr.content + .push_str(&format!("## {}\n{}", next_node.title, next_node.content)); } - curr.token_count = Some(curr.token_count.unwrap_or(0) + next_tokens); + curr.token_count = + Some(curr.token_count.unwrap_or(0) + next_tokens); } } @@ -86,15 +106,20 @@ impl OptimizeStage { merged_count } - /// Remove empty intermediate nodes. + /// Remove empty intermediate nodes (skip root). 
fn remove_empty_nodes(tree: &mut crate::document::DocumentTree) -> usize { let mut removed_count = 0; + let root = tree.root(); - // Find nodes with no content and only one child + // Find non-root nodes with no content and only one child let candidates: Vec<NodeId> = tree .traverse() .into_iter() .filter(|id| { + // Skip root node + if *id == root { + return false; + } if tree.is_leaf(*id) { return false; } From 5b30f8b52eb24255e78db121d989919996bc290 Mon Sep 17 00:00:00 2001 From: zTgx <747674262@qq.com> Date: Sun, 12 Apr 2026 11:16:06 +0800 Subject: [PATCH 4/5] feat(rust/examples): enhance metrics display in index_single example - Replace generic 'm' variable with descriptive 'metrics' variable - Add detailed metrics output including parse, build, enhance, enrich, optimize, and reasoning times - Include additional metrics for nodes processed, summaries generated, LLM calls, tokens generated, topics indexed, and keywords indexed --- rust/examples/index_single.rs | 18 ++++++++++++++++-- 1 file changed, 16 insertions(+), 2 deletions(-) diff --git a/rust/examples/index_single.rs b/rust/examples/index_single.rs index aa396395..8218cc00 100644 --- a/rust/examples/index_single.rs +++ b/rust/examples/index_single.rs @@ -58,8 +58,22 @@ The modular design allows independent scaling of each component. 
println!("doc_id: {}", item.doc_id); println!("name: {}", item.name); println!("format: {:?}", item.format); - if let Some(m) = &item.metrics { - println!("time: {}ms, nodes: {}", m.total_time_ms(), m.nodes_processed); + + if let Some(metrics) = &item.metrics { + println!(" metrics:"); + println!(" total time: {}ms", metrics.total_time_ms()); + println!(" parse: {}ms", metrics.parse_time_ms); + println!(" build: {}ms", metrics.build_time_ms); + println!(" enhance: {}ms", metrics.enhance_time_ms); + println!(" enrich: {}ms", metrics.enrich_time_ms); + println!(" optimize: {}ms", metrics.optimize_time_ms); + println!(" reasoning: {}ms", metrics.reasoning_index_time_ms); + println!(" nodes: {}", metrics.nodes_processed); + println!(" summaries: {}", metrics.summaries_generated); + println!(" llm calls: {}", metrics.llm_calls); + println!(" tokens: {}", metrics.total_tokens_generated); + println!(" topics: {}", metrics.topics_indexed); + println!(" keywords: {}", metrics.keywords_indexed); } } From d5263283c7b85ded5422071207ece900f84af8b9 Mon Sep 17 00:00:00 2001 From: zTgx <747674262@qq.com> Date: Sun, 12 Apr 2026 11:26:43 +0800 Subject: [PATCH 5/5] fix(rust/examples): update file paths in graph example Change file paths from relative to parent directory to ensure correct file location resolution. --- rust/examples/graph.rs | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/rust/examples/graph.rs b/rust/examples/graph.rs index 141b66df..cdefb451 100644 --- a/rust/examples/graph.rs +++ b/rust/examples/graph.rs @@ -30,7 +30,7 @@ async fn main() -> vectorless::Result<()> { // 2. Index documents — graph is rebuilt automatically let result = engine - .index(IndexContext::from_paths(&["./README.md", "./CLAUDE.md"])) + .index(IndexContext::from_paths(&["../README.md", "../CLAUDE.md"])) .await?; println!("Indexed {} document(s)", result.items.len());