From c56afafff7f35e98eba2d3aa412a2e6f06407655 Mon Sep 17 00:00:00 2001
From: Wesley Finck
Date: Sat, 18 Oct 2025 20:38:49 -0700
Subject: [PATCH 1/2] feat: Add comprehensive SemanticSearch.md design document
 for Logseq semantic search infrastructure

Co-authored-by: aider (anthropic/claude-sonnet-4-20250514)
---
 notes/features/SemanticSearch.md | 309 +++++++++++++++++++++++++++++++
 1 file changed, 309 insertions(+)
 create mode 100644 notes/features/SemanticSearch.md

diff --git a/notes/features/SemanticSearch.md b/notes/features/SemanticSearch.md
new file mode 100644
index 0000000..7515117
--- /dev/null
+++ b/notes/features/SemanticSearch.md
@@ -0,0 +1,309 @@

# Semantic Search Feature

## Overview

Semantic search system that converts Logseq blocks into vector embeddings for similarity-based retrieval. Uses fastembed-rs for local embedding generation and Qdrant for vector storage and search. Designed for hybrid search compatibility with future tantivy integration.

## Core Components

### Domain Layer

#### Value Objects
- `EmbeddingVector`: Wrapper around `Vec<f32>` with validation
- `ChunkId`: Unique identifier for text chunks (may be 1:1 or 1:many with BlockId)
- `SimilarityScore`: Normalized similarity score (0.0-1.0)
- `EmbeddingModel`: Enum of supported models (AllMiniLML6V2 as default)

#### Entities
- `TextChunk`: Represents a piece of text ready for embedding
  - Contains preprocessed text content
  - Maintains reference to source Block and Page
  - Handles text chunking logic for long blocks
  - Stores metadata for context reconstruction

#### Aggregates
- `EmbeddedBlock`: Aggregate containing Block + its embeddings
  - Manages relationship between Block and its TextChunks
  - Handles embedding lifecycle (create, update, delete)
  - Ensures consistency between block content and embeddings

### Application Layer

#### Use Cases
- `EmbedBlocks`: Convert blocks to embeddings and store in vector DB
- `SemanticSearch`: Query vector DB and return ranked results
- `UpdateEmbeddings`: Re-embed modified blocks
- `DeleteEmbeddings`: Remove embeddings for deleted blocks

#### DTOs
- `EmbeddingRequest`: Block content + metadata for embedding
- `SemanticSearchRequest`: Query text + search parameters
- `SemanticSearchResult`: Ranked results with similarity scores and context

#### Repositories
- `EmbeddingRepository`: Interface for vector storage operations
- `EmbeddingModelRepository`: Interface for embedding model management

### Infrastructure Layer

#### Embedding Service
- `FastEmbedService`: Wraps fastembed-rs for local embedding generation
- `EmbeddingModelManager`: Handles model loading and caching
- `TextPreprocessor`: Cleans and prepares text for embedding

#### Vector Database
- `QdrantVectorStore`: Qdrant client wrapper for vector operations
- `VectorCollectionManager`: Manages Qdrant collections and schemas

## Implementation Approach

### Text Preprocessing Pipeline
```rust
use regex::Regex;

impl TextPreprocessor {
    /// Produces cleaned, context-enriched text for a single block.
    /// Chunking of long blocks is handled downstream by `TextChunk`.
    pub fn preprocess_block(&self, block: &Block) -> String {
        let content = block.content().as_str();

        // 1. Remove Logseq-specific syntax
        let cleaned = self.remove_logseq_syntax(content);

        // 2. Extract and preserve important context
        self.add_context_markers(&cleaned, block)
    }

    fn remove_logseq_syntax(&self, text: &str) -> String {
        // Regexes are compiled per call for clarity; cache them in practice.
        // Remove [[page references]] but keep the text
        let text = Regex::new(r"\[\[([^\]]+)\]\]").unwrap().replace_all(text, "$1");
        // Remove #tags but keep the text
        let text = Regex::new(r"#([\w-]+)").unwrap().replace_all(&text, "$1");
        // Remove TODO/DONE markers for better embedding quality
        let text = Regex::new(r"(?m)^\s*(?:TODO|DOING|DONE)\s+")
            .unwrap()
            .replace_all(&text, "");
        text.into_owned()
    }

    fn add_context_markers(&self, text: &str, block: &Block) -> String {
        // Add page title as context, plus parent block context for nested
        // blocks, preserving important structural information.
        // `context_path()` is a sketched helper, e.g. "Page > Parent block".
        format!("{}: {}", block.context_path(), text)
    }
}
```
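To make the intended cleanup concrete, here is a unit-test sketch of the pipeline. `Block::test_fixture` is a hypothetical fixture helper (not part of the current domain model), and the assertions assume the regex-based cleanup above:

```rust
#[cfg(test)]
mod tests {
    use super::*;

    #[test]
    fn strips_logseq_syntax_but_keeps_text() {
        // Hypothetical fixture helper, used only for this sketch.
        let block = Block::test_fixture("TODO Read [[Domain-Driven Design]] #rust");

        let preprocessed = TextPreprocessor::new().preprocess_block(&block);

        // References, tags, and task markers disappear; the words remain.
        assert!(preprocessed.contains("Read Domain-Driven Design rust"));
        assert!(!preprocessed.contains("[["));
        assert!(!preprocessed.contains('#'));
    }
}
```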
### Chunking Strategy
```rust
impl TextChunk {
    const MAX_CHUNK_SIZE: usize = 512; // tokens, roughly 400 words
    const OVERLAP_SIZE: usize = 50; // token overlap between chunks

    pub fn from_block(block: &Block, page_title: &str) -> Vec<TextChunk> {
        let preprocessed = TextPreprocessor::new().preprocess_block(block);

        // `estimate_tokens` is a cheap whitespace-based approximation.
        if Self::estimate_tokens(&preprocessed) <= Self::MAX_CHUNK_SIZE {
            // Single chunk
            vec![Self::new_single(block, page_title, preprocessed)]
        } else {
            // Multiple chunks with overlap
            Self::create_overlapping_chunks(block, page_title, preprocessed)
        }
    }
}
```
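The overlap logic is small enough to sketch in full. A minimal `create_overlapping_chunks` under the same whitespace-token assumption; `new_chunk` stands in for a constructor that records chunk-index metadata:

```rust
impl TextChunk {
    fn create_overlapping_chunks(
        block: &Block,
        page_title: &str,
        text: String,
    ) -> Vec<TextChunk> {
        let tokens: Vec<&str> = text.split_whitespace().collect();
        // Each window starts OVERLAP_SIZE tokens before the previous one ended.
        let stride = Self::MAX_CHUNK_SIZE - Self::OVERLAP_SIZE;
        let mut chunks = Vec::new();
        let mut start = 0;

        while start < tokens.len() {
            let end = (start + Self::MAX_CHUNK_SIZE).min(tokens.len());
            let chunk_text = tokens[start..end].join(" ");
            let index = chunks.len();
            // `new_chunk` is assumed to stamp chunk_index/total_chunks metadata.
            chunks.push(Self::new_chunk(block, page_title, chunk_text, index));
            if end == tokens.len() {
                break;
            }
            start += stride;
        }

        chunks
    }
}
```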
### Embedding Generation
```rust
impl EmbedBlocks {
    pub async fn execute(&self, blocks: Vec<Block>) -> DomainResult<()> {
        // 1. Preprocess blocks into chunks
        let chunks = self.create_chunks_from_blocks(blocks);

        // 2. Generate embeddings in batches
        let batch_size = 32; // Optimize for fastembed performance
        for chunk_batch in chunks.chunks(batch_size) {
            let texts: Vec<String> = chunk_batch.iter()
                .map(|c| c.preprocessed_text().to_string())
                .collect();

            let embeddings = self.embedding_service
                .generate_embeddings(texts)
                .await?;

            // 3. Store in vector database
            self.store_embeddings(chunk_batch, embeddings).await?;
        }

        Ok(())
    }
}
```

### Semantic Search
```rust
impl SemanticSearch {
    pub async fn execute(
        &self,
        request: SemanticSearchRequest,
    ) -> DomainResult<Vec<SemanticSearchResult>> {
        // 1. Generate query embedding
        let query_embedding = self.embedding_service
            .generate_embeddings(vec![request.query])
            .await?
            .into_iter()
            .next()
            .expect("embedding service returns one vector per input text");

        // 2. Search vector database
        let vector_results = self.vector_store
            .similarity_search(query_embedding, request.limit)
            .await?;

        // 3. Reconstruct context and rank results
        let results = self.build_search_results(vector_results).await?;

        Ok(results)
    }

    // `VectorSearchResult` is the raw hit from the vector store (chunk id + score).
    async fn build_search_results(
        &self,
        vector_results: Vec<VectorSearchResult>,
    ) -> DomainResult<Vec<SemanticSearchResult>> {
        let mut results = Vec::new();

        for vector_result in vector_results {
            // Get original block and page context
            let chunk = self.embedding_repository
                .get_chunk_by_id(&vector_result.chunk_id)
                .await?;

            let block = self.page_repository
                .find_block_by_id(&chunk.block_id)
                .await?;

            let page = self.page_repository
                .find_by_id(&chunk.page_id)
                .await?;

            results.push(SemanticSearchResult {
                block_id: chunk.block_id.clone(),
                page_id: chunk.page_id.clone(),
                page_title: page.title().to_string(),
                block_content: block.content().as_str().to_string(),
                chunk_text: chunk.preprocessed_text().to_string(),
                similarity_score: SimilarityScore::new(vector_result.score),
                hierarchy_path: page.get_hierarchy_path(&chunk.block_id)
                    .iter()
                    .map(|b| b.content().as_str().to_string())
                    .collect(),
            });
        }

        Ok(results)
    }
}
```

## Data Storage Strategy

### Qdrant Collection Schema
```jsonc
// Collection: "logseq_blocks"
// Vector dimension: 384 (for all-MiniLM-L6-v2)
// Distance metric: Cosine similarity

// Payload structure:
{
  "chunk_id": "block-123-chunk-0",
  "block_id": "block-123",
  "page_id": "page-456",
  "page_title": "Programming Notes",
  "chunk_index": 0, // For multi-chunk blocks
  "total_chunks": 1,
  "original_content": "Original block text...",
  "preprocessed_content": "Cleaned text for embedding...",
  "hierarchy_path": ["Parent block", "Current block"],
  "created_at": "2025-10-18T10:00:00Z",
  "updated_at": "2025-10-18T10:00:00Z"
}
```
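Creating this collection with `qdrant-client` might look like the following sketch. It assumes the builder API of recent 1.x releases and a reachable Qdrant endpoint; the URL is a placeholder:

```rust
use qdrant_client::Qdrant;
use qdrant_client::qdrant::{CreateCollectionBuilder, Distance, VectorParamsBuilder};

async fn ensure_collection() -> Result<(), qdrant_client::QdrantError> {
    // Placeholder endpoint; the design targets local storage.
    let client = Qdrant::from_url("http://localhost:6334").build()?;

    // 384-dimensional vectors with cosine distance, matching all-MiniLM-L6-v2.
    client
        .create_collection(
            CreateCollectionBuilder::new("logseq_blocks")
                .vectors_config(VectorParamsBuilder::new(384, Distance::Cosine)),
        )
        .await?;

    Ok(())
}
```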
### Embedding Model Configuration
- **Default Model**: `all-MiniLM-L6-v2` (384 dimensions)
  - Good balance of quality and speed
  - Suitable for personal knowledge bases
  - ~25MB model size
- **Alternative Models**: Support for BGE-small-en-v1.5, nomic-embed-text-v1
- **Model Selection**: Configurable via application settings

## Integration Points

### With Existing Domain
- Extends existing `Block` and `Page` aggregates
- Uses existing `PageRepository` for context reconstruction
- Integrates with file sync system for incremental updates

### With Future Hybrid Search
- Semantic results can be combined with tantivy keyword results
- Shared result ranking and fusion logic
- Common search result DTOs

### With Tauri Frontend
- Async search commands with progress reporting
- Embedding status and model management
- Search result streaming for large result sets

## Performance Considerations

### Embedding Generation
- Batch processing for efficiency (32 blocks per batch)
- Async processing to avoid blocking UI
- Model caching to avoid repeated loading

### Vector Search
- Qdrant's HNSW index for fast similarity search
- Configurable search parameters (ef, m values)
- Result caching for repeated queries

### Storage Optimization
- Quantization options for reduced storage
- Periodic index optimization
- Cleanup of orphaned embeddings

## Error Handling

### Embedding Failures
- Continue processing other blocks if one fails
- Retry logic for transient failures
- Fallback to keyword search if embedding unavailable

### Vector Database Issues
- Graceful degradation to traditional search
- Connection retry with exponential backoff
- Data consistency checks and repair

## Testing Strategy

### Unit Tests
- Text preprocessing logic
- Chunking algorithms
- Embedding generation (with mock models)
- Search result reconstruction

### Integration Tests
- End-to-end embedding and search flow
- Performance benchmarks with realistic data
- Error scenario handling

### Test Data
- Sample Logseq blocks with various content types
- Long blocks requiring chunking
- Blocks with complex Logseq syntax

## Key Simplifications

**Removed:**
- Complex embedding model management
- Sophisticated chunking strategies
- Advanced vector database optimizations
- Detailed analytics and monitoring
- Multi-language support

**Kept:**
- Clean text preprocessing
- Efficient batching and async processing
- Context preservation for search results
- Integration with existing domain model
- Hybrid search compatibility

This approach provides high-quality semantic search with reasonable complexity for a personal knowledge management tool.

From 7bda83c2e479c361950abd6a4d2f81d309b71a5c Mon Sep 17 00:00:00 2001
From: Wesley Finck
Date: Sat, 18 Oct 2025 20:40:51 -0700
Subject: [PATCH 2/2] feat: Add comprehensive semantic search implementation
 work plan

Co-authored-by: aider (anthropic/claude-sonnet-4-20250514)
---
 .../issues/semantic-search-implementation.md | 217 ++++++++++++++++++
 1 file changed, 217 insertions(+)
 create mode 100644 notes/issues/semantic-search-implementation.md

diff --git a/notes/issues/semantic-search-implementation.md b/notes/issues/semantic-search-implementation.md
new file mode 100644
index 0000000..d538dbf
--- /dev/null
+++ b/notes/issues/semantic-search-implementation.md
@@ -0,0 +1,217 @@

# Title: Implement Semantic Search with fastembed-rs + Qdrant using Simplified DDD Architecture

## Description

Build the semantic search system for Logseq notes using local vector embeddings and similarity search. This extends our existing domain model with embedding capabilities while maintaining the pragmatic DDD approach established in the import/sync system.

## Core Requirements

### 1. Semantic Search Domain Extensions

**New Value Objects:**
- `EmbeddingVector`: Wrapper around `Vec<f32>` with validation (384 dimensions for all-MiniLM-L6-v2); a validation sketch follows this section
- `ChunkId`: Unique identifier for text chunks (may be 1:1 or 1:many with BlockId)
- `SimilarityScore`: Normalized similarity score (0.0-1.0)
- `EmbeddingModel`: Enum of supported models (AllMiniLML6V2 as default)

**New Entities:**
- `TextChunk`: Represents preprocessed text ready for embedding
  - Contains cleaned text content
  - Maintains reference to source Block and Page
  - Handles chunking logic for long blocks (>512 tokens)
  - Stores metadata for context reconstruction

**New Aggregates:**
- `EmbeddedBlock`: Aggregate containing Block + its embeddings
  - Manages relationship between Block and its TextChunks
  - Handles embedding lifecycle (create, update, delete)
  - Ensures consistency between block content and embeddings
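A minimal sketch of the two simplest value objects, to pin down the validation rules. The constructor shapes and the `DomainError` variant are assumptions to be adapted to the existing error type:

```rust
pub struct EmbeddingVector(Vec<f32>);

impl EmbeddingVector {
    pub const DIMENSIONS: usize = 384; // all-MiniLM-L6-v2 output size

    pub fn new(values: Vec<f32>) -> Result<Self, DomainError> {
        if values.len() != Self::DIMENSIONS {
            // Hypothetical error variant; adapt to the existing DomainError.
            return Err(DomainError::InvalidEmbeddingDimensions(values.len()));
        }
        Ok(Self(values))
    }
}

pub struct SimilarityScore(f64);

impl SimilarityScore {
    pub fn new(raw: f64) -> Self {
        // The design normalizes scores to 0.0-1.0; clamp defensively.
        Self(raw.clamp(0.0, 1.0))
    }
}
```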
### 2. Application Layer Use Cases

**EmbedBlocks UseCase:**
- Convert blocks to embeddings and store in vector DB
- Batch processing (32 blocks per batch for efficiency)
- Text preprocessing pipeline (remove Logseq syntax, add context)
- Handle chunking for long blocks with overlap strategy
- Async processing to avoid blocking UI

**SemanticSearch UseCase:**
- Query vector DB and return ranked results with context
- Generate query embedding using same model
- Reconstruct full context (page title, hierarchy path, related refs)
- Combine with existing search DTOs for unified results

**UpdateEmbeddings UseCase:**
- Re-embed modified blocks (triggered by file sync events)
- Incremental updates to avoid full re-indexing
- Cleanup orphaned embeddings for deleted blocks

**DeleteEmbeddings UseCase:**
- Remove embeddings for deleted blocks/pages
- Maintain vector DB consistency with domain model

### 3. Infrastructure Layer

**Embedding Service (fastembed-rs):**
- `FastEmbedService`: Local embedding generation wrapper
- `EmbeddingModelManager`: Model loading and caching (~25MB download on first run)
- `TextPreprocessor`: Clean Logseq syntax while preserving context
  - Remove [[page references]] brackets but keep text
  - Remove #tags formatting but keep text
  - Remove TODO/DONE markers
  - Add page title and parent block context
  - Handle chunking with 50-token overlap

**Vector Database (Qdrant embedded):**
- `QdrantVectorStore`: Local file-based vector storage
- `VectorCollectionManager`: Collection schema and lifecycle management
- Store in app data directory (no external database needed)
- Cosine similarity search with configurable parameters

### 4. Integration Points

**With Existing Domain:**
- Extend existing `Block` and `Page` aggregates with embedding methods
- Use existing `PageRepository` for context reconstruction
- Integrate with file sync system for incremental updates

**With Future Hybrid Search:**
- Semantic results combine with tantivy keyword results
- Shared result ranking and fusion logic (Reciprocal Rank Fusion); see the sketch after this section
- Common search result DTOs and interfaces

**With Tauri Frontend:**
- Async search commands with progress reporting
- Embedding status and model management UI
- Search result streaming for large result sets
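Since hybrid search hinges on Reciprocal Rank Fusion, here is a minimal sketch of the fusion step. The `String` block ids are a simplification, and `k = 60` is the conventional default for RRF:

```rust
use std::collections::HashMap;

/// Reciprocal Rank Fusion: score(d) = sum over lists of 1 / (k + rank(d)).
/// `ranked_lists` holds block ids in rank order, one list per backend
/// (semantic, keyword).
fn reciprocal_rank_fusion(ranked_lists: &[Vec<String>], k: f64) -> Vec<(String, f64)> {
    let mut scores: HashMap<String, f64> = HashMap::new();

    for list in ranked_lists {
        for (rank, block_id) in list.iter().enumerate() {
            // Ranks are 1-based in the RRF formula.
            *scores.entry(block_id.clone()).or_insert(0.0) += 1.0 / (k + (rank + 1) as f64);
        }
    }

    // Highest fused score first.
    let mut fused: Vec<(String, f64)> = scores.into_iter().collect();
    fused.sort_by(|a, b| b.1.total_cmp(&a.1));
    fused
}
```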
## Technical Stack

- **Embedding Generation**: fastembed-rs (all-MiniLM-L6-v2 model, 384 dimensions)
- **Vector Storage**: Qdrant embedded mode (file-based, no server needed)
- **Text Processing**: Custom preprocessing pipeline for Logseq syntax
- **Persistence**: Qdrant's native file storage + existing SQLite for metadata
- **Integration**: Direct integration with existing PageRepository and sync system

## Architecture Notes

**Pragmatic DDD Approach:**
- Extend existing domain objects rather than creating parallel structures
- Use existing repositories and services where possible
- Keep embedding logic separate but integrated with core domain
- Focus on testability with mockable embedding and vector services

**Performance Considerations:**
- Batch embedding generation (32 blocks per batch)
- Async processing with progress reporting
- Model caching to avoid repeated loading
- Incremental updates for file changes
- Local storage optimization (quantization options)

**Error Handling Strategy:**
- Continue processing other blocks if one embedding fails
- Graceful degradation to keyword search if embeddings unavailable
- Retry logic for transient failures
- Connection retry with exponential backoff for vector DB; a sketch follows this list
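A sketch of that retry policy using tokio's timer; the operation and error types are generic placeholders rather than the project's actual traits:

```rust
use std::time::Duration;

/// Retries an async operation with exponential backoff (100ms, 200ms, 400ms...).
/// Intended for transient vector-DB connection errors; assumes max_attempts >= 1.
async fn retry_with_backoff<T, E, F, Fut>(mut op: F, max_attempts: u32) -> Result<T, E>
where
    F: FnMut() -> Fut,
    Fut: std::future::Future<Output = Result<T, E>>,
{
    let mut delay = Duration::from_millis(100);
    for attempt in 1..=max_attempts {
        match op().await {
            Ok(value) => return Ok(value),
            Err(err) if attempt == max_attempts => return Err(err),
            Err(_) => {
                tokio::time::sleep(delay).await;
                delay *= 2; // double the wait before the next attempt
            }
        }
    }
    unreachable!("loop returns within max_attempts")
}
```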
## Data Storage Strategy

**Qdrant Collection Schema:**
```json
{
  "collection": "logseq_blocks",
  "vector_size": 384,
  "distance": "Cosine",
  "payload": {
    "chunk_id": "block-123-chunk-0",
    "block_id": "block-123",
    "page_id": "page-456",
    "page_title": "Programming Notes",
    "chunk_index": 0,
    "total_chunks": 1,
    "original_content": "Original block text...",
    "preprocessed_content": "Cleaned text for embedding...",
    "hierarchy_path": ["Parent block", "Current block"],
    "created_at": "2025-10-18T10:00:00Z",
    "updated_at": "2025-10-18T10:00:00Z"
  }
}
```

**Storage Locations:**
- **macOS**: `~/Library/Application Support/com.logseq-search/qdrant_storage/`
- **Windows**: `%APPDATA%\com.logseq-search\qdrant_storage\`
- **Linux**: `~/.local/share/com.logseq-search/qdrant_storage/`

## Testing Requirements

**Unit Tests:**
- Text preprocessing logic (Logseq syntax removal, context addition)
- Chunking algorithms (long blocks, overlap strategy)
- Embedding generation (with mock models)
- Search result reconstruction and ranking
- Vector storage operations (with mock Qdrant)

**Integration Tests:**
- End-to-end embedding and search flow
- Performance benchmarks with realistic data (1000+ blocks)
- Error scenario handling (model loading failures, vector DB issues)
- Incremental update workflows
- Memory usage and storage optimization

**Test Data:**
- Sample Logseq blocks with various content types
- Long blocks requiring chunking (>512 tokens)
- Blocks with complex Logseq syntax ([[refs]], #tags, URLs)
- Hierarchical block structures with context

## Implementation Phases

**Phase 1: Core Infrastructure**
- Set up fastembed-rs integration and model management
- Implement Qdrant embedded storage
- Create basic text preprocessing pipeline
- Build embedding generation service

**Phase 2: Domain Integration**
- Extend existing domain objects with embedding capabilities
- Implement EmbedBlocks and SemanticSearch use cases
- Create vector repository interfaces
- Add embedding lifecycle management

**Phase 3: Search Integration**
- Integrate semantic search with existing search system
- Implement result fusion and ranking
- Add search result DTOs and context reconstruction
- Build Tauri command interfaces

**Phase 4: Optimization & Polish**
- Performance optimization and benchmarking
- Error handling and recovery mechanisms
- Progress reporting and status management
- Documentation and testing completion

## Success Criteria

- **Functionality**: Users can perform semantic search across their Logseq notes
- **Performance**: Search results return in <200ms for typical personal knowledge bases
- **Accuracy**: Semantic search finds conceptually related content beyond keyword matching
- **Integration**: Seamless integration with existing import/sync system
- **Reliability**: Robust error handling and graceful degradation
- **Maintainability**: Clean, testable code following established DDD patterns

## Dependencies

- `fastembed = "3.0"` - Local embedding generation
- `qdrant-client = "1.11"` - Vector database client
- `tokio` - Async runtime (already in project)
- `serde` - Serialization (already in project)

## Notes for Implementation

This builds directly on the existing domain model and repository patterns established in the import/sync system. The semantic search capabilities are designed as extensions rather than replacements, allowing for future hybrid search implementations.

The focus remains on pragmatic DDD - we want the benefits of clean architecture without over-engineering for a personal project. All components should be testable, maintainable, and performant for typical personal knowledge base sizes (1K-10K notes).

See `./notes/features/SemanticSearch.md` for detailed architectural guidance and `./notes/dependencies/fastembed-ts.md` and `./notes/dependencies/qdrant.md` for technical implementation details.