From b217f01b4d56381b393eb197648f88ef290398f0 Mon Sep 17 00:00:00 2001 From: Reese Date: Wed, 26 Nov 2025 15:24:29 +0000 Subject: [PATCH 01/25] Fix collection configuration mismatch between memory server and indexer for ReFRAG mode Problem: - Memory server created collections with only [dense, lex] vectors, ignoring REFRAG_MODE - Indexer expected [dense, lex, mini] vectors, causing "Not existing vector name: mini" errors - Qdrant doesn't support adding new vector names to existing collections via update_collection() - update_collection() failed silently, leading to indexing failures and loops - Collection recreation attempts were failing, causing indexing to get stuck Solution: - Add REFRAG_MODE support to mcp_memory_server.py _ensure_collection() function - Implement memory backup/restore system in ingest_code.py ensure_collection() for future recreation needs - Export memories (points without file_path) before any recreation attempt - Restore memories with partial vector support for new configurations - Add proper error handling and logging for collection recreation scenarios Impact: - Memory server now creates collections with correct [dense, lex, mini] vectors from start - Eliminates indexing failures and loops caused by missing mini vector - Fixes HTTP 400 errors about missing "mini" vector - Enables proper ReFRAG mode functionality without requiring recreation - Preserves user memories during any future collection configuration changes - Backward compatible and future-proof for additional vector changes --- scripts/ingest_code.py | 153 ++++++++++++++++++++++++++++++++++- scripts/mcp_memory_server.py | 17 ++++ 2 files changed, 167 insertions(+), 3 deletions(-) diff --git a/scripts/ingest_code.py b/scripts/ingest_code.py index 3bdcdcb9..204b7abe 100644 --- a/scripts/ingest_code.py +++ b/scripts/ingest_code.py @@ -697,6 +697,11 @@ def generate_pseudo_tags(text: str) -> tuple[str, list[str]]: return pseudo, tags +class CollectionNeedsRecreateError(Exception): + 
"""Raised when a collection needs to be recreated to add new vector types.""" + pass + + def ensure_collection(client: QdrantClient, name: str, dim: int, vector_name: str): """Ensure collection exists with named vectors. Always includes dense (vector_name) and lexical (LEX_VECTOR_NAME). @@ -744,9 +749,86 @@ def ensure_collection(client: QdrantClient, name: str, dim: int, vector_name: st client.update_collection( collection_name=name, vectors_config=missing ) - except Exception: - # Best-effort; if server doesn't support adding vectors, leave to recreate path - pass + print(f"[COLLECTION_SUCCESS] Successfully updated collection {name} with missing vectors") + except Exception as update_e: + # Qdrant doesn't support adding new vector names to existing collections + # Fall back to recreating the collection with the correct vector configuration + print(f"[COLLECTION_WARNING] Cannot add missing vectors to {name} ({update_e}). Recreating collection...") + + # Backup memories before recreating collection + backup_file = None + try: + import tempfile + import json + from qdrant_client.models import Filter, FieldCondition, MatchValue, IsNull + + # Create temporary backup file + with tempfile.NamedTemporaryFile(mode='w', suffix='_memories_backup.json', delete=False) as f: + backup_file = f.name + + print(f"[MEMORY_BACKUP] Backing up memories from {name} to {backup_file}") + + # Export memories (points without file_path metadata) + points = client.scroll( + collection_name=name, + scroll_filter=Filter( + must=[ + IsNull(key="metadata.path") + ] + ), + limit=10000, # Adjust based on expected memory count + with_payload=True, + with_vectors=True + )[0] + + memories = [] + for point in points: + memory_data = { + "id": str(point.id), + "payload": point.payload, + "vectors": {} + } + + # Extract vectors if they exist + if hasattr(point, 'vector') and point.vector: + if isinstance(point.vector, dict): + memory_data["vectors"] = {k: v.tolist() if hasattr(v, 'tolist') else v + for 
k, v in point.vector.items()} + else: + # Single unnamed vector + memory_data["vectors"]["default"] = point.vector.tolist() if hasattr(point.vector, 'tolist') else point.vector + + memories.append(memory_data) + + with open(backup_file, 'w') as f: + json.dump({ + "collection": name, + "backup_time": datetime.now().isoformat(), + "memories": memories + }, f, indent=2) + + print(f"[MEMORY_BACKUP] Successfully backed up {len(memories)} memories") + + except Exception as backup_e: + print(f"[MEMORY_BACKUP_WARNING] Failed to backup memories: {backup_e}") + # Continue with recreation even if backup fails + + try: + client.delete_collection(name) + print(f"[COLLECTION_INFO] Deleted existing collection {name}") + except Exception: + pass + + # Store backup info for restoration + if backup_file: + setattr(CollectionNeedsRecreateError, 'backup_file', backup_file) + + # Proceed to recreate with full vector configuration + raise CollectionNeedsRecreateError(f"Collection {name} needs recreation for new vectors") + except CollectionNeedsRecreateError: + # Let this fall through to collection creation logic + print(f"[COLLECTION_INFO] Collection {name} needs recreation - proceeding...") + raise except Exception as e: print(f"[COLLECTION_ERROR] Failed to update collection {name}: {e}") pass @@ -780,6 +862,71 @@ def ensure_collection(client: QdrantClient, name: str, dim: int, vector_name: st vectors_config=vectors_cfg, hnsw_config=models.HnswConfigDiff(m=16, ef_construct=256), ) + print(f"[COLLECTION_INFO] Successfully created new collection {name} with vectors: {list(vectors_cfg.keys())}") + + # Restore memories if we have a backup from recreation + try: + backup_file = getattr(CollectionNeedsRecreateError, 'backup_file', None) + if backup_file and os.path.exists(backup_file): + print(f"[MEMORY_RESTORE] Restoring memories from {backup_file}") + import json + + with open(backup_file, 'r') as f: + backup_data = json.load(f) + + memories = backup_data.get('memories', []) + if 
memories: + from qdrant_client.models import PointStruct + points_to_upsert = [] + + for memory in memories: + # Convert vectors back to proper format + vectors = {} + for vector_name, vector_data in memory.get('vectors', {}).items(): + if isinstance(vector_data, list): + vectors[vector_name] = vector_data + else: + vectors[vector_name] = vector_data + + # Important: Only include vectors that existed in backup + # If new collection has additional vectors (e.g., 'mini'), + # Qdrant will handle them gracefully as missing vectors + point = PointStruct( + id=memory['id'], + payload=memory['payload'], + vector=vectors if vectors else {} # Ensure vector is never None + ) + points_to_upsert.append(point) + + # Batch restore memories + batch_size = 100 + for i in range(0, len(points_to_upsert), batch_size): + batch = points_to_upsert[i:i + batch_size] + client.upsert( + collection_name=name, + points=batch + ) + + print(f"[MEMORY_RESTORE] Successfully restored {len(points_to_upsert)} memories") + + # Clean up backup file and reset class attribute + try: + os.unlink(backup_file) + print(f"[MEMORY_RESTORE] Cleaned up backup file {backup_file}") + # Reset the backup file attribute to prevent accidental reuse + setattr(CollectionNeedsRecreateError, 'backup_file', None) + except Exception: + setattr(CollectionNeedsRecreateError, 'backup_file', None) + pass + + elif backup_file: + print(f"[MEMORY_RESTORE_WARNING] Backup file {backup_file} not found") + # Reset the backup file attribute even if file not found + setattr(CollectionNeedsRecreateError, 'backup_file', None) + + except Exception as restore_e: + print(f"[MEMORY_RESTORE_ERROR] Failed to restore memories: {restore_e}") + # Continue even if restore fails - indexing is more important def recreate_collection(client: QdrantClient, name: str, dim: int, vector_name: str): diff --git a/scripts/mcp_memory_server.py b/scripts/mcp_memory_server.py index 6777f16a..6644fe07 100644 --- a/scripts/mcp_memory_server.py +++ 
b/scripts/mcp_memory_server.py @@ -241,7 +241,24 @@ def _ensure_collection(name: str): VECTOR_NAME: models.VectorParams(size=int(dense_dim or 768), distance=models.Distance.COSINE), LEX_VECTOR_NAME: models.VectorParams(size=LEX_VECTOR_DIM, distance=models.Distance.COSINE), } + + # Add mini vector for ReFRAG mode (same logic as ingest_code.py) + try: + if os.environ.get("REFRAG_MODE", "").strip().lower() in { + "1", "true", "yes", "on" + }: + mini_vector_name = os.environ.get("MINI_VECTOR_NAME", "mini") + mini_vec_dim = int(os.environ.get("MINI_VEC_DIM", "64")) + vectors_cfg[mini_vector_name] = models.VectorParams( + size=mini_vec_dim, + distance=models.Distance.COSINE, + ) + except Exception: + pass + client.create_collection(collection_name=name, vectors_config=vectors_cfg) + vector_names = list(vectors_cfg.keys()) + print(f"[MEMORY_SERVER] Created collection '{name}' with vectors: {vector_names}") return True From 679532a680dac7e8e66287619f20eabfb47a81be Mon Sep 17 00:00:00 2001 From: Reese Date: Wed, 26 Nov 2025 16:53:17 +0000 Subject: [PATCH 02/25] Refactors memory backup and restore process Uses dedicated backup and restore scripts for handling memory persistence during collection recreation. This change replaces the in-line memory backup and restore logic with calls to separate, more robust and testable scripts (`memory_backup.py` and `memory_restore.py`). These scripts provide better error handling, logging, and are designed to be more resilient to changes in the Qdrant client. The scripts are now invoked as subprocesses, ensuring better isolation and management of the backup/restore operations. The ingest code now only handles the overall orchestration and error reporting. Adds `--skip-collection-creation` option to memory restore script to allow restoration of memories into a collection that's already initialized. This is specifically useful when `ingest_code.py` handles collection creation. 
This change improves maintainability and reduces the complexity of the `ingest_code.py` script. --- scripts/ingest_code.py | 123 +++++++++++--------------------------- scripts/memory_restore.py | 27 +++++++-- 2 files changed, 59 insertions(+), 91 deletions(-) diff --git a/scripts/ingest_code.py b/scripts/ingest_code.py index 204b7abe..c2b146ab 100644 --- a/scripts/ingest_code.py +++ b/scripts/ingest_code.py @@ -755,12 +755,12 @@ def ensure_collection(client: QdrantClient, name: str, dim: int, vector_name: st # Fall back to recreating the collection with the correct vector configuration print(f"[COLLECTION_WARNING] Cannot add missing vectors to {name} ({update_e}). Recreating collection...") - # Backup memories before recreating collection + # Backup memories before recreating collection using dedicated backup script backup_file = None try: import tempfile - import json - from qdrant_client.models import Filter, FieldCondition, MatchValue, IsNull + import subprocess + import sys # Create temporary backup file with tempfile.NamedTemporaryFile(mode='w', suffix='_memories_backup.json', delete=False) as f: @@ -768,50 +768,23 @@ def ensure_collection(client: QdrantClient, name: str, dim: int, vector_name: st print(f"[MEMORY_BACKUP] Backing up memories from {name} to {backup_file}") - # Export memories (points without file_path metadata) - points = client.scroll( - collection_name=name, - scroll_filter=Filter( - must=[ - IsNull(key="metadata.path") - ] - ), - limit=10000, # Adjust based on expected memory count - with_payload=True, - with_vectors=True - )[0] - - memories = [] - for point in points: - memory_data = { - "id": str(point.id), - "payload": point.payload, - "vectors": {} - } - - # Extract vectors if they exist - if hasattr(point, 'vector') and point.vector: - if isinstance(point.vector, dict): - memory_data["vectors"] = {k: v.tolist() if hasattr(v, 'tolist') else v - for k, v in point.vector.items()} - else: - # Single unnamed vector - 
memory_data["vectors"]["default"] = point.vector.tolist() if hasattr(point.vector, 'tolist') else point.vector - - memories.append(memory_data) - - with open(backup_file, 'w') as f: - json.dump({ - "collection": name, - "backup_time": datetime.now().isoformat(), - "memories": memories - }, f, indent=2) - - print(f"[MEMORY_BACKUP] Successfully backed up {len(memories)} memories") + # Use battle-tested backup script + backup_script = Path(__file__).parent / "memory_backup.py" + result = subprocess.run([ + sys.executable, str(backup_script), + "--collection", name, + "--output", backup_file + ], capture_output=True, text=True, cwd=Path(__file__).parent.parent) + + if result.returncode == 0: + print(f"[MEMORY_BACKUP] Successfully backed up memories using {backup_script.name}") + else: + print(f"[MEMORY_BACKUP_WARNING] Backup script failed: {result.stderr}") + backup_file = None except Exception as backup_e: print(f"[MEMORY_BACKUP_WARNING] Failed to backup memories: {backup_e}") - # Continue with recreation even if backup fails + backup_file = None try: client.delete_collection(name) @@ -864,60 +837,36 @@ def ensure_collection(client: QdrantClient, name: str, dim: int, vector_name: st ) print(f"[COLLECTION_INFO] Successfully created new collection {name} with vectors: {list(vectors_cfg.keys())}") - # Restore memories if we have a backup from recreation + # Restore memories if we have a backup from recreation using dedicated restore script try: backup_file = getattr(CollectionNeedsRecreateError, 'backup_file', None) if backup_file and os.path.exists(backup_file): print(f"[MEMORY_RESTORE] Restoring memories from {backup_file}") - import json - - with open(backup_file, 'r') as f: - backup_data = json.load(f) - - memories = backup_data.get('memories', []) - if memories: - from qdrant_client.models import PointStruct - points_to_upsert = [] - - for memory in memories: - # Convert vectors back to proper format - vectors = {} - for vector_name, vector_data in 
memory.get('vectors', {}).items(): - if isinstance(vector_data, list): - vectors[vector_name] = vector_data - else: - vectors[vector_name] = vector_data - - # Important: Only include vectors that existed in backup - # If new collection has additional vectors (e.g., 'mini'), - # Qdrant will handle them gracefully as missing vectors - point = PointStruct( - id=memory['id'], - payload=memory['payload'], - vector=vectors if vectors else {} # Ensure vector is never None - ) - points_to_upsert.append(point) - - # Batch restore memories - batch_size = 100 - for i in range(0, len(points_to_upsert), batch_size): - batch = points_to_upsert[i:i + batch_size] - client.upsert( - collection_name=name, - points=batch - ) - - print(f"[MEMORY_RESTORE] Successfully restored {len(points_to_upsert)} memories") + import subprocess + import sys + + # Use battle-tested restore script (skip collection creation since ingest_code.py already handles it) + restore_script = Path(__file__).parent / "memory_restore.py" + result = subprocess.run([ + sys.executable, str(restore_script), + "--backup", backup_file, + "--collection", name, + "--skip-collection-creation" + ], capture_output=True, text=True, cwd=Path(__file__).parent.parent) + + if result.returncode == 0: + print(f"[MEMORY_RESTORE] Successfully restored memories using {restore_script.name}") + else: + print(f"[MEMORY_RESTORE_WARNING] Restore script failed: {result.stderr}") # Clean up backup file and reset class attribute try: os.unlink(backup_file) print(f"[MEMORY_RESTORE] Cleaned up backup file {backup_file}") - # Reset the backup file attribute to prevent accidental reuse - setattr(CollectionNeedsRecreateError, 'backup_file', None) except Exception: - setattr(CollectionNeedsRecreateError, 'backup_file', None) pass + # Reset the backup file attribute to prevent accidental reuse + setattr(CollectionNeedsRecreateError, 'backup_file', None) elif backup_file: print(f"[MEMORY_RESTORE_WARNING] Backup file {backup_file} not found") diff 
--git a/scripts/memory_restore.py b/scripts/memory_restore.py index cacddeda..27fb34d5 100644 --- a/scripts/memory_restore.py +++ b/scripts/memory_restore.py @@ -98,7 +98,8 @@ def restore_memories( embedding_model_name: Optional[str] = None, vector_name: str = "memory", batch_size: int = 100, - skip_existing: bool = True + skip_existing: bool = True, + skip_collection_creation: bool = False ) -> Dict[str, Any]: """ Restore memories from backup file to Qdrant collection. @@ -111,6 +112,7 @@ def restore_memories( vector_name: Name for the memory vector in collection batch_size: Number of memories to upload per batch skip_existing: Skip memories that already exist in collection + skip_collection_creation: Skip collection creation (useful when collection is already configured) Returns: Dict with restore statistics @@ -166,8 +168,18 @@ def restore_memories( embedding_model = None print(f"Using vectors from backup, dimension: {vector_dimension}") - # Ensure collection exists - ensure_collection_exists(client, collection_name, vector_dimension, vector_name) + # Ensure collection exists (unless skipped) + if not skip_collection_creation: + ensure_collection_exists(client, collection_name, vector_dimension, vector_name) + else: + print(f"Skipping collection creation for '{collection_name}' (as requested)") + + # Verify collection actually exists when skipping creation + try: + client.get_collection(collection_name) + print(f"Confirmed collection '{collection_name}' exists") + except Exception: + raise RuntimeError(f"Collection '{collection_name}' does not exist but creation was skipped") # Check for existing memories if skip_existing is True existing_ids = set() @@ -327,6 +339,12 @@ def main(): help="Show backup file information without restoring" ) + parser.add_argument( + "--skip-collection-creation", + action="store_true", + help="Skip collection creation (useful when collection is already configured by other processes)" + ) + args = parser.parse_args() try: @@ -361,7 
+379,8 @@ def main(): embedding_model_name=args.embedding_model, vector_name=args.vector_name, batch_size=args.batch_size, - skip_existing=not args.no_skip_existing + skip_existing=not args.no_skip_existing, + skip_collection_creation=args.skip_collection_creation ) if result["success"]: From ddf5ff12bbc74a7c6bc96a1af9155387081c2abc Mon Sep 17 00:00:00 2001 From: Reese Date: Wed, 26 Nov 2025 19:44:14 +0000 Subject: [PATCH 03/25] Handles corrupted workspace cache files Adds a try-except block when loading the workspace cache to handle cases where the cache file is corrupt or empty. If an exception occurs during loading, recreates the cache --- scripts/workspace_state.py | 8 ++++++-- 1 file changed, 6 insertions(+), 2 deletions(-) diff --git a/scripts/workspace_state.py b/scripts/workspace_state.py index 99209c50..fab94167 100644 --- a/scripts/workspace_state.py +++ b/scripts/workspace_state.py @@ -651,8 +651,12 @@ def set_cached_file_hash(file_path: str, file_hash: str, repo_name: Optional[str try: if cache_path.exists(): - with open(cache_path, "r", encoding="utf-8") as f: - cache = json.load(f) + try: + with open(cache_path, "r", encoding="utf-8") as f: + cache = json.load(f) + except Exception: + # If the existing cache is corrupt/empty, recreate it + cache = {"file_hashes": {}, "created_at": datetime.now().isoformat()} else: cache = {"file_hashes": {}, "created_at": datetime.now().isoformat()} From 7dc72ddb80dcf1c8bf4e4c633e876f5b06859efa Mon Sep 17 00:00:00 2001 From: Reese Date: Wed, 26 Nov 2025 21:36:14 +0000 Subject: [PATCH 04/25] Implements smart code re-indexing Implements smarter re-indexing strategy that reuses embeddings and reduces unnecessary re-indexing by leveraging a symbol cache. This change introduces symbol extraction using tree-sitter to identify functions, classes, and methods in code files. 
It compares the symbols against a cache to determine which parts of the code have changed, allowing for targeted re-indexing of only the modified sections. This significantly reduces the processing time and resource consumption associated with indexing large codebases. Adds the ability to reuse existing embeddings/lexical vectors for unchanged code chunks (identified by code content), and re-embed only changed chunks improving efficiency and overall performance. Also, includes logic for improved pseudo-tag generation. --- .env.example | 2 + docker-compose.dev-remote.yml | 1 + scripts/ingest_code.py | 693 +++++++++++++++++++++++++++++++++- scripts/watch_index.py | 96 ++++- scripts/workspace_state.py | 181 +++++++++ 5 files changed, 946 insertions(+), 27 deletions(-) diff --git a/.env.example b/.env.example index 52f6b595..cd60947e 100644 --- a/.env.example +++ b/.env.example @@ -153,6 +153,8 @@ QDRANT_TIMEOUT=20 MEMORY_AUTODETECT=1 MEMORY_COLLECTION_TTL_SECS=300 +# Smarter re-indexing for symbol cache, reuse embeddings and reduce decoder/pseudo tags to re-index +SMART_SYMBOL_REINDEXING=0 # Watcher-safe defaults (recommended) # Applied to watcher via compose; uncomment to apply globally. diff --git a/docker-compose.dev-remote.yml b/docker-compose.dev-remote.yml index 7b1630a4..8edd5034 100644 --- a/docker-compose.dev-remote.yml +++ b/docker-compose.dev-remote.yml @@ -215,6 +215,7 @@ services: context: . 
dockerfile: Dockerfile.indexer container_name: indexer-dev-remote + user: "1000:1000" depends_on: - qdrant env_file: diff --git a/scripts/ingest_code.py b/scripts/ingest_code.py index c2b146ab..9f34d987 100644 --- a/scripts/ingest_code.py +++ b/scripts/ingest_code.py @@ -61,6 +61,13 @@ def _detect_repo_name_from_path(path: Path) -> str: remove_cached_file, update_indexing_status, update_workspace_state, + get_cached_symbols, + set_cached_symbols, + remove_cached_symbols, + compare_symbol_changes, + get_cached_pseudo, + set_cached_pseudo, + update_symbols_with_pseudo, ) except ImportError: # State integration is optional; continue if not available @@ -70,6 +77,14 @@ def _detect_repo_name_from_path(path: Path) -> str: remove_cached_file = None # type: ignore update_indexing_status = None # type: ignore update_workspace_state = None # type: ignore + get_cached_symbols = None # type: ignore + set_cached_symbols = None # type: ignore + remove_cached_symbols = None # type: ignore + get_cached_pseudo = None # type: ignore + set_cached_pseudo = None # type: ignore + update_symbols_with_pseudo = None # type: ignore + compare_symbol_changes = None # type: ignore + compare_symbol_changes = None # type: ignore # Optional Tree-sitter import (graceful fallback) try: @@ -629,6 +644,97 @@ def _pseudo_describe_enabled() -> bool: return False +# ===== Symbol Extraction for Smart Reindexing ===== + +def _smart_symbol_reindexing_enabled() -> bool: + """Check if symbol-aware reindexing is enabled.""" + try: + return str(os.environ.get("SMART_SYMBOL_REINDEXING", "0")).strip().lower() in {"1","true","yes","on"} + except Exception: + return False + + +def extract_symbols_with_tree_sitter(file_path: str) -> dict: + """Extract functions, classes, methods from file using tree-sitter or fallback. 
+ + Returns: + dict: {symbol_id: {name, type, start_line, end_line, content_hash, pseudo, tags}} + """ + try: + # Read file content + text = Path(file_path).read_text(encoding="utf-8", errors="ignore") + language = detect_language(Path(file_path)) + + # Use existing symbol extraction infrastructure + symbols_list = _extract_symbols(language, text) + + # Convert to our expected dict format + symbols = {} + for sym in symbols_list: + symbol_id = f"{sym['kind']}_{sym['name']}_{sym['start']}" + + # Extract actual content for hashing + content_lines = text.split('\n')[sym['start']-1:sym['end']] + content = '\n'.join(content_lines) + content_hash = hashlib.sha1(content.encode('utf-8', errors='ignore')).hexdigest() + + symbols[symbol_id] = { + 'name': sym['name'], + 'type': sym['kind'], + 'start_line': sym['start'], + 'end_line': sym['end'], + 'content_hash': content_hash, + 'content': content, + # These will be populated during processing + 'pseudo': '', + 'tags': [], + 'qdrant_ids': [] # Will store Qdrant point IDs for this symbol + } + + return symbols + + except Exception as e: + print(f"[SYMBOL_EXTRACTION] Failed to extract symbols from {file_path}: {e}") + return {} + + +def should_use_smart_reindexing(file_path: str, file_hash: str) -> tuple[bool, str]: + """Determine if smart reindexing should be used for a file. 
+ + Returns: + (use_smart, reason) + """ + if not _smart_symbol_reindexing_enabled(): + return False, "smart_reindexing_disabled" + + if not get_cached_symbols or not set_cached_symbols: + return False, "symbol_cache_unavailable" + + # Load cached symbols + cached_symbols = get_cached_symbols(file_path) + if not cached_symbols: + return False, "no_cached_symbols" + + # Extract current symbols + current_symbols = extract_symbols_with_tree_sitter(file_path) + if not current_symbols: + return False, "no_current_symbols" + + # Compare symbols + unchanged_symbols, changed_symbols = compare_symbol_changes(cached_symbols, current_symbols) + + total_symbols = len(current_symbols) + changed_ratio = len(changed_symbols) / max(total_symbols, 1) + + # Use thresholds to decide strategy + max_changed_ratio = float(os.environ.get("MAX_CHANGED_SYMBOLS_RATIO", "0.3")) + if changed_ratio > max_changed_ratio: + return False, f"too_many_changes_{changed_ratio:.2f}" + + print(f"[SMART_REINDEX] {file_path}: {len(unchanged_symbols)} unchanged, {len(changed_symbols)} changed") + return True, f"smart_reindex_{len(changed_symbols)}/{total_symbols}" + + def generate_pseudo_tags(text: str) -> tuple[str, list[str]]: """Best-effort: ask local decoder to produce a short label and 3-6 tags. Returns (pseudo, tags). On failure returns ("", []).""" @@ -697,6 +803,57 @@ def generate_pseudo_tags(text: str) -> tuple[str, list[str]]: return pseudo, tags +def should_process_pseudo_for_chunk( + file_path: str, chunk: dict, changed_symbols: set +) -> tuple[bool, str, list[str]]: + """Determine if a chunk needs pseudo processing based on symbol changes AND pseudo cache. + + Uses existing symbol change detection and pseudo cache lookup for optimal performance. 
+ + Args: + file_path: Path to the file containing this chunk + chunk: Chunk dict with symbol information + changed_symbols: Set of symbol IDs that changed (from compare_symbol_changes) + + Returns: + (needs_processing, cached_pseudo, cached_tags) + """ + # For chunks without symbol info, process them (fallback - no symbol to reuse from) + symbol_name = chunk.get("symbol", "") + if not symbol_name: + return True, "", [] + + # Create symbol ID matching the format used in symbol cache + kind = chunk.get("kind", "unknown") + start_line = chunk.get("start", 0) + symbol_id = f"{kind}_{symbol_name}_{start_line}" + + # If we don't have any change information, best effort: try reusing cached pseudo when present + if not changed_symbols and get_cached_pseudo: + try: + cached_pseudo, cached_tags = get_cached_pseudo(file_path, symbol_id) + if cached_pseudo or cached_tags: + return False, cached_pseudo, cached_tags + except Exception: + pass + return True, "", [] + + # Unchanged symbol: prefer reuse when cached pseudo/tags exist + if symbol_id not in changed_symbols: + if get_cached_pseudo: + try: + cached_pseudo, cached_tags = get_cached_pseudo(file_path, symbol_id) + if cached_pseudo or cached_tags: + return False, cached_pseudo, cached_tags + except Exception: + pass + # Unchanged but no cached data yet – process once + return True, "", [] + + # Symbol content changed: always re-run pseudo; do not reuse stale cached values + return True, "", [] + + class CollectionNeedsRecreateError(Exception): """Raised when a collection needs to be recreated to add new vector types.""" pass @@ -1789,6 +1946,19 @@ def index_single_file( repo_tag = _detect_repo_name_from_path(file_path) + # Get changed symbols for pseudo processing optimization + changed_symbols = set() + if get_cached_symbols and set_cached_symbols: + cached_symbols = get_cached_symbols(str(file_path)) + if cached_symbols: + current_symbols = extract_symbols_with_tree_sitter(str(file_path)) + _, changed = 
compare_symbol_changes(cached_symbols, current_symbols) + # Convert symbol names to IDs for lookup + for symbol_data in current_symbols.values(): + symbol_id = f"{symbol_data['type']}_{symbol_data['name']}_{symbol_data['start_line']}" + if symbol_id in changed: + changed_symbols.add(symbol_id) + if skip_unchanged: # Prefer local workspace cache to avoid Qdrant lookups ws_path = os.environ.get("WATCH_ROOT") or os.environ.get("WORKSPACE_PATH") or "/work" @@ -1900,6 +2070,13 @@ def make_point(pid, dense_vec, lex_vec, payload): sym = ch.get("symbol") or sym if "symbol_path" in ch and ch.get("symbol_path"): sym_path = ch.get("symbol_path") or sym_path + # Ensure chunks always carry symbol metadata so pseudo gating can work for all chunking modes + if not ch.get("kind") and kind: + ch["kind"] = kind + if not ch.get("symbol") and sym: + ch["symbol"] = sym + if not ch.get("symbol_path") and sym_path: + ch["symbol_path"] = sym_path # Track both container path (/work mirror) and original host path for clarity across environments _cur_path = str(file_path) _host_root = str(os.environ.get("HOST_INDEX_PATH") or "").strip().rstrip("/") @@ -1976,15 +2153,32 @@ def make_point(pid, dense_vec, lex_vec, payload): }, } # Optional LLM enrichment for lexical retrieval: pseudo + tags per micro-chunk - pseudo, tags = ("", []) - try: - pseudo, tags = generate_pseudo_tags(ch.get("text") or "") - if pseudo: - payload["pseudo"] = pseudo - if tags: - payload["tags"] = tags - except Exception: - pass + # Use symbol-aware gating and cached pseudo/tags where possible + needs_pseudo, cached_pseudo, cached_tags = should_process_pseudo_for_chunk( + str(file_path), ch, changed_symbols + ) + pseudo, tags = cached_pseudo, cached_tags + if needs_pseudo: + try: + pseudo, tags = generate_pseudo_tags(ch.get("text") or "") + if pseudo or tags: + # Cache the pseudo data for this symbol + symbol_name = ch.get("symbol", "") + if symbol_name: + kind = ch.get("kind", "unknown") + start_line = ch.get("start", 0) 
+ symbol_id = f"{kind}_{symbol_name}_{start_line}" + + if set_cached_pseudo: + set_cached_pseudo(str(file_path), symbol_id, pseudo, tags, file_hash) + except Exception: + # Fall back to cached values (if any) or empty pseudo/tags + pass + # Attach whichever pseudo/tags we ended up with (cached or freshly generated) + if pseudo: + payload["pseudo"] = pseudo + if tags: + payload["tags"] = tags batch_texts.append(info) batch_meta.append(payload) batch_ids.append(hash_id(ch["text"], str(file_path), ch["start"], ch["end"])) @@ -2264,6 +2458,8 @@ def make_point(pid, dense_vec, lex_vec, payload): continue except Exception: pass + + # Check existing indexed hash in Qdrant prev = get_indexed_file_hash(client, current_collection, str(file_path)) if prev and file_hash and prev == file_hash: # File exists in Qdrant with same hash - cache it locally for next time @@ -2301,6 +2497,35 @@ def make_point(pid, dense_vec, lex_vec, payload): print(f"Skipping unchanged file: {file_path}") continue + # At this point, file content has changed vs previous index; attempt smart reindex when enabled + if _smart_symbol_reindexing_enabled(): + try: + use_smart, smart_reason = should_use_smart_reindexing(str(file_path), file_hash) + if use_smart: + print(f"[SMART_REINDEX] Using smart reindexing for {file_path} ({smart_reason})") + status = process_file_with_smart_reindexing( + file_path, + text, + language, + client, + current_collection, + per_file_repo, + model, + vector_name, + ) + if status == "success": + files_indexed += 1 + # Smart path handles point counts internally; skip full reindex for this file + continue + else: + print( + f"[SMART_REINDEX] Smart reindex failed for {file_path} (status={status}), falling back to full reindex" + ) + else: + print(f"[SMART_REINDEX] Using full reindexing for {file_path} ({smart_reason})") + except Exception as e: + print(f"[SMART_REINDEX] Smart reindexing failed, falling back to full reindex: {e}") + # Dedupe per-file by deleting previous points for 
this path (default) if dedupe: delete_points_by_path(client, current_collection, str(file_path)) @@ -2310,6 +2535,19 @@ def make_point(pid, dense_vec, lex_vec, payload): imports, calls = _get_imports_calls(language, text) last_mod, churn_count, author_count = _git_metadata(file_path) + # Get changed symbols for pseudo processing optimization (reuse existing pattern) + changed_symbols = set() + if get_cached_symbols and set_cached_symbols: + cached_symbols = get_cached_symbols(str(file_path)) + if cached_symbols: + current_symbols = extract_symbols_with_tree_sitter(str(file_path)) + _, changed = compare_symbol_changes(cached_symbols, current_symbols) + # Convert symbol names to IDs for lookup + for symbol_data in current_symbols.values(): + symbol_id = f"{symbol_data['type']}_{symbol_data['name']}_{symbol_data['start_line']}" + if symbol_id in changed: + changed_symbols.add(symbol_id) + # Micro-chunking (token-based) takes precedence; else semantic; else line-based use_micro = os.environ.get("INDEX_MICRO_CHUNKS", "0").lower() in { "1", @@ -2368,6 +2606,13 @@ def make_point(pid, dense_vec, lex_vec, payload): sym = ch.get("symbol") or sym if "symbol_path" in ch and ch.get("symbol_path"): sym_path = ch.get("symbol_path") or sym_path + # Ensure chunks carry symbol metadata so pseudo gating works across all chunking modes + if not ch.get("kind") and kind: + ch["kind"] = kind + if not ch.get("symbol") and sym: + ch["symbol"] = sym + if not ch.get("symbol_path") and sym_path: + ch["symbol_path"] = sym_path # Track both container path (/work mirror) and original host path _cur_path = str(file_path) _host_root = str(os.environ.get("HOST_INDEX_PATH") or "").strip().rstrip("/") @@ -2435,15 +2680,28 @@ def make_point(pid, dense_vec, lex_vec, payload): }, } # Optional LLM enrichment for lexical retrieval: pseudo + tags per micro-chunk - pseudo, tags = ("", []) - try: - pseudo, tags = generate_pseudo_tags(ch.get("text") or "") - if pseudo: - payload["pseudo"] = pseudo - if tags: 
- payload["tags"] = tags - except Exception: - pass + # Use symbol-aware gating and cached pseudo/tags where possible + needs_pseudo, cached_pseudo, cached_tags = should_process_pseudo_for_chunk( + str(file_path), ch, changed_symbols + ) + pseudo, tags = cached_pseudo, cached_tags + if needs_pseudo: + try: + pseudo, tags = generate_pseudo_tags(ch.get("text") or "") + if pseudo or tags: + symbol_name = ch.get("symbol", "") + if symbol_name: + kind = ch.get("kind", "unknown") + start_line = ch.get("start", 0) + symbol_id = f"{kind}_{symbol_name}_{start_line}" + if set_cached_pseudo: + set_cached_pseudo(str(file_path), symbol_id, pseudo, tags, file_hash) + except Exception: + pass + if pseudo: + payload["pseudo"] = pseudo + if tags: + payload["tags"] = tags batch_texts.append(info) batch_meta.append(payload) # Track per-file latest hash once we add the first chunk to any batch @@ -2543,6 +2801,30 @@ def make_point(pid, dense_vec, lex_vec, payload): set_cached_file_hash(_p, _h, per_file_repo) except Exception: continue + + # NEW: Update symbol cache for files that were processed + if set_cached_symbols and _smart_symbol_reindexing_enabled(): + try: + # Process files that had chunks and extract/update their symbol cache + processed_files = set(str(Path(_p).resolve()) for _p in batch_file_hashes.keys()) + + for file_path_str in processed_files: + try: + # Extract current symbols for this file + current_symbols = extract_symbols_with_tree_sitter(file_path_str) + if current_symbols: + # Generate file hash for this file + with open(file_path_str, 'r', encoding='utf-8') as f: + content = f.read() + file_hash = hashlib.sha1(content.encode('utf-8', errors='ignore')).hexdigest() + + # Save symbol cache + set_cached_symbols(file_path_str, current_symbols, file_hash) + print(f"[SYMBOL_CACHE] Updated symbols for {Path(file_path_str).name}: {len(current_symbols)} symbols") + except Exception as e: + print(f"[SYMBOL_CACHE] Failed to update symbols for {Path(_p).name}: {e}") + except 
Exception as e: + print(f"[SYMBOL_CACHE] Symbol cache update failed: {e}") except Exception: pass @@ -2591,6 +2873,381 @@ def make_point(pid, dense_vec, lex_vec, payload): print(f"[ERROR] Traceback: {traceback.format_exc()}") +def process_file_with_smart_reindexing( + file_path, + text: str, + language: str, + client: QdrantClient, + current_collection: str, + per_file_repo, + model: TextEmbedding, + vector_name: str | None, +) -> str: + """Smart, chunk-level reindexing for a single file. + + Rebuilds all points for the file with *accurate* line numbers while: + - Reusing existing embeddings/lexical vectors for unchanged chunks (by code content), and + - Re-embedding only for changed chunks. + + Symbol cache is used to gate pseudo/tag generation, but embedding reuse is decided + at the chunk level by matching previous chunk code. + """ + try: + print(f"[SMART_REINDEX] Processing {file_path} with chunk-level reindexing") + + # Normalize path / types + try: + fp = str(file_path) + except Exception: + fp = str(file_path) + try: + if not isinstance(file_path, Path): + file_path = Path(fp) + except Exception: + file_path = Path(fp) + + # Compute current file hash + file_hash = hashlib.sha1(text.encode("utf-8", errors="ignore")).hexdigest() + + # Extract current symbols for diffing (dict) and for chunk mapping (List[_Sym]) + symbol_meta = extract_symbols_with_tree_sitter(fp) + if not symbol_meta: + print(f"[SMART_REINDEX] No symbols found in {file_path}, falling back to full reindex") + return "failed" + + # Use the dict-style symbol_meta for cache diffing + cached_symbols = get_cached_symbols(fp) if get_cached_symbols else {} + unchanged_symbols: list[str] = [] + changed_symbols: list[str] = [] + if cached_symbols and compare_symbol_changes: + try: + unchanged_symbols, changed_symbols = compare_symbol_changes( + cached_symbols, symbol_meta + ) + except Exception: + # On failure, treat everything as changed + unchanged_symbols = [] + changed_symbols = 
list(symbol_meta.keys()) + else: + changed_symbols = list(symbol_meta.keys()) + changed_set = set(changed_symbols) + + # Load existing points for this file (for embedding reuse) + existing_points = [] + try: + filt = models.Filter( + must=[ + models.FieldCondition( + key="metadata.path", match=models.MatchValue(value=fp) + ) + ] + ) + next_offset = None + while True: + pts, next_offset = client.scroll( + collection_name=current_collection, + scroll_filter=filt, + with_payload=True, + with_vectors=True, + limit=256, + offset=next_offset, + ) + if not pts: + break + existing_points.extend(pts) + if next_offset is None: + break + except Exception as e: + print(f"[SMART_REINDEX] Failed to load existing points for {file_path}: {e}") + existing_points = [] + + # Index existing points by (symbol_id, code) for reuse + points_by_code: dict[tuple[str, str], list[models.Record]] = {} + try: + for rec in existing_points: + payload = rec.payload or {} + md = payload.get("metadata") or {} + code_text = md.get("code") or "" + kind = md.get("kind") or "" + sym_name = md.get("symbol") or "" + start_line = md.get("start_line") or 0 + symbol_id = ( + f"{kind}_{sym_name}_{start_line}" + if kind and sym_name and start_line + else "" + ) + key = (symbol_id, code_text) if symbol_id else ("", code_text) + points_by_code.setdefault(key, []).append(rec) + except Exception: + points_by_code = {} + + # Chunk current file using the same strategy as normal indexing + CHUNK_LINES = int(os.environ.get("INDEX_CHUNK_LINES", "120") or 120) + CHUNK_OVERLAP = int(os.environ.get("INDEX_CHUNK_OVERLAP", "20") or 20) + use_micro = os.environ.get("INDEX_MICRO_CHUNKS", "0").lower() in { + "1", + "true", + "yes", + "on", + } + use_semantic = os.environ.get("INDEX_SEMANTIC_CHUNKS", "1").lower() in { + "1", + "true", + "yes", + "on", + } + + if use_micro: + chunks = chunk_by_tokens(text) + symbol_spans: list[_Sym] = _extract_symbols(language, text) + elif use_semantic: + chunks = chunk_semantic(text, language, 
CHUNK_LINES, CHUNK_OVERLAP) + symbol_spans = _extract_symbols(language, text) + else: + chunks = chunk_lines(text, CHUNK_LINES, CHUNK_OVERLAP) + symbol_spans = _extract_symbols(language, text) + + # Prepare collections for reused vs newly embedded points + reused_points: list[models.PointStruct] = [] + embed_texts: list[str] = [] + embed_payloads: list[dict] = [] + embed_ids: list[int] = [] + embed_lex: list[list[float]] = [] + + imports, calls = _get_imports_calls(language, text) + last_mod, churn_count, author_count = _git_metadata(file_path) + + for ch in chunks: + info = build_information( + language, + file_path, + ch["start"], + ch["end"], + ch["text"].splitlines()[0] if ch["text"] else "", + ) + # Use span-style symbols for mapping chunks to symbols + kind, sym, sym_path = _choose_symbol_for_chunk( + ch["start"], ch["end"], symbol_spans + ) + # Prefer embedded symbol metadata from semantic chunker when present + if "kind" in ch and ch.get("kind"): + kind = ch.get("kind") or kind + if "symbol" in ch and ch.get("symbol"): + sym = ch.get("symbol") or sym + if "symbol_path" in ch and ch.get("symbol_path"): + sym_path = ch.get("symbol_path") or sym_path + # Ensure chunks carry symbol metadata so pseudo gating works + if not ch.get("kind") and kind: + ch["kind"] = kind + if not ch.get("symbol") and sym: + ch["symbol"] = sym + if not ch.get("symbol_path") and sym_path: + ch["symbol_path"] = sym_path + + # Basic metadata payload + _cur_path = str(file_path) + _host_root = str(os.environ.get("HOST_INDEX_PATH") or "").strip().rstrip("/") + _host_path = None + _container_path = None + _origin_client_path = None + try: + if _cur_path.startswith("/work/"): + _parts = _cur_path[6:].split("/") + if len(_parts) >= 2: + _repo_name = _parts[0] + _workspace_path = f"/work/{_repo_name}" + _origin_client_path = _get_host_path_from_origin( + _workspace_path, _repo_name + ) + except Exception: + pass + try: + if _cur_path.startswith("/work/") and (_host_root or 
_origin_client_path): + _rel = _cur_path[len("/work/") :] + if _origin_client_path: + _host_path = os.path.realpath( + os.path.join(_origin_client_path, _rel) + ) + else: + _host_path = os.path.realpath(os.path.join(_host_root, _rel)) + _container_path = _cur_path + else: + _host_path = _cur_path + if ( + (_host_root or _origin_client_path) + and _cur_path.startswith( + ((_origin_client_path or _host_root) + "/") + ) + ): + _rel = _cur_path[ + len((_origin_client_path or _host_root)) + 1 : + ] + _container_path = "/work/" + _rel + except Exception: + _host_path = _cur_path + _container_path = ( + _cur_path if _cur_path.startswith("/work/") else None + ) + + payload = { + "document": info, + "information": info, + "metadata": { + "path": str(file_path), + "path_prefix": str(file_path.parent), + "ext": str(file_path.suffix).lstrip(".").lower(), + "language": language, + "kind": kind, + "symbol": sym, + "symbol_path": sym_path or "", + "repo": per_file_repo, + "start_line": ch["start"], + "end_line": ch["end"], + "code": ch["text"], + "file_hash": file_hash, + "imports": imports, + "calls": calls, + "ingested_at": int(time.time()), + "last_modified_at": int(last_mod), + "churn_count": int(churn_count), + "author_count": int(author_count), + "host_path": _host_path, + "container_path": _container_path, + }, + } + + # Pseudo / tags with symbol-aware gating + needs_pseudo, cached_pseudo, cached_tags = should_process_pseudo_for_chunk( + fp, ch, changed_set + ) + pseudo, tags = cached_pseudo, cached_tags + if needs_pseudo: + try: + pseudo, tags = generate_pseudo_tags(ch.get("text") or "") + if pseudo or tags: + symbol_name = ch.get("symbol", "") + if symbol_name: + k = ch.get("kind", "unknown") + start_line = ch.get("start", 0) + sid = f"{k}_{symbol_name}_{start_line}" + if set_cached_pseudo: + set_cached_pseudo(fp, sid, pseudo, tags, file_hash) + except Exception: + pass + if pseudo: + payload["pseudo"] = pseudo + if tags: + payload["tags"] = tags + + # Decide whether we 
can reuse an existing embedding for this chunk + code_text = ch.get("text") or "" + chunk_symbol_id = "" + if sym and kind: + chunk_symbol_id = f"{kind}_{sym}_{ch['start']}" + + reuse_key = (chunk_symbol_id, code_text) + fallback_key = ("", code_text) + reused_rec = None + bucket = points_by_code.get(reuse_key) or points_by_code.get(fallback_key) + if bucket: + try: + reused_rec = bucket.pop() + if not bucket: + # Clean up empty bucket + points_by_code.pop(reuse_key, None) + points_by_code.pop(fallback_key, None) + except Exception: + reused_rec = None + + if reused_rec is not None: + try: + vec = reused_rec.vector + pid = hash_id(code_text, fp, ch["start"], ch["end"]) + reused_points.append( + models.PointStruct(id=pid, vector=vec, payload=payload) + ) + continue + except Exception: + # Fall through to re-embedding path + pass + + # Need to embed this chunk + embed_texts.append(info) + embed_payloads.append(payload) + embed_ids.append( + hash_id(code_text, fp, ch["start"], ch["end"]) + ) + aug_lex_text = (code_text or "") + ( + " " + pseudo if pseudo else "" + ) + (" " + " ".join(tags) if tags else "") + embed_lex.append(_lex_hash_vector_text(aug_lex_text)) + + # Embed changed/new chunks and build final point set + new_points: list[models.PointStruct] = [] + if embed_texts: + vectors = embed_batch(model, embed_texts) + for pid, v, lx, pl in zip( + embed_ids, + vectors, + embed_lex, + embed_payloads, + ): + if vector_name: + vecs = {vector_name: v, LEX_VECTOR_NAME: lx} + try: + if os.environ.get("REFRAG_MODE", "").strip().lower() in { + "1", + "true", + "yes", + "on", + }: + vecs[MINI_VECTOR_NAME] = project_mini( + list(v), MINI_VEC_DIM + ) + except Exception: + pass + new_points.append( + models.PointStruct(id=pid, vector=vecs, payload=pl) + ) + else: + new_points.append( + models.PointStruct(id=pid, vector=v, payload=pl) + ) + + all_points = reused_points + new_points + + # Replace existing points for this file with the new set + try: + 
delete_points_by_path(client, current_collection, fp) + except Exception as e: + print(f"[SMART_REINDEX] Failed to delete old points for {file_path}: {e}") + + if all_points: + upsert_points(client, current_collection, all_points) + + # Update caches with the new state + try: + if set_cached_symbols: + set_cached_symbols(fp, symbol_meta, file_hash) + except Exception as e: + print(f"[SMART_REINDEX] Failed to update symbol cache for {file_path}: {e}") + try: + if set_cached_file_hash: + set_cached_file_hash(fp, file_hash, per_file_repo) + except Exception: + pass + + print( + f"[SMART_REINDEX] Completed {file_path}: chunks={len(chunks)}, reused_points={len(reused_points)}, embedded_points={len(new_points)}" + ) + return "success" + + except Exception as e: + print(f"[SMART_REINDEX] Failed to process {file_path}: {e}") + import traceback + print(f"[SMART_REINDEX] Traceback: {traceback.format_exc()}") + return "failed" + def main(): parser = argparse.ArgumentParser( description="Index code into Qdrant with metadata for MCP code search." 
diff --git a/scripts/watch_index.py b/scripts/watch_index.py index c9e94c57..41c33d80 100644 --- a/scripts/watch_index.py +++ b/scripts/watch_index.py @@ -279,9 +279,25 @@ def on_deleted(self, event): if repo_path: repo_name = _extract_repo_name_from_path(str(repo_path)) remove_cached_file(str(p), repo_name) + + # Remove symbol cache entry + try: + from scripts.workspace_state import remove_cached_symbols + remove_cached_symbols(str(p)) + print(f"[deleted_symbol_cache] {p}") + except Exception as e: + print(f"[symbol_cache_delete_error] {p}: {e}") else: root_repo_name = _extract_repo_name_from_path(str(self.root)) remove_cached_file(str(p), root_repo_name) + + # Remove symbol cache entry (single repo mode) + try: + from scripts.workspace_state import remove_cached_symbols + remove_cached_symbols(str(p)) + print(f"[deleted_symbol_cache] {p}") + except Exception as e: + print(f"[symbol_cache_delete_error] {p}: {e}") except Exception: pass @@ -816,15 +832,77 @@ def _process_paths(paths, client, model, vector_name: str, model_dim: int, works ok = False try: - ok = idx.index_single_file( - client, - model, - collection, - vector_name, - p, - dedupe=True, - skip_unchanged=False, - ) + # Prefer smart symbol-aware reindexing when enabled and cache is available + try: + if getattr(idx, "_smart_symbol_reindexing_enabled", None) and idx._smart_symbol_reindexing_enabled(): + text: str | None = None + try: + text = p.read_text(encoding="utf-8", errors="ignore") + except Exception: + text = None + if text is not None: + try: + language = idx.detect_language(p) + except Exception: + language = "" + try: + file_hash = hashlib.sha1(text.encode("utf-8", errors="ignore")).hexdigest() + except Exception: + file_hash = "" + if file_hash: + try: + use_smart, smart_reason = idx.should_use_smart_reindexing(str(p), file_hash) + except Exception: + use_smart, smart_reason = False, "smart_check_failed" + + # Bootstrap: if we have no symbol cache yet, still run smart path once + bootstrap = 
smart_reason == "no_cached_symbols" + if use_smart or bootstrap: + msg_kind = "smart reindexing" if use_smart else "bootstrap (no_cached_symbols) for smart reindex" + try: + print(f"[SMART_REINDEX][watcher] Using {msg_kind} for {p} ({smart_reason})") + except Exception: + pass + try: + status = idx.process_file_with_smart_reindexing( + p, + text, + language, + client, + collection, + repo_name, + model, + vector_name, + ) + ok = status == "success" + except Exception as se: + try: + print(f"[SMART_REINDEX][watcher] Smart reindexing failed for {p}: {se}") + except Exception: + pass + ok = False + else: + try: + print(f"[SMART_REINDEX][watcher] Using full reindexing for {p} ({smart_reason})") + except Exception: + pass + except Exception as e_smart: + try: + print(f"[SMART_REINDEX][watcher] Smart reindexing disabled or preview failed for {p}: {e_smart}") + except Exception: + pass + + # Fallback: full single-file reindex + if not ok: + ok = idx.index_single_file( + client, + model, + collection, + vector_name, + p, + dedupe=True, + skip_unchanged=False, + ) except Exception as e: try: print(f"[index_error] {p}: {e}") diff --git a/scripts/workspace_state.py b/scripts/workspace_state.py index fab94167..70f0a15b 100644 --- a/scripts/workspace_state.py +++ b/scripts/workspace_state.py @@ -252,6 +252,12 @@ def _atomic_write_state(state_path: Path, state: WorkspaceState) -> None: with open(temp_path, 'w', encoding='utf-8') as f: json.dump(state, f, indent=2, ensure_ascii=False) temp_path.replace(state_path) + # Ensure state/cache files are group-writable so multiple processes + # (upload service, watcher, indexer) can update them. 
+ try: + os.chmod(state_path, 0o664) + except PermissionError: + pass except Exception: # Clean up temp file if something went wrong try: @@ -806,4 +812,179 @@ def get_collection_mappings(search_root: Optional[str] = None) -> List[Dict[str, return mappings + +# ===== Symbol-Level Cache for Smart Reindexing ===== + +def _get_symbol_cache_path(file_path: str) -> Path: + """Get symbol cache file path for a given file.""" + try: + fp = str(Path(file_path).resolve()) + # Create symbol cache using file hash to handle renames + file_hash = hashlib.md5(fp.encode('utf-8')).hexdigest()[:8] + if is_multi_repo_mode(): + # Use the same repo-name detection as other state helpers so that + # symbol caches live under the correct per-repo .codebase directory + repo_name = _detect_repo_name_from_path(Path(file_path)) + state_dir = _get_repo_state_dir(repo_name) + return state_dir / f"symbols_{file_hash}.json" + else: + cache_dir = _get_cache_path(_resolve_workspace_root()).parent + return cache_dir / f"symbols_{file_hash}.json" + except Exception: + # Fallback to simple file-based path + cache_dir = _get_cache_path(_resolve_workspace_root()).parent + filename = Path(file_path).name.replace('.', '_').replace('/', '_') + return cache_dir / f"symbols_{filename}.json" + + +def get_cached_symbols(file_path: str) -> dict: + """Load cached symbol metadata for a file.""" + cache_path = _get_symbol_cache_path(file_path) + + if not cache_path.exists(): + return {} + + try: + with open(cache_path, 'r', encoding='utf-8') as f: + cache_data = json.load(f) + return cache_data.get("symbols", {}) + except Exception: + return {} + + +def set_cached_symbols(file_path: str, symbols: dict, file_hash: str) -> None: + """Save symbol metadata for a file. 
Extends existing to include pseudo data.""" + cache_path = _get_symbol_cache_path(file_path) + cache_path.parent.mkdir(parents=True, exist_ok=True) + + try: + cache_data = { + "file_path": str(file_path), + "file_hash": file_hash, + "updated_at": datetime.now().isoformat(), + "symbols": symbols + } + + with open(cache_path, 'w', encoding='utf-8') as f: + json.dump(cache_data, f, indent=2) + + # Ensure symbol cache files are group-writable so both indexer and + # watcher processes (potentially different users sharing a group) + # can update them on shared volumes. + try: + os.chmod(cache_path, 0o664) + except PermissionError: + pass + except Exception as e: + print(f"[SYMBOL_CACHE_WARNING] Failed to save symbol cache for {file_path}: {e}") + + +def get_cached_pseudo(file_path: str, symbol_id: str) -> tuple[str, list[str]]: + """Load cached pseudo description and tags for a specific symbol. + + Returns: + (pseudo, tags) tuple, or ("", []) if not found + """ + cached_symbols = get_cached_symbols(file_path) + + if symbol_id in cached_symbols: + symbol_info = cached_symbols[symbol_id] + pseudo = symbol_info.get("pseudo", "") + tags = symbol_info.get("tags", []) + + # Ensure correct types + if isinstance(pseudo, str): + pseudo = pseudo + else: + pseudo = "" + + if isinstance(tags, list): + tags = [str(tag) for tag in tags] + else: + tags = [] + + return pseudo, tags + + return "", [] + + +def set_cached_pseudo(file_path: str, symbol_id: str, pseudo: str, tags: list[str], file_hash: str) -> None: + """Update pseudo data for a specific symbol in the cache. + + This function updates only the pseudo data without recreating the entire symbol cache, + making it efficient for incremental updates during indexing. 
+ """ + cached_symbols = get_cached_symbols(file_path) + + # Update the symbol with pseudo data + if symbol_id in cached_symbols: + cached_symbols[symbol_id]["pseudo"] = pseudo + cached_symbols[symbol_id]["tags"] = tags + + # Save the updated cache only when we actually have symbol entries, to + # avoid creating empty symbol cache files before the base symbol set + # has been seeded by the indexer/smart reindex path. + set_cached_symbols(file_path, cached_symbols, file_hash) + + +def update_symbols_with_pseudo(file_path: str, symbols_with_pseudo: dict, file_hash: str) -> None: + """Update symbols cache with pseudo data for multiple symbols at once. + + Args: + file_path: Path to the file + symbols_with_pseudo: Dict mapping symbol_id to (symbol_info, pseudo, tags) tuples + file_hash: Current file hash + """ + cached_symbols = get_cached_symbols(file_path) + + # Update symbols with their new pseudo data + for symbol_id, (symbol_info, pseudo, tags) in symbols_with_pseudo.items(): + if symbol_id in cached_symbols: + # Update existing symbol with pseudo data + cached_symbols[symbol_id]["pseudo"] = pseudo + cached_symbols[symbol_id]["tags"] = tags + + # Update content hash from symbol_info if available + if isinstance(symbol_info, dict): + cached_symbols[symbol_id].update(symbol_info) + + # Save the updated cache + set_cached_symbols(file_path, cached_symbols, file_hash) + + +def remove_cached_symbols(file_path: str) -> None: + """Remove symbol cache for a file (when file is deleted).""" + cache_path = _get_symbol_cache_path(file_path) + try: + if cache_path.exists(): + cache_path.unlink() + except Exception: + pass + + +def compare_symbol_changes(old_symbols: dict, new_symbols: dict) -> tuple[list, list]: + """ + Compare old and new symbols to identify changes. 
+ + Returns: + (unchanged_symbols, changed_symbols) + """ + unchanged = [] + changed = [] + + for symbol_id, symbol_info in new_symbols.items(): + if symbol_id in old_symbols: + old_info = old_symbols[symbol_id] + # Compare content hash + if old_info.get("content_hash") == symbol_info.get("content_hash"): + unchanged.append(symbol_id) + else: + changed.append(symbol_id) + else: + # New symbol + changed.append(symbol_id) + + return unchanged, changed + + # Add missing functions that callers expect (already defined above) \ No newline at end of file From 1b9004f0ee75a81316251a92f27df0fd8a7ef7bf Mon Sep 17 00:00:00 2001 From: Reese Date: Thu, 27 Nov 2025 02:46:13 +0000 Subject: [PATCH 05/25] Adds git history ingestion support to upload client Enables the collection and indexing of git commit history for enhanced context lineage capabilities. - Introduces configuration options to control the depth and scope of git history ingestion. - Implements mechanisms to extract commit metadata, diffs, and lineage information. - Integrates git history into the existing indexing pipeline, allowing agents to reason about code evolution. - Provides a new command to trigger a force sync to upload git history. 
--- .env.example | 8 + ctx-hook-simple.sh | 40 ++- docs/CLAUDE.example.md | 14 + docs/commit-indexing/cmds.md | 29 ++ docs/commit-indexing/experiments.md | 281 +++++++++++++++++ docs/commit-indexing/overview.md | 144 +++++++++ scripts/ingest_history.py | 282 +++++++++++++++++- scripts/mcp_indexer_server.py | 211 ++++++++++++- scripts/remote_upload_client.py | 227 ++++++++++++++ scripts/standalone_upload_client.py | 224 ++++++++++++++ scripts/upload_service.py | 22 ++ scripts/watch_index.py | 61 +++- .../context-engine-uploader/README.md | 4 + .../context-engine-uploader/extension.js | 36 ++- .../context-engine-uploader/package.json | 15 + 15 files changed, 1570 insertions(+), 28 deletions(-) create mode 100644 docs/commit-indexing/cmds.md create mode 100644 docs/commit-indexing/experiments.md create mode 100644 docs/commit-indexing/overview.md diff --git a/.env.example b/.env.example index cd60947e..0b91bb6d 100644 --- a/.env.example +++ b/.env.example @@ -168,3 +168,11 @@ SMART_SYMBOL_REINDEXING=0 # INDEX_UPSERT_BACKOFF=0.5 # Debounce file events to coalesce bursts # WATCH_DEBOUNCE_SECS=1.5 + +# Remote upload git history (used by upload clients) +# Max number of commits to include per bundle (0 disables git history) +# REMOTE_UPLOAD_GIT_MAX_COMMITS=500 +# Optional git log since filter, e.g. 
'6 months ago' or '2024-01-01' +# REMOTE_UPLOAD_GIT_SINCE= +# Enable commit lineage goals for indexing +REFRAG_COMMIT_DESCRIBE=1 diff --git a/ctx-hook-simple.sh b/ctx-hook-simple.sh index 1d4cdb0e..e6c07bd4 100755 --- a/ctx-hook-simple.sh +++ b/ctx-hook-simple.sh @@ -105,16 +105,17 @@ fi # Read all settings from ctx_config.json if [ -n "$CONFIG_FILE" ] && [ -f "$CONFIG_FILE" ]; then - CTX_COLLECTION=$(grep -o '"default_collection"[[:space:]]*:[[:space:]]*"[^"]*"' "$CONFIG_FILE" | sed 's/.*"default_collection"[[:space:]]*:[[:space:]]*"\([^"]*\)".*/\1/' ) - REFRAG_RUNTIME=$(grep -o '"refrag_runtime"[[:space:]]*:[[:space:]]*"[^"]*"' "$CONFIG_FILE" | sed 's/.*"refrag_runtime"[[:space:]]*:[[:space:]]*"\([^"]*\)".*/\1/' || echo "glm") - GLM_API_KEY=$(grep -o '"glm_api_key"[[:space:]]*:[[:space:]]*"[^"]*"' "$CONFIG_FILE" | sed 's/.*"glm_api_key"[[:space:]]*:[[:space:]]*"\([^"]*\)".*/\1/' ) - GLM_API_BASE=$(grep -o '"glm_api_base"[[:space:]]*:[[:space:]]*"[^"]*"' "$CONFIG_FILE" | sed 's/.*"glm_api_base"[[:space:]]*:[[:space:]]*"\([^"]*\)".*/\1/') - GLM_MODEL=$(grep -o '"glm_model"[[:space:]]*:[[:space:]]*"[^\"]*"' "$CONFIG_FILE" | sed 's/.*"glm_model"[[:space:]]*:[[:space:]]*"\([^\"]*\)".*/\1/' || echo "glm-4.6") - CTX_DEFAULT_MODE=$(grep -o '"default_mode"[[:space:]]*:[[:space:]]*"[^\"]*"' "$CONFIG_FILE" | sed 's/.*"default_mode"[[:space:]]*:[[:space:]]*"\([^\"]*\)".*/\1/') - CTX_REQUIRE_CONTEXT=$(grep -o '"require_context"[[:space:]]*:[[:space:]]*\(true\|false\)' "$CONFIG_FILE" | sed 's/.*"require_context"[[:space:]]*:[[:space:]]*\(true\|false\).*/\1/') - CTX_RELEVANCE_GATE=$(grep -o '"relevance_gate_enabled"[[:space:]]*:[[:space:]]*\(true\|false\)' "$CONFIG_FILE" | sed 's/.*"relevance_gate_enabled"[[:space:]]*:[[:space:]]*\(true\|false\).*/\1/') - CTX_MIN_RELEVANCE=$(grep -o '"min_relevance"[[:space:]]*:[[:space:]]*[0-9.][0-9.]*' "$CONFIG_FILE" | sed 's/.*"min_relevance"[[:space:]]*:[[:space:]]*\([0-9.][0-9.]*\).*/\1/') - CTX_REWRITE_MAX_TOKENS=$(grep -o 
'"rewrite_max_tokens"[[:space:]]*:[[:space:]]*[0-9][0-9]*' "$CONFIG_FILE" | sed 's/.*"rewrite_max_tokens"[[:space:]]*:[[:space:]]*\([0-9][0-9]*\).*/\1/') + CTX_COLLECTION=$(grep -o '"default_collection"[[:space:]]*:[[:space:]]*"[^"]*"' "$CONFIG_FILE" | sed 's/.*"default_collection"[[:space:]]*:[[:space:]]*"\([^"]*\)".*/\1/' ) + REFRAG_RUNTIME=$(grep -o '"refrag_runtime"[[:space:]]*:[[:space:]]*"[^"]*"' "$CONFIG_FILE" | sed 's/.*"refrag_runtime"[[:space:]]*:[[:space:]]*"\([^"]*\)".*/\1/' || echo "glm") + GLM_API_KEY=$(grep -o '"glm_api_key"[[:space:]]*:[[:space:]]*"[^"]*"' "$CONFIG_FILE" | sed 's/.*"glm_api_key"[[:space:]]*:[[:space:]]*"\([^"]*\)".*/\1/' ) + GLM_API_BASE=$(grep -o '"glm_api_base"[[:space:]]*:[[:space:]]*"[^"]*"' "$CONFIG_FILE" | sed 's/.*"glm_api_base"[[:space:]]*:[[:space:]]*"\([^"]*\)".*/\1/') + GLM_MODEL=$(grep -o '"glm_model"[[:space:]]*:[[:space:]]*"[^\"]*"' "$CONFIG_FILE" | sed 's/.*"glm_model"[[:space:]]*:[[:space:]]*"\([^\"]*\)".*/\1/' || echo "glm-4.6") + CTX_DEFAULT_MODE=$(grep -o '"default_mode"[[:space:]]*:[[:space:]]*"[^\"]*"' "$CONFIG_FILE" | sed 's/.*"default_mode"[[:space:]]*:[[:space:]]*"\([^\"]*\)".*/\1/') + CTX_REQUIRE_CONTEXT=$(grep -o '"require_context"[[:space:]]*:[[:space:]]*\(true\|false\)' "$CONFIG_FILE" | sed 's/.*"require_context"[[:space:]]*:[[:space:]]*\(true\|false\).*/\1/') + CTX_RELEVANCE_GATE=$(grep -o '"relevance_gate_enabled"[[:space:]]*:[[:space:]]*\(true\|false\)' "$CONFIG_FILE" | sed 's/.*"relevance_gate_enabled"[[:space:]]*:[[:space:]]*\(true\|false\).*/\1/') + CTX_MIN_RELEVANCE=$(grep -o '"min_relevance"[[:space:]]*:[[:space:]]*[0-9.][0-9.]*' "$CONFIG_FILE" | sed 's/.*"min_relevance"[[:space:]]*:[[:space:]]*\([0-9.][0-9.]*\).*/\1/') + CTX_REWRITE_MAX_TOKENS=$(grep -o '"rewrite_max_tokens"[[:space:]]*:[[:space:]]*[0-9][0-9]*' "$CONFIG_FILE" | sed 's/.*"rewrite_max_tokens"[[:space:]]*:[[:space:]]*\([0-9][0-9]*\).*/\1/') + CTX_SURFACE_COLLECTION_CFG=$(grep -o 
'"surface_qdrant_collection_hint"[[:space:]]*:[[:space:]]*\(true\|false\)' "$CONFIG_FILE" | sed 's/.*"surface_qdrant_collection_hint"[[:space:]]*:[[:space:]]*\(true\|false\).*/\1/') fi # Set defaults if not found in config @@ -129,6 +130,25 @@ CTX_RELEVANCE_GATE=${CTX_RELEVANCE_GATE:-false} CTX_MIN_RELEVANCE=${CTX_MIN_RELEVANCE:-0.1} CTX_REWRITE_MAX_TOKENS=${CTX_REWRITE_MAX_TOKENS:-320} +# Normalize surface_qdrant_collection_hint from config (true/false) into 1/0 +CFG_HINT="" +if [ -n "$CTX_SURFACE_COLLECTION_CFG" ]; then + if [ "$CTX_SURFACE_COLLECTION_CFG" = "true" ]; then + CFG_HINT="1" + elif [ "$CTX_SURFACE_COLLECTION_CFG" = "false" ]; then + CFG_HINT="0" + fi +fi + +# Precedence: explicit env override > ctx_config flag > auto-on when collection known +if [ -n "${CTX_SURFACE_COLLECTION_HINT:-}" ]; then + : +elif [ -n "$CFG_HINT" ]; then + CTX_SURFACE_COLLECTION_HINT="$CFG_HINT" +elif [ -n "$CTX_COLLECTION" ]; then + CTX_SURFACE_COLLECTION_HINT="1" +fi + # Export GLM/context environment variables from config export REFRAG_RUNTIME GLM_API_KEY GLM_API_BASE GLM_MODEL CTX_REQUIRE_CONTEXT CTX_RELEVANCE_GATE CTX_MIN_RELEVANCE CTX_REWRITE_MAX_TOKENS diff --git a/docs/CLAUDE.example.md b/docs/CLAUDE.example.md index ce132d4e..92b3755b 100644 --- a/docs/CLAUDE.example.md +++ b/docs/CLAUDE.example.md @@ -86,6 +86,20 @@ Agentic AI Project Rules: When to Use MCP Qdrant-Indexer vs Grep - Use for: short natural-language summaries/explanations of specific modules or tools, grounded in code/docs with citations. - Good for: "What does scripts/standalone_upload_client.py do at a high level?", "Summarize the remote upload client pipeline.". + Advanced lineage workflow (code + history): + + - Goal: answer "when/why did behavior X change?" without flooding context. + - Step 1 – Find current implementation (code): + - Use repo_search to locate the relevant file/symbol, e.g. `repo_search(query: "upload client timeout", language: "python", under: "scripts")`. 
+ - Step 2 – Summarize recent change activity for a file: + - Call change_history_for_path with `include_commits=true` to get churn stats and a small list of recent commits, e.g. `change_history_for_path(path: "scripts/remote_upload_client.py", include_commits: true)`. + - Step 3 – Pull commit lineage for a specific behavior: + - Use search_commits_for with short behavior phrases plus an optional path filter, e.g. `search_commits_for(query: "remote upload timeout retry", path: "scripts/remote_upload_client.py")`. + - Read lineage_goal / lineage_symbols / lineage_tags to understand intent and related concepts. + - Step 4 – Optionally summarize current behavior: + - After you have the right file/symbol from repo_search, use context_answer to explain what the module does now; treat commit lineage as background, not as primary code context. + - For exact line-level changes (e.g. "when did this literal constant change?"), use lineage tools to narrow candidate commits, then inspect diffs with git tooling; do not guess purely from summaries. + Query Phrasing Tips for context_answer: - Prefer behavior/architecture questions about a single module or tool: diff --git a/docs/commit-indexing/cmds.md b/docs/commit-indexing/cmds.md new file mode 100644 index 00000000..fa819886 --- /dev/null +++ b/docs/commit-indexing/cmds.md @@ -0,0 +1,29 @@ +curl -s "http://localhost:6333/collections/Context-Engine-41e67959/points/scroll" -H "Content-Type: application/json" -d '{"filter":{"must":[{"key":"metadata.language","match":{"value":"git"}},{"key":"metadata.kind","match":{"value":"git_message"}}]},"limit":5,"with_payload":true,"with_vector":false}' + + + + +set -a; . 
.env; set +a; REFRAG_DECODER=1 REFRAG_RUNTIME=glm REFRAG_COMMIT_DESCRIBE=1 python3 - << 'PY' +from scripts.refrag_llamacpp import is_decoder_enabled, get_runtime_kind +from scripts.ingest_history import commit_metadata, generate_commit_summary, run + +print('is_decoder_enabled:', is_decoder_enabled()) +print('runtime:', get_runtime_kind()) + +sha = run('git rev-list --max-count=1 HEAD').strip() +md = commit_metadata(sha) + +diff = run(f'git show --stat --patch --unified=3 {sha}') +print('Testing commit:', sha) +print('Files:', md.get('files')) + +goal, symbols, tags = generate_commit_summary(md, diff) +print('goal:', repr(goal)) +print('symbols:', symbols) +print('tags:', tags) +PY + + + +Index commits: +set -a; . .env; set +a; COLLECTION_NAME=Context-Engine-41e67959 QDRANT_URL=http://localhost:6333 REFRAG_DECODER=1 REFRAG_RUNTIME=glm REFRAG_COMMIT_DESCRIBE=1 python3 -m scripts.ingest_history --since '6 months ago' --max-commits 10 --per-batch 10 \ No newline at end of file diff --git a/docs/commit-indexing/experiments.md b/docs/commit-indexing/experiments.md new file mode 100644 index 00000000..64d77e2b --- /dev/null +++ b/docs/commit-indexing/experiments.md @@ -0,0 +1,281 @@ +--- + +## 1. High-level agent recipe + +Target question: +**“When and why did behavior X change in file Y?”** + +Recommended steps: + +1. **Localize the behavior now (code search).** + Use [repo_search](cci:1://file:///home/coder/project/Context-Engine/scripts/mcp_indexer_server.py:1663:0-2468:5) to find the current implementation of X. + +2. **Shortlist relevant commits (lineage search).** + Use [search_commits_for(path=, query=)](cci:1://file:///home/coder/project/Context-Engine/scripts/mcp_indexer_server.py:2760:0-2898:65) to get a tiny set of candidate commits with `lineage_goal` / tags. + +3. **Decide if lineage summary is enough.** + Sometimes `lineage_goal` already answers the “why” without diffs. + +4. 
**If needed, pull small diffs and reason.** + For 1–2 chosen SHAs, fetch compact diffs for that file and let the LLM explain _how_ the behavior changed. + +Everything else is just detail and guardrails around these four steps. + +--- + +## 2. Step-by-step, with knobs + +### Step 1: Localize behavior (repo_search) + +- **Inputs:** + - Behavior symbol: [ensureIndexedWatcher](cci:1://file:///home/coder/project/Context-Engine/vscode-extension/context-engine-uploader/extension.js:751:0-781:1), `index_repo`, etc. + - Optional context: `"status bar"`, `"upload delta"`, etc. + +- **Call pattern:** + + ```jsonc + repo_search( + query: "ensureIndexedWatcher status bar", + under: "vscode-extension/context-engine-uploader", + limit: 8, + per_path: 2 + ) + ``` + +- **Goal:** + - Identify: + - Canonical file path, e.g. `"vscode-extension/context-engine-uploader/extension.js"`. + - Specific function or symbol (span) you care about. + +Agents should **store**: + +- `target_file`: relative path under repo. +- `target_symbol` or a short description of the behavior X. + +### Step 2: Shortlist commits (search_commits_for) + +- **Normalize path:** + - [search_commits_for](cci:1://file:///home/coder/project/Context-Engine/scripts/mcp_indexer_server.py:2760:0-2898:65) uses commit metadata `files` like: + - `"scripts/ingest_code.py"` + - `"vscode-extension/context-engine-uploader/scripts/ctx.py"` + - So pass exactly that style: **no `/work/...` prefix**. + +- **Call pattern:** + + ```jsonc + search_commits_for( + query: "ensureIndexedWatcher status bar", // or a simpler keyword + path: "vscode-extension/context-engine-uploader/extension.js", + collection: "Context-Engine-41e67959", + limit: 5, + max_points: 1000 + ) + ``` + +- **What you get back:** + + ```json + { + "commit_id": "...", + "message": "...", + "files": ["..."], + "lineage_goal": "short intent summary", + "lineage_symbols": [...], + "lineage_tags": [...] 
+ } + ``` + +Agents should: + +- Prefer commits where: + - `files` includes the target file, and + - `lineage_goal` / `lineage_symbols` mention relevant concepts. + +This step is the **semantic pre-filter**: instead of scrolling `git log -- path`, you pick 1–3 promising SHAs. + +### Step 3: Answer “why” from lineage, if possible + +For many questions, you don’t need diffs at all: + +- Example from your run: + + ```json + { + "commit_id": "e9d5...", + "message": "Remove duplicate ctx script in extension ...", + "lineage_goal": "Remove duplicate ctx script from extension as it's bundled at build time", + "lineage_tags": ["cleanup","duplicate","build","extension","script"] + } + ``` + +For a question like: + +> “Why did ctx.py disappear from the extension folder?” + +An agent can answer almost entirely from: + +- `lineage_goal` + `message` + filename, maybe with a tiny code/context snippet. + +**Rule of thumb:** +If the question is “why was this introduced/removed/renamed?”, try to answer from `lineage_goal` before reaching for diffs. + +### Step 4: When you actually need diffs + +Only when: + +- The question is about **behavior changes** (“when did it start returning null?”, “when did it stop calling X?”), or +- `lineage_goal` is too high-level, + +should you pull real diffs. + +In this repo, a local agent can: + +```bash +git show -p -- +# or with smaller context: +git show --unified=3 -- +``` + +Then: + +- Extract only the hunks around the target symbol or lines (to save tokens). +- Ask the model: + + > “Given this diff for `` and the current code for ``, explain how the behavior of X changed.” + +For a future remote/MCP world, this would be a natural small MCP tool: + +- `get_commit_diff(commit_id, path, context_lines=3)` → returns only the relevant diff hunks as text. + +But you don’t need that implemented yet to exercise the pattern locally. + +--- + +## 3. Example in this repo (ctx.py cleanup) + +Concrete run we just saw: + +1. 
**Behavior:** “What happened to `ctx.py` in the VS Code extension?” + +2. **repo_search (hypothetical):** + + - Find `vscode-extension/context-engine-uploader/scripts/ctx.py`. + +3. **search_commits_for:** + + ```json + search_commits_for( + query: "ctx script build time", + path: "vscode-extension/context-engine-uploader/scripts/ctx.py", + limit: 3 + ) + ``` + + One of the results: + + ```json + { + "commit_id": "e9d5...", + "message": "Remove duplicate ctx script in extension - bundled at build time - ctx is available in-repo (scripts/ctx.py)", + "lineage_goal": "Remove duplicate ctx script from extension as it's bundled at build time", + "lineage_symbols": ["ctx.py","vscode-extension","build-time","bundled"], + "lineage_tags": ["cleanup","duplicate","build","extension","script"] + } + ``` + +4. **Answer “why”**: + + - No diff needed: you can say: + - It was removed as a duplicate because the script is bundled at build time and already available in-repo. + - If you *also* want “how did the code change?”: + - Pull `git show -p e9d5... -- vscode-extension/context-engine-uploader/scripts/ctx.py`. + - Let the LLM confirm that the extension now uses the in-repo `scripts/ctx.py` and no longer ships a copy. + +--- + +## 4. Where this goes next + +We don’t need more MCP tools immediately; we have: + +- [repo_search](cci:1://file:///home/coder/project/Context-Engine/scripts/mcp_indexer_server.py:1663:0-2468:5) → code now. +- [search_commits_for](cci:1://file:///home/coder/project/Context-Engine/scripts/mcp_indexer_server.py:2760:0-2898:65) → commit shortlist with lineage summaries. +- [change_history_for_path(include_commits=true)](cci:1://file:///home/coder/project/Context-Engine/scripts/mcp_indexer_server.py:2901:0-3015:56) → file-level view with recent commits. 
+ +Polish / next actions (conceptual, not coding yet): + +- Encode this **4-step playbook** into an “advanced: context lineage” section in [CLAUDE.md](cci:7://file:///home/coder/project/CLAUDE.md:0:0-0:0) for agents. +- Later, if needed, introduce a tiny `get_commit_diff` MCP tool for remote setups; locally, continue to use `git show` directly. + +If you want, next step we can actually draft that “advanced lineage workflow” section text for [CLAUDE.md](cci:7://file:///home/coder/project/CLAUDE.md:0:0-0:0), using the above structure but even more compressed for agents. + +--- + +## 5. "Bad message" / good summary sanity check + +Question: *Is GLM just parroting commit messages, or is it actually reading diffs and/or detailed bodies?* + +### Commit under test + +- **SHA:** `6adced4ed83adf75ad8f8c2649b4599a68fb53ae` +- **Subject:** `fix` +- **Body (excerpt):** + - `What this fixes:` + - `stopProcesses will not resolve prematurely` + - `runSequence cannot start a new watcher while the previous one is still alive` + - `Resilient to processes that ignore SIGTERM` +- **Files touched (relevant):** + - `vscode-extension/context-engine-uploader/extension.js` + +Diff (abridged) shows changes to `terminateProcess(proc, label)`: + +- Introduces `termTimer` / `killTimer` and a `cleanup()` helper. +- Makes `finalize(reason)` idempotent and ensures timers are cleared. +- Hooks `exit` / `close` handlers into a shared `onExit` that calls `finalize` with exit status / signal. +- Keeps an initial `proc.kill()` (SIGTERM), then: + - Waits `waitSigtermMs` (4s), then tries `proc.kill('SIGKILL')` and logs a message. + - After an additional `waitSigkillMs` (2s), forces `finalize` with a “forced after X ms” reason. 
+ +### Lineage summary produced by GLM + +From `search_commits_for(query="fix", path="", limit=5)` we see, for this SHA: + +```json +{ + "commit_id": "6adced4ed83adf75ad8f8c2649b4599a68fb53ae", + "message": "fix", + "files": [".env.example", "vscode-extension/context-engine-uploader/extension.js"], + "lineage_goal": "Fix process termination and watcher lifecycle issues", + "lineage_symbols": [ + "SIGTERM", + "SIGKILL", + "watchProcess", + "forceProcess" + ], + "lineage_tags": [ + "process-management", + "termination", + "watcher", + "lifecycle", + "signal-handling" + ] +} +``` + +### Interpretation + +- The **subject** alone (`fix`) is non-informative. +- The **body** gives some English hints about watcher behavior and SIGTERM resilience. +- The **diff** clearly shows: + - A more robust termination sequence (SIGTERM → SIGKILL → forced finalize). + - Explicit references to `watchProcess`, `forceProcess`, and signal names. + +The GLM summary: + +- Captures the high-level intent (`process termination and watcher lifecycle issues`). +- Names concrete symbols seen in the diff (`SIGTERM`, `SIGKILL`, `watchProcess`, `forceProcess`). +- Adds tags (`process-management`, `signal-handling`, etc.) that do not appear verbatim in the subject. + +Conclusion for this case: + +- `lineage_goal` is *not* just a restatement of the one-word subject; it reflects both the commit body and the structure of the diff. +- `lineage_symbols` / `lineage_tags` show that GLM is paying attention to changed identifiers and behavior, making this commit discoverable via queries like `"watcher lifecycle"`, `"SIGTERM"`, or `"process termination"` even though the subject is just `fix`. \ No newline at end of file diff --git a/docs/commit-indexing/overview.md b/docs/commit-indexing/overview.md new file mode 100644 index 00000000..3d4ea088 --- /dev/null +++ b/docs/commit-indexing/overview.md @@ -0,0 +1,144 @@ +# Commit Indexing & Context Lineage: Goals and Status + +## 1. 
Motivation + +- **Historical context for agents** + - Modern agents are good at reading current files but struggle when the answer is buried in months of commit history. + - Goal: expose a compact, queryable view of *how and why* the code evolved, not just what it looks like now. +- **Complement, not replace, git log** + - Humans and local tools can always use `git log` / `git show` / `git diff` directly. + - Commit indexing and lineage should add value by: + - Making history available to remote/agent clients that cannot run git. + - Providing structured summaries and tags so agents can quickly find and explain relevant changes. + +## 2. Current architecture (v1) + +- **Commit harvesting (`scripts/ingest_history.py`)** + - Walks git history with configurable filters (`--since`, `--until`, `--author`, `--path`, `--max-commits`). + - For each commit: + - Captures `commit_id`, `author_name`, `authored_date`, `message` (subject + body, redacted), and `files` touched. + - Builds a short `document` and `information` string. + - Embeds into Qdrant in the same collection as code, with metadata: + - `language="git"`, `kind="git_message"`. + - `symbol` / `symbol_path` = `commit_id`. + - `files`, `repo`, `path=.git`, `ingested_at`, etc. + +- **GLM-backed diff summarization (`generate_commit_summary`)** + - Opt-in via `REFRAG_COMMIT_DESCRIBE=1` and decoder flags (`REFRAG_DECODER=1`, `REFRAG_RUNTIME=glm`). + - For each commit, fetches a truncated `git show --stat --patch --unified=3 ` and sends it to the decoder (GLM or llama.cpp). + - Asks for compact JSON: + - `goal`: short explanation of the commit’s intent / behavior change. + - `symbols`: 1–6 key functions/flags/terms. + - `tags`: 3–6 short keywords to aid retrieval. + - On success, stores these as metadata on the commit point: + - `lineage_goal`, `lineage_symbols`, `lineage_tags`. + - On failure or when disabled, falls back gracefully and leaves these fields empty. 
+ +- **Indexer-facing Qdrant schema** + - Commit points live in the same Qdrant collection as code spans (e.g. `Context-Engine-41e67959`). + - This allows hybrid flows that combine code search and commit search within one collection. + +## 3. MCP tools and usage + +- **`search_commits_for` (indexer MCP)** + - Purpose: search git commit history stored in Qdrant. + - Filters: + - Always restricts to `language="git"`, `kind="git_message"`. + - Optional `path` filter: only keep commits whose `files` list contains the path substring. + - Optional `query`: lexical match against a composite blob containing: + - `message` + `information`. + - `lineage_goal`, `lineage_symbols`, `lineage_tags`. + - Output (per commit): + - `commit_id`, `author_name`, `authored_date`, `message`, `files`. + - `lineage_goal`, `lineage_symbols`, `lineage_tags`. + - Dedupes by `commit_id` so each commit appears at most once per response. + +- **`change_history_for_path(include_commits=true)` (indexer MCP)** + - Base behavior: + - Scans Qdrant points whose `metadata.path == ` (code index), summarizing: + - `points_scanned`, `distinct_hashes`, `last_modified_min/max`, `ingested_min/max`, `churn_count_max`. + - With `include_commits=true`: + - Calls `search_commits_for(path=)` and attaches a small list of recent commits: + - Each entry includes commit metadata plus any `lineage_*` fields. + - Dedupes by `commit_id` before attaching. + - Intended usage: + - Fast “what changed and how hot is this file?” view for agents. + - Entry point for deeper lineage questions when combined with `repo_search` and git diffs. + +## 4. Current experiments and evaluation + +See: +- `cmds.md` for handy one-liner commands (curl, ingest, local GLM tests). +- `experiments.md` for a detailed “when/why did behavior X change?” recipe and worked examples. + +Key experiments so far: + +- **GLM summarization sanity-check** + - Local script that: + - Picks `HEAD` via `git rev-list --max-count=1 HEAD`. 
+ - Calls `commit_metadata` + `generate_commit_summary`. + - Observed: with valid GLM API keys and flags, we get reasonable `goal/symbols/tags` for real commits. + +- **Qdrant payload inspection** + - Direct `curl` scroll over `Context-Engine-41e67959` for `language="git"`, `kind="git_message"`. + - Verified commit points include: + - Baseline metadata (message, files, etc.). + - Newly-added `lineage_goal`, `lineage_symbols`, `lineage_tags` after reindexing. + +- **MCP round-trip tests** + - `search_commits_for(query="pseudo tag boost")` → surfaces the hybrid_search commit with clear lineage fields. + - `search_commits_for(query="ctx script", path="vscode-extension/context-engine-uploader/scripts/ctx.py")` → surfaces the ctx cleanup commit and explains its intent. + - `change_history_for_path(path="vscode-extension/context-engine-uploader/scripts/ctx.py", include_commits=true)` → returns a deduped list of relevant commits with lineage summaries. + +These confirm the end-to-end path: +- Git → ingest_history → GLM → Qdrant → MCP → agent. + +## 5. Target workflows (what we are aiming for) + +Our north star is the "Context Lineage" behavior from the Augment blog: + +- **Hero question:** + - “When and why did behavior X change in file Y?” + +- **Recommended agent flow:** + 1. **Localize X in current code** + - Use `repo_search` to find the symbol / behavior in the current tree. + 2. **Shortlist commits about X** + - Use `search_commits_for(path=, query=)` to get a compact list of relevant commits with `lineage_goal`/tags. + 3. **Try to answer "why" from summaries** + - Many “why was this introduced/removed/renamed?” questions can be answered from `lineage_goal` plus minimal code context. + 4. **If necessary, pull diffs to answer "how"** + - Use `git show --unified=3 -- ` (or a future MCP diff tool) and let the LLM explain the behavior change in detail. + +This should: +- Reduce reliance on raw `git log` grepping in larger repos. 
+- Give agents a semantic, compact view of history they can reason over. + +## 6. Open questions and future improvements + +- **Prompt quality and consistency** + - Are `lineage_goal` strings consistently helpful across many commits, or do they drift toward restating the subject line? + - Do `lineage_symbols` and `lineage_tags` give agents enough hooks to connect history with current code (e.g., flags, functions, config keys)? + +- **Search behavior and ranking** + - How often does `search_commits_for` surface the right commit(s) in the top N for real questions? + - Do we need semantic reranking or additional filters (date ranges, authors, etc.) in practice? + +- **Higher-level `lineage_answer` helper** + - Today: agents compose `repo_search` + `change_history_for_path(include_commits=true)` + `search_commits_for` + optional `context_answer` themselves. + - Future: a thin MCP wrapper (e.g., `lineage_answer(query, path=...)`) could orchestrate those calls and ask the decoder to produce a short "when/why did this change" answer, returning both the text and the underlying commit/code citations. + +- **Diff access for remote agents** + - Today: local workflows can rely on `git show` from the shell. + - Future: a small, token-conscious MCP tool like `get_commit_diff(commit_id, path, context_lines)` could make lineage usable from fully remote contexts when precise line-level inspection is required. + +- **Remote git metadata via upload pipeline** + - Current commit ingest assumes direct access to a local `.git` repo (`ingest_history.py` running alongside the indexer). + - Future: the standalone upload client could optionally parse a compact git log view (e.g., JSON-ified commit metadata + diffs) and bundle it with delta uploads, with the upload_service and watcher feeding that into commit indexing for remote/non-git workspaces. 
+ +- **Docs and agent guidance** + - CLAUDE.md (and related examples) should clearly document when to: + - Prefer lineage summaries over raw diffs for "why" questions. + - Fallback to `repo_search + git show` (or a future diff MCP tool) for detailed "how" questions. + +This document is meant as the high-level tracker for commit indexing and context-lineage work. Use `cmds.md` for concrete commands and `experiments.md` for detailed workflows and notes on specific runs. diff --git a/scripts/ingest_history.py b/scripts/ingest_history.py index 99645386..10079253 100644 --- a/scripts/ingest_history.py +++ b/scripts/ingest_history.py @@ -7,6 +7,9 @@ from typing import List, Dict, Any import re import time +import json +import sys +from pathlib import Path from qdrant_client import QdrantClient, models from fastembed import TextEmbedding @@ -17,6 +20,9 @@ API_KEY = os.environ.get("QDRANT_API_KEY") REPO_NAME = os.environ.get("REPO_NAME", "workspace") +ROOT_DIR = Path(__file__).resolve().parent.parent +if str(ROOT_DIR) not in sys.path: + sys.path.insert(0, str(ROOT_DIR)) from scripts.utils import sanitize_vector_name as _sanitize_vector_name @@ -108,6 +114,119 @@ def stable_id(commit_id: str) -> int: return int(h[:16], 16) +def _commit_summary_enabled() -> bool: + """Check REFRAG_COMMIT_DESCRIBE to decide if commit summarization is enabled. + + This is an opt-in feature: set REFRAG_COMMIT_DESCRIBE=1 (and enable the decoder) + to generate per-commit lineage summaries at ingest time. + """ + try: + return str(os.environ.get("REFRAG_COMMIT_DESCRIBE", "0")).strip().lower() in { + "1", + "true", + "yes", + "on", + } + except Exception: + return False + + +def generate_commit_summary(md: Dict[str, Any], diff_text: str) -> tuple[str, list[str], list[str]]: + """Best-effort: ask local decoder to summarize a git commit. + + Returns (goal, symbols, tags). On failure returns ("", [], []). 
+ + The summary is designed to be compact and search-friendly, mirroring the + Context Lineage goals: high-level intent, key symbols, and short tags. + """ + goal: str = "" + symbols: list[str] = [] + tags: list[str] = [] + if not _commit_summary_enabled() or not diff_text.strip(): + return goal, symbols, tags + try: + from scripts.refrag_llamacpp import ( # type: ignore + LlamaCppRefragClient, + is_decoder_enabled, + get_runtime_kind, + ) + + if not is_decoder_enabled(): + return "", [], [] + runtime = get_runtime_kind() + commit_id = str(md.get("commit_id") or "") + message = str(md.get("message") or "") + files = md.get("files") or [] + try: + files_str = "\n".join(str(f) for f in files[:50]) + except Exception: + files_str = "" + # Truncate diff text to keep summarization fast/token-efficient + try: + max_chars = int(os.environ.get("COMMIT_SUMMARY_DIFF_CHARS", "6000") or 6000) + except Exception: + max_chars = 6000 + body = diff_text[:max_chars] + + if runtime == "glm": + from scripts.refrag_glm import GLMRefragClient # type: ignore + + client = GLMRefragClient() + prompt = ( + "You are a JSON-only function that summarizes git commits for search enrichment.\n" + "Respond with a single JSON object and nothing else (no prose, no markdown).\n" + "Exact format: {\"goal\": string (<=200 chars), \"symbols\": [1-6 short strings], \"tags\": [3-6 short strings]}.\n" + f"Commit id: {commit_id}\n" + f"Message:\n{message}\n" + f"Files:\n{files_str}\n" + "Diff:\n" + body + ) + out = client.generate_with_soft_embeddings( + prompt=prompt, + max_tokens=int(os.environ.get("COMMIT_SUMMARY_MAX_TOKENS", "128") or 128), + temperature=float(os.environ.get("COMMIT_SUMMARY_TEMPERATURE", "0.10") or 0.10), + top_p=float(os.environ.get("COMMIT_SUMMARY_TOP_P", "0.9") or 0.9), + stop=["\n\n"], + force_json=True, + ) + else: + client = LlamaCppRefragClient() + prompt = ( + "You summarize git commits for search enrichment.\n" + "Return strictly JSON: {\"goal\": string (<=200 chars), 
\"symbols\": [1-6 short strings], \"tags\": [3-6 short strings]}.\n" + f"Commit id: {commit_id}\n" + f"Message:\n{message}\n" + f"Files:\n{files_str}\n" + "Diff:\n" + body + ) + out = client.generate_with_soft_embeddings( + prompt=prompt, + max_tokens=int(os.environ.get("COMMIT_SUMMARY_MAX_TOKENS", "128") or 128), + temperature=float(os.environ.get("COMMIT_SUMMARY_TEMPERATURE", "0.10") or 0.10), + top_k=int(os.environ.get("COMMIT_SUMMARY_TOP_K", "30") or 30), + top_p=float(os.environ.get("COMMIT_SUMMARY_TOP_P", "0.9") or 0.9), + stop=["\n\n"], + ) + import json as _json + try: + obj = _json.loads(out) + if isinstance(obj, dict): + g = obj.get("goal") + s = obj.get("symbols") + t = obj.get("tags") + if isinstance(g, str): + goal = g.strip()[:200] + if isinstance(s, list): + symbols = [str(x).strip() for x in s if str(x).strip()][:6] + if isinstance(t, list): + tags = [str(x).strip() for x in t if str(x).strip()][:6] + except Exception: + pass + except Exception: + return "", [], [] + return goal, symbols, tags + + def build_text( md: Dict[str, Any], max_files: int = 200, include_body: bool = True ) -> str: @@ -120,6 +239,107 @@ def build_text( return (head + "\n\nFiles:\n" + files_part).strip() +def _ingest_from_manifest( + manifest_path: str, + model: TextEmbedding, + client: QdrantClient, + vec_name: str, + include_body: bool, + per_batch: int, +) -> int: + try: + with open(manifest_path, "r", encoding="utf-8") as f: + data = json.load(f) + except Exception as e: + print(f"Failed to read manifest {manifest_path}: {e}") + return 0 + + commits = data.get("commits") or [] + if not commits: + print("No commits in manifest.") + return 0 + + points: List[models.PointStruct] = [] + count = 0 + for c in commits: + try: + if not isinstance(c, dict): + continue + commit_id = str(c.get("commit_id") or "").strip() + if not commit_id: + continue + author_name = str(c.get("author_name") or "") + authored_date = str(c.get("authored_date") or "") + message = str(c.get("message") 
or "") + files = c.get("files") or [] + if not isinstance(files, list): + files = [] + md: Dict[str, Any] = { + "commit_id": commit_id, + "author_name": author_name, + "authored_date": authored_date, + "message": message, + "files": files, + } + text = build_text(md, include_body=include_body) + try: + vec = next(model.embed([text])).tolist() + except Exception: + continue + + goal: str = "" + sym: List[str] = [] + tgs: List[str] = [] + diff_text = str(c.get("diff") or "") + if diff_text.strip(): + try: + goal, sym, tgs = generate_commit_summary(md, diff_text) + except Exception: + goal, sym, tgs = "", [], [] + + md_payload: Dict[str, Any] = { + "language": "git", + "kind": "git_message", + "symbol": commit_id, + "symbol_path": commit_id, + "repo": REPO_NAME, + "commit_id": commit_id, + "author_name": author_name, + "authored_date": authored_date, + "message": message, + "files": files, + "path": ".git", + "path_prefix": ".git", + "ingested_at": int(time.time()), + } + if goal: + md_payload["lineage_goal"] = goal + if sym: + md_payload["lineage_symbols"] = sym + if tgs: + md_payload["lineage_tags"] = tgs + + payload = { + "document": (message.splitlines()[0] if message else commit_id), + "information": text[:512], + "metadata": md_payload, + } + pid = stable_id(commit_id) + pt = models.PointStruct(id=pid, vector={vec_name: vec}, payload=payload) + points.append(pt) + count += 1 + if len(points) >= per_batch: + client.upsert(collection_name=COLLECTION, points=points) + points.clear() + except Exception: + continue + + if points: + client.upsert(collection_name=COLLECTION, points=points) + print(f"Ingested {count} commits into {COLLECTION} from manifest {manifest_path}.") + return count + + def main(): ap = argparse.ArgumentParser( description="Ingest Git history into Qdrant deterministically" @@ -146,6 +366,12 @@ def main(): default="origin", help="Remote to fetch from if no local HEAD is present", ) + ap.add_argument( + "--manifest-json", + type=str, + 
default=None, + help="Path to git history manifest JSON produced by upload client", + ) ap.add_argument( "--fetch-depth", type=int, @@ -158,6 +384,17 @@ def main(): vec_name = _sanitize_vector_name(MODEL_NAME) client = QdrantClient(url=QDRANT_URL, api_key=API_KEY or None) + if args.manifest_json: + _ingest_from_manifest( + args.manifest_json, + model, + client, + vec_name, + args.include_body, + args.per_batch, + ) + return + commits = list_commits(args) if not commits: print("No commits matched filters.") @@ -168,6 +405,35 @@ def main(): md = commit_metadata(sha) text = build_text(md, include_body=args.include_body) vec = next(model.embed([text])).tolist() + goal, sym, tgs = "", [], [] + try: + diff = run(f"git show --stat --patch --unified=3 {sha}") + goal, sym, tgs = generate_commit_summary(md, diff) + except Exception: + pass + + md_payload: Dict[str, Any] = { + "language": "git", + "kind": "git_message", + "symbol": md["commit_id"], + "symbol_path": md["commit_id"], + "repo": REPO_NAME, + "commit_id": md["commit_id"], + "author_name": md["author_name"], + "authored_date": md["authored_date"], + "message": md["message"], + "files": md["files"], + "path": ".git", + "path_prefix": ".git", + "ingested_at": int(time.time()), + } + if goal: + md_payload["lineage_goal"] = goal + if sym: + md_payload["lineage_symbols"] = sym + if tgs: + md_payload["lineage_tags"] = tgs + payload = { "document": ( md.get("message", "").splitlines()[0] @@ -175,21 +441,7 @@ def main(): else md["commit_id"] ), "information": text[:512], - "metadata": { - "language": "git", - "kind": "git_message", - "symbol": md["commit_id"], - "symbol_path": md["commit_id"], - "repo": REPO_NAME, - "commit_id": md["commit_id"], - "author_name": md["author_name"], - "authored_date": md["authored_date"], - "message": md["message"], - "files": md["files"], - "path": ".git", - "path_prefix": ".git", - "ingested_at": int(time.time()), - }, + "metadata": md_payload, } pid = stable_id(md["commit_id"]) # 
deterministic per-commit point = models.PointStruct(id=pid, vector={vec_name: vec}, payload=payload) diff --git a/scripts/mcp_indexer_server.py b/scripts/mcp_indexer_server.py index 0acc9ec9..9754eaac 100644 --- a/scripts/mcp_indexer_server.py +++ b/scripts/mcp_indexer_server.py @@ -1962,6 +1962,16 @@ def _to_str_list(x): if include_snippet: compact = False + # Default behavior: exclude commit-history docs (which use path=".git") from + # generic repo_search calls, unless the caller explicitly asks for git + # content. This prevents normal code queries from surfacing commit-index + # points as if they were source files. + if (not language or language.lower() != "git") and ( + not kind or kind.lower() != "git_message" + ): + if ".git" not in not_globs: + not_globs.append(".git") + # Accept top-level alias `queries` as a drop-in for `query` # Many clients send queries=[...] instead of query=[...] if kwargs and "queries" in kwargs and kwargs.get("queries") is not None: @@ -2758,11 +2768,175 @@ async def search_importers_for( ) +@mcp.tool() +async def search_commits_for( + query: Any = None, + path: Any = None, + collection: Any = None, + limit: Any = None, + max_points: Any = None, +) -> Dict[str, Any]: + """Search git commit history indexed in Qdrant. + + What it does: + - Queries commit documents ingested by scripts/ingest_history.py + - Filters by optional file path (metadata.files contains path) + + Parameters: + - query: str or list[str]; matched lexically against commit message/text + - path: str (optional). Relative path under /work; filters commits that touched this file + - collection: str (optional). Defaults to env/WS collection + - limit: int (optional, default 10). Max commits to return + - max_points: int (optional). 
Safety cap on scanned points (default 1000) + + Returns: + - {"ok": true, "results": [{"commit_id", "author_name", "authored_date", "message", "files"}, ...], "scanned": int} + - On error: {"ok": false, "error": "..."} + """ + # Normalize inputs + # query may be a string ("ctx script build") or a list of terms; + # in both cases we normalize to lowercase tokens and require all of + # them to appear somewhere in the composite text. + q_terms: list[str] = [] + if isinstance(query, (list, tuple)): + for x in query: + for tok in str(x).strip().split(): + if tok.strip(): + q_terms.append(tok.strip().lower()) + elif query is not None: + qs = str(query).strip() + if qs: + for tok in qs.split(): + if tok.strip(): + q_terms.append(tok.strip().lower()) + p = str(path or "").strip() + coll = str(collection or "").strip() or _default_collection() + try: + lim = int(limit) if limit not in (None, "") else 10 + except (ValueError, TypeError): + lim = 10 + try: + mcap = int(max_points) if max_points not in (None, "") else 1000 + except (ValueError, TypeError): + mcap = 1000 + + try: + from qdrant_client import QdrantClient # type: ignore + from qdrant_client import models as qmodels # type: ignore + + client = QdrantClient( + url=QDRANT_URL, + api_key=os.environ.get("QDRANT_API_KEY"), + timeout=float(os.environ.get("QDRANT_TIMEOUT", "20") or 20), + ) + + # Restrict to commit documents ingested by ingest_history.py + filt = qmodels.Filter( + must=[ + qmodels.FieldCondition( + key="metadata.language", match=qmodels.MatchValue(value="git") + ), + qmodels.FieldCondition( + key="metadata.kind", match=qmodels.MatchValue(value="git_message") + ), + ] + ) + + page = None + scanned = 0 + out: list[dict[str, Any]] = [] + seen_ids: set[str] = set() + while scanned < mcap and len(seen_ids) < lim: + sc, page = await asyncio.to_thread( + lambda: client.scroll( + collection_name=coll, + with_payload=True, + with_vectors=False, + limit=200, + offset=page, + scroll_filter=filt, + ) + ) + if not 
sc: + break + for pt in sc: + scanned += 1 + if scanned > mcap: + break + payload = getattr(pt, "payload", {}) or {} + md = payload.get("metadata") or {} + msg = str(md.get("message") or "") + info = str(payload.get("information") or "") + files = md.get("files") or [] + try: + files_list = [str(f) for f in files] + except Exception: + files_list = [] + # Optional lineage-style metadata from ingest_history (GLM/decoder-backed) + lg = md.get("lineage_goal") + if isinstance(lg, str): + lineage_goal = lg.strip() + else: + lineage_goal = "" + ls_raw = md.get("lineage_symbols") or [] + if isinstance(ls_raw, list): + lineage_symbols = [ + str(x).strip() for x in ls_raw if str(x).strip() + ][:6] + else: + lineage_symbols = [] + lt_raw = md.get("lineage_tags") or [] + if isinstance(lt_raw, list): + lineage_tags = [ + str(x).strip() for x in lt_raw if str(x).strip() + ][:6] + else: + lineage_tags = [] + # Build a composite lowercase text blob for simple lexical matching + lineage_text_parts = [] + if lineage_goal: + lineage_text_parts.append(lineage_goal) + if lineage_symbols: + lineage_text_parts.extend(lineage_symbols) + if lineage_tags: + lineage_text_parts.extend(lineage_tags) + text_l = (msg + "\n" + info + "\n" + " ".join(lineage_text_parts)).lower() + if q_terms and not all(t in text_l for t in q_terms): + continue + if p: + # Require the path substring to appear in at least one touched file + if not any(p in f for f in files_list): + continue + cid = md.get("commit_id") or md.get("symbol") + scid = str(cid) if cid is not None else "" + if not scid or scid in seen_ids: + continue + seen_ids.add(scid) + out.append( + { + "commit_id": cid, + "author_name": md.get("author_name"), + "authored_date": md.get("authored_date"), + "message": msg.splitlines()[0] if msg else "", + "files": files_list, + "lineage_goal": lineage_goal, + "lineage_symbols": lineage_symbols, + "lineage_tags": lineage_tags, + } + ) + if len(seen_ids) >= lim: + break + return {"ok": True, "results": 
out, "scanned": scanned, "collection": coll} + except Exception as e: + return {"ok": False, "error": str(e), "collection": coll} + + @mcp.tool() async def change_history_for_path( path: Any, collection: Any = None, max_points: Any = None, + include_commits: Any = None, ) -> Dict[str, Any]: """Summarize recent change metadata for a file path from the index. @@ -2770,6 +2944,8 @@ async def change_history_for_path( - path: str. Relative path under /work. - collection: str (optional). Defaults to env/WS default. - max_points: int (optional). Safety cap on scanned points. + - include_commits: bool (optional). If true, attach a small list of recent commits + touching this path based on the commit index. Returns: - {"ok": true, "summary": {...}} or {"ok": false, "error": "..."}. @@ -2782,6 +2958,14 @@ async def change_history_for_path( mcap = int(max_points) if max_points not in (None, "") else 200 except (ValueError, TypeError): mcap = 200 + # Treat include_commits as a loose boolean flag + inc_commits = False + if include_commits not in (None, ""): + try: + inc_commits = str(include_commits).strip().lower() in {"1", "true", "yes", "on"} + except Exception: + inc_commits = False + try: from qdrant_client import QdrantClient # type: ignore from qdrant_client import models as qmodels # type: ignore @@ -2835,7 +3019,7 @@ async def change_history_for_path( total += 1 if total >= mcap: break - summary = { + summary: Dict[str, Any] = { "path": p, "points_scanned": total, "distinct_hashes": len(hashes), @@ -2845,6 +3029,30 @@ async def change_history_for_path( "ingested_max": max(ingested) if ingested else None, "churn_count_max": max(churns) if churns else None, } + if inc_commits: + try: + commits = await search_commits_for( + query=None, + path=p, + collection=coll, + limit=10, + max_points=1000, + ) + if isinstance(commits, dict) and commits.get("ok"): + raw = commits.get("results") or [] + seen: set[str] = set() + uniq: list[dict[str, Any]] = [] + for c in raw: + cid = 
c.get("commit_id") if isinstance(c, dict) else None + scid = str(cid) if cid is not None else "" + if not scid or scid in seen: + continue + seen.add(scid) + uniq.append(c) + summary["commits"] = uniq + except Exception: + # Best-effort: change-history summary is still useful without commit details + pass return {"ok": True, "summary": summary} except Exception as e: return {"ok": False, "error": str(e), "path": p} @@ -4432,6 +4640,7 @@ def _ca_prepare_filters_and_retrieve( ".kiro/", "node_modules/", ".git/", + ".git", ] def _variants(p: str) -> list[str]: diff --git a/scripts/remote_upload_client.py b/scripts/remote_upload_client.py index 449aa0bc..fe805e2d 100644 --- a/scripts/remote_upload_client.py +++ b/scripts/remote_upload_client.py @@ -20,6 +20,9 @@ import tempfile import logging import argparse +import subprocess +import shlex +import re from pathlib import Path, PurePosixPath from typing import Dict, List, Any, Optional, Tuple from datetime import datetime @@ -44,6 +47,219 @@ import scripts.ingest_code as idx +def _find_git_root(start: Path) -> Optional[Path]: + """Best-effort detection of the git repository root for a workspace. + + Walks up from the given path looking for a .git directory. Returns None if + no repo is found or git metadata is unavailable. + """ + try: + cur = start.resolve() + except Exception: + cur = start + try: + for p in [cur] + list(cur.parents): + try: + if (p / ".git").exists(): + return p + except Exception: + continue + except Exception: + return None + return None + + +def _redact_emails(text: str) -> str: + """Redact email addresses from commit messages for privacy.""" + try: + return re.sub( + r"[A-Za-z0-9._%+-]+@[A-Za-z0-9.-]+\.[A-Za-z]{2,}", "", text or "", + ) + except Exception: + return text + + +def _collect_git_history_for_workspace(workspace_path: str) -> Optional[Dict[str, Any]]: + """Best-effort collection of recent git history for a workspace. 
+ + Uses REMOTE_UPLOAD_GIT_MAX_COMMITS (0/empty disables) and + REMOTE_UPLOAD_GIT_SINCE (optional) to bound history. Returns a + serializable dict suitable for writing as metadata/git_history.json, or + None when git metadata is unavailable. + """ + # Read configuration from environment + try: + raw_max = (os.environ.get("REMOTE_UPLOAD_GIT_MAX_COMMITS", "") or "").strip() + max_commits = int(raw_max) if raw_max else 0 + except Exception: + max_commits = 0 + since = (os.environ.get("REMOTE_UPLOAD_GIT_SINCE", "") or "").strip() + force_full = str(os.environ.get("REMOTE_UPLOAD_GIT_FORCE", "") or "").strip().lower() in { + "1", + "true", + "yes", + "on", + } + + if max_commits <= 0: + return None + + root = _find_git_root(Path(workspace_path)) + if not root: + return None + + # Git history cache: avoid emitting identical manifests when HEAD/settings are unchanged + base = Path(os.environ.get("WORKSPACE_PATH") or workspace_path).resolve() + git_cache_path = base / ".context-engine" / "git_history_cache.json" + current_head = "" + try: + head_proc = subprocess.run( + ["git", "rev-parse", "HEAD"], + cwd=str(root), + stdout=subprocess.PIPE, + stderr=subprocess.PIPE, + text=True, + ) + if head_proc.returncode == 0 and head_proc.stdout.strip(): + current_head = head_proc.stdout.strip() + except Exception: + current_head = "" + + cache: Dict[str, Any] = {} + if not force_full: + try: + if git_cache_path.exists(): + with git_cache_path.open("r", encoding="utf-8") as f: + obj = json.load(f) + if isinstance(obj, dict): + cache = obj + except Exception: + cache = {} + + if current_head and cache.get("last_head") == current_head and cache.get("max_commits") == max_commits and str(cache.get("since") or "") == since: + return None + + # Build git rev-list command (simple HEAD-based history) + cmd: List[str] = ["git", "rev-list", "--no-merges"] + if since: + cmd.append(f"--since={since}") + cmd.append("HEAD") + + try: + proc = subprocess.run( + cmd, + cwd=str(root), + 
stdout=subprocess.PIPE, + stderr=subprocess.PIPE, + text=True, + ) + if proc.returncode != 0 or not proc.stdout.strip(): + return None + commits = [l.strip() for l in proc.stdout.splitlines() if l.strip()] + except Exception: + return None + + if not commits: + return None + if len(commits) > max_commits: + commits = commits[:max_commits] + + records: List[Dict[str, Any]] = [] + for sha in commits: + try: + fmt = "%H%x1f%an%x1f%ae%x1f%ad%x1f%s%x1f%b" + show_proc = subprocess.run( + ["git", "show", "-s", f"--format={fmt}", sha], + cwd=str(root), + stdout=subprocess.PIPE, + stderr=subprocess.PIPE, + text=True, + ) + if show_proc.returncode != 0 or not show_proc.stdout.strip(): + continue + parts = show_proc.stdout.strip().split("\x1f") + c_sha, an, _ae, ad, subj, body = (parts + [""] * 6)[:6] + + files_proc = subprocess.run( + ["git", "diff-tree", "--no-commit-id", "--name-only", "-r", sha], + cwd=str(root), + stdout=subprocess.PIPE, + stderr=subprocess.PIPE, + text=True, + ) + files: List[str] = [] + if files_proc.returncode == 0 and files_proc.stdout: + files = [f for f in files_proc.stdout.splitlines() if f] + + diff_text = "" + try: + diff_proc = subprocess.run( + ["git", "show", "--stat", "--patch", "--unified=3", sha], + cwd=str(root), + stdout=subprocess.PIPE, + stderr=subprocess.PIPE, + text=True, + ) + if diff_proc.returncode == 0 and diff_proc.stdout: + try: + max_chars = int(os.environ.get("COMMIT_SUMMARY_DIFF_CHARS", "6000") or 6000) + except Exception: + max_chars = 6000 + diff_text = diff_proc.stdout[:max_chars] + except Exception: + diff_text = "" + + msg = _redact_emails((subj + ("\n" + body if body else "")).strip()) + if len(msg) > 2000: + msg = msg[:2000] + "\u2026" + + records.append( + { + "commit_id": c_sha or sha, + "author_name": an, + "authored_date": ad, + "message": msg, + "files": files, + "diff": diff_text, + } + ) + except Exception: + continue + + if not records: + return None + + try: + repo_name = root.name + except Exception: + 
repo_name = "workspace" + + manifest = { + "version": 1, + "repo_name": repo_name, + "generated_at": datetime.now().isoformat(), + "max_commits": max_commits, + "since": since, + "commits": records, + } + + # Update git history cache with the HEAD and settings used for this manifest + try: + git_cache_path.parent.mkdir(parents=True, exist_ok=True) + cache_out = { + "last_head": current_head or (commits[0] if commits else ""), + "max_commits": max_commits, + "since": since, + "updated_at": datetime.now().isoformat(), + } + with git_cache_path.open("w", encoding="utf-8") as f: + json.dump(cache_out, f, indent=2) + except Exception: + pass + + return manifest + + def _load_local_cache_file_hashes(workspace_path: str, repo_name: Optional[str]) -> Dict[str, str]: """Best-effort read of the local cache.json file_hashes map. @@ -506,6 +722,17 @@ def create_delta_bundle(self, changes: Dict[str, List]) -> Tuple[str, Dict[str, } (metadata_dir / "hashes.json").write_text(json.dumps(hashes_metadata, indent=2)) + # Optional: attach recent git history for this workspace + try: + git_history = _collect_git_history_for_workspace(self.workspace_path) + if git_history: + (metadata_dir / "git_history.json").write_text( + json.dumps(git_history, indent=2) + ) + except Exception: + # Best-effort only; never fail bundle creation on git history issues + pass + # Create tarball in temporary directory temp_bundle_dir = self._get_temp_bundle_dir() bundle_path = temp_bundle_dir / f"{bundle_id}.tar.gz" diff --git a/scripts/standalone_upload_client.py b/scripts/standalone_upload_client.py index d3e7a1ba..e6bb9a55 100644 --- a/scripts/standalone_upload_client.py +++ b/scripts/standalone_upload_client.py @@ -19,6 +19,8 @@ import tempfile import logging import argparse +import subprocess +import re from pathlib import Path, PurePosixPath from typing import Dict, List, Any, Optional, Tuple from datetime import datetime @@ -231,6 +233,219 @@ def remove_cached_file(file_path: str, repo_name: 
Optional[str] = None) -> None: _hash_cache.remove_hash(file_path) +def _find_git_root(start: Path) -> Optional[Path]: + """Best-effort detection of the git repository root for a workspace. + + Walks up from the given path looking for a .git directory. Returns None if + no repo is found or git metadata is unavailable. + """ + try: + cur = start.resolve() + except Exception: + cur = start + try: + for p in [cur] + list(cur.parents): + try: + if (p / ".git").exists(): + return p + except Exception: + continue + except Exception: + return None + return None + + +def _redact_emails(text: str) -> str: + """Redact email addresses from commit messages for privacy.""" + try: + return re.sub( + r"[A-Za-z0-9._%+-]+@[A-Za-z0-9.-]+\.[A-Za-z]{2,}", "", text or "", + ) + except Exception: + return text + + +def _collect_git_history_for_workspace(workspace_path: str) -> Optional[Dict[str, Any]]: + """Best-effort collection of recent git history for a workspace. + + Uses REMOTE_UPLOAD_GIT_MAX_COMMITS (0/empty disables) and + REMOTE_UPLOAD_GIT_SINCE (optional) to bound history. Returns a + serializable dict suitable for writing as metadata/git_history.json, or + None when git metadata is unavailable. 
+ """ + # Read configuration from environment + try: + raw_max = (os.environ.get("REMOTE_UPLOAD_GIT_MAX_COMMITS", "") or "").strip() + max_commits = int(raw_max) if raw_max else 0 + except Exception: + max_commits = 0 + since = (os.environ.get("REMOTE_UPLOAD_GIT_SINCE", "") or "").strip() + force_full = str(os.environ.get("REMOTE_UPLOAD_GIT_FORCE", "") or "").strip().lower() in { + "1", + "true", + "yes", + "on", + } + + if max_commits <= 0: + return None + + root = _find_git_root(Path(workspace_path)) + if not root: + return None + + # Git history cache: avoid emitting identical manifests when HEAD/settings are unchanged + base = Path(os.environ.get("WORKSPACE_PATH") or workspace_path).resolve() + git_cache_path = base / ".context-engine" / "git_history_cache.json" + current_head = "" + try: + head_proc = subprocess.run( + ["git", "rev-parse", "HEAD"], + cwd=str(root), + stdout=subprocess.PIPE, + stderr=subprocess.PIPE, + text=True, + ) + if head_proc.returncode == 0 and head_proc.stdout.strip(): + current_head = head_proc.stdout.strip() + except Exception: + current_head = "" + + cache: Dict[str, Any] = {} + if not force_full: + try: + if git_cache_path.exists(): + with git_cache_path.open("r", encoding="utf-8") as f: + obj = json.load(f) + if isinstance(obj, dict): + cache = obj + except Exception: + cache = {} + + if current_head and cache.get("last_head") == current_head and cache.get("max_commits") == max_commits and str(cache.get("since") or "") == since: + return None + + # Build git rev-list command (simple HEAD-based history) + cmd: List[str] = ["git", "rev-list", "--no-merges"] + if since: + cmd.append(f"--since={since}") + cmd.append("HEAD") + + try: + proc = subprocess.run( + cmd, + cwd=str(root), + stdout=subprocess.PIPE, + stderr=subprocess.PIPE, + text=True, + ) + if proc.returncode != 0 or not proc.stdout.strip(): + return None + commits = [l.strip() for l in proc.stdout.splitlines() if l.strip()] + except Exception: + return None + + if not 
commits: + return None + if len(commits) > max_commits: + commits = commits[:max_commits] + + records: List[Dict[str, Any]] = [] + for sha in commits: + try: + fmt = "%H%x1f%an%x1f%ae%x1f%ad%x1f%s%x1f%b" + show_proc = subprocess.run( + ["git", "show", "-s", f"--format={fmt}", sha], + cwd=str(root), + stdout=subprocess.PIPE, + stderr=subprocess.PIPE, + text=True, + ) + if show_proc.returncode != 0 or not show_proc.stdout.strip(): + continue + parts = show_proc.stdout.strip().split("\x1f") + c_sha, an, _ae, ad, subj, body = (parts + [""] * 6)[:6] + + files_proc = subprocess.run( + ["git", "diff-tree", "--no-commit-id", "--name-only", "-r", sha], + cwd=str(root), + stdout=subprocess.PIPE, + stderr=subprocess.PIPE, + text=True, + ) + files: List[str] = [] + if files_proc.returncode == 0 and files_proc.stdout: + files = [f for f in files_proc.stdout.splitlines() if f] + + diff_text = "" + try: + diff_proc = subprocess.run( + ["git", "show", "--stat", "--patch", "--unified=3", sha], + cwd=str(root), + stdout=subprocess.PIPE, + stderr=subprocess.PIPE, + text=True, + ) + if diff_proc.returncode == 0 and diff_proc.stdout: + try: + max_chars = int(os.environ.get("COMMIT_SUMMARY_DIFF_CHARS", "6000") or 6000) + except Exception: + max_chars = 6000 + diff_text = diff_proc.stdout[:max_chars] + except Exception: + diff_text = "" + + msg = _redact_emails((subj + ("\n" + body if body else "")).strip()) + if len(msg) > 2000: + msg = msg[:2000] + "\u2026" + + records.append( + { + "commit_id": c_sha or sha, + "author_name": an, + "authored_date": ad, + "message": msg, + "files": files, + "diff": diff_text, + } + ) + except Exception: + continue + + if not records: + return None + + try: + repo_name = root.name + except Exception: + repo_name = "workspace" + + manifest = { + "version": 1, + "repo_name": repo_name, + "generated_at": datetime.now().isoformat(), + "max_commits": max_commits, + "since": since, + "commits": records, + } + + # Update git history cache with the HEAD and 
settings used for this manifest + try: + git_cache_path.parent.mkdir(parents=True, exist_ok=True) + cache_out = { + "last_head": current_head or (commits[0] if commits else ""), + "max_commits": max_commits, + "since": since, + "updated_at": datetime.now().isoformat(), + } + with git_cache_path.open("w", encoding="utf-8") as f: + json.dump(cache_out, f, indent=2) + except Exception: + pass + + return manifest + + class RemoteUploadClient: """Client for uploading delta bundles to remote server.""" @@ -661,6 +876,15 @@ def create_delta_bundle(self, changes: Dict[str, List]) -> Tuple[str, Dict[str, } (metadata_dir / "hashes.json").write_text(json.dumps(hashes_metadata, indent=2)) + try: + git_history = _collect_git_history_for_workspace(self.workspace_path) + if git_history: + (metadata_dir / "git_history.json").write_text( + json.dumps(git_history, indent=2) + ) + except Exception: + pass + # Create tarball in temporary directory temp_bundle_dir = self._get_temp_bundle_dir() bundle_path = temp_bundle_dir / f"{bundle_id}.tar.gz" diff --git a/scripts/upload_service.py b/scripts/upload_service.py index f9eb416b..b5095034 100644 --- a/scripts/upload_service.py +++ b/scripts/upload_service.py @@ -214,6 +214,28 @@ def process_delta_bundle(workspace_path: str, bundle_path: Path, manifest: Dict[ operations_data = json.loads(ops_file.read().decode('utf-8')) operations = operations_data.get("operations", []) + # Best-effort: extract git history metadata for watcher to ingest + try: + git_member = None + for member in tar.getnames(): + if member.endswith("metadata/git_history.json"): + git_member = member + break + if git_member: + git_file = tar.extractfile(git_member) + if git_file: + history_bytes = git_file.read() + history_dir = workspace / ".remote-git" + history_dir.mkdir(parents=True, exist_ok=True) + bundle_id = manifest.get("bundle_id") or "unknown" + history_path = history_dir / f"git_history_{bundle_id}.json" + try: + history_path.write_bytes(history_bytes) + except 
Exception as write_err: + logger.debug(f"[upload_service] Failed to write git history manifest: {write_err}") + except Exception as git_err: + logger.debug(f"[upload_service] Error extracting git history metadata: {git_err}") + # Process each operation for operation in operations: op_type = operation.get("operation") diff --git a/scripts/watch_index.py b/scripts/watch_index.py index 41c33d80..845a022c 100644 --- a/scripts/watch_index.py +++ b/scripts/watch_index.py @@ -2,8 +2,10 @@ import os import time import threading +import json +import subprocess from pathlib import Path -from typing import Optional, Set +from typing import Optional, Set, Dict, List, Any from qdrant_client import QdrantClient, models from fastembed import TextEmbedding @@ -237,6 +239,9 @@ def _maybe_enqueue(self, src_path: str): rel_dir = "/" if self.excl.exclude_dir(rel_dir): return + if any(part == ".remote-git" for part in p.parts) and p.suffix.lower() == ".json": + self.queue.add(p) + return # only code files if p.suffix.lower() not in idx.CODE_EXTS: return @@ -760,6 +765,39 @@ def main(): obs.join() +def _process_git_history_manifest( + p: Path, + client, + model, + collection: str, + vector_name: str, + repo_name: Optional[str], +): + try: + import sys + + script = ROOT_DIR / "scripts" / "ingest_history.py" + if not script.exists(): + return + cmd = [sys.executable or "python3", str(script), "--manifest-json", str(p)] + env = os.environ.copy() + if collection: + env["COLLECTION_NAME"] = collection + if QDRANT_URL: + env["QDRANT_URL"] = QDRANT_URL + if repo_name: + env["REPO_NAME"] = repo_name + try: + print( + f"[git_history_manifest] launching ingest_history.py for {p} collection={collection} repo={repo_name}" + ) + except Exception: + pass + subprocess.Popen(cmd, env=env) + except Exception: + return + + def _process_paths(paths, client, model, vector_name: str, model_dim: int, workspace_path: str): unique_paths = sorted(set(Path(x) for x in paths)) if not unique_paths: @@ -798,6 
+836,27 @@ def _process_paths(paths, client, model, vector_name: str, model_dim: int, works repo_name = _extract_repo_name_from_path(repo_key) collection = _get_collection_for_file(p) + if ".remote-git" in p.parts and p.suffix.lower() == ".json": + try: + _process_git_history_manifest(p, client, model, collection, vector_name, repo_name) + except Exception as e: + try: + print(f"[commit_ingest_error] {p}: {e}") + except Exception: + pass + repo_progress[repo_key] = repo_progress.get(repo_key, 0) + 1 + try: + _update_progress( + repo_key, + started_at, + repo_progress[repo_key], + len(repo_files), + p, + ) + except Exception: + pass + continue + if not p.exists(): if client is not None: try: diff --git a/vscode-extension/context-engine-uploader/README.md b/vscode-extension/context-engine-uploader/README.md index 356a3d84..159672e8 100644 --- a/vscode-extension/context-engine-uploader/README.md +++ b/vscode-extension/context-engine-uploader/README.md @@ -23,6 +23,9 @@ Configuration - **CTX + GLM settings:** - `contextEngineUploader.ctxIndexerUrl` is copied into `.env` (as `MCP_INDEXER_URL`) so the embedded `ctx.py` knows which MCP indexer to call when enhancing prompts. - `contextEngineUploader.glmApiKey`, `glmApiBase`, and `glmModel` are used when scaffolding `ctx_config.json`/`.env` to pre-fill GLM decoder options. Existing non-placeholder values are preserved, so you can override them in the files at any time. +- **Git history upload settings:** + - `contextEngineUploader.gitMaxCommits` controls `REMOTE_UPLOAD_GIT_MAX_COMMITS`, bounding how many commits the upload client includes per bundle (set to 0 to disable git history). + - `contextEngineUploader.gitSince` controls `REMOTE_UPLOAD_GIT_SINCE`, letting you constrain the git log window (e.g. `2 years ago` or `2023-01-01`). - **Context scaffolding:** - `contextEngineUploader.scaffoldCtxConfig` (default `true`) controls whether the extension keeps a minimal `ctx_config.json` + `.env` in sync with your workspace. 
When enabled, running `Write MCP Config` or `Write CTX Config` will reuse the workspace’s existing files (if present) and only backfill placeholder or missing values from the bundled `env.example` plus the inferred collection name. Existing custom values are preserved. - The scaffolder also enforces CTX defaults (e.g., `MULTI_REPO_MODE=1`, `REFRAG_RUNTIME=glm`, `REFRAG_DECODER=1`) so the embedded `ctx.py` is ready for remote uploads, regardless of the “Use GLM Decoder” toggle. @@ -44,6 +47,7 @@ Commands - Status-bar button (`Index Codebase`) mirrors Start/Stop/Restart/Index status, while the `Prompt+` status button runs the ctx rewrite command on the current selection. - `Context Engine Uploader: Write MCP Config (.mcp.json)` writes or updates a project-local `.mcp.json` with MCP server entries for the Qdrant indexer and memory/search endpoints, using the configured MCP URLs. - `Context Engine Uploader: Write CTX Config (ctx_config.json/.env)` scaffolds the ctx config + env files as described above. This command runs automatically after `Write MCP Config` if scaffolding is enabled, but it is also exposed in the Command Palette for manual use. +- `Context Engine Uploader: Upload Git History (force sync bundle)` triggers a one-off force sync using the configured git history settings, producing a bundle that includes a `metadata/git_history.json` manifest for remote lineage ingestion. Logs ---- diff --git a/vscode-extension/context-engine-uploader/extension.js b/vscode-extension/context-engine-uploader/extension.js index deb9d355..372f1359 100644 --- a/vscode-extension/context-engine-uploader/extension.js +++ b/vscode-extension/context-engine-uploader/extension.js @@ -69,6 +69,11 @@ function activate(context) { } runSequence('force').catch(error => log(`Index failed: ${error instanceof Error ? 
error.message : String(error)}`)); }); + const uploadGitHistoryDisposable = vscode.commands.registerCommand('contextEngineUploader.uploadGitHistory', () => { + vscode.window.showInformationMessage('Context Engine git history upload (force sync) started.'); + if (outputChannel) { outputChannel.show(true); } + runSequence('force').catch(error => log(`Git history upload failed: ${error instanceof Error ? error.message : String(error)}`)); + }); const ctxConfigDisposable = vscode.commands.registerCommand('contextEngineUploader.writeCtxConfig', () => { writeCtxConfig().catch(error => log(`CTX config write failed: ${error instanceof Error ? error.message : String(error)}`)); }); @@ -118,6 +123,7 @@ function activate(context) { stopDisposable, restartDisposable, indexDisposable, + uploadGitHistoryDisposable, showLogsDisposable, promptEnhanceDisposable, mcpConfigDisposable, @@ -509,7 +515,9 @@ function setStatusBarState(mode) { function runOnce(options) { return new Promise(resolve => { const args = buildArgs(options, 'force'); - const child = spawn(options.pythonPath, args, { cwd: options.workingDirectory, env: buildChildEnv(options) }); + const baseEnv = buildChildEnv(options); + const childEnv = { ...baseEnv, REMOTE_UPLOAD_GIT_FORCE: '1' }; + const child = spawn(options.pythonPath, args, { cwd: options.workingDirectory, env: childEnv }); forceProcess = child; attachOutput(child, 'force'); let finished = false; @@ -886,6 +894,15 @@ function buildChildEnv(options) { env.DEV_REMOTE_MODE = '1'; log('Context Engine Uploader: devRemoteMode enabled (REMOTE_UPLOAD_MODE=development, DEV_REMOTE_MODE=1).'); } + const gitMaxCommits = settings.get('gitMaxCommits'); + if (typeof gitMaxCommits === 'number' && !Number.isNaN(gitMaxCommits)) { + env.REMOTE_UPLOAD_GIT_MAX_COMMITS = String(gitMaxCommits); + } + const gitSinceRaw = settings.get('gitSince'); + const gitSince = typeof gitSinceRaw === 'string' ? 
gitSinceRaw.trim() : ''; + if (gitSince) { + env.REMOTE_UPLOAD_GIT_SINCE = gitSince; + } } catch (error) { log(`Failed to read devRemoteMode setting: ${error instanceof Error ? error.message : String(error)}`); } @@ -1046,6 +1063,8 @@ async function scaffoldCtxConfigFiles(workspaceDir, collectionName) { let glmApiKey = ''; let glmApiBase = 'https://api.z.ai/api/coding/paas/v4/'; let glmModel = 'glm-4.6'; + let gitMaxCommits = 500; + let gitSince = ''; if (uploaderSettings) { try { const runtimeSetting = String(uploaderSettings.get('decoderRuntime') ?? 'glm').trim().toLowerCase(); @@ -1065,6 +1084,14 @@ async function scaffoldCtxConfigFiles(workspaceDir, collectionName) { if (cfgModel) { glmModel = cfgModel; } + const maxCommitsSetting = uploaderSettings.get('gitMaxCommits'); + if (typeof maxCommitsSetting === 'number' && !Number.isNaN(maxCommitsSetting)) { + gitMaxCommits = maxCommitsSetting; + } + const sinceSetting = uploaderSettings.get('gitSince'); + if (typeof sinceSetting === 'string') { + gitSince = sinceSetting.trim(); + } } catch (error) { log(`Failed to read decoder/GLM settings from configuration: ${error instanceof Error ? 
error.message : String(error)}`); } @@ -1311,6 +1338,13 @@ async function scaffoldCtxConfigFiles(workspaceDir, collectionName) { } } + if (typeof gitMaxCommits === 'number' && !Number.isNaN(gitMaxCommits)) { + upsertEnv('REMOTE_UPLOAD_GIT_MAX_COMMITS', String(gitMaxCommits), { overwrite: true }); + } + if (gitSince) { + upsertEnv('REMOTE_UPLOAD_GIT_SINCE', gitSince, { overwrite: true, skipIfDesiredEmpty: true }); + } + if (envChanged) { fs.writeFileSync(envPath, envLines.join('\n') + '\n', 'utf8'); log(`Ensured decoder/GLM/MCP settings in .env at ${envPath}`); diff --git a/vscode-extension/context-engine-uploader/package.json b/vscode-extension/context-engine-uploader/package.json index 47e29a64..6133c63d 100644 --- a/vscode-extension/context-engine-uploader/package.json +++ b/vscode-extension/context-engine-uploader/package.json @@ -37,6 +37,10 @@ "command": "contextEngineUploader.indexCodebase", "title": "Context Engine Uploader: Index Codebase" }, + { + "command": "contextEngineUploader.uploadGitHistory", + "title": "Context Engine Uploader: Upload Git History (force sync bundle)" + }, { "command": "contextEngineUploader.writeMcpConfig", "title": "Context Engine Uploader: Write MCP Config (.mcp.json)" @@ -205,6 +209,17 @@ "type": "string", "default": "glm-4.6", "description": "GLM model name (GLM_MODEL) used by refrag_glm/ctx.py when REFRAG_RUNTIME=glm." + }, + "contextEngineUploader.gitMaxCommits": { + "type": "number", + "default": 500, + "minimum": 0, + "description": "Upper bound for REMOTE_UPLOAD_GIT_MAX_COMMITS passed to the upload clients via .env/child process environment. Set to 0 or a negative value to disable git history collection." + }, + "contextEngineUploader.gitSince": { + "type": "string", + "default": "", + "description": "Optional REMOTE_UPLOAD_GIT_SINCE constraint (e.g. '2 years ago' or '2023-01-01') passed to the upload clients via .env/child process environment. Leave empty to use the clients' default behavior." 
} } } From 2253cb5b39514c161f58011a7a6448e35e3397bf Mon Sep 17 00:00:00 2001 From: Reese Date: Thu, 27 Nov 2025 09:17:22 +0000 Subject: [PATCH 06/25] Optimizes git history collection Improves efficiency of git history collection by fetching only the commits since the last successful upload, instead of always starting from the beginning. It constructs a `git rev-list` command that fetches only the commits between the previous HEAD and the current HEAD, if both are available and different. This reduces the amount of data that needs to be processed, improving performance. If either HEAD is missing or they are the same, the command defaults to fetching the entire history from HEAD. --- scripts/remote_upload_client.py | 14 +++++++++++++- scripts/standalone_upload_client.py | 14 +++++++++++++- 2 files changed, 26 insertions(+), 2 deletions(-) diff --git a/scripts/remote_upload_client.py b/scripts/remote_upload_client.py index fe805e2d..1166c666 100644 --- a/scripts/remote_upload_client.py +++ b/scripts/remote_upload_client.py @@ -139,11 +139,23 @@ def _collect_git_history_for_workspace(workspace_path: str) -> Optional[Dict[str if current_head and cache.get("last_head") == current_head and cache.get("max_commits") == max_commits and str(cache.get("since") or "") == since: return None + base_head = "" + if not force_full: + try: + prev_head = str(cache.get("last_head") or "").strip() + if current_head and prev_head and prev_head != current_head: + base_head = prev_head + except Exception: + base_head = "" + # Build git rev-list command (simple HEAD-based history) cmd: List[str] = ["git", "rev-list", "--no-merges"] if since: cmd.append(f"--since={since}") - cmd.append("HEAD") + if base_head and current_head: + cmd.append(f"{base_head}..{current_head}") + else: + cmd.append("HEAD") try: proc = subprocess.run( diff --git a/scripts/standalone_upload_client.py b/scripts/standalone_upload_client.py index e6bb9a55..7707e065 100644 --- a/scripts/standalone_upload_client.py +++ 
b/scripts/standalone_upload_client.py @@ -325,11 +325,23 @@ def _collect_git_history_for_workspace(workspace_path: str) -> Optional[Dict[str if current_head and cache.get("last_head") == current_head and cache.get("max_commits") == max_commits and str(cache.get("since") or "") == since: return None + base_head = "" + if not force_full: + try: + prev_head = str(cache.get("last_head") or "").strip() + if current_head and prev_head and prev_head != current_head: + base_head = prev_head + except Exception: + base_head = "" + # Build git rev-list command (simple HEAD-based history) cmd: List[str] = ["git", "rev-list", "--no-merges"] if since: cmd.append(f"--since={since}") - cmd.append("HEAD") + if base_head and current_head: + cmd.append(f"{base_head}..{current_head}") + else: + cmd.append("HEAD") try: proc = subprocess.run( From 3dd79b130070a4c6de11eb63eac3236428386076 Mon Sep 17 00:00:00 2001 From: Reese Date: Thu, 27 Nov 2025 10:39:49 +0000 Subject: [PATCH 07/25] Improves memory restoration during reindexing Adds a `STRICT_MEMORY_RESTORE` option to control whether memory restoration failures should halt the reindexing process. Introduces a comprehensive test suite for memory backup and restore operations and fixes an issue where point IDs were not correctly converted to integers during restoration. 
--- .env.example | 2 + scripts/ingest_code.py | 51 ++-- scripts/memory_restore.py | 12 +- .../test_collection_memory_backup_restore.py | 233 ++++++++++++++++++ 4 files changed, 281 insertions(+), 17 deletions(-) create mode 100644 tests/test_collection_memory_backup_restore.py diff --git a/.env.example b/.env.example index 0b91bb6d..8d7f8f2f 100644 --- a/.env.example +++ b/.env.example @@ -176,3 +176,5 @@ SMART_SYMBOL_REINDEXING=0 # REMOTE_UPLOAD_GIT_SINCE= # Enable commit lineage goals for indexing REFRAG_COMMIT_DESCRIBE=1 + +STRICT_MEMORY_RESTORE=0 \ No newline at end of file diff --git a/scripts/ingest_code.py b/scripts/ingest_code.py index 9f34d987..9bc79139 100644 --- a/scripts/ingest_code.py +++ b/scripts/ingest_code.py @@ -864,6 +864,8 @@ def ensure_collection(client: QdrantClient, name: str, dim: int, vector_name: st Always includes dense (vector_name) and lexical (LEX_VECTOR_NAME). When REFRAG_MODE=1, also includes a compact mini vector (MINI_VECTOR_NAME). """ + # Track backup file path for this ensure_collection call (per-collection, per-process) + backup_file = None try: info = client.get_collection(name) # Prevent I/O storm - only update vectors if they actually don't exist @@ -950,8 +952,7 @@ def ensure_collection(client: QdrantClient, name: str, dim: int, vector_name: st pass # Store backup info for restoration - if backup_file: - setattr(CollectionNeedsRecreateError, 'backup_file', backup_file) + # backup_file remains bound for this function call; used after collection creation # Proceed to recreate with full vector configuration raise CollectionNeedsRecreateError(f"Collection {name} needs recreation for new vectors") @@ -995,8 +996,14 @@ def ensure_collection(client: QdrantClient, name: str, dim: int, vector_name: st print(f"[COLLECTION_INFO] Successfully created new collection {name} with vectors: {list(vectors_cfg.keys())}") # Restore memories if we have a backup from recreation using dedicated restore script + strict_restore = False + try: + val 
= os.environ.get("STRICT_MEMORY_RESTORE", "") + strict_restore = str(val or "").strip().lower() in {"1", "true", "yes", "on"} + except Exception: + strict_restore = False + try: - backup_file = getattr(CollectionNeedsRecreateError, 'backup_file', None) if backup_file and os.path.exists(backup_file): print(f"[MEMORY_RESTORE] Restoring memories from {backup_file}") import subprocess @@ -1004,35 +1011,47 @@ def ensure_collection(client: QdrantClient, name: str, dim: int, vector_name: st # Use battle-tested restore script (skip collection creation since ingest_code.py already handles it) restore_script = Path(__file__).parent / "memory_restore.py" - result = subprocess.run([ - sys.executable, str(restore_script), - "--backup", backup_file, - "--collection", name, - "--skip-collection-creation" - ], capture_output=True, text=True, cwd=Path(__file__).parent.parent) + result = subprocess.run( + [ + sys.executable, + str(restore_script), + "--backup", + backup_file, + "--collection", + name, + "--skip-collection-creation", + ], + capture_output=True, + text=True, + cwd=Path(__file__).parent.parent, + ) if result.returncode == 0: print(f"[MEMORY_RESTORE] Successfully restored memories using {restore_script.name}") else: - print(f"[MEMORY_RESTORE_WARNING] Restore script failed: {result.stderr}") + msg = result.stderr or result.stdout or "unknown error" + print(f"[MEMORY_RESTORE_WARNING] Restore script failed: {msg}") + if strict_restore: + raise RuntimeError(f"Memory restore failed for collection {name}: {msg}") - # Clean up backup file and reset class attribute + # Clean up backup file once we've attempted restore try: os.unlink(backup_file) print(f"[MEMORY_RESTORE] Cleaned up backup file {backup_file}") except Exception: pass - # Reset the backup file attribute to prevent accidental reuse - setattr(CollectionNeedsRecreateError, 'backup_file', None) + finally: + backup_file = None elif backup_file: print(f"[MEMORY_RESTORE_WARNING] Backup file {backup_file} not found") - # 
Reset the backup file attribute even if file not found - setattr(CollectionNeedsRecreateError, 'backup_file', None) + backup_file = None except Exception as restore_e: print(f"[MEMORY_RESTORE_ERROR] Failed to restore memories: {restore_e}") - # Continue even if restore fails - indexing is more important + # Optionally fail hard when STRICT_MEMORY_RESTORE is enabled + if strict_restore: + raise def recreate_collection(client: QdrantClient, name: str, dim: int, vector_name: str): diff --git a/scripts/memory_restore.py b/scripts/memory_restore.py index 27fb34d5..c2f4c01e 100644 --- a/scripts/memory_restore.py +++ b/scripts/memory_restore.py @@ -208,7 +208,17 @@ def restore_memories( batch_points = [] for memory in batch: - memory_id = memory.get("id", "") + raw_id = memory.get("id", "") + + # Qdrant HTTP API expects point IDs to be either an unsigned integer + # or a UUID string. Backups store IDs as strings, so we convert + # purely numeric IDs back to integers to match the original type. 
+ memory_id = raw_id + try: + if isinstance(raw_id, str) and raw_id.isdigit(): + memory_id = int(raw_id) + except Exception: + memory_id = raw_id # Skip if already exists if skip_existing and memory_id in existing_ids: diff --git a/tests/test_collection_memory_backup_restore.py b/tests/test_collection_memory_backup_restore.py new file mode 100644 index 00000000..6808a222 --- /dev/null +++ b/tests/test_collection_memory_backup_restore.py @@ -0,0 +1,233 @@ +import os +import uuid +import importlib +import subprocess +from types import SimpleNamespace + +import pytest +from qdrant_client import QdrantClient, models + +# Reuse the existing Qdrant testcontainer fixture +from tests.test_integration_qdrant import qdrant_container # noqa: F401 + + +ing = importlib.import_module("scripts.ingest_code") +mem_backup = importlib.import_module("scripts.memory_backup") +mem_restore = importlib.import_module("scripts.memory_restore") + +pytestmark = pytest.mark.integration + + +def _create_collection_with_memory(qdrant_url: str, name: str, dim: int = 8) -> QdrantClient: + """Create a collection with dense+lex vectors and a single memory point. + + The collection is intentionally created without the ReFRAG mini vector so that + ensure_collection(..., REFRAG_MODE=1) must add it, exercising the + backup/recreate/restore path. + """ + client = QdrantClient(url=qdrant_url) + + vectors_cfg = { + "code": models.VectorParams(size=dim, distance=models.Distance.COSINE), + ing.LEX_VECTOR_NAME: models.VectorParams( + size=ing.LEX_VECTOR_DIM, distance=models.Distance.COSINE + ), + } + client.create_collection(collection_name=name, vectors_config=vectors_cfg) + + # One "memory" point (no metadata.path) and one code point (with path). + # Use integer point IDs to match Qdrant's accepted ID types. 
+ points = [ + models.PointStruct( + id=1, + vector={"code": [0.1] * dim}, + payload={"information": "test memory", "metadata": {}}, + ), + models.PointStruct( + id=2, + vector={"code": [0.2] * dim}, + payload={ + "information": "code chunk", + # Mark as real code: has a path and language/kind so is_memory_point() returns False + "metadata": {"path": "/tmp/example.py", "language": "python", "kind": "code"}, + }, + ), + ] + client.upsert(collection_name=name, points=points) + return client + + +def _get_point_ids(client: QdrantClient, collection_name: str) -> set[str]: + pts, _ = client.scroll( + collection_name=collection_name, + limit=None, + with_payload=False, + with_vectors=False, + ) + return {str(p.id) for p in pts} + + +def test_memory_backup_restore_happy_path(qdrant_container, monkeypatch): + """ensure_collection should backup, recreate, and restore memories. + + Scenario: + - Start with a collection that has dense+lex vectors and at least one + "memory" point. + - Enable REFRAG_MODE so ensure_collection wants to add the mini vector. + - Qdrant will reject adding a new vector name via update_collection, so we + exercise the backup -> delete -> recreate -> restore path. + - In tolerant mode (STRICT_MEMORY_RESTORE not set / 0) indexing should + succeed and the memory should still be present. 
+ """ + os.environ["QDRANT_URL"] = qdrant_container + collection = f"test-mem-{uuid.uuid4().hex[:8]}" + + client = _create_collection_with_memory(qdrant_container, collection, dim=8) + + # Force ReFRAG on so ensure_collection tries to add MINI_VECTOR_NAME + os.environ["REFRAG_MODE"] = "1" + os.environ.pop("STRICT_MEMORY_RESTORE", None) + + # Run ensure_collection: this should trigger backup + recreate + restore + ing.ensure_collection(client, collection, dim=8, vector_name="code") + + info = client.get_collection(collection) + cfg = info.config.params.vectors + + # Dense + lex must be present + assert "code" in cfg + assert ing.LEX_VECTOR_NAME in cfg + + # When REFRAG_MODE is on, mini vector should be present too + mini_name = os.environ.get("MINI_VECTOR_NAME", getattr(ing, "MINI_VECTOR_NAME", "mini")) + assert mini_name in cfg + + # Memory id should still exist after restore, but code points are not restored + ids = _get_point_ids(client, collection) + assert "1" in ids + assert "2" not in ids + + +def test_memory_restore_strict_mode_raises_on_failure(qdrant_container, monkeypatch): + """STRICT_MEMORY_RESTORE=1 should turn restore failures into hard errors. + + We let the real backup script run against Qdrant, but we force the restore + subprocess to fail and assert that ensure_collection raises. 
+ """ + os.environ["QDRANT_URL"] = qdrant_container + collection = f"test-mem-strict-{uuid.uuid4().hex[:8]}" + + client = _create_collection_with_memory(qdrant_container, collection, dim=8) + + os.environ["REFRAG_MODE"] = "1" + os.environ["STRICT_MEMORY_RESTORE"] = "1" + + # Patch subprocess.run to: + # - allow the real memory_backup.py to run + # - force memory_restore.py to fail with non-zero exit + orig_run = subprocess.run + + def fake_run(args, **kwargs): # type: ignore[override] + cmd_str = " ".join(map(str, args)) + if "memory_backup.py" in cmd_str: + return orig_run(args, **kwargs) + if "memory_restore.py" in cmd_str: + return SimpleNamespace(returncode=1, stdout="", stderr="simulated restore failure") + return orig_run(args, **kwargs) + + monkeypatch.setattr(subprocess, "run", fake_run) + + with pytest.raises(RuntimeError): + ing.ensure_collection(client, collection, dim=8, vector_name="code") + + +def test_memory_backup_failure_tolerant_mode_still_recreates_collection(qdrant_container, monkeypatch): + """If backup fails but STRICT_MEMORY_RESTORE is not set, ensure_collection + should still recreate the collection with the correct vectors, even though + memories may be dropped. + + This makes the behavior explicit: backup failure is best-effort by default. 
+ """ + os.environ["QDRANT_URL"] = qdrant_container + collection = f"test-mem-backup-fail-{uuid.uuid4().hex[:8]}" + + client = _create_collection_with_memory(qdrant_container, collection, dim=8) + + os.environ["REFRAG_MODE"] = "1" + os.environ.pop("STRICT_MEMORY_RESTORE", None) + + # Patch subprocess.run so memory_backup.py fails, but everything else runs normally + orig_run = subprocess.run + + def fake_run(args, **kwargs): # type: ignore[override] + cmd_str = " ".join(map(str, args)) + if "memory_backup.py" in cmd_str: + return SimpleNamespace(returncode=1, stdout="", stderr="simulated backup failure") + return orig_run(args, **kwargs) + + monkeypatch.setattr(subprocess, "run", fake_run) + + # Should not raise even though backup fails + ing.ensure_collection(client, collection, dim=8, vector_name="code") + + info = client.get_collection(collection) + cfg = info.config.params.vectors + + # Collection should still have the expected vectors (including mini) + assert "code" in cfg + assert ing.LEX_VECTOR_NAME in cfg + mini_name = os.environ.get("MINI_VECTOR_NAME", getattr(ing, "MINI_VECTOR_NAME", "mini")) + assert mini_name in cfg + + # Because backup failed and no restore occurred, the original memory is gone + ids = _get_point_ids(client, collection) + assert "1" not in ids + + +def test_memory_backup_and_restore_scripts_roundtrip(qdrant_container, tmp_path): + """Directly exercise memory_backup.export_memories and + memory_restore.restore_memories without going through ensure_collection. + + This confirms that the backup file contains the expected memory and that + restore_memories can recreate it in a fresh collection. 
+ """ + os.environ["QDRANT_URL"] = qdrant_container + collection = f"test-mem-scripts-{uuid.uuid4().hex[:8]}" + + client = _create_collection_with_memory(qdrant_container, collection, dim=8) + + # Backup memories from the collection + backup_file = tmp_path / "memories_backup.json" + result = mem_backup.export_memories( + collection_name=collection, + output_file=str(backup_file), + client=client, + include_vectors=True, + batch_size=100, + ) + + assert result["success"] is True + assert result["memory_count"] == 1 + assert backup_file.exists() + + # Drop the original collection entirely + client.delete_collection(collection) + + # Restore into a fresh collection; let restore_memories create it + restore_result = mem_restore.restore_memories( + backup_file=str(backup_file), + collection_name=collection, + client=client, + embedding_model_name=None, + vector_name="code", + batch_size=50, + skip_existing=True, + skip_collection_creation=False, + ) + + assert restore_result["success"] is True + + # After restore, there should be exactly one memory point (id 1) and no code point (id 2) + ids = _get_point_ids(client, collection) + assert "1" in ids + assert "2" not in ids From 434e7da8b75a7e6a30e217350e2cd78a9b34dcef Mon Sep 17 00:00:00 2001 From: Reese Date: Thu, 27 Nov 2025 12:19:07 +0000 Subject: [PATCH 08/25] vscode-ext: auto-detect CTX workspace and wire Prompt+ - Bump extension version - When contextEngineUploader.targetPath is unset, derive the CTX workspace from the VS Code folder: - If the workspace folder has .context-engine, ctx_config.json, .codebase/state.json, or .git, treat it as the root. - Otherwise, scan one level of child directories; if exactly one child matches those markers, use it. If zero or many, fall back to the workspace folder. - Keep explicit contextEngineUploader.targetPath as the single source of truth when configured. 
- When running Prompt+ (ctx.py), set CTX_WORKSPACE_DIR using the same target-path/auto-detection logic so ctx.py reads .env/ctx_config.json from the same CTX workspace as the uploader/indexer. - No server-side behavior changes; only VS Code extension workspace detection and Prompt+ wiring are updated. --- .../context-engine-uploader/extension.js | 85 ++++++++++++++++++- .../context-engine-uploader/package.json | 2 +- 2 files changed, 82 insertions(+), 5 deletions(-) diff --git a/vscode-extension/context-engine-uploader/extension.js b/vscode-extension/context-engine-uploader/extension.js index 372f1359..07a7977c 100644 --- a/vscode-extension/context-engine-uploader/extension.js +++ b/vscode-extension/context-engine-uploader/extension.js @@ -285,8 +285,9 @@ function getTargetPath(config) { updateStatusBarTooltip(); return undefined; } - updateStatusBarTooltip(folderPath); - return folderPath; + const autoTarget = detectDefaultTargetPath(folderPath); + updateStatusBarTooltip(autoTarget); + return autoTarget; } function saveTargetPath(config, targetPath) { const hasWorkspace = vscode.workspace.workspaceFolders && vscode.workspace.workspaceFolders.length; @@ -302,6 +303,66 @@ function getWorkspaceFolderPath() { } return folders[0].uri.fsPath; } +function looksLikeRepoRoot(dirPath) { + try { + const contextEngineDir = path.join(dirPath, '.context-engine'); + const ctxConfigPath = path.join(dirPath, 'ctx_config.json'); + const codebaseStatePath = path.join(dirPath, '.codebase', 'state.json'); + const gitDir = path.join(dirPath, '.git'); + if (fs.existsSync(contextEngineDir) || fs.existsSync(ctxConfigPath) || fs.existsSync(codebaseStatePath) || fs.existsSync(gitDir)) { + return true; + } + } catch (error) { + log(`Repo root detection failed for ${dirPath}: ${error instanceof Error ? 
error.message : String(error)}`); + } + return false; +} +function detectDefaultTargetPath(workspaceFolderPath) { + try { + const resolved = path.resolve(workspaceFolderPath); + if (!fs.existsSync(resolved)) { + return workspaceFolderPath; + } + if (looksLikeRepoRoot(resolved)) { + return resolved; + } + let entries; + try { + entries = fs.readdirSync(resolved); + } catch (error) { + log(`Auto targetPath discovery failed to read workspace folder: ${error instanceof Error ? error.message : String(error)}`); + return resolved; + } + const candidates = []; + for (const name of entries) { + const fullPath = path.join(resolved, name); + let stats; + try { + stats = fs.statSync(fullPath); + } catch (_) { + continue; + } + if (!stats.isDirectory()) { + continue; + } + if (looksLikeRepoRoot(fullPath)) { + candidates.push(path.resolve(fullPath)); + } + } + if (candidates.length === 1) { + const detected = candidates[0]; + log(`Target path auto-detected as ${detected} (under workspace folder).`); + return detected; + } + if (candidates.length > 1) { + log('Auto targetPath discovery found multiple candidate repos under workspace; using workspace folder instead.'); + } + return resolved; + } catch (error) { + log(`Auto targetPath discovery failed: ${error instanceof Error ? 
error.message : String(error)}`); + return workspaceFolderPath; + } +} function ensureTargetPathConfigured() { const config = vscode.workspace.getConfiguration('contextEngineUploader'); const current = (config.get('targetPath') || '').trim(); @@ -314,9 +375,10 @@ function ensureTargetPathConfigured() { updateStatusBarTooltip(); return; } - updateStatusBarTooltip(folderPath); + const autoTarget = detectDefaultTargetPath(folderPath); + updateStatusBarTooltip(autoTarget); } -function updateStatusBarTooltip(targetPath) { + function updateStatusBarTooltip(targetPath) { if (!statusBarItem) { return; } @@ -699,6 +761,21 @@ async function enhanceSelectionWithUnicorn() { if (useGpuDecoder) { env.USE_GPU_DECODER = '1'; } + let ctxWorkspaceDir; + try { + ctxWorkspaceDir = getTargetPath(cfg); + } catch (error) { + ctxWorkspaceDir = undefined; + } + if (!ctxWorkspaceDir) { + const wsFolder = getWorkspaceFolderPath(); + if (wsFolder) { + ctxWorkspaceDir = detectDefaultTargetPath(wsFolder); + } + } + if (ctxWorkspaceDir && typeof ctxWorkspaceDir === 'string' && fs.existsSync(ctxWorkspaceDir)) { + env.CTX_WORKSPACE_DIR = ctxWorkspaceDir; + } } catch (_) { // ignore config read failures; fall back to defaults } diff --git a/vscode-extension/context-engine-uploader/package.json b/vscode-extension/context-engine-uploader/package.json index 6133c63d..749f99eb 100644 --- a/vscode-extension/context-engine-uploader/package.json +++ b/vscode-extension/context-engine-uploader/package.json @@ -2,7 +2,7 @@ "name": "context-engine-uploader", "displayName": "Context Engine Uploader", "description": "Runs the Context-Engine remote upload client with a force sync on startup followed by watch mode. 
Requires Python with pip install requests urllib3 charset_normalizer.", - "version": "0.1.26", + "version": "0.1.27", "publisher": "context-engine", "engines": { "vscode": "^1.85.0" From be890f03dbd272dce7c445aa80c915a18e5aea24 Mon Sep 17 00:00:00 2001 From: Reese Date: Fri, 28 Nov 2025 13:26:56 +0000 Subject: [PATCH 09/25] vscode-ext: Update MCP docs and VS Code uploader for HTTP transport - docs: document SSE vs HTTP MCP transports and recommend HTTP /mcp endpoints for IDEs (Claude, Windsurf, etc.), including notes on the FastMCP SSE init race - vscode-extension: add mcpTransportMode setting to choose between mcp-remote SSE and direct HTTP MCP for Claude/Windsurf configs - vscode-extension: add autoWriteMcpConfigOnStartup to refresh .mcp.json, Windsurf mcp_config.json, and the Claude hook on extension activation - vscode-extension: update extension README to describe MCP transport modes and startup MCP config behavior - vscode-extension: improve targetPath auto-detection to prefer a single git/ .codebase child repo under the workspace root --- docs/DEVELOPMENT.md | 24 +++ docs/IDE_CLIENTS.md | 36 ++++- docs/MCP_API.md | 40 +++-- .../context-engine-uploader/README.md | 8 +- .../context-engine-uploader/extension.js | 146 +++++++++++++----- .../context-engine-uploader/package.json | 11 ++ 6 files changed, 212 insertions(+), 53 deletions(-) diff --git a/docs/DEVELOPMENT.md b/docs/DEVELOPMENT.md index 51e8041a..1bf6d3bf 100644 --- a/docs/DEVELOPMENT.md +++ b/docs/DEVELOPMENT.md @@ -66,6 +66,30 @@ curl http://localhost:8000/sse # Memory server SSE curl http://localhost:8001/sse # Indexer server SSE ``` +### 4. 
IDE MCP Configuration + +For MCP-aware IDEs (Claude Desktop, Windsurf, etc.), prefer the HTTP MCP endpoints: + +```bash +# Memory MCP (HTTP) +http://localhost:8002/mcp + +# Indexer MCP (HTTP) +http://localhost:8003/mcp + +# Health checks +curl http://localhost:18002/readyz # Memory health +curl http://localhost:18003/readyz # Indexer health +``` + +SSE endpoints (`/sse`) remain available and are typically used via `mcp-remote`, but some clients that send MCP messages in parallel on a fresh session can hit a FastMCP initialization guard and intermittently log: + +```text +Failed to validate request: Received request before initialization was complete +``` + +If you see tools/resources only appearing after a second reconnect, switch your IDE configuration to use the HTTP `/mcp` endpoints instead of SSE. + ## Project Structure ``` diff --git a/docs/IDE_CLIENTS.md b/docs/IDE_CLIENTS.md index 97f14c56..fe196c9e 100644 --- a/docs/IDE_CLIENTS.md +++ b/docs/IDE_CLIENTS.md @@ -21,7 +21,7 @@ Connect your IDE to a running Context-Engine stack. No need to clone this repo i **Prerequisites:** Context-Engine running somewhere (localhost, remote server, or Kubernetes). -**Minimal config** — add to your IDE's MCP settings: +**Minimal config (SSE)** — for clients that only understand SSE or use `mcp-remote`: ```json { "mcpServers": { @@ -30,6 +30,25 @@ Connect your IDE to a running Context-Engine stack. 
No need to clone this repo i } ``` +**HTTP (recommended for RMCP-capable IDEs)** — prefer this when your IDE supports HTTP MCP / RMCP (Claude Desktop, Windsurf, Qodo, etc.): + +```json +{ + "mcpServers": { + "memory": { "url": "http://localhost:8002/mcp" }, + "qdrant-indexer": { "url": "http://localhost:8003/mcp" } + } +} +``` + +Using HTTP `/mcp` avoids a FastMCP initialization race that some SSE clients hit when they send `listTools` in parallel with `initialize`, which can log: + +```text +Failed to validate request: Received request before initialization was complete +``` + +If you see tools/resources only appearing after a second reconnect when using SSE, switch your IDE configuration to these HTTP endpoints instead. + Replace `localhost` with your server IP/hostname for remote setups. --- @@ -120,7 +139,7 @@ Add to your Zed `settings.json` (Command Palette → "Settings: Open Settings (J { "qdrant-indexer": { "type": "http", - "url": "http://localhost:8001/sse" + "url": "http://localhost:8003/mcp" } } ``` @@ -200,6 +219,19 @@ When Context-Engine runs on a remote server (e.g., `context.yourcompany.com`): } ``` +If your IDE supports HTTP MCP / RMCP, prefer the HTTP endpoints instead: + +```json +{ + "mcpServers": { + "memory": { "url": "http://context.yourcompany.com:8002/mcp" }, + "qdrant-indexer": { "url": "http://context.yourcompany.com:8003/mcp" } + } +} +``` + +This uses the HTTP `/mcp` transport and avoids the initialization race described above. + **Indexing your local project to the remote server:** ```bash # Using VS Code extension (recommended) diff --git a/docs/MCP_API.md b/docs/MCP_API.md index 73b1b8ef..38ca1145 100644 --- a/docs/MCP_API.md +++ b/docs/MCP_API.md @@ -24,6 +24,33 @@ Context Engine exposes two MCP servers: Both servers support SSE and HTTP RMCP transports simultaneously. 
+### Transports & IDE Integration + +For each server, two transports are available: + +- **SSE (Server-Sent Events)** + - Memory: `http://localhost:8000/sse` + - Indexer: `http://localhost:8001/sse` + - Typically used via `mcp-remote` or legacy MCP clients. + +- **HTTP (streamable MCP over HTTP)** + - Memory: `http://localhost:8002/mcp` + - Indexer: `http://localhost:8003/mcp` + - Health: + - Memory: `http://localhost:18002/readyz` + - Indexer: `http://localhost:18003/readyz` + - Tools (for debugging): `GET /tools` on the health ports. + +**Recommendation for IDEs:** Prefer the HTTP `/mcp` endpoints when integrating with IDE clients (Claude Desktop, Windsurf, etc.). HTTP uses a simple request/response pattern where `initialize` completes before `listTools` and other calls, avoiding initialization races. + +When using SSE via `mcp-remote`, some clients may send MCP messages (for example `listTools`) in parallel on a fresh session before `initialize` has fully completed. FastMCP enforces that only `initialize` may be processed during initialization; if a non-initialize request arrives too early, the server can log: + +```text +Failed to validate request: Received request before initialization was complete +``` + +This manifests as tools/resources only appearing after a second reconnect. Switching the IDE to talk directly to the HTTP `/mcp` endpoints avoids this class of issue. + ## Memory Server API ### store() @@ -657,16 +684,11 @@ All API methods follow consistent error handling patterns: ## Transport-Specific Behavior -### SSE (Server-Sent Events) -- Real-time bidirectional communication -- Automatic reconnection on disconnect -- Streaming responses for long operations +Both SSE and HTTP RMCP transports expose the **same tools, arguments, and response shapes**. The choice of transport affects only how MCP messages are carried, not what the tools do. 
-### HTTP RMCP -- JSON-RPC over HTTP -- Request/response pattern -- Better for batch operations and integrations +- **SSE (`/sse`)** is primarily intended for use behind `mcp-remote` or legacy clients. +- **HTTP (`/mcp`)** is recommended for IDE integrations and direct tooling because it uses a simple request/response pattern where `initialize` completes before `listTools` and other calls, avoiding known initialization races in some SSE clients. -Both transports provide identical API semantics and response formats. +When in doubt, prefer the HTTP `/mcp` endpoints described in the Overview. This API reference should enable developers to effectively integrate Context Engine's MCP tools into their applications and workflows. \ No newline at end of file diff --git a/vscode-extension/context-engine-uploader/README.md b/vscode-extension/context-engine-uploader/README.md index 159672e8..50df124a 100644 --- a/vscode-extension/context-engine-uploader/README.md +++ b/vscode-extension/context-engine-uploader/README.md @@ -19,7 +19,13 @@ Configuration - **Python dependencies:** the extension runs the standalone upload client via your configured `pythonPath`. Ensure the interpreter has `requests`, `urllib3`, and `charset_normalizer` installed. Run `python3 -m pip install requests urllib3 charset_normalizer` (or replace `python3` with your configured path) before starting the uploader. - **Path mapping:** `Host Root` + `Container Root` control how local paths are rewritten before reaching the remote service. By default the host root mirrors your `Target Path` and the container root is `/work`, which keeps Windows paths working without extra config. - **Prompt+ decoder:** set `Context Engine Uploader: Decoder Url` (default `http://localhost:8081`, auto-appends `/completion`) to point at your local llama.cpp decoder. For Ollama, set it to `http://localhost:11434/api/chat`. Turn on `Use Gpu Decoder` to set `USE_GPU_DECODER=1` so ctx.py prefers the GPU llama.cpp sidecar. 
Prompt+ automatically runs the bundled `scripts/ctx.py` when an embedded copy is available, falling back to the workspace version if not. -- **Claude Code MCP config:** `MCP Indexer Url` and `MCP Memory Url` control the URLs written into the project-local `.mcp.json` when you run the `Write MCP Config` command. This is only for configuring Claude Code MCP clients; other MCP integrations can be added separately later. +- **Claude/Windsurf MCP config:** + - `MCP Indexer Url` and `MCP Memory Url` control the URLs written into the project-local `.mcp.json` (Claude) and Windsurf `mcp_config.json` when you run the `Write MCP Config` command. These URLs are used **literally** (e.g. `http://localhost:8001/sse` or `http://localhost:8003/mcp`). + - `MCP Transport Mode` (`contextEngineUploader.mcpTransportMode`) chooses how those URLs are wrapped: + - `sse-remote` (default): emit stdio configs that call `npx mcp-remote --transport sse-only`. + - `http`: emit direct HTTP MCP entries of the form `{ "type": "http", "url": "" }` for Claude/Windsurf. Use this when pointing at HTTP `/mcp` endpoints exposed by the Context-Engine MCP services. +- **MCP config on startup:** + - `contextEngineUploader.autoWriteMcpConfigOnStartup` (default `false`) controls whether the extension automatically runs the same logic as `Write MCP Config` on activation. When enabled, it refreshes `.mcp.json`, Windsurf `mcp_config.json`, and the Claude hook (`.claude/settings.local.json`) to match your current settings and the installed extension version. If `scaffoldCtxConfig` is also `true`, this startup path will additionally scaffold/update `ctx_config.json` and `.env` as described below. - **CTX + GLM settings:** - `contextEngineUploader.ctxIndexerUrl` is copied into `.env` (as `MCP_INDEXER_URL`) so the embedded `ctx.py` knows which MCP indexer to call when enhancing prompts. 
- `contextEngineUploader.glmApiKey`, `glmApiBase`, and `glmModel` are used when scaffolding `ctx_config.json`/`.env` to pre-fill GLM decoder options. Existing non-placeholder values are preserved, so you can override them in the files at any time. diff --git a/vscode-extension/context-engine-uploader/extension.js b/vscode-extension/context-engine-uploader/extension.js index 07a7977c..fedc00c7 100644 --- a/vscode-extension/context-engine-uploader/extension.js +++ b/vscode-extension/context-engine-uploader/extension.js @@ -101,6 +101,7 @@ function activate(context) { event.affectsConfiguration('contextEngineUploader.mcpMemoryUrl') || event.affectsConfiguration('contextEngineUploader.mcpClaudeEnabled') || event.affectsConfiguration('contextEngineUploader.mcpWindsurfEnabled') || + event.affectsConfiguration('contextEngineUploader.mcpTransportMode') || event.affectsConfiguration('contextEngineUploader.windsurfMcpPath') || event.affectsConfiguration('contextEngineUploader.claudeHookEnabled') || event.affectsConfiguration('contextEngineUploader.surfaceQdrantCollectionHint') @@ -138,8 +139,11 @@ function activate(context) { runSequence('auto').catch(error => log(`Startup run failed: ${error instanceof Error ? error.message : String(error)}`)); } - // When enabled, best-effort auto-scaffold ctx_config.json/.env for the current targetPath on activation - if (config.get('scaffoldCtxConfig', true)) { + // Optionally keep MCP + hook + ctx config in sync on activation + if (config.get('autoWriteMcpConfigOnStartup')) { + writeMcpConfig().catch(error => log(`MCP config auto-write on activation failed: ${error instanceof Error ? error.message : String(error)}`)); + } else if (config.get('scaffoldCtxConfig', true)) { + // Legacy behavior: scaffold ctx_config.json/.env directly when MCP auto-write is disabled writeCtxConfig().catch(error => log(`CTX config auto-scaffold on activation failed: ${error instanceof Error ? 
error.message : String(error)}`)); } } @@ -305,11 +309,9 @@ function getWorkspaceFolderPath() { } function looksLikeRepoRoot(dirPath) { try { - const contextEngineDir = path.join(dirPath, '.context-engine'); - const ctxConfigPath = path.join(dirPath, 'ctx_config.json'); const codebaseStatePath = path.join(dirPath, '.codebase', 'state.json'); const gitDir = path.join(dirPath, '.git'); - if (fs.existsSync(contextEngineDir) || fs.existsSync(ctxConfigPath) || fs.existsSync(codebaseStatePath) || fs.existsSync(gitDir)) { + if (fs.existsSync(codebaseStatePath) || fs.existsSync(gitDir)) { return true; } } catch (error) { @@ -323,9 +325,7 @@ function detectDefaultTargetPath(workspaceFolderPath) { if (!fs.existsSync(resolved)) { return workspaceFolderPath; } - if (looksLikeRepoRoot(resolved)) { - return resolved; - } + const rootLooksLikeRepo = looksLikeRepoRoot(resolved); let entries; try { entries = fs.readdirSync(resolved); @@ -354,6 +354,12 @@ function detectDefaultTargetPath(workspaceFolderPath) { log(`Target path auto-detected as ${detected} (under workspace folder).`); return detected; } + if (rootLooksLikeRepo) { + if (candidates.length > 1) { + log('Auto targetPath discovery found multiple candidate repos under workspace; using workspace folder instead.'); + } + return resolved; + } if (candidates.length > 1) { log('Auto targetPath discovery found multiple candidate repos under workspace; using workspace folder instead.'); } @@ -1011,8 +1017,11 @@ async function writeMcpConfig() { vscode.window.showInformationMessage('Context Engine Uploader: MCP config writing is disabled in settings.'); return; } - const indexerUrl = (settings.get('mcpIndexerUrl') || 'http://localhost:8001/sse').trim(); - const memoryUrl = (settings.get('mcpMemoryUrl') || 'http://localhost:8000/sse').trim(); + const transportModeRaw = (settings.get('mcpTransportMode') || 'sse-remote'); + const transportMode = (typeof transportModeRaw === 'string' ? 
transportModeRaw.trim() : 'sse-remote') || 'sse-remote'; + + let indexerUrl = (settings.get('mcpIndexerUrl') || 'http://localhost:8001/sse').trim(); + let memoryUrl = (settings.get('mcpMemoryUrl') || 'http://localhost:8000/sse').trim(); let wroteAny = false; let hookWrote = false; if (claudeEnabled) { @@ -1020,14 +1029,14 @@ async function writeMcpConfig() { if (!root) { vscode.window.showErrorMessage('Context Engine Uploader: open a folder before writing .mcp.json.'); } else { - const result = await writeClaudeMcpServers(root, indexerUrl, memoryUrl); + const result = await writeClaudeMcpServers(root, indexerUrl, memoryUrl, transportMode); wroteAny = wroteAny || result; } } if (windsurfEnabled) { const customPath = (settings.get('windsurfMcpPath') || '').trim(); const windsPath = customPath || getDefaultWindsurfMcpPath(); - const result = await writeWindsurfMcpServers(windsPath, indexerUrl, memoryUrl); + const result = await writeWindsurfMcpServers(windsPath, indexerUrl, memoryUrl, transportMode); wroteAny = wroteAny || result; } if (claudeHookEnabled) { @@ -1467,7 +1476,7 @@ function getDefaultWindsurfMcpPath() { return path.join(os.homedir(), '.codeium', 'windsurf', 'mcp_config.json'); } -async function writeClaudeMcpServers(root, indexerUrl, memoryUrl) { +async function writeClaudeMcpServers(root, indexerUrl, memoryUrl, transportMode) { const configPath = path.join(root, '.mcp.json'); let config = { mcpServers: {} }; if (fs.existsSync(configPath)) { @@ -1487,27 +1496,46 @@ async function writeClaudeMcpServers(root, indexerUrl, memoryUrl) { config.mcpServers = {}; } log(`Preparing to write .mcp.json at ${configPath} with indexerUrl=${indexerUrl || '""'} memoryUrl=${memoryUrl || '""'}`); - const isWindows = process.platform === 'win32'; - const makeServer = url => { - if (isWindows) { + const servers = config.mcpServers; + const mode = (typeof transportMode === 'string' ? 
transportMode.trim() : 'sse-remote') || 'sse-remote'; + + if (mode === 'http') { + // Direct HTTP MCP endpoints for Claude (.mcp.json) + if (indexerUrl) { + servers['qdrant-indexer'] = { + type: 'http', + url: indexerUrl + }; + } + if (memoryUrl) { + servers.memory = { + type: 'http', + url: memoryUrl + }; + } + } else { + // Legacy/default: stdio via mcp-remote SSE bridge + const isWindows = process.platform === 'win32'; + const makeServer = url => { + if (isWindows) { + return { + command: 'cmd', + args: ['/c', 'npx', 'mcp-remote', url, '--transport', 'sse-only'], + env: {} + }; + } return { - command: 'cmd', - args: ['/c', 'npx', 'mcp-remote', url, '--transport', 'sse-only'], + command: 'npx', + args: ['mcp-remote', url, '--transport', 'sse-only'], env: {} }; - } - return { - command: 'npx', - args: ['mcp-remote', url, '--transport', 'sse-only'], - env: {} }; - }; - const servers = config.mcpServers; - if (indexerUrl) { - servers['qdrant-indexer'] = makeServer(indexerUrl); - } - if (memoryUrl) { - servers.memory = makeServer(memoryUrl); + if (indexerUrl) { + servers['qdrant-indexer'] = makeServer(indexerUrl); + } + if (memoryUrl) { + servers.memory = makeServer(memoryUrl); + } } try { const json = JSON.stringify(config, null, 2) + '\n'; @@ -1522,7 +1550,7 @@ async function writeClaudeMcpServers(root, indexerUrl, memoryUrl) { } } -async function writeWindsurfMcpServers(configPath, indexerUrl, memoryUrl) { +async function writeWindsurfMcpServers(configPath, indexerUrl, memoryUrl, transportMode) { try { fs.mkdirSync(path.dirname(configPath), { recursive: true }); } catch (error) { @@ -1548,17 +1576,53 @@ async function writeWindsurfMcpServers(configPath, indexerUrl, memoryUrl) { config.mcpServers = {}; } log(`Preparing to write Windsurf mcp_config.json at ${configPath} with indexerUrl=${indexerUrl || '""'} memoryUrl=${memoryUrl || '""'}`); - const makeServer = url => ({ - command: 'npx', - args: ['mcp-remote', url, '--transport', 'sse-only'], - env: {} - }); const 
servers = config.mcpServers; - if (indexerUrl) { - servers['qdrant-indexer'] = makeServer(indexerUrl); - } - if (memoryUrl) { - servers.memory = makeServer(memoryUrl); + const mode = (typeof transportMode === 'string' ? transportMode.trim() : 'sse-remote') || 'sse-remote'; + + if (mode === 'http') { + // Direct HTTP MCP endpoints for Windsurf mcp_config.json + if (indexerUrl) { + servers['qdrant-indexer'] = { + type: 'http', + url: indexerUrl + }; + } + if (memoryUrl) { + servers.memory = { + type: 'http', + url: memoryUrl + }; + } + } else { + // Legacy/default: use mcp-remote SSE bridge + const makeServer = url => { + // Default args for local/HTTPS endpoints + const args = ['mcp-remote', url, '--transport', 'sse-only']; + try { + const u = new URL(url); + const isLocalHost = + u.hostname === 'localhost' || + u.hostname === '127.0.0.1' || + u.hostname === '::1'; + // For non-local HTTP URLs, mcp-remote requires --allow-http + if (u.protocol === 'http:' && !isLocalHost) { + args.push('--allow-http'); + } + } catch (e) { + // If URL parsing fails, fall back to default args without additional flags + } + return { + command: 'npx', + args, + env: {} + }; + }; + if (indexerUrl) { + servers['qdrant-indexer'] = makeServer(indexerUrl); + } + if (memoryUrl) { + servers.memory = makeServer(memoryUrl); + } } try { const json = JSON.stringify(config, null, 2) + '\n'; diff --git a/vscode-extension/context-engine-uploader/package.json b/vscode-extension/context-engine-uploader/package.json index 749f99eb..07617c87 100644 --- a/vscode-extension/context-engine-uploader/package.json +++ b/vscode-extension/context-engine-uploader/package.json @@ -155,6 +155,17 @@ "default": false, "description": "Enable writing Windsurf's global MCP config (requires Windsurf or compatible clients)." 
}, + "contextEngineUploader.autoWriteMcpConfigOnStartup": { + "type": "boolean", + "default": false, + "description": "When enabled, automatically run 'Write MCP Config' on extension activation to keep .mcp.json, Windsurf mcp_config.json, and .claude/settings.local.json in sync with settings and the current extension version." + }, + "contextEngineUploader.mcpTransportMode": { + "type": "string", + "enum": ["sse-remote", "http"], + "default": "http", + "description": "Transport mode for Claude/Windsurf MCP configs: SSE via mcp-remote (sse-remote) or direct HTTP /mcp endpoints (http)." + }, "contextEngineUploader.mcpIndexerUrl": { "type": "string", "default": "http://localhost:8001/sse", From cdf83d85a5efc33443acceb01ff93c181e63bd69 Mon Sep 17 00:00:00 2001 From: Reese Date: Fri, 28 Nov 2025 13:55:51 +0000 Subject: [PATCH 10/25] vscode-ext: Improve Python interpreter detection and venv fallback - Always probe the configured pythonPath with bundled python_libs before doing anything else - If that fails, auto-detect a working system Python (python3/python/py/Homebrew) via detectSystemPython and reuse the bundled libs - Only as a last resort, prompt to create a private venv and pip-install deps, then switch to that interpreter - Reduces spurious venv prompts when switching between systems where only `python` or `python3` is available --- .../context-engine-uploader/extension.js | 23 +++++++++++++++---- 1 file changed, 19 insertions(+), 4 deletions(-) diff --git a/vscode-extension/context-engine-uploader/extension.js b/vscode-extension/context-engine-uploader/extension.js index fedc00c7..608ceea9 100644 --- a/vscode-extension/context-engine-uploader/extension.js +++ b/vscode-extension/context-engine-uploader/extension.js @@ -408,9 +408,24 @@ function needsForceSync(targetPath) { } } async function ensurePythonDependencies(pythonPath) { - // Probe current interpreter; if modules missing, offer to create a private venv and install deps - const ok = await 
checkPythonDeps(pythonPath); - if (ok) return true; + // Probe current interpreter with bundled python_libs first + let ok = await checkPythonDeps(pythonPath); + if (ok) { + return true; + } + + // If that fails, try to auto-detect a better system Python before falling back to a venv + const autoPython = await detectSystemPython(); + if (autoPython && autoPython !== pythonPath) { + log(`Falling back to auto-detected Python interpreter: ${autoPython}`); + ok = await checkPythonDeps(autoPython); + if (ok) { + pythonOverridePath = autoPython; + return true; + } + } + + // As a last resort, offer to create a private venv and install deps via pip const choice = await vscode.window.showErrorMessage( 'Context Engine Uploader: missing Python modules. Create isolated environment and auto-install?', 'Auto-install to private venv', @@ -430,7 +445,7 @@ async function ensurePythonDependencies(pythonPath) { if (!installed) return false; pythonOverridePath = venvPython; log(`Using private venv interpreter: ${pythonOverridePath}`); - return await checkPythonDeps(pythonOverridePath); + return await checkPythonDeps(venvPython); } async function checkPythonDeps(pythonPath) { From 3429eab06a4b2d3f4803f55066595eab83ed5bb1 Mon Sep 17 00:00:00 2001 From: Reese Date: Fri, 28 Nov 2025 13:58:13 +0000 Subject: [PATCH 11/25] Upload client: Fix Windows UnicodeDecodeError in git history collection - remote_upload_client/standalone_upload_client: decode git subprocess output as UTF-8 with errors='replace' instead of relying on Windows cp1252 locale - Prevents noisy UnicodeDecodeError stack traces during uploads on Windows while keeping git_history manifests usable --- scripts/remote_upload_client.py | 10 ++++++++++ scripts/standalone_upload_client.py | 10 ++++++++++ 2 files changed, 20 insertions(+) diff --git a/scripts/remote_upload_client.py b/scripts/remote_upload_client.py index 1166c666..f98cb136 100644 --- a/scripts/remote_upload_client.py +++ b/scripts/remote_upload_client.py @@ -119,6 
+119,8 @@ def _collect_git_history_for_workspace(workspace_path: str) -> Optional[Dict[str stdout=subprocess.PIPE, stderr=subprocess.PIPE, text=True, + encoding="utf-8", + errors="replace", ) if head_proc.returncode == 0 and head_proc.stdout.strip(): current_head = head_proc.stdout.strip() @@ -164,6 +166,8 @@ def _collect_git_history_for_workspace(workspace_path: str) -> Optional[Dict[str stdout=subprocess.PIPE, stderr=subprocess.PIPE, text=True, + encoding="utf-8", + errors="replace", ) if proc.returncode != 0 or not proc.stdout.strip(): return None @@ -186,6 +190,8 @@ def _collect_git_history_for_workspace(workspace_path: str) -> Optional[Dict[str stdout=subprocess.PIPE, stderr=subprocess.PIPE, text=True, + encoding="utf-8", + errors="replace", ) if show_proc.returncode != 0 or not show_proc.stdout.strip(): continue @@ -198,6 +204,8 @@ def _collect_git_history_for_workspace(workspace_path: str) -> Optional[Dict[str stdout=subprocess.PIPE, stderr=subprocess.PIPE, text=True, + encoding="utf-8", + errors="replace", ) files: List[str] = [] if files_proc.returncode == 0 and files_proc.stdout: @@ -211,6 +219,8 @@ def _collect_git_history_for_workspace(workspace_path: str) -> Optional[Dict[str stdout=subprocess.PIPE, stderr=subprocess.PIPE, text=True, + encoding="utf-8", + errors="replace", ) if diff_proc.returncode == 0 and diff_proc.stdout: try: diff --git a/scripts/standalone_upload_client.py b/scripts/standalone_upload_client.py index 7707e065..564e78d2 100644 --- a/scripts/standalone_upload_client.py +++ b/scripts/standalone_upload_client.py @@ -305,6 +305,8 @@ def _collect_git_history_for_workspace(workspace_path: str) -> Optional[Dict[str stdout=subprocess.PIPE, stderr=subprocess.PIPE, text=True, + encoding="utf-8", + errors="replace", ) if head_proc.returncode == 0 and head_proc.stdout.strip(): current_head = head_proc.stdout.strip() @@ -350,6 +352,8 @@ def _collect_git_history_for_workspace(workspace_path: str) -> Optional[Dict[str stdout=subprocess.PIPE, 
stderr=subprocess.PIPE, text=True, + encoding="utf-8", + errors="replace", ) if proc.returncode != 0 or not proc.stdout.strip(): return None @@ -372,6 +376,8 @@ def _collect_git_history_for_workspace(workspace_path: str) -> Optional[Dict[str stdout=subprocess.PIPE, stderr=subprocess.PIPE, text=True, + encoding="utf-8", + errors="replace", ) if show_proc.returncode != 0 or not show_proc.stdout.strip(): continue @@ -384,6 +390,8 @@ def _collect_git_history_for_workspace(workspace_path: str) -> Optional[Dict[str stdout=subprocess.PIPE, stderr=subprocess.PIPE, text=True, + encoding="utf-8", + errors="replace", ) files: List[str] = [] if files_proc.returncode == 0 and files_proc.stdout: @@ -397,6 +405,8 @@ def _collect_git_history_for_workspace(workspace_path: str) -> Optional[Dict[str stdout=subprocess.PIPE, stderr=subprocess.PIPE, text=True, + encoding="utf-8", + errors="replace", ) if diff_proc.returncode == 0 and diff_proc.stdout: try: From e9d917f29225585f77ca75f7ba5f33c46ea88d5f Mon Sep 17 00:00:00 2001 From: Reese Date: Fri, 28 Nov 2025 15:20:15 +0000 Subject: [PATCH 12/25] k8: Adds volume and env vars for model caching Configures shared volume and environment variables to enable model caching for context-engine components. This reduces redundant downloads and speeds up processing. 
--- deploy/kubernetes/indexer-services.yaml | 22 ++++++++++++++++++++++ deploy/kubernetes/mcp-http.yaml | 9 +++++++++ deploy/kubernetes/mcp-indexer.yaml | 9 +++++++++ 3 files changed, 40 insertions(+) diff --git a/deploy/kubernetes/indexer-services.yaml b/deploy/kubernetes/indexer-services.yaml index c0a73e00..271e3578 100644 --- a/deploy/kubernetes/indexer-services.yaml +++ b/deploy/kubernetes/indexer-services.yaml @@ -47,6 +47,10 @@ spec: configMapKeyRef: name: context-engine-config key: EMBEDDING_MODEL + - name: HF_HOME + value: /work/models/hf-cache + - name: XDG_CACHE_HOME + value: /work/models/hf-cache - name: WATCH_ROOT value: /work - name: QDRANT_TIMEOUT @@ -74,6 +78,10 @@ spec: configMapKeyRef: name: context-engine-config key: WATCH_DEBOUNCE_SECS + - name: HF_HOME + value: /work/models/hf-cache + - name: XDG_CACHE_HOME + value: /work/models/hf-cache resources: requests: memory: 512Mi @@ -86,6 +94,8 @@ spec: mountPath: /work - name: metadata-volume mountPath: /work/.codebase + - name: models-volume + mountPath: /work/models envFrom: - configMapRef: name: context-engine-config @@ -96,6 +106,9 @@ spec: - name: metadata-volume persistentVolumeClaim: claimName: code-metadata-pvc + - name: models-volume + persistentVolumeClaim: + claimName: code-models-pvc --- apiVersion: batch/v1 kind: Job @@ -142,6 +155,10 @@ spec: configMapKeyRef: name: context-engine-config key: EMBEDDING_MODEL + - name: HF_HOME + value: /work/models/hf-cache + - name: XDG_CACHE_HOME + value: /work/models/hf-cache resources: requests: memory: 1Gi @@ -154,6 +171,8 @@ spec: mountPath: /work - name: metadata-volume mountPath: /work/.codebase + - name: models-volume + mountPath: /work/models envFrom: - configMapRef: name: context-engine-config @@ -164,6 +183,9 @@ spec: - name: metadata-volume persistentVolumeClaim: claimName: code-metadata-pvc + - name: models-volume + persistentVolumeClaim: + claimName: code-models-pvc --- apiVersion: batch/v1 kind: Job diff --git 
a/deploy/kubernetes/mcp-http.yaml b/deploy/kubernetes/mcp-http.yaml index 5d60bf4b..5dbff5fe 100644 --- a/deploy/kubernetes/mcp-http.yaml +++ b/deploy/kubernetes/mcp-http.yaml @@ -53,6 +53,10 @@ spec: configMapKeyRef: name: context-engine-config key: EMBEDDING_MODEL + - name: HF_HOME + value: /work/models/hf-cache + - name: XDG_CACHE_HOME + value: /work/models/hf-cache - name: EMBEDDING_PROVIDER valueFrom: configMapKeyRef: @@ -277,6 +281,8 @@ spec: mountPath: /work - name: codebase-volume mountPath: /work/.codebase + - name: models-volume + mountPath: /work/models livenessProbe: httpGet: path: /readyz @@ -303,6 +309,9 @@ spec: - name: codebase-volume persistentVolumeClaim: claimName: code-metadata-pvc + - name: models-volume + persistentVolumeClaim: + claimName: code-models-pvc --- apiVersion: v1 kind: Service diff --git a/deploy/kubernetes/mcp-indexer.yaml b/deploy/kubernetes/mcp-indexer.yaml index 505eaed5..7545158d 100644 --- a/deploy/kubernetes/mcp-indexer.yaml +++ b/deploy/kubernetes/mcp-indexer.yaml @@ -67,6 +67,10 @@ spec: configMapKeyRef: name: context-engine-config key: EMBEDDING_MODEL + - name: HF_HOME + value: /work/models/hf-cache + - name: XDG_CACHE_HOME + value: /work/models/hf-cache resources: requests: memory: 512Mi @@ -79,6 +83,8 @@ spec: mountPath: /work - name: codebase-volume mountPath: /work/.codebase + - name: models-volume + mountPath: /work/models livenessProbe: httpGet: path: /readyz @@ -105,6 +111,9 @@ spec: - name: codebase-volume persistentVolumeClaim: claimName: code-metadata-pvc + - name: models-volume + persistentVolumeClaim: + claimName: code-models-pvc --- apiVersion: v1 kind: Service From 22bb63e92f854a796452a26b1efcd8385dcba9f8 Mon Sep 17 00:00:00 2001 From: Reese Date: Fri, 28 Nov 2025 15:38:55 +0000 Subject: [PATCH 13/25] vscode-ext: fix: align ctx_config collection inference with python detection flow - Update writeCtxConfig() to call ensurePythonDependencies before probing --show-mapping - Re-resolve upload client options 
after Python detection so pythonOverridePath is honored - Prevent Windows Store python/python3 aliases from breaking collection inference - Keep inferCollectionFromUpload() behavior the same once a valid interpreter is selected --- vscode-extension/context-engine-uploader/extension.js | 9 ++++++++- 1 file changed, 8 insertions(+), 1 deletion(-) diff --git a/vscode-extension/context-engine-uploader/extension.js b/vscode-extension/context-engine-uploader/extension.js index 608ceea9..a72b7cdc 100644 --- a/vscode-extension/context-engine-uploader/extension.js +++ b/vscode-extension/context-engine-uploader/extension.js @@ -1093,10 +1093,17 @@ async function writeCtxConfig() { log('CTX config scaffolding skipped because scaffoldCtxConfig is false.'); return; } - const options = resolveOptions(); + let options = resolveOptions(); if (!options) { return; } + const depsOk = await ensurePythonDependencies(options.pythonPath); + if (!depsOk) { + return; + } + // ensurePythonDependencies may switch to a better interpreter (pythonOverridePath), + // so re-resolve options to pick up the updated pythonPath and script/working directory. + options = resolveOptions() || options; const collectionName = inferCollectionFromUpload(options); if (!collectionName) { vscode.window.showErrorMessage('Context Engine Uploader: failed to infer collection name from upload client. Check the Output panel for details.'); From d0bcd87206e481a403758eef2aa2e952cd686ef1 Mon Sep 17 00:00:00 2001 From: Reese Date: Fri, 28 Nov 2025 16:25:57 +0000 Subject: [PATCH 14/25] upload client: fix: normalize upload client paths for Windows - Use as_posix() for all relative paths in standalone and remote upload clients - Ensure operations.json uses forward-slash paths (e.g. 
scripts/foo.py) - Match tar layout so upload_service can extract files from Windows bundles - Fixes issue where only top-level files appeared under /work/- on the cluster --- scripts/remote_upload_client.py | 10 +++++----- scripts/standalone_upload_client.py | 10 +++++----- 2 files changed, 10 insertions(+), 10 deletions(-) diff --git a/scripts/remote_upload_client.py b/scripts/remote_upload_client.py index f98cb136..127bbbc0 100644 --- a/scripts/remote_upload_client.py +++ b/scripts/remote_upload_client.py @@ -561,7 +561,7 @@ def create_delta_bundle(self, changes: Dict[str, List]) -> Tuple[str, Dict[str, # Process created files for path in changes["created"]: - rel_path = str(path.relative_to(Path(self.workspace_path))) + rel_path = path.relative_to(Path(self.workspace_path)).as_posix() try: with open(path, 'rb') as f: content = f.read() @@ -598,7 +598,7 @@ def create_delta_bundle(self, changes: Dict[str, List]) -> Tuple[str, Dict[str, # Process updated files for path in changes["updated"]: - rel_path = str(path.relative_to(Path(self.workspace_path))) + rel_path = path.relative_to(Path(self.workspace_path)).as_posix() try: with open(path, 'rb') as f: content = f.read() @@ -637,8 +637,8 @@ def create_delta_bundle(self, changes: Dict[str, List]) -> Tuple[str, Dict[str, # Process moved files for source_path, dest_path in changes["moved"]: - dest_rel_path = str(dest_path.relative_to(Path(self.workspace_path))) - source_rel_path = str(source_path.relative_to(Path(self.workspace_path))) + dest_rel_path = dest_path.relative_to(Path(self.workspace_path)).as_posix() + source_rel_path = source_path.relative_to(Path(self.workspace_path)).as_posix() try: with open(dest_path, 'rb') as f: content = f.read() @@ -678,7 +678,7 @@ def create_delta_bundle(self, changes: Dict[str, List]) -> Tuple[str, Dict[str, # Process deleted files for path in changes["deleted"]: - rel_path = str(path.relative_to(Path(self.workspace_path))) + rel_path = 
path.relative_to(Path(self.workspace_path)).as_posix() try: previous_hash = get_cached_file_hash(str(path.resolve()), self.repo_name) diff --git a/scripts/standalone_upload_client.py b/scripts/standalone_upload_client.py index 564e78d2..063db07e 100644 --- a/scripts/standalone_upload_client.py +++ b/scripts/standalone_upload_client.py @@ -716,7 +716,7 @@ def create_delta_bundle(self, changes: Dict[str, List]) -> Tuple[str, Dict[str, # Process created files for path in changes["created"]: - rel_path = str(path.relative_to(Path(self.workspace_path))) + rel_path = path.relative_to(Path(self.workspace_path)).as_posix() try: with open(path, 'rb') as f: content = f.read() @@ -754,7 +754,7 @@ def create_delta_bundle(self, changes: Dict[str, List]) -> Tuple[str, Dict[str, # Process updated files for path in changes["updated"]: - rel_path = str(path.relative_to(Path(self.workspace_path))) + rel_path = path.relative_to(Path(self.workspace_path)).as_posix() try: with open(path, 'rb') as f: content = f.read() @@ -794,8 +794,8 @@ def create_delta_bundle(self, changes: Dict[str, List]) -> Tuple[str, Dict[str, # Process moved files for source_path, dest_path in changes["moved"]: - dest_rel_path = str(dest_path.relative_to(Path(self.workspace_path))) - source_rel_path = str(source_path.relative_to(Path(self.workspace_path))) + dest_rel_path = dest_path.relative_to(Path(self.workspace_path)).as_posix() + source_rel_path = source_path.relative_to(Path(self.workspace_path)).as_posix() try: with open(dest_path, 'rb') as f: content = f.read() @@ -836,7 +836,7 @@ def create_delta_bundle(self, changes: Dict[str, List]) -> Tuple[str, Dict[str, # Process deleted files for path in changes["deleted"]: - rel_path = str(path.relative_to(Path(self.workspace_path))) + rel_path = path.relative_to(Path(self.workspace_path)).as_posix() try: previous_hash = get_cached_file_hash(str(path.resolve()), self.repo_name) From 8eb74135d5d5a88fad0eb9b42332461919db4314 Mon Sep 17 00:00:00 2001 From: Reese 
Date: Fri, 28 Nov 2025 16:35:28 +0000 Subject: [PATCH 15/25] K8: Configures HF cache environment variables Configures the `HF_HOME`, `XDG_CACHE_HOME`, and `HF_HUB_CACHE` environment variables for both the HTTP server and the indexer. This change ensures that both components use a consistent Hugging Face cache directory, preventing redundant downloads and improving efficiency. --- deploy/kubernetes/mcp-http.yaml | 6 ++++++ deploy/kubernetes/mcp-indexer.yaml | 2 ++ 2 files changed, 8 insertions(+) diff --git a/deploy/kubernetes/mcp-http.yaml b/deploy/kubernetes/mcp-http.yaml index 5dbff5fe..ae6a707c 100644 --- a/deploy/kubernetes/mcp-http.yaml +++ b/deploy/kubernetes/mcp-http.yaml @@ -220,6 +220,12 @@ spec: configMapKeyRef: name: context-engine-config key: EMBEDDING_MODEL + - name: HF_HOME + value: /work/models/hf-cache + - name: XDG_CACHE_HOME + value: /work/models/hf-cache + - name: HF_HUB_CACHE + value: /work/models/hf-cache/huggingface - name: INDEX_MICRO_CHUNKS valueFrom: configMapKeyRef: diff --git a/deploy/kubernetes/mcp-indexer.yaml b/deploy/kubernetes/mcp-indexer.yaml index 7545158d..c8ade75a 100644 --- a/deploy/kubernetes/mcp-indexer.yaml +++ b/deploy/kubernetes/mcp-indexer.yaml @@ -71,6 +71,8 @@ spec: value: /work/models/hf-cache - name: XDG_CACHE_HOME value: /work/models/hf-cache + - name: HF_HUB_CACHE + value: /work/models/hf-cache/huggingface resources: requests: memory: 512Mi From 2bc6631f8573f17b53c8e191497510a041912abe Mon Sep 17 00:00:00 2001 From: Reese Date: Fri, 28 Nov 2025 16:46:08 +0000 Subject: [PATCH 16/25] k8: watcher: add optional polling mode for shared PVCs Use watchdog's PollingObserver when WATCH_USE_POLLING is set so the watcher can see file changes made by other pods on the shared /work PVC (NFS/CephFS), where inotify events are not reliably propagated across nodes. Default behavior remains unchanged when the env var is unset. 
--- scripts/watch_index.py | 26 +++++++++++++++++++++++++- 1 file changed, 25 insertions(+), 1 deletion(-) diff --git a/scripts/watch_index.py b/scripts/watch_index.py index 845a022c..30b7ea08 100644 --- a/scripts/watch_index.py +++ b/scripts/watch_index.py @@ -751,7 +751,31 @@ def main(): ) handler = IndexHandler(ROOT, q, client, default_collection) - obs = Observer() + use_polling = (os.environ.get("WATCH_USE_POLLING") or "").strip().lower() in ( + "1", + "true", + "yes", + "on", + ) + if use_polling: + try: + from watchdog.observers.polling import PollingObserver # type: ignore + + obs = PollingObserver() + try: + print("[watch_mode] Using polling observer for filesystem events") + except Exception: + pass + except Exception: + obs = Observer() + try: + print( + "[watch_mode] Polling observer unavailable, falling back to default Observer" + ) + except Exception: + pass + else: + obs = Observer() obs.schedule(handler, str(ROOT), recursive=True) obs.start() From 318a0b6b892b9d02f089c01f3d82beecb794e01a Mon Sep 17 00:00:00 2001 From: Reese Date: Fri, 28 Nov 2025 19:26:45 +0000 Subject: [PATCH 17/25] Normalize CTX paths to user workspace via origin source_path MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit - Make ctx.py trust the server-chosen `path` field when formatting search results, falling back to `host_path`/`container_path` only when `path` is missing. This centralizes display-path policy in the indexer (`hybrid_search` + PATH_EMIT_MODE) instead of duplicating it in the CTX CLI / hook. - Fix ingest_code.py host_path mapping for remote upload workspaces. When indexing under /work/&lt;slug&gt;/ and origin.source_path is set, drop the leading slug segment and map: /work/&lt;slug&gt;/… → &lt;origin.source_path&gt;/… so metadata.host_path is a clean, user-facing path rooted at the original client workspace, without embedding the slug directory.
- Update docker-compose.dev-remote PATH_EMIT_MODE from `container` to `auto` for indexer / MCP services so hybrid_search prefers host_path when available and only falls back to container_path. This lets CTX and the VS Code extension show real host/workspace paths derived from origin.source_path, while still allowing deployments to force pure container paths by setting PATH_EMIT_MODE=container if desired. Overall, CTX hook output now surfaces consistent, user-facing paths (e.g. /home/.../Context-Engine/…), while container-style /work paths remain available as an explicit server-side configuration choice. --- docker-compose.dev-remote.yml | 8 ++++---- scripts/ctx.py | 36 +++++++++++++++++------------------ scripts/ingest_code.py | 5 ++++- 3 files changed, 26 insertions(+), 23 deletions(-) diff --git a/docker-compose.dev-remote.yml b/docker-compose.dev-remote.yml index 8edd5034..437ec4a0 100644 --- a/docker-compose.dev-remote.yml +++ b/docker-compose.dev-remote.yml @@ -34,7 +34,7 @@ services: - FASTMCP_PORT=${FASTMCP_PORT} - QDRANT_URL=${QDRANT_URL} - COLLECTION_NAME=${COLLECTION_NAME} - - PATH_EMIT_MODE=container + - PATH_EMIT_MODE=auto - HF_HOME=/work/.cache/huggingface - TRANSFORMERS_CACHE=/work/.cache/huggingface - HUGGINGFACE_HUB_CACHE=/work/.cache/huggingface @@ -82,7 +82,7 @@ services: - GLM_MODEL=${GLM_MODEL:-glm-4.6} - LLAMACPP_URL=${LLAMACPP_URL:-http://llamacpp:8080} - COLLECTION_NAME=${COLLECTION_NAME} - - PATH_EMIT_MODE=container + - PATH_EMIT_MODE=auto - HF_HOME=/tmp/huggingface - HF_HUB_CACHE=/tmp/huggingface/hub - TRANSFORMERS_CACHE=/tmp/huggingface/transformers @@ -121,7 +121,7 @@ services: - FASTMCP_TRANSPORT=${FASTMCP_HTTP_TRANSPORT} - QDRANT_URL=${QDRANT_URL} - COLLECTION_NAME=${COLLECTION_NAME} - - PATH_EMIT_MODE=container + - PATH_EMIT_MODE=auto - HF_HOME=/work/.cache/huggingface - TRANSFORMERS_CACHE=/work/.cache/huggingface - HUGGINGFACE_HUB_CACHE=/work/.cache/huggingface @@ -170,7 +170,7 @@ services: - 
LLAMACPP_URL=${LLAMACPP_URL:-http://llamacpp:8080} - FASTMCP_HEALTH_PORT=18001 - COLLECTION_NAME=${COLLECTION_NAME} - - PATH_EMIT_MODE=container + - PATH_EMIT_MODE=auto - HF_HOME=/tmp/huggingface - HF_HUB_CACHE=/tmp/huggingface/hub - TRANSFORMERS_CACHE=/tmp/huggingface/transformers diff --git a/scripts/ctx.py b/scripts/ctx.py index c5dda3d3..b1cc3c00 100755 --- a/scripts/ctx.py +++ b/scripts/ctx.py @@ -306,8 +306,14 @@ def format_search_results(results: List[Dict[str, Any]], include_snippets: bool """ lines: List[str] = [] for hit in results: - # Prefer client-facing host_path, fall back to container path - path = hit.get("host_path") or hit.get("path", "unknown") + # Prefer the server-chosen display path; fall back to host/container paths + raw_path = ( + hit.get("path") + or hit.get("host_path") + or hit.get("container_path") + or "unknown" + ) + path = raw_path start = hit.get("start_line", "?") end = hit.get("end_line", "?") language = hit.get("language") or "" @@ -513,25 +519,19 @@ def sanitize_citations(text: str, allowed_paths: Set[str]) -> str: if _b: basename_to_paths.setdefault(_b, set()).add(_p) + # For now, keep allowed paths exactly as they appear in the context refs. + # Earlier versions tried to be clever by rewriting absolute paths to + # workspace-relative forms (e.g., "Context-Engine/scripts/ctx.py"), which + # could produce confusing hybrids when multiple workspace roots or + # slugged/collection-hash directories were involved. To simplify behavior + # and avoid mixing host/container/hash paths, we preserve the original + # full path strings for any citation that is known to come from the + # formatted context. root = (os.environ.get("CTX_WORKSPACE_DIR") or "").strip() def _to_display_path(full_path: str) -> str: - if not full_path: - return full_path - if not root: - return full_path - try: - root_norm = root.rstrip("/\\") - repo_name = os.path.basename(root_norm) if root_norm else "" - if full_path == root_norm: - return repo_name or "." 
- if full_path.startswith(root_norm + os.sep): - rel = os.path.relpath(full_path, root_norm) - if repo_name: - return repo_name + os.sep + (rel or "") - return rel or "." - except Exception: - return full_path + # Identity mapping: leave allowed paths as-is so the LLM sees the same + # absolute/host paths that appeared in the Context refs. return full_path def _repl(m): diff --git a/scripts/ingest_code.py b/scripts/ingest_code.py index 9bc79139..1446110d 100644 --- a/scripts/ingest_code.py +++ b/scripts/ingest_code.py @@ -2657,7 +2657,10 @@ def make_point(pid, dense_vec, lex_vec, payload): _rel = _cur_path[len("/work/"):] # Prioritize client path from origin metadata over HOST_INDEX_PATH if _origin_client_path: - _host_path = os.path.realpath(os.path.join(_origin_client_path, _rel)) + _parts = _rel.split("/", 1) + _tail = _parts[1] if len(_parts) > 1 else "" + _base = _origin_client_path.rstrip("/") + _host_path = os.path.realpath(os.path.join(_base, _tail)) if _tail else _base else: _host_path = os.path.realpath(os.path.join(_host_root, _rel)) _container_path = _cur_path From f74619fd1237f24762b2f4daf3d2fb065d49c17e Mon Sep 17 00:00:00 2001 From: Reese Date: Fri, 28 Nov 2025 22:04:34 +0000 Subject: [PATCH 18/25] Removes liveness probe from qdrant - readiness probe still there --- deploy/kubernetes/qdrant.yaml | 6 ------ 1 file changed, 6 deletions(-) diff --git a/deploy/kubernetes/qdrant.yaml b/deploy/kubernetes/qdrant.yaml index 180ed637..7ec599f7 100644 --- a/deploy/kubernetes/qdrant.yaml +++ b/deploy/kubernetes/qdrant.yaml @@ -47,12 +47,6 @@ spec: volumeMounts: - name: qdrant-storage mountPath: /qdrant/storage - livenessProbe: - httpGet: - path: /healthz - port: http - initialDelaySeconds: 30 - periodSeconds: 10 readinessProbe: httpGet: path: /readyz From 1749fd006a82e9fbf9a1a78db0af27cef6311767 Mon Sep 17 00:00:00 2001 From: Reese Date: Fri, 28 Nov 2025 23:11:28 +0000 Subject: [PATCH 19/25] perf: Reduce Qdrant index overhead in multi-repo indexer/watcher
MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit Previously, multi-repo indexing called ensure_collection() and ensure_payload_indexes() inside the per-file loop. In large workspaces this meant O(#files × #indexed_fields) Qdrant control-plane traffic (many PUT /collections/<collection>/index?wait=true calls), even on the "Skipping unchanged file (cache)" path. This change introduces ENSURED_COLLECTIONS and ensure_collection_and_indexes_once() in ingest_code: - Single-collection index_repo: - Still recreates the collection when --recreate is set. - Now uses ensure_collection_and_indexes_once() so collection + payload indexes are ensured once per process, not repeatedly. - Multi-repo index_repo: - For each per-repo collection, calls ensure_collection_and_indexes_once() the first time it is seen. - Subsequent files in the same collection (including cached "Skipping unchanged file (cache)" files) no longer trigger extra ensure_collection / create_payload_index calls. - Net effect: Qdrant index setup overhead becomes O(#collections × #indexed_fields) per process instead of O(#files × #indexed_fields). watch_index is updated to use the same helper: - On startup, ensures the default collection once. - In _process_paths(), ensures each repo's collection once per watcher process, then relies on cached state for subsequent file events, avoiding repeated index-setup chatter.
--- scripts/ingest_code.py | 24 +++++++++++++++++++----- scripts/watch_index.py | 6 ++---- 2 files changed, 21 insertions(+), 9 deletions(-) diff --git a/scripts/ingest_code.py b/scripts/ingest_code.py index 1446110d..716f2cc4 100644 --- a/scripts/ingest_code.py +++ b/scripts/ingest_code.py @@ -1113,6 +1113,23 @@ def ensure_payload_indexes(client: QdrantClient, collection: str): except Exception: pass +ENSURED_COLLECTIONS: set[str] = set() + + +def ensure_collection_and_indexes_once( + client: QdrantClient, + collection: str, + dim: int, + vector_name: str | None, +) -> None: + if not collection: + return + if collection in ENSURED_COLLECTIONS: + return + ensure_collection(client, collection, dim, vector_name) + ensure_payload_indexes(client, collection) + ENSURED_COLLECTIONS.add(collection) + # Lightweight import extraction per language (best-effort) def _extract_imports(language: str, text: str) -> list: @@ -2348,10 +2365,8 @@ def index_repo( if not use_per_repo_collections: if recreate: recreate_collection(client, collection, dim, vector_name) - else: - ensure_collection(client, collection, dim, vector_name) # Ensure useful payload indexes exist (idempotent) - ensure_payload_indexes(client, collection) + ensure_collection_and_indexes_once(client, collection, dim, vector_name) else: print("[multi_repo] Skipping single collection setup - will create per-repo collections during indexing") # Repo tag for filtering: auto-detect from git or folder name @@ -2416,8 +2431,7 @@ def make_point(pid, dense_vec, lex_vec, payload): if _get_collection_for_file: current_collection = _get_collection_for_file(file_path) # Ensure collection exists on first use - ensure_collection(client, current_collection, dim, vector_name) - ensure_payload_indexes(client, current_collection) + ensure_collection_and_indexes_once(client, current_collection, dim, vector_name) else: current_collection = get_collection_name(ws_path) if get_collection_name else "default-collection" diff --git 
a/scripts/watch_index.py b/scripts/watch_index.py index 30b7ea08..7dfa224a 100644 --- a/scripts/watch_index.py +++ b/scripts/watch_index.py @@ -714,10 +714,9 @@ def main(): vector_name = idx._sanitize_vector_name(MODEL) try: - idx.ensure_collection(client, default_collection, model_dim, vector_name) + idx.ensure_collection_and_indexes_once(client, default_collection, model_dim, vector_name) except Exception: pass - idx.ensure_payload_indexes(client, default_collection) try: if multi_repo_enabled: @@ -908,8 +907,7 @@ def _process_paths(paths, client, model, vector_name: str, model_dim: int, works if client is not None and model is not None: try: - idx.ensure_collection(client, collection, model_dim, vector_name) - idx.ensure_payload_indexes(client, collection) + idx.ensure_collection_and_indexes_once(client, collection, model_dim, vector_name) except Exception: pass From 10e5b3c4b2c0dff8221cd7972937d77fc4ecfa3f Mon Sep 17 00:00:00 2001 From: Reese Date: Fri, 28 Nov 2025 23:40:36 +0000 Subject: [PATCH 20/25] docs(claude example): Clarifies agentic AI tool usage guidelines Refines the guidance for AI agents when deciding between using the MCP Qdrant-Indexer and literal search/file-open. The changes emphasize the MCP Qdrant-Indexer as the primary tool for exploration, debugging, and understanding code and history, reserving literal search/file-open for narrow, exact-literal lookups. It also simplifies the heuristics for tool selection and removes redundant descriptions of tool documentation. --- docs/CLAUDE.example.md | 18 ++++++++---------- 1 file changed, 8 insertions(+), 10 deletions(-) diff --git a/docs/CLAUDE.example.md b/docs/CLAUDE.example.md index 92b3755b..baa1f3c1 100644 --- a/docs/CLAUDE.example.md +++ b/docs/CLAUDE.example.md @@ -1,25 +1,26 @@ -This file is intended for AI agents (Claude, etc.) using the Context‑Engine Qdrant‑Indexer and Memory MCP tools. It encodes project‑specific best practices; adapt it per‑repo. 
+This file is intended for AI agents (Claude, etc.) using the Context‑Engine Qdrant‑Indexer and Memory MCP tools. Agentic AI Project Rules: When to Use MCP Qdrant-Indexer vs Grep Core Decision Rules (for AI agents) + Workspace default: For this repo, MCP Qdrant-Indexer tools are the primary way to explore code and history. Always start with MCP for exploration, debugging, or "where/why" questions; use literal search/file-open only for narrow exact-literal lookups. + - Use MCP Qdrant-Indexer when: - You are exploring or don't know exact strings/symbols. - You need semantic or cross-file understanding (relationships, patterns, architecture). - You want ranked results with surrounding context, not just line hits. - - Use grep when: - - You know the exact string/function/variable or error message. - - You need fast literal search or are extremely token/latency constrained. + - Use literal search/file-open when (and only when): + - You know the exact string/function/variable or error message, and you only need to confirm its existence or a file/line quickly (not to understand behavior or architecture). Quick Heuristics: - - If you know the exact string → start with grep, then switch to MCP for broader context. - - If the question is conceptual/architectural → start with MCP. + - If the question is conceptual/architectural or about "where/why" behavior changed → start with MCP. - If you need rich context/snippets around matches → MCP. - - If you just need to confirm existence/location → grep. + - If you only need to confirm existence/location of a specific literal (error message, env var, exact function name) → literal search/file-open. + - If in doubt → start with MCP. Grep Anti-Patterns: @@ -114,9 +115,6 @@ Agentic AI Project Rules: When to Use MCP Qdrant-Indexer vs Grep - Then call context_answer to summarize behavior, using a behavior-focused question that doesn't over-specify filenames. 
- Avoid using context_answer as a primary debugger for low-level helper/env behavior; prefer repo_search + direct code reading for detailed semantics. - Remember: the MCP tools themselves expose detailed descriptions and parameter docs. - Use those for exact knobs; this guide is about choosing the right tool and shaping good queries. - MCP Tool Families (for AI agents) - Indexer / Qdrant tools: From e0b21df9a3d734d1ad88a4b0ddb64601b1fb77f0 Mon Sep 17 00:00:00 2001 From: Reese Date: Sat, 29 Nov 2025 01:30:10 +0000 Subject: [PATCH 21/25] chore: commit .codebase and dev-workspace folders (empty) to allow remote compose stack to run after git clone without creating the dirs yourself --- .codebase/.gitkeep | 0 .gitignore | 6 ++++-- dev-workspace/.gitkeep | 0 3 files changed, 4 insertions(+), 2 deletions(-) create mode 100644 .codebase/.gitkeep create mode 100644 dev-workspace/.gitkeep diff --git a/.codebase/.gitkeep b/.codebase/.gitkeep new file mode 100644 index 00000000..e69de29b diff --git a/.gitignore b/.gitignore index 0ddd7594..4c548379 100644 --- a/.gitignore +++ b/.gitignore @@ -17,7 +17,8 @@ __pycache__/ paper.md /semantic-search /codebase-index-cli -/.codebase +/.codebase/* +!/.codebase/.gitkeep scripts/.codebase/cache.json scripts/.codebase/state.json tests/.codebase/cache.json @@ -29,4 +30,5 @@ tests/.codebase/state.json CLAUDE.md .qodo/.cursor/rules /.augment -/dev-workspace +/dev-workspace/* +!/dev-workspace/.gitkeep diff --git a/dev-workspace/.gitkeep b/dev-workspace/.gitkeep new file mode 100644 index 00000000..e69de29b From b71b352b73b5f9091869e652f3a61eef67adf778 Mon Sep 17 00:00:00 2001 From: Reese Date: Sat, 29 Nov 2025 01:35:49 +0000 Subject: [PATCH 22/25] chore(docs): Adds Getting Started guide and updates docs Introduces a new "Getting Started" guide for quickly trying Context Engine with VS Code and the dev-remote stack. Updates documentation links across all documents to include the new Getting Started guide. 
--- README.md | 10 ++- docs/ARCHITECTURE.md | 2 +- docs/CONFIGURATION.md | 2 +- docs/CTX_CLI.md | 2 +- docs/DEVELOPMENT.md | 4 +- docs/GETTING_STARTED.md | 153 +++++++++++++++++++++++++++++++++ docs/IDE_CLIENTS.md | 8 +- docs/MCP_API.md | 4 +- docs/MEMORY_GUIDE.md | 2 +- docs/MULTI_REPO_COLLECTIONS.md | 2 +- docs/TROUBLESHOOTING.md | 2 +- docs/vscode-extension.md | 32 +++++-- 12 files changed, 200 insertions(+), 23 deletions(-) create mode 100644 docs/GETTING_STARTED.md diff --git a/README.md b/README.md index 96106fdf..c6150e29 100644 --- a/README.md +++ b/README.md @@ -1,6 +1,6 @@ [![CI](https://github.com/m1rl0k/Context-Engine/actions/workflows/ci.yml/badge.svg)](https://github.com/m1rl0k/Context-Engine/actions/workflows/ci.yml) -**Documentation:** README · [Configuration](docs/CONFIGURATION.md) · [IDE Clients](docs/IDE_CLIENTS.md) · [MCP API](docs/MCP_API.md) · [ctx CLI](docs/CTX_CLI.md) · [Memory Guide](docs/MEMORY_GUIDE.md) · [Architecture](docs/ARCHITECTURE.md) · [Multi-Repo](docs/MULTI_REPO_COLLECTIONS.md) · [Kubernetes](deploy/kubernetes/README.md) · [VS Code Extension](docs/vscode-extension.md) · [Troubleshooting](docs/TROUBLESHOOTING.md) · [Development](docs/DEVELOPMENT.md) +**Documentation:** [Getting Started](docs/GETTING_STARTED.md) · README · [Configuration](docs/CONFIGURATION.md) · [IDE Clients](docs/IDE_CLIENTS.md) · [MCP API](docs/MCP_API.md) · [ctx CLI](docs/CTX_CLI.md) · [Memory Guide](docs/MEMORY_GUIDE.md) · [Architecture](docs/ARCHITECTURE.md) · [Multi-Repo](docs/MULTI_REPO_COLLECTIONS.md) · [Kubernetes](deploy/kubernetes/README.md) · [VS Code Extension](docs/vscode-extension.md) · [Troubleshooting](docs/TROUBLESHOOTING.md) · [Development](docs/DEVELOPMENT.md) --- @@ -38,13 +38,19 @@ Context-Engine is a plug-and-play MCP retrieval stack that unifies code indexing | OpenAI Codex | RMCP | TOML config | | Augment | SSE | Simple JSON configs | | AmpCode | SSE | Simple URL for SSE endpoints | -| Claude Code CLI | SSE | Simple JSON configs | +| 
Claude Code CLI | SSE / HTTP (RMCP) | Simple JSON configs via .mcp.json | > **See [docs/IDE_CLIENTS.md](docs/IDE_CLIENTS.md) for detailed configuration examples.** ## Getting Started +If you're a VS Code user trying Context-Engine locally, start with the low-friction dev-remote + extension guide: + +- **[docs/GETTING_STARTED.md](docs/GETTING_STARTED.md)** + +The options below describe the `docker compose` + CLI workflows. + ### Option 1: Deploy & Connect (Recommended) Deploy Context-Engine once, connect any IDE. No need to clone this repo into your project. diff --git a/docs/ARCHITECTURE.md b/docs/ARCHITECTURE.md index 9de3318c..cc6edd43 100644 --- a/docs/ARCHITECTURE.md +++ b/docs/ARCHITECTURE.md @@ -1,6 +1,6 @@ # Context Engine Architecture -**Documentation:** [README](../README.md) · [Configuration](CONFIGURATION.md) · [IDE Clients](IDE_CLIENTS.md) · [MCP API](MCP_API.md) · [ctx CLI](CTX_CLI.md) · [Memory Guide](MEMORY_GUIDE.md) · [Architecture](ARCHITECTURE.md) · [Multi-Repo](MULTI_REPO_COLLECTIONS.md) · [Kubernetes](../deploy/kubernetes/README.md) · [VS Code Extension](vscode-extension.md) · [Troubleshooting](TROUBLESHOOTING.md) · [Development](DEVELOPMENT.md) +**Documentation:** [README](../README.md) · [Getting Started](GETTING_STARTED.md) · [Configuration](CONFIGURATION.md) · [IDE Clients](IDE_CLIENTS.md) · [MCP API](MCP_API.md) · [ctx CLI](CTX_CLI.md) · [Memory Guide](MEMORY_GUIDE.md) · [Architecture](ARCHITECTURE.md) · [Multi-Repo](MULTI_REPO_COLLECTIONS.md) · [Kubernetes](../deploy/kubernetes/README.md) · [VS Code Extension](vscode-extension.md) · [Troubleshooting](TROUBLESHOOTING.md) · [Development](DEVELOPMENT.md) --- diff --git a/docs/CONFIGURATION.md b/docs/CONFIGURATION.md index dda96fdd..13c3a502 100644 --- a/docs/CONFIGURATION.md +++ b/docs/CONFIGURATION.md @@ -2,7 +2,7 @@ Complete environment variable reference for Context Engine. 
-**Documentation:** [README](../README.md) · [Configuration](CONFIGURATION.md) · [IDE Clients](IDE_CLIENTS.md) · [MCP API](MCP_API.md) · [ctx CLI](CTX_CLI.md) · [Memory Guide](MEMORY_GUIDE.md) · [Architecture](ARCHITECTURE.md) · [Multi-Repo](MULTI_REPO_COLLECTIONS.md) · [Kubernetes](../deploy/kubernetes/README.md) · [VS Code Extension](vscode-extension.md) · [Troubleshooting](TROUBLESHOOTING.md) · [Development](DEVELOPMENT.md) +**Documentation:** [README](../README.md) · [Getting Started](GETTING_STARTED.md) · [Configuration](CONFIGURATION.md) · [IDE Clients](IDE_CLIENTS.md) · [MCP API](MCP_API.md) · [ctx CLI](CTX_CLI.md) · [Memory Guide](MEMORY_GUIDE.md) · [Architecture](ARCHITECTURE.md) · [Multi-Repo](MULTI_REPO_COLLECTIONS.md) · [Kubernetes](../deploy/kubernetes/README.md) · [VS Code Extension](vscode-extension.md) · [Troubleshooting](TROUBLESHOOTING.md) · [Development](DEVELOPMENT.md) --- diff --git a/docs/CTX_CLI.md b/docs/CTX_CLI.md index 2a0f620c..a9e87938 100644 --- a/docs/CTX_CLI.md +++ b/docs/CTX_CLI.md @@ -2,7 +2,7 @@ A thin CLI that retrieves code context and rewrites your input into a better, context-aware prompt using the local LLM decoder. Works with both questions and commands/instructions. 
-**Documentation:** [README](../README.md) · [Configuration](CONFIGURATION.md) · [IDE Clients](IDE_CLIENTS.md) · [MCP API](MCP_API.md) · [ctx CLI](CTX_CLI.md) · [Memory Guide](MEMORY_GUIDE.md) · [Architecture](ARCHITECTURE.md) · [Multi-Repo](MULTI_REPO_COLLECTIONS.md) · [Kubernetes](../deploy/kubernetes/README.md) · [VS Code Extension](vscode-extension.md) · [Troubleshooting](TROUBLESHOOTING.md) · [Development](DEVELOPMENT.md) +**Documentation:** [README](../README.md) · [Getting Started](GETTING_STARTED.md) · [Configuration](CONFIGURATION.md) · [IDE Clients](IDE_CLIENTS.md) · [MCP API](MCP_API.md) · [ctx CLI](CTX_CLI.md) · [Memory Guide](MEMORY_GUIDE.md) · [Architecture](ARCHITECTURE.md) · [Multi-Repo](MULTI_REPO_COLLECTIONS.md) · [Kubernetes](../deploy/kubernetes/README.md) · [VS Code Extension](vscode-extension.md) · [Troubleshooting](TROUBLESHOOTING.md) · [Development](DEVELOPMENT.md) --- diff --git a/docs/DEVELOPMENT.md b/docs/DEVELOPMENT.md index 1bf6d3bf..b38e0b1a 100644 --- a/docs/DEVELOPMENT.md +++ b/docs/DEVELOPMENT.md @@ -2,7 +2,7 @@ This guide covers setting up a development environment, understanding the codebase structure, and contributing to Context Engine. 
-**Documentation:** [README](../README.md) · [Configuration](CONFIGURATION.md) · [IDE Clients](IDE_CLIENTS.md) · [MCP API](MCP_API.md) · [ctx CLI](CTX_CLI.md) · [Memory Guide](MEMORY_GUIDE.md) · [Architecture](ARCHITECTURE.md) · [Multi-Repo](MULTI_REPO_COLLECTIONS.md) · [Kubernetes](../deploy/kubernetes/README.md) · [VS Code Extension](vscode-extension.md) · [Troubleshooting](TROUBLESHOOTING.md) · [Development](DEVELOPMENT.md) +**Documentation:** [README](../README.md) · [Getting Started](GETTING_STARTED.md) · [Configuration](CONFIGURATION.md) · [IDE Clients](IDE_CLIENTS.md) · [MCP API](MCP_API.md) · [ctx CLI](CTX_CLI.md) · [Memory Guide](MEMORY_GUIDE.md) · [Architecture](ARCHITECTURE.md) · [Multi-Repo](MULTI_REPO_COLLECTIONS.md) · [Kubernetes](../deploy/kubernetes/README.md) · [VS Code Extension](vscode-extension.md) · [Troubleshooting](TROUBLESHOOTING.md) · [Development](DEVELOPMENT.md) --- @@ -68,7 +68,7 @@ curl http://localhost:8001/sse # Indexer server SSE ### 4. IDE MCP Configuration -For MCP-aware IDEs (Claude Desktop, Windsurf, etc.), prefer the HTTP MCP endpoints: +For MCP-aware IDEs (Claude Code, Windsurf, etc.), prefer the HTTP MCP endpoints: ```bash # Memory MCP (HTTP) diff --git a/docs/GETTING_STARTED.md b/docs/GETTING_STARTED.md new file mode 100644 index 00000000..8d22bafb --- /dev/null +++ b/docs/GETTING_STARTED.md @@ -0,0 +1,153 @@ +# Getting Started (VS Code + Dev-Remote) + +**Documentation:** [README](../README.md) · [Getting Started](GETTING_STARTED.md) · [Configuration](CONFIGURATION.md) · [IDE Clients](IDE_CLIENTS.md) · [MCP API](MCP_API.md) · [ctx CLI](CTX_CLI.md) · [Memory Guide](MEMORY_GUIDE.md) · [Architecture](ARCHITECTURE.md) · [Multi-Repo](MULTI_REPO_COLLECTIONS.md) · [Kubernetes](../deploy/kubernetes/README.md) · [VS Code Extension](vscode-extension.md) · [Troubleshooting](TROUBLESHOOTING.md) · [Development](DEVELOPMENT.md) + +This guide is for developers who want the lowest-friction way to try Context-Engine: + +- Run a single Docker 
Compose stack +- Install one VS Code extension +- Open a project and start asking questions about your code + +--- + +## 1. Prerequisites + +- **Docker** (Docker Desktop or equivalent) +- **Git** +- **VS Code** (to use the Context Engine Uploader extension) +- **An MCP-enabled IDE or client** to talk to Context-Engine via MCP, for example: + - Claude Code, Windsurf, Cursor, Roo, Cline, Zed (via `mcp-remote`), etc. + +CLI-only workflows using `ctx.py` and hybrid search tools are supported but are documented separately. This guide assumes you will talk to Context-Engine through an MCP-enabled assistant. + +You do *not* need to clone this repo into every project. You run Context-Engine once, then point it at whatever code you care about. + +--- + +## 2. Start the dev-remote stack + +In a terminal (from wherever you want the stack to live): + +```bash +git clone https://github.com/m1rl0k/Context-Engine.git +cd Context-Engine + +# Start the dev-remote stack (Qdrant, MCPs, upload service, watcher, etc.) +docker compose -f docker-compose.dev-remote.yml up -d +``` + +This brings up, on your host machine: + +- Qdrant on `http://localhost:6333` +- Memory MCP: + - SSE: `http://localhost:8000/sse` + - HTTP / RMCP: `http://localhost:8002/mcp` +- Indexer MCP: + - SSE: `http://localhost:8001/sse` + - HTTP / RMCP: `http://localhost:8003/mcp` +- Upload service (used by the VS Code extension) on `http://localhost:8004` + +--- + +## 3. Index your code (via VS Code extension) + +In the dev-remote flow, you normally do **not** run the indexer manually. 
+ +Instead, the VS Code extension uploads your workspace to the dev-remote stack, and the `indexer` + `watcher` services handle: + +- Mirroring your project into the container under `/work` (in dev-workspace folder) +- Walking files, chunking them, and writing vectors + metadata into Qdrant +- Tracking per-file hashes under `.codebase` so unchanged files are skipped + +If you prefer CLI-based indexing, see the README and advanced docs (Multi-Repo, Kubernetes, etc.) for `docker compose run --rm indexer` usage. + +--- + +## 4. Connect your IDE + +The normal way to use Context-Engine is through an MCP-enabled assistant. The simplest config is via the HTTP MCP endpoints below; the VS Code extension can also scaffold these configs for you. + +### Example: Claude Code / generic RMCP client + +Add to your MCP config (for example `claude_code_config.json`): + +```json +{ + "mcpServers": { + "memory": { "url": "http://localhost:8002/mcp" }, + "qdrant-indexer": { "url": "http://localhost:8003/mcp" } + } +} +``` + +### Example: Windsurf / Cursor (SSE) + +If your client prefers SSE: + +```json +{ + "mcpServers": { + "memory": { "type": "sse", "url": "http://localhost:8000/sse", "disabled": false }, + "qdrant-indexer": { "type": "sse", "url": "http://localhost:8001/sse", "disabled": false } + } +} +``` + +See [docs/IDE_CLIENTS.md](IDE_CLIENTS.md) for copy-paste configs for specific IDEs. + +--- + +## 5. Try a few example queries + +Once your IDE MCP's are connected and indexing has finished, just *ask your assistant* questions like the ones below; it will call the MCP tools on your behalf. + +### Code search examples (qdrant-indexer) + +Ask your assistant to run something like: + +- "Find places where remote uploads are retried" +- "Show functions that call ingest_code.index_repo" +- "Search for performance bottlenecks in the upload_service script" + +Under the hood, the client will call tools such as `repo_search` or `context_answer` on the `qdrant-indexer` server. 
### Commit history / lineage examples (qdrant-indexer) + +If you have git history ingestion enabled, you can also ask: + +- "When did we add git history ingestion to the upload client?" +- "When did we optimize git history collection to fetch only commits since last upload?" +- "What commits mention Windows UnicodeDecodeError in git history collection?" + +These eventually call `search_commits_for` and related tools, which use the commit index and lineage summaries. + +--- + +## 6. VS Code extension (recommended) + +For this dev-remote flow, the **Context Engine Uploader** VS Code extension is the primary way to sync and index code: + +- Install it from the VS Code Marketplace (search for "Context Engine Uploader") +- Point it at your project (or let it auto-detect the current workspace root) +- Configure the upload endpoint to `http://localhost:8004` +- Start the uploader; it will force an initial upload and then watch for changes + +Under **Settings → Extensions → Context Engine Uploader** you will typically use: + +- `endpoint`: `http://localhost:8004` (dev-remote upload_service) +- Optional MCP settings: `mcpIndexerUrl`, `mcpMemoryUrl`, and `mcpTransportMode` (`sse-remote` or `http`) pointing at the dev-remote memory/indexer URLs listed above +- Optional auto-config: enable `mcpClaudeEnabled` / `mcpWindsurfEnabled` and `autoWriteMcpConfigOnStartup` to have the extension write Claude Code/Windsurf MCP configs (and an optional `/ctx` hook) for you + +Once running, your code is kept in sync with the dev-remote stack without any manual indexer commands. ## 7. 
Where to go next + +Once you have the basic flow working (dev-remote stack up → VS Code extension syncing → IDE connected via MCP → run a few queries), you can explore: + +- [Configuration](CONFIGURATION.md) — environment variables and tuning knobs +- [IDE Clients](IDE_CLIENTS.md) — detailed configs for specific IDEs +- [Multi-Repo](MULTI_REPO_COLLECTIONS.md) — multi-repo collections, remote servers, Kubernetes +- [Memory Guide](MEMORY_GUIDE.md) — how to use the Memory MCP server alongside the indexer +- [Architecture](ARCHITECTURE.md) — deeper dive into how the components fit together +- [ctx CLI](CTX_CLI.md) — CLI workflows and prompt hooks; see `ctx/claude-hook-example.json` for a Claude Code `/ctx` hook wired to `ctx.py` +- [VS Code Extension](vscode-extension.md) — full extension capabilities and settings diff --git a/docs/IDE_CLIENTS.md b/docs/IDE_CLIENTS.md index fe196c9e..468138f6 100644 --- a/docs/IDE_CLIENTS.md +++ b/docs/IDE_CLIENTS.md @@ -2,7 +2,7 @@ Connect your IDE to a running Context-Engine stack. No need to clone this repo into your project. 
-**Documentation:** [README](../README.md) · [Configuration](CONFIGURATION.md) · [IDE Clients](IDE_CLIENTS.md) · [MCP API](MCP_API.md) · [ctx CLI](CTX_CLI.md) · [Memory Guide](MEMORY_GUIDE.md) · [Architecture](ARCHITECTURE.md) · [Multi-Repo](MULTI_REPO_COLLECTIONS.md) · [Kubernetes](../deploy/kubernetes/README.md) · [VS Code Extension](vscode-extension.md) · [Troubleshooting](TROUBLESHOOTING.md) · [Development](DEVELOPMENT.md) +**Documentation:** [README](../README.md) · [Getting Started](GETTING_STARTED.md) · [Configuration](CONFIGURATION.md) · [IDE Clients](IDE_CLIENTS.md) · [MCP API](MCP_API.md) · [ctx CLI](CTX_CLI.md) · [Memory Guide](MEMORY_GUIDE.md) · [Architecture](ARCHITECTURE.md) · [Multi-Repo](MULTI_REPO_COLLECTIONS.md) · [Kubernetes](../deploy/kubernetes/README.md) · [VS Code Extension](vscode-extension.md) · [Troubleshooting](TROUBLESHOOTING.md) · [Development](DEVELOPMENT.md) --- @@ -30,7 +30,7 @@ Connect your IDE to a running Context-Engine stack. No need to clone this repo i } ``` -**HTTP (recommended for RMCP-capable IDEs)** — prefer this when your IDE supports HTTP MCP / RMCP (Claude Desktop, Windsurf, Qodo, etc.): +**HTTP (recommended for RMCP-capable IDEs)** — prefer this when your IDE supports HTTP MCP / RMCP (Claude Code, Windsurf, Qodo, etc.): ```json { @@ -66,7 +66,9 @@ Replace `localhost` with your server IP/hostname for remote setups. | OpenAI Codex | RMCP | TOML config | | Augment | SSE | Simple JSON configs | | AmpCode | SSE | Simple URL for SSE endpoints | -| Claude Code CLI | SSE | Simple JSON configs | +| Claude Code CLI | SSE / HTTP (RMCP) | Simple JSON configs via .mcp.json | + +**Claude Desktop (Connectors):** Claude Desktop also supports remote MCP servers over SSE and streamable HTTP, but configuration happens via the Claude Connectors UI (Settings → Connectors on claude.ai), not local `.mcp.json`. 
Treat Context-Engine as a normal remote MCP server there; this guide focuses on IDEs where you control MCP URLs/config files directly (Claude Code, Windsurf, etc.). --- diff --git a/docs/MCP_API.md b/docs/MCP_API.md index 38ca1145..ac12ea69 100644 --- a/docs/MCP_API.md +++ b/docs/MCP_API.md @@ -2,7 +2,7 @@ This document provides comprehensive API documentation for all MCP (Model Context Protocol) tools exposed by Context Engine's dual-server architecture. -**Documentation:** [README](../README.md) · [Configuration](CONFIGURATION.md) · [IDE Clients](IDE_CLIENTS.md) · [MCP API](MCP_API.md) · [ctx CLI](CTX_CLI.md) · [Memory Guide](MEMORY_GUIDE.md) · [Architecture](ARCHITECTURE.md) · [Multi-Repo](MULTI_REPO_COLLECTIONS.md) · [Kubernetes](../deploy/kubernetes/README.md) · [VS Code Extension](vscode-extension.md) · [Troubleshooting](TROUBLESHOOTING.md) · [Development](DEVELOPMENT.md) +**Documentation:** [README](../README.md) · [Getting Started](GETTING_STARTED.md) · [Configuration](CONFIGURATION.md) · [IDE Clients](IDE_CLIENTS.md) · [MCP API](MCP_API.md) · [ctx CLI](CTX_CLI.md) · [Memory Guide](MEMORY_GUIDE.md) · [Architecture](ARCHITECTURE.md) · [Multi-Repo](MULTI_REPO_COLLECTIONS.md) · [Kubernetes](../deploy/kubernetes/README.md) · [VS Code Extension](vscode-extension.md) · [Troubleshooting](TROUBLESHOOTING.md) · [Development](DEVELOPMENT.md) --- @@ -41,7 +41,7 @@ For each server, two transports are available: - Indexer: `http://localhost:18003/readyz` - Tools (for debugging): `GET /tools` on the health ports. -**Recommendation for IDEs:** Prefer the HTTP `/mcp` endpoints when integrating with IDE clients (Claude Desktop, Windsurf, etc.). HTTP uses a simple request/response pattern where `initialize` completes before `listTools` and other calls, avoiding initialization races. +**Recommendation for IDEs:** Prefer the HTTP `/mcp` endpoints when integrating with IDE clients (Claude Code, Windsurf, etc.). 
HTTP uses a simple request/response pattern where `initialize` completes before `listTools` and other calls, avoiding initialization races. When using SSE via `mcp-remote`, some clients may send MCP messages (for example `listTools`) in parallel on a fresh session before `initialize` has fully completed. FastMCP enforces that only `initialize` may be processed during initialization; if a non-initialize request arrives too early, the server can log: diff --git a/docs/MEMORY_GUIDE.md b/docs/MEMORY_GUIDE.md index 8f6ee1de..5f65d599 100644 --- a/docs/MEMORY_GUIDE.md +++ b/docs/MEMORY_GUIDE.md @@ -2,7 +2,7 @@ Best practices for using Context Engine's memory system effectively. -**Documentation:** [README](../README.md) · [Configuration](CONFIGURATION.md) · [IDE Clients](IDE_CLIENTS.md) · [MCP API](MCP_API.md) · [ctx CLI](CTX_CLI.md) · [Memory Guide](MEMORY_GUIDE.md) · [Architecture](ARCHITECTURE.md) · [Multi-Repo](MULTI_REPO_COLLECTIONS.md) · [Kubernetes](../deploy/kubernetes/README.md) · [VS Code Extension](vscode-extension.md) · [Troubleshooting](TROUBLESHOOTING.md) · [Development](DEVELOPMENT.md) +**Documentation:** [README](../README.md) · [Getting Started](GETTING_STARTED.md) · [Configuration](CONFIGURATION.md) · [IDE Clients](IDE_CLIENTS.md) · [MCP API](MCP_API.md) · [ctx CLI](CTX_CLI.md) · [Memory Guide](MEMORY_GUIDE.md) · [Architecture](ARCHITECTURE.md) · [Multi-Repo](MULTI_REPO_COLLECTIONS.md) · [Kubernetes](../deploy/kubernetes/README.md) · [VS Code Extension](vscode-extension.md) · [Troubleshooting](TROUBLESHOOTING.md) · [Development](DEVELOPMENT.md) --- diff --git a/docs/MULTI_REPO_COLLECTIONS.md b/docs/MULTI_REPO_COLLECTIONS.md index 991d2cac..9aba4852 100644 --- a/docs/MULTI_REPO_COLLECTIONS.md +++ b/docs/MULTI_REPO_COLLECTIONS.md @@ -1,6 +1,6 @@ # Multi-Repository Collection Architecture -**Documentation:** [README](../README.md) · [Configuration](CONFIGURATION.md) · [IDE Clients](IDE_CLIENTS.md) · [MCP API](MCP_API.md) · [ctx CLI](CTX_CLI.md) · [Memory 
Guide](MEMORY_GUIDE.md) · [Architecture](ARCHITECTURE.md) · [Multi-Repo](MULTI_REPO_COLLECTIONS.md) · [Kubernetes](../deploy/kubernetes/README.md) · [VS Code Extension](vscode-extension.md) · [Troubleshooting](TROUBLESHOOTING.md) · [Development](DEVELOPMENT.md) +**Documentation:** [README](../README.md) · [Getting Started](GETTING_STARTED.md) · [Configuration](CONFIGURATION.md) · [IDE Clients](IDE_CLIENTS.md) · [MCP API](MCP_API.md) · [ctx CLI](CTX_CLI.md) · [Memory Guide](MEMORY_GUIDE.md) · [Architecture](ARCHITECTURE.md) · [Multi-Repo](MULTI_REPO_COLLECTIONS.md) · [Kubernetes](../deploy/kubernetes/README.md) · [VS Code Extension](vscode-extension.md) · [Troubleshooting](TROUBLESHOOTING.md) · [Development](DEVELOPMENT.md) --- diff --git a/docs/TROUBLESHOOTING.md b/docs/TROUBLESHOOTING.md index 34913d72..ad044bb8 100644 --- a/docs/TROUBLESHOOTING.md +++ b/docs/TROUBLESHOOTING.md @@ -2,7 +2,7 @@ Common issues and solutions for Context Engine. -**Documentation:** [README](../README.md) · [Configuration](CONFIGURATION.md) · [IDE Clients](IDE_CLIENTS.md) · [MCP API](MCP_API.md) · [ctx CLI](CTX_CLI.md) · [Memory Guide](MEMORY_GUIDE.md) · [Architecture](ARCHITECTURE.md) · [Multi-Repo](MULTI_REPO_COLLECTIONS.md) · [Kubernetes](../deploy/kubernetes/README.md) · [VS Code Extension](vscode-extension.md) · [Troubleshooting](TROUBLESHOOTING.md) · [Development](DEVELOPMENT.md) +**Documentation:** [README](../README.md) · [Getting Started](GETTING_STARTED.md) · [Configuration](CONFIGURATION.md) · [IDE Clients](IDE_CLIENTS.md) · [MCP API](MCP_API.md) · [ctx CLI](CTX_CLI.md) · [Memory Guide](MEMORY_GUIDE.md) · [Architecture](ARCHITECTURE.md) · [Multi-Repo](MULTI_REPO_COLLECTIONS.md) · [Kubernetes](../deploy/kubernetes/README.md) · [VS Code Extension](vscode-extension.md) · [Troubleshooting](TROUBLESHOOTING.md) · [Development](DEVELOPMENT.md) --- diff --git a/docs/vscode-extension.md b/docs/vscode-extension.md index 605fd85a..396fbd49 100644 --- a/docs/vscode-extension.md +++ 
b/docs/vscode-extension.md @@ -2,7 +2,7 @@ Context Engine Uploader extension for automatic workspace sync and Prompt+ integration. -**Documentation:** [README](../README.md) · [Configuration](CONFIGURATION.md) · [IDE Clients](IDE_CLIENTS.md) · [MCP API](MCP_API.md) · [ctx CLI](CTX_CLI.md) · [Memory Guide](MEMORY_GUIDE.md) · [Architecture](ARCHITECTURE.md) · [Multi-Repo](MULTI_REPO_COLLECTIONS.md) · [Kubernetes](../deploy/kubernetes/README.md) · [VS Code Extension](vscode-extension.md) · [Troubleshooting](TROUBLESHOOTING.md) · [Development](DEVELOPMENT.md) +**Documentation:** [README](../README.md) · [Getting Started](GETTING_STARTED.md) · [Configuration](CONFIGURATION.md) · [IDE Clients](IDE_CLIENTS.md) · [MCP API](MCP_API.md) · [ctx CLI](CTX_CLI.md) · [Memory Guide](MEMORY_GUIDE.md) · [Architecture](ARCHITECTURE.md) · [Multi-Repo](MULTI_REPO_COLLECTIONS.md) · [Kubernetes](../deploy/kubernetes/README.md) · [VS Code Extension](vscode-extension.md) · [Troubleshooting](TROUBLESHOOTING.md) · [Development](DEVELOPMENT.md) --- @@ -19,7 +19,7 @@ Context Engine Uploader extension for automatic workspace sync and Prompt+ integ ## Quick Start 1. **Install**: Build the `.vsix` and install in VS Code (see [Installation](#installation)) -2. **Configure server**: Settings → `contextEngineUploader.endpoint` → `http://localhost:9090` (or remote server) +2. **Configure server**: Settings → `contextEngineUploader.endpoint` → `http://localhost:8004` for the dev-remote Docker stack (or your upload_service URL) 3. **Index workspace**: Click status bar button or run `Context Engine Uploader: Start` 4. 
**Use Prompt+**: Select code, click `Prompt+` in status bar to enhance with AI @@ -30,14 +30,15 @@ Context Engine Uploader extension for automatic workspace sync and Prompt+ integ - **Output channel**: Real-time logs for force-sync and watch operations - **GPU decoder support**: Configure llama.cpp, Ollama, or GLM as decoder backend - **Remote server support**: Index to any Context-Engine server (local, remote, Kubernetes) +- **MCP + ctx scaffolding**: Optionally auto-writes Claude Code/Windsurf MCP configs, an optional Claude prompt hook, and a `ctx_config.json` wired to the right collection and decoder settings. ## Workflow Examples -### Local Development -Context-Engine running on same machine: +### Local Development (dev-remote stack) +Context-Engine running via `docker-compose.dev-remote.yml` on the same machine: ``` -Endpoint: http://localhost:9090 -Target Path: (leave empty - uses current workspace) +Endpoint: http://localhost:8004 +Target Path: (leave empty - uses current workspace or let the extension auto-detect) ``` Open any project → extension auto-syncs → MCP tools have your code context. @@ -76,6 +77,13 @@ Looking at upload_service.py lines 120-180, the upload_file() function currently Reference the existing error patterns in remote_upload_client.py lines 45-67 which use structured logging via logger.error(). ``` +### Claude Code hook (optional) + +For Claude Code, you can also enable a `/ctx` hook so that each prompt is expanded via `ctx.py` before it reaches Claude: + +- The extension can auto-write MCP config and, on Linux/dev-remote, a Claude hook when `claudeHookEnabled` is turned on. +- See `docs/ctx/claude-hook-example.json` for a minimal `UserPromptSubmit` hook that shells out to `ctx-hook-simple.sh`. + ## Installation ### Build Prerequisites @@ -108,7 +116,7 @@ Key Settings After Install -------------------------- - `Context Engine Upload` output channel shows force-sync and watch logs. 
- `Context Engine Uploader: Index Codebase` command or status bar button runs a force sync followed by watch. -- Configure `contextEngineUploader.targetPath`, `endpoint`, and other options under Settings → Extensions → Context Engine Uploader. +- Configure `contextEngineUploader.targetPath`, `endpoint`, and (optionally) MCP settings (`mcpIndexerUrl`, `mcpMemoryUrl`, `mcpTransportMode`, `mcpClaudeEnabled`, `mcpWindsurfEnabled`, `autoWriteMcpConfigOnStartup`) under Settings → Extensions → Context Engine Uploader. ## Prerequisites Python 3.8+ must be available on the host so the bundled client can run. @@ -130,6 +138,14 @@ All settings live under `Context Engine Uploader` in the VS Code settings UI or | `contextEngineUploader.intervalSeconds` | Poll interval for watch mode. Set to `5` to match the previous command file. | | `contextEngineUploader.extraForceArgs` | Optional string array appended to the force invocation. Leave empty for the standard workflow. | | `contextEngineUploader.extraWatchArgs` | Optional string array appended to the watch invocation. | +| `contextEngineUploader.mcpClaudeEnabled` | Enable writing the project-local `.mcp.json` used by Claude Code MCP clients. | +| `contextEngineUploader.mcpWindsurfEnabled` | Enable writing Windsurf’s global MCP config. | +| `contextEngineUploader.autoWriteMcpConfigOnStartup` | Automatically run “Write MCP Config” on activation to keep `.mcp.json`, Windsurf config, and Claude hook in sync with these settings. | +| `contextEngineUploader.mcpTransportMode` | Transport for MCP configs: `sse-remote` (SSE via mcp-remote) or `http` (direct `/mcp` endpoints). | +| `contextEngineUploader.mcpIndexerUrl` | MCP indexer URL used when writing configs. For dev-remote, typical values are `http://localhost:8001/sse` (SSE) or `http://localhost:8003/mcp` (HTTP). | +| `contextEngineUploader.mcpMemoryUrl` | MCP memory URL used when writing configs. 
For dev-remote, typical values are `http://localhost:8000/sse` (SSE) or `http://localhost:8002/mcp` (HTTP). | +| `contextEngineUploader.ctxIndexerUrl` | HTTP MCP indexer endpoint used by `ctx.py` in the Claude Code `/ctx` hook, typically `http://localhost:8003/mcp` for dev-remote. | +| `contextEngineUploader.claudeHookEnabled` | Enable writing a Claude Code `/ctx` hook in `.claude/settings.local.json`. | ## Commands and lifecycle @@ -156,7 +172,7 @@ The extension logs all subprocess output to the **Context Engine Upload** output ### Connection refused ```bash # Verify upload service is running -curl http://localhost:9090/health +curl http://localhost:8004/health # Check Docker logs docker compose logs upload_service From 11fe255bf68ee0c83a8a63420068adf6247bba7b Mon Sep 17 00:00:00 2001 From: Reese Date: Sat, 29 Nov 2025 14:41:31 +0000 Subject: [PATCH 23/25] fix(search/snippets): use container_path for repo_search/context_answer snippets MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit Bug: hybrid_search.run_hybrid_search was updated to emit host paths in the path field (with PATH_EMIT_MODE=auto preferring host_path), while still attaching container_path for the /work/... mirror. mcp_indexer_server.repo_search and context_answer snippet helpers still assumed path was a /work path and refused to read files when path pointed outside /work. Result: repo_search returned empty snippet fields, and context_answer’s identifier-based span filtering lost access to filesystem snippets, even though Qdrant payloads and /work mounts were correct. Fix: In repo_search’s _read_snip, prefer item["container_path"] for filesystem reads, falling back to item["path"] only when container_path is missing. Still enforce the /work sandbox: resolve the candidate path and bail if realpath is not under /work. 
In context_answer’s _read_span_snippet, do the same: prefer span["container_path"] over span["path"] when resolving the file used to build _ident_snippet. Leave path unchanged in both APIs so callers continue to see host‑centric paths, while internal snippet I/O always targets the server’s /work/... tree. Correctness / compatibility: Works for both local Linux and remote Windows uploads: Remote clients send host paths only in host_path; the indexer populates container_path under /work/..., which is what the server now uses for reads. Existing points that predate dual-path metadata still work via the fallback to path when container_path is absent. The /work realpath guard is retained, so snippet reads remain sandboxed to the mounted workspace. Existing tests like test_repo_search_snippet_strict_cap_after_highlight continue to pass, and manual repo_search calls in the dev‑remote stack now return non-empty snippets without reindexing --- scripts/mcp_indexer_server.py | 12 +++++++++--- 1 file changed, 9 insertions(+), 3 deletions(-) diff --git a/scripts/mcp_indexer_server.py b/scripts/mcp_indexer_server.py index ea705a62..8e20a7b0 100644 --- a/scripts/mcp_indexer_server.py +++ b/scripts/mcp_indexer_server.py @@ -2373,7 +2373,11 @@ def _read_snip(args): el = int(item.get("end_line") or 0) if not path or not sl: return (i, "") - raw_path = str(path) + raw_path = ( + str(item.get("container_path")) + if item.get("container_path") + else str(path) + ) p = ( raw_path if os.path.isabs(raw_path) @@ -5718,12 +5722,14 @@ def _read_span_snippet(span: Dict[str, Any]) -> str: return "" try: path = str(span.get("path") or "") + container_path = str(span.get("container_path") or "") sline = int(span.get("start_line") or 0) eline = int(span.get("end_line") or 0) - if not path or sline <= 0: + if not (path or container_path) or sline <= 0: span["_ident_snippet"] = "" return "" - fp = path + raw_path = container_path or path + fp = raw_path if not os.path.isabs(fp): fp = 
os.path.join("/work", fp) realp = os.path.realpath(fp) From dd90ac2198c056b48fb27bee8fed502fbadb7a0c Mon Sep 17 00:00:00 2001 From: Reese Date: Sat, 29 Nov 2025 15:51:29 +0000 Subject: [PATCH 24/25] Fix Windows host_path corruption when HOST_INDEX_PATH includes drive letters MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit - Root cause: when HOST_INDEX_PATH in the indexer container contained a Windows-style path (e.g. "/workc:\Users\Admin\..."), ingest_code combined it with /work//... using os.path.join. This produced malformed host_path values like "/work/c:\Users\.../3-5-/API/Logs.py" that then surfaced in Qdrant metadata and repo_search results. - ingest_code.index_single_file: guard HOST_INDEX_PATH by ignoring any value that contains a colon (Windows drive letter heuristic). When origin.source_path is available we still derive host_path from that; when origin is missing and HOST_INDEX_PATH looks Windows-y we fall back to using the container path instead of constructing /workC:\... strings. - watch_index._rename_in_store: apply the same HOST_INDEX_PATH guard when recomputing host_path/container_path during fast-path renames so we don’t reintroduce malformed host paths for Windows-uploaded repos. - Linux/local behavior is unchanged: normal Unix HOST_INDEX_PATH values do not contain a colon, so the guard is inactive and existing host_path/container_path derivation continues to work as before. 
--- scripts/ingest_code.py | 2 ++ scripts/watch_index.py | 2 ++ 2 files changed, 4 insertions(+) diff --git a/scripts/ingest_code.py b/scripts/ingest_code.py index 716f2cc4..72d2e9d5 100644 --- a/scripts/ingest_code.py +++ b/scripts/ingest_code.py @@ -2116,6 +2116,8 @@ def make_point(pid, dense_vec, lex_vec, payload): # Track both container path (/work mirror) and original host path for clarity across environments _cur_path = str(file_path) _host_root = str(os.environ.get("HOST_INDEX_PATH") or "").strip().rstrip("/") + if ":" in _host_root: # Windows drive letter (e.g., "C:") + _host_root = "" _host_path = None _container_path = None diff --git a/scripts/watch_index.py b/scripts/watch_index.py index 7dfa224a..9a5af634 100644 --- a/scripts/watch_index.py +++ b/scripts/watch_index.py @@ -573,6 +573,8 @@ def _rename_in_store( host_root = ( str(os.environ.get("HOST_INDEX_PATH") or "").strip().rstrip("/") ) + if ":" in host_root: # Windows drive letter (e.g., "C:") + host_root = "" host_path = None container_path = None try: From bd264b2ebc2b1afabc8cfd67a25a2bdef9a9cd09 Mon Sep 17 00:00:00 2001 From: Reese Date: Sat, 29 Nov 2025 22:47:40 +0000 Subject: [PATCH 25/25] perf(git worktrees): feat: add logical repo-based collection reuse and cross-worktree dedup MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit - Introduce a stable logical_repo_id in workspace_state, computed from the git common dir when available (git:, fs: fallback). - Add workspace_state helpers to find or create collections by logical repo (find_collection_for_logical_repo, get_or_create_collection_for_logical_repo), and persist logical_repo_id + qdrant_collection in .codebase state. - Update upload_service to accept logical_repo_id from clients, reuse an existing canonical collection when a mapping exists, and perform a one-time latent migration from legacy fs: IDs when there is a single existing mapping. 
- Extend ingest_code to store logical file identity in Qdrant payloads (metadata.repo_id + metadata.repo_rel_path) and to prefer this logical identity in get_indexed_file_hash, enabling skip-unchanged across git worktrees / slugs instead of only by absolute path. - Update index_single_file and batch index_repo flows to derive (repo_id, repo_rel_path) from workspace_state + /work layout, and pass them into Qdrant lookups and metadata writes. - Enhance watch_index’s _get_collection_for_repo to consult workspace state and reuse the canonical collection for all slugs sharing the same logical_repo_id, aligning local watcher/indexer behavior with the remote upload service. - Gate all logical-repo / collection reuse behavior behind a new LOGICAL_REPO_REUSE feature flag so default behavior remains the legacy per-repo collection and path-based dedup until explicitly enabled. - Add lightweight logging around logical_repo_id state reads/writes to avoid completely silent failures while keeping the new paths best-effort and backwards compatible. - Document LOGICAL_REPO_REUSE in .env.example (commented out by default), with notes on collection reuse across worktrees and logical (repo_id + repo_rel_path) skip-unchanged semantics. --- .env.example | 9 +- scripts/ingest_code.py | 191 +++++++++++++++++++---- scripts/remote_upload_client.py | 46 +++++- scripts/standalone_upload_client.py | 47 +++++- scripts/upload_service.py | 90 +++++++++-- scripts/watch_index.py | 70 ++++++++- scripts/workspace_state.py | 232 +++++++++++++++++++++++++++- 7 files changed, 635 insertions(+), 50 deletions(-) diff --git a/.env.example b/.env.example index 8d7f8f2f..3da5e865 100644 --- a/.env.example +++ b/.env.example @@ -7,6 +7,12 @@ QDRANT_URL=http://localhost:6333 # Multi-repo: Each subdirectory gets its own collection MULTI_REPO_MODE=0 +# Logical repo reuse (experimental): 0=disabled (default), 1=enable logical_repo_id-based +# collection reuse across git worktrees / clones. 
When enabled, indexer, watcher, and +# upload service will try to reuse a canonical collection per logical repository and +# use (repo_id + repo_rel_path) for skip-unchanged across worktrees. +#LOGICAL_REPO_REUSE=0 + # Single unified collection for seamless cross-repo search (default: "codebase") # Leave unset or use "codebase" for unified search across all your code COLLECTION_NAME=codebase @@ -177,4 +183,5 @@ SMART_SYMBOL_REINDEXING=0 # Enable commit lineage goals for indexing REFRAG_COMMIT_DESCRIBE=1 -STRICT_MEMORY_RESTORE=0 \ No newline at end of file +STRICT_MEMORY_RESTORE=0 + diff --git a/scripts/ingest_code.py b/scripts/ingest_code.py index 72d2e9d5..dc769dcd 100644 --- a/scripts/ingest_code.py +++ b/scripts/ingest_code.py @@ -40,11 +40,15 @@ def _detect_repo_name_from_path(path: Path) -> str: from scripts.workspace_state import ( is_multi_repo_mode, get_collection_name, + logical_repo_reuse_enabled, ) except ImportError: is_multi_repo_mode = None # type: ignore get_collection_name = None # type: ignore + def logical_repo_reuse_enabled() -> bool: # type: ignore[no-redef] + return False + # Import watcher's repo detection for surgical fix try: from scripts.watch_index import _detect_repo_for_file, _get_collection_for_file @@ -68,6 +72,7 @@ def _detect_repo_name_from_path(path: Path) -> str: get_cached_pseudo, set_cached_pseudo, update_symbols_with_pseudo, + get_workspace_state, ) except ImportError: # State integration is optional; continue if not available @@ -84,7 +89,7 @@ def _detect_repo_name_from_path(path: Path) -> str: set_cached_pseudo = None # type: ignore update_symbols_with_pseudo = None # type: ignore compare_symbol_changes = None # type: ignore - compare_symbol_changes = None # type: ignore + get_workspace_state = None # type: ignore # Optional Tree-sitter import (graceful fallback) try: @@ -155,7 +160,6 @@ def _use_tree_sitter() -> bool: ".csproj": "xml", ".config": "xml", ".resx": "xml", - } # --- Named vector config --- @@ -307,7 +311,6 @@ def 
_git_metadata(file_path: Path) -> tuple[int, int, int]: "/.vs", "/.cache", "/.codebase", - "/node_modules", "/dist", "/build", @@ -317,7 +320,6 @@ def _git_metadata(file_path: Path) -> tuple[int, int, int]: "bin", "obj", "TestResults", - "/.git", ] _DEFAULT_EXCLUDE_FILES = [ @@ -454,7 +456,6 @@ def chunk_lines(text: str, max_lines: int = 120, overlap: int = 20) -> List[Dict if j == n: break - i = max(j - overlap, i + 1) return chunks @@ -470,7 +471,6 @@ def chunk_semantic( lines = text.splitlines() n = len(lines) - # Extract symbols with line ranges symbols = _extract_symbols(language, text) if not symbols: @@ -530,7 +530,6 @@ def chunk_by_tokens( except Exception: Tokenizer = None # type: ignore - try: k = int(os.environ.get("MICRO_CHUNK_TOKENS", str(k_tokens or 16)) or 16) except Exception: @@ -639,7 +638,12 @@ def char_to_line(c: int) -> int: def _pseudo_describe_enabled() -> bool: try: - return str(os.environ.get("REFRAG_PSEUDO_DESCRIBE", "0")).strip().lower() in {"1","true","yes","on"} + return str(os.environ.get("REFRAG_PSEUDO_DESCRIBE", "0")).strip().lower() in { + "1", + "true", + "yes", + "on", + } except Exception: return False @@ -649,7 +653,12 @@ def _pseudo_describe_enabled() -> bool: def _smart_symbol_reindexing_enabled() -> bool: """Check if symbol-aware reindexing is enabled.""" try: - return str(os.environ.get("SMART_SYMBOL_REINDEXING", "0")).strip().lower() in {"1","true","yes","on"} + return str(os.environ.get("SMART_SYMBOL_REINDEXING", "0")).strip().lower() in { + "1", + "true", + "yes", + "on", + } except Exception: return False @@ -674,21 +683,21 @@ def extract_symbols_with_tree_sitter(file_path: str) -> dict: symbol_id = f"{sym['kind']}_{sym['name']}_{sym['start']}" # Extract actual content for hashing - content_lines = text.split('\n')[sym['start']-1:sym['end']] - content = '\n'.join(content_lines) - content_hash = hashlib.sha1(content.encode('utf-8', errors='ignore')).hexdigest() + content_lines = text.split("\n")[sym["start"] - 1 : 
sym["end"]] + content = "\n".join(content_lines) + content_hash = hashlib.sha1(content.encode("utf-8", errors="ignore")).hexdigest() symbols[symbol_id] = { - 'name': sym['name'], - 'type': sym['kind'], - 'start_line': sym['start'], - 'end_line': sym['end'], - 'content_hash': content_hash, - 'content': content, + "name": sym["name"], + "type": sym["kind"], + "start_line": sym["start"], + "end_line": sym["end"], + "content_hash": content_hash, + "content": content, # These will be populated during processing - 'pseudo': '', - 'tags': [], - 'qdrant_ids': [] # Will store Qdrant point IDs for this symbol + "pseudo": "", + "tags": [], + "qdrant_ids": [], # Will store Qdrant point IDs for this symbol } return symbols @@ -1091,6 +1100,8 @@ def ensure_payload_indexes(client: QdrantClient, collection: str): for field in ( "metadata.language", "metadata.path_prefix", + "metadata.repo_id", + "metadata.repo_rel_path", "metadata.repo", "metadata.kind", "metadata.symbol", @@ -1269,8 +1280,50 @@ def _extract_calls(language: str, text: str) -> list: return out[:200] -def get_indexed_file_hash(client: QdrantClient, collection: str, file_path: str) -> str: - """Return previously indexed file hash for this path, or empty string.""" +def get_indexed_file_hash( + client: QdrantClient, + collection: str, + file_path: str, + *, + repo_id: str | None = None, + repo_rel_path: str | None = None, +) -> str: + """Return previously indexed file hash for this logical path, or empty string. + + Prefers logical identity (repo_id + repo_rel_path) when available so that + worktrees sharing a logical repo can reuse existing index state, but falls + back to metadata.path for backwards compatibility. 
+ """ + # Prefer logical identity when both repo_id and repo_rel_path are provided + if logical_repo_reuse_enabled() and repo_id and repo_rel_path: + try: + filt = models.Filter( + must=[ + models.FieldCondition( + key="metadata.repo_id", match=models.MatchValue(value=repo_id) + ), + models.FieldCondition( + key="metadata.repo_rel_path", + match=models.MatchValue(value=repo_rel_path), + ), + ] + ) + points, _ = client.scroll( + collection_name=collection, + scroll_filter=filt, + with_payload=True, + limit=1, + ) + if points: + md = (points[0].payload or {}).get("metadata") or {} + fh = md.get("file_hash") + if fh: + return str(fh) + except Exception: + # Fall back to path-based lookup below + pass + + # Backwards-compatible path-based lookup try: filt = models.Filter( must=[ @@ -1287,7 +1340,9 @@ def get_indexed_file_hash(client: QdrantClient, collection: str, file_path: str) ) if points: md = (points[0].payload or {}).get("metadata") or {} - return str(md.get("file_hash") or "") + fh = md.get("file_hash") + if fh: + return str(fh) except Exception: return "" return "" @@ -1982,6 +2037,37 @@ def index_single_file( repo_tag = _detect_repo_name_from_path(file_path) + # Derive logical repo identity and repo-relative path for cross-worktree reuse. 
+ repo_id: str | None = None + repo_rel_path: str | None = None + if logical_repo_reuse_enabled() and get_workspace_state is not None: + try: + ws_root = os.environ.get("WATCH_ROOT") or os.environ.get("WORKSPACE_PATH") or "/work" + # Resolve workspace state for this repo to read logical_repo_id + state = get_workspace_state(ws_root, repo_tag) + lrid = state.get("logical_repo_id") if isinstance(state, dict) else None + if isinstance(lrid, str) and lrid: + repo_id = lrid + # Compute repo-relative path within the current workspace tree + try: + fp = file_path.resolve() + except Exception: + fp = file_path + try: + ws_base = Path(os.environ.get("WATCH_ROOT") or os.environ.get("WORKSPACE_PATH") or "/work").resolve() + repo_root = ws_base + if repo_tag: + # In multi-repo scenarios, repos live under /work/ + candidate = ws_base / repo_tag + if candidate.exists(): + repo_root = candidate + rel = fp.relative_to(repo_root) + repo_rel_path = rel.as_posix() + except Exception: + repo_rel_path = None + except Exception as e: + print(f"[logical_repo] Failed to derive logical identity for {file_path}: {e}") + # Get changed symbols for pseudo processing optimization changed_symbols = set() if get_cached_symbols and set_cached_symbols: @@ -2006,7 +2092,13 @@ def index_single_file( return False except Exception: pass - prev = get_indexed_file_hash(client, collection, str(file_path)) + prev = get_indexed_file_hash( + client, + collection, + str(file_path), + repo_id=repo_id, + repo_rel_path=repo_rel_path, + ) if prev and prev == file_hash: print(f"Skipping unchanged file: {file_path}") return False @@ -2185,6 +2277,9 @@ def make_point(pid, dense_vec, lex_vec, payload): "last_modified_at": int(last_mod), "churn_count": int(churn_count), "author_count": int(author_count), + # Logical identity for cross-worktree reuse + "repo_id": repo_id, + "repo_rel_path": repo_rel_path, # New: explicit dual-path tracking "host_path": _host_path, "container_path": _container_path, @@ -2457,6 +2552,35 
@@ def make_point(pid, dense_vec, lex_vec, payload): str(Path(workspace_root).resolve() / per_file_repo), ) + # Derive logical repo identity and repo-relative path for cross-worktree reuse. + repo_id: str | None = None + repo_rel_path: str | None = None + try: + if get_workspace_state is not None: + ws_root = os.environ.get("WATCH_ROOT") or os.environ.get("WORKSPACE_PATH") or "/work" + state = get_workspace_state(ws_root, per_file_repo) + lrid = state.get("logical_repo_id") if isinstance(state, dict) else None + if isinstance(lrid, str) and lrid: + repo_id = lrid + try: + fp_resolved = file_path.resolve() + except Exception: + fp_resolved = file_path + try: + ws_base = Path(workspace_root).resolve() + repo_root = ws_base + if per_file_repo: + candidate = ws_base / per_file_repo + if candidate.exists(): + repo_root = candidate + rel = fp_resolved.relative_to(repo_root) + repo_rel_path = rel.as_posix() + except Exception: + repo_rel_path = None + except Exception: + repo_id = None + repo_rel_path = None + # Skip unchanged files if enabled (default) if skip_unchanged: # Prefer local workspace cache to avoid Qdrant lookups @@ -2494,8 +2618,14 @@ def make_point(pid, dense_vec, lex_vec, payload): except Exception: pass - # Check existing indexed hash in Qdrant - prev = get_indexed_file_hash(client, current_collection, str(file_path)) + # Check existing indexed hash in Qdrant (logical identity when available) + prev = get_indexed_file_hash( + client, + current_collection, + str(file_path), + repo_id=repo_id, + repo_rel_path=repo_rel_path, + ) if prev and file_hash and prev == file_hash: # File exists in Qdrant with same hash - cache it locally for next time try: @@ -2712,6 +2842,9 @@ def make_point(pid, dense_vec, lex_vec, payload): "last_modified_at": int(last_mod), "churn_count": int(churn_count), "author_count": int(author_count), + # Logical identity for cross-worktree reuse + "repo_id": repo_id, + "repo_rel_path": repo_rel_path, # New: dual-path tracking "host_path": 
_host_path, "container_path": _container_path, @@ -2929,6 +3062,10 @@ def process_file_with_smart_reindexing( Symbol cache is used to gate pseudo/tag generation, but embedding reuse is decided at the chunk level by matching previous chunk code. + + TODO(logical_repo): consider loading existing points by logical identity + (repo_id + repo_rel_path) instead of metadata.path so worktrees/branches + sharing a repo can reuse embeddings across slugs, not just per-path. """ try: print(f"[SMART_REINDEX] Processing {file_path} with chunk-level reindexing") diff --git a/scripts/remote_upload_client.py b/scripts/remote_upload_client.py index 127bbbc0..a8e8a5be 100644 --- a/scripts/remote_upload_client.py +++ b/scripts/remote_upload_client.py @@ -69,6 +69,36 @@ def _find_git_root(start: Path) -> Optional[Path]: return None +def _compute_logical_repo_id(workspace_path: str) -> str: + try: + p = Path(workspace_path).resolve() + except Exception: + p = Path(workspace_path) + + try: + r = subprocess.run( + ["git", "-C", str(p), "rev-parse", "--git-common-dir"], + capture_output=True, + text=True, + ) + raw = (r.stdout or "").strip() + if r.returncode == 0 and raw: + common = Path(raw) + if not common.is_absolute(): + base = p if p.is_dir() else p.parent + common = base / common + key = str(common.resolve()) + prefix = "git:" + else: + raise RuntimeError + except Exception: + key = str(p) + prefix = "fs:" + + h = hashlib.sha1(key.encode("utf-8", errors="ignore")).hexdigest()[:16] + return f"{prefix}{h}" + + def _redact_emails(text: str) -> str: """Redact email addresses from commit messages for privacy.""" try: @@ -346,7 +376,8 @@ def _translate_to_container_path(self, host_path: str) -> str: return host_path.replace('\\', '/').replace(':', '') def __init__(self, upload_endpoint: str, workspace_path: str, collection_name: str, - max_retries: int = 3, timeout: int = 30, metadata_path: Optional[str] = None): + max_retries: int = 3, timeout: int = 30, metadata_path: Optional[str] = 
None, + logical_repo_id: Optional[str] = None): """Initialize remote upload client.""" self.upload_endpoint = upload_endpoint.rstrip('/') self.workspace_path = workspace_path @@ -354,6 +385,7 @@ def __init__(self, upload_endpoint: str, workspace_path: str, collection_name: s self.max_retries = max_retries self.timeout = timeout self.temp_dir = None + self.logical_repo_id = logical_repo_id # Set environment variables for cache functions os.environ["WORKSPACE_PATH"] = workspace_path @@ -803,6 +835,8 @@ def upload_bundle(self, bundle_path: str, manifest: Dict[str, Any]) -> Dict[str, 'force': 'false', 'source_path': self.workspace_path, } + if getattr(self, "logical_repo_id", None): + data['logical_repo_id'] = self.logical_repo_id logger.info(f"[remote_upload] Uploading bundle {manifest['bundle_id']} (size: {bundle_size} bytes)") @@ -1327,6 +1361,8 @@ def get_remote_config(cli_path: Optional[str] = None) -> Dict[str, str]: else: workspace_path = os.environ.get("WATCH_ROOT", os.environ.get("WORKSPACE_PATH", "/work")) + logical_repo_id = _compute_logical_repo_id(workspace_path) + # Use auto-generated collection name based on repo name repo_name = _extract_repo_name_from_path(workspace_path) # Fallback to directory name if repo detection fails @@ -1338,6 +1374,7 @@ def get_remote_config(cli_path: Optional[str] = None) -> Dict[str, str]: "upload_endpoint": os.environ.get("REMOTE_UPLOAD_ENDPOINT", "http://localhost:8080"), "workspace_path": workspace_path, "collection_name": collection_name, + "logical_repo_id": logical_repo_id, # Use higher, more robust defaults but still allow env overrides "max_retries": int(os.environ.get("REMOTE_UPLOAD_MAX_RETRIES", "5")), "timeout": int(os.environ.get("REMOTE_UPLOAD_TIMEOUT", "1800")), @@ -1454,6 +1491,7 @@ def main(): collection_name=config["collection_name"], max_retries=config["max_retries"], timeout=config["timeout"], + logical_repo_id=config.get("logical_repo_id"), ) as client: client.log_mapping_summary() return 0 @@ -1467,7 
+1505,8 @@ def main(): workspace_path=config["workspace_path"], collection_name=config["collection_name"], max_retries=config["max_retries"], - timeout=config["timeout"] + timeout=config["timeout"], + logical_repo_id=config.get("logical_repo_id"), ) as client: logger.info("Remote upload client initialized successfully") @@ -1509,7 +1548,8 @@ def main(): workspace_path=config["workspace_path"], collection_name=config["collection_name"], max_retries=config["max_retries"], - timeout=config["timeout"] + timeout=config["timeout"], + logical_repo_id=config.get("logical_repo_id"), ) as client: logger.info("Remote upload client initialized successfully") diff --git a/scripts/standalone_upload_client.py b/scripts/standalone_upload_client.py index 063db07e..3aaf9cd9 100644 --- a/scripts/standalone_upload_client.py +++ b/scripts/standalone_upload_client.py @@ -255,6 +255,36 @@ def _find_git_root(start: Path) -> Optional[Path]: return None +def _compute_logical_repo_id(workspace_path: str) -> str: + try: + p = Path(workspace_path).resolve() + except Exception: + p = Path(workspace_path) + + try: + r = subprocess.run( + ["git", "-C", str(p), "rev-parse", "--git-common-dir"], + capture_output=True, + text=True, + ) + raw = (r.stdout or "").strip() + if r.returncode == 0 and raw: + common = Path(raw) + if not common.is_absolute(): + base = p if p.is_dir() else p.parent + common = base / common + key = str(common.resolve()) + prefix = "git:" + else: + raise RuntimeError + except Exception: + key = str(p) + prefix = "fs:" + + h = hashlib.sha1(key.encode("utf-8", errors="ignore")).hexdigest()[:16] + return f"{prefix}{h}" + + def _redact_emails(text: str) -> str: """Redact email addresses from commit messages for privacy.""" try: @@ -503,7 +533,8 @@ def _translate_to_container_path(self, host_path: str) -> str: return host_path.replace('\\', '/').replace(':', '') def __init__(self, upload_endpoint: str, workspace_path: str, collection_name: str, - max_retries: int = 3, timeout: int = 
30, metadata_path: Optional[str] = None): + max_retries: int = 3, timeout: int = 30, metadata_path: Optional[str] = None, + logical_repo_id: Optional[str] = None): """Initialize remote upload client.""" self.upload_endpoint = upload_endpoint.rstrip('/') self.workspace_path = workspace_path @@ -511,6 +542,7 @@ def __init__(self, upload_endpoint: str, workspace_path: str, collection_name: s self.max_retries = max_retries self.timeout = timeout self.temp_dir = None + self.logical_repo_id = logical_repo_id # Set environment variables for cache functions os.environ["WORKSPACE_PATH"] = workspace_path @@ -956,6 +988,9 @@ def upload_bundle(self, bundle_path: str, manifest: Dict[str, Any]) -> Dict[str, 'source_path': self.workspace_path, } + if getattr(self, "logical_repo_id", None): + data['logical_repo_id'] = self.logical_repo_id + logger.info(f"[remote_upload] Uploading bundle {manifest['bundle_id']} (size: {bundle_size} bytes)") response = self.session.post( @@ -1478,6 +1513,8 @@ def get_remote_config(cli_path: Optional[str] = None) -> Dict[str, str]: else: workspace_path = os.environ.get("WATCH_ROOT", os.environ.get("WORKSPACE_PATH", "/work")) + logical_repo_id = _compute_logical_repo_id(workspace_path) + # Use auto-generated collection name based on repo name repo_name = _extract_repo_name_from_path(workspace_path) # Fallback to directory name if repo detection fails @@ -1489,6 +1526,7 @@ def get_remote_config(cli_path: Optional[str] = None) -> Dict[str, str]: "upload_endpoint": os.environ.get("REMOTE_UPLOAD_ENDPOINT", "http://localhost:8080"), "workspace_path": workspace_path, "collection_name": collection_name, + "logical_repo_id": logical_repo_id, # Use higher, more robust defaults but still allow env overrides "max_retries": int(os.environ.get("REMOTE_UPLOAD_MAX_RETRIES", "5")), "timeout": int(os.environ.get("REMOTE_UPLOAD_TIMEOUT", "1800")), @@ -1599,6 +1637,7 @@ def main(): collection_name=config["collection_name"], max_retries=config["max_retries"], 
timeout=config["timeout"], + logical_repo_id=config.get("logical_repo_id"), ) as client: client.log_mapping_summary() return 0 @@ -1612,7 +1651,8 @@ def main(): workspace_path=config["workspace_path"], collection_name=config["collection_name"], max_retries=config["max_retries"], - timeout=config["timeout"] + timeout=config["timeout"], + logical_repo_id=config.get("logical_repo_id"), ) as client: logger.info("Remote upload client initialized successfully") @@ -1655,7 +1695,8 @@ def main(): workspace_path=config["workspace_path"], collection_name=config["collection_name"], max_retries=config["max_retries"], - timeout=config["timeout"] + timeout=config["timeout"], + logical_repo_id=config.get("logical_repo_id"), ) as client: logger.info("Remote upload client initialized successfully") diff --git a/scripts/upload_service.py b/scripts/upload_service.py index b5095034..ad386faf 100644 --- a/scripts/upload_service.py +++ b/scripts/upload_service.py @@ -33,6 +33,9 @@ _extract_repo_name_from_path, update_repo_origin, get_collection_mappings, + find_collection_for_logical_repo, + update_workspace_state, + logical_repo_reuse_enabled, ) except ImportError: # Fallback for testing without full environment @@ -43,6 +46,11 @@ _extract_repo_name_from_path = None update_repo_origin = None get_collection_mappings = None + find_collection_for_logical_repo = None + update_workspace_state = None + + def logical_repo_reuse_enabled() -> bool: # type: ignore[no-redef] + return False # Configure logging @@ -424,6 +432,7 @@ async def upload_delta_bundle( sequence_number: Optional[int] = Form(None), force: Optional[bool] = Form(False), source_path: Optional[str] = Form(None), + logical_repo_id: Optional[str] = Form(None), ): """Upload and process delta bundle.""" start_time = datetime.now() @@ -443,8 +452,51 @@ async def upload_delta_bundle( if not repo_name: repo_name = Path(workspace_path).name - # Get collection name (respect client-supplied name when provided) - if not collection_name: + 
# Preserve any client-supplied collection name but allow server-side overrides + client_collection_name = collection_name + resolved_collection: Optional[str] = None + + # Resolve collection name, preferring server-side mapping for logical_repo_id when enabled + if logical_repo_reuse_enabled() and logical_repo_id and find_collection_for_logical_repo: + try: + existing = find_collection_for_logical_repo(logical_repo_id, search_root=WORK_DIR) + except Exception: + existing = None + if existing: + resolved_collection = existing + + # Latent migration: when no explicit mapping exists yet for this logical_repo_id, but there is a + # single existing collection mapping, prefer reusing it rather than creating a fresh collection. + if logical_repo_reuse_enabled() and logical_repo_id and resolved_collection is None and get_collection_mappings: + try: + mappings = get_collection_mappings(search_root=WORK_DIR) or [] + except Exception: + mappings = [] + + if len(mappings) == 1: + canonical = mappings[0] + canonical_coll = canonical.get("collection_name") + if canonical_coll: + resolved_collection = canonical_coll + if update_workspace_state: + try: + update_workspace_state( + workspace_path=canonical.get("container_path") or canonical.get("state_file"), + updates={"logical_repo_id": logical_repo_id}, + repo_name=canonical.get("repo_name"), + ) + except Exception as migrate_err: + logger.debug( + f"[upload_service] Failed to migrate logical_repo_id for existing mapping: {migrate_err}" + ) + + # Finalize collection_name: prefer resolved server-side mapping, then client-supplied name, + # then standard get_collection_name/DEFAULT_COLLECTION fallbacks. 
+ if resolved_collection is not None: + collection_name = resolved_collection + elif client_collection_name: + collection_name = client_collection_name + else: if get_collection_name and repo_name: collection_name = get_collection_name(repo_name) else: @@ -453,17 +505,35 @@ async def upload_delta_bundle( # Persist origin metadata for remote lookups (including client source_path) # Use slugged repo name (repo+16) for state so it matches ingest/watch_index usage try: - if update_repo_origin and repo_name: + if repo_name: workspace_key = get_workspace_key(workspace_path) slug_repo_name = f"{repo_name}-{workspace_key}" container_workspace = str(Path(WORK_DIR) / slug_repo_name) - update_repo_origin( - workspace_path=container_workspace, - repo_name=slug_repo_name, - container_path=container_workspace, - source_path=source_path or workspace_path, - collection_name=collection_name, - ) + + # Persist logical_repo_id mapping for this slug/workspace when provided (feature-gated) + if logical_repo_reuse_enabled() and logical_repo_id and update_workspace_state: + try: + update_workspace_state( + workspace_path=container_workspace, + updates={ + "logical_repo_id": logical_repo_id, + "qdrant_collection": collection_name, + }, + repo_name=slug_repo_name, + ) + except Exception as state_err: + logger.debug( + f"[upload_service] Failed to persist logical_repo_id mapping: {state_err}" + ) + + if update_repo_origin: + update_repo_origin( + workspace_path=container_workspace, + repo_name=slug_repo_name, + container_path=container_workspace, + source_path=source_path or workspace_path, + collection_name=collection_name, + ) except Exception as origin_err: logger.debug(f"[upload_service] Failed to persist origin info: {origin_err}") diff --git a/scripts/watch_index.py b/scripts/watch_index.py index 9a5af634..ba373d82 100644 --- a/scripts/watch_index.py +++ b/scripts/watch_index.py @@ -31,6 +31,10 @@ remove_cached_file, update_indexing_status, update_workspace_state, + 
get_workspace_state, + ensure_logical_repo_id, + find_collection_for_logical_repo, + logical_repo_reuse_enabled, ) import hashlib from datetime import datetime @@ -70,13 +74,75 @@ def _detect_repo_for_file(file_path: Path) -> Optional[Path]: def _get_collection_for_repo(repo_path: Path) -> str: + """Resolve Qdrant collection for a repo, with logical_repo_id-aware reuse. + + In multi-repo mode, prefer reusing an existing canonical collection that has + already been associated with this logical repository (same git common dir) + by consulting workspace_state. Falls back to the legacy per-repo hashed + collection naming when no mapping exists. + """ + default_coll = os.environ.get("COLLECTION_NAME", "my-collection") try: repo_name = _extract_repo_name_from_path(str(repo_path)) + except Exception: + repo_name = None + + # Multi-repo: try to reuse a canonical collection based on logical_repo_id + if repo_name and is_multi_repo_mode() and logical_repo_reuse_enabled(): + workspace_root = os.environ.get("WORKSPACE_PATH") or os.environ.get("WATCH_ROOT") or "/work" + try: + ws_root_path = Path(workspace_root).resolve() + except Exception: + ws_root_path = Path(workspace_root) + ws_path = str((ws_root_path / repo_name).resolve()) + + state: Dict[str, Any] + try: + state = get_workspace_state(ws_path, repo_name) or {} + except Exception: + state = {} + + if isinstance(state, dict): + try: + state = ensure_logical_repo_id(state, ws_path) + except Exception: + pass + lrid = state.get("logical_repo_id") + if isinstance(lrid, str) and lrid: + coll: Optional[str] + try: + coll = find_collection_for_logical_repo(lrid, search_root=str(ws_root_path)) + except Exception: + coll = None + if isinstance(coll, str) and coll: + try: + update_workspace_state( + workspace_path=ws_path, + updates={"qdrant_collection": coll, "logical_repo_id": lrid}, + repo_name=repo_name, + ) + except Exception: + pass + return coll + + # Fallback to any explicit collection stored in state for this repo + 
coll2 = state.get("qdrant_collection") + if isinstance(coll2, str) and coll2: + return coll2 + + # Legacy behaviour: derive per-repo collection name + try: + return get_collection_name(repo_name) + except Exception: + return default_coll + + # Single-repo mode or repo_name detection failed: use existing helpers/env + try: if repo_name: return get_collection_name(repo_name) except Exception: pass - return os.environ.get("COLLECTION_NAME", "my-collection") + return default_coll def _get_collection_for_file(file_path: Path) -> str: @@ -266,6 +332,8 @@ def on_deleted(self, event): p = Path(event.src_path).resolve() except Exception: return + if any(part == ".codebase" for part in p.parts): + return # Only attempt deletion for code files we would have indexed if p.suffix.lower() not in idx.CODE_EXTS: return diff --git a/scripts/workspace_state.py b/scripts/workspace_state.py index 70f0a15b..df48059e 100644 --- a/scripts/workspace_state.py +++ b/scripts/workspace_state.py @@ -73,6 +73,7 @@ class WorkspaceState(TypedDict, total=False): last_activity: Optional[LastActivity] qdrant_stats: Optional[Dict[str, Any]] origin: Optional[OriginInfo] + logical_repo_id: Optional[str] def is_multi_repo_mode() -> bool: """Check if multi-repo mode is enabled.""" @@ -80,6 +81,21 @@ def is_multi_repo_mode() -> bool: "1", "true", "yes", "on" } + +def logical_repo_reuse_enabled() -> bool: + """Feature flag for logical-repo / collection reuse. + + Controlled by LOGICAL_REPO_REUSE env var: 1/true/yes/on => enabled. + When disabled, behavior falls back to legacy per-repo collection logic + and does not write logical_repo_id into workspace state. 
+ """ + return os.environ.get("LOGICAL_REPO_REUSE", "").strip().lower() in { + "1", + "true", + "yes", + "on", + } + _state_lock = threading.Lock() # Track last-used timestamps for cleanup of idle workspace locks _state_locks: Dict[str, threading.RLock] = {} @@ -158,6 +174,59 @@ def _sanitize_name(s: str, max_len: int = 64) -> str: return s[:max_len] +def _detect_git_common_dir(start: Path) -> Optional[Path]: + try: + base = start if start.is_dir() else start.parent + r = subprocess.run( + ["git", "-C", str(base), "rev-parse", "--git-common-dir"], + capture_output=True, + text=True, + ) + raw = (r.stdout or "").strip() + if r.returncode != 0 or not raw: + return None + p = Path(raw) + if not p.is_absolute(): + p = base / p + return p.resolve() + except Exception: + return None + + +def compute_logical_repo_id(workspace_path: str) -> str: + try: + p = Path(workspace_path).resolve() + except Exception: + p = Path(workspace_path) + + common = _detect_git_common_dir(p) + if common is not None: + key = str(common) + prefix = "git:" + else: + key = str(p) + prefix = "fs:" + + h = hashlib.sha1(key.encode("utf-8", errors="ignore")).hexdigest()[:16] + return f"{prefix}{h}" + + +def ensure_logical_repo_id(state: WorkspaceState, workspace_path: str) -> WorkspaceState: + if not isinstance(state, dict): + return state + if not logical_repo_reuse_enabled(): + # Gate: when logical repo reuse is disabled, leave state untouched + return state + if state.get("logical_repo_id"): + return state + lrid = compute_logical_repo_id(workspace_path) + state["logical_repo_id"] = lrid + origin = dict(state.get("origin", {}) or {}) + origin.setdefault("logical_repo_id", lrid) + state["origin"] = origin + return state + + # Cross-process file locking (POSIX fcntl), falls back to no-op if unavailable try: import fcntl # type: ignore @@ -310,10 +379,17 @@ def get_workspace_state( try: with open(state_path, "r", encoding="utf-8") as f: state = json.load(f) - if isinstance(state, dict): - return 
state - except (json.JSONDecodeError, ValueError, OSError): - pass + if isinstance(state, dict): + if logical_repo_reuse_enabled(): + workspace_real = str(Path(workspace_path or _resolve_workspace_root()).resolve()) + state = ensure_logical_repo_id(state, workspace_real) + try: + _atomic_write_state(state_path, state) + except Exception as e: + print(f"[workspace_state] Failed to persist logical_repo_id to {state_path}: {e}") + return state + except (json.JSONDecodeError, ValueError, OSError) as e: + print(f"[workspace_state] Failed to read state from {state_path}: {e}") now = datetime.now().isoformat() collection_name = get_collection_name(repo_name) @@ -326,6 +402,12 @@ def get_workspace_state( "indexing_status": {"state": "idle"}, } + if logical_repo_reuse_enabled(): + try: + state = ensure_logical_repo_id(state, state.get("workspace_path", workspace_path or _resolve_workspace_root())) + except Exception as e: + print(f"[workspace_state] Failed to ensure logical_repo_id for {workspace_path}: {e}") + _atomic_write_state(state_path, state) return state @@ -768,7 +850,8 @@ def get_collection_mappings(search_root: Optional[str] = None) -> List[Dict[str, try: with open(state_path, "r", encoding="utf-8") as f: state = json.load(f) or {} - except Exception: + except Exception as e: + print(f"[workspace_state] Failed to read repo state from {state_path}: {e}") continue origin = state.get("origin", {}) or {} @@ -813,6 +896,145 @@ def get_collection_mappings(search_root: Optional[str] = None) -> List[Dict[str, return mappings +def find_collection_for_logical_repo(logical_repo_id: str, search_root: Optional[str] = None) -> Optional[str]: + if not logical_repo_reuse_enabled(): + return None + + root_path = Path(search_root or _resolve_workspace_root()).resolve() + + try: + if is_multi_repo_mode(): + repos_root = root_path / STATE_DIRNAME / "repos" + if repos_root.exists(): + for repo_dir in repos_root.iterdir(): + if not repo_dir.is_dir(): + continue + state_path = repo_dir 
/ STATE_FILENAME + if not state_path.exists(): + continue + try: + with open(state_path, "r", encoding="utf-8") as f: + state = json.load(f) or {} + except Exception: + continue + + ws = state.get("workspace_path") or str(root_path) + state = ensure_logical_repo_id(state, ws) + if state.get("logical_repo_id") == logical_repo_id: + coll = state.get("qdrant_collection") + if coll: + try: + _atomic_write_state(state_path, state) + except Exception as e: + print(f"[workspace_state] Failed to persist logical_repo_id mapping to {state_path}: {e}") + return coll + + state_path = root_path / STATE_DIRNAME / STATE_FILENAME + if state_path.exists(): + try: + with open(state_path, "r", encoding="utf-8") as f: + state = json.load(f) or {} + except Exception as e: + print(f"[workspace_state] Failed to read workspace state from {state_path}: {e}") + state = {} + + ws = state.get("workspace_path") or str(root_path) + state = ensure_logical_repo_id(state, ws) + if state.get("logical_repo_id") == logical_repo_id: + coll = state.get("qdrant_collection") + if coll: + try: + _atomic_write_state(state_path, state) + except Exception as e: + print(f"[workspace_state] Failed to persist logical_repo_id mapping to {state_path}: {e}") + return coll + except Exception as e: + print(f"[workspace_state] Error while searching collections for logical_repo_id={logical_repo_id}: {e}") + return None + + return None + + +def get_or_create_collection_for_logical_repo( + workspace_path: str, + preferred_repo_name: Optional[str] = None, +) -> str: + # Gate entire logical-repo based resolution behind feature flag + if not logical_repo_reuse_enabled(): + base_repo = preferred_repo_name + try: + coll = get_collection_name(base_repo) + except Exception: + coll = get_collection_name(None) + try: + update_workspace_state( + workspace_path=workspace_path, + updates={"qdrant_collection": coll}, + repo_name=preferred_repo_name, + ) + except Exception as e: + print(f"[workspace_state] Failed to persist legacy 
qdrant_collection for {workspace_path}: {e}") + return coll + try: + ws = Path(workspace_path).resolve() + except Exception: + ws = Path(workspace_path) + + common = _detect_git_common_dir(ws) + if common is not None: + canonical_root = common.parent + else: + canonical_root = ws + + ws_path = str(canonical_root) + + try: + state = get_workspace_state(workspace_path=ws_path, repo_name=preferred_repo_name) + except Exception: + state = {} + + if not isinstance(state, dict): + state = {} + + try: + state = ensure_logical_repo_id(state, ws_path) + except Exception: + pass + + lrid = state.get("logical_repo_id") + if isinstance(lrid, str) and lrid: + coll = find_collection_for_logical_repo(lrid, search_root=ws_path) + if isinstance(coll, str) and coll: + if state.get("qdrant_collection") != coll: + try: + update_workspace_state( + workspace_path=ws_path, + updates={"qdrant_collection": coll, "logical_repo_id": lrid}, + repo_name=preferred_repo_name, + ) + except Exception: + pass + return coll + + coll = state.get("qdrant_collection") + if not isinstance(coll, str) or not coll: + base_repo = preferred_repo_name + try: + coll = get_collection_name(base_repo) + except Exception: + coll = get_collection_name(None) + try: + update_workspace_state( + workspace_path=ws_path, + updates={"qdrant_collection": coll}, + repo_name=preferred_repo_name, + ) + except Exception: + pass + + return coll + + # ===== Symbol-Level Cache for Smart Reindexing ===== def _get_symbol_cache_path(file_path: str) -> Path: