From 1059e402cde684327515c02b478163c92e29c76f Mon Sep 17 00:00:00 2001
From: John Donalson <mirlok@dr.com>
Date: Sat, 24 Jan 2026 23:51:31 -0500
Subject: [PATCH 01/10] Improve Qdrant error handling and add timeout/fallback
 logic

Adds robust timeout detection and logging for Qdrant queries, introduces concurrency control with a semaphore, and ensures partial results are returned on timeouts. The Qdrant client manager now supports configurable HTTP timeouts via environment variables. The search implementation clamps rerank timeouts and adds fallback to learning reranker results if subprocess reranking fails. Remote upload client now consistently uses container paths for cache lookups and updates, improving cache consistency. Test cases are updated to use asyncio.run for coroutine execution.
---
 scripts/hybrid/qdrant.py         | 280 ++++++++++++++++++++++---------
 scripts/mcp_impl/search.py       |  12 ++
 scripts/qdrant_client_manager.py |  39 ++++-
 scripts/remote_upload_client.py  |  21 ++-
 tests/test_context_answer.py     |  13 +-
 tests/test_error_paths.py        |   4 +-
 6 files changed, 267 insertions(+), 102 deletions(-)

diff --git a/scripts/hybrid/qdrant.py b/scripts/hybrid/qdrant.py
index 039ca6f1..498366e9 100644
--- a/scripts/hybrid/qdrant.py
+++ b/scripts/hybrid/qdrant.py
@@ -17,19 +17,73 @@
 import logging
 import threading
 import re
-from typing import List, Dict, Any, Tuple
+import time
+from typing import List, Dict, Any, Tuple, Optional, Callable, TypeVar
 from pathlib import Path
 from concurrent.futures import ThreadPoolExecutor
 
-# Core Qdrant imports
+# Core Qdrant imports (optional in some runtimes)
 try:
     from qdrant_client import QdrantClient, models
-except ImportError:
+except ImportError:  # pragma: no cover
     QdrantClient = None  # type: ignore
     models = None  # type: ignore
 
+try:
+    from qdrant_client.http.exceptions import ResponseHandlingException
+except ImportError:  # pragma: no cover
+    ResponseHandlingException = None  # type: ignore
+
+try:  # pragma: no cover - optional dependency
+    import httpx
+except ImportError:
+    httpx = None  # type: ignore
+
+try:  # pragma: no cover - optional dependency
+    import httpcore
+except ImportError:
+    httpcore = None  # type: ignore
+
 logger = logging.getLogger("hybrid_qdrant")
 
+
+def _is_timeout_exception(exc: Exception) -> bool:
+    """Detect whether an exception is a Qdrant/http timeout."""
+
+    if ResponseHandlingException and isinstance(exc, ResponseHandlingException):
+        cause = exc.__cause__ or exc.__context__
+        if cause is not None and cause is not exc:
+            return _is_timeout_exception(cause)
+        return "timeout" in str(exc).lower()
+
+    timeout_types = []
+    if httpx is not None:
+        timeout_types.append(getattr(httpx, "TimeoutException", None))
+        timeout_types.append(getattr(httpx, "ReadTimeout", None))
+    if httpcore is not None:
+        timeout_types.append(getattr(httpcore, "TimeoutException", None))
+        timeout_types.append(getattr(httpcore, "ReadTimeout", None))
+
+    for t in timeout_types:
+        if t and isinstance(exc, t):
+            return True
+
+    return isinstance(exc, TimeoutError)
+
+
+def _log_qdrant_timeout(kind: str, collection: Optional[str], detail: Exception) -> None:
+    coll = collection or "(unknown)"
+    logger.warning(
+        "Qdrant %s query timed out for collection %s; returning partial results", kind, coll
+    )
+
+
+def _handle_timeout(kind: str, collection: Optional[str], exc: Exception) -> bool:
+    if _is_timeout_exception(exc):
+        _log_qdrant_timeout(kind, collection, exc)
+        return True
+    return False
+
 # ---------------------------------------------------------------------------
 # Helper functions for safe type conversion
 # ---------------------------------------------------------------------------
@@ -75,6 +129,10 @@ def _safe_float(val: Any, default: float) -> float:
 )
 
 EF_SEARCH = _safe_int(os.environ.get("QDRANT_EF_SEARCH", "128"), 128)
+_MAX_QDRANT_CONCURRENCY = max(1, _safe_int(os.environ.get("QDRANT_MAX_CONCURRENCY", "6"), 6))
+_SEMAPHORE_LOG_THRESHOLD = float(os.environ.get("QDRANT_SEMAPHORE_LOG_THRESHOLD", "0.5") or 0.5)
+_QDRANT_REQUEST_SEMAPHORE = threading.BoundedSemaphore(_MAX_QDRANT_CONCURRENCY)
+T = TypeVar("T")
 
 # Quantization search params (for faster search with quantized collections)
 QDRANT_QUANTIZATION = os.environ.get("QDRANT_QUANTIZATION", "none").strip().lower()
@@ -95,6 +153,24 @@ def _get_search_params(ef: int) -> models.SearchParams:
     return models.SearchParams(hnsw_ef=ef)
 
 
+def _with_qdrant_slot(kind: str, fn: Callable[[], T]) -> T:
+    """Serialize Qdrant calls to avoid overload while preserving concurrency."""
+    wait_start = time.perf_counter()
+    _QDRANT_REQUEST_SEMAPHORE.acquire()
+    waited = time.perf_counter() - wait_start
+    if waited >= _SEMAPHORE_LOG_THRESHOLD:
+        logger.debug(
+            "Qdrant %s query waited %.3fs for slot (max=%s)",
+            kind,
+            waited,
+            _MAX_QDRANT_CONCURRENCY,
+        )
+    try:
+        return fn()
+    finally:
+        _QDRANT_REQUEST_SEMAPHORE.release()
+
+
 # ---------------------------------------------------------------------------
 # Connection pooling setup
 # ---------------------------------------------------------------------------
@@ -191,7 +267,9 @@ def _legacy_vector_search(
             query_filter=flt,
         )
         return _coerce_points(getattr(result, "points", result))
-    except Exception:
+    except Exception as exc:
+        if _handle_timeout("legacy", collection, exc):
+            return []
         return []
 
 
@@ -469,57 +547,71 @@ def lex_query(
         return []
 
     try:
-        qp = client.query_points(
-            collection_name=collection,
-            query=v,
-            using=LEX_VECTOR_NAME,
-            query_filter=flt,
-            search_params=_get_search_params(ef),
-            limit=per_query,
-            with_payload=True,
+        qp = _with_qdrant_slot(
+            "lex",
+            lambda: client.query_points(
+                collection_name=collection,
+                query=v,
+                using=LEX_VECTOR_NAME,
+                query_filter=flt,
+                search_params=_get_search_params(ef),
+                limit=per_query,
+                with_payload=True,
+            ),
         )
         return _coerce_points(getattr(qp, "points", qp))
     except TypeError:
         if os.environ.get("DEBUG_HYBRID_SEARCH"):
             logger.debug("QP_FILTER_KWARG_SWITCH", extra={"using": LEX_VECTOR_NAME})
-        qp = client.query_points(
-            collection_name=collection,
-            query=v,
-            using=LEX_VECTOR_NAME,
-            filter=flt,
-            search_params=_get_search_params(ef),
-            limit=per_query,
-            with_payload=True,
+        qp = _with_qdrant_slot(
+            "lex",
+            lambda: client.query_points(
+                collection_name=collection,
+                query=v,
+                using=LEX_VECTOR_NAME,
+                filter=flt,
+                search_params=_get_search_params(ef),
+                limit=per_query,
+                with_payload=True,
+            ),
         )
         return _coerce_points(getattr(qp, "points", qp))
     except AttributeError:
         return _legacy_vector_search(client, collection, LEX_VECTOR_NAME, v, per_query, flt)
     except Exception as e:
+        if _handle_timeout("lex", collection, e):
+            return []
         if os.environ.get("DEBUG_HYBRID_SEARCH"):
             try:
                 logger.debug("QP_FILTER_DROP", extra={"using": LEX_VECTOR_NAME, "reason": str(e)[:200]})
             except Exception as e:
                 logger.debug(f"Suppressed exception: {e}")
         try:
-            qp = client.query_points(
-                collection_name=collection,
-                query=v,
-                using=LEX_VECTOR_NAME,
-                query_filter=None,
-                search_params=_get_search_params(ef),
-                limit=per_query,
-                with_payload=True,
+            qp = _with_qdrant_slot(
+                "lex",
+                lambda: client.query_points(
+                    collection_name=collection,
+                    query=v,
+                    using=LEX_VECTOR_NAME,
+                    query_filter=None,
+                    search_params=_get_search_params(ef),
+                    limit=per_query,
+                    with_payload=True,
+                ),
             )
             return _coerce_points(getattr(qp, "points", qp))
         except TypeError:
-            qp = client.query_points(
-                collection_name=collection,
-                query=v,
-                using=LEX_VECTOR_NAME,
-                filter=None,
-                search_params=_get_search_params(ef),
-                limit=per_query,
-                with_payload=True,
+            qp = _with_qdrant_slot(
+                "lex",
+                lambda: client.query_points(
+                    collection_name=collection,
+                    query=v,
+                    using=LEX_VECTOR_NAME,
+                    filter=None,
+                    search_params=_get_search_params(ef),
+                    limit=per_query,
+                    with_payload=True,
+                ),
             )
             return _coerce_points(getattr(qp, "points", qp))
         except Exception as e2:
@@ -553,35 +645,43 @@ def sparse_lex_query(
         return []
 
     try:
-        qp = client.query_points(
-            collection_name=collection,
-            query=models.SparseVector(
-                indices=sparse_vec["indices"],
-                values=sparse_vec["values"],
-            ),
-            using=LEX_SPARSE_NAME,
-            query_filter=flt,
-            limit=per_query,
-            with_payload=True,
-        )
-        return _coerce_points(getattr(qp, "points", qp))
-    except TypeError:
-        try:
-            qp = client.query_points(
+        qp = _with_qdrant_slot(
+            "sparse",
+            lambda: client.query_points(
                 collection_name=collection,
                 query=models.SparseVector(
                     indices=sparse_vec["indices"],
                     values=sparse_vec["values"],
                 ),
                 using=LEX_SPARSE_NAME,
-                filter=flt,
+                query_filter=flt,
                 limit=per_query,
                 with_payload=True,
+            ),
+        )
+        return _coerce_points(getattr(qp, "points", qp))
+    except TypeError:
+        try:
+            qp = _with_qdrant_slot(
+                "sparse",
+                lambda: client.query_points(
+                    collection_name=collection,
+                    query=models.SparseVector(
+                        indices=sparse_vec["indices"],
+                        values=sparse_vec["values"],
+                    ),
+                    using=LEX_SPARSE_NAME,
+                    filter=flt,
+                    limit=per_query,
+                    with_payload=True,
+                ),
             )
             return _coerce_points(getattr(qp, "points", qp))
         except Exception:
             return []
     except Exception as e:
+        if _handle_timeout("sparse", collection, e):
+            return []
         if os.environ.get("DEBUG_HYBRID_SEARCH"):
             logger.debug("SPARSE_LEX_QUERY_ERROR", extra={"error": str(e)[:200]})
         return []
@@ -624,30 +724,38 @@ def dense_query(
         return []
 
     try:
-        qp = client.query_points(
-            collection_name=collection,
-            query=v,
-            using=vec_name,
-            query_filter=flt,
-            search_params=_get_search_params(ef),
-            limit=per_query,
-            with_payload=True,
+        qp = _with_qdrant_slot(
+            "dense",
+            lambda: client.query_points(
+                collection_name=collection,
+                query=v,
+                using=vec_name,
+                query_filter=flt,
+                search_params=_get_search_params(ef),
+                limit=per_query,
+                with_payload=True,
+            ),
         )
         return _coerce_points(getattr(qp, "points", qp))
     except TypeError:
         if os.environ.get("DEBUG_HYBRID_SEARCH"):
             logger.debug("QP_FILTER_KWARG_SWITCH", extra={"using": vec_name})
-        qp = client.query_points(
-            collection_name=collection,
-            query=v,
-            using=vec_name,
-            filter=flt,
-            search_params=_get_search_params(ef),
-            limit=per_query,
-            with_payload=True,
+        qp = _with_qdrant_slot(
+            "dense",
+            lambda: client.query_points(
+                collection_name=collection,
+                query=v,
+                using=vec_name,
+                filter=flt,
+                search_params=_get_search_params(ef),
+                limit=per_query,
+                with_payload=True,
+            ),
         )
         return _coerce_points(getattr(qp, "points", qp))
     except Exception as e:
+        if _handle_timeout("dense", collection, e):
+            return []
         if os.environ.get("DEBUG_HYBRID_SEARCH"):
             try:
                 logger.debug("QP_FILTER_DROP", extra={"using": vec_name, "reason": str(e)[:200]})
@@ -656,29 +764,37 @@ def dense_query(
         if not collection:
             return _legacy_vector_search(client, _collection(), vec_name, v, per_query, flt)
         try:
-            qp = client.query_points(
-                collection_name=collection,
-                query=v,
-                using=vec_name,
-                query_filter=None,
-                search_params=_get_search_params(ef),
-                limit=per_query,
-                with_payload=True,
-            )
-            return _coerce_points(getattr(qp, "points", qp))
-        except TypeError:
-            try:
-                qp = client.query_points(
+            qp = _with_qdrant_slot(
+                "dense",
+                lambda: client.query_points(
                     collection_name=collection,
                     query=v,
                     using=vec_name,
-                    filter=None,
+                    query_filter=None,
                     search_params=_get_search_params(ef),
                     limit=per_query,
                     with_payload=True,
+                ),
+            )
+            return _coerce_points(getattr(qp, "points", qp))
+        except TypeError:
+            try:
+                qp = _with_qdrant_slot(
+                    "dense",
+                    lambda: client.query_points(
+                        collection_name=collection,
+                        query=v,
+                        using=vec_name,
+                        filter=None,
+                        search_params=_get_search_params(ef),
+                        limit=per_query,
+                        with_payload=True,
+                    ),
                 )
                 return _coerce_points(getattr(qp, "points", qp))
             except Exception as e2:
+                if _handle_timeout("dense", collection, e2):
+                    return []
                 if os.environ.get("DEBUG_HYBRID_SEARCH"):
                     try:
                         logger.debug("QP_FILTER_DROP_FAILED", extra={"using": vec_name, "reason": str(e2)[:200]})
diff --git a/scripts/mcp_impl/search.py b/scripts/mcp_impl/search.py
index bf49664c..43df34e4 100644
--- a/scripts/mcp_impl/search.py
+++ b/scripts/mcp_impl/search.py
@@ -323,6 +323,9 @@ def _to_str(x, default=""):
     rerank_timeout_ms = _to_int(
         rerank_timeout_ms, int(os.environ.get("RERANKER_TIMEOUT_MS", "3000") or 3000)
     )
+    # Clamp rerank timeout to prevent unreasonably low deadlines
+    _MIN_RERANK_TIMEOUT_MS = int(os.environ.get("RERANK_TIMEOUT_MIN_MS", "10000") or 10000)
+    rerank_timeout_ms = max(rerank_timeout_ms, _MIN_RERANK_TIMEOUT_MS)
     highlight_snippet = _to_bool(highlight_snippet, True)
 
     # Resolve collection and related hints: explicit > per-connection defaults > token defaults > env
@@ -795,6 +798,7 @@ def _match_glob(glob_pat: str, path_val: str) -> bool:
 
     # Optional rerank fallback path: if enabled, attempt; on timeout or error, keep hybrid
     used_rerank = False
+    learning_results = None  # May hold learning reranker output for fallback
     rerank_counters = {
         "inproc_hybrid": 0,
         "inproc_dense": 0,
@@ -870,6 +874,8 @@ def _match_glob(glob_pat: str, path_val: str) -> bool:
                         tmp.append(item)
 
                     if tmp:
+                        # Store learning results separately; may be used as fallback
+                        learning_results = tmp
                         results = tmp
                         used_rerank = True
                         rerank_counters["learning"] += 1
@@ -1191,6 +1197,12 @@ def _doc_for(obj: dict) -> str:
                     rerank_counters["error"] += 1
                     used_rerank = False
 
+    # Fallback to learning reranker results if subprocess failed but learning succeeded
+    if (not used_rerank) and learning_results:
+        results = learning_results
+        used_rerank = True
+        logger.debug("Falling back to learning reranker results after subprocess failure")
+
     if not used_rerank:
         # Build results from hybrid JSON lines
         for obj in json_lines:
diff --git a/scripts/qdrant_client_manager.py b/scripts/qdrant_client_manager.py
index 78cc716b..3e901cef 100644
--- a/scripts/qdrant_client_manager.py
+++ b/scripts/qdrant_client_manager.py
@@ -9,7 +9,7 @@
 import threading
 import time
 import weakref
-from typing import Optional, Dict, List
+from typing import Optional, Dict, List, Any
 from contextlib import contextmanager
 from qdrant_client import QdrantClient
 
@@ -17,6 +17,33 @@
 # Connection pool implementation
 
 logger = logging.getLogger(__name__)
+
+
+def _get_qdrant_timeout() -> Optional[float]:
+    """Return the configured Qdrant HTTP timeout (seconds) if set."""
+    raw = os.environ.get("QDRANT_TIMEOUT") or os.environ.get("QDRANT_CLIENT_TIMEOUT")
+    if not raw:
+        return None
+    try:
+        timeout = float(raw)
+        return timeout if timeout > 0 else None
+    except (TypeError, ValueError):
+        logger.debug("Invalid Qdrant timeout value '%s'; ignoring", raw)
+        return None
+
+
+def _client_kwargs(url: str, api_key: Optional[str]) -> Dict[str, Any]:
+    """Build kwargs dict for QdrantClient constructor with optional timeout."""
+    kwargs: Dict[str, Any] = {
+        "url": url,
+        "api_key": api_key if api_key else None,
+    }
+    timeout = _get_qdrant_timeout()
+    if timeout is not None:
+        kwargs["timeout"] = timeout
+    return kwargs
+
+
 class QdrantConnectionPool:
     """Thread-safe connection pool for QdrantClient instances."""
     
@@ -49,7 +76,7 @@ def get_client(self, url: str, api_key: Optional[str] = None) -> QdrantClient:
             
             # No suitable client found, create a new one
             if self._created_count < self.max_size:
-                client = QdrantClient(url=url, api_key=api_key)
+                client = QdrantClient(**_client_kwargs(url, api_key))
                 pool_entry = {
                     'client': client,
                     'url': url,
@@ -66,7 +93,7 @@ def get_client(self, url: str, api_key: Optional[str] = None) -> QdrantClient:
                 # Pool is full, create a temporary client (not pooled)
                 # Mark it for tracking so return_client can close it
                 self._misses += 1
-                temp_client = QdrantClient(url=url, api_key=api_key)
+                temp_client = QdrantClient(**_client_kwargs(url, api_key))
                 # Track temporary clients with weakref so they auto-close
                 self._temp_clients.add(temp_client)
                 return temp_client
@@ -197,13 +224,13 @@ def get_qdrant_client(
     
     # Fallback to singleton pattern for backward compatibility
     if force_new:
-        return QdrantClient(url=url, api_key=api_key if api_key else None)
-    
+        return QdrantClient(**_client_kwargs(url, api_key))
+
     global _client
     
     with _client_lock:
         if _client is None:
-            _client = QdrantClient(url=url, api_key=api_key if api_key else None)
+            _client = QdrantClient(**_client_kwargs(url, api_key))
         return _client
 
 
diff --git a/scripts/remote_upload_client.py b/scripts/remote_upload_client.py
index a1a22279..0a065b22 100644
--- a/scripts/remote_upload_client.py
+++ b/scripts/remote_upload_client.py
@@ -580,7 +580,10 @@ def detect_file_changes(self, changed_paths: List[Path]) -> Dict[str, List]:
                 # Skip paths that cannot be resolved
                 continue
 
-            cached_hash = get_cached_file_hash(abs_path, self.repo_name)
+            # Translate to container path for cache lookup (cache stores container paths)
+            # This handles the case where bridge runs locally but cache was created in container
+            cache_key = self._translate_to_container_path(abs_path)
+            cached_hash = get_cached_file_hash(cache_key, self.repo_name)
 
             if not path.exists():
                 # File was deleted
@@ -631,12 +634,12 @@ def detect_file_changes(self, changed_paths: List[Path]) -> Dict[str, List]:
                 # Unchanged (content same despite stat change)
                 changes["unchanged"].append(path)
 
-            # Update caches
+            # Update caches (use container path for cache consistency)
             try:
                 self._stat_cache[abs_path] = (getattr(stat, "st_mtime_ns", int(stat.st_mtime * 1e9)), stat.st_size)
             except Exception as e:
                 logger.debug(f"Suppressed exception: {e}")
-            set_cached_file_hash(abs_path, current_hash, self.repo_name)
+            set_cached_file_hash(cache_key, current_hash, self.repo_name)
 
         # Detect moves by looking for files with same content hash
         # but different paths (requires additional tracking)
@@ -662,7 +665,9 @@ def _detect_moves(self, created_files: List[Path], deleted_files: List[Path]) ->
         for deleted_path in deleted_files:
             try:
                 # Try to get cached hash first, fallback to file content
-                cached_hash = get_cached_file_hash(str(deleted_path), self.repo_name)
+                # Use container path for cache lookup (cache stores container paths)
+                cache_key = self._translate_to_container_path(str(deleted_path))
+                cached_hash = get_cached_file_hash(cache_key, self.repo_name)
                 if cached_hash:
                     deleted_hashes[cached_hash] = deleted_path
                     continue
@@ -777,7 +782,9 @@ def create_delta_bundle(
                         content = f.read()
                     file_hash = hashlib.sha1(content).hexdigest()
                     content_hash = f"sha1:{file_hash}"
-                    previous_hash = get_cached_file_hash(str(path.resolve()), self.repo_name)
+                    # Use container path for cache lookup (cache stores container paths)
+                    cache_key = self._translate_to_container_path(str(path.resolve()))
+                    previous_hash = get_cached_file_hash(cache_key, self.repo_name)
 
                     # Write file to bundle
                     bundle_file_path = files_dir / "updated" / rel_path
@@ -853,7 +860,9 @@ def create_delta_bundle(
             for path in changes["deleted"]:
                 rel_path = path.relative_to(Path(self.workspace_path)).as_posix()
                 try:
-                    previous_hash = get_cached_file_hash(str(path.resolve()), self.repo_name)
+                    # Use container path for cache lookup (cache stores container paths)
+                    cache_key = self._translate_to_container_path(str(path.resolve()))
+                    previous_hash = get_cached_file_hash(cache_key, self.repo_name)
 
                     operation = {
                         "operation": "deleted",
diff --git a/tests/test_context_answer.py b/tests/test_context_answer.py
index 9aa74c4b..022c5d85 100644
--- a/tests/test_context_answer.py
+++ b/tests/test_context_answer.py
@@ -1,3 +1,4 @@
+import asyncio
 import importlib
 import types
 import pytest
@@ -51,7 +52,7 @@ def generate_with_soft_embeddings(self, prompt: str, max_tokens: int = 256, **kw
     monkeypatch.setattr(ref, "LlamaCppRefragClient", FakeLlama)
     monkeypatch.setattr(ref, "is_decoder_enabled", lambda: True)
 
-    out = srv.asyncio.get_event_loop().run_until_complete(
+    out = asyncio.run(
         srv.context_answer(query="how to do x", limit=2, per_path=1)
     )
 
@@ -82,7 +83,7 @@ def generate_with_soft_embeddings(self, *a, **k):
     monkeypatch.setattr(ref, "LlamaCppRefragClient", FakeLlama)
     monkeypatch.setattr(ref, "is_decoder_enabled", lambda: False)
 
-    out = srv.asyncio.get_event_loop().run_until_complete(
+    out = asyncio.run(
         srv.context_answer(query="how to do y", limit=1)
     )
 
@@ -130,7 +131,7 @@ def generate_with_soft_embeddings(self, prompt: str, max_tokens: int = 256, **kw
     monkeypatch.setattr(ref, "LlamaCppRefragClient", FakeLlama)
     monkeypatch.setattr(ref, "is_decoder_enabled", lambda: True)
 
-    out = srv.asyncio.get_event_loop().run_until_complete(
+    out = asyncio.run(
         srv.context_answer(query="what is RRF_K in hybrid_search.py?", limit=1, per_path=1)
     )
 
@@ -179,7 +180,7 @@ def generate_with_soft_embeddings(self, *a, **kw):
     monkeypatch.setattr(ref, "LlamaCppRefragClient", FakeLlama)
     monkeypatch.setattr(ref, "is_decoder_enabled", lambda: True)
 
-    out = srv.asyncio.get_event_loop().run_until_complete(
+    out = asyncio.run(
         srv.context_answer(query="RRF_K", limit=1, per_path=1)
     )
 
@@ -214,7 +215,7 @@ def _raise_retrieval(*a, **k):
 
     monkeypatch.setattr(srv, "_ca_prepare_filters_and_retrieve", _raise_retrieval)
 
-    out = srv.asyncio.get_event_loop().run_until_complete(
+    out = asyncio.run(
         srv.context_answer(query="x", limit=1, per_path=1)
     )
     assert "error" in out
@@ -245,7 +246,7 @@ def _fake_retrieval(*a, **k):
     import scripts.refrag_llamacpp as ref
     monkeypatch.setattr(ref, "is_decoder_enabled", lambda: False)
 
-    out2 = srv.asyncio.get_event_loop().run_until_complete(
+    out2 = asyncio.run(
         srv.context_answer(query="x", limit=1, per_path=1)
     )
     assert isinstance(out2, dict)
diff --git a/tests/test_error_paths.py b/tests/test_error_paths.py
index 0beb9d4b..ad4d04e4 100644
--- a/tests/test_error_paths.py
+++ b/tests/test_error_paths.py
@@ -17,7 +17,7 @@ async def fake_run(cmd, **kwargs):
 
     monkeypatch.setattr(srv, "_run_async", fake_run)
 
-    res = srv.asyncio.get_event_loop().run_until_complete(
+    res = asyncio.run(
         srv.repo_search(queries=["x"], limit=1, compact=False, lean=False)
     )
 
@@ -49,7 +49,7 @@ async def fake_run(cmd, **kwargs):
 
     monkeypatch.setattr(srv, "_run_async", fake_run)
 
-    res = srv.asyncio.get_event_loop().run_until_complete(
+    res = asyncio.run(
         srv.repo_search(queries=["x"], limit=1, compact=True, lean=False)
     )
 

From 104ff7a60c677cad188c7e458e13a8ca6d6dfcbe Mon Sep 17 00:00:00 2001
From: John Donalson <mirlok@dr.com>
Date: Sun, 25 Jan 2026 09:18:08 -0500
Subject: [PATCH 02/10] Enable auto-memory storage and memory blending features

Adds automatic storage of successful context_answer responses as memories in context_answer.py, configurable via environment variables. Updates mcp_indexer_server.py to support memory blending in code_search via new parameters (include_memories, memory_weight, per_source_limits). Adjusts Helm values for resource requests/limits and autoscaling for several services. Also refines tool routing in mcpServer.js to match any 'memory' prefix.
---
 ctx-mcp-bridge/package.json                   |  2 +-
 ctx-mcp-bridge/src/mcpServer.js               |  3 +-
 .../helm/context-engine/values-example.yaml   | 37 +++++---
 deploy/helm/context-engine/values.yaml        | 38 ++++----
 scripts/mcp_impl/context_answer.py            | 93 +++++++++++++++++++
 scripts/mcp_indexer_server.py                 | 40 ++++++++
 6 files changed, 177 insertions(+), 36 deletions(-)

diff --git a/ctx-mcp-bridge/package.json b/ctx-mcp-bridge/package.json
index 6c4e93cd..df69fa17 100644
--- a/ctx-mcp-bridge/package.json
+++ b/ctx-mcp-bridge/package.json
@@ -1,6 +1,6 @@
 {
   "name": "@context-engine-bridge/context-engine-mcp-bridge",
-  "version": "0.0.16",
+  "version": "0.0.17",
   "description": "Context Engine MCP bridge (http/stdio proxy combining indexer + memory servers)",
   "bin": {
     "ctxce": "bin/ctxce.js",
diff --git a/ctx-mcp-bridge/src/mcpServer.js b/ctx-mcp-bridge/src/mcpServer.js
index 2fe1b101..d2426d3b 100644
--- a/ctx-mcp-bridge/src/mcpServer.js
+++ b/ctx-mcp-bridge/src/mcpServer.js
@@ -126,7 +126,8 @@ function selectClientForTool(name, indexerClient, memoryClient) {
     return indexerClient;
   }
   const lowered = name.toLowerCase();
-  if (memoryClient && (lowered.startsWith("memory.") || lowered.startsWith("mcp_memory_"))) {
+  // Route to memory server for any memory-prefixed tool
+  if (memoryClient && lowered.startsWith("memory")) {
     return memoryClient;
   }
   return indexerClient;
diff --git a/deploy/helm/context-engine/values-example.yaml b/deploy/helm/context-engine/values-example.yaml
index 94dd017e..4b669359 100644
--- a/deploy/helm/context-engine/values-example.yaml
+++ b/deploy/helm/context-engine/values-example.yaml
@@ -52,15 +52,15 @@ mcpIndexerHttp:
   replicas: 1
   resources:
     requests:
-      cpu: 250m
+      cpu: 500m
       memory: 8Gi
     limits:
-      cpu: "1"
+      cpu: "2"
       memory: 16Gi
   autoscaling:
     enabled: true
     minReplicas: 1
-    maxReplicas: 4
+    maxReplicas: 2
 
 # MCP Memory HTTP
 mcpMemoryHttp:
@@ -68,45 +68,52 @@ mcpMemoryHttp:
   replicas: 1
   resources:
     requests:
-      cpu: 250m
-      memory: 512Mi
+      cpu: 500m
+      memory: 1Gi
     limits:
-      cpu: "1"
-      memory: 2Gi
+      cpu: "1500m"
+      memory: 3Gi
   autoscaling:
     enabled: true
     minReplicas: 1
-    maxReplicas: 3
+    maxReplicas: 1
 
 # Upload Service
 uploadService:
   enabled: true
   replicas: 1
+  resources:
+    requests:
+      cpu: 250m
+      memory: 1Gi
+    limits:
+      cpu: "1500m"
+      memory: 3Gi
   autoscaling:
     enabled: true
     minReplicas: 1
-    maxReplicas: 3
+    maxReplicas: 2
 
 # Watcher
 watcher:
   enabled: true
-  replicas: 1
+  replicas: 2
   resources:
     requests:
       cpu: 500m
-      memory: 2Gi
+      memory: 3Gi
     limits:
-      cpu: "2"
-      memory: 8Gi
+      cpu: "2500m"
+      memory: 10Gi
 
-# Learning Reranker Worker
+# Learning Reranker Worker (singleton - only 1 can run due to leader election)
 learningRerankerWorker:
   enabled: true
   replicas: 1
   autoscaling:
     enabled: true
     minReplicas: 1
-    maxReplicas: 3
+    maxReplicas: 1
 
 # Persistence - shared PVCs
 persistence:
diff --git a/deploy/helm/context-engine/values.yaml b/deploy/helm/context-engine/values.yaml
index 757c02bb..bdfc11be 100644
--- a/deploy/helm/context-engine/values.yaml
+++ b/deploy/helm/context-engine/values.yaml
@@ -140,10 +140,10 @@ mcpIndexerHttp:
   # -- Resource requests and limits
   resources:
     requests:
-      cpu: 250m
+      cpu: 500m
       memory: 8Gi
     limits:
-      cpu: "1"
+      cpu: "2"
       memory: 16Gi
   # -- Liveness probe
   livenessProbe:
@@ -167,7 +167,7 @@ mcpIndexerHttp:
   autoscaling:
     enabled: true
     minReplicas: 1
-    maxReplicas: 4
+    maxReplicas: 2
     targetCPUUtilizationPercentage: 70
     targetMemoryUtilizationPercentage: 80
   # -- Topology spread constraints
@@ -207,11 +207,11 @@ mcpMemoryHttp:
   # -- Resource requests and limits
   resources:
     requests:
-      cpu: 250m
-      memory: 512Mi
+      cpu: 500m
+      memory: 1Gi
     limits:
-      cpu: "1"
-      memory: 2Gi
+      cpu: "1500m"
+      memory: 3Gi
   # -- Liveness probe
   livenessProbe:
     httpGet:
@@ -230,7 +230,7 @@ mcpMemoryHttp:
   autoscaling:
     enabled: true
     minReplicas: 1
-    maxReplicas: 3
+    maxReplicas: 1
     targetCPUUtilizationPercentage: 70
     targetMemoryUtilizationPercentage: 80
   # -- Topology spread constraints
@@ -273,10 +273,10 @@ uploadService:
   resources:
     requests:
       cpu: 250m
-      memory: 512Mi
+      memory: 1Gi
     limits:
-      cpu: "1"
-      memory: 2Gi
+      cpu: "1500m"
+      memory: 3Gi
   # -- Environment variables
   env:
     UPLOAD_SERVICE_HOST: "0.0.0.0"
@@ -288,7 +288,7 @@ uploadService:
   autoscaling:
     enabled: true
     minReplicas: 1
-    maxReplicas: 3
+    maxReplicas: 2
     targetCPUUtilizationPercentage: 70
     targetMemoryUtilizationPercentage: 80
   # -- Topology spread constraints
@@ -305,7 +305,7 @@ watcher:
   # -- Enable Watcher
   enabled: true
   # -- Number of replicas
-  replicas: 1
+  replicas: 2
   # -- Command to run
   command:
     - python
@@ -316,10 +316,10 @@ watcher:
   resources:
     requests:
       cpu: 500m
-      memory: 2Gi
+      memory: 3Gi
     limits:
-      cpu: "2"
-      memory: 8Gi
+      cpu: "2500m"
+      memory: 10Gi
   # -- Environment variables (in addition to configmap)
   env:
     WATCH_ROOT: /work
@@ -343,7 +343,7 @@ watcher:
 learningRerankerWorker:
   # -- Enable Learning Reranker Worker
   enabled: true
-  # -- Number of replicas
+  # -- Number of replicas (singleton worker with leader election - only 1 can run)
   replicas: 1
   # -- Command to run
   command:
@@ -358,11 +358,11 @@ learningRerankerWorker:
     limits:
       cpu: "1"
       memory: 2Gi
-  # -- HPA configuration
+  # -- HPA configuration (capped at 1 - singleton worker)
   autoscaling:
     enabled: true
     minReplicas: 1
-    maxReplicas: 3
+    maxReplicas: 1
     targetCPUUtilizationPercentage: 70
     targetMemoryUtilizationPercentage: 80
   # -- Topology spread constraints
diff --git a/scripts/mcp_impl/context_answer.py b/scripts/mcp_impl/context_answer.py
index 272bb869..20cc3824 100644
--- a/scripts/mcp_impl/context_answer.py
+++ b/scripts/mcp_impl/context_answer.py
@@ -57,6 +57,90 @@
 
 logger = logging.getLogger(__name__)
 
+# ---------------------------------------------------------------------------
+# Auto-memory storage for successful answers
+# ---------------------------------------------------------------------------
+
+# Minimum answer length to auto-store (default 200 chars)
+_AUTO_MEMORY_MIN_CHARS = int(os.environ.get("CONTEXT_ANSWER_AUTO_MEMORY_MIN_CHARS", "200") or 200)
+# Enable/disable auto-memory storage (default ON)
+_AUTO_MEMORY_ENABLED = os.environ.get("CONTEXT_ANSWER_AUTO_MEMORY", "1").strip().lower() in {"1", "true", "yes", "on"}
+
+
+def _maybe_store_answer_as_memory(
+    answer: str,
+    queries: List[str],
+    citations: List[Dict[str, Any]],
+    collection: Optional[str] = None,
+) -> None:
+    """Fire-and-forget storage of successful context_answer responses as memories.
+
+    Criteria for storage:
+    - Answer is not "insufficient context"
+    - Answer has at least one citation
+    - Answer length >= _AUTO_MEMORY_MIN_CHARS (default 200)
+    - _AUTO_MEMORY_ENABLED is True (default)
+
+    Runs in a background thread to not block the response.
+    """
+    if not _AUTO_MEMORY_ENABLED:
+        return
+
+    # Check criteria
+    ans_clean = (answer or "").strip()
+    if not ans_clean:
+        return
+    if ans_clean.lower() == "insufficient context":
+        return
+    if not citations:
+        return
+    if len(ans_clean) < _AUTO_MEMORY_MIN_CHARS:
+        return
+
+    # Build memory content with query context
+    query_str = " | ".join(queries) if queries else "unknown query"
+
+    # Build citation summary (paths only)
+    cite_paths = []
+    for cit in citations[:5]:  # Limit to first 5 citations
+        p = cit.get("path") or cit.get("rel_path") or ""
+        if p:
+            cite_paths.append(p)
+    cite_summary = ", ".join(cite_paths) if cite_paths else "no paths"
+
+    # Format the memory content
+    memory_content = f"Q: {query_str}\n\nA: {ans_clean}\n\nSources: {cite_summary}"
+
+    # Build metadata
+    metadata = {
+        "kind": "context_answer",
+        "source": "auto_memory",
+        "queries": queries,
+        "citation_count": len(citations),
+        "answer_length": len(ans_clean),
+    }
+
+    # Fire-and-forget in background thread
+    import threading
+
+    def _store():
+        try:
+            # Import here to avoid circular imports
+            from scripts.mcp_memory_server import memory_store
+            memory_store(
+                information=memory_content,
+                metadata=metadata,
+                collection=collection,
+            )
+            logger.debug("Auto-stored context_answer as memory (len=%d, cites=%d)", len(ans_clean), len(citations))
+        except Exception as e:
+            # Silently fail - this is best-effort
+            logger.debug("Auto-memory storage failed: %s", e)
+
+    t = threading.Thread(target=_store, daemon=True)
+    t.start()
+
+
 # Keys to strip from citations for slim MCP output (agents only need path + rel_path)
 _VERBOSE_PATH_KEYS = ("host_path", "container_path", "client_path")
 
@@ -3373,4 +3457,13 @@ def _tok2(s: str) -> list[str]:
     }
     if answers_by_query:
         out["answers_by_query"] = answers_by_query
+
+    # Auto-store successful answers as memories (fire-and-forget)
+    _maybe_store_answer_as_memory(
+        answer=answer.strip(),
+        queries=original_queries,
+        citations=citations,
+        collection=collection,
+    )
+
     return out
diff --git a/scripts/mcp_indexer_server.py b/scripts/mcp_indexer_server.py
index 6d0ff832..dabecad3 100644
--- a/scripts/mcp_indexer_server.py
+++ b/scripts/mcp_indexer_server.py
@@ -1685,6 +1685,10 @@ async def code_search(
     case: Any = None,
     session: Any = None,
     compact: Any = None,
+    # Memory blending (opt-in)
+    include_memories: Any = None,
+    memory_weight: Any = None,
+    per_source_limits: Any = None,
     kwargs: Any = None,
 ) -> Dict[str, Any]:
     """Exact alias of repo_search (hybrid code search with reranking enabled by default).
@@ -1692,7 +1696,43 @@ async def code_search(
     Prefer repo_search; this name exists for discoverability in some IDEs/agents.
     Same parameters and return shape as repo_search.
     Reranking (rerank_enabled=true) is ON by default for optimal result quality.
+
+    Memory blending (opt-in):
+    - include_memories: bool. If true, blends memory results with code results.
+    - memory_weight: float (default 1.0). Scales memory scores relative to code.
+    - per_source_limits: dict, e.g. {"code": 5, "memory": 3}
     """
+    # If include_memories is requested, delegate to context_search for blending
+    if include_memories:
+        return await context_search(
+            query=query,
+            limit=limit,
+            per_path=per_path,
+            include_memories=include_memories,
+            memory_weight=memory_weight,
+            per_source_limits=per_source_limits,
+            include_snippet=include_snippet,
+            context_lines=context_lines,
+            rerank_enabled=rerank_enabled,
+            rerank_top_n=rerank_top_n,
+            rerank_return_m=rerank_return_m,
+            rerank_timeout_ms=rerank_timeout_ms,
+            highlight_snippet=highlight_snippet,
+            collection=collection,
+            language=language,
+            under=under,
+            kind=kind,
+            symbol=symbol,
+            path_regex=path_regex,
+            path_glob=path_glob,
+            not_glob=not_glob,
+            ext=ext,
+            not_=not_,
+            case=case,
+            session=session,
+            compact=compact,
+            kwargs=kwargs,
+        )
     return await repo_search(
         query=query,
         limit=limit,

From acd8764b590f4e6aa09777675e7953f0d508afdc Mon Sep 17 00:00:00 2001
From: John Donalson <mirlok@dr.com>
Date: Sun, 25 Jan 2026 12:17:51 -0500
Subject: [PATCH 03/10] Enhance tool metadata, docstrings, and ignore patterns

Added comprehensive TOOLS_METADATA for all search/answer tools and enriched the /readyz endpoint with tool metadata. Expanded and clarified docstrings for all major API functions to provide detailed usage, parameters, and examples. Updated .indexignore and retrieval filters to exclude common build, dist, and IDE artifacts. Increased max_neighbors in context_answer for improved subgraph context injection. Added more session default parameters to set_session_defaults.
---
 .indexignore                       |   15 +
 scripts/mcp_impl/context_answer.py |   10 +-
 scripts/mcp_impl/search.py         |    3 +-
 scripts/mcp_indexer_server.py      | 1340 ++++++++++++++++++++++++----
 scripts/mcp_memory_server.py       |  306 ++++++-
 5 files changed, 1490 insertions(+), 184 deletions(-)

diff --git a/.indexignore b/.indexignore
index 6ab86f92..b4da8f56 100644
--- a/.indexignore
+++ b/.indexignore
@@ -2,3 +2,18 @@ cosqa*.json
 # dev-workspace contains uploaded client workspaces - they get indexed
 # separately via upload service, not as part of the main Context-Engine repo
 dev-workspace/
+
+# CDK/deploy build artifacts - duplicates of source files
+deploy/eks-cdk-*/cdk.out/
+**/cdk.out/
+
+# Build/dist artifacts
+dist/
+build/
+*.egg-info/
+
+# IDE/editor artifacts
+.idea/
+.vscode/
+*.swp
+*.swo
diff --git a/scripts/mcp_impl/context_answer.py b/scripts/mcp_impl/context_answer.py
index 20cc3824..a7f434c5 100644
--- a/scripts/mcp_impl/context_answer.py
+++ b/scripts/mcp_impl/context_answer.py
@@ -750,6 +750,14 @@ def _ca_prepare_filters_and_retrieve(
         "node_modules/",
         ".git/",
         ".git",
+        # CDK/deploy build artifacts (duplicates of source files)
+        "cdk.out/",
+        "**/cdk.out/**",
+        "deploy/eks-cdk-*/cdk.out/",
+        # Build artifacts
+        "dist/",
+        "build/",
+        "*.egg-info/",
     ]
 
     def _variants(p: str) -> list[str]:
@@ -3031,7 +3039,7 @@ def safe_float(val, default=0.0, **kw):
                     items=items,
                     collection=coll,
                     repo=repo,
-                    max_neighbors=2,
+                    max_neighbors=5,
                 )
             except Exception as e:
                 logger.debug(f"Subgraph context injection failed: {e}")
diff --git a/scripts/mcp_impl/search.py b/scripts/mcp_impl/search.py
index 43df34e4..a2565c32 100644
--- a/scripts/mcp_impl/search.py
+++ b/scripts/mcp_impl/search.py
@@ -62,6 +62,7 @@
     logger=logger,
     context="MCP_SNIPPET_MAX_BYTES",
 )
+SEARCH_COMPACT_DEFAULT = os.environ.get("SEARCH_COMPACT_DEFAULT", "0").lower() in {"1", "true", "yes", "on"}
 
 
 async def _repo_search_impl(
@@ -457,7 +458,7 @@ def _to_str_list(x):
             repo_filter = [detected_repo]
 
     compact_raw = compact
-    compact = _to_bool(compact, False)
+    compact = _to_bool(compact, SEARCH_COMPACT_DEFAULT)
     # If snippets are requested, do not compact (we need snippet field in results)
     if include_snippet:
         compact = False
diff --git a/scripts/mcp_indexer_server.py b/scripts/mcp_indexer_server.py
index dabecad3..99891017 100644
--- a/scripts/mcp_indexer_server.py
+++ b/scripts/mcp_indexer_server.py
@@ -295,6 +295,268 @@ def _highlight_snippet(snippet, tokens):  # type: ignore
     _work_script,
 )
 
+TOOLS_METADATA: dict[str, dict] = {
+    "repo_search": {
+        "name": "repo_search",
+        "category": "search",
+        "primary_use": "Hybrid semantic + lexical code search",
+        "choose_when": [
+            "Finding code related to a concept",
+            "Starting a search without knowing which tool",
+            "Need flexible filtering by language/path/symbol",
+        ],
+        "choose_instead": {
+            "symbol_graph": "Need precise caller/definition relationships",
+            "context_answer": "Need an explanation, not raw results",
+            "search_tests_for": "Specifically want test files",
+        },
+        "parameters": {
+            "essential": ["query"],
+            "common": ["limit", "language", "under", "include_snippet"],
+            "advanced": ["rerank_enabled", "output_format", "compact", "mode"],
+        },
+        "returns": {
+            "ok": "bool",
+            "results": "list[{score, path, symbol, start_line, end_line, snippet?}]",
+            "total": "int",
+        },
+        "related_tools": ["code_search", "context_search", "info_request"],
+        "performance": {
+            "typical_latency_ms": (100, 2000),
+            "requires_index": True,
+            "requires_decoder": False,
+        },
+    },
+    "context_answer": {
+        "name": "context_answer",
+        "category": "answer",
+        "primary_use": "LLM-generated answers with code citations",
+        "choose_when": [
+            "Need an explanation of how code works",
+            "Asking 'how does X work?' questions",
+            "Want synthesized answer with sources",
+        ],
+        "choose_instead": {
+            "repo_search": "Want raw code results, not explanation",
+            "symbol_graph": "Need precise relationships",
+        },
+        "parameters": {
+            "essential": ["query"],
+            "common": ["limit", "language", "under", "include_snippet"],
+            "advanced": ["max_tokens", "temperature", "expand", "budget_tokens"],
+        },
+        "returns": {
+            "ok": "bool",
+            "answer": "str",
+            "citations": "list[{id, path, start_line, end_line}]",
+        },
+        "related_tools": ["repo_search", "context_search"],
+        "performance": {
+            "typical_latency_ms": (1000, 10000),
+            "requires_index": True,
+            "requires_decoder": True,
+        },
+    },
+    "symbol_graph": {
+        "name": "symbol_graph",
+        "category": "graph",
+        "primary_use": "AST-backed symbol relationship queries",
+        "choose_when": [
+            "Need 'who calls function X'",
+            "Need 'where is X defined'",
+            "Need 'what imports module Y'",
+            "Doing refactoring impact analysis",
+        ],
+        "choose_instead": {
+            "repo_search": "Want conceptual search, not precise relationships",
+            "search_callers_for": "Quick text search is sufficient",
+        },
+        "parameters": {
+            "essential": ["symbol", "query_type"],
+            "common": ["limit", "language", "under", "repo"],
+            "advanced": ["depth", "output_format"],
+        },
+        "returns": {
+            "ok": "bool",
+            "results": "list[{path, start_line, end_line, symbol, snippet}]",
+            "count": "int",
+        },
+        "related_tools": ["search_callers_for", "search_importers_for"],
+        "performance": {
+            "typical_latency_ms": (50, 500),
+            "requires_index": True,
+            "requires_decoder": False,
+        },
+    },
+    "context_search": {
+        "name": "context_search",
+        "category": "search",
+        "primary_use": "Blend code search with memory retrieval",
+        "choose_when": [
+            "Want code AND stored memories together",
+            "Searching for documented decisions",
+            "Need context from team knowledge",
+        ],
+        "choose_instead": {
+            "repo_search": "Only want code, no memories",
+            "memory_find": "Only want memories, no code",
+        },
+        "parameters": {
+            "essential": ["query"],
+            "common": ["include_memories", "memory_weight", "limit"],
+            "advanced": ["per_source_limits", "rerank_enabled"],
+        },
+        "returns": {
+            "ok": "bool",
+            "results": "list[{source, score, path|content, ...}]",
+            "total": "int",
+        },
+        "related_tools": ["repo_search", "memory_find"],
+        "performance": {
+            "typical_latency_ms": (200, 3000),
+            "requires_index": True,
+            "requires_decoder": False,
+        },
+    },
+    "info_request": {
+        "name": "info_request",
+        "category": "search",
+        "primary_use": "Simplified code discovery with explanations",
+        "choose_when": [
+            "Want simple single-parameter search",
+            "Need human-readable result descriptions",
+            "Building minimal integrations",
+        ],
+        "choose_instead": {
+            "repo_search": "Need full control over parameters",
+            "context_answer": "Need LLM-generated explanation",
+        },
+        "parameters": {
+            "essential": ["info_request"],
+            "common": ["limit", "language", "include_explanation"],
+            "advanced": ["include_relationships", "output_format"],
+        },
+        "returns": {
+            "ok": "bool",
+            "results": "list[{information, relevance_score, path, ...}]",
+            "summary?": "str",
+            "related_concepts?": "list[str]",
+        },
+        "related_tools": ["repo_search", "context_answer"],
+        "performance": {
+            "typical_latency_ms": (100, 2000),
+            "requires_index": True,
+            "requires_decoder": False,
+        },
+    },
+    "pattern_search": {
+        "name": "pattern_search",
+        "category": "search",
+        "primary_use": "Structural code pattern matching",
+        "choose_when": [
+            "Have code example, find similar",
+            "Cross-language pattern search",
+            "Find structural duplicates",
+        ],
+        "choose_instead": {
+            "repo_search": "Searching by concept, not structure",
+            "symbol_graph": "Looking for relationships",
+        },
+        "parameters": {
+            "essential": ["query"],
+            "common": ["language", "limit", "target_languages"],
+            "advanced": ["query_mode", "aroma_rerank", "min_score"],
+        },
+        "returns": {
+            "ok": "bool",
+            "results": "list[{path, start_line, end_line, score, language}]",
+            "query_mode": "str",
+        },
+        "related_tools": ["repo_search"],
+        "performance": {
+            "typical_latency_ms": (200, 3000),
+            "requires_index": True,
+            "requires_decoder": False,
+        },
+    },
+    "search_tests_for": {
+        "name": "search_tests_for",
+        "category": "specialized",
+        "primary_use": "Find test files for a feature/function",
+        "choose_when": ["Specifically want test files", "Looking for test coverage"],
+        "choose_instead": {"repo_search": "Want all code, not just tests"},
+        "parameters": {
+            "essential": ["query"],
+            "common": ["limit", "language", "under"],
+            "advanced": ["include_snippet", "compact"],
+        },
+        "returns": {"ok": "bool", "results": "list[...]", "total": "int"},
+        "related_tools": ["repo_search"],
+        "performance": {
+            "typical_latency_ms": (100, 1500),
+            "requires_index": True,
+            "requires_decoder": False,
+        },
+    },
+    "search_config_for": {
+        "name": "search_config_for",
+        "category": "specialized",
+        "primary_use": "Find configuration files",
+        "choose_when": ["Looking for config files", "Finding settings/options"],
+        "choose_instead": {"repo_search": "Want all code, not just config"},
+        "parameters": {
+            "essential": ["query"],
+            "common": ["limit", "under"],
+            "advanced": ["include_snippet", "compact"],
+        },
+        "returns": {"ok": "bool", "results": "list[...]", "total": "int"},
+        "related_tools": ["repo_search"],
+        "performance": {
+            "typical_latency_ms": (100, 1500),
+            "requires_index": True,
+            "requires_decoder": False,
+        },
+    },
+    "search_callers_for": {
+        "name": "search_callers_for",
+        "category": "specialized",
+        "primary_use": "Text-based search for symbol callers",
+        "choose_when": ["Quick caller search is sufficient", "No graph index available"],
+        "choose_instead": {"symbol_graph": "Need precise AST-backed callers"},
+        "parameters": {
+            "essential": ["query"],
+            "common": ["limit", "language"],
+            "advanced": [],
+        },
+        "returns": {"ok": "bool", "results": "list[...]", "total": "int"},
+        "related_tools": ["symbol_graph"],
+        "performance": {
+            "typical_latency_ms": (100, 1500),
+            "requires_index": True,
+            "requires_decoder": False,
+        },
+    },
+    "search_importers_for": {
+        "name": "search_importers_for",
+        "category": "specialized",
+        "primary_use": "Text-based search for module importers",
+        "choose_when": ["Quick import search is sufficient", "No graph index available"],
+        "choose_instead": {"symbol_graph": "Need precise AST-backed importers"},
+        "parameters": {
+            "essential": ["query"],
+            "common": ["limit", "language"],
+            "advanced": [],
+        },
+        "returns": {"ok": "bool", "results": "list[...]", "total": "int"},
+        "related_tools": ["symbol_graph"],
+        "performance": {
+            "typical_latency_ms": (100, 1500),
+            "requires_index": True,
+            "requires_decoder": False,
+        },
+    },
+}
+
 # Disable DNS rebinding protection - breaks Docker internal networking (Host: mcp:8000)
 _security_settings = (
     TransportSecuritySettings(enable_dns_rebinding_protection=False)
@@ -417,7 +679,6 @@ def do_GET(self):
                         self.send_response(200)
                         self.send_header("Content-Type", "application/json")
                         self.end_headers()
-                        # Hide expand_query when decoder is disabled
                         tools = _TOOLS_REGISTRY
                         try:
                             from scripts.refrag_llamacpp import is_decoder_enabled  # type: ignore
@@ -432,7 +693,12 @@ def do_GET(self):
                                 ]
                         except Exception as e:
                             logger.debug(f"Suppressed exception: {e}")
-                        payload = {"ok": True, "tools": tools}
+                        enriched = []
+                        for t in tools:
+                            name = t.get("name", "")
+                            meta = TOOLS_METADATA.get(name, {})
+                            enriched.append({**t, **meta})
+                        payload = {"ok": True, "tools": enriched, "metadata": TOOLS_METADATA}
                         self.wfile.write(_json_dumps_bytes(payload))
                     else:
                         self.send_response(404)
@@ -979,6 +1245,12 @@ async def set_session_defaults(
     mode: Any = None,
     under: Any = None,
     language: Any = None,
+    repo: Any = None,
+    compact: Any = None,
+    output_format: Any = None,
+    include_snippet: Any = None,
+    rerank_enabled: Any = None,
+    limit: Any = None,
     session: Any = None,
     ctx: Context = None,
     **kwargs,
@@ -989,6 +1261,19 @@ async def set_session_defaults(
     - If request Context is available, persist defaults per-connection so later calls on
       the same MCP session automatically use them (no token required).
     - Optionally also stores token-scoped defaults for cross-connection reuse.
+
+    Parameters:
+    - collection: Default collection name
+    - mode: Search mode hint
+    - under: Default path prefix filter
+    - language: Default language filter
+    - repo: Default repo filter for multi-repo setups
+    - compact: Default compact response mode (bool)
+    - output_format: Default output format ("json" or "toon")
+    - include_snippet: Default snippet inclusion (bool)
+    - rerank_enabled: Default reranking toggle (bool)
+    - limit: Default result limit (int)
+    - session: Session token for cross-connection reuse
     """
     try:
         _extra = _extract_kwargs_payload(kwargs)
@@ -1003,6 +1288,18 @@ async def set_session_defaults(
                 language = _extra.get("language")
             if (session is None or (isinstance(session, str) and str(session).strip() == "")) and _extra.get("session") is not None:
                 session = _extra.get("session")
+            if repo is None and _extra.get("repo") is not None:
+                repo = _extra.get("repo")
+            if compact is None and _extra.get("compact") is not None:
+                compact = _extra.get("compact")
+            if output_format is None and _extra.get("output_format") is not None:
+                output_format = _extra.get("output_format")
+            if include_snippet is None and _extra.get("include_snippet") is not None:
+                include_snippet = _extra.get("include_snippet")
+            if rerank_enabled is None and _extra.get("rerank_enabled") is not None:
+                rerank_enabled = _extra.get("rerank_enabled")
+            if limit is None and _extra.get("limit") is not None:
+                limit = _extra.get("limit")
     except Exception as e:
         logger.debug(f"Suppressed exception: {e}")
 
@@ -1015,6 +1312,23 @@ async def set_session_defaults(
                 defaults[_key] = _s
             else:
                 unset_keys.add(_key)
+    if isinstance(repo, str) and repo.strip():
+        defaults["repo"] = repo.strip()
+    elif isinstance(repo, list):
+        defaults["repo"] = repo
+    if isinstance(output_format, str) and output_format.strip():
+        defaults["output_format"] = output_format.strip()
+    if compact is not None:
+        defaults["compact"] = bool(compact) if not isinstance(compact, bool) else compact
+    if include_snippet is not None:
+        defaults["include_snippet"] = bool(include_snippet) if not isinstance(include_snippet, bool) else include_snippet
+    if rerank_enabled is not None:
+        defaults["rerank_enabled"] = bool(rerank_enabled) if not isinstance(rerank_enabled, bool) else rerank_enabled
+    if limit is not None:
+        try:
+            defaults["limit"] = int(limit)
+        except (ValueError, TypeError):
+            pass
 
     # Per-connection storage (preferred)
     try:
@@ -1119,24 +1433,91 @@ async def repo_search(
     args: Any = None,
     kwargs: Any = None,
 ) -> Dict[str, Any]:
-    """Zero-config code search over repositories (hybrid: vector + lexical RRF, rerank ON by default).
-
-    When to use:
-    - Find relevant code spans quickly; prefer this over embedding-only search.
-    - Use context_answer when you need a synthesized explanation; use context_search to blend with memory notes.
-
-    Key parameters:
-    - query: str or list[str]. Multiple queries are fused; accepts "queries" alias.
-    - limit: int (default 10). Total results across files.
-    - per_path: int (default 2). Max results per file.
-    - include_snippet/context_lines: return inline snippets near hits when true.
-    - rerank_*: ONNX reranker is ON by default for best relevance; timeouts fall back to hybrid.
-    - output_format: "json" (default) or "toon" for token-efficient TOON format.
-    - collection: str. Target collection; defaults to workspace state or env COLLECTION_NAME.
-    - repo: str or list[str]. Filter by repo name(s). Use "*" to search all repos.
+    """Primary hybrid semantic + lexical code search across the repository.
+
+    PRIMARY USE: Find code spans matching a natural language concept or topic.
+
+    CHOOSE THIS WHEN:
+    - You need to find code related to a concept (e.g., "authentication", "caching")
+    - You want to locate implementations, not just definitions
+    - You need flexible filtering by language, path, or symbol
+    - You want the best balance of recall and precision
+    - You're starting a search and aren't sure which specific tool to use
+
+    CHOOSE INSTEAD:
+    - symbol_graph -> when you need "who calls X" or "where is X defined" (AST-backed)
+    - context_answer -> when you need an EXPLANATION, not raw code results
+    - context_search -> when you want to blend code results with stored memories
+    - search_tests_for -> when specifically looking for test files
+    - search_config_for -> when specifically looking for config files
+    - pattern_search -> when searching by code structure/pattern across languages
+
+    QUERY EXAMPLES:
+    Good queries (natural language, conceptual):
+      "authentication middleware"     - finds auth-related code
+      "error handling with retry"     - finds retry logic
+      "database connection pooling"   - finds connection management
+      "user session management"       - finds session-related code
+      "API rate limiting"             - finds rate limit implementations
+      "caching layer implementation"  - finds cache logic
+      "websocket message handling"    - finds WS handlers
+      "file upload processing"        - finds upload logic
+
+    Bad queries (will return poor results):
+      "auth OR login OR session"      - boolean operators NOT supported
+      "def.*authenticate"             - regex NOT supported in query
+      "*.py with class User"          - glob syntax NOT for query field
+      "function"                      - too vague, be more specific
+      "get"                           - too generic
+      "the code that handles the thing" - unclear intent
+
+    ESSENTIAL PARAMETERS:
+    - query (str | list[str]): Natural language description of what you're looking for.
+      Multiple queries are fused for broader recall.
+
+    COMMON PARAMETERS:
+    - limit (int, default=10): Maximum results to return.
+    - per_path (int, default=2): Max results per file. Increase for thorough search.
+    - include_snippet (bool, default=True): Include code snippets in results.
+    - language (str): Filter by language ("python", "typescript", "go", etc.)
+    - under (str): Restrict to directory path ("scripts/", "src/api/")
+    - symbol (str): Filter by symbol name (function, class, method)
+    - path_glob (str | list[str]): File pattern filter ("**/*.py", "src/**")
+    - repo (str | list[str]): Filter by repo name(s). Use "*" for all repos.
+
+    ADVANCED PARAMETERS:
+    - rerank_enabled (bool, default=True): ONNX cross-encoder reranking for relevance.
+    - rerank_top_n (int, default=20): Candidates to rerank. Increase for benchmarks.
+    - output_format (str): "json" (default) or "toon" for token-efficient format.
+    - compact (bool, default=False): Strip verbose fields for minimal response.
+    - mode (str): "code_first", "docs_first", "balanced", or "dense" (pure embedding).
+    - not_glob (str | list[str]): Exclude paths matching pattern.
+    - not_ (str): Exclude results containing this text.
+    - case (str): "sensitive" for case-sensitive matching.
+
+    RETURNS:
+    {
+        "ok": true,
+        "results": [
+            {
+                "score": 0.85,           // Relevance score (0-1+)
+                "path": "src/auth.py",   // File path
+                "symbol": "authenticate", // Symbol name if available
+                "start_line": 42,        // Start line number
+                "end_line": 67,          // End line number
+                "snippet": "def auth..." // Code snippet (if include_snippet=true)
+            }
+        ],
+        "total": 5,                      // Total results returned
+        "used_rerank": true,             // Whether reranking was applied
+        "rerank_counters": {...}         // Reranking statistics
+    }
 
-    Returns:
-    - Dict with keys: results, total, used_rerank, rerank_counters
+    PERFORMANCE TIPS:
+    - Use language filter to reduce search space and improve relevance
+    - Use under filter when you know the general code area
+    - Set include_snippet=false if you only need file locations
+    - Set compact=true to reduce response size for large result sets
     """
     return await _repo_search_impl(
         query=query,
@@ -1305,14 +1686,77 @@ async def search_tests_for(
 ) -> Dict[str, Any]:
     """Find test files related to a query.
 
-    What it does:
-    - Presets common test file globs and forwards to repo_search
-    - Accepts extra filters via kwargs (e.g., language, under, case)
+    PRIMARY USE: Quickly find tests for a feature, function, or module.
+    Convenience wrapper that presets common test file patterns.
+
+    CHOOSE THIS WHEN:
+    - You specifically want TEST files, not implementation code
+    - You're looking for tests related to a feature
+    - You want to find test coverage for a function/class
+    - You're exploring how something is tested
+
+    CHOOSE INSTEAD:
+    - repo_search -> when you want ALL code, not just tests
+    - symbol_graph -> when you need "what tests call function X"
+
+    QUERY EXAMPLES:
+    Good queries (feature/function focused):
+      "user authentication"           - finds tests for auth features
+      "database connection"           - finds DB connection tests
+      "API rate limiting"             - finds rate limit tests
+      "email sending"                 - finds email-related tests
+      "input validation"              - finds validation tests
+      "UserService"                   - finds tests for UserService class
+
+    Bad queries:
+      "all tests"                     - too broad
+      "test_*.py"                     - glob pattern, use path_glob param
+      "pass"                          - assertion keyword, not meaningful
+      "def test_"                     - code fragment, use repo_search
+
+    ESSENTIAL PARAMETERS:
+    - query (str | list[str]): Natural language description of what you want tests for.
+
+    COMMON PARAMETERS:
+    - limit (int, default=10): Maximum results to return.
+    - include_snippet (bool, default=True): Include test code snippets.
+    - context_lines (int): Lines of context around matches.
+    - under (str): Restrict to directory path (e.g., "tests/unit/").
+    - language (str): Filter by language.
+    - compact (bool): Minimal response fields.
+
+    PRESET GLOBS (automatically applied):
+    - tests/**
+    - test/**
+    - **/*test*.*
+    - **/*_test.*
+    - **/Test*/**
+
+    RETURNS: Same schema as repo_search.
+    {
+        "ok": true,
+        "results": [
+            {
+                "score": 0.82,
+                "path": "tests/test_auth.py",
+                "symbol": "test_authenticate_valid_user",
+                "start_line": 45,
+                "end_line": 58,
+                "snippet": "def test_authenticate_valid_user():..."
+            }
+        ],
+        "total": 8
+    }
 
-    Parameters:
-    - query: str or list[str]; limit; include_snippet/context_lines; under; language; compact
+    USAGE PATTERNS:
+    # Find tests for authentication
+    search_tests_for(query="authentication")
+
+    # Find tests in a specific directory
+    search_tests_for(query="database", under="tests/integration/")
 
-    Returns: repo_search result shape.
+    # Find Python tests only
+    search_tests_for(query="caching", language="python")
     """
     return await _search_tests_for_impl(
         query=query,
@@ -1341,13 +1785,86 @@ async def search_config_for(
     kwargs: Any = None,
     ctx: Context = None,
 ) -> Dict[str, Any]:
-    """Find likely configuration files for a service/query.
+    """Find configuration files related to a query.
+
+    PRIMARY USE: Quickly find config files for a service, feature, or setting.
+    Convenience wrapper that presets common config file patterns.
+
+    CHOOSE THIS WHEN:
+    - You need to find configuration for a service/feature
+    - You're looking for environment variables, settings, or options
+    - You want to find where something is configured
+    - You're debugging configuration issues
+
+    CHOOSE INSTEAD:
+    - repo_search -> when you want ALL code, not just config files
+    - search_tests_for -> when looking for test files
+
+    QUERY EXAMPLES:
+    Good queries (service/setting focused):
+      "database connection"           - finds DB config files
+      "authentication settings"       - finds auth config
+      "logging configuration"         - finds logging setup
+      "API keys"                      - finds key config (careful with secrets!)
+      "environment variables"         - finds env config
+      "redis cache"                   - finds Redis config
+      "docker compose"                - finds Docker config
+
+    Bad queries:
+      "*.yaml"                        - glob pattern, handled by presets
+      "config"                        - too vague
+      "settings"                      - too generic
+      "json"                          - file format, not a query
+
+    ESSENTIAL PARAMETERS:
+    - query (str | list[str]): Natural language description of what config you need.
+
+    COMMON PARAMETERS:
+    - limit (int, default=10): Maximum results to return.
+    - include_snippet (bool, default=True): Include config content snippets.
+    - context_lines (int): Lines of context around matches.
+    - under (str): Restrict to directory path.
+    - compact (bool): Minimal response fields.
+
+    PRESET GLOBS (automatically applied):
+    - **/*.yml, **/*.yaml
+    - **/*.json
+    - **/*.toml
+    - **/*.ini
+    - **/*.env
+    - **/*.config, **/*.conf
+    - **/*.properties
+    - **/*.csproj, **/*.props, **/*.targets
+    - **/*.xml
+    - **/appsettings*.json
+
+    RETURNS: Same schema as repo_search.
+    {
+        "ok": true,
+        "results": [
+            {
+                "score": 0.85,
+                "path": "config/database.yml",
+                "start_line": 12,
+                "end_line": 25,
+                "snippet": "database:\\n  host: localhost\\n  port: 5432..."
+            }
+        ],
+        "total": 5
+    }
 
-    What it does:
-    - Presets config file globs (yaml/json/toml/etc.) and forwards to repo_search
-    - Accepts extra filters via kwargs
+    USAGE PATTERNS:
+    # Find database config
+    search_config_for(query="database connection")
 
-    Returns: repo_search result shape.
+    # Find Docker configuration
+    search_config_for(query="docker service ports")
+
+    # Find in specific directory
+    search_config_for(query="api settings", under="config/")
+
+    WARNING: Config files may contain sensitive data (API keys, passwords).
+    Be cautious about exposing results that might contain secrets.
     """
     return await _search_config_for_impl(
         query=query,
@@ -1372,14 +1889,57 @@ async def search_callers_for(
     kwargs: Any = None,
     ctx: Context = None,
 ) -> Dict[str, Any]:
-    """Heuristic search for callers/usages of a symbol.
-
-    When to use:
-    - You want files that reference/invoke a function/class
-
-    Notes:
-    - Thin wrapper over repo_search today; pass language or path_glob to narrow
-    - Returns repo_search result shape
+    """Heuristic text-based search for callers/usages of a symbol.
+
+    PRIMARY USE: Find files that likely call or reference a function/class.
+    Uses text search, not AST analysis - faster but less precise than symbol_graph.
+
+    CHOOSE THIS WHEN:
+    - You want a quick, broad search for symbol references
+    - You're okay with some false positives in exchange for speed
+    - The codebase doesn't have graph index built yet
+    - You want to find textual mentions, not just actual calls
+
+    CHOOSE INSTEAD:
+    - symbol_graph with query_type="callers" -> for PRECISE AST-backed caller analysis
+    - repo_search -> when you want full control over search parameters
+
+    QUERY EXAMPLES:
+    Good queries (symbol names):
+      "authenticate"                  - finds references to authenticate
+      "UserService"                   - finds references to UserService
+      "validate_input"                - finds references to validate_input
+      "CacheManager.get"              - finds references to CacheManager.get
+
+    Bad queries:
+      "who calls authenticate"        - use symbol_graph for this phrasing
+      "find all usages of X"          - use symbol_graph
+      "authentication"                - concept, not symbol name
+
+    ESSENTIAL PARAMETERS:
+    - query (str): Symbol name to find callers/references for.
+
+    COMMON PARAMETERS:
+    - limit (int, default=10): Maximum results to return.
+    - language (str): Filter by language for more relevant results.
+
+    RETURNS: Same schema as repo_search.
+
+    COMPARISON WITH symbol_graph:
+    | Aspect | search_callers_for | symbol_graph |
+    |--------|-------------------|--------------|
+    | Method | Text search | AST analysis |
+    | Speed | Faster | Slower |
+    | Precision | Lower (false positives) | Higher (actual calls) |
+    | Requires | Nothing special | Graph index |
+    | Use for | Quick exploration | Precise refactoring |
+
+    USAGE PATTERNS:
+    # Quick reference search
+    search_callers_for(query="authenticate", language="python")
+
+    # For precise caller analysis, prefer:
+    symbol_graph(symbol="authenticate", query_type="callers")
     """
     return await _search_callers_for_impl(
         query=query,
@@ -1401,13 +1961,61 @@ async def search_importers_for(
     kwargs: Any = None,
     ctx: Context = None,
 ) -> Dict[str, Any]:
-    """Find files likely importing or referencing a module/symbol.
-
-    What it does:
-    - Presets code globs across common languages; forwards to repo_search
-    - Accepts additional filters via kwargs (e.g., under, case)
-
-    Returns: repo_search result shape.
+    """Heuristic text-based search for files importing a module/symbol.
+
+    PRIMARY USE: Find files that likely import a module or symbol.
+    Uses text search, not AST analysis - faster but less precise than symbol_graph.
+
+    CHOOSE THIS WHEN:
+    - You want a quick search for import statements
+    - You're looking for textual import/require/use mentions
+    - The codebase doesn't have graph index built yet
+    - You want approximate results quickly
+
+    CHOOSE INSTEAD:
+    - symbol_graph with query_type="importers" -> for PRECISE AST-backed import analysis
+    - repo_search -> when you want full control over search parameters
+
+    QUERY EXAMPLES:
+    Good queries (module/symbol names):
+      "auth_utils"                    - finds imports of auth_utils
+      "CacheManager"                  - finds imports of CacheManager
+      "qdrant_client"                 - finds imports of qdrant_client
+      "express"                       - finds require('express')
+      "pandas"                        - finds import pandas
+
+    Bad queries:
+      "what imports X"                - use symbol_graph for this phrasing
+      "import statements"             - too vague
+      "from ... import"               - syntax, not a module name
+
+    ESSENTIAL PARAMETERS:
+    - query (str): Module or symbol name to find importers for.
+
+    COMMON PARAMETERS:
+    - limit (int, default=10): Maximum results to return.
+    - language (str): Filter by language for more relevant results.
+
+    PRESET GLOBS (automatically applied):
+    Code files across all common languages (*.py, *.js, *.ts, *.go, etc.)
+
+    RETURNS: Same schema as repo_search.
+
+    COMPARISON WITH symbol_graph:
+    | Aspect | search_importers_for | symbol_graph |
+    |--------|---------------------|--------------|
+    | Method | Text search | AST analysis |
+    | Speed | Faster | Slower |
+    | Precision | Lower (false positives) | Higher (actual imports) |
+    | Requires | Nothing special | Graph index |
+    | Use for | Quick exploration | Precise dependency analysis |
+
+    USAGE PATTERNS:
+    # Quick import search
+    search_importers_for(query="qdrant_client", language="python")
+
+    # For precise import analysis, prefer:
+    symbol_graph(symbol="qdrant_client", query_type="importers")
     """
     return await _search_importers_for_impl(
         query=query,
@@ -1433,35 +2041,102 @@ async def symbol_graph(
     depth: Any = None,
     ctx: Context = None,
 ) -> Dict[str, Any]:
-    """Query the symbol graph to find callers, definitions, or importers.
+    """AST-backed symbol graph queries for precise code relationships.
+
+    PRIMARY USE: Find WHO CALLS a function, WHERE something is DEFINED,
+    or WHAT IMPORTS a module using the pre-built symbol graph.
+
+    CHOOSE THIS WHEN:
+    - You need "who calls this function?" (callers)
+    - You need "where is this defined?" (definition)
+    - You need "what imports this module?" (importers)
+    - You need "what does this function call?" (callees)
+    - You want PRECISE relationships, not text-based fuzzy matches
+    - You're doing refactoring impact analysis
+
+    CHOOSE INSTEAD:
+    - repo_search -> when you want CONCEPTUAL search, not precise relationships
+    - search_callers_for -> convenience wrapper, uses text search (less precise)
+    - search_importers_for -> convenience wrapper, uses text search (less precise)
+
+    QUERY EXAMPLES:
+
+    For "callers" query_type (who calls X?):
+      symbol="authenticate"          - finds all callers of authenticate()
+      symbol="UserService.get_user"  - finds callers of get_user method
+      symbol="validate_input"        - finds where validate_input is called
+
+    For "definition" query_type (where is X defined?):
+      symbol="CacheManager"          - finds CacheManager class definition
+      symbol="run_hybrid_search"     - finds function definition
+      symbol="USER_TIMEOUT"          - finds constant definition
+
+    For "importers" query_type (what imports X?):
+      symbol="auth_utils"            - finds files importing auth_utils module
+      symbol="CacheManager"          - finds files importing CacheManager
+      symbol="qdrant_client"         - finds files importing qdrant_client
+
+    For "callees" query_type (what does X call?):
+      symbol="authenticate"          - finds functions called BY authenticate
+      symbol="process_request"       - finds all functions process_request calls
+
+    ESSENTIAL PARAMETERS:
+    - symbol (str): Symbol name to analyze. Can be:
+      - Simple name: "authenticate"
+      - Qualified path: "UserService.get_user"
+      - Module name: "auth_utils"
+
+    - query_type (str, default="callers"): Type of relationship query:
+      - "callers": Find code that CALLS this symbol
+      - "definition": Find WHERE this symbol is DEFINED
+      - "importers": Find code that IMPORTS this symbol/module
+      - "callees": Find what this symbol CALLS (inverse of callers)
+
+    COMMON PARAMETERS:
+    - limit (int, default=20): Maximum results to return.
+    - depth (int, default=1): Traversal depth for multi-hop queries.
+      - depth=1: Direct relationships only
+      - depth=2: Callers of callers, callees of callees, etc.
+      - depth=3+: Use sparingly, can be expensive
+    - language (str): Filter by language.
+    - under (str): Filter by path prefix.
+    - repo (str): Filter by repository name. Use "*" for all repos.
+    - output_format (str): "json" or "toon" for token-efficient format.
+
+    RETURNS:
+    {
+        "ok": true,
+        "results": [
+            {
+                "path": "src/api/handlers.py",
+                "start_line": 142,
+                "end_line": 145,
+                "symbol": "handle_login",
+                "symbol_path": "handlers.handle_login",
+                "language": "python",
+                "snippet": "    result = authenticate(username, password)",
+                "hop": 1,           // For depth>1: which hop found this
+                "via": "authenticate"  // For depth>1: intermediate symbol
+            }
+        ],
+        "symbol": "authenticate",
+        "query_type": "callers",
+        "count": 12,
+        "depth": 1,
+        "used_graph": true,    // True if graph collection was used (fast)
+        "suggestions": [...]   // Fuzzy matches if exact symbol not found
+    }
 
-    When to use:
-    - "Who calls X?" → query_type="callers"
-    - "Where is X defined?" → query_type="definition"
-    - "What imports Y?" → query_type="importers"
-    - "What does X call?" → query_type="callees"
-
-    Key parameters:
-    - symbol: str. The function, class, or module name to search for.
-    - query_type: str. One of "callers", "definition", "importers".
-    - limit: int (default 20). Maximum results to return.
-    - language: str (optional). Filter by programming language.
-    - under: str (optional). Filter by path prefix.
-    - repo: str (optional). Filter by repository name. Use "*" to search all repos.
-    - output_format: "json" (default) or "toon" for token-efficient format.
-    - depth: int (default 1). Multi-hop traversal depth. 2 = callers of callers, etc.
+    MULTI-HOP EXAMPLE (depth=2):
+    # "Who calls the callers of authenticate?"
+    symbol_graph(symbol="authenticate", query_type="callers", depth=2)
+    # Returns both direct callers (hop=1) and callers-of-callers (hop=2)
 
-    Returns:
-    - {"results": [...], "symbol": str, "query_type": str, "count": int, "depth": int}
-    - Each result includes path, start_line, end_line, symbol_path, and relevant context.
-    - Multi-hop results include "hop" (1, 2, ...) and "via" (intermediate symbol).
-
-    Example:
-    - symbol_graph(symbol="get_embedding_model", query_type="callers")
-    - symbol_graph(symbol="ASTAnalyzer", query_type="definition")
-    - symbol_graph(symbol="qdrant_client", query_type="importers")
-    - symbol_graph(symbol="my_function", query_type="callers", repo="backend")
-    - symbol_graph(symbol="authenticate", query_type="callers", depth=2)
+    NOTES:
+    - Graph must be indexed (run qdrant_index_root first)
+    - For fuzzy matching, suggestions are returned if exact symbol not found
+    - Hydration adds code snippets and accurate line numbers automatically
+    - Use depth>1 carefully - exponential growth in results
     """
     if not symbol or not str(symbol).strip():
         return {"error": "symbol parameter is required", "results": []}
@@ -1601,36 +2276,112 @@ async def context_answer(
     repo: Any = None,  # str, list[str], or "*" to search all repos
     kwargs: Any = None,
 ) -> Dict[str, Any]:
-    """Natural-language Q&A over the repo using retrieval + local LLM (llama.cpp).
-
-    What it does:
-    - Retrieves relevant code (hybrid vector+lexical with reranking enabled by default).
-    - Budgets/merges micro-spans, builds citations, and asks the LLM to answer.
-    - Returns a concise answer plus file/line citations.
-
-    When to use:
-    - You need an explanation or "how to" grounded in code.
-    - Prefer repo_search for raw hits; prefer context_search to blend code + memory.
-
-    Key parameters:
-    - query: str or list[str]; may be expanded if expand=true.
-    - budget_tokens: int. Token budget across code spans (defaults from MICRO_BUDGET_TOKENS).
-    - include_snippet: bool (default true). Include code snippets sent to the LLM and return them when requested.
-    - max_tokens, temperature: decoding controls.
-    - mode: "stitch" (default) or "pack" for prompt assembly.
-    - expand: bool. Use tiny local LLM to propose up to 2 alternate queries.
-    - Filters: language, under, kind, symbol, ext, path_regex, path_glob, not_glob, not_, case.
-    - repo: str or list[str]. Filter by repo name(s). Use "*" to search all repos (disable auto-filter).
-      By default, auto-detects current repo from CURRENT_REPO env and filters to it.
+    """Generate LLM-powered answers with citations grounded in retrieved code.
+
+    PRIMARY USE: Get an EXPLANATION or ANSWER to a question, not raw search results.
+    Uses retrieval-augmented generation (RAG) with a local LLM decoder.
+
+    CHOOSE THIS WHEN:
+    - You need an EXPLANATION ("How does X work?", "What is Y?")
+    - You want a synthesized answer with source citations
+    - You're asking a question that requires understanding, not just finding
+    - You want the system to READ code and EXPLAIN it to you
+
+    CHOOSE INSTEAD:
+    - repo_search -> when you want RAW CODE RESULTS, not explanations
+    - symbol_graph -> when you need precise "who calls X" relationships
+    - context_search -> when you want code + memories without LLM synthesis
+
+    QUERY EXAMPLES:
+    Good queries (questions requiring explanation):
+      "How does the authentication system validate tokens?"
+      "What is the purpose of the CacheManager class?"
+      "Explain the error handling strategy in the API layer"
+      "How are database connections pooled in this project?"
+      "What happens when a user session expires?"
+      "Describe the data flow for user registration"
+      "How does retry logic work in the HTTP client?"
+
+    Bad queries (not suited for LLM answers):
+      "find authentication code"      - use repo_search for finding code
+      "list all Python files"         - use repo_search with language filter
+      "UserController"                - symbol name only, use symbol_graph
+      "src/auth.py"                   - file path, just read the file
+      "def authenticate"              - code fragment, use repo_search
+
+    ESSENTIAL PARAMETERS:
+    - query (str | list[str]): Question or topic requiring explanation.
+      Should be phrased as a question or request for explanation.
+
+    RETRIEVAL PARAMETERS:
+    - limit (int, default=15): Code spans to retrieve for context.
+    - per_path (int, default=5): Max spans per file.
+    - budget_tokens (int): Token budget for code context. Default from env.
+    - include_snippet (bool, default=True): Include code in response.
+    - language (str): Filter retrieval by language.
+    - under (str): Restrict retrieval to directory path.
+    - repo (str | list[str]): Filter by repo. Use "*" for all repos.
+
+    GENERATION PARAMETERS:
+    - max_tokens (int): Max tokens for generated answer.
+    - temperature (float): Sampling temperature (0.0-1.0). Lower = more focused.
+    - mode (str): Prompt assembly mode. "stitch" (default) or "pack".
+    - expand (bool): Use LLM to generate query expansions for better recall.
+
+    COMMON FILTER PARAMETERS (same as repo_search):
+    - symbol (str): Filter by symbol name.
+    - path_glob (str | list[str]): Filter by file pattern.
+    - not_glob (str | list[str]): Exclude file patterns.
+    - ext (str): Filter by file extension.
+
+    RETURNS:
+    {
+        "ok": true,
+        "answer": "The authentication system validates tokens by first checking
+                   the JWT signature using the secret from config [1], then
+                   verifying expiration time [2]. If valid, it extracts the
+                   user ID and loads permissions from the database [3].",
+        "citations": [
+            {
+                "id": 1,
+                "path": "src/auth/jwt.py",
+                "start_line": 45,
+                "end_line": 52,
+                "snippet": "def verify_token(token):..."  // Optional
+            },
+            {
+                "id": 2,
+                "path": "src/auth/jwt.py",
+                "start_line": 54,
+                "end_line": 58
+            },
+            {
+                "id": 3,
+                "path": "src/auth/permissions.py",
+                "start_line": 23,
+                "end_line": 31
+            }
+        ],
+        "query": ["How does authentication validate tokens"],
+        "used": {
+            "spans": 5,
+            "tokens": 1842
+        }
+    }
 
-    Returns:
-    - {"answer": str, "citations": [{"path": str, "start_line": int, "end_line": int}], "query": list[str], "used": {...}}
-    - On decoder disabled/error, returns {"error": "...", "citations": [...], "query": [...]}
+    // On insufficient context:
+    {
+        "answer": "insufficient context",
+        "citations": [],
+        "query": [...],
+        "hint": "Try broadening your query or checking if the feature exists"
+    }
 
-    Notes:
-    - Reranking is enabled by default for optimal retrieval quality.
-    - Honors env knobs such as REFRAG_MODE, REFRAG_GATE_FIRST, MICRO_BUDGET_TOKENS, DECODER_*.
-    - Keeps answers brief (2–4 sentences) and grounded; rejects ungrounded output.
+    NOTES:
+    - Answers include bracketed citations like [1], [2] referencing the citations array
+    - If context is insufficient, returns "insufficient context" as the answer
+    - Local LLM decoder must be available (llama.cpp or cloud fallback)
+    - Reranking is enabled by default for optimal retrieval quality
     """
     return await _context_answer_impl(
         query=query,
@@ -1691,16 +2442,46 @@ async def code_search(
     per_source_limits: Any = None,
     kwargs: Any = None,
 ) -> Dict[str, Any]:
-    """Exact alias of repo_search (hybrid code search with reranking enabled by default).
+    """Alias of repo_search for discoverability. Use repo_search directly.
+
+    PRIMARY USE: This is an EXACT ALIAS of repo_search. Exists for discoverability
+    in IDEs and agents that might search for "code_search" instead of "repo_search".
+
+    CHOOSE THIS WHEN:
+    - You would use repo_search (they are identical)
+    - Your tooling expects a "code_search" function name
 
-    Prefer repo_search; this name exists for discoverability in some IDEs/agents.
-    Same parameters and return shape as repo_search.
-    Reranking (rerank_enabled=true) is ON by default for optimal result quality.
+    CHOOSE INSTEAD:
+    - repo_search -> same functionality, canonical name
+    - See repo_search docstring for full documentation
 
-    Memory blending (opt-in):
+    QUERY EXAMPLES:
+    Good queries (natural language, conceptual):
+      "authentication middleware"     - finds auth-related code
+      "error handling with retry"     - finds retry logic
+      "database connection setup"     - finds DB connection code
+      "user input validation"         - finds validation logic
+      "async task processing"         - finds async patterns
+
+    Bad queries (will return poor results):
+      "auth AND login"                - boolean operators NOT supported
+      "grep -r 'password'"            - not a shell command
+      "class.*Controller"             - regex NOT supported
+      "SELECT * FROM users"           - SQL query, not code search
+      "https://github.com/..."        - URL, not a search query
+
+    ESSENTIAL PARAMETERS:
+    - query (str): Natural language description of code you're looking for.
+
+    All parameters and return format are identical to repo_search.
+    See repo_search documentation for complete parameter reference.
+
+    MEMORY BLENDING (opt-in, delegates to context_search):
     - include_memories: bool. If true, blends memory results with code results.
     - memory_weight: float (default 1.0). Scales memory scores relative to code.
     - per_source_limits: dict, e.g. {"code": 5, "memory": 3}
+
+    RETURNS: Same schema as repo_search.
     """
     # If include_memories is requested, delegate to context_search for blending
     if include_memories:
@@ -1789,31 +2570,105 @@ async def info_request(
     output_format: Any = None,  # "json" (default) or "toon" for token-efficient format
     kwargs: Any = None,
 ) -> Dict[str, Any]:
-    """Simplified codebase retrieval with optional explanation mode.
+    """Simplified codebase discovery with optional explanation mode.
+
+    PRIMARY USE: Quick, single-parameter code search with human-readable results.
+    Designed as a drop-in replacement for basic "find code about X" queries.
+
+    CHOOSE THIS WHEN:
+    - You want a simple, one-parameter search interface
+    - You want results with human-readable "information" descriptions
+    - You want optional explanation mode for richer context
+    - You're building a simple integration and want minimal complexity
+
+    CHOOSE INSTEAD:
+    - repo_search -> when you need full control over filtering and parameters
+    - context_answer -> when you need an LLM-generated ANSWER, not just results
+    - symbol_graph -> when you need precise call/definition relationships
+
+    QUERY EXAMPLES:
+    Good queries (natural language descriptions):
+      "database connection pooling"     - finds DB connection code
+      "authentication middleware"       - finds auth-related code
+      "error handling patterns"         - finds error handling logic
+      "user input validation"           - finds validation code
+      "caching implementation"          - finds cache logic
+      "logging configuration"           - finds logging setup
+      "API endpoint handlers"           - finds route handlers
+
+    Bad queries (too vague or wrong format):
+      "code"                           - too vague
+      "the function"                   - unspecific
+      "*.py"                           - glob pattern, use path_glob param
+      "auth|login"                     - boolean syntax not supported
+      "line 42"                        - use file reading for specific lines
+
+    ESSENTIAL PARAMETERS:
+    - info_request (str): Natural language description of code you're looking for.
+    - information_request (str): Alias for info_request.
+
+    EXPLANATION MODE PARAMETERS:
+    - include_explanation (bool, default=False): When true, adds:
+      - summary: Brief overview of what was found
+      - primary_locations: Key file paths
+      - related_concepts: Technical concepts discovered
+      - query_understanding: How the query was interpreted
+
+    - include_relationships (bool, default=False): When true, adds to each result:
+      - imports_from: Modules this code imports
+      - calls: Functions this code calls
+      - related_paths: Related files
+
+    COMMON PARAMETERS:
+    - limit (int, default=10): Maximum results to return.
+    - language (str): Filter by language ("python", "typescript", etc.)
+    - under (str): Restrict to directory path.
+    - repo (str | list[str]): Filter by repo. Use "*" for all repos.
+    - output_format (str): "json" or "toon" for token-efficient format.
+    - include_snippet (bool, default=True): Include code snippets.
+
+    RETURNS (compact mode, default):
+    {
+        "ok": true,
+        "results": [
+            {
+                "information": "Found function 'authenticate' in src/auth.py (lines 42-67)",
+                "relevance_score": 0.85,   // Alias for score
+                "score": 0.85,
+                "path": "src/auth.py",
+                "symbol": "authenticate",
+                "start_line": 42,
+                "end_line": 67
+            }
+        ],
+        "total": 5
+    }
 
-    When to use:
-    - Simple, single-parameter code search with human-readable descriptions
-    - When you want optional explanation mode for richer context
-    - Drop-in replacement for basic codebase retrieval tools
-
-    Key parameters:
-    - info_request: str. Natural language description of the code you're looking for.
-    - information_request: str. Alias for info_request.
-    - include_explanation: bool (default false). Add summary, primary_locations, related_concepts.
-    - include_relationships: bool (default false). Add imports_from, calls, related_paths to results.
-    - limit: int (default 10). Maximum results to return.
-    - language: str. Filter by programming language.
-    - under: str. Limit search to specific directory.
-    - repo: str or list[str]. Filter by repository name(s).
-    - output_format: "json" (default) or "toon" for token-efficient TOON format.
+    RETURNS (explanation mode, include_explanation=True):
+    {
+        "ok": true,
+        "results": [...],
+        "summary": "Found 5 authentication-related functions across 3 files",
+        "primary_locations": ["src/auth.py", "src/middleware/auth.py"],
+        "related_concepts": ["jwt", "token", "session", "middleware"],
+        "query_understanding": "Looking for authentication implementation code",
+        "confidence": {
+            "level": "high",
+            "score": 0.82,
+            "symbol_matches": 3
+        }
+    }
 
-    Returns:
-    - Compact mode (default): results with information field and relevance_score alias
-    - Explanation mode: adds summary, primary_locations, related_concepts, query_understanding
+    USAGE PATTERNS:
+    # Simple discovery:
+    info_request(info_request="database connection")
 
-    Example:
-    - {"info_request": "database connection pooling"}
-    - {"info_request": "authentication middleware", "include_explanation": true}
+    # With explanation:
+    info_request(
+        info_request="authentication flow",
+        include_explanation=True,
+        include_relationships=True
+    )
     """
     # Resolve query from either parameter
     query = info_request or information_request
@@ -2004,29 +2859,90 @@ async def context_search(
     output_format: Any = None,
     kwargs: Any = None,
 ) -> Dict[str, Any]:
-    """Blend code search results with memory-store entries (notes, docs) for richer context.
-
-    When to use:
-    - You want code spans plus relevant memories in one response.
-    - Prefer repo_search for code-only; use context_answer when you need an LLM-written answer.
-
-    Key parameters:
-    - query: str or list[str]
-    - include_memories: bool (opt-in). If true, queries the memory collection and merges with code results.
-    - memory_weight: float (default 1.0). Scales memory scores relative to code.
-    - per_source_limits: dict, e.g. {"code": 5, "memory": 3}
-    - All repo_search filters are supported and passed through.
-    - output_format: "json" (default) or "toon" for token-efficient TOON format.
-    - rerank_enabled: bool (default true). ONNX reranker is ON by default for better relevance.
-    - repo: str or list[str]. Filter by repo name(s). Use "*" to search all repos (disable auto-filter).
-      By default, auto-detects current repo from CURRENT_REPO env and filters to it.
+    """Blend code search results with memory-store entries for richer context.
+
+    PRIMARY USE: Search code AND retrieve relevant stored memories/notes in one call.
+
+    CHOOSE THIS WHEN:
+    - You want code results PLUS relevant memories (notes, docs, decisions)
+    - You're searching for something where team knowledge might help
+    - You want to surface both implementation AND documentation/context
+    - You need to check if there are existing notes about a topic
+
+    CHOOSE INSTEAD:
+    - repo_search -> when you ONLY want code results (faster, no memory overhead)
+    - context_answer -> when you need an LLM-generated EXPLANATION
+    - memory_find -> when you ONLY want memories (no code search)
+
+    QUERY EXAMPLES:
+    Good queries (conceptual, topic-based):
+      "authentication design decisions"  - finds code + stored auth decisions
+      "API versioning strategy"          - finds API code + design notes
+      "database migration approach"      - finds migration code + notes
+      "caching invalidation policy"      - finds cache code + policy notes
+      "error handling conventions"       - finds error code + team standards
+
+    Bad queries (too narrow for memory blending):
+      "def authenticate("               - too specific, use repo_search
+      "class UserController"            - exact match, use repo_search
+      "line 42 of auth.py"              - specific location, just read the file
+      "git commit abc123"               - not a search query
+      "npm install express"             - command, not a query
+
+    ESSENTIAL PARAMETERS:
+    - query (str | list[str]): Natural language description of what you're looking for.
+
+    MEMORY BLENDING PARAMETERS:
+    - include_memories (bool, default=False): MUST SET TO TRUE to enable memory blending.
+      Without this, context_search behaves identically to repo_search.
+    - memory_weight (float, default=1.0): Scale memory scores relative to code.
+      Values >1.0 boost memories, <1.0 favor code results.
+    - per_source_limits (dict): Control results per source.
+      Example: {"code": 6, "memory": 3} returns max 6 code + 3 memory results.
+
+    COMMON PARAMETERS (same as repo_search):
+    - limit (int, default=10): Maximum total results.
+    - language (str): Filter code results by language.
+    - under (str): Restrict code search to directory path.
+    - include_snippet (bool, default=True): Include code snippets.
+    - rerank_enabled (bool, default=True): Cross-encoder reranking.
+    - output_format (str): "json" or "toon" for token-efficient format.
+    - repo (str | list[str]): Filter by repo. Use "*" for all repos.
+
+    RETURNS:
+    {
+        "ok": true,
+        "results": [
+            {
+                "source": "code",         // "code" or "memory"
+                "score": 0.85,
+                "path": "src/auth.py",    // For code results
+                "symbol": "authenticate",
+                "start_line": 42,
+                "end_line": 67,
+                "snippet": "def auth..."
+            },
+            {
+                "source": "memory",       // Memory results have different shape
+                "score": 0.78,
+                "content": "Auth uses JWT tokens with 24h expiry...",
+                "metadata": {"kind": "note", "created_at": "2024-..."}
+            }
+        ],
+        "total": 9,
+        "memory_note": "3 memories included"  // Optional note about memory results
+    }
 
-    Returns:
-    - {"results": [{"source": "code"| "memory", ...}, ...], "total": N[, "memory_note": str]}
-    - In compact mode, results are reduced to lightweight records.
+    USAGE PATTERN:
+    # To blend code + memories (recommended pattern):
+    context_search(
+        query="authentication architecture",
+        include_memories=True,
+        per_source_limits={"code": 5, "memory": 3}
+    )
 
-    Example:
-    - include_memories=true, per_source_limits={"code": 6, "memory": 2}, path_glob="docs/**"
+    # To search code only (same as repo_search):
+    context_search(query="authentication", include_memories=False)
     """
     return await _context_search_impl(
         query=query,
@@ -2112,34 +3028,110 @@ async def pattern_search(
     ) -> Dict[str, Any]:
         """Find structurally similar code patterns across all languages.
 
-        Accepts EITHER code examples OR natural language descriptions - auto-detects which.
-
-        When to use:
-        - Find code with similar control flow (retry loops, error handling, etc.)
-        - Cross-language pattern matching (Python pattern → Go/Rust/Java matches)
-        - Detect code duplication based on structure, not syntax
-        - Search by pattern description ("retry with backoff", "resource cleanup")
+        PRIMARY USE: Search by CODE STRUCTURE rather than text/semantics.
+        Finds code with similar control flow, API usage, or patterns.
+
+        CHOOSE THIS WHEN:
+        - You have a CODE EXAMPLE and want to find similar patterns
+        - You want to find code STRUCTURALLY similar (not just textually)
+        - You're searching across languages (Python pattern -> find in Go/Rust/Java)
+        - You want to detect code duplication based on structure
+        - You're searching for patterns like "retry with backoff", "singleton"
+
+        CHOOSE INSTEAD:
+        - repo_search -> when searching by CONCEPT, not structural pattern
+        - symbol_graph -> when looking for call/definition relationships
+        - context_answer -> when you need an EXPLANATION
+
+        QUERY EXAMPLES:
+
+        Code example mode (query_mode="code" or auto-detected):
+          "for i in range(3): try: ... except: time.sleep(2**i)"
+          "if err != nil { return err }"
+          "async function $NAME($$$) { await $EXPR; }"
+          "with open(file) as f: data = f.read()"
+          "try { ... } catch (e) { console.error(e); throw e; }"
+
+        Description mode (query_mode="description" or auto-detected):
+          "retry with exponential backoff"
+          "resource cleanup pattern"
+          "singleton implementation"
+          "factory pattern"
+          "decorator wrapping function"
+          "error handling with logging"
+          "connection pooling"
+          "rate limiting implementation"
+
+        Bad queries (wrong use case):
+          "authentication code"           - use repo_search for concepts
+          "who calls authenticate"        - use symbol_graph
+          "explain the auth flow"         - use context_answer
+          "files in src/"                 - use glob/file tools
+
+        ESSENTIAL PARAMETERS:
+        - query (str): EITHER a code example OR a natural language pattern description.
+          The mode is auto-detected, or you can force it with query_mode.
+
+        MODE CONTROL PARAMETERS:
+        - query_mode (str, default="auto"): How to interpret the query.
+          - "auto": Auto-detect if query is code or description
+          - "code": Force interpretation as code example
+          - "description": Force interpretation as pattern description
+        - language (str): Language hint for code examples. Also triggers code mode
+          in auto-detection. Example: "python", "go", "rust", "typescript"
+
+        COMMON PARAMETERS:
+        - limit (int, default=10): Maximum results to return.
+        - min_score (float, default=0.3): Minimum similarity score threshold.
+        - include_snippet (bool, default=True): Include code snippets in results.
+        - target_languages (list[str]): Filter results to specific languages.
+          Example: ["python", "go"] to find pattern only in Python and Go files.
+        - repo (str | list[str]): Filter by repo. Use "*" for all repos.
+        - output_format (str): "json" or "toon" for token-efficient format.
+        - compact (bool): Minimal response fields.
+
+        AROMA RERANKING PARAMETERS:
+        - aroma_rerank (bool, default=True): Enable AROMA-style pruning/reranking.
+          Improves precision by penalizing partial matches.
+        - aroma_alpha (float, default=0.6): Weight for pruned similarity vs original.
+          Higher values trust pruning more.
+
+        RETURNS:
+        {
+            "ok": true,
+            "results": [
+                {
+                    "path": "src/client.py",
+                    "start_line": 89,
+                    "end_line": 102,
+                    "score": 0.78,
+                    "language": "python",
+                    "snippet": "for attempt in range(max_retries):..."
+                }
+            ],
+            "total": 7,
+            "query_mode": "code",         // or "description"
+            "query_signature": "...",     // Internal: pattern signature used
+            "detection": {                // Mode detection metadata
+                "confidence": 0.95,
+                "ast_validated": true,
+                "signals": {"ast_parsed": 1.0, "nl_similarity": 0.42}
+            }
+        }
 
-        Key parameters:
-        - query: str. Code snippet OR natural language description of pattern.
-        - query_mode: str. "code", "description", or "auto" (default). Explicit override for detection.
-        - language: str. Language hint for code examples (also triggers code mode in auto).
-        - limit: int (default 10). Maximum results to return.
-        - min_score: float (default 0.3). Minimum similarity score threshold.
-        - include_snippet: bool (default false). Include code snippets in results.
-        - target_languages: list[str]. Filter to specific target languages.
-        - output_format: "json" (default) or "toon" for token-efficient format.
-        - compact: bool. If true with TOON, use minimal fields.
-        - aroma_rerank: bool (default true). Enable AROMA-style pruning and reranking.
-        - aroma_alpha: float (default 0.6). Weight for pruned similarity vs original score.
-
-        Returns:
-        - {ok, results: [{path, start_line, end_line, score, language, ...}], total, query_signature}
+        CROSS-LANGUAGE EXAMPLE:
+        # Find Go error handling similar to Python pattern
+        pattern_search(
+            query="if err != nil { return err }",
+            language="go",
+            target_languages=["python", "rust", "java"]
+        )
 
-        Examples:
-        - pattern_search(query="for i in range(3): try: ... except: time.sleep(2**i)")
-        - pattern_search(query="retry with exponential backoff", query_mode="description")
-        - pattern_search(query="if err != nil { return err }", language="go")
+        NOTES:
+        - Pattern vectors must be indexed (PATTERN_VECTORS=1 during indexing)
+        - Auto-detection uses AST parsing + NL embedder comparison
+        - Code mode uses structural pattern matching
+        - Description mode uses semantic search on pattern descriptions
         """
         return await _pattern_search_impl(
             query=query,
diff --git a/scripts/mcp_memory_server.py b/scripts/mcp_memory_server.py
index b11607f4..05e0c759 100644
--- a/scripts/mcp_memory_server.py
+++ b/scripts/mcp_memory_server.py
@@ -68,6 +68,7 @@
 LEX_VECTOR_NAME = os.environ.get("LEX_VECTOR_NAME", "lex")
 LEX_VECTOR_DIM = int(os.environ.get("LEX_VECTOR_DIM", "4096") or 4096)
 EMBEDDING_MODEL = os.environ.get("EMBEDDING_MODEL", "BAAI/bge-base-en-v1.5")
+MEMORY_FIND_LIMIT_DEFAULT = int(os.environ.get("MEMORY_FIND_LIMIT_DEFAULT", "10") or 10)
 
 # Minimal embedding via fastembed (CPU)
 
@@ -137,6 +138,58 @@ def _ensure_once(name: str) -> bool:
         return False
 
 # Disable DNS rebinding protection - breaks Docker internal networking (Host: mcp:8000)
+TOOLS_METADATA: Dict[str, Dict] = {
+    "memory_store": {
+        "name": "memory_store",
+        "category": "memory",
+        "primary_use": "Store knowledge for later retrieval",
+        "choose_when": [
+            "Storing team decisions/notes",
+            "Documenting conventions",
+            "Building institutional memory",
+        ],
+        "choose_instead": {},
+        "parameters": {
+            "essential": ["information"],
+            "common": ["metadata"],
+            "advanced": ["collection", "session"],
+        },
+        "returns": {"ok": "bool", "id": "str", "message": "str"},
+        "related_tools": ["memory_find", "context_search"],
+        "performance": {
+            "typical_latency_ms": (50, 500),
+            "requires_index": False,
+            "requires_decoder": False,
+        },
+    },
+    "memory_find": {
+        "name": "memory_find",
+        "category": "memory",
+        "primary_use": "Retrieve stored memories by similarity",
+        "choose_when": [
+            "Looking for stored notes/decisions",
+            "Recalling team knowledge",
+        ],
+        "choose_instead": {"context_search": "Want code + memories together"},
+        "parameters": {
+            "essential": ["query"],
+            "common": ["limit", "kind", "topic", "tags"],
+            "advanced": ["priority_min", "collection"],
+        },
+        "returns": {
+            "ok": "bool",
+            "results": "list[{id, information, metadata, score}]",
+            "total": "int",
+        },
+        "related_tools": ["memory_store", "context_search"],
+        "performance": {
+            "typical_latency_ms": (50, 300),
+            "requires_index": False,
+            "requires_decoder": False,
+        },
+    },
+}
+
 _security_settings = (
     TransportSecuritySettings(enable_dns_rebinding_protection=False)
     if TransportSecuritySettings
@@ -144,7 +197,6 @@ def _ensure_once(name: str) -> bool:
 )
 mcp = FastMCP(name="memory-server", transport_security=_security_settings)
 
-# Capture tool registry automatically by wrapping the decorator once
 _TOOLS_REGISTRY: list[dict] = []
 try:
     _orig_tool = mcp.tool
@@ -251,7 +303,12 @@ def do_GET(self):
                         self.send_response(200)
                         self.send_header("Content-Type", "application/json")
                         self.end_headers()
-                        payload = {"ok": True, "tools": _TOOLS_REGISTRY}
+                        enriched = []
+                        for t in _TOOLS_REGISTRY:
+                            name = t.get("name", "")
+                            meta = TOOLS_METADATA.get(name, {})
+                            enriched.append({**t, **meta})
+                        payload = {"ok": True, "tools": enriched, "metadata": TOOLS_METADATA}
                         self.wfile.write((json.dumps(payload)).encode("utf-8"))
                     else:
                         self.send_response(404)
@@ -436,6 +493,12 @@ def set_session_defaults(
     mode: Optional[str] = None,
     language: Optional[str] = None,
     under: Optional[str] = None,
+    repo: Any = None,
+    compact: Any = None,
+    output_format: Optional[str] = None,
+    include_snippet: Any = None,
+    rerank_enabled: Any = None,
+    limit: Any = None,
     ctx: Context = None,
     kwargs: Any = None,
 ) -> Dict[str, Any]:
@@ -447,6 +510,19 @@ def set_session_defaults(
     - Optionally, also supports a lightweight token for clients that prefer cross-connection reuse.
 
     Precedence everywhere: explicit collection > per-connection defaults > token defaults > env default.
+
+    Parameters:
+    - collection: Default collection name
+    - mode: Search mode hint
+    - under: Default path prefix filter
+    - language: Default language filter
+    - repo: Default repo filter for multi-repo setups
+    - compact: Default compact response mode (bool)
+    - output_format: Default output format ("json" or "toon")
+    - include_snippet: Default snippet inclusion (bool)
+    - rerank_enabled: Default reranking toggle (bool)
+    - limit: Default result limit (int)
+    - session: Session token for cross-connection reuse
     """
     # Handle kwargs payload from some clients
     try:
@@ -467,6 +543,18 @@ def set_session_defaults(
                 under = _extra["under"]
             if not session and _extra.get("session"):
                 session = _extra["session"]
+            if repo is None and _extra.get("repo"):
+                repo = _extra["repo"]
+            if compact is None and _extra.get("compact") is not None:
+                compact = _extra["compact"]
+            if not output_format and _extra.get("output_format"):
+                output_format = _extra["output_format"]
+            if include_snippet is None and _extra.get("include_snippet") is not None:
+                include_snippet = _extra["include_snippet"]
+            if rerank_enabled is None and _extra.get("rerank_enabled") is not None:
+                rerank_enabled = _extra["rerank_enabled"]
+            if limit is None and _extra.get("limit") is not None:
+                limit = _extra["limit"]
     except Exception as e:
         logger.debug(f"Suppressed exception: {e}")
 
@@ -480,6 +568,23 @@ def set_session_defaults(
         defaults["language"] = language.strip()
     if isinstance(under, str) and under.strip():
         defaults["under"] = under.strip()
+    if isinstance(repo, str) and repo.strip():
+        defaults["repo"] = repo.strip()
+    elif isinstance(repo, list):
+        defaults["repo"] = repo
+    if isinstance(output_format, str) and output_format.strip():
+        defaults["output_format"] = output_format.strip()
+    if compact is not None:
+        defaults["compact"] = bool(compact) if not isinstance(compact, bool) else compact
+    if include_snippet is not None:
+        defaults["include_snippet"] = bool(include_snippet) if not isinstance(include_snippet, bool) else include_snippet
+    if rerank_enabled is not None:
+        defaults["rerank_enabled"] = bool(rerank_enabled) if not isinstance(rerank_enabled, bool) else rerank_enabled
+    if limit is not None:
+        try:
+            defaults["limit"] = int(limit)
+        except (ValueError, TypeError):
+            pass
 
     # Store per-connection (preferred, no token required)
     try:
@@ -521,9 +626,107 @@ def memory_store(
     session: Optional[str] = None,
     ctx: Context = None,
 ) -> Dict[str, Any]:
-    """Store a memory entry into Qdrant (dual vectors consistent with indexer).
+    """Store knowledge/notes into the memory system for later retrieval.
+
+    PRIMARY USE: Persist team knowledge, decisions, conventions, or notes
+    that should be retrievable alongside code search results.
+
+    CHOOSE THIS WHEN:
+    - You want to store a decision or convention for future reference
+    - You're documenting why code works a certain way
+    - You want to persist knowledge that context_search can find
+    - You're building institutional memory for the codebase
+
+    WHAT TO STORE:
+    Good candidates for memory storage:
+      - Architecture decisions: "We use JWT for auth because..."
+      - Conventions: "All API responses follow the envelope pattern..."
+      - Gotchas: "The cache has a 5-minute TTL, not configurable..."
+      - Debugging notes: "If X fails, check Y first..."
+      - Integration details: "External API requires header Z..."
+      - Performance notes: "This query is O(n^2), optimize for large N..."
+
+    Bad candidates (don't store these):
+      - Code itself (it's already indexed)
+      - Temporary debug output
+      - Personal notes not relevant to the codebase
+      - Sensitive data (passwords, keys, secrets)
+
+    ESSENTIAL PARAMETERS:
+    - information (str): The knowledge/note to store. Should be clear,
+      self-contained text that will be useful when retrieved later.
+
+    METADATA PARAMETERS:
+    - metadata (dict): Optional structured metadata for filtering.
+      Common keys:
+      - kind: "note", "decision", "convention", "gotcha", "policy"
+      - topic: Subject area ("auth", "caching", "api", "database")
+      - priority: Importance (1=low, 5=high)
+      - tags: List of tags for filtering
+      - author: Who wrote this note
+
+      Auto-added if not provided:
+      - created_at: ISO timestamp
+      - kind: "memory" (default)
+      - source: "memory" (default)
+
+    SESSION PARAMETERS:
+    - collection (str): Target collection. Defaults to workspace collection.
+    - session (str): Session token for multi-user scenarios.
+
+    RETURNS:
+    {
+        "ok": true,
+        "id": "abc123...",           // Unique ID for this memory
+        "message": "Successfully stored information",
+        "collection": "codebase",
+        "vector": "bge-base-en-v1-5"  // Embedding model used
+    }
+
+    USAGE PATTERNS:
+
+    # Store an architecture decision
+    memory_store(
+        information="We chose FastAPI over Flask because we need async support
+        for the WebSocket handlers and automatic OpenAPI documentation.",
+        metadata={
+            "kind": "decision",
+            "topic": "api",
+            "tags": ["framework", "architecture"]
+        }
+    )
+
+    # Store a debugging gotcha
+    memory_store(
+        information="If authentication fails silently, check that the JWT_SECRET
+        env var is set. The auth middleware swallows exceptions.",
+        metadata={
+            "kind": "gotcha",
+            "topic": "auth",
+            "priority": 4
+        }
+    )
 
-    First call may be slower because the embedding model loads lazily.
+    # Store a convention
+    memory_store(
+        information="All database queries must use parameterized statements.
+        Raw string interpolation is forbidden for security.",
+        metadata={
+            "kind": "convention",
+            "topic": "database",
+            "tags": ["security", "sql"]
+        }
+    )
+
+    RETRIEVAL:
+    Stored memories can be retrieved via:
+    - memory_find(query="...") -> searches only memories
+    - context_search(query="...", include_memories=True) -> code + memories
+
+    NOTES:
+    - First call may be slower due to embedding model loading
+    - Memories are embedded using the same model as code for consistent search
+    - Duplicate content is not deduplicated; avoid storing the same thing twice
     """
     sess = _require_auth_session(session)
     coll = _resolve_collection(collection, session=session, ctx=ctx)
@@ -587,10 +790,97 @@ def memory_find(
     priority_min: Optional[int] = None,
     ctx: Context = None,
 ) -> Dict[str, Any]:
-    """Find memory-like entries by vector similarity (dense + lexical fusion).
+    """Retrieve stored memories/notes by semantic similarity.
+
+    PRIMARY USE: Find previously stored knowledge, decisions, or notes.
+    Searches ONLY the memory store, not code.
+
+    CHOOSE THIS WHEN:
+    - You want to find previously stored notes/decisions
+    - You're looking for team knowledge without code results
+    - You want to filter memories by metadata (kind, topic, tags)
+    - You need to recall specific documented information
+
+    CHOOSE INSTEAD:
+    - context_search with include_memories=True -> when you want code + memories
+    - repo_search -> when you want code only, no memories
+
+    QUERY EXAMPLES:
+    Good queries (conceptual, knowledge-seeking):
+      "authentication decisions"      - finds auth-related notes
+      "why we chose this approach"    - finds decision rationale
+      "database performance tips"     - finds DB-related notes
+      "API design conventions"        - finds API conventions
+      "deployment gotchas"            - finds deployment notes
+
+    Bad queries:
+      "def authenticate"              - code fragment, use repo_search
+      "src/auth.py"                   - file path, not a memory query
+      "UserService"                   - class name, use repo_search
+
+    ESSENTIAL PARAMETERS:
+    - query (str): Natural language description of what you're looking for.
+
+    ALTERNATIVE QUERY PARAMETERS:
+    - q (str): Alias for query.
+    - top_k (int): Alias for limit.
+
+    FILTER PARAMETERS:
+    - kind (str): Filter by memory kind.
+      Values: "note", "decision", "convention", "gotcha", "policy", "preference"
+    - topic (str): Filter by topic/subject area.
+      Example: "auth", "database", "api", "caching"
+    - tags (str | list[str]): Filter by tags.
+      Example: "security" or ["security", "sql"]
+    - language (str): Filter by programming language context.
+    - priority_min (int): Minimum priority (1-5). Higher = more important.
+
+    COMMON PARAMETERS:
+    - limit (int, default=5): Maximum results to return.
+    - collection (str): Target collection. Defaults to workspace collection.
+    - session (str): Session token for multi-user scenarios.
+
+    RETURNS:
+    {
+        "ok": true,
+        "results": [
+            {
+                "id": "abc123...",
+                "information": "We chose JWT for authentication because...",
+                "metadata": {
+                    "kind": "decision",
+                    "topic": "auth",
+                    "created_at": "2024-01-15T10:30:00Z",
+                    "tags": ["security", "architecture"]
+                },
+                "score": 0.85,
+                "highlights": ["...chose <<JWT>> for <<authentication>>..."]
+            }
+        ],
+        "total": 3,
+        "count": 3,
+        "query": "authentication decisions"
+    }
+
+    USAGE PATTERNS:
+
+    # Find all authentication-related notes
+    memory_find(query="authentication", topic="auth")
+
+    # Find high-priority gotchas
+    memory_find(query="common issues", kind="gotcha", priority_min=4)
+
+    # Find security-related conventions
+    memory_find(query="security best practices", kind="convention", tags="security")
+
+    # Find recent decisions
+    memory_find(query="recent architecture decisions", kind="decision", limit=10)
 
-    Cold-start option: set MEMORY_COLD_SKIP_DENSE=1 to skip dense embedding until the
-    model is cached (useful on slow storage).
+    NOTES:
+    - Cold start: First call may be slower if embedding model isn't cached
+    - Set MEMORY_COLD_SKIP_DENSE=1 to skip dense embedding on cold start
+    - Highlights show query term matches in context
+    - Results are ranked by hybrid similarity (dense + lexical fusion)
     """
     # Handle 'q' alias for query
     if not query and q:
@@ -612,7 +902,7 @@ def memory_find(
     lex = _lex_hash_vector_text(str(query), LEX_VECTOR_DIM)
 
     # Harmonize alias: top_k -> limit
-    lim = int(limit if limit is not None else (top_k if top_k is not None else 5))
+    lim = int(limit if limit is not None else (top_k if top_k is not None else MEMORY_FIND_LIMIT_DEFAULT))
 
     # Build Qdrant filter
     must = []

From 0d02a102dd201ad297d41afc0bc30b75582366d5 Mon Sep 17 00:00:00 2001
From: John Donalson <mirlok@dr.com>
Date: Sun, 25 Jan 2026 12:44:00 -0500
Subject: [PATCH 04/10] Add ASGI middleware for Authorization header token
 support

Introduces an ASGI middleware to extract the Authorization header from HTTP requests and store the token in a context variable, enabling stateless token-based authentication. Updates mcp_auth.py to support direct admin/shared token authentication and modifies both mcp_indexer_server.py and mcp_memory_server.py to inject the middleware for non-stdio transports.
---
 scripts/mcp_auth.py           | 25 ++++++++++++++++
 scripts/mcp_indexer_server.py | 56 +++++++++++++++++++++++++++++++++++
 scripts/mcp_memory_server.py  | 52 ++++++++++++++++++++++++++++++++
 3 files changed, 133 insertions(+)

diff --git a/scripts/mcp_auth.py b/scripts/mcp_auth.py
index 2b13c791..fea2f06b 100644
--- a/scripts/mcp_auth.py
+++ b/scripts/mcp_auth.py
@@ -1,3 +1,4 @@
+import contextvars
 import os
 from typing import Any, Dict, Optional
 
@@ -8,6 +9,9 @@
     class ValidationError(Exception):
         pass
 
+# Context variable for Authorization header token (set by HTTP middleware)
+AUTH_HEADER_TOKEN: contextvars.ContextVar[str] = contextvars.ContextVar("auth_header_token", default="")
+
 
 try:
     from scripts.auth_backend import (
@@ -47,13 +51,34 @@ def _has_collection_access(
     in {"1", "true", "yes", "on"}
 )
 
+# Direct token auth: allow admin/shared tokens to bypass session lookup
+_AUTH_ADMIN_TOKEN = (os.environ.get("CTXCE_AUTH_ADMIN_TOKEN") or "").strip()
+_AUTH_SHARED_TOKEN = (os.environ.get("CTXCE_AUTH_SHARED_TOKEN") or "").strip()
+
 
 def require_auth_session(session: Optional[str]) -> Optional[Dict[str, Any]]:
     if not AUTH_ENABLED_AUTH:
         return None
     sid = (session or "").strip()
+    
+    # If no session arg provided, try to get token from HTTP header context var
+    if not sid:
+        sid = AUTH_HEADER_TOKEN.get()
+    
     if not sid:
         raise ValidationError("Missing session for authorized operation")
+    
+    # Strip "Bearer " prefix if present (from HTTP Authorization header)
+    if sid.lower().startswith("bearer "):
+        sid = sid[7:].strip()
+    
+    # Check direct admin/shared token first (bypass session DB lookup)
+    if _AUTH_ADMIN_TOKEN and sid == _AUTH_ADMIN_TOKEN:
+        return {"user_id": "admin", "role": "admin", "token_type": "admin"}
+    if _AUTH_SHARED_TOKEN and sid == _AUTH_SHARED_TOKEN:
+        return {"user_id": "shared", "role": "user", "token_type": "shared"}
+    
+    # Fall back to session database lookup
     info = _auth_validate_session(sid)
     if not info:
         raise ValidationError("Invalid or expired session")
diff --git a/scripts/mcp_indexer_server.py b/scripts/mcp_indexer_server.py
index 99891017..29b7d6f6 100644
--- a/scripts/mcp_indexer_server.py
+++ b/scripts/mcp_indexer_server.py
@@ -136,6 +136,7 @@ def _json_dumps_bytes(obj) -> bytes:
 from scripts.mcp_auth import (
     require_auth_session as _require_auth_session,
     require_collection_access as _require_collection_access,
+    AUTH_HEADER_TOKEN as _AUTH_HEADER_TOKEN,
 )
 
 # ---------------------------------------------------------------------------
@@ -565,6 +566,56 @@ def _highlight_snippet(snippet, tokens):  # type: ignore
 )
 mcp = FastMCP(APP_NAME, transport_security=_security_settings)
 
+class _AuthHeaderASGIMiddleware:
+    """Pure ASGI middleware that extracts Authorization header into context var."""
+    def __init__(self, app):
+        self.app = app
+    
+    async def __call__(self, scope, receive, send):
+        if scope["type"] == "http":
+            headers = dict(scope.get("headers", []))
+            auth_header = headers.get(b"authorization", b"").decode("utf-8", errors="ignore")
+            if auth_header.lower().startswith("bearer "):
+                token = auth_header[7:].strip()
+            else:
+                token = auth_header.strip() if auth_header else ""
+            _AUTH_HEADER_TOKEN.set(token)
+        return await self.app(scope, receive, send)
+
+
+def _add_auth_middleware():
+    """Wrap FastMCP's ASGI app with auth header extraction middleware.
+    
+    FastMCP creates its Starlette app lazily. We wrap mcp.run() to inject
+    middleware after the app is created but before uvicorn starts.
+    """
+    _original_run = mcp.run
+    _middleware_added = [False]
+    
+    def _patched_run(*args, **kwargs):
+        if not _middleware_added[0]:
+            # FastMCP creates the app when run() is called
+            # Check if app property exists and try to wrap it
+            for attr in ("_app", "app", "_asgi_app", "_sse_app", "_http_app"):
+                try:
+                    original_app = getattr(mcp, attr, None)
+                    if original_app is not None and callable(original_app):
+                        wrapped = _AuthHeaderASGIMiddleware(original_app)
+                        setattr(mcp, attr, wrapped)
+                        logger.info(f"Auth header middleware added (wrapped mcp.{attr})")
+                        _middleware_added[0] = True
+                        break
+                except Exception as e:
+                    logger.debug(f"Could not wrap mcp.{attr}: {e}")
+            
+            if not _middleware_added[0]:
+                logger.warning("Could not add auth middleware - app not found")
+        
+        return _original_run(*args, **kwargs)
+    
+    mcp.run = _patched_run
+    logger.debug("Patched mcp.run() for auth middleware injection")
+
 
 # Capture tool registry automatically by wrapping the decorator once
 _TOOLS_REGISTRY: list[dict] = []
@@ -3312,6 +3363,11 @@ async def neo4j_graph_query(
     transport = os.environ.get("FASTMCP_TRANSPORT", "sse").strip().lower()
     # Enable stateless HTTP mode to avoid session handshake requirement
     stateless_http = str(os.environ.get("FASTMCP_STATELESS_HTTP", "1")).strip().lower() in {"1", "true", "yes", "on"}
+    
+    # Add auth header extraction middleware for HTTP transports
+    if transport != "stdio":
+        _add_auth_middleware()
+    
     if transport == "stdio":
         # Run over stdio (for clients that don't support network transports)
         mcp.run(transport="stdio")
diff --git a/scripts/mcp_memory_server.py b/scripts/mcp_memory_server.py
index 05e0c759..ff2b17cc 100644
--- a/scripts/mcp_memory_server.py
+++ b/scripts/mcp_memory_server.py
@@ -43,6 +43,7 @@
 from scripts.mcp_auth import (
     require_auth_session as _require_auth_session,
     require_collection_access as _require_collection_access,
+    AUTH_HEADER_TOKEN as _AUTH_HEADER_TOKEN,
 )
 
 from qdrant_client import QdrantClient, models
@@ -197,6 +198,52 @@ def _ensure_once(name: str) -> bool:
 )
 mcp = FastMCP(name="memory-server", transport_security=_security_settings)
 
+
+class _AuthHeaderASGIMiddleware:
+    """Pure ASGI middleware that extracts Authorization header into context var."""
+    def __init__(self, app):
+        self.app = app
+    
+    async def __call__(self, scope, receive, send):
+        if scope["type"] == "http":
+            headers = dict(scope.get("headers", []))
+            auth_header = headers.get(b"authorization", b"").decode("utf-8", errors="ignore")
+            if auth_header.lower().startswith("bearer "):
+                token = auth_header[7:].strip()
+            else:
+                token = auth_header.strip() if auth_header else ""
+            _AUTH_HEADER_TOKEN.set(token)
+        return await self.app(scope, receive, send)
+
+
+def _add_auth_middleware():
+    """Wrap FastMCP's ASGI app with auth header extraction middleware."""
+    _original_run = mcp.run
+    _middleware_added = [False]
+    
+    def _patched_run(*args, **kwargs):
+        if not _middleware_added[0]:
+            for attr in ("_app", "app", "_asgi_app", "_sse_app", "_http_app"):
+                try:
+                    original_app = getattr(mcp, attr, None)
+                    if original_app is not None and callable(original_app):
+                        wrapped = _AuthHeaderASGIMiddleware(original_app)
+                        setattr(mcp, attr, wrapped)
+                        logger.info(f"Auth header middleware added (wrapped mcp.{attr})")
+                        _middleware_added[0] = True
+                        break
+                except Exception as e:
+                    logger.debug(f"Could not wrap mcp.{attr}: {e}")
+            
+            if not _middleware_added[0]:
+                logger.warning("Could not add auth middleware - app not found")
+        
+        return _original_run(*args, **kwargs)
+    
+    mcp.run = _patched_run
+    logger.debug("Patched mcp.run() for auth middleware injection")
+
+
 _TOOLS_REGISTRY: list[dict] = []
 try:
     _orig_tool = mcp.tool
@@ -1119,6 +1166,11 @@ def _resolve_collection(
 
     # Enable stateless HTTP mode to avoid session handshake requirement
     stateless_http = str(os.environ.get("FASTMCP_STATELESS_HTTP", "1")).strip().lower() in {"1", "true", "yes", "on"}
+    
+    # Add auth header extraction middleware for HTTP transports
+    if transport != "stdio":
+        _add_auth_middleware()
+    
     if transport == "stdio":
         # Run over stdio (for clients that don't support network transports)
         mcp.run(transport="stdio")

From b3132ed38fbcb11e9110a6b08425fafa241c1a59 Mon Sep 17 00:00:00 2001
From: John Donalson <mirlok@dr.com>
Date: Sun, 25 Jan 2026 12:54:20 -0500
Subject: [PATCH 05/10] Fix auth middleware: patch uvicorn.run() to wrap app
 after FastMCP creates it

---
 scripts/mcp_indexer_server.py | 41 ++++++++++++-----------------------
 scripts/mcp_memory_server.py  | 35 ++++++++++--------------------
 2 files changed, 26 insertions(+), 50 deletions(-)

diff --git a/scripts/mcp_indexer_server.py b/scripts/mcp_indexer_server.py
index 29b7d6f6..04510324 100644
--- a/scripts/mcp_indexer_server.py
+++ b/scripts/mcp_indexer_server.py
@@ -586,35 +586,22 @@ async def __call__(self, scope, receive, send):
 def _add_auth_middleware():
     """Wrap FastMCP's ASGI app with auth header extraction middleware.
     
-    FastMCP creates its Starlette app lazily. We wrap mcp.run() to inject
-    middleware after the app is created but before uvicorn starts.
+    FastMCP creates its Starlette app lazily inside run(). We intercept uvicorn.run()
+    to wrap the app after FastMCP creates it but before uvicorn starts serving.
     """
-    _original_run = mcp.run
-    _middleware_added = [False]
-    
-    def _patched_run(*args, **kwargs):
-        if not _middleware_added[0]:
-            # FastMCP creates the app when run() is called
-            # Check if app property exists and try to wrap it
-            for attr in ("_app", "app", "_asgi_app", "_sse_app", "_http_app"):
-                try:
-                    original_app = getattr(mcp, attr, None)
-                    if original_app is not None and callable(original_app):
-                        wrapped = _AuthHeaderASGIMiddleware(original_app)
-                        setattr(mcp, attr, wrapped)
-                        logger.info(f"Auth header middleware added (wrapped mcp.{attr})")
-                        _middleware_added[0] = True
-                        break
-                except Exception as e:
-                    logger.debug(f"Could not wrap mcp.{attr}: {e}")
-            
-            if not _middleware_added[0]:
-                logger.warning("Could not add auth middleware - app not found")
+    try:
+        import uvicorn
+        _original_uvicorn_run = uvicorn.run
         
-        return _original_run(*args, **kwargs)
-    
-    mcp.run = _patched_run
-    logger.debug("Patched mcp.run() for auth middleware injection")
+        def _patched_uvicorn_run(app, **kwargs):
+            wrapped_app = _AuthHeaderASGIMiddleware(app)
+            logger.info("Auth header ASGI middleware injected via uvicorn.run() patch")
+            return _original_uvicorn_run(wrapped_app, **kwargs)
+        
+        uvicorn.run = _patched_uvicorn_run
+        logger.debug("Patched uvicorn.run() for auth middleware injection")
+    except Exception as e:
+        logger.warning(f"Failed to patch uvicorn for auth middleware: {e}")
 
 
 # Capture tool registry automatically by wrapping the decorator once
diff --git a/scripts/mcp_memory_server.py b/scripts/mcp_memory_server.py
index ff2b17cc..b366334b 100644
--- a/scripts/mcp_memory_server.py
+++ b/scripts/mcp_memory_server.py
@@ -218,30 +218,19 @@ async def __call__(self, scope, receive, send):
 
 def _add_auth_middleware():
     """Wrap FastMCP's ASGI app with auth header extraction middleware."""
-    _original_run = mcp.run
-    _middleware_added = [False]
-    
-    def _patched_run(*args, **kwargs):
-        if not _middleware_added[0]:
-            for attr in ("_app", "app", "_asgi_app", "_sse_app", "_http_app"):
-                try:
-                    original_app = getattr(mcp, attr, None)
-                    if original_app is not None and callable(original_app):
-                        wrapped = _AuthHeaderASGIMiddleware(original_app)
-                        setattr(mcp, attr, wrapped)
-                        logger.info(f"Auth header middleware added (wrapped mcp.{attr})")
-                        _middleware_added[0] = True
-                        break
-                except Exception as e:
-                    logger.debug(f"Could not wrap mcp.{attr}: {e}")
-            
-            if not _middleware_added[0]:
-                logger.warning("Could not add auth middleware - app not found")
+    try:
+        import uvicorn
+        _original_uvicorn_run = uvicorn.run
         
-        return _original_run(*args, **kwargs)
-    
-    mcp.run = _patched_run
-    logger.debug("Patched mcp.run() for auth middleware injection")
+        def _patched_uvicorn_run(app, **kwargs):
+            wrapped_app = _AuthHeaderASGIMiddleware(app)
+            logger.info("Auth header ASGI middleware injected via uvicorn.run() patch")
+            return _original_uvicorn_run(wrapped_app, **kwargs)
+        
+        uvicorn.run = _patched_uvicorn_run
+        logger.debug("Patched uvicorn.run() for auth middleware injection")
+    except Exception as e:
+        logger.warning(f"Failed to patch uvicorn for auth middleware: {e}")
 
 
 _TOOLS_REGISTRY: list[dict] = []

From 4dee7d1464b1430570bb77f9adaf5f902a4613f5 Mon Sep 17 00:00:00 2001
From: John Donalson <mirlok@dr.com>
Date: Sun, 25 Jan 2026 13:00:49 -0500
Subject: [PATCH 06/10] Add verbose logging to auth middleware setup

---
 scripts/mcp_indexer_server.py | 6 ++++--
 1 file changed, 4 insertions(+), 2 deletions(-)

diff --git a/scripts/mcp_indexer_server.py b/scripts/mcp_indexer_server.py
index 04510324..0e625ec1 100644
--- a/scripts/mcp_indexer_server.py
+++ b/scripts/mcp_indexer_server.py
@@ -589,17 +589,19 @@ def _add_auth_middleware():
     FastMCP creates its Starlette app lazily inside run(). We intercept uvicorn.run()
     to wrap the app after FastMCP creates it but before uvicorn starts serving.
     """
+    logger.info("Setting up auth header middleware...")
     try:
         import uvicorn
         _original_uvicorn_run = uvicorn.run
         
         def _patched_uvicorn_run(app, **kwargs):
+            logger.info(f"uvicorn.run() intercepted, wrapping app: {type(app).__name__}")
             wrapped_app = _AuthHeaderASGIMiddleware(app)
-            logger.info("Auth header ASGI middleware injected via uvicorn.run() patch")
+            logger.info("Auth header ASGI middleware injected successfully")
             return _original_uvicorn_run(wrapped_app, **kwargs)
         
         uvicorn.run = _patched_uvicorn_run
-        logger.debug("Patched uvicorn.run() for auth middleware injection")
+        logger.info("Patched uvicorn.run() for auth middleware injection")
     except Exception as e:
         logger.warning(f"Failed to patch uvicorn for auth middleware: {e}")
 

From e5ed2c86b7ee725ff1f8ce7645b5481e974b8709 Mon Sep 17 00:00:00 2001
From: John Donalson <mirlok@dr.com>
Date: Sun, 25 Jan 2026 13:05:20 -0500
Subject: [PATCH 07/10] Update .gitignore

---
 .gitignore | 1 +
 1 file changed, 1 insertion(+)

diff --git a/.gitignore b/.gitignore
index 2ddfb00d..9cc39e63 100644
--- a/.gitignore
+++ b/.gitignore
@@ -64,3 +64,4 @@ deploy/eks-cdk/
 ctx_config.json
 /deploy/eks-cdk
 /deploy/eks-cdk-PATHFUL
+.env

From b9dfca1f5d39cd39e89e8fb5152a12efbb110c97 Mon Sep 17 00:00:00 2001
From: John Donalson <mirlok@dr.com>
Date: Sun, 25 Jan 2026 13:07:01 -0500
Subject: [PATCH 08/10] Fix auth middleware: patch FastMCP app factory methods
 (streamable_http_app, sse_app)

---
 scripts/mcp_indexer_server.py | 32 ++++++++++++++++++++------------
 scripts/mcp_memory_server.py  | 26 +++++++++++++++++---------
 2 files changed, 37 insertions(+), 21 deletions(-)

diff --git a/scripts/mcp_indexer_server.py b/scripts/mcp_indexer_server.py
index 0e625ec1..934f2274 100644
--- a/scripts/mcp_indexer_server.py
+++ b/scripts/mcp_indexer_server.py
@@ -586,24 +586,32 @@ async def __call__(self, scope, receive, send):
 def _add_auth_middleware():
     """Wrap FastMCP's ASGI app with auth header extraction middleware.
     
-    FastMCP creates its Starlette app lazily inside run(). We intercept uvicorn.run()
-    to wrap the app after FastMCP creates it but before uvicorn starts serving.
+    FastMCP calls streamable_http_app() or sse_app() to create the Starlette app.
+    We patch these methods to wrap the returned app with our middleware.
     """
     logger.info("Setting up auth header middleware...")
     try:
-        import uvicorn
-        _original_uvicorn_run = uvicorn.run
+        # Patch streamable_http_app
+        if hasattr(mcp, "streamable_http_app"):
+            _orig_streamable = mcp.streamable_http_app
+            def _patched_streamable(*args, **kwargs):
+                app = _orig_streamable(*args, **kwargs)
+                logger.info(f"Wrapping streamable_http_app with auth middleware")
+                return _AuthHeaderASGIMiddleware(app)
+            mcp.streamable_http_app = _patched_streamable
         
-        def _patched_uvicorn_run(app, **kwargs):
-            logger.info(f"uvicorn.run() intercepted, wrapping app: {type(app).__name__}")
-            wrapped_app = _AuthHeaderASGIMiddleware(app)
-            logger.info("Auth header ASGI middleware injected successfully")
-            return _original_uvicorn_run(wrapped_app, **kwargs)
+        # Patch sse_app for SSE transport
+        if hasattr(mcp, "sse_app"):
+            _orig_sse = mcp.sse_app
+            def _patched_sse(*args, **kwargs):
+                app = _orig_sse(*args, **kwargs)
+                logger.info(f"Wrapping sse_app with auth middleware")
+                return _AuthHeaderASGIMiddleware(app)
+            mcp.sse_app = _patched_sse
         
-        uvicorn.run = _patched_uvicorn_run
-        logger.info("Patched uvicorn.run() for auth middleware injection")
+        logger.info("Patched FastMCP app factory methods for auth middleware injection")
     except Exception as e:
-        logger.warning(f"Failed to patch uvicorn for auth middleware: {e}")
+        logger.warning(f"Failed to patch FastMCP for auth middleware: {e}")
 
 
 # Capture tool registry automatically by wrapping the decorator once
diff --git a/scripts/mcp_memory_server.py b/scripts/mcp_memory_server.py
index b366334b..61880669 100644
--- a/scripts/mcp_memory_server.py
+++ b/scripts/mcp_memory_server.py
@@ -218,19 +218,27 @@ async def __call__(self, scope, receive, send):
 
 def _add_auth_middleware():
     """Wrap FastMCP's ASGI app with auth header extraction middleware."""
+    logger.info("Setting up auth header middleware...")
     try:
-        import uvicorn
-        _original_uvicorn_run = uvicorn.run
+        if hasattr(mcp, "streamable_http_app"):
+            _orig_streamable = mcp.streamable_http_app
+            def _patched_streamable(*args, **kwargs):
+                app = _orig_streamable(*args, **kwargs)
+                logger.info(f"Wrapping streamable_http_app with auth middleware")
+                return _AuthHeaderASGIMiddleware(app)
+            mcp.streamable_http_app = _patched_streamable
         
-        def _patched_uvicorn_run(app, **kwargs):
-            wrapped_app = _AuthHeaderASGIMiddleware(app)
-            logger.info("Auth header ASGI middleware injected via uvicorn.run() patch")
-            return _original_uvicorn_run(wrapped_app, **kwargs)
+        if hasattr(mcp, "sse_app"):
+            _orig_sse = mcp.sse_app
+            def _patched_sse(*args, **kwargs):
+                app = _orig_sse(*args, **kwargs)
+                logger.info(f"Wrapping sse_app with auth middleware")
+                return _AuthHeaderASGIMiddleware(app)
+            mcp.sse_app = _patched_sse
         
-        uvicorn.run = _patched_uvicorn_run
-        logger.debug("Patched uvicorn.run() for auth middleware injection")
+        logger.info("Patched FastMCP app factory methods for auth middleware injection")
     except Exception as e:
-        logger.warning(f"Failed to patch uvicorn for auth middleware: {e}")
+        logger.warning(f"Failed to patch FastMCP for auth middleware: {e}")
 
 
 _TOOLS_REGISTRY: list[dict] = []

From 4ab940e73c1a923677574ae0b785551db6452a82 Mon Sep 17 00:00:00 2001
From: John Donalson <mirlok@dr.com>
Date: Sun, 25 Jan 2026 14:30:39 -0500
Subject: [PATCH 09/10] Add codebase volume and improve auth session validation

Introduces a new codebase volume to Kubernetes and Helm deployment templates for MCP memory services, ensuring proper mounting and separation of code metadata. Enhances authentication logic in mcp_auth.py to support auto-fallback to shared tokens and adds a new /auth/validate endpoint in upload_service.py for remote session validation, improving distributed authentication support.
---
 .../templates/mcp-memory-http.yaml            |  5 ++
 deploy/kubernetes/mcp-http.yaml               | 14 +++---
 deploy/kubernetes/mcp-memory.yaml             | 11 +++--
 scripts/mcp_auth.py                           | 37 ++++++++++-----
 scripts/upload_service.py                     | 46 +++++++++++++++++++
 5 files changed, 92 insertions(+), 21 deletions(-)

diff --git a/deploy/helm/context-engine/templates/mcp-memory-http.yaml b/deploy/helm/context-engine/templates/mcp-memory-http.yaml
index d05b10b8..c05f3827 100644
--- a/deploy/helm/context-engine/templates/mcp-memory-http.yaml
+++ b/deploy/helm/context-engine/templates/mcp-memory-http.yaml
@@ -107,6 +107,8 @@ spec:
             - name: work-volume
               mountPath: /work
               readOnly: true
+            - name: codebase-volume
+              mountPath: /work/.codebase
             - name: metadata-volume
               mountPath: /tmp/rerank_weights
               subPath: rerank_weights
@@ -117,6 +119,9 @@ spec:
         - name: work-volume
           persistentVolumeClaim:
             claimName: {{ .Values.persistence.codeRepos.name }}
+        - name: codebase-volume
+          persistentVolumeClaim:
+            claimName: {{ .Values.persistence.codeMetadata.name }}
         - name: metadata-volume
           persistentVolumeClaim:
             claimName: {{ .Values.persistence.codeMetadata.name }}
diff --git a/deploy/kubernetes/mcp-http.yaml b/deploy/kubernetes/mcp-http.yaml
index c3c71fe2..c8ace6ed 100644
--- a/deploy/kubernetes/mcp-http.yaml
+++ b/deploy/kubernetes/mcp-http.yaml
@@ -30,10 +30,10 @@ spec:
         command:
         - sh
         - -c
-        - mkdir -p /mnt/rerank_weights /mnt/rerank_events && chmod 777 /mnt/rerank_weights /mnt/rerank_events
+        - mkdir -p /work/.codebase/rerank_weights /work/.codebase/rerank_events && chmod 777 /work/.codebase/rerank_weights /work/.codebase/rerank_events
         volumeMounts:
-        - name: metadata-volume
-          mountPath: /mnt
+        - name: codebase-volume
+          mountPath: /work/.codebase
       containers:
       - name: mcp-memory-http
         image: context-engine-memory
@@ -108,10 +108,12 @@ spec:
         - name: work-volume
           mountPath: /work
           readOnly: true
-        - name: metadata-volume
+        - name: codebase-volume
+          mountPath: /work/.codebase
+        - name: codebase-volume
           mountPath: /tmp/rerank_weights
           subPath: rerank_weights
-        - name: metadata-volume
+        - name: codebase-volume
           mountPath: /tmp/rerank_events
           subPath: rerank_events
         livenessProbe:
@@ -133,7 +135,7 @@ spec:
       - name: work-volume
         persistentVolumeClaim:
           claimName: code-repos-pvc
-      - name: metadata-volume
+      - name: codebase-volume
         persistentVolumeClaim:
           claimName: code-metadata-pvc
 ---
diff --git a/deploy/kubernetes/mcp-memory.yaml b/deploy/kubernetes/mcp-memory.yaml
index 165076db..5f34ff1f 100644
--- a/deploy/kubernetes/mcp-memory.yaml
+++ b/deploy/kubernetes/mcp-memory.yaml
@@ -26,10 +26,10 @@ spec:
         command:
         - sh
         - -c
-        - mkdir -p /mnt/rerank_weights /mnt/rerank_events && chmod 777 /mnt/rerank_weights /mnt/rerank_events
+        - mkdir -p /work/.codebase/rerank_weights /work/.codebase/rerank_events && chmod 777 /work/.codebase/rerank_weights /work/.codebase/rerank_events
         volumeMounts:
-        - name: metadata-volume
-          mountPath: /mnt
+        - name: codebase-volume
+          mountPath: /work/.codebase
       containers:
       - name: mcp-memory
         image: context-engine-memory
@@ -85,6 +85,8 @@ spec:
         - name: work-volume
           mountPath: /work
           readOnly: true
+        - name: codebase-volume
+          mountPath: /work/.codebase
         - name: metadata-volume
           mountPath: /tmp/rerank_weights
           subPath: rerank_weights
@@ -110,6 +112,9 @@ spec:
       - name: work-volume
         persistentVolumeClaim:
           claimName: code-repos-pvc
+      - name: codebase-volume
+        persistentVolumeClaim:
+          claimName: code-metadata-pvc
       - name: metadata-volume
         persistentVolumeClaim:
           claimName: code-metadata-pvc
diff --git a/scripts/mcp_auth.py b/scripts/mcp_auth.py
index fea2f06b..ad224705 100644
--- a/scripts/mcp_auth.py
+++ b/scripts/mcp_auth.py
@@ -55,34 +55,47 @@ def _has_collection_access(
 _AUTH_ADMIN_TOKEN = (os.environ.get("CTXCE_AUTH_ADMIN_TOKEN") or "").strip()
 _AUTH_SHARED_TOKEN = (os.environ.get("CTXCE_AUTH_SHARED_TOKEN") or "").strip()
 
+# Auto-fallback: when enabled, use shared token if no session/header provided
+_AUTH_AUTO_SHARED = (
+    str(os.environ.get("CTXCE_AUTH_AUTO_SHARED", "0")).strip().lower()
+    in {"1", "true", "yes", "on"}
+)
+
 
 def require_auth_session(session: Optional[str]) -> Optional[Dict[str, Any]]:
     if not AUTH_ENABLED_AUTH:
         return None
     sid = (session or "").strip()
     
-    # If no session arg provided, try to get token from HTTP header context var
     if not sid:
         sid = AUTH_HEADER_TOKEN.get()
     
-    if not sid:
-        raise ValidationError("Missing session for authorized operation")
-    
-    # Strip "Bearer " prefix if present (from HTTP Authorization header)
-    if sid.lower().startswith("bearer "):
+    if sid and sid.lower().startswith("bearer "):
         sid = sid[7:].strip()
     
-    # Check direct admin/shared token first (bypass session DB lookup)
     if _AUTH_ADMIN_TOKEN and sid == _AUTH_ADMIN_TOKEN:
         return {"user_id": "admin", "role": "admin", "token_type": "admin"}
     if _AUTH_SHARED_TOKEN and sid == _AUTH_SHARED_TOKEN:
         return {"user_id": "shared", "role": "user", "token_type": "shared"}
     
-    # Fall back to session database lookup
-    info = _auth_validate_session(sid)
-    if not info:
-        raise ValidationError("Invalid or expired session")
-    return info
+    if not sid:
+        if _AUTH_AUTO_SHARED and _AUTH_SHARED_TOKEN:
+            return {"user_id": "shared", "role": "user", "token_type": "shared_auto"}
+        raise ValidationError("Missing session for authorized operation")
+    
+    # Try local session validation first
+    try:
+        info = _auth_validate_session(sid)
+        if info:
+            return info
+    except Exception:
+        pass
+
+    # Session not found locally - fall back to shared token if auto-shared is enabled
+    if _AUTH_AUTO_SHARED and _AUTH_SHARED_TOKEN:
+        return {"user_id": "shared", "role": "user", "token_type": "shared_auto"}
+
+    raise ValidationError("Invalid or expired session")
 
 
 def require_collection_access(user_id: Optional[str], collection: str, perm: str) -> None:
diff --git a/scripts/upload_service.py b/scripts/upload_service.py
index 3c718ffd..fc8e66fa 100644
--- a/scripts/upload_service.py
+++ b/scripts/upload_service.py
@@ -303,6 +303,17 @@ class AuthUserCreateResponse(BaseModel):
     username: str
 
 
+class AuthValidateRequest(BaseModel):
+    session_id: str
+
+
+class AuthValidateResponse(BaseModel):
+    valid: bool
+    user_id: Optional[str] = None
+    expires_at: Optional[int] = None
+    metadata: Optional[Dict[str, Any]] = None
+
+
 class PasswordLoginRequest(BaseModel):
     username: str
     password: str
@@ -599,6 +610,41 @@ async def auth_login(payload: AuthLoginRequest):
     )
 
 
+@app.post("/auth/validate", response_model=AuthValidateResponse)
+async def auth_validate(payload: AuthValidateRequest):
+    """Validate a session ID and return session info if valid.
+
+    This endpoint allows remote MCP servers to validate sessions against the
+    auth backend that issued them, enabling distributed auth validation.
+    """
+    try:
+        if not AUTH_ENABLED:
+            # When auth is disabled, all sessions are considered valid
+            return AuthValidateResponse(valid=True, user_id=None, expires_at=None, metadata=None)
+
+        sid = (payload.session_id or "").strip()
+        if not sid:
+            return AuthValidateResponse(valid=False)
+
+        try:
+            record = validate_session(sid)
+        except AuthDisabledError:
+            return AuthValidateResponse(valid=True, user_id=None, expires_at=None, metadata=None)
+
+        if record is None:
+            return AuthValidateResponse(valid=False)
+
+        return AuthValidateResponse(
+            valid=True,
+            user_id=record.get("user_id"),
+            expires_at=record.get("expires_at"),
+            metadata=record.get("metadata"),
+        )
+    except Exception as e:
+        logger.error(f"[upload_service] Failed to validate session: {e}")
+        raise HTTPException(status_code=500, detail="Failed to validate session")
+
+
 @app.get("/admin")
 async def admin_root(request: Request):
     if not AUTH_ENABLED:

From df941c1b055a6000d3b14a56410c702aa5e57861 Mon Sep 17 00:00:00 2001
From: John Donalson <mirlok@dr.com>
Date: Sun, 25 Jan 2026 14:43:21 -0500
Subject: [PATCH 10/10] Update tests for asyncio usage and docstring assertion

Replaced deprecated 'get_event_loop().run_until_complete' with 'asyncio.run' in test_context_answer_path_mention.py and test_env_behavior.py. Updated docstring assertion in test_globs_and_snippet.py to match new content. Modified _DummyQdrantClient in test_qdrant_client_manager_pool.py to accept additional kwargs for improved flexibility.
---
 tests/test_context_answer_path_mention.py | 2 +-
 tests/test_env_behavior.py                | 4 +++-
 tests/test_globs_and_snippet.py           | 2 +-
 tests/test_qdrant_client_manager_pool.py  | 2 +-
 4 files changed, 6 insertions(+), 4 deletions(-)

diff --git a/tests/test_context_answer_path_mention.py b/tests/test_context_answer_path_mention.py
index 59299b8d..b0e80f75 100644
--- a/tests/test_context_answer_path_mention.py
+++ b/tests/test_context_answer_path_mention.py
@@ -30,7 +30,7 @@ def generate_with_soft_embeddings(self, prompt: str, max_tokens: int = 64, **kw)
 
     # Mention an actual file in this repo so fallback can find it
     q = "explain something in scripts/hybrid_search.py"
-    out = srv.asyncio.get_event_loop().run_until_complete(
+    out = srv.asyncio.run(
         srv.context_answer(query=q, limit=3, per_path=2)
     )
     assert isinstance(out, dict)
diff --git a/tests/test_env_behavior.py b/tests/test_env_behavior.py
index b5daa6f3..3803f210 100644
--- a/tests/test_env_behavior.py
+++ b/tests/test_env_behavior.py
@@ -20,6 +20,8 @@ def test_rerank_timeout_floor_and_env_defaults(monkeypatch):
     monkeypatch.setenv("RERANK_TIMEOUT_FLOOR_MS", "1500")
     # Fix default timeout for test determinism (CI may set a higher value)
     monkeypatch.setenv("RERANKER_TIMEOUT_MS", "200")
+    # Override the min clamp so the floor takes effect
+    monkeypatch.setenv("RERANK_TIMEOUT_MIN_MS", "0")
 
     # Fake _run_async to capture calls
     calls = []
@@ -47,7 +49,7 @@ async def fake_run(cmd, env=None, timeout=None):
     monkeypatch.setattr(srv, "_run_async", fake_run)
 
     # Call repo_search with no rerank_enabled arg to pick env default
-    res = srv.asyncio.get_event_loop().run_until_complete(
+    res = srv.asyncio.run(
         srv.repo_search(query="foo", limit=3, per_path=1)
     )
 
diff --git a/tests/test_globs_and_snippet.py b/tests/test_globs_and_snippet.py
index 2486e1a0..9eb7fcbb 100644
--- a/tests/test_globs_and_snippet.py
+++ b/tests/test_globs_and_snippet.py
@@ -223,6 +223,6 @@ def run_hybrid_search(**kwargs):
 @pytest.mark.unit
 def test_repo_search_docstring_clean():
     doc = srv.repo_search.__doc__
-    assert doc and "Zero-config code search" in doc
+    assert doc and "Primary hybrid semantic" in doc
     # Ensure stray inline pseudo-code is not embedded in docstring
     assert "Accept common alias keys from clients" not in doc
diff --git a/tests/test_qdrant_client_manager_pool.py b/tests/test_qdrant_client_manager_pool.py
index 85408783..7cc4c9b2 100644
--- a/tests/test_qdrant_client_manager_pool.py
+++ b/tests/test_qdrant_client_manager_pool.py
@@ -18,7 +18,7 @@
 
 
 class _DummyQdrantClient:
-    def __init__(self, url=None, api_key=None):
+    def __init__(self, url=None, api_key=None, **kwargs):
         self.url = url
         self.api_key = api_key
         self.closed = False