diff --git a/.gitignore b/.gitignore
index 2ddfb00d..9cc39e63 100644
--- a/.gitignore
+++ b/.gitignore
@@ -64,3 +64,4 @@ deploy/eks-cdk/
 ctx_config.json
 /deploy/eks-cdk
 /deploy/eks-cdk-PATHFUL
+.env
diff --git a/.indexignore b/.indexignore
index 6ab86f92..b4da8f56 100644
--- a/.indexignore
+++ b/.indexignore
@@ -2,3 +2,18 @@ cosqa*.json
 # dev-workspace contains uploaded client workspaces - they get indexed
 # separately via upload service, not as part of the main Context-Engine repo
 dev-workspace/
+
+# CDK/deploy build artifacts - duplicates of source files
+deploy/eks-cdk-*/cdk.out/
+**/cdk.out/
+
+# Build/dist artifacts
+dist/
+build/
+*.egg-info/
+
+# IDE/editor artifacts
+.idea/
+.vscode/
+*.swp
+*.swo
diff --git a/ctx-mcp-bridge/package.json b/ctx-mcp-bridge/package.json
index 6c4e93cd..df69fa17 100644
--- a/ctx-mcp-bridge/package.json
+++ b/ctx-mcp-bridge/package.json
@@ -1,6 +1,6 @@
 {
   "name": "@context-engine-bridge/context-engine-mcp-bridge",
-  "version": "0.0.16",
+  "version": "0.0.17",
   "description": "Context Engine MCP bridge (http/stdio proxy combining indexer + memory servers)",
   "bin": {
     "ctxce": "bin/ctxce.js",
diff --git a/ctx-mcp-bridge/src/mcpServer.js b/ctx-mcp-bridge/src/mcpServer.js
index 2fe1b101..d2426d3b 100644
--- a/ctx-mcp-bridge/src/mcpServer.js
+++ b/ctx-mcp-bridge/src/mcpServer.js
@@ -126,7 +126,8 @@ function selectClientForTool(name, indexerClient, memoryClient) {
     return indexerClient;
   }
   const lowered = name.toLowerCase();
-  if (memoryClient && (lowered.startsWith("memory.") || lowered.startsWith("mcp_memory_"))) {
+  // Route to memory server for any memory-prefixed tool
+  if (memoryClient && lowered.startsWith("memory")) {
     return memoryClient;
   }
   return indexerClient;
diff --git a/deploy/helm/context-engine/templates/mcp-memory-http.yaml b/deploy/helm/context-engine/templates/mcp-memory-http.yaml
index d05b10b8..c05f3827 100644
--- a/deploy/helm/context-engine/templates/mcp-memory-http.yaml
+++ b/deploy/helm/context-engine/templates/mcp-memory-http.yaml
@@ -107,6 +107,8 @@ spec:
             - name: work-volume
               mountPath: /work
               readOnly: true
+            - name: codebase-volume
+              mountPath: /work/.codebase
             - name: metadata-volume
               mountPath: /tmp/rerank_weights
               subPath: rerank_weights
@@ -117,6 +119,9 @@ spec:
         - name: work-volume
           persistentVolumeClaim:
             claimName: {{ .Values.persistence.codeRepos.name }}
+        - name: codebase-volume
+          persistentVolumeClaim:
+            claimName: {{ .Values.persistence.codeMetadata.name }}
         - name: metadata-volume
           persistentVolumeClaim:
             claimName: {{ .Values.persistence.codeMetadata.name }}
diff --git a/deploy/helm/context-engine/values-example.yaml b/deploy/helm/context-engine/values-example.yaml
index 94dd017e..4b669359 100644
--- a/deploy/helm/context-engine/values-example.yaml
+++ b/deploy/helm/context-engine/values-example.yaml
@@ -52,15 +52,15 @@ mcpIndexerHttp:
   replicas: 1
   resources:
     requests:
-      cpu: 250m
+      cpu: 500m
       memory: 8Gi
     limits:
-      cpu: "1"
+      cpu: "2"
       memory: 16Gi
   autoscaling:
     enabled: true
     minReplicas: 1
-    maxReplicas: 4
+    maxReplicas: 2
 
 # MCP Memory HTTP
 mcpMemoryHttp:
@@ -68,45 +68,52 @@ mcpMemoryHttp:
   replicas: 1
   resources:
     requests:
-      cpu: 250m
-      memory: 512Mi
+      cpu: 500m
+      memory: 1Gi
     limits:
-      cpu: "1"
-      memory: 2Gi
+      cpu: "1500m"
+      memory: 3Gi
   autoscaling:
     enabled: true
     minReplicas: 1
-    maxReplicas: 3
+    maxReplicas: 1
 
 # Upload Service
 uploadService:
   enabled: true
   replicas: 1
+  resources:
+    requests:
+      cpu: 250m
+      memory: 1Gi
+    limits:
+      cpu: "1500m"
+      memory: 3Gi
   autoscaling:
     enabled: true
     minReplicas: 1
-    maxReplicas: 3
+    maxReplicas: 2
 
 # Watcher
 watcher:
   enabled: true
-  replicas: 1
+  replicas: 2
   resources:
     requests:
       cpu: 500m
-      memory: 2Gi
+      memory: 3Gi
     limits:
-      cpu: "2"
-      memory: 8Gi
+      cpu: "2500m"
+      memory: 10Gi
 
-# Learning Reranker Worker
+# Learning Reranker Worker (singleton - only 1 can run due to leader election)
 learningRerankerWorker:
   enabled: true
   replicas: 1
   autoscaling:
     enabled: true
     minReplicas: 1
-    maxReplicas: 3
+    maxReplicas: 1
 
 # Persistence - shared PVCs
 persistence:
diff --git a/deploy/helm/context-engine/values.yaml b/deploy/helm/context-engine/values.yaml
index 757c02bb..bdfc11be 100644
--- a/deploy/helm/context-engine/values.yaml
+++ b/deploy/helm/context-engine/values.yaml
@@ -140,10 +140,10 @@ mcpIndexerHttp:
   # -- Resource requests and limits
   resources:
     requests:
-      cpu: 250m
+      cpu: 500m
       memory: 8Gi
     limits:
-      cpu: "1"
+      cpu: "2"
       memory: 16Gi
   # -- Liveness probe
   livenessProbe:
@@ -167,7 +167,7 @@ mcpIndexerHttp:
   autoscaling:
     enabled: true
     minReplicas: 1
-    maxReplicas: 4
+    maxReplicas: 2
     targetCPUUtilizationPercentage: 70
     targetMemoryUtilizationPercentage: 80
   # -- Topology spread constraints
@@ -207,11 +207,11 @@ mcpMemoryHttp:
   # -- Resource requests and limits
   resources:
     requests:
-      cpu: 250m
-      memory: 512Mi
+      cpu: 500m
+      memory: 1Gi
     limits:
-      cpu: "1"
-      memory: 2Gi
+      cpu: "1500m"
+      memory: 3Gi
   # -- Liveness probe
   livenessProbe:
     httpGet:
@@ -230,7 +230,7 @@ mcpMemoryHttp:
   autoscaling:
     enabled: true
     minReplicas: 1
-    maxReplicas: 3
+    maxReplicas: 1
     targetCPUUtilizationPercentage: 70
     targetMemoryUtilizationPercentage: 80
   # -- Topology spread constraints
@@ -273,10 +273,10 @@ uploadService:
   resources:
     requests:
       cpu: 250m
-      memory: 512Mi
+      memory: 1Gi
     limits:
-      cpu: "1"
-      memory: 2Gi
+      cpu: "1500m"
+      memory: 3Gi
   # -- Environment variables
   env:
     UPLOAD_SERVICE_HOST: "0.0.0.0"
@@ -288,7 +288,7 @@ uploadService:
   autoscaling:
     enabled: true
     minReplicas: 1
-    maxReplicas: 3
+    maxReplicas: 2
     targetCPUUtilizationPercentage: 70
     targetMemoryUtilizationPercentage: 80
   # -- Topology spread constraints
@@ -305,7 +305,7 @@ watcher:
   # -- Enable Watcher
   enabled: true
   # -- Number of replicas
-  replicas: 1
+  replicas: 2
   # -- Command to run
   command:
     - python
@@ -316,10 +316,10 @@ watcher:
   resources:
     requests:
       cpu: 500m
-      memory: 2Gi
+      memory: 3Gi
     limits:
-      cpu: "2"
-      memory: 8Gi
+      cpu: "2500m"
+      memory: 10Gi
   # -- Environment variables (in addition to configmap)
   env:
     WATCH_ROOT: /work
@@ -343,7 +343,7 @@ watcher:
 learningRerankerWorker:
   # -- Enable Learning Reranker Worker
   enabled: true
-  # -- Number of replicas
+  # -- Number of replicas (singleton worker with leader election - only 1 can run)
   replicas: 1
   # -- Command to run
   command:
@@ -358,11 +358,11 @@ learningRerankerWorker:
     limits:
       cpu: "1"
       memory: 2Gi
-  # -- HPA configuration
+  # -- HPA configuration (capped at 1 - singleton worker)
   autoscaling:
     enabled: true
     minReplicas: 1
-    maxReplicas: 3
+    maxReplicas: 1
     targetCPUUtilizationPercentage: 70
     targetMemoryUtilizationPercentage: 80
   # -- Topology spread constraints
diff --git a/deploy/kubernetes/mcp-http.yaml b/deploy/kubernetes/mcp-http.yaml
index c3c71fe2..c8ace6ed 100644
--- a/deploy/kubernetes/mcp-http.yaml
+++ b/deploy/kubernetes/mcp-http.yaml
@@ -30,10 +30,10 @@ spec:
         command:
         - sh
         - -c
-        - mkdir -p /mnt/rerank_weights /mnt/rerank_events && chmod 777 /mnt/rerank_weights /mnt/rerank_events
+        - mkdir -p /work/.codebase/rerank_weights /work/.codebase/rerank_events && chmod 777 /work/.codebase/rerank_weights /work/.codebase/rerank_events
         volumeMounts:
-        - name: metadata-volume
-          mountPath: /mnt
+        - name: codebase-volume
+          mountPath: /work/.codebase
       containers:
       - name: mcp-memory-http
         image: context-engine-memory
@@ -108,10 +108,12 @@ spec:
         - name: work-volume
           mountPath: /work
           readOnly: true
-        - name: metadata-volume
+        - name: codebase-volume
+          mountPath: /work/.codebase
+        - name: codebase-volume
           mountPath: /tmp/rerank_weights
           subPath: rerank_weights
-        - name: metadata-volume
+        - name: codebase-volume
           mountPath: /tmp/rerank_events
           subPath: rerank_events
         livenessProbe:
@@ -133,7 +135,7 @@ spec:
       - name: work-volume
         persistentVolumeClaim:
           claimName: code-repos-pvc
-      - name: metadata-volume
+      - name: codebase-volume
         persistentVolumeClaim:
           claimName: code-metadata-pvc
 ---
diff --git a/deploy/kubernetes/mcp-memory.yaml b/deploy/kubernetes/mcp-memory.yaml
index 165076db..5f34ff1f 100644
--- a/deploy/kubernetes/mcp-memory.yaml
+++ b/deploy/kubernetes/mcp-memory.yaml
@@ -26,10 +26,10 @@ spec:
         command:
         - sh
         - -c
-        - mkdir -p /mnt/rerank_weights /mnt/rerank_events && chmod 777 /mnt/rerank_weights /mnt/rerank_events
+        - mkdir -p /work/.codebase/rerank_weights /work/.codebase/rerank_events && chmod 777 /work/.codebase/rerank_weights /work/.codebase/rerank_events
         volumeMounts:
-        - name: metadata-volume
-          mountPath: /mnt
+        - name: codebase-volume
+          mountPath: /work/.codebase
       containers:
       - name: mcp-memory
         image: context-engine-memory
@@ -85,6 +85,8 @@ spec:
         - name: work-volume
           mountPath: /work
           readOnly: true
+        - name: codebase-volume
+          mountPath: /work/.codebase
         - name: metadata-volume
           mountPath: /tmp/rerank_weights
           subPath: rerank_weights
@@ -110,6 +112,9 @@ spec:
       - name: work-volume
         persistentVolumeClaim:
           claimName: code-repos-pvc
+      - name: codebase-volume
+        persistentVolumeClaim:
+          claimName: code-metadata-pvc
       - name: metadata-volume
         persistentVolumeClaim:
           claimName: code-metadata-pvc
diff --git a/scripts/hybrid/qdrant.py b/scripts/hybrid/qdrant.py
index 039ca6f1..498366e9 100644
--- a/scripts/hybrid/qdrant.py
+++ b/scripts/hybrid/qdrant.py
@@ -17,19 +17,73 @@
 import logging
 import threading
 import re
-from typing import List, Dict, Any, Tuple
+import time
+from typing import List, Dict, Any, Tuple, Optional, Callable, TypeVar
 from pathlib import Path
 from concurrent.futures import ThreadPoolExecutor
 
-# Core Qdrant imports
+# Core Qdrant imports (optional in some runtimes)
 try:
     from qdrant_client import QdrantClient, models
-except ImportError:
+except ImportError:  # pragma: no cover
     QdrantClient = None  # type: ignore
     models = None  # type: ignore
 
+try:
+    from qdrant_client.http.exceptions import ResponseHandlingException
+except ImportError:  # pragma: no cover
+    ResponseHandlingException = None  # type: ignore
+
+try:  # pragma: no cover - optional dependency
+    import httpx
+except ImportError:
+    httpx = None  # type: ignore
+
+try:  # pragma: no cover - optional dependency
+    import httpcore
+except ImportError:
+    httpcore = None  # type: ignore
+
 logger = logging.getLogger("hybrid_qdrant")
 
+
+def _is_timeout_exception(exc: Exception) -> bool:
+    """Return True when ``exc`` is, or wraps, an HTTP/Qdrant timeout.
+
+    A qdrant-client ``ResponseHandlingException`` is judged by its chained cause
+    (``__cause__`` or ``__context__``) when it has one, otherwise by whether its
+    message contains "timeout". Any other exception is matched against the
+    httpx/httpcore timeout classes that are importable, then ``TimeoutError``.
+    """
+    if ResponseHandlingException and isinstance(exc, ResponseHandlingException):
+        inner = exc.__cause__ or exc.__context__
+        if inner is None or inner is exc:
+            return "timeout" in str(exc).lower()
+        return _is_timeout_exception(inner)
+    # Collect whichever timeout classes the optional HTTP libraries expose.
+    candidates = [
+        getattr(lib, attr, None)
+        for lib in (httpx, httpcore) if lib is not None
+        for attr in ("TimeoutException", "ReadTimeout")
+    ]
+    if any(cls and isinstance(exc, cls) for cls in candidates):
+        return True
+    return isinstance(exc, TimeoutError)
+
+
+def _log_qdrant_timeout(kind: str, collection: Optional[str], detail: Exception) -> None:
+    """Warn that a ``kind`` Qdrant query on ``collection`` timed out, recording ``detail``."""
+    # ``detail`` is logged with %r so the underlying timeout exception is visible in the warning.
+    msg = "Qdrant %s query timed out for collection %s; returning partial results (%r)"
+    logger.warning(msg, kind, collection or "(unknown)", detail)
+
+
+def _handle_timeout(kind: str, collection: Optional[str], exc: Exception) -> bool:
+    timed_out = _is_timeout_exception(exc)
+    if timed_out:  # log only timeouts; the caller decides what to do with other errors
+        _log_qdrant_timeout(kind, collection, exc)
+    return timed_out
+
 # ---------------------------------------------------------------------------
 # Helper functions for safe type conversion
 # ---------------------------------------------------------------------------
@@ -75,6 +129,10 @@ def _safe_float(val: Any, default: float) -> float:
 )
 
 EF_SEARCH = _safe_int(os.environ.get("QDRANT_EF_SEARCH", "128"), 128)
+_MAX_QDRANT_CONCURRENCY = max(1, _safe_int(os.environ.get("QDRANT_MAX_CONCURRENCY", "6"), 6))  # never below one slot
+_SEMAPHORE_LOG_THRESHOLD = _safe_float(os.environ.get("QDRANT_SEMAPHORE_LOG_THRESHOLD", "0.5"), 0.5)  # seconds; parsed via _safe_float rather than bare float() so a malformed value does not fail the import
+_QDRANT_REQUEST_SEMAPHORE = threading.BoundedSemaphore(_MAX_QDRANT_CONCURRENCY)  # caps in-flight Qdrant queries
+T = TypeVar("T")
 
 # Quantization search params (for faster search with quantized collections)
 QDRANT_QUANTIZATION = os.environ.get("QDRANT_QUANTIZATION", "none").strip().lower()
@@ -95,6 +153,24 @@ def _get_search_params(ef: int) -> models.SearchParams:
     return models.SearchParams(hnsw_ef=ef)
 
 
+def _with_qdrant_slot(kind: str, fn: Callable[[], T]) -> T:
+    """Run ``fn`` while holding one of the bounded Qdrant request slots.
+
+    Blocks until one of the ``_MAX_QDRANT_CONCURRENCY`` slots is free, logs waits of
+    ``_SEMAPHORE_LOG_THRESHOLD`` seconds or more at debug level, and returns ``fn()``.
+    """
+    wait_start = time.perf_counter()
+    _QDRANT_REQUEST_SEMAPHORE.acquire()
+    try:  # entered right after acquire() so the slot is released on every exit path
+        waited = time.perf_counter() - wait_start
+        if waited >= _SEMAPHORE_LOG_THRESHOLD:
+            slow_msg = "Qdrant %s query waited %.3fs for slot (max=%s)"
+            logger.debug(slow_msg, kind, waited, _MAX_QDRANT_CONCURRENCY)
+        return fn()
+    finally:
+        _QDRANT_REQUEST_SEMAPHORE.release()
+
+
 # ---------------------------------------------------------------------------
 # Connection pooling setup
 # ---------------------------------------------------------------------------
@@ -191,7 +267,9 @@ def _legacy_vector_search(
             query_filter=flt,
         )
         return _coerce_points(getattr(result, "points", result))
-    except Exception:
+    except Exception as exc:
+        if _handle_timeout("legacy", collection, exc):
+            return []
         return []
 
 
@@ -469,57 +547,71 @@ def lex_query(
         return []
 
     try:
-        qp = client.query_points(
-            collection_name=collection,
-            query=v,
-            using=LEX_VECTOR_NAME,
-            query_filter=flt,
-            search_params=_get_search_params(ef),
-            limit=per_query,
-            with_payload=True,
+        qp = _with_qdrant_slot(
+            "lex",
+            lambda: client.query_points(
+                collection_name=collection,
+                query=v,
+                using=LEX_VECTOR_NAME,
+                query_filter=flt,
+                search_params=_get_search_params(ef),
+                limit=per_query,
+                with_payload=True,
+            ),
         )
         return _coerce_points(getattr(qp, "points", qp))
     except TypeError:
         if os.environ.get("DEBUG_HYBRID_SEARCH"):
             logger.debug("QP_FILTER_KWARG_SWITCH", extra={"using": LEX_VECTOR_NAME})
-        qp = client.query_points(
-            collection_name=collection,
-            query=v,
-            using=LEX_VECTOR_NAME,
-            filter=flt,
-            search_params=_get_search_params(ef),
-            limit=per_query,
-            with_payload=True,
+        qp = _with_qdrant_slot(
+            "lex",
+            lambda: client.query_points(
+                collection_name=collection,
+                query=v,
+                using=LEX_VECTOR_NAME,
+                filter=flt,
+                search_params=_get_search_params(ef),
+                limit=per_query,
+                with_payload=True,
+            ),
         )
         return _coerce_points(getattr(qp, "points", qp))
     except AttributeError:
         return _legacy_vector_search(client, collection, LEX_VECTOR_NAME, v, per_query, flt)
     except Exception as e:
+        if _handle_timeout("lex", collection, e):
+            return []
         if os.environ.get("DEBUG_HYBRID_SEARCH"):
             try:
                 logger.debug("QP_FILTER_DROP", extra={"using": LEX_VECTOR_NAME, "reason": str(e)[:200]})
             except Exception as e:
                 logger.debug(f"Suppressed exception: {e}")
         try:
-            qp = client.query_points(
-                collection_name=collection,
-                query=v,
-                using=LEX_VECTOR_NAME,
-                query_filter=None,
-                search_params=_get_search_params(ef),
-                limit=per_query,
-                with_payload=True,
+            qp = _with_qdrant_slot(
+                "lex",
+                lambda: client.query_points(
+                    collection_name=collection,
+                    query=v,
+                    using=LEX_VECTOR_NAME,
+                    query_filter=None,
+                    search_params=_get_search_params(ef),
+                    limit=per_query,
+                    with_payload=True,
+                ),
             )
             return _coerce_points(getattr(qp, "points", qp))
         except TypeError:
-            qp = client.query_points(
-                collection_name=collection,
-                query=v,
-                using=LEX_VECTOR_NAME,
-                filter=None,
-                search_params=_get_search_params(ef),
-                limit=per_query,
-                with_payload=True,
+            qp = _with_qdrant_slot(
+                "lex",
+                lambda: client.query_points(
+                    collection_name=collection,
+                    query=v,
+                    using=LEX_VECTOR_NAME,
+                    filter=None,
+                    search_params=_get_search_params(ef),
+                    limit=per_query,
+                    with_payload=True,
+                ),
             )
             return _coerce_points(getattr(qp, "points", qp))
         except Exception as e2:
@@ -553,35 +645,43 @@ def sparse_lex_query(
         return []
 
     try:
-        qp = client.query_points(
-            collection_name=collection,
-            query=models.SparseVector(
-                indices=sparse_vec["indices"],
-                values=sparse_vec["values"],
-            ),
-            using=LEX_SPARSE_NAME,
-            query_filter=flt,
-            limit=per_query,
-            with_payload=True,
-        )
-        return _coerce_points(getattr(qp, "points", qp))
-    except TypeError:
-        try:
-            qp = client.query_points(
+        qp = _with_qdrant_slot(
+            "sparse",
+            lambda: client.query_points(
                 collection_name=collection,
                 query=models.SparseVector(
                     indices=sparse_vec["indices"],
                     values=sparse_vec["values"],
                 ),
                 using=LEX_SPARSE_NAME,
-                filter=flt,
+                query_filter=flt,
                 limit=per_query,
                 with_payload=True,
+            ),
+        )
+        return _coerce_points(getattr(qp, "points", qp))
+    except TypeError:
+        try:
+            qp = _with_qdrant_slot(
+                "sparse",
+                lambda: client.query_points(
+                    collection_name=collection,
+                    query=models.SparseVector(
+                        indices=sparse_vec["indices"],
+                        values=sparse_vec["values"],
+                    ),
+                    using=LEX_SPARSE_NAME,
+                    filter=flt,
+                    limit=per_query,
+                    with_payload=True,
+                ),
             )
             return _coerce_points(getattr(qp, "points", qp))
         except Exception:
             return []
     except Exception as e:
+        if _handle_timeout("sparse", collection, e):
+            return []
         if os.environ.get("DEBUG_HYBRID_SEARCH"):
             logger.debug("SPARSE_LEX_QUERY_ERROR", extra={"error": str(e)[:200]})
         return []
@@ -624,30 +724,38 @@ def dense_query(
         return []
 
     try:
-        qp = client.query_points(
-            collection_name=collection,
-            query=v,
-            using=vec_name,
-            query_filter=flt,
-            search_params=_get_search_params(ef),
-            limit=per_query,
-            with_payload=True,
+        qp = _with_qdrant_slot(
+            "dense",
+            lambda: client.query_points(
+                collection_name=collection,
+                query=v,
+                using=vec_name,
+                query_filter=flt,
+                search_params=_get_search_params(ef),
+                limit=per_query,
+                with_payload=True,
+            ),
         )
         return _coerce_points(getattr(qp, "points", qp))
     except TypeError:
         if os.environ.get("DEBUG_HYBRID_SEARCH"):
             logger.debug("QP_FILTER_KWARG_SWITCH", extra={"using": vec_name})
-        qp = client.query_points(
-            collection_name=collection,
-            query=v,
-            using=vec_name,
-            filter=flt,
-            search_params=_get_search_params(ef),
-            limit=per_query,
-            with_payload=True,
+        qp = _with_qdrant_slot(
+            "dense",
+            lambda: client.query_points(
+                collection_name=collection,
+                query=v,
+                using=vec_name,
+                filter=flt,
+                search_params=_get_search_params(ef),
+                limit=per_query,
+                with_payload=True,
+            ),
         )
         return _coerce_points(getattr(qp, "points", qp))
     except Exception as e:
+        if _handle_timeout("dense", collection, e):
+            return []
         if os.environ.get("DEBUG_HYBRID_SEARCH"):
             try:
                 logger.debug("QP_FILTER_DROP", extra={"using": vec_name, "reason": str(e)[:200]})
@@ -656,29 +764,37 @@ def dense_query(
         if not collection:
             return _legacy_vector_search(client, _collection(), vec_name, v, per_query, flt)
         try:
-            qp = client.query_points(
-                collection_name=collection,
-                query=v,
-                using=vec_name,
-                query_filter=None,
-                search_params=_get_search_params(ef),
-                limit=per_query,
-                with_payload=True,
-            )
-            return _coerce_points(getattr(qp, "points", qp))
-        except TypeError:
-            try:
-                qp = client.query_points(
+            qp = _with_qdrant_slot(
+                "dense",
+                lambda: client.query_points(
                     collection_name=collection,
                     query=v,
                     using=vec_name,
-                    filter=None,
+                    query_filter=None,
                     search_params=_get_search_params(ef),
                     limit=per_query,
                     with_payload=True,
+                ),
+            )
+            return _coerce_points(getattr(qp, "points", qp))
+        except TypeError:
+            try:
+                qp = _with_qdrant_slot(
+                    "dense",
+                    lambda: client.query_points(
+                        collection_name=collection,
+                        query=v,
+                        using=vec_name,
+                        filter=None,
+                        search_params=_get_search_params(ef),
+                        limit=per_query,
+                        with_payload=True,
+                    ),
                 )
                 return _coerce_points(getattr(qp, "points", qp))
             except Exception as e2:
+                if _handle_timeout("dense", collection, e2):
+                    return []
                 if os.environ.get("DEBUG_HYBRID_SEARCH"):
                     try:
                         logger.debug("QP_FILTER_DROP_FAILED", extra={"using": vec_name, "reason": str(e2)[:200]})
diff --git a/scripts/mcp_auth.py b/scripts/mcp_auth.py
index 2b13c791..ad224705 100644
--- a/scripts/mcp_auth.py
+++ b/scripts/mcp_auth.py
@@ -1,3 +1,4 @@
+import contextvars
 import os
 from typing import Any, Dict, Optional
 
@@ -8,6 +9,9 @@
     class ValidationError(Exception):
         pass
 
+# Context variable for Authorization header token (set by HTTP middleware)
+AUTH_HEADER_TOKEN: contextvars.ContextVar[str] = contextvars.ContextVar("auth_header_token", default="")
+
 
 try:
     from scripts.auth_backend import (
@@ -47,17 +51,51 @@ def _has_collection_access(
     in {"1", "true", "yes", "on"}
 )
 
+# Direct token auth: allow admin/shared tokens to bypass session lookup
+_AUTH_ADMIN_TOKEN = (os.environ.get("CTXCE_AUTH_ADMIN_TOKEN") or "").strip()
+_AUTH_SHARED_TOKEN = (os.environ.get("CTXCE_AUTH_SHARED_TOKEN") or "").strip()
+
+# Auto-fallback: when enabled, use shared token if no session/header provided
+_AUTH_AUTO_SHARED = (
+    str(os.environ.get("CTXCE_AUTH_AUTO_SHARED", "0")).strip().lower()
+    in {"1", "true", "yes", "on"}
+)
+
 
 def require_auth_session(session: Optional[str]) -> Optional[Dict[str, Any]]:
+    """Resolve ``session`` (or the Authorization header token) to auth info.
+
+    Returns None when AUTH_ENABLED_AUTH is off; raises ValidationError when no
+    credential is supplied, or it cannot be validated and no shared fallback applies.
+    """
     if not AUTH_ENABLED_AUTH:
         return None
     sid = (session or "").strip()
     if not sid:
+        sid = AUTH_HEADER_TOKEN.get()
+    if sid and sid.lower().startswith("bearer "):
+        sid = sid[7:].strip()
+    # Constant-time comparison so the static tokens cannot be probed via timing.
+    from hmac import compare_digest
+    raw = sid.encode("utf-8")
+    if _AUTH_ADMIN_TOKEN and compare_digest(raw, _AUTH_ADMIN_TOKEN.encode("utf-8")):
+        return {"user_id": "admin", "role": "admin", "token_type": "admin"}
+    if _AUTH_SHARED_TOKEN and compare_digest(raw, _AUTH_SHARED_TOKEN.encode("utf-8")):
+        return {"user_id": "shared", "role": "user", "token_type": "shared"}
+    if not sid:
+        if _AUTH_AUTO_SHARED and _AUTH_SHARED_TOKEN:
+            return {"user_id": "shared", "role": "user", "token_type": "shared_auto"}
         raise ValidationError("Missing session for authorized operation")
-    info = _auth_validate_session(sid)
-    if not info:
-        raise ValidationError("Invalid or expired session")
-    return info
+    # Local validation first; a backend error counts as "not found" so the fallback below applies.
+    try:
+        info = _auth_validate_session(sid)
+        if info:
+            return info
+    except Exception:
+        pass
+    if _AUTH_AUTO_SHARED and _AUTH_SHARED_TOKEN:
+        return {"user_id": "shared", "role": "user", "token_type": "shared_auto"}
+    raise ValidationError("Invalid or expired session")
 
 
 def require_collection_access(user_id: Optional[str], collection: str, perm: str) -> None:
diff --git a/scripts/mcp_impl/context_answer.py b/scripts/mcp_impl/context_answer.py
index 272bb869..a7f434c5 100644
--- a/scripts/mcp_impl/context_answer.py
+++ b/scripts/mcp_impl/context_answer.py
@@ -57,6 +57,90 @@
 
 logger = logging.getLogger(__name__)
 
+# ---------------------------------------------------------------------------
+# Auto-memory storage for successful answers
+# ---------------------------------------------------------------------------
+
+# Minimum answer length to auto-store (default 200 chars)
+_AUTO_MEMORY_MIN_CHARS = int(os.environ.get("CONTEXT_ANSWER_AUTO_MEMORY_MIN_CHARS", "200") or 200)
+# Enable/disable auto-memory storage (default ON)
+_AUTO_MEMORY_ENABLED = os.environ.get("CONTEXT_ANSWER_AUTO_MEMORY", "1").strip().lower() in {"1", "true", "yes", "on"}
+
+
+def _maybe_store_answer_as_memory(
+    answer: str,
+    queries: List[str],
+    citations: List[Dict[str, Any]],
+    collection: Optional[str] = None,
+) -> None:
+    """Best-effort, non-blocking storage of a successful context_answer as a memory.
+
+    The answer is stored only when all of the following hold:
+    - _AUTO_MEMORY_ENABLED is True
+    - the stripped answer is non-empty and is not "insufficient context"
+    - there is at least one citation
+    - the stripped answer has at least _AUTO_MEMORY_MIN_CHARS characters
+
+    Args:
+        answer: Answer text; surrounding whitespace is ignored.
+        queries: Queries that produced the answer; joined with " | " in the memory text.
+        citations: Citation dicts; "path"/"rel_path" of the first five are listed as sources.
+        collection: Collection name forwarded to ``memory_store``.
+
+    Errors while building the payload or storing it are logged at debug level
+    instead of propagating; the write itself runs in a daemon thread.
+    """
+    if not _AUTO_MEMORY_ENABLED:
+        return
+
+    ans_clean = (answer or "").strip()
+    if not ans_clean or ans_clean.lower() == "insufficient context":
+        return
+    if not citations or len(ans_clean) < _AUTO_MEMORY_MIN_CHARS:
+        return
+
+    try:
+        # Snapshot the queries so the worker thread is unaffected by later caller mutation.
+        query_list = [str(q) for q in (queries or [])]
+        query_str = " | ".join(query_list) if query_list else "unknown query"
+        # Citation summary: paths only, first five citations, non-dict entries skipped.
+        cite_paths = []
+        for cit in citations[:5]:
+            p = (cit.get("path") or cit.get("rel_path") or "") if isinstance(cit, dict) else ""
+            if p:
+                cite_paths.append(p)
+        cite_summary = ", ".join(cite_paths) if cite_paths else "no paths"
+        memory_content = f"Q: {query_str}\n\nA: {ans_clean}\n\nSources: {cite_summary}"
+        metadata = {
+            "kind": "context_answer",
+            "source": "auto_memory",
+            "queries": query_list,
+            "citation_count": len(citations),
+            "answer_length": len(ans_clean),
+        }
+    except Exception as e:
+        logger.debug("Auto-memory preparation failed: %s", e)
+        return
+
+    import threading
+
+    def _store():
+        try:
+            # Import here to avoid circular imports
+            from scripts.mcp_memory_server import memory_store
+            memory_store(
+                information=memory_content,
+                metadata=metadata,
+                collection=collection,
+            )
+            logger.debug("Auto-stored context_answer as memory (len=%d, cites=%d)", len(ans_clean), len(citations))
+        except Exception as e:
+            # Best-effort: a storage failure must never surface to the caller
+            logger.debug("Auto-memory storage failed: %s", e)
+
+    threading.Thread(target=_store, daemon=True).start()
+
+
 # Keys to strip from citations for slim MCP output (agents only need path + rel_path)
 _VERBOSE_PATH_KEYS = ("host_path", "container_path", "client_path")
 
@@ -666,6 +750,14 @@ def _ca_prepare_filters_and_retrieve(
         "node_modules/",
         ".git/",
         ".git",
+        # CDK/deploy build artifacts (duplicates of source files)
+        "cdk.out/",
+        "**/cdk.out/**",
+        "deploy/eks-cdk-*/cdk.out/",
+        # Build artifacts
+        "dist/",
+        "build/",
+        "*.egg-info/",
     ]
 
     def _variants(p: str) -> list[str]:
@@ -2947,7 +3039,7 @@ def safe_float(val, default=0.0, **kw):
                     items=items,
                     collection=coll,
                     repo=repo,
-                    max_neighbors=2,
+                    max_neighbors=5,
                 )
             except Exception as e:
                 logger.debug(f"Subgraph context injection failed: {e}")
@@ -3373,4 +3465,13 @@ def _tok2(s: str) -> list[str]:
     }
     if answers_by_query:
         out["answers_by_query"] = answers_by_query
+
+    # Auto-store successful answers as memories (fire-and-forget)
+    _maybe_store_answer_as_memory(
+        answer=answer.strip(),
+        queries=original_queries,
+        citations=citations,
+        collection=collection,
+    )
+
     return out
diff --git a/scripts/mcp_impl/search.py b/scripts/mcp_impl/search.py
index bf49664c..a2565c32 100644
--- a/scripts/mcp_impl/search.py
+++ b/scripts/mcp_impl/search.py
@@ -62,6 +62,7 @@
     logger=logger,
     context="MCP_SNIPPET_MAX_BYTES",
 )
+SEARCH_COMPACT_DEFAULT = os.environ.get("SEARCH_COMPACT_DEFAULT", "0").lower() in {"1", "true", "yes", "on"}
 
 
 async def _repo_search_impl(
@@ -323,6 +324,9 @@ def _to_str(x, default=""):
     rerank_timeout_ms = _to_int(
         rerank_timeout_ms, int(os.environ.get("RERANKER_TIMEOUT_MS", "3000") or 3000)
     )
+    # Clamp rerank timeout to prevent unreasonably low deadlines
+    _MIN_RERANK_TIMEOUT_MS = int(os.environ.get("RERANK_TIMEOUT_MIN_MS", "10000") or 10000)
+    rerank_timeout_ms = max(rerank_timeout_ms, _MIN_RERANK_TIMEOUT_MS)
     highlight_snippet = _to_bool(highlight_snippet, True)
 
     # Resolve collection and related hints: explicit > per-connection defaults > token defaults > env
@@ -454,7 +458,7 @@ def _to_str_list(x):
             repo_filter = [detected_repo]
 
     compact_raw = compact
-    compact = _to_bool(compact, False)
+    compact = _to_bool(compact, SEARCH_COMPACT_DEFAULT)
     # If snippets are requested, do not compact (we need snippet field in results)
     if include_snippet:
         compact = False
@@ -795,6 +799,7 @@ def _match_glob(glob_pat: str, path_val: str) -> bool:
 
     # Optional rerank fallback path: if enabled, attempt; on timeout or error, keep hybrid
     used_rerank = False
+    learning_results = None  # May hold learning reranker output for fallback
     rerank_counters = {
         "inproc_hybrid": 0,
         "inproc_dense": 0,
@@ -870,6 +875,8 @@ def _match_glob(glob_pat: str, path_val: str) -> bool:
                         tmp.append(item)
 
                     if tmp:
+                        # Store learning results separately; may be used as fallback
+                        learning_results = tmp
                         results = tmp
                         used_rerank = True
                         rerank_counters["learning"] += 1
@@ -1191,6 +1198,12 @@ def _doc_for(obj: dict) -> str:
                     rerank_counters["error"] += 1
                     used_rerank = False
 
+    # Fallback to learning reranker results if subprocess failed but learning succeeded
+    if (not used_rerank) and learning_results:
+        results = learning_results
+        used_rerank = True
+        logger.debug("Falling back to learning reranker results after subprocess failure")
+
     if not used_rerank:
         # Build results from hybrid JSON lines
         for obj in json_lines:
diff --git a/scripts/mcp_indexer_server.py b/scripts/mcp_indexer_server.py
index 6d0ff832..934f2274 100644
--- a/scripts/mcp_indexer_server.py
+++ b/scripts/mcp_indexer_server.py
@@ -136,6 +136,7 @@ def _json_dumps_bytes(obj) -> bytes:
 from scripts.mcp_auth import (
     require_auth_session as _require_auth_session,
     require_collection_access as _require_collection_access,
+    AUTH_HEADER_TOKEN as _AUTH_HEADER_TOKEN,
 )
 
 # ---------------------------------------------------------------------------
@@ -295,6 +296,268 @@ def _highlight_snippet(snippet, tokens):  # type: ignore
     _work_script,
 )
 
+TOOLS_METADATA: dict[str, dict] = {  # tool name -> static usage hints; do_GET merges these into each listed tool
+    "repo_search": {
+        "name": "repo_search",
+        "category": "search",
+        "primary_use": "Hybrid semantic + lexical code search",
+        "choose_when": [
+            "Finding code related to a concept",
+            "Starting a search without knowing which tool",
+            "Need flexible filtering by language/path/symbol",
+        ],
+        "choose_instead": {
+            "symbol_graph": "Need precise caller/definition relationships",
+            "context_answer": "Need an explanation, not raw results",
+            "search_tests_for": "Specifically want test files",
+        },
+        "parameters": {
+            "essential": ["query"],
+            "common": ["limit", "language", "under", "include_snippet"],
+            "advanced": ["rerank_enabled", "output_format", "compact", "mode"],
+        },
+        "returns": {
+            "ok": "bool",
+            "results": "list[{score, path, symbol, start_line, end_line, snippet?}]",
+            "total": "int",
+        },
+        "related_tools": ["code_search", "context_search", "info_request"],
+        "performance": {
+            "typical_latency_ms": (100, 2000),  # presumably a (low, high) range in milliseconds — TODO confirm
+            "requires_index": True,
+            "requires_decoder": False,
+        },
+    },
+    "context_answer": {
+        "name": "context_answer",
+        "category": "answer",
+        "primary_use": "LLM-generated answers with code citations",
+        "choose_when": [
+            "Need an explanation of how code works",
+            "Asking 'how does X work?' questions",
+            "Want synthesized answer with sources",
+        ],
+        "choose_instead": {
+            "repo_search": "Want raw code results, not explanation",
+            "symbol_graph": "Need precise relationships",
+        },
+        "parameters": {
+            "essential": ["query"],
+            "common": ["limit", "language", "under", "include_snippet"],
+            "advanced": ["max_tokens", "temperature", "expand", "budget_tokens"],
+        },
+        "returns": {
+            "ok": "bool",
+            "answer": "str",
+            "citations": "list[{id, path, start_line, end_line}]",
+        },
+        "related_tools": ["repo_search", "context_search"],
+        "performance": {
+            "typical_latency_ms": (1000, 10000),
+            "requires_index": True,
+            "requires_decoder": True,
+        },
+    },
+    "symbol_graph": {
+        "name": "symbol_graph",
+        "category": "graph",
+        "primary_use": "AST-backed symbol relationship queries",
+        "choose_when": [
+            "Need 'who calls function X'",
+            "Need 'where is X defined'",
+            "Need 'what imports module Y'",
+            "Doing refactoring impact analysis",
+        ],
+        "choose_instead": {
+            "repo_search": "Want conceptual search, not precise relationships",
+            "search_callers_for": "Quick text search is sufficient",
+        },
+        "parameters": {
+            "essential": ["symbol", "query_type"],
+            "common": ["limit", "language", "under", "repo"],
+            "advanced": ["depth", "output_format"],
+        },
+        "returns": {
+            "ok": "bool",
+            "results": "list[{path, start_line, end_line, symbol, snippet}]",
+            "count": "int",
+        },
+        "related_tools": ["search_callers_for", "search_importers_for"],
+        "performance": {
+            "typical_latency_ms": (50, 500),
+            "requires_index": True,
+            "requires_decoder": False,
+        },
+    },
+    "context_search": {
+        "name": "context_search",
+        "category": "search",
+        "primary_use": "Blend code search with memory retrieval",
+        "choose_when": [
+            "Want code AND stored memories together",
+            "Searching for documented decisions",
+            "Need context from team knowledge",
+        ],
+        "choose_instead": {
+            "repo_search": "Only want code, no memories",
+            "memory_find": "Only want memories, no code",
+        },
+        "parameters": {
+            "essential": ["query"],
+            "common": ["include_memories", "memory_weight", "limit"],
+            "advanced": ["per_source_limits", "rerank_enabled"],
+        },
+        "returns": {
+            "ok": "bool",
+            "results": "list[{source, score, path|content, ...}]",
+            "total": "int",
+        },
+        "related_tools": ["repo_search", "memory_find"],
+        "performance": {
+            "typical_latency_ms": (200, 3000),
+            "requires_index": True,
+            "requires_decoder": False,
+        },
+    },
+    "info_request": {
+        "name": "info_request",
+        "category": "search",
+        "primary_use": "Simplified code discovery with explanations",
+        "choose_when": [
+            "Want simple single-parameter search",
+            "Need human-readable result descriptions",
+            "Building minimal integrations",
+        ],
+        "choose_instead": {
+            "repo_search": "Need full control over parameters",
+            "context_answer": "Need LLM-generated explanation",
+        },
+        "parameters": {
+            "essential": ["info_request"],
+            "common": ["limit", "language", "include_explanation"],
+            "advanced": ["include_relationships", "output_format"],
+        },
+        "returns": {
+            "ok": "bool",
+            "results": "list[{information, relevance_score, path, ...}]",
+            "summary?": "str",  # trailing "?" appears to mark an optional field — TODO confirm
+            "related_concepts?": "list[str]",
+        },
+        "related_tools": ["repo_search", "context_answer"],
+        "performance": {
+            "typical_latency_ms": (100, 2000),
+            "requires_index": True,
+            "requires_decoder": False,
+        },
+    },
+    "pattern_search": {
+        "name": "pattern_search",
+        "category": "search",
+        "primary_use": "Structural code pattern matching",
+        "choose_when": [
+            "Have code example, find similar",
+            "Cross-language pattern search",
+            "Find structural duplicates",
+        ],
+        "choose_instead": {
+            "repo_search": "Searching by concept, not structure",
+            "symbol_graph": "Looking for relationships",
+        },
+        "parameters": {
+            "essential": ["query"],
+            "common": ["language", "limit", "target_languages"],
+            "advanced": ["query_mode", "aroma_rerank", "min_score"],
+        },
+        "returns": {
+            "ok": "bool",
+            "results": "list[{path, start_line, end_line, score, language}]",
+            "query_mode": "str",
+        },
+        "related_tools": ["repo_search"],
+        "performance": {
+            "typical_latency_ms": (200, 3000),
+            "requires_index": True,
+            "requires_decoder": False,
+        },
+    },
+    "search_tests_for": {
+        "name": "search_tests_for",
+        "category": "specialized",
+        "primary_use": "Find test files for a feature/function",
+        "choose_when": ["Specifically want test files", "Looking for test coverage"],
+        "choose_instead": {"repo_search": "Want all code, not just tests"},
+        "parameters": {
+            "essential": ["query"],
+            "common": ["limit", "language", "under"],
+            "advanced": ["include_snippet", "compact"],
+        },
+        "returns": {"ok": "bool", "results": "list[...]", "total": "int"},
+        "related_tools": ["repo_search"],
+        "performance": {
+            "typical_latency_ms": (100, 1500),
+            "requires_index": True,
+            "requires_decoder": False,
+        },
+    },
+    "search_config_for": {
+        "name": "search_config_for",
+        "category": "specialized",
+        "primary_use": "Find configuration files",
+        "choose_when": ["Looking for config files", "Finding settings/options"],
+        "choose_instead": {"repo_search": "Want all code, not just config"},
+        "parameters": {
+            "essential": ["query"],
+            "common": ["limit", "under"],
+            "advanced": ["include_snippet", "compact"],
+        },
+        "returns": {"ok": "bool", "results": "list[...]", "total": "int"},
+        "related_tools": ["repo_search"],
+        "performance": {
+            "typical_latency_ms": (100, 1500),
+            "requires_index": True,
+            "requires_decoder": False,
+        },
+    },
+    "search_callers_for": {
+        "name": "search_callers_for",
+        "category": "specialized",
+        "primary_use": "Text-based search for symbol callers",
+        "choose_when": ["Quick caller search is sufficient", "No graph index available"],
+        "choose_instead": {"symbol_graph": "Need precise AST-backed callers"},
+        "parameters": {
+            "essential": ["query"],
+            "common": ["limit", "language"],
+            "advanced": [],
+        },
+        "returns": {"ok": "bool", "results": "list[...]", "total": "int"},
+        "related_tools": ["symbol_graph"],
+        "performance": {
+            "typical_latency_ms": (100, 1500),
+            "requires_index": True,
+            "requires_decoder": False,
+        },
+    },
+    "search_importers_for": {
+        "name": "search_importers_for",
+        "category": "specialized",
+        "primary_use": "Text-based search for module importers",
+        "choose_when": ["Quick import search is sufficient", "No graph index available"],
+        "choose_instead": {"symbol_graph": "Need precise AST-backed importers"},
+        "parameters": {
+            "essential": ["query"],
+            "common": ["limit", "language"],
+            "advanced": [],
+        },
+        "returns": {"ok": "bool", "results": "list[...]", "total": "int"},
+        "related_tools": ["symbol_graph"],
+        "performance": {
+            "typical_latency_ms": (100, 1500),
+            "requires_index": True,
+            "requires_decoder": False,
+        },
+    },
+}
+
 # Disable DNS rebinding protection - breaks Docker internal networking (Host: mcp:8000)
 _security_settings = (
     TransportSecuritySettings(enable_dns_rebinding_protection=False)
@@ -303,6 +566,53 @@ def _highlight_snippet(snippet, tokens):  # type: ignore
 )
 mcp = FastMCP(APP_NAME, transport_security=_security_settings)
 
+class _AuthHeaderASGIMiddleware:
+    """ASGI wrapper that copies the request's Authorization token into _AUTH_HEADER_TOKEN."""
+    def __init__(self, app):
+        self.app = app
+
+    async def __call__(self, scope, receive, send):
+        if scope["type"] == "http":
+            # Scopes of any other type are forwarded without touching the context var.
+            raw = dict(scope.get("headers", [])).get(b"authorization", b"")
+            value = raw.decode("utf-8", errors="ignore")
+            # Drop a case-insensitive "Bearer " prefix; any other value is used as-is.
+            has_bearer = value.lower().startswith("bearer ")
+            token = (value[7:] if has_bearer else value).strip()
+            _AUTH_HEADER_TOKEN.set(token)
+        return await self.app(scope, receive, send)
+
+
+def _add_auth_middleware():
+    """Wrap FastMCP's ASGI app with auth header extraction middleware.
+
+    FastMCP calls streamable_http_app() or sse_app() to create the Starlette app.
+    Each factory that exists on ``mcp`` is replaced by a wrapper that builds the
+    app as before and returns it inside _AuthHeaderASGIMiddleware. Any failure
+    while patching is logged as a warning and otherwise ignored.
+    Call this once, before the transport app is created: a second call would
+    wrap the already-patched factories again.
+    """
+
+    def _wrap_factory(attr, factory):
+        # Separate function so every wrapper closes over its own attr/factory pair.
+        def _patched(*args, **kwargs):
+            app = factory(*args, **kwargs)
+            logger.info("Wrapping %s with auth middleware", attr)
+            return _AuthHeaderASGIMiddleware(app)
+
+        return _patched
+
+    logger.info("Setting up auth header middleware...")
+    try:
+        # sse_app is the factory for the SSE transport; both are patched the same way.
+        for attr in ("streamable_http_app", "sse_app"):
+            if hasattr(mcp, attr):
+                setattr(mcp, attr, _wrap_factory(attr, getattr(mcp, attr)))
+        logger.info("Patched FastMCP app factory methods for auth middleware injection")
+    except Exception as e:
+        logger.warning(f"Failed to patch FastMCP for auth middleware: {e}")
+
 
 # Capture tool registry automatically by wrapping the decorator once
 _TOOLS_REGISTRY: list[dict] = []
@@ -417,7 +727,6 @@ def do_GET(self):
                         self.send_response(200)
                         self.send_header("Content-Type", "application/json")
                         self.end_headers()
-                        # Hide expand_query when decoder is disabled
                         tools = _TOOLS_REGISTRY
                         try:
                             from scripts.refrag_llamacpp import is_decoder_enabled  # type: ignore
@@ -432,7 +741,12 @@ def do_GET(self):
                                 ]
                         except Exception as e:
                             logger.debug(f"Suppressed exception: {e}")
-                        payload = {"ok": True, "tools": tools}
+                        enriched = []
+                        for t in tools:
+                            name = t.get("name", "")
+                            meta = TOOLS_METADATA.get(name, {})
+                            enriched.append({**t, **meta})
+                        payload = {"ok": True, "tools": enriched, "metadata": TOOLS_METADATA}
                         self.wfile.write(_json_dumps_bytes(payload))
                     else:
                         self.send_response(404)
@@ -979,6 +1293,12 @@ async def set_session_defaults(
     mode: Any = None,
     under: Any = None,
     language: Any = None,
+    repo: Any = None,
+    compact: Any = None,
+    output_format: Any = None,
+    include_snippet: Any = None,
+    rerank_enabled: Any = None,
+    limit: Any = None,
     session: Any = None,
     ctx: Context = None,
     **kwargs,
@@ -989,6 +1309,19 @@ async def set_session_defaults(
     - If request Context is available, persist defaults per-connection so later calls on
       the same MCP session automatically use them (no token required).
     - Optionally also stores token-scoped defaults for cross-connection reuse.
+
+    Parameters:
+    - collection: Default collection name
+    - mode: Search mode hint
+    - under: Default path prefix filter
+    - language: Default language filter
+    - repo: Default repo filter for multi-repo setups
+    - compact: Default compact response mode (bool)
+    - output_format: Default output format ("json" or "toon")
+    - include_snippet: Default snippet inclusion (bool)
+    - rerank_enabled: Default reranking toggle (bool)
+    - limit: Default result limit (int)
+    - session: Session token for cross-connection reuse
     """
     try:
         _extra = _extract_kwargs_payload(kwargs)
@@ -1003,6 +1336,18 @@ async def set_session_defaults(
                 language = _extra.get("language")
             if (session is None or (isinstance(session, str) and str(session).strip() == "")) and _extra.get("session") is not None:
                 session = _extra.get("session")
+            if repo is None and _extra.get("repo") is not None:
+                repo = _extra.get("repo")
+            if compact is None and _extra.get("compact") is not None:
+                compact = _extra.get("compact")
+            if output_format is None and _extra.get("output_format") is not None:
+                output_format = _extra.get("output_format")
+            if include_snippet is None and _extra.get("include_snippet") is not None:
+                include_snippet = _extra.get("include_snippet")
+            if rerank_enabled is None and _extra.get("rerank_enabled") is not None:
+                rerank_enabled = _extra.get("rerank_enabled")
+            if limit is None and _extra.get("limit") is not None:
+                limit = _extra.get("limit")
     except Exception as e:
         logger.debug(f"Suppressed exception: {e}")
 
@@ -1015,6 +1360,23 @@ async def set_session_defaults(
                 defaults[_key] = _s
             else:
                 unset_keys.add(_key)
+    if isinstance(repo, str) and repo.strip():
+        defaults["repo"] = repo.strip()
+    elif isinstance(repo, list):
+        defaults["repo"] = repo
+    if isinstance(output_format, str) and output_format.strip():
+        defaults["output_format"] = output_format.strip()
+    if compact is not None:
+        defaults["compact"] = bool(compact) if not isinstance(compact, bool) else compact
+    if include_snippet is not None:
+        defaults["include_snippet"] = bool(include_snippet) if not isinstance(include_snippet, bool) else include_snippet
+    if rerank_enabled is not None:
+        defaults["rerank_enabled"] = bool(rerank_enabled) if not isinstance(rerank_enabled, bool) else rerank_enabled
+    if limit is not None:
+        try:
+            defaults["limit"] = int(limit)
+        except (ValueError, TypeError):
+            pass
 
     # Per-connection storage (preferred)
     try:
@@ -1119,24 +1481,91 @@ async def repo_search(
     args: Any = None,
     kwargs: Any = None,
 ) -> Dict[str, Any]:
-    """Zero-config code search over repositories (hybrid: vector + lexical RRF, rerank ON by default).
-
-    When to use:
-    - Find relevant code spans quickly; prefer this over embedding-only search.
-    - Use context_answer when you need a synthesized explanation; use context_search to blend with memory notes.
-
-    Key parameters:
-    - query: str or list[str]. Multiple queries are fused; accepts "queries" alias.
-    - limit: int (default 10). Total results across files.
-    - per_path: int (default 2). Max results per file.
-    - include_snippet/context_lines: return inline snippets near hits when true.
-    - rerank_*: ONNX reranker is ON by default for best relevance; timeouts fall back to hybrid.
-    - output_format: "json" (default) or "toon" for token-efficient TOON format.
-    - collection: str. Target collection; defaults to workspace state or env COLLECTION_NAME.
-    - repo: str or list[str]. Filter by repo name(s). Use "*" to search all repos.
+    """Primary hybrid semantic + lexical code search across the repository.
+
+    PRIMARY USE: Find code spans matching a natural language concept or topic.
+
+    CHOOSE THIS WHEN:
+    - You need to find code related to a concept (e.g., "authentication", "caching")
+    - You want to locate implementations, not just definitions
+    - You need flexible filtering by language, path, or symbol
+    - You want the best balance of recall and precision
+    - You're starting a search and aren't sure which specific tool to use
+
+    CHOOSE INSTEAD:
+    - symbol_graph -> when you need "who calls X" or "where is X defined" (AST-backed)
+    - context_answer -> when you need an EXPLANATION, not raw code results
+    - context_search -> when you want to blend code results with stored memories
+    - search_tests_for -> when specifically looking for test files
+    - search_config_for -> when specifically looking for config files
+    - pattern_search -> when searching by code structure/pattern across languages
+
+    QUERY EXAMPLES:
+    Good queries (natural language, conceptual):
+      "authentication middleware"     - finds auth-related code
+      "error handling with retry"     - finds retry logic
+      "database connection pooling"   - finds connection management
+      "user session management"       - finds session-related code
+      "API rate limiting"             - finds rate limit implementations
+      "caching layer implementation"  - finds cache logic
+      "websocket message handling"    - finds WS handlers
+      "file upload processing"        - finds upload logic
+
+    Bad queries (will return poor results):
+      "auth OR login OR session"      - boolean operators NOT supported
+      "def.*authenticate"             - regex NOT supported in query
+      "*.py with class User"          - glob syntax NOT for query field
+      "function"                      - too vague, be more specific
+      "get"                           - too generic
+      "the code that handles the thing" - unclear intent
+
+    ESSENTIAL PARAMETERS:
+    - query (str | list[str]): Natural language description of what you're looking for.
+      Multiple queries are fused for broader recall.
+
+    COMMON PARAMETERS:
+    - limit (int, default=10): Maximum results to return.
+    - per_path (int, default=2): Max results per file. Increase for thorough search.
+    - include_snippet (bool, default=True): Include code snippets in results.
+    - language (str): Filter by language ("python", "typescript", "go", etc.)
+    - under (str): Restrict to directory path ("scripts/", "src/api/")
+    - symbol (str): Filter by symbol name (function, class, method)
+    - path_glob (str | list[str]): File pattern filter ("**/*.py", "src/**")
+    - repo (str | list[str]): Filter by repo name(s). Use "*" for all repos.
+
+    ADVANCED PARAMETERS:
+    - rerank_enabled (bool, default=True): ONNX cross-encoder reranking for relevance.
+    - rerank_top_n (int, default=20): Candidates to rerank. Increase for benchmarks.
+    - output_format (str): "json" (default) or "toon" for token-efficient format.
+    - compact (bool, default=False): Strip verbose fields for minimal response.
+    - mode (str): "code_first", "docs_first", "balanced", or "dense" (pure embedding).
+    - not_glob (str | list[str]): Exclude paths matching pattern.
+    - not_ (str): Exclude results containing this text.
+    - case (str): "sensitive" for case-sensitive matching.
+
+    RETURNS:
+    {
+        "ok": true,
+        "results": [
+            {
+                "score": 0.85,           // Relevance score (0-1+)
+                "path": "src/auth.py",   // File path
+                "symbol": "authenticate", // Symbol name if available
+                "start_line": 42,        // Start line number
+                "end_line": 67,          // End line number
+                "snippet": "def auth..." // Code snippet (if include_snippet=true)
+            }
+        ],
+        "total": 5,                      // Total results returned
+        "used_rerank": true,             // Whether reranking was applied
+        "rerank_counters": {...}         // Reranking statistics
+    }
 
-    Returns:
-    - Dict with keys: results, total, used_rerank, rerank_counters
+    PERFORMANCE TIPS:
+    - Use language filter to reduce search space and improve relevance
+    - Use under filter when you know the general code area
+    - Set include_snippet=false if you only need file locations
+    - Set compact=true to reduce response size for large result sets
     """
     return await _repo_search_impl(
         query=query,
@@ -1305,14 +1734,77 @@ async def search_tests_for(
 ) -> Dict[str, Any]:
     """Find test files related to a query.
 
-    What it does:
-    - Presets common test file globs and forwards to repo_search
-    - Accepts extra filters via kwargs (e.g., language, under, case)
+    PRIMARY USE: Quickly find tests for a feature, function, or module.
+    Convenience wrapper that presets common test file patterns.
+
+    CHOOSE THIS WHEN:
+    - You specifically want TEST files, not implementation code
+    - You're looking for tests related to a feature
+    - You want to find test coverage for a function/class
+    - You're exploring how something is tested
+
+    CHOOSE INSTEAD:
+    - repo_search -> when you want ALL code, not just tests
+    - symbol_graph -> when you need "what tests call function X"
+
+    QUERY EXAMPLES:
+    Good queries (feature/function focused):
+      "user authentication"           - finds tests for auth features
+      "database connection"           - finds DB connection tests
+      "API rate limiting"             - finds rate limit tests
+      "email sending"                 - finds email-related tests
+      "input validation"              - finds validation tests
+      "UserService"                   - finds tests for UserService class
+
+    Bad queries:
+      "all tests"                     - too broad
+      "test_*.py"                     - glob pattern, use path_glob param
+      "pass"                          - assertion keyword, not meaningful
+      "def test_"                     - code fragment, use repo_search
+
+    ESSENTIAL PARAMETERS:
+    - query (str | list[str]): Natural language description of what you want tests for.
+
+    COMMON PARAMETERS:
+    - limit (int, default=10): Maximum results to return.
+    - include_snippet (bool, default=True): Include test code snippets.
+    - context_lines (int): Lines of context around matches.
+    - under (str): Restrict to directory path (e.g., "tests/unit/").
+    - language (str): Filter by language.
+    - compact (bool): Minimal response fields.
+
+    PRESET GLOBS (automatically applied):
+    - tests/**
+    - test/**
+    - **/*test*.*
+    - **/*_test.*
+    - **/Test*/**
+
+    RETURNS: Same schema as repo_search.
+    {
+        "ok": true,
+        "results": [
+            {
+                "score": 0.82,
+                "path": "tests/test_auth.py",
+                "symbol": "test_authenticate_valid_user",
+                "start_line": 45,
+                "end_line": 58,
+                "snippet": "def test_authenticate_valid_user():..."
+            }
+        ],
+        "total": 8
+    }
 
-    Parameters:
-    - query: str or list[str]; limit; include_snippet/context_lines; under; language; compact
+    USAGE PATTERNS:
+    # Find tests for authentication
+    search_tests_for(query="authentication")
 
-    Returns: repo_search result shape.
+    # Find tests in a specific directory
+    search_tests_for(query="database", under="tests/integration/")
+
+    # Find Python tests only
+    search_tests_for(query="caching", language="python")
     """
     return await _search_tests_for_impl(
         query=query,
@@ -1341,13 +1833,86 @@ async def search_config_for(
     kwargs: Any = None,
     ctx: Context = None,
 ) -> Dict[str, Any]:
-    """Find likely configuration files for a service/query.
+    """Find configuration files related to a query.
+
+    PRIMARY USE: Quickly find config files for a service, feature, or setting.
+    Convenience wrapper that presets common config file patterns.
+
+    CHOOSE THIS WHEN:
+    - You need to find configuration for a service/feature
+    - You're looking for environment variables, settings, or options
+    - You want to find where something is configured
+    - You're debugging configuration issues
+
+    CHOOSE INSTEAD:
+    - repo_search -> when you want ALL code, not just config files
+    - search_tests_for -> when looking for test files
+
+    QUERY EXAMPLES:
+    Good queries (service/setting focused):
+      "database connection"           - finds DB config files
+      "authentication settings"       - finds auth config
+      "logging configuration"         - finds logging setup
+      "API keys"                      - finds key config (careful with secrets!)
+      "environment variables"         - finds env config
+      "redis cache"                   - finds Redis config
+      "docker compose"                - finds Docker config
+
+    Bad queries:
+      "*.yaml"                        - glob pattern, handled by presets
+      "config"                        - too vague
+      "settings"                      - too generic
+      "json"                          - file format, not a query
+
+    ESSENTIAL PARAMETERS:
+    - query (str | list[str]): Natural language description of what config you need.
+
+    COMMON PARAMETERS:
+    - limit (int, default=10): Maximum results to return.
+    - include_snippet (bool, default=True): Include config content snippets.
+    - context_lines (int): Lines of context around matches.
+    - under (str): Restrict to directory path.
+    - compact (bool): Minimal response fields.
+
+    PRESET GLOBS (automatically applied):
+    - **/*.yml, **/*.yaml
+    - **/*.json
+    - **/*.toml
+    - **/*.ini
+    - **/*.env
+    - **/*.config, **/*.conf
+    - **/*.properties
+    - **/*.csproj, **/*.props, **/*.targets
+    - **/*.xml
+    - **/appsettings*.json
+
+    RETURNS: Same schema as repo_search.
+    {
+        "ok": true,
+        "results": [
+            {
+                "score": 0.85,
+                "path": "config/database.yml",
+                "start_line": 12,
+                "end_line": 25,
+                "snippet": "database:\\n  host: localhost\\n  port: 5432..."
+            }
+        ],
+        "total": 5
+    }
 
-    What it does:
-    - Presets config file globs (yaml/json/toml/etc.) and forwards to repo_search
-    - Accepts extra filters via kwargs
+    USAGE PATTERNS:
+    # Find database config
+    search_config_for(query="database connection")
+
+    # Find Docker configuration
+    search_config_for(query="docker service ports")
+
+    # Find in specific directory
+    search_config_for(query="api settings", under="config/")
 
-    Returns: repo_search result shape.
+    WARNING: Config files may contain sensitive data (API keys, passwords).
+    Be cautious about exposing results that might contain secrets.
     """
     return await _search_config_for_impl(
         query=query,
@@ -1372,14 +1937,57 @@ async def search_callers_for(
     kwargs: Any = None,
     ctx: Context = None,
 ) -> Dict[str, Any]:
-    """Heuristic search for callers/usages of a symbol.
-
-    When to use:
-    - You want files that reference/invoke a function/class
-
-    Notes:
-    - Thin wrapper over repo_search today; pass language or path_glob to narrow
-    - Returns repo_search result shape
+    """Heuristic text-based search for callers/usages of a symbol.
+
+    PRIMARY USE: Find files that likely call or reference a function/class.
+    Uses text search, not AST analysis - faster but less precise than symbol_graph.
+
+    CHOOSE THIS WHEN:
+    - You want a quick, broad search for symbol references
+    - You're okay with some false positives in exchange for speed
+    - The codebase doesn't have graph index built yet
+    - You want to find textual mentions, not just actual calls
+
+    CHOOSE INSTEAD:
+    - symbol_graph with query_type="callers" -> for PRECISE AST-backed caller analysis
+    - repo_search -> when you want full control over search parameters
+
+    QUERY EXAMPLES:
+    Good queries (symbol names):
+      "authenticate"                  - finds references to authenticate
+      "UserService"                   - finds references to UserService
+      "validate_input"                - finds references to validate_input
+      "CacheManager.get"              - finds references to CacheManager.get
+
+    Bad queries:
+      "who calls authenticate"        - use symbol_graph for this phrasing
+      "find all usages of X"          - use symbol_graph
+      "authentication"                - concept, not symbol name
+
+    ESSENTIAL PARAMETERS:
+    - query (str): Symbol name to find callers/references for.
+
+    COMMON PARAMETERS:
+    - limit (int, default=10): Maximum results to return.
+    - language (str): Filter by language for more relevant results.
+
+    RETURNS: Same schema as repo_search.
+
+    COMPARISON WITH symbol_graph:
+    | Aspect | search_callers_for | symbol_graph |
+    |--------|-------------------|--------------|
+    | Method | Text search | AST analysis |
+    | Speed | Faster | Slower |
+    | Precision | Lower (false positives) | Higher (actual calls) |
+    | Requires | Nothing special | Graph index |
+    | Use for | Quick exploration | Precise refactoring |
+
+    USAGE PATTERNS:
+    # Quick reference search
+    search_callers_for(query="authenticate", language="python")
+
+    # For precise caller analysis, prefer:
+    symbol_graph(symbol="authenticate", query_type="callers")
     """
     return await _search_callers_for_impl(
         query=query,
@@ -1401,13 +2009,61 @@ async def search_importers_for(
     kwargs: Any = None,
     ctx: Context = None,
 ) -> Dict[str, Any]:
-    """Find files likely importing or referencing a module/symbol.
-
-    What it does:
-    - Presets code globs across common languages; forwards to repo_search
-    - Accepts additional filters via kwargs (e.g., under, case)
-
-    Returns: repo_search result shape.
+    """Heuristic text-based search for files importing a module/symbol.
+
+    PRIMARY USE: Find files that likely import a module or symbol.
+    Uses text search, not AST analysis - faster but less precise than symbol_graph.
+
+    CHOOSE THIS WHEN:
+    - You want a quick search for import statements
+    - You're looking for textual import/require/use mentions
+    - The codebase doesn't have graph index built yet
+    - You want approximate results quickly
+
+    CHOOSE INSTEAD:
+    - symbol_graph with query_type="importers" -> for PRECISE AST-backed import analysis
+    - repo_search -> when you want full control over search parameters
+
+    QUERY EXAMPLES:
+    Good queries (module/symbol names):
+      "auth_utils"                    - finds imports of auth_utils
+      "CacheManager"                  - finds imports of CacheManager
+      "qdrant_client"                 - finds imports of qdrant_client
+      "express"                       - finds require('express')
+      "pandas"                        - finds import pandas
+
+    Bad queries:
+      "what imports X"                - use symbol_graph for this phrasing
+      "import statements"             - too vague
+      "from ... import"               - syntax, not a module name
+
+    ESSENTIAL PARAMETERS:
+    - query (str): Module or symbol name to find importers for.
+
+    COMMON PARAMETERS:
+    - limit (int, default=10): Maximum results to return.
+    - language (str): Filter by language for more relevant results.
+
+    PRESET GLOBS (automatically applied):
+    Code files across all common languages (*.py, *.js, *.ts, *.go, etc.)
+
+    RETURNS: Same schema as repo_search.
+
+    COMPARISON WITH symbol_graph:
+    | Aspect | search_importers_for | symbol_graph |
+    |--------|---------------------|--------------|
+    | Method | Text search | AST analysis |
+    | Speed | Faster | Slower |
+    | Precision | Lower (false positives) | Higher (actual imports) |
+    | Requires | Nothing special | Graph index |
+    | Use for | Quick exploration | Precise dependency analysis |
+
+    USAGE PATTERNS:
+    # Quick import search
+    search_importers_for(query="qdrant_client", language="python")
+
+    # For precise import analysis, prefer:
+    symbol_graph(symbol="qdrant_client", query_type="importers")
     """
     return await _search_importers_for_impl(
         query=query,
@@ -1433,35 +2089,102 @@ async def symbol_graph(
     depth: Any = None,
     ctx: Context = None,
 ) -> Dict[str, Any]:
-    """Query the symbol graph to find callers, definitions, or importers.
+    """AST-backed symbol graph queries for precise code relationships.
+
+    PRIMARY USE: Find WHO CALLS a function, WHERE something is DEFINED,
+    or WHAT IMPORTS a module using the pre-built symbol graph.
+
+    CHOOSE THIS WHEN:
+    - You need "who calls this function?" (callers)
+    - You need "where is this defined?" (definition)
+    - You need "what imports this module?" (importers)
+    - You need "what does this function call?" (callees)
+    - You want PRECISE relationships, not text-based fuzzy matches
+    - You're doing refactoring impact analysis
+
+    CHOOSE INSTEAD:
+    - repo_search -> when you want CONCEPTUAL search, not precise relationships
+    - search_callers_for -> convenience wrapper, uses text search (less precise)
+    - search_importers_for -> convenience wrapper, uses text search (less precise)
+
+    QUERY EXAMPLES:
+
+    For "callers" query_type (who calls X?):
+      symbol="authenticate"          - finds all callers of authenticate()
+      symbol="UserService.get_user"  - finds callers of get_user method
+      symbol="validate_input"        - finds where validate_input is called
+
+    For "definition" query_type (where is X defined?):
+      symbol="CacheManager"          - finds CacheManager class definition
+      symbol="run_hybrid_search"     - finds function definition
+      symbol="USER_TIMEOUT"          - finds constant definition
+
+    For "importers" query_type (what imports X?):
+      symbol="auth_utils"            - finds files importing auth_utils module
+      symbol="CacheManager"          - finds files importing CacheManager
+      symbol="qdrant_client"         - finds files importing qdrant_client
+
+    For "callees" query_type (what does X call?):
+      symbol="authenticate"          - finds functions called BY authenticate
+      symbol="process_request"       - finds all functions process_request calls
+
+    ESSENTIAL PARAMETERS:
+    - symbol (str): Symbol name to analyze. Can be:
+      - Simple name: "authenticate"
+      - Qualified path: "UserService.get_user"
+      - Module name: "auth_utils"
+
+    - query_type (str, default="callers"): Type of relationship query:
+      - "callers": Find code that CALLS this symbol
+      - "definition": Find WHERE this symbol is DEFINED
+      - "importers": Find code that IMPORTS this symbol/module
+      - "callees": Find what this symbol CALLS (inverse of callers)
+
+    COMMON PARAMETERS:
+    - limit (int, default=20): Maximum results to return.
+    - depth (int, default=1): Traversal depth for multi-hop queries.
+      - depth=1: Direct relationships only
+      - depth=2: Callers of callers, callees of callees, etc.
+      - depth=3+: Use sparingly, can be expensive
+    - language (str): Filter by language.
+    - under (str): Filter by path prefix.
+    - repo (str): Filter by repository name. Use "*" for all repos.
+    - output_format (str): "json" or "toon" for token-efficient format.
+
+    RETURNS:
+    {
+        "ok": true,
+        "results": [
+            {
+                "path": "src/api/handlers.py",
+                "start_line": 142,
+                "end_line": 145,
+                "symbol": "handle_login",
+                "symbol_path": "handlers.handle_login",
+                "language": "python",
+                "snippet": "    result = authenticate(username, password)",
+                "hop": 1,           // For depth>1: which hop found this
+                "via": "authenticate"  // For depth>1: intermediate symbol
+            }
+        ],
+        "symbol": "authenticate",
+        "query_type": "callers",
+        "count": 12,
+        "depth": 1,
+        "used_graph": true,    // True if graph collection was used (fast)
+        "suggestions": [...]   // Fuzzy matches if exact symbol not found
+    }
 
-    When to use:
-    - "Who calls X?" → query_type="callers"
-    - "Where is X defined?" → query_type="definition"
-    - "What imports Y?" → query_type="importers"
-    - "What does X call?" → query_type="callees"
-
-    Key parameters:
-    - symbol: str. The function, class, or module name to search for.
-    - query_type: str. One of "callers", "definition", "importers".
-    - limit: int (default 20). Maximum results to return.
-    - language: str (optional). Filter by programming language.
-    - under: str (optional). Filter by path prefix.
-    - repo: str (optional). Filter by repository name. Use "*" to search all repos.
-    - output_format: "json" (default) or "toon" for token-efficient format.
-    - depth: int (default 1). Multi-hop traversal depth. 2 = callers of callers, etc.
+    MULTI-HOP EXAMPLE (depth=2):
+    # "Who calls the callers of authenticate?"
+    symbol_graph(symbol="authenticate", query_type="callers", depth=2)
+    # Returns both direct callers (hop=1) and callers-of-callers (hop=2)
 
-    Returns:
-    - {"results": [...], "symbol": str, "query_type": str, "count": int, "depth": int}
-    - Each result includes path, start_line, end_line, symbol_path, and relevant context.
-    - Multi-hop results include "hop" (1, 2, ...) and "via" (intermediate symbol).
-
-    Example:
-    - symbol_graph(symbol="get_embedding_model", query_type="callers")
-    - symbol_graph(symbol="ASTAnalyzer", query_type="definition")
-    - symbol_graph(symbol="qdrant_client", query_type="importers")
-    - symbol_graph(symbol="my_function", query_type="callers", repo="backend")
-    - symbol_graph(symbol="authenticate", query_type="callers", depth=2)
+    NOTES:
+    - Graph must be indexed (run qdrant_index_root first)
+    - If the exact symbol is not found, fuzzy-match suggestions are returned
+    - Hydration adds code snippets and accurate line numbers automatically
+    - Use depth>1 carefully - exponential growth in results
     """
     if not symbol or not str(symbol).strip():
         return {"error": "symbol parameter is required", "results": []}
@@ -1601,36 +2324,112 @@ async def context_answer(
     repo: Any = None,  # str, list[str], or "*" to search all repos
     kwargs: Any = None,
 ) -> Dict[str, Any]:
-    """Natural-language Q&A over the repo using retrieval + local LLM (llama.cpp).
-
-    What it does:
-    - Retrieves relevant code (hybrid vector+lexical with reranking enabled by default).
-    - Budgets/merges micro-spans, builds citations, and asks the LLM to answer.
-    - Returns a concise answer plus file/line citations.
-
-    When to use:
-    - You need an explanation or "how to" grounded in code.
-    - Prefer repo_search for raw hits; prefer context_search to blend code + memory.
-
-    Key parameters:
-    - query: str or list[str]; may be expanded if expand=true.
-    - budget_tokens: int. Token budget across code spans (defaults from MICRO_BUDGET_TOKENS).
-    - include_snippet: bool (default true). Include code snippets sent to the LLM and return them when requested.
-    - max_tokens, temperature: decoding controls.
-    - mode: "stitch" (default) or "pack" for prompt assembly.
-    - expand: bool. Use tiny local LLM to propose up to 2 alternate queries.
-    - Filters: language, under, kind, symbol, ext, path_regex, path_glob, not_glob, not_, case.
-    - repo: str or list[str]. Filter by repo name(s). Use "*" to search all repos (disable auto-filter).
-      By default, auto-detects current repo from CURRENT_REPO env and filters to it.
+    """Generate LLM-powered answers with citations grounded in retrieved code.
+
+    PRIMARY USE: Get an EXPLANATION or ANSWER to a question, not raw search results.
+    Uses retrieval-augmented generation (RAG) with a local LLM decoder.
+
+    CHOOSE THIS WHEN:
+    - You need an EXPLANATION ("How does X work?", "What is Y?")
+    - You want a synthesized answer with source citations
+    - You're asking a question that requires understanding, not just finding
+    - You want the system to READ code and EXPLAIN it to you
+
+    CHOOSE INSTEAD:
+    - repo_search -> when you want RAW CODE RESULTS, not explanations
+    - symbol_graph -> when you need precise "who calls X" relationships
+    - context_search -> when you want code + memories without LLM synthesis
+
+    QUERY EXAMPLES:
+    Good queries (questions requiring explanation):
+      "How does the authentication system validate tokens?"
+      "What is the purpose of the CacheManager class?"
+      "Explain the error handling strategy in the API layer"
+      "How are database connections pooled in this project?"
+      "What happens when a user session expires?"
+      "Describe the data flow for user registration"
+      "How does retry logic work in the HTTP client?"
+
+    Bad queries (not suited for LLM answers):
+      "find authentication code"      - use repo_search for finding code
+      "list all Python files"         - use repo_search with language filter
+      "UserController"                - symbol name only, use symbol_graph
+      "src/auth.py"                   - file path, just read the file
+      "def authenticate"              - code fragment, use repo_search
+
+    ESSENTIAL PARAMETERS:
+    - query (str | list[str]): Question or topic requiring explanation.
+      Should be phrased as a question or request for explanation.
+
+    RETRIEVAL PARAMETERS:
+    - limit (int, default=15): Code spans to retrieve for context.
+    - per_path (int, default=5): Max spans per file.
+    - budget_tokens (int): Token budget for code context. Default from env.
+    - include_snippet (bool, default=True): Include code in response.
+    - language (str): Filter retrieval by language.
+    - under (str): Restrict retrieval to directory path.
+    - repo (str | list[str]): Filter by repo. Use "*" for all repos.
+
+    GENERATION PARAMETERS:
+    - max_tokens (int): Max tokens for generated answer.
+    - temperature (float): Sampling temperature (0.0-1.0). Lower = more focused.
+    - mode (str): Prompt assembly mode. "stitch" (default) or "pack".
+    - expand (bool): Use LLM to generate query expansions for better recall.
+
+    COMMON FILTER PARAMETERS (same as repo_search):
+    - symbol (str): Filter by symbol name.
+    - path_glob (str | list[str]): Filter by file pattern.
+    - not_glob (str | list[str]): Exclude file patterns.
+    - ext (str): Filter by file extension.
+
+    RETURNS:
+    {
+        "ok": true,
+        "answer": "The authentication system validates tokens by first checking
+                   the JWT signature using the secret from config [1], then
+                   verifying expiration time [2]. If valid, it extracts the
+                   user ID and loads permissions from the database [3].",
+        "citations": [
+            {
+                "id": 1,
+                "path": "src/auth/jwt.py",
+                "start_line": 45,
+                "end_line": 52,
+                "snippet": "def verify_token(token):..."  // Optional
+            },
+            {
+                "id": 2,
+                "path": "src/auth/jwt.py",
+                "start_line": 54,
+                "end_line": 58
+            },
+            {
+                "id": 3,
+                "path": "src/auth/permissions.py",
+                "start_line": 23,
+                "end_line": 31
+            }
+        ],
+        "query": ["How does authentication validate tokens"],
+        "used": {
+            "spans": 5,
+            "tokens": 1842
+        }
+    }
 
-    Returns:
-    - {"answer": str, "citations": [{"path": str, "start_line": int, "end_line": int}], "query": list[str], "used": {...}}
-    - On decoder disabled/error, returns {"error": "...", "citations": [...], "query": [...]}
+    RETURNS (insufficient context):
+    {
+        "answer": "insufficient context",
+        "citations": [],
+        "query": [...],
+        "hint": "Try broadening your query or checking if the feature exists"
+    }
 
-    Notes:
-    - Reranking is enabled by default for optimal retrieval quality.
-    - Honors env knobs such as REFRAG_MODE, REFRAG_GATE_FIRST, MICRO_BUDGET_TOKENS, DECODER_*.
-    - Keeps answers brief (2–4 sentences) and grounded; rejects ungrounded output.
+    NOTES:
+    - Answers include bracketed citations like [1], [2] referencing the citations array
+    - If context is insufficient, returns "insufficient context" as the answer
+    - An LLM decoder must be available (local llama.cpp or a cloud fallback)
+    - Reranking is enabled by default for optimal retrieval quality
     """
     return await _context_answer_impl(
         query=query,
@@ -1685,14 +2484,84 @@ async def code_search(
     case: Any = None,
     session: Any = None,
     compact: Any = None,
+    # Memory blending (opt-in)
+    include_memories: Any = None,
+    memory_weight: Any = None,
+    per_source_limits: Any = None,
     kwargs: Any = None,
 ) -> Dict[str, Any]:
-    """Exact alias of repo_search (hybrid code search with reranking enabled by default).
+    """Alias of repo_search for discoverability. Use repo_search directly.
+
+    PRIMARY USE: Alias of repo_search, kept for discoverability in IDEs and
+    agents that might search for "code_search" instead of "repo_search".
+
+    CHOOSE THIS WHEN:
+    - You would use repo_search (code_search forwards to it unless include_memories is set)
+    - Your tooling expects a "code_search" function name
+
+    CHOOSE INSTEAD:
+    - repo_search -> same functionality, canonical name
+    - See repo_search docstring for full documentation
+
+    QUERY EXAMPLES:
+    Good queries (natural language, conceptual):
+      "authentication middleware"     - finds auth-related code
+      "error handling with retry"     - finds retry logic
+      "database connection setup"     - finds DB connection code
+      "user input validation"         - finds validation logic
+      "async task processing"         - finds async patterns
+
+    Bad queries (will return poor results):
+      "auth AND login"                - boolean operators NOT supported
+      "grep -r 'password'"            - not a shell command
+      "class.*Controller"             - regex NOT supported
+      "SELECT * FROM users"           - SQL query, not code search
+      "https://github.com/..."        - URL, not a search query
+
+    ESSENTIAL PARAMETERS:
+    - query (str): Natural language description of code you're looking for.
+
+    All parameters and return format are identical to repo_search.
+    See repo_search documentation for complete parameter reference.
+
+    MEMORY BLENDING (opt-in, delegates to context_search):
+    - include_memories (bool, opt-in): If true, blends memory results with code results.
+    - memory_weight (float, default=1.0): Scales memory scores relative to code.
+    - per_source_limits (dict): Per-source result caps, e.g. {"code": 5, "memory": 3}.
 
-    Prefer repo_search; this name exists for discoverability in some IDEs/agents.
-    Same parameters and return shape as repo_search.
-    Reranking (rerank_enabled=true) is ON by default for optimal result quality.
+    RETURNS: Same schema as repo_search (context_search schema when include_memories is set).
     """
+    # If include_memories is requested, delegate to context_search for blending
+    if include_memories:
+        return await context_search(
+            query=query,
+            limit=limit,
+            per_path=per_path,
+            include_memories=include_memories,
+            memory_weight=memory_weight,
+            per_source_limits=per_source_limits,
+            include_snippet=include_snippet,
+            context_lines=context_lines,
+            rerank_enabled=rerank_enabled,
+            rerank_top_n=rerank_top_n,
+            rerank_return_m=rerank_return_m,
+            rerank_timeout_ms=rerank_timeout_ms,
+            highlight_snippet=highlight_snippet,
+            collection=collection,
+            language=language,
+            under=under,
+            kind=kind,
+            symbol=symbol,
+            path_regex=path_regex,
+            path_glob=path_glob,
+            not_glob=not_glob,
+            ext=ext,
+            not_=not_,
+            case=case,
+            session=session,
+            compact=compact,
+            kwargs=kwargs,
+        )
     return await repo_search(
         query=query,
         limit=limit,
@@ -1749,31 +2618,105 @@ async def info_request(
     output_format: Any = None,  # "json" (default) or "toon" for token-efficient format
     kwargs: Any = None,
 ) -> Dict[str, Any]:
-    """Simplified codebase retrieval with optional explanation mode.
+    """Simplified codebase discovery with optional explanation mode.
+
+    PRIMARY USE: Quick, single-parameter code search with human-readable results.
+    Designed as a drop-in replacement for basic "find code about X" queries.
+
+    CHOOSE THIS WHEN:
+    - You want a simple, one-parameter search interface
+    - You want results with human-readable "information" descriptions
+    - You want optional explanation mode for richer context
+    - You're building a simple integration and want minimal complexity
+
+    CHOOSE INSTEAD:
+    - repo_search -> when you need full control over filtering and parameters
+    - context_answer -> when you need an LLM-generated ANSWER, not just results
+    - symbol_graph -> when you need precise call/definition relationships
+
+    QUERY EXAMPLES:
+    Good queries (natural language descriptions):
+      "database connection pooling"     - finds DB connection code
+      "authentication middleware"       - finds auth-related code
+      "error handling patterns"         - finds error handling logic
+      "user input validation"           - finds validation code
+      "caching implementation"          - finds cache logic
+      "logging configuration"           - finds logging setup
+      "API endpoint handlers"           - finds route handlers
+
+    Bad queries (too vague or wrong format):
+      "code"                           - too vague
+      "the function"                   - unspecific
+      "*.py"                           - glob pattern, use path_glob param
+      "auth|login"                     - boolean syntax not supported
+      "line 42"                        - use file reading for specific lines
+
+    ESSENTIAL PARAMETERS:
+    - info_request (str): Natural language description of code you're looking for.
+    - information_request (str): Alias for info_request.
+
+    EXPLANATION MODE PARAMETERS:
+    - include_explanation (bool, default=False): When true, adds:
+      - summary: Brief overview of what was found
+      - primary_locations: Key file paths
+      - related_concepts: Technical concepts discovered
+      - query_understanding: How the query was interpreted
+
+    - include_relationships (bool, default=False): When true, adds to each result:
+      - imports_from: Modules this code imports
+      - calls: Functions this code calls
+      - related_paths: Related files
+
+    COMMON PARAMETERS:
+    - limit (int, default=10): Maximum results to return.
+    - language (str): Filter by language ("python", "typescript", etc.)
+    - under (str): Restrict to directory path.
+    - repo (str | list[str]): Filter by repo. Use "*" for all repos.
+    - output_format (str): "json" or "toon" for token-efficient format.
+    - include_snippet (bool, default=True): Include code snippets.
+
+    RETURNS (compact mode, default):
+    {
+        "ok": true,
+        "results": [
+            {
+                "information": "Found function 'authenticate' in src/auth.py (lines 42-67)",
+                "relevance_score": 0.85,   // Alias for score
+                "score": 0.85,
+                "path": "src/auth.py",
+                "symbol": "authenticate",
+                "start_line": 42,
+                "end_line": 67
+            }
+        ],
+        "total": 5
+    }
 
-    When to use:
-    - Simple, single-parameter code search with human-readable descriptions
-    - When you want optional explanation mode for richer context
-    - Drop-in replacement for basic codebase retrieval tools
-
-    Key parameters:
-    - info_request: str. Natural language description of the code you're looking for.
-    - information_request: str. Alias for info_request.
-    - include_explanation: bool (default false). Add summary, primary_locations, related_concepts.
-    - include_relationships: bool (default false). Add imports_from, calls, related_paths to results.
-    - limit: int (default 10). Maximum results to return.
-    - language: str. Filter by programming language.
-    - under: str. Limit search to specific directory.
-    - repo: str or list[str]. Filter by repository name(s).
-    - output_format: "json" (default) or "toon" for token-efficient TOON format.
+    RETURNS (explanation mode, include_explanation=True):
+    {
+        "ok": true,
+        "results": [...],
+        "summary": "Found 5 authentication-related functions across 3 files",
+        "primary_locations": ["src/auth.py", "src/middleware/auth.py"],
+        "related_concepts": ["jwt", "token", "session", "middleware"],
+        "query_understanding": "Looking for authentication implementation code",
+        "confidence": {
+            "level": "high",
+            "score": 0.82,
+            "symbol_matches": 3
+        }
+    }
 
-    Returns:
-    - Compact mode (default): results with information field and relevance_score alias
-    - Explanation mode: adds summary, primary_locations, related_concepts, query_understanding
+    USAGE PATTERNS:
+    # Simple discovery:
+    info_request(info_request="database connection")
 
-    Example:
-    - {"info_request": "database connection pooling"}
-    - {"info_request": "authentication middleware", "include_explanation": true}
+    # With explanation:
+    info_request(
+        info_request="authentication flow",
+        include_explanation=True,
+        include_relationships=True
+    )
     """
     # Resolve query from either parameter
     query = info_request or information_request
@@ -1964,29 +2907,90 @@ async def context_search(
     output_format: Any = None,
     kwargs: Any = None,
 ) -> Dict[str, Any]:
-    """Blend code search results with memory-store entries (notes, docs) for richer context.
-
-    When to use:
-    - You want code spans plus relevant memories in one response.
-    - Prefer repo_search for code-only; use context_answer when you need an LLM-written answer.
-
-    Key parameters:
-    - query: str or list[str]
-    - include_memories: bool (opt-in). If true, queries the memory collection and merges with code results.
-    - memory_weight: float (default 1.0). Scales memory scores relative to code.
-    - per_source_limits: dict, e.g. {"code": 5, "memory": 3}
-    - All repo_search filters are supported and passed through.
-    - output_format: "json" (default) or "toon" for token-efficient TOON format.
-    - rerank_enabled: bool (default true). ONNX reranker is ON by default for better relevance.
-    - repo: str or list[str]. Filter by repo name(s). Use "*" to search all repos (disable auto-filter).
-      By default, auto-detects current repo from CURRENT_REPO env and filters to it.
+    """Blend code search results with memory-store entries for richer context.
+
+    PRIMARY USE: Search code AND retrieve relevant stored memories/notes in one call.
+
+    CHOOSE THIS WHEN:
+    - You want code results PLUS relevant memories (notes, docs, decisions)
+    - You're searching for something where team knowledge might help
+    - You want to surface both implementation AND documentation/context
+    - You need to check if there are existing notes about a topic
+
+    CHOOSE INSTEAD:
+    - repo_search -> when you ONLY want code results (faster, no memory overhead)
+    - context_answer -> when you need an LLM-generated EXPLANATION
+    - memory_find -> when you ONLY want memories (no code search)
+
+    QUERY EXAMPLES:
+    Good queries (conceptual, topic-based):
+      "authentication design decisions"  - finds code + stored auth decisions
+      "API versioning strategy"          - finds API code + design notes
+      "database migration approach"      - finds migration code + notes
+      "cache invalidation policy"        - finds cache code + policy notes
+      "error handling conventions"       - finds error code + team standards
+
+    Bad queries (too narrow for memory blending):
+      "def authenticate("               - too specific, use repo_search
+      "class UserController"            - exact match, use repo_search
+      "line 42 of auth.py"              - specific location, just read the file
+      "git commit abc123"               - not a search query
+      "npm install express"             - command, not a query
+
+    ESSENTIAL PARAMETERS:
+    - query (str | list[str]): Natural language description of what you're looking for.
+
+    MEMORY BLENDING PARAMETERS:
+    - include_memories (bool, default=False): MUST SET TO TRUE to enable memory blending.
+      Without this, context_search behaves identically to repo_search.
+    - memory_weight (float, default=1.0): Scale memory scores relative to code.
+      Values >1.0 boost memories, <1.0 favor code results.
+    - per_source_limits (dict): Control results per source.
+      Example: {"code": 6, "memory": 3} returns max 6 code + 3 memory results.
+
+    COMMON PARAMETERS (same as repo_search):
+    - limit (int, default=10): Maximum total results.
+    - language (str): Filter code results by language.
+    - under (str): Restrict code search to directory path.
+    - include_snippet (bool, default=True): Include code snippets.
+    - rerank_enabled (bool, default=True): Cross-encoder reranking.
+    - output_format (str): "json" or "toon" for token-efficient format.
+    - repo (str | list[str]): Filter by repo. Use "*" for all repos.
+
+    RETURNS:
+    {
+        "ok": true,
+        "results": [
+            {
+                "source": "code",         // "code" or "memory"
+                "score": 0.85,
+                "path": "src/auth.py",    // For code results
+                "symbol": "authenticate",
+                "start_line": 42,
+                "end_line": 67,
+                "snippet": "def auth..."
+            },
+            {
+                "source": "memory",       // Memory results have different shape
+                "score": 0.78,
+                "content": "Auth uses JWT tokens with 24h expiry...",
+                "metadata": {"kind": "note", "created_at": "2024-..."}
+            }
+        ],
+        "total": 9,
+        "memory_note": "3 memories included"  // Optional note about memory results
+    }
 
-    Returns:
-    - {"results": [{"source": "code"| "memory", ...}, ...], "total": N[, "memory_note": str]}
-    - In compact mode, results are reduced to lightweight records.
+    USAGE PATTERN:
+    # To blend code + memories (recommended pattern):
+    context_search(
+        query="authentication architecture",
+        include_memories=True,
+        per_source_limits={"code": 5, "memory": 3}
+    )
 
-    Example:
-    - include_memories=true, per_source_limits={"code": 6, "memory": 2}, path_glob="docs/**"
+    # To search code only (same as repo_search):
+    context_search(query="authentication", include_memories=False)
     """
     return await _context_search_impl(
         query=query,
@@ -2072,34 +3076,110 @@ async def pattern_search(
     ) -> Dict[str, Any]:
         """Find structurally similar code patterns across all languages.
 
-        Accepts EITHER code examples OR natural language descriptions - auto-detects which.
-
-        When to use:
-        - Find code with similar control flow (retry loops, error handling, etc.)
-        - Cross-language pattern matching (Python pattern → Go/Rust/Java matches)
-        - Detect code duplication based on structure, not syntax
-        - Search by pattern description ("retry with backoff", "resource cleanup")
+        PRIMARY USE: Search by CODE STRUCTURE rather than text/semantics.
+        Finds code with similar control flow, API usage, or patterns.
+
+        CHOOSE THIS WHEN:
+        - You have a CODE EXAMPLE and want to find similar patterns
+        - You want to find code STRUCTURALLY similar (not just textually)
+        - You're searching across languages (Python pattern -> find in Go/Rust/Java)
+        - You want to detect code duplication based on structure
+        - You're searching for patterns like "retry with backoff", "singleton"
+
+        CHOOSE INSTEAD:
+        - repo_search -> when searching by CONCEPT, not structural pattern
+        - symbol_graph -> when looking for call/definition relationships
+        - context_answer -> when you need an EXPLANATION
+
+        QUERY EXAMPLES:
+
+        Code example mode (query_mode="code" or auto-detected):
+          "for i in range(3): try: ... except: time.sleep(2**i)"
+          "if err != nil { return err }"
+          "async function $NAME($$$) { await $EXPR; }"
+          "with open(file) as f: data = f.read()"
+          "try { ... } catch (e) { console.error(e); throw e; }"
+
+        Description mode (query_mode="description" or auto-detected):
+          "retry with exponential backoff"
+          "resource cleanup pattern"
+          "singleton implementation"
+          "factory pattern"
+          "decorator wrapping function"
+          "error handling with logging"
+          "connection pooling"
+          "rate limiting implementation"
+
+        Bad queries (wrong use case):
+          "authentication code"           - use repo_search for concepts
+          "who calls authenticate"        - use symbol_graph
+          "explain the auth flow"         - use context_answer
+          "files in src/"                 - use glob/file tools
+
+        ESSENTIAL PARAMETERS:
+        - query (str): EITHER a code example OR a natural language pattern description.
+          The mode is auto-detected, or you can force it with query_mode.
+
+        MODE CONTROL PARAMETERS:
+        - query_mode (str, default="auto"): How to interpret the query.
+          - "auto": Auto-detect if query is code or description
+          - "code": Force interpretation as code example
+          - "description": Force interpretation as pattern description
+        - language (str): Language hint for code examples. Also triggers code mode
+          in auto-detection. Example: "python", "go", "rust", "typescript"
+
+        COMMON PARAMETERS:
+        - limit (int, default=10): Maximum results to return.
+        - min_score (float, default=0.3): Minimum similarity score threshold.
+        - include_snippet (bool, default=True): Include code snippets in results.
+        - target_languages (list[str]): Filter results to specific languages.
+          Example: ["python", "go"] to find pattern only in Python and Go files.
+        - repo (str | list[str]): Filter by repo. Use "*" for all repos.
+        - output_format (str): "json" or "toon" for token-efficient format.
+        - compact (bool): Minimal response fields.
+
+        AROMA RERANKING PARAMETERS:
+        - aroma_rerank (bool, default=True): Enable AROMA-style pruning/reranking.
+          Improves precision by penalizing partial matches.
+        - aroma_alpha (float, default=0.6): Weight for pruned similarity vs original.
+          Higher values trust pruning more.
+
+        RETURNS:
+        {
+            "ok": true,
+            "results": [
+                {
+                    "path": "src/client.py",
+                    "start_line": 89,
+                    "end_line": 102,
+                    "score": 0.78,
+                    "language": "python",
+                    "snippet": "for attempt in range(max_retries):..."
+                }
+            ],
+            "total": 7,
+            "query_mode": "code",         // or "description"
+            "query_signature": "...",     // Internal: pattern signature used
+            "detection": {                // Mode detection metadata
+                "confidence": 0.95,
+                "ast_validated": true,
+                "signals": {"ast_parsed": 1.0, "nl_similarity": 0.42}
+            }
+        }
 
-        Key parameters:
-        - query: str. Code snippet OR natural language description of pattern.
-        - query_mode: str. "code", "description", or "auto" (default). Explicit override for detection.
-        - language: str. Language hint for code examples (also triggers code mode in auto).
-        - limit: int (default 10). Maximum results to return.
-        - min_score: float (default 0.3). Minimum similarity score threshold.
-        - include_snippet: bool (default false). Include code snippets in results.
-        - target_languages: list[str]. Filter to specific target languages.
-        - output_format: "json" (default) or "toon" for token-efficient format.
-        - compact: bool. If true with TOON, use minimal fields.
-        - aroma_rerank: bool (default true). Enable AROMA-style pruning and reranking.
-        - aroma_alpha: float (default 0.6). Weight for pruned similarity vs original score.
-
-        Returns:
-        - {ok, results: [{path, start_line, end_line, score, language, ...}], total, query_signature}
+        CROSS-LANGUAGE EXAMPLE:
+        # Find Python/Rust/Java code structurally similar to a Go error-handling pattern
+        pattern_search(
+            query="if err != nil { return err }",
+            language="go",
+            target_languages=["python", "rust", "java"]
+        )
 
-        Examples:
-        - pattern_search(query="for i in range(3): try: ... except: time.sleep(2**i)")
-        - pattern_search(query="retry with exponential backoff", query_mode="description")
-        - pattern_search(query="if err != nil { return err }", language="go")
+        NOTES:
+        - Pattern vectors must be indexed (PATTERN_VECTORS=1 during indexing)
+        - Auto-detection uses AST parsing + NL embedder comparison
+        - Code mode uses structural pattern matching
+        - Description mode uses semantic search on pattern descriptions
         """
         return await _pattern_search_impl(
             query=query,
@@ -2280,6 +3360,11 @@ async def neo4j_graph_query(
     transport = os.environ.get("FASTMCP_TRANSPORT", "sse").strip().lower()
     # Enable stateless HTTP mode to avoid session handshake requirement
     stateless_http = str(os.environ.get("FASTMCP_STATELESS_HTTP", "1")).strip().lower() in {"1", "true", "yes", "on"}
+    
+    # Add auth header extraction middleware for HTTP transports
+    if transport != "stdio":
+        _add_auth_middleware()
+    
     if transport == "stdio":
         # Run over stdio (for clients that don't support network transports)
         mcp.run(transport="stdio")
diff --git a/scripts/mcp_memory_server.py b/scripts/mcp_memory_server.py
index b11607f4..61880669 100644
--- a/scripts/mcp_memory_server.py
+++ b/scripts/mcp_memory_server.py
@@ -43,6 +43,7 @@
 from scripts.mcp_auth import (
     require_auth_session as _require_auth_session,
     require_collection_access as _require_collection_access,
+    AUTH_HEADER_TOKEN as _AUTH_HEADER_TOKEN,
 )
 
 from qdrant_client import QdrantClient, models
@@ -68,6 +69,7 @@
 LEX_VECTOR_NAME = os.environ.get("LEX_VECTOR_NAME", "lex")
 LEX_VECTOR_DIM = int(os.environ.get("LEX_VECTOR_DIM", "4096") or 4096)
 EMBEDDING_MODEL = os.environ.get("EMBEDDING_MODEL", "BAAI/bge-base-en-v1.5")
+MEMORY_FIND_LIMIT_DEFAULT = int(os.environ.get("MEMORY_FIND_LIMIT_DEFAULT", "10") or 10)  # memory_find fallback when neither limit nor top_k is given; an empty env value also yields 10
 
 # Minimal embedding via fastembed (CPU)
 
@@ -137,6 +139,58 @@ def _ensure_once(name: str) -> bool:
         return False
 
 # Disable DNS rebinding protection - breaks Docker internal networking (Host: mcp:8000)
+TOOLS_METADATA: Dict[str, Dict] = {  # static per-tool descriptors keyed by tool name; do_GET merges them into each tool entry and also returns the whole dict as "metadata"
+    "memory_store": {
+        "name": "memory_store",
+        "category": "memory",
+        "primary_use": "Store knowledge for later retrieval",
+        "choose_when": [
+            "Storing team decisions/notes",
+            "Documenting conventions",
+            "Building institutional memory",
+        ],
+        "choose_instead": {},  # no alternative tool is suggested for storing
+        "parameters": {
+            "essential": ["information"],
+            "common": ["metadata"],
+            "advanced": ["collection", "session"],
+        },
+        "returns": {"ok": "bool", "id": "str", "message": "str"},  # values are type names written as strings, not real types
+        "related_tools": ["memory_find", "context_search"],
+        "performance": {
+            "typical_latency_ms": (50, 500),  # presumably a (low, high) range in ms — TODO confirm; json.dumps emits tuples as arrays
+            "requires_index": False,
+            "requires_decoder": False,
+        },
+    },
+    "memory_find": {
+        "name": "memory_find",
+        "category": "memory",
+        "primary_use": "Retrieve stored memories by similarity",
+        "choose_when": [
+            "Looking for stored notes/decisions",
+            "Recalling team knowledge",
+        ],
+        "choose_instead": {"context_search": "Want code + memories together"},  # maps alternative tool name -> when to prefer it
+        "parameters": {
+            "essential": ["query"],
+            "common": ["limit", "kind", "topic", "tags"],
+            "advanced": ["priority_min", "collection"],
+        },
+        "returns": {
+            "ok": "bool",
+            "results": "list[{id, information, metadata, score}]",
+            "total": "int",
+        },
+        "related_tools": ["memory_store", "context_search"],
+        "performance": {
+            "typical_latency_ms": (50, 300),  # presumably a (low, high) range in ms — TODO confirm
+            "requires_index": False,
+            "requires_decoder": False,
+        },
+    },
+}  # NOTE(review): the "Disable DNS rebinding protection" comment above this dict describes _security_settings below, not TOOLS_METADATA
+
 _security_settings = (
     TransportSecuritySettings(enable_dns_rebinding_protection=False)
     if TransportSecuritySettings
@@ -144,7 +198,49 @@ def _ensure_once(name: str) -> bool:
 )
 mcp = FastMCP(name="memory-server", transport_security=_security_settings)
 
-# Capture tool registry automatically by wrapping the decorator once
+
+class _AuthHeaderASGIMiddleware:
+    """Pure ASGI middleware that extracts Authorization header into context var."""
+    def __init__(self, app):
+        self.app = app  # wrapped ASGI app; every call is delegated to it
+    
+    async def __call__(self, scope, receive, send):
+        if scope["type"] == "http":  # other scope types pass straight through without touching the context var
+            headers = dict(scope.get("headers", []))  # header names/values are bytes here (see the b"authorization" lookup); dict() keeps the last value per name
+            auth_header = headers.get(b"authorization", b"").decode("utf-8", errors="ignore")  # missing header -> ""; undecodable bytes are dropped
+            if auth_header.lower().startswith("bearer "):  # scheme match is case-insensitive
+                token = auth_header[7:].strip()  # 7 == len("bearer ")
+            else:
+                token = auth_header.strip() if auth_header else ""  # no Bearer prefix: the whole header value is used as the token
+            _AUTH_HEADER_TOKEN.set(token)  # set on every HTTP request, so a request without the header stores ""
+        return await self.app(scope, receive, send)
+
+
+def _add_auth_middleware():
+    """Wrap FastMCP's ASGI app with auth header extraction middleware."""
+    logger.info("Setting up auth header middleware...")
+    try:
+        if hasattr(mcp, "streamable_http_app"):  # patch only the factories this FastMCP object exposes
+            _orig_streamable = mcp.streamable_http_app  # keep the original factory so the wrapper can delegate to it
+            def _patched_streamable(*args, **kwargs):
+                app = _orig_streamable(*args, **kwargs)
+                logger.info(f"Wrapping streamable_http_app with auth middleware")
+                return _AuthHeaderASGIMiddleware(app)  # callers of the factory now receive the wrapped app
+            mcp.streamable_http_app = _patched_streamable  # replaces the attribute on the module-level mcp object
+        
+        if hasattr(mcp, "sse_app"):  # same treatment for the SSE app factory
+            _orig_sse = mcp.sse_app
+            def _patched_sse(*args, **kwargs):
+                app = _orig_sse(*args, **kwargs)
+                logger.info(f"Wrapping sse_app with auth middleware")
+                return _AuthHeaderASGIMiddleware(app)
+            mcp.sse_app = _patched_sse
+        
+        logger.info("Patched FastMCP app factory methods for auth middleware injection")  # logged even if neither attribute existed
+    except Exception as e:  # best-effort: a patching failure is logged as a warning, not raised
+        logger.warning(f"Failed to patch FastMCP for auth middleware: {e}")
+
+
 _TOOLS_REGISTRY: list[dict] = []
 try:
     _orig_tool = mcp.tool
@@ -251,7 +347,12 @@ def do_GET(self):
                         self.send_response(200)
                         self.send_header("Content-Type", "application/json")
                         self.end_headers()
-                        payload = {"ok": True, "tools": _TOOLS_REGISTRY}
+                        enriched = []
+                        for t in _TOOLS_REGISTRY:
+                            name = t.get("name", "")
+                            meta = TOOLS_METADATA.get(name, {})
+                            enriched.append({**t, **meta})
+                        payload = {"ok": True, "tools": enriched, "metadata": TOOLS_METADATA}
                         self.wfile.write((json.dumps(payload)).encode("utf-8"))
                     else:
                         self.send_response(404)
@@ -436,6 +537,12 @@ def set_session_defaults(
     mode: Optional[str] = None,
     language: Optional[str] = None,
     under: Optional[str] = None,
+    repo: Any = None,
+    compact: Any = None,
+    output_format: Optional[str] = None,
+    include_snippet: Any = None,
+    rerank_enabled: Any = None,
+    limit: Any = None,
     ctx: Context = None,
     kwargs: Any = None,
 ) -> Dict[str, Any]:
@@ -447,6 +554,19 @@ def set_session_defaults(
     - Optionally, also supports a lightweight token for clients that prefer cross-connection reuse.
 
     Precedence everywhere: explicit collection > per-connection defaults > token defaults > env default.
+
+    Parameters:
+    - collection: Default collection name
+    - mode: Search mode hint
+    - under: Default path prefix filter
+    - language: Default language filter
+    - repo: Default repo filter for multi-repo setups
+    - compact: Default compact response mode (bool)
+    - output_format: Default output format ("json" or "toon")
+    - include_snippet: Default snippet inclusion (bool)
+    - rerank_enabled: Default reranking toggle (bool)
+    - limit: Default result limit (int)
+    - session: Session token for cross-connection reuse
     """
     # Handle kwargs payload from some clients
     try:
@@ -467,6 +587,18 @@ def set_session_defaults(
                 under = _extra["under"]
             if not session and _extra.get("session"):
                 session = _extra["session"]
+            if repo is None and _extra.get("repo"):
+                repo = _extra["repo"]
+            if compact is None and _extra.get("compact") is not None:
+                compact = _extra["compact"]
+            if not output_format and _extra.get("output_format"):
+                output_format = _extra["output_format"]
+            if include_snippet is None and _extra.get("include_snippet") is not None:
+                include_snippet = _extra["include_snippet"]
+            if rerank_enabled is None and _extra.get("rerank_enabled") is not None:
+                rerank_enabled = _extra["rerank_enabled"]
+            if limit is None and _extra.get("limit") is not None:
+                limit = _extra["limit"]
     except Exception as e:
         logger.debug(f"Suppressed exception: {e}")
 
@@ -480,6 +612,23 @@ def set_session_defaults(
         defaults["language"] = language.strip()
     if isinstance(under, str) and under.strip():
         defaults["under"] = under.strip()
+    if isinstance(repo, str) and repo.strip():
+        defaults["repo"] = repo.strip()
+    elif isinstance(repo, list):
+        defaults["repo"] = repo
+    if isinstance(output_format, str) and output_format.strip():
+        defaults["output_format"] = output_format.strip()
+    if compact is not None:
+        defaults["compact"] = bool(compact) if not isinstance(compact, bool) else compact
+    if include_snippet is not None:
+        defaults["include_snippet"] = bool(include_snippet) if not isinstance(include_snippet, bool) else include_snippet
+    if rerank_enabled is not None:
+        defaults["rerank_enabled"] = bool(rerank_enabled) if not isinstance(rerank_enabled, bool) else rerank_enabled
+    if limit is not None:
+        try:
+            defaults["limit"] = int(limit)
+        except (ValueError, TypeError):
+            pass
 
     # Store per-connection (preferred, no token required)
     try:
@@ -521,9 +670,107 @@ def memory_store(
     session: Optional[str] = None,
     ctx: Context = None,
 ) -> Dict[str, Any]:
-    """Store a memory entry into Qdrant (dual vectors consistent with indexer).
+    """Store knowledge/notes into the memory system for later retrieval.
+
+    PRIMARY USE: Persist team knowledge, decisions, conventions, or notes
+    that should be retrievable alongside code search results.
+
+    CHOOSE THIS WHEN:
+    - You want to store a decision or convention for future reference
+    - You're documenting why code works a certain way
+    - You want to persist knowledge that context_search can find
+    - You're building institutional memory for the codebase
+
+    WHAT TO STORE:
+    Good candidates for memory storage:
+      - Architecture decisions: "We use JWT for auth because..."
+      - Conventions: "All API responses follow the envelope pattern..."
+      - Gotchas: "The cache has a 5-minute TTL, not configurable..."
+      - Debugging notes: "If X fails, check Y first..."
+      - Integration details: "External API requires header Z..."
+      - Performance notes: "This query is O(n^2), optimize for large N..."
+
+    Bad candidates (don't store these):
+      - Code itself (it's already indexed)
+      - Temporary debug output
+      - Personal notes not relevant to the codebase
+      - Sensitive data (passwords, keys, secrets)
+
+    ESSENTIAL PARAMETERS:
+    - information (str): The knowledge/note to store. Should be clear,
+      self-contained text that will be useful when retrieved later.
+
+    METADATA PARAMETERS:
+    - metadata (dict): Optional structured metadata for filtering.
+      Common keys:
+      - kind: "note", "decision", "convention", "gotcha", "policy"
+      - topic: Subject area ("auth", "caching", "api", "database")
+      - priority: Importance (1=low, 5=high)
+      - tags: List of tags for filtering
+      - author: Who wrote this note
+
+      Auto-added if not provided:
+      - created_at: ISO timestamp
+      - kind: "memory" (default)
+      - source: "memory" (default)
+
+    SESSION PARAMETERS:
+    - collection (str): Target collection. Defaults to workspace collection.
+    - session (str): Session token for multi-user scenarios.
+
+    RETURNS:
+    {
+        "ok": true,
+        "id": "abc123...",           // Unique ID for this memory
+        "message": "Successfully stored information",
+        "collection": "codebase",
+        "vector": "bge-base-en-v1-5"  // Embedding model used
+    }
 
-    First call may be slower because the embedding model loads lazily.
+    USAGE PATTERNS:
+
+    # Store an architecture decision
+    memory_store(
+        information="We chose FastAPI over Flask because we need async support "
+        "for the WebSocket handlers and automatic OpenAPI documentation.",
+        metadata={
+            "kind": "decision",
+            "topic": "api",
+            "tags": ["framework", "architecture"]
+        }
+    )
+
+    # Store a debugging gotcha
+    memory_store(
+        information="If authentication fails silently, check that the JWT_SECRET "
+        "env var is set. The auth middleware swallows exceptions.",
+        metadata={
+            "kind": "gotcha",
+            "topic": "auth",
+            "priority": 4
+        }
+    )
+
+    # Store a convention
+    memory_store(
+        information="All database queries must use parameterized statements. "
+        "Raw string interpolation is forbidden for security.",
+        metadata={
+            "kind": "convention",
+            "topic": "database",
+            "tags": ["security", "sql"]
+        }
+    )
+
+    RETRIEVAL:
+    Stored memories can be retrieved via:
+    - memory_find(query="...") -> searches only memories
+    - context_search(query="...", include_memories=True) -> code + memories
+
+    NOTES:
+    - First call may be slower due to embedding model loading
+    - Memories are embedded using the same model as code for consistent search
+    - Duplicate content is not deduplicated; avoid storing the same thing twice
     """
     sess = _require_auth_session(session)
     coll = _resolve_collection(collection, session=session, ctx=ctx)
@@ -587,10 +834,97 @@ def memory_find(
     priority_min: Optional[int] = None,
     ctx: Context = None,
 ) -> Dict[str, Any]:
-    """Find memory-like entries by vector similarity (dense + lexical fusion).
+    """Retrieve stored memories/notes by semantic similarity.
+
+    PRIMARY USE: Find previously stored knowledge, decisions, or notes.
+    Searches ONLY the memory store, not code.
+
+    CHOOSE THIS WHEN:
+    - You want to find previously stored notes/decisions
+    - You're looking for team knowledge without code results
+    - You want to filter memories by metadata (kind, topic, tags)
+    - You need to recall specific documented information
+
+    CHOOSE INSTEAD:
+    - context_search with include_memories=True -> when you want code + memories
+    - repo_search -> when you want code only, no memories
+
+    QUERY EXAMPLES:
+    Good queries (conceptual, knowledge-seeking):
+      "authentication decisions"      - finds auth-related notes
+      "why we chose this approach"    - finds decision rationale
+      "database performance tips"     - finds DB-related notes
+      "API design conventions"        - finds API conventions
+      "deployment gotchas"            - finds deployment notes
+
+    Bad queries:
+      "def authenticate"              - code fragment, use repo_search
+      "src/auth.py"                   - file path, not a memory query
+      "UserService"                   - class name, use repo_search
+
+    ESSENTIAL PARAMETERS:
+    - query (str): Natural language description of what you're looking for.
+
+    ALTERNATIVE QUERY PARAMETERS:
+    - q (str): Alias for query.
+    - top_k (int): Alias for limit.
+
+    FILTER PARAMETERS:
+    - kind (str): Filter by memory kind.
+      Values: "note", "decision", "convention", "gotcha", "policy", "preference"
+    - topic (str): Filter by topic/subject area.
+      Example: "auth", "database", "api", "caching"
+    - tags (str | list[str]): Filter by tags.
+      Example: "security" or ["security", "sql"]
+    - language (str): Filter by programming language context.
+    - priority_min (int): Minimum priority (1-5). Higher = more important.
+
+    COMMON PARAMETERS:
+    - limit (int, default=10): Maximum results to return. The default is configurable via the MEMORY_FIND_LIMIT_DEFAULT env var.
+    - collection (str): Target collection. Defaults to workspace collection.
+    - session (str): Session token for multi-user scenarios.
+
+    RETURNS:
+    {
+        "ok": true,
+        "results": [
+            {
+                "id": "abc123...",
+                "information": "We chose JWT for authentication because...",
+                "metadata": {
+                    "kind": "decision",
+                    "topic": "auth",
+                    "created_at": "2024-01-15T10:30:00Z",
+                    "tags": ["security", "architecture"]
+                },
+                "score": 0.85,
+                "highlights": ["...chose <<JWT>> for <<authentication>>..."]
+            }
+        ],
+        "total": 3,
+        "count": 3,
+        "query": "authentication decisions"
+    }
+
+    USAGE PATTERNS:
+
+    # Find all authentication-related notes
+    memory_find(query="authentication", topic="auth")
+
+    # Find high-priority gotchas
+    memory_find(query="common issues", kind="gotcha", priority_min=4)
 
-    Cold-start option: set MEMORY_COLD_SKIP_DENSE=1 to skip dense embedding until the
-    model is cached (useful on slow storage).
+    # Find security-related conventions
+    memory_find(query="security best practices", kind="convention", tags="security")
+
+    # Find recent decisions
+    memory_find(query="recent architecture decisions", kind="decision", limit=10)
+
+    NOTES:
+    - Cold start: First call may be slower if embedding model isn't cached
+    - Set MEMORY_COLD_SKIP_DENSE=1 to skip dense embedding on cold start
+    - Highlights show query term matches in context
+    - Results are ranked by hybrid similarity (dense + lexical fusion)
     """
     # Handle 'q' alias for query
     if not query and q:
@@ -612,7 +946,7 @@ def memory_find(
     lex = _lex_hash_vector_text(str(query), LEX_VECTOR_DIM)
 
     # Harmonize alias: top_k -> limit
-    lim = int(limit if limit is not None else (top_k if top_k is not None else 5))
+    lim = int(limit if limit is not None else (top_k if top_k is not None else MEMORY_FIND_LIMIT_DEFAULT))
 
     # Build Qdrant filter
     must = []
@@ -829,6 +1163,11 @@ def _resolve_collection(
 
     # Enable stateless HTTP mode to avoid session handshake requirement
     stateless_http = str(os.environ.get("FASTMCP_STATELESS_HTTP", "1")).strip().lower() in {"1", "true", "yes", "on"}
+    
+    # Add auth header extraction middleware for HTTP transports
+    if transport != "stdio":
+        _add_auth_middleware()
+    
     if transport == "stdio":
         # Run over stdio (for clients that don't support network transports)
         mcp.run(transport="stdio")
diff --git a/scripts/qdrant_client_manager.py b/scripts/qdrant_client_manager.py
index 78cc716b..3e901cef 100644
--- a/scripts/qdrant_client_manager.py
+++ b/scripts/qdrant_client_manager.py
@@ -9,7 +9,7 @@
 import threading
 import time
 import weakref
-from typing import Optional, Dict, List
+from typing import Optional, Dict, List, Any
 from contextlib import contextmanager
 from qdrant_client import QdrantClient
 
@@ -17,6 +17,33 @@
 # Connection pool implementation
 
 logger = logging.getLogger(__name__)
+
+
+def _get_qdrant_timeout() -> Optional[float]:
+    """Return the configured Qdrant HTTP timeout (seconds) if set."""
+    raw = os.environ.get("QDRANT_TIMEOUT") or os.environ.get("QDRANT_CLIENT_TIMEOUT")  # QDRANT_TIMEOUT wins; the second name is used only when the first is unset or empty
+    if not raw:
+        return None  # neither variable is set (or both are empty)
+    try:
+        timeout = float(raw)
+        return timeout if timeout > 0 else None  # zero and negative values are treated as "not configured"
+    except (TypeError, ValueError):
+        logger.debug("Invalid Qdrant timeout value '%s'; ignoring", raw)  # unparsable values are ignored rather than raised
+        return None
+
+
+def _client_kwargs(url: str, api_key: Optional[str]) -> Dict[str, Any]:
+    """Build kwargs dict for QdrantClient constructor with optional timeout."""
+    kwargs: Dict[str, Any] = {
+        "url": url,
+        "api_key": api_key if api_key else None,  # an empty string is normalized to None
+    }
+    timeout = _get_qdrant_timeout()  # re-read from the environment on every call
+    if timeout is not None:
+        kwargs["timeout"] = timeout  # the key is added only when a positive timeout is configured
+    return kwargs
+
+
 class QdrantConnectionPool:
     """Thread-safe connection pool for QdrantClient instances."""
     
@@ -49,7 +76,7 @@ def get_client(self, url: str, api_key: Optional[str] = None) -> QdrantClient:
             
             # No suitable client found, create a new one
             if self._created_count < self.max_size:
-                client = QdrantClient(url=url, api_key=api_key)
+                client = QdrantClient(**_client_kwargs(url, api_key))
                 pool_entry = {
                     'client': client,
                     'url': url,
@@ -66,7 +93,7 @@ def get_client(self, url: str, api_key: Optional[str] = None) -> QdrantClient:
                 # Pool is full, create a temporary client (not pooled)
                 # Mark it for tracking so return_client can close it
                 self._misses += 1
-                temp_client = QdrantClient(url=url, api_key=api_key)
+                temp_client = QdrantClient(**_client_kwargs(url, api_key))
                 # Track temporary clients with weakref so they auto-close
                 self._temp_clients.add(temp_client)
                 return temp_client
@@ -197,13 +224,13 @@ def get_qdrant_client(
     
     # Fallback to singleton pattern for backward compatibility
     if force_new:
-        return QdrantClient(url=url, api_key=api_key if api_key else None)
-    
+        return QdrantClient(**_client_kwargs(url, api_key))
+
     global _client
     
     with _client_lock:
         if _client is None:
-            _client = QdrantClient(url=url, api_key=api_key if api_key else None)
+            _client = QdrantClient(**_client_kwargs(url, api_key))
         return _client
 
 
diff --git a/scripts/remote_upload_client.py b/scripts/remote_upload_client.py
index a1a22279..0a065b22 100644
--- a/scripts/remote_upload_client.py
+++ b/scripts/remote_upload_client.py
@@ -580,7 +580,10 @@ def detect_file_changes(self, changed_paths: List[Path]) -> Dict[str, List]:
                 # Skip paths that cannot be resolved
                 continue
 
-            cached_hash = get_cached_file_hash(abs_path, self.repo_name)
+            # Translate to container path for cache lookup (cache stores container paths)
+            # This handles the case where bridge runs locally but cache was created in container
+            cache_key = self._translate_to_container_path(abs_path)
+            cached_hash = get_cached_file_hash(cache_key, self.repo_name)
 
             if not path.exists():
                 # File was deleted
@@ -631,12 +634,12 @@ def detect_file_changes(self, changed_paths: List[Path]) -> Dict[str, List]:
                 # Unchanged (content same despite stat change)
                 changes["unchanged"].append(path)
 
-            # Update caches
+            # Update caches: the stat cache stays keyed by the local abs_path; the hash cache uses the container-path cache_key
             try:
                 self._stat_cache[abs_path] = (getattr(stat, "st_mtime_ns", int(stat.st_mtime * 1e9)), stat.st_size)
             except Exception as e:
                 logger.debug(f"Suppressed exception: {e}")
-            set_cached_file_hash(abs_path, current_hash, self.repo_name)
+            set_cached_file_hash(cache_key, current_hash, self.repo_name)
 
         # Detect moves by looking for files with same content hash
         # but different paths (requires additional tracking)
@@ -662,7 +665,9 @@ def _detect_moves(self, created_files: List[Path], deleted_files: List[Path]) ->
         for deleted_path in deleted_files:
             try:
                 # Try to get cached hash first, fallback to file content
-                cached_hash = get_cached_file_hash(str(deleted_path), self.repo_name)
+                # Use container path for cache lookup (cache stores container paths)
+                cache_key = self._translate_to_container_path(str(deleted_path))
+                cached_hash = get_cached_file_hash(cache_key, self.repo_name)
                 if cached_hash:
                     deleted_hashes[cached_hash] = deleted_path
                     continue
@@ -777,7 +782,9 @@ def create_delta_bundle(
                         content = f.read()
                     file_hash = hashlib.sha1(content).hexdigest()
                     content_hash = f"sha1:{file_hash}"
-                    previous_hash = get_cached_file_hash(str(path.resolve()), self.repo_name)
+                    # Use container path for cache lookup (cache stores container paths)
+                    cache_key = self._translate_to_container_path(str(path.resolve()))
+                    previous_hash = get_cached_file_hash(cache_key, self.repo_name)
 
                     # Write file to bundle
                     bundle_file_path = files_dir / "updated" / rel_path
@@ -853,7 +860,9 @@ def create_delta_bundle(
             for path in changes["deleted"]:
                 rel_path = path.relative_to(Path(self.workspace_path)).as_posix()
                 try:
-                    previous_hash = get_cached_file_hash(str(path.resolve()), self.repo_name)
+                    # Use container path for cache lookup (cache stores container paths)
+                    cache_key = self._translate_to_container_path(str(path.resolve()))
+                    previous_hash = get_cached_file_hash(cache_key, self.repo_name)
 
                     operation = {
                         "operation": "deleted",
diff --git a/scripts/upload_service.py b/scripts/upload_service.py
index 3c718ffd..fc8e66fa 100644
--- a/scripts/upload_service.py
+++ b/scripts/upload_service.py
@@ -303,6 +303,17 @@ class AuthUserCreateResponse(BaseModel):
     username: str
 
 
+class AuthValidateRequest(BaseModel):  # request body accepted by POST /auth/validate
+    session_id: str  # session ID to validate; the endpoint strips surrounding whitespace
+
+
+class AuthValidateResponse(BaseModel):  # response body returned by POST /auth/validate
+    valid: bool  # True when auth is disabled or a session record was found
+    user_id: Optional[str] = None  # "user_id" of the session record, when one was found
+    expires_at: Optional[int] = None  # "expires_at" of the session record (units not shown here)
+    metadata: Optional[Dict[str, Any]] = None  # "metadata" of the session record
+
+
 class PasswordLoginRequest(BaseModel):
     username: str
     password: str
@@ -599,6 +610,41 @@ async def auth_login(payload: AuthLoginRequest):
     )
 
 
+@app.post("/auth/validate", response_model=AuthValidateResponse)
+async def auth_validate(payload: AuthValidateRequest):
+    """Report whether a session ID is valid, with its session details.
+
+    Lets remote MCP servers check a session against the auth backend that
+    issued it. Unexpected failures are logged and surfaced as HTTP 500.
+    """
+    try:
+        if not AUTH_ENABLED:
+            # Auth is off: report every session as valid, with no details
+            return AuthValidateResponse(valid=True, user_id=None, expires_at=None, metadata=None)
+
+        session_id = (payload.session_id or "").strip()
+        if session_id == "":
+            return AuthValidateResponse(valid=False)
+
+        try:
+            session = validate_session(session_id)
+        except AuthDisabledError:
+            # The session layer reports auth as disabled: treat as valid
+            return AuthValidateResponse(valid=True, user_id=None, expires_at=None, metadata=None)
+
+        if session is None:
+            return AuthValidateResponse(valid=False)
+        return AuthValidateResponse(
+            valid=True,
+            user_id=session.get("user_id"),
+            expires_at=session.get("expires_at"),
+            metadata=session.get("metadata"),
+        )
+    except Exception as exc:
+        logger.error(f"[upload_service] Failed to validate session: {exc}")
+        raise HTTPException(status_code=500, detail="Failed to validate session")
+
+
 @app.get("/admin")
 async def admin_root(request: Request):
     if not AUTH_ENABLED:
diff --git a/tests/test_context_answer.py b/tests/test_context_answer.py
index 9aa74c4b..022c5d85 100644
--- a/tests/test_context_answer.py
+++ b/tests/test_context_answer.py
@@ -1,3 +1,4 @@
+import asyncio
 import importlib
 import types
 import pytest
@@ -51,7 +52,7 @@ def generate_with_soft_embeddings(self, prompt: str, max_tokens: int = 256, **kw
     monkeypatch.setattr(ref, "LlamaCppRefragClient", FakeLlama)
     monkeypatch.setattr(ref, "is_decoder_enabled", lambda: True)
 
-    out = srv.asyncio.get_event_loop().run_until_complete(
+    out = asyncio.run(
         srv.context_answer(query="how to do x", limit=2, per_path=1)
     )
 
@@ -82,7 +83,7 @@ def generate_with_soft_embeddings(self, *a, **k):
     monkeypatch.setattr(ref, "LlamaCppRefragClient", FakeLlama)
     monkeypatch.setattr(ref, "is_decoder_enabled", lambda: False)
 
-    out = srv.asyncio.get_event_loop().run_until_complete(
+    out = asyncio.run(
         srv.context_answer(query="how to do y", limit=1)
     )
 
@@ -130,7 +131,7 @@ def generate_with_soft_embeddings(self, prompt: str, max_tokens: int = 256, **kw
     monkeypatch.setattr(ref, "LlamaCppRefragClient", FakeLlama)
     monkeypatch.setattr(ref, "is_decoder_enabled", lambda: True)
 
-    out = srv.asyncio.get_event_loop().run_until_complete(
+    out = asyncio.run(
         srv.context_answer(query="what is RRF_K in hybrid_search.py?", limit=1, per_path=1)
     )
 
@@ -179,7 +180,7 @@ def generate_with_soft_embeddings(self, *a, **kw):
     monkeypatch.setattr(ref, "LlamaCppRefragClient", FakeLlama)
     monkeypatch.setattr(ref, "is_decoder_enabled", lambda: True)
 
-    out = srv.asyncio.get_event_loop().run_until_complete(
+    out = asyncio.run(
         srv.context_answer(query="RRF_K", limit=1, per_path=1)
     )
 
@@ -214,7 +215,7 @@ def _raise_retrieval(*a, **k):
 
     monkeypatch.setattr(srv, "_ca_prepare_filters_and_retrieve", _raise_retrieval)
 
-    out = srv.asyncio.get_event_loop().run_until_complete(
+    out = asyncio.run(
         srv.context_answer(query="x", limit=1, per_path=1)
     )
     assert "error" in out
@@ -245,7 +246,7 @@ def _fake_retrieval(*a, **k):
     import scripts.refrag_llamacpp as ref
     monkeypatch.setattr(ref, "is_decoder_enabled", lambda: False)
 
-    out2 = srv.asyncio.get_event_loop().run_until_complete(
+    out2 = asyncio.run(
         srv.context_answer(query="x", limit=1, per_path=1)
     )
     assert isinstance(out2, dict)
diff --git a/tests/test_context_answer_path_mention.py b/tests/test_context_answer_path_mention.py
index 59299b8d..b0e80f75 100644
--- a/tests/test_context_answer_path_mention.py
+++ b/tests/test_context_answer_path_mention.py
@@ -30,7 +30,7 @@ def generate_with_soft_embeddings(self, prompt: str, max_tokens: int = 64, **kw)
 
     # Mention an actual file in this repo so fallback can find it
     q = "explain something in scripts/hybrid_search.py"
-    out = srv.asyncio.get_event_loop().run_until_complete(
+    out = srv.asyncio.run(
         srv.context_answer(query=q, limit=3, per_path=2)
     )
     assert isinstance(out, dict)
diff --git a/tests/test_env_behavior.py b/tests/test_env_behavior.py
index b5daa6f3..3803f210 100644
--- a/tests/test_env_behavior.py
+++ b/tests/test_env_behavior.py
@@ -20,6 +20,8 @@ def test_rerank_timeout_floor_and_env_defaults(monkeypatch):
     monkeypatch.setenv("RERANK_TIMEOUT_FLOOR_MS", "1500")
     # Fix default timeout for test determinism (CI may set a higher value)
     monkeypatch.setenv("RERANKER_TIMEOUT_MS", "200")
+    # Override the min clamp so the floor takes effect
+    monkeypatch.setenv("RERANK_TIMEOUT_MIN_MS", "0")
 
     # Fake _run_async to capture calls
     calls = []
@@ -47,7 +49,7 @@ async def fake_run(cmd, env=None, timeout=None):
     monkeypatch.setattr(srv, "_run_async", fake_run)
 
     # Call repo_search with no rerank_enabled arg to pick env default
-    res = srv.asyncio.get_event_loop().run_until_complete(
+    res = srv.asyncio.run(
         srv.repo_search(query="foo", limit=3, per_path=1)
     )
 
diff --git a/tests/test_error_paths.py b/tests/test_error_paths.py
index 0beb9d4b..ad4d04e4 100644
--- a/tests/test_error_paths.py
+++ b/tests/test_error_paths.py
@@ -17,7 +17,7 @@ async def fake_run(cmd, **kwargs):
 
     monkeypatch.setattr(srv, "_run_async", fake_run)
 
-    res = srv.asyncio.get_event_loop().run_until_complete(
+    res = srv.asyncio.run(
         srv.repo_search(queries=["x"], limit=1, compact=False, lean=False)
     )
 
@@ -49,7 +49,7 @@ async def fake_run(cmd, **kwargs):
 
     monkeypatch.setattr(srv, "_run_async", fake_run)
 
-    res = srv.asyncio.get_event_loop().run_until_complete(
+    res = srv.asyncio.run(
         srv.repo_search(queries=["x"], limit=1, compact=True, lean=False)
     )
 
diff --git a/tests/test_globs_and_snippet.py b/tests/test_globs_and_snippet.py
index 2486e1a0..9eb7fcbb 100644
--- a/tests/test_globs_and_snippet.py
+++ b/tests/test_globs_and_snippet.py
@@ -223,6 +223,6 @@ def run_hybrid_search(**kwargs):
 @pytest.mark.unit
 def test_repo_search_docstring_clean():
     doc = srv.repo_search.__doc__
-    assert doc and "Zero-config code search" in doc
+    assert doc and "Primary hybrid semantic" in doc
     # Ensure stray inline pseudo-code is not embedded in docstring
     assert "Accept common alias keys from clients" not in doc
diff --git a/tests/test_qdrant_client_manager_pool.py b/tests/test_qdrant_client_manager_pool.py
index 85408783..7cc4c9b2 100644
--- a/tests/test_qdrant_client_manager_pool.py
+++ b/tests/test_qdrant_client_manager_pool.py
@@ -18,7 +18,7 @@
 
 
 class _DummyQdrantClient:
-    def __init__(self, url=None, api_key=None):
+    def __init__(self, url=None, api_key=None, **kwargs):
         self.url = url
         self.api_key = api_key
         self.closed = False