diff --git a/.gitignore b/.gitignore index 2ddfb00d..9cc39e63 100644 --- a/.gitignore +++ b/.gitignore @@ -64,3 +64,4 @@ deploy/eks-cdk/ ctx_config.json /deploy/eks-cdk /deploy/eks-cdk-PATHFUL +.env diff --git a/.indexignore b/.indexignore index 6ab86f92..b4da8f56 100644 --- a/.indexignore +++ b/.indexignore @@ -2,3 +2,18 @@ cosqa*.json # dev-workspace contains uploaded client workspaces - they get indexed # separately via upload service, not as part of the main Context-Engine repo dev-workspace/ + +# CDK/deploy build artifacts - duplicates of source files +deploy/eks-cdk-*/cdk.out/ +**/cdk.out/ + +# Build/dist artifacts +dist/ +build/ +*.egg-info/ + +# IDE/editor artifacts +.idea/ +.vscode/ +*.swp +*.swo diff --git a/ctx-mcp-bridge/package.json b/ctx-mcp-bridge/package.json index 6c4e93cd..df69fa17 100644 --- a/ctx-mcp-bridge/package.json +++ b/ctx-mcp-bridge/package.json @@ -1,6 +1,6 @@ { "name": "@context-engine-bridge/context-engine-mcp-bridge", - "version": "0.0.16", + "version": "0.0.17", "description": "Context Engine MCP bridge (http/stdio proxy combining indexer + memory servers)", "bin": { "ctxce": "bin/ctxce.js", diff --git a/ctx-mcp-bridge/src/mcpServer.js b/ctx-mcp-bridge/src/mcpServer.js index 2fe1b101..d2426d3b 100644 --- a/ctx-mcp-bridge/src/mcpServer.js +++ b/ctx-mcp-bridge/src/mcpServer.js @@ -126,7 +126,8 @@ function selectClientForTool(name, indexerClient, memoryClient) { return indexerClient; } const lowered = name.toLowerCase(); - if (memoryClient && (lowered.startsWith("memory.") || lowered.startsWith("mcp_memory_"))) { + // Route to memory server for any memory-prefixed tool + if (memoryClient && lowered.startsWith("memory")) { return memoryClient; } return indexerClient; diff --git a/deploy/helm/context-engine/templates/mcp-memory-http.yaml b/deploy/helm/context-engine/templates/mcp-memory-http.yaml index d05b10b8..c05f3827 100644 --- a/deploy/helm/context-engine/templates/mcp-memory-http.yaml +++ 
b/deploy/helm/context-engine/templates/mcp-memory-http.yaml @@ -107,6 +107,8 @@ spec: - name: work-volume mountPath: /work readOnly: true + - name: codebase-volume + mountPath: /work/.codebase - name: metadata-volume mountPath: /tmp/rerank_weights subPath: rerank_weights @@ -117,6 +119,9 @@ spec: - name: work-volume persistentVolumeClaim: claimName: {{ .Values.persistence.codeRepos.name }} + - name: codebase-volume + persistentVolumeClaim: + claimName: {{ .Values.persistence.codeMetadata.name }} - name: metadata-volume persistentVolumeClaim: claimName: {{ .Values.persistence.codeMetadata.name }} diff --git a/deploy/helm/context-engine/values-example.yaml b/deploy/helm/context-engine/values-example.yaml index 94dd017e..4b669359 100644 --- a/deploy/helm/context-engine/values-example.yaml +++ b/deploy/helm/context-engine/values-example.yaml @@ -52,15 +52,15 @@ mcpIndexerHttp: replicas: 1 resources: requests: - cpu: 250m + cpu: 500m memory: 8Gi limits: - cpu: "1" + cpu: "2" memory: 16Gi autoscaling: enabled: true minReplicas: 1 - maxReplicas: 4 + maxReplicas: 2 # MCP Memory HTTP mcpMemoryHttp: @@ -68,45 +68,52 @@ mcpMemoryHttp: replicas: 1 resources: requests: - cpu: 250m - memory: 512Mi + cpu: 500m + memory: 1Gi limits: - cpu: "1" - memory: 2Gi + cpu: "1500m" + memory: 3Gi autoscaling: enabled: true minReplicas: 1 - maxReplicas: 3 + maxReplicas: 1 # Upload Service uploadService: enabled: true replicas: 1 + resources: + requests: + cpu: 250m + memory: 1Gi + limits: + cpu: "1500m" + memory: 3Gi autoscaling: enabled: true minReplicas: 1 - maxReplicas: 3 + maxReplicas: 2 # Watcher watcher: enabled: true - replicas: 1 + replicas: 2 resources: requests: cpu: 500m - memory: 2Gi + memory: 3Gi limits: - cpu: "2" - memory: 8Gi + cpu: "2500m" + memory: 10Gi -# Learning Reranker Worker +# Learning Reranker Worker (singleton - only 1 can run due to leader election) learningRerankerWorker: enabled: true replicas: 1 autoscaling: enabled: true minReplicas: 1 - maxReplicas: 3 + 
maxReplicas: 1 # Persistence - shared PVCs persistence: diff --git a/deploy/helm/context-engine/values.yaml b/deploy/helm/context-engine/values.yaml index 757c02bb..bdfc11be 100644 --- a/deploy/helm/context-engine/values.yaml +++ b/deploy/helm/context-engine/values.yaml @@ -140,10 +140,10 @@ mcpIndexerHttp: # -- Resource requests and limits resources: requests: - cpu: 250m + cpu: 500m memory: 8Gi limits: - cpu: "1" + cpu: "2" memory: 16Gi # -- Liveness probe livenessProbe: @@ -167,7 +167,7 @@ mcpIndexerHttp: autoscaling: enabled: true minReplicas: 1 - maxReplicas: 4 + maxReplicas: 2 targetCPUUtilizationPercentage: 70 targetMemoryUtilizationPercentage: 80 # -- Topology spread constraints @@ -207,11 +207,11 @@ mcpMemoryHttp: # -- Resource requests and limits resources: requests: - cpu: 250m - memory: 512Mi + cpu: 500m + memory: 1Gi limits: - cpu: "1" - memory: 2Gi + cpu: "1500m" + memory: 3Gi # -- Liveness probe livenessProbe: httpGet: @@ -230,7 +230,7 @@ mcpMemoryHttp: autoscaling: enabled: true minReplicas: 1 - maxReplicas: 3 + maxReplicas: 1 targetCPUUtilizationPercentage: 70 targetMemoryUtilizationPercentage: 80 # -- Topology spread constraints @@ -273,10 +273,10 @@ uploadService: resources: requests: cpu: 250m - memory: 512Mi + memory: 1Gi limits: - cpu: "1" - memory: 2Gi + cpu: "1500m" + memory: 3Gi # -- Environment variables env: UPLOAD_SERVICE_HOST: "0.0.0.0" @@ -288,7 +288,7 @@ uploadService: autoscaling: enabled: true minReplicas: 1 - maxReplicas: 3 + maxReplicas: 2 targetCPUUtilizationPercentage: 70 targetMemoryUtilizationPercentage: 80 # -- Topology spread constraints @@ -305,7 +305,7 @@ watcher: # -- Enable Watcher enabled: true # -- Number of replicas - replicas: 1 + replicas: 2 # -- Command to run command: - python @@ -316,10 +316,10 @@ watcher: resources: requests: cpu: 500m - memory: 2Gi + memory: 3Gi limits: - cpu: "2" - memory: 8Gi + cpu: "2500m" + memory: 10Gi # -- Environment variables (in addition to configmap) env: WATCH_ROOT: /work @@ -343,7 
+343,7 @@ watcher: learningRerankerWorker: # -- Enable Learning Reranker Worker enabled: true - # -- Number of replicas + # -- Number of replicas (singleton worker with leader election - only 1 can run) replicas: 1 # -- Command to run command: @@ -358,11 +358,11 @@ learningRerankerWorker: limits: cpu: "1" memory: 2Gi - # -- HPA configuration + # -- HPA configuration (capped at 1 - singleton worker) autoscaling: enabled: true minReplicas: 1 - maxReplicas: 3 + maxReplicas: 1 targetCPUUtilizationPercentage: 70 targetMemoryUtilizationPercentage: 80 # -- Topology spread constraints diff --git a/deploy/kubernetes/mcp-http.yaml b/deploy/kubernetes/mcp-http.yaml index c3c71fe2..c8ace6ed 100644 --- a/deploy/kubernetes/mcp-http.yaml +++ b/deploy/kubernetes/mcp-http.yaml @@ -30,10 +30,10 @@ spec: command: - sh - -c - - mkdir -p /mnt/rerank_weights /mnt/rerank_events && chmod 777 /mnt/rerank_weights /mnt/rerank_events + - mkdir -p /work/.codebase/rerank_weights /work/.codebase/rerank_events && chmod 777 /work/.codebase/rerank_weights /work/.codebase/rerank_events volumeMounts: - - name: metadata-volume - mountPath: /mnt + - name: codebase-volume + mountPath: /work/.codebase containers: - name: mcp-memory-http image: context-engine-memory @@ -108,10 +108,12 @@ spec: - name: work-volume mountPath: /work readOnly: true - - name: metadata-volume + - name: codebase-volume + mountPath: /work/.codebase + - name: codebase-volume mountPath: /tmp/rerank_weights subPath: rerank_weights - - name: metadata-volume + - name: codebase-volume mountPath: /tmp/rerank_events subPath: rerank_events livenessProbe: @@ -133,7 +135,7 @@ spec: - name: work-volume persistentVolumeClaim: claimName: code-repos-pvc - - name: metadata-volume + - name: codebase-volume persistentVolumeClaim: claimName: code-metadata-pvc --- diff --git a/deploy/kubernetes/mcp-memory.yaml b/deploy/kubernetes/mcp-memory.yaml index 165076db..5f34ff1f 100644 --- a/deploy/kubernetes/mcp-memory.yaml +++ 
b/deploy/kubernetes/mcp-memory.yaml @@ -26,10 +26,10 @@ spec: command: - sh - -c - - mkdir -p /mnt/rerank_weights /mnt/rerank_events && chmod 777 /mnt/rerank_weights /mnt/rerank_events + - mkdir -p /work/.codebase/rerank_weights /work/.codebase/rerank_events && chmod 777 /work/.codebase/rerank_weights /work/.codebase/rerank_events volumeMounts: - - name: metadata-volume - mountPath: /mnt + - name: codebase-volume + mountPath: /work/.codebase containers: - name: mcp-memory image: context-engine-memory @@ -85,6 +85,8 @@ spec: - name: work-volume mountPath: /work readOnly: true + - name: codebase-volume + mountPath: /work/.codebase - name: metadata-volume mountPath: /tmp/rerank_weights subPath: rerank_weights @@ -110,6 +112,9 @@ spec: - name: work-volume persistentVolumeClaim: claimName: code-repos-pvc + - name: codebase-volume + persistentVolumeClaim: + claimName: code-metadata-pvc - name: metadata-volume persistentVolumeClaim: claimName: code-metadata-pvc diff --git a/scripts/hybrid/qdrant.py b/scripts/hybrid/qdrant.py index 039ca6f1..498366e9 100644 --- a/scripts/hybrid/qdrant.py +++ b/scripts/hybrid/qdrant.py @@ -17,19 +17,73 @@ import logging import threading import re -from typing import List, Dict, Any, Tuple +import time +from typing import List, Dict, Any, Tuple, Optional, Callable, TypeVar from pathlib import Path from concurrent.futures import ThreadPoolExecutor -# Core Qdrant imports +# Core Qdrant imports (optional in some runtimes) try: from qdrant_client import QdrantClient, models -except ImportError: +except ImportError: # pragma: no cover QdrantClient = None # type: ignore models = None # type: ignore +try: + from qdrant_client.http.exceptions import ResponseHandlingException +except ImportError: # pragma: no cover + ResponseHandlingException = None # type: ignore + +try: # pragma: no cover - optional dependency + import httpx +except ImportError: + httpx = None # type: ignore + +try: # pragma: no cover - optional dependency + import httpcore +except 
ImportError: + httpcore = None # type: ignore + logger = logging.getLogger("hybrid_qdrant") + +def _is_timeout_exception(exc: Exception) -> bool: + """Detect whether an exception is a Qdrant/http timeout.""" + + if ResponseHandlingException and isinstance(exc, ResponseHandlingException): + cause = exc.__cause__ or exc.__context__ + if cause is not None and cause is not exc: + return _is_timeout_exception(cause) + return "timeout" in str(exc).lower() + + timeout_types = [] + if httpx is not None: + timeout_types.append(getattr(httpx, "TimeoutException", None)) + timeout_types.append(getattr(httpx, "ReadTimeout", None)) + if httpcore is not None: + timeout_types.append(getattr(httpcore, "TimeoutException", None)) + timeout_types.append(getattr(httpcore, "ReadTimeout", None)) + + for t in timeout_types: + if t and isinstance(exc, t): + return True + + return isinstance(exc, TimeoutError) + + +def _log_qdrant_timeout(kind: str, collection: Optional[str], detail: Exception) -> None: + coll = collection or "(unknown)" + logger.warning( + "Qdrant %s query timed out for collection %s; returning partial results", kind, coll + ) + + +def _handle_timeout(kind: str, collection: Optional[str], exc: Exception) -> bool: + if _is_timeout_exception(exc): + _log_qdrant_timeout(kind, collection, exc) + return True + return False + # --------------------------------------------------------------------------- # Helper functions for safe type conversion # --------------------------------------------------------------------------- @@ -75,6 +129,10 @@ def _safe_float(val: Any, default: float) -> float: ) EF_SEARCH = _safe_int(os.environ.get("QDRANT_EF_SEARCH", "128"), 128) +_MAX_QDRANT_CONCURRENCY = max(1, _safe_int(os.environ.get("QDRANT_MAX_CONCURRENCY", "6"), 6)) +_SEMAPHORE_LOG_THRESHOLD = float(os.environ.get("QDRANT_SEMAPHORE_LOG_THRESHOLD", "0.5") or 0.5) +_QDRANT_REQUEST_SEMAPHORE = threading.BoundedSemaphore(_MAX_QDRANT_CONCURRENCY) +T = TypeVar("T") # Quantization search params 
(for faster search with quantized collections) QDRANT_QUANTIZATION = os.environ.get("QDRANT_QUANTIZATION", "none").strip().lower() @@ -95,6 +153,24 @@ def _get_search_params(ef: int) -> models.SearchParams: return models.SearchParams(hnsw_ef=ef) +def _with_qdrant_slot(kind: str, fn: Callable[[], T]) -> T: + """Serialize Qdrant calls to avoid overload while preserving concurrency.""" + wait_start = time.perf_counter() + _QDRANT_REQUEST_SEMAPHORE.acquire() + waited = time.perf_counter() - wait_start + if waited >= _SEMAPHORE_LOG_THRESHOLD: + logger.debug( + "Qdrant %s query waited %.3fs for slot (max=%s)", + kind, + waited, + _MAX_QDRANT_CONCURRENCY, + ) + try: + return fn() + finally: + _QDRANT_REQUEST_SEMAPHORE.release() + + # --------------------------------------------------------------------------- # Connection pooling setup # --------------------------------------------------------------------------- @@ -191,7 +267,9 @@ def _legacy_vector_search( query_filter=flt, ) return _coerce_points(getattr(result, "points", result)) - except Exception: + except Exception as exc: + if _handle_timeout("legacy", collection, exc): + return [] return [] @@ -469,57 +547,71 @@ def lex_query( return [] try: - qp = client.query_points( - collection_name=collection, - query=v, - using=LEX_VECTOR_NAME, - query_filter=flt, - search_params=_get_search_params(ef), - limit=per_query, - with_payload=True, + qp = _with_qdrant_slot( + "lex", + lambda: client.query_points( + collection_name=collection, + query=v, + using=LEX_VECTOR_NAME, + query_filter=flt, + search_params=_get_search_params(ef), + limit=per_query, + with_payload=True, + ), ) return _coerce_points(getattr(qp, "points", qp)) except TypeError: if os.environ.get("DEBUG_HYBRID_SEARCH"): logger.debug("QP_FILTER_KWARG_SWITCH", extra={"using": LEX_VECTOR_NAME}) - qp = client.query_points( - collection_name=collection, - query=v, - using=LEX_VECTOR_NAME, - filter=flt, - search_params=_get_search_params(ef), - limit=per_query, - 
with_payload=True, + qp = _with_qdrant_slot( + "lex", + lambda: client.query_points( + collection_name=collection, + query=v, + using=LEX_VECTOR_NAME, + filter=flt, + search_params=_get_search_params(ef), + limit=per_query, + with_payload=True, + ), ) return _coerce_points(getattr(qp, "points", qp)) except AttributeError: return _legacy_vector_search(client, collection, LEX_VECTOR_NAME, v, per_query, flt) except Exception as e: + if _handle_timeout("lex", collection, e): + return [] if os.environ.get("DEBUG_HYBRID_SEARCH"): try: logger.debug("QP_FILTER_DROP", extra={"using": LEX_VECTOR_NAME, "reason": str(e)[:200]}) except Exception as e: logger.debug(f"Suppressed exception: {e}") try: - qp = client.query_points( - collection_name=collection, - query=v, - using=LEX_VECTOR_NAME, - query_filter=None, - search_params=_get_search_params(ef), - limit=per_query, - with_payload=True, + qp = _with_qdrant_slot( + "lex", + lambda: client.query_points( + collection_name=collection, + query=v, + using=LEX_VECTOR_NAME, + query_filter=None, + search_params=_get_search_params(ef), + limit=per_query, + with_payload=True, + ), ) return _coerce_points(getattr(qp, "points", qp)) except TypeError: - qp = client.query_points( - collection_name=collection, - query=v, - using=LEX_VECTOR_NAME, - filter=None, - search_params=_get_search_params(ef), - limit=per_query, - with_payload=True, + qp = _with_qdrant_slot( + "lex", + lambda: client.query_points( + collection_name=collection, + query=v, + using=LEX_VECTOR_NAME, + filter=None, + search_params=_get_search_params(ef), + limit=per_query, + with_payload=True, + ), ) return _coerce_points(getattr(qp, "points", qp)) except Exception as e2: @@ -553,35 +645,43 @@ def sparse_lex_query( return [] try: - qp = client.query_points( - collection_name=collection, - query=models.SparseVector( - indices=sparse_vec["indices"], - values=sparse_vec["values"], - ), - using=LEX_SPARSE_NAME, - query_filter=flt, - limit=per_query, - with_payload=True, - ) - 
return _coerce_points(getattr(qp, "points", qp)) - except TypeError: - try: - qp = client.query_points( + qp = _with_qdrant_slot( + "sparse", + lambda: client.query_points( collection_name=collection, query=models.SparseVector( indices=sparse_vec["indices"], values=sparse_vec["values"], ), using=LEX_SPARSE_NAME, - filter=flt, + query_filter=flt, limit=per_query, with_payload=True, + ), + ) + return _coerce_points(getattr(qp, "points", qp)) + except TypeError: + try: + qp = _with_qdrant_slot( + "sparse", + lambda: client.query_points( + collection_name=collection, + query=models.SparseVector( + indices=sparse_vec["indices"], + values=sparse_vec["values"], + ), + using=LEX_SPARSE_NAME, + filter=flt, + limit=per_query, + with_payload=True, + ), ) return _coerce_points(getattr(qp, "points", qp)) except Exception: return [] except Exception as e: + if _handle_timeout("sparse", collection, e): + return [] if os.environ.get("DEBUG_HYBRID_SEARCH"): logger.debug("SPARSE_LEX_QUERY_ERROR", extra={"error": str(e)[:200]}) return [] @@ -624,30 +724,38 @@ def dense_query( return [] try: - qp = client.query_points( - collection_name=collection, - query=v, - using=vec_name, - query_filter=flt, - search_params=_get_search_params(ef), - limit=per_query, - with_payload=True, + qp = _with_qdrant_slot( + "dense", + lambda: client.query_points( + collection_name=collection, + query=v, + using=vec_name, + query_filter=flt, + search_params=_get_search_params(ef), + limit=per_query, + with_payload=True, + ), ) return _coerce_points(getattr(qp, "points", qp)) except TypeError: if os.environ.get("DEBUG_HYBRID_SEARCH"): logger.debug("QP_FILTER_KWARG_SWITCH", extra={"using": vec_name}) - qp = client.query_points( - collection_name=collection, - query=v, - using=vec_name, - filter=flt, - search_params=_get_search_params(ef), - limit=per_query, - with_payload=True, + qp = _with_qdrant_slot( + "dense", + lambda: client.query_points( + collection_name=collection, + query=v, + using=vec_name, + 
filter=flt, + search_params=_get_search_params(ef), + limit=per_query, + with_payload=True, + ), ) return _coerce_points(getattr(qp, "points", qp)) except Exception as e: + if _handle_timeout("dense", collection, e): + return [] if os.environ.get("DEBUG_HYBRID_SEARCH"): try: logger.debug("QP_FILTER_DROP", extra={"using": vec_name, "reason": str(e)[:200]}) @@ -656,29 +764,37 @@ def dense_query( if not collection: return _legacy_vector_search(client, _collection(), vec_name, v, per_query, flt) try: - qp = client.query_points( - collection_name=collection, - query=v, - using=vec_name, - query_filter=None, - search_params=_get_search_params(ef), - limit=per_query, - with_payload=True, - ) - return _coerce_points(getattr(qp, "points", qp)) - except TypeError: - try: - qp = client.query_points( + qp = _with_qdrant_slot( + "dense", + lambda: client.query_points( collection_name=collection, query=v, using=vec_name, - filter=None, + query_filter=None, search_params=_get_search_params(ef), limit=per_query, with_payload=True, + ), + ) + return _coerce_points(getattr(qp, "points", qp)) + except TypeError: + try: + qp = _with_qdrant_slot( + "dense", + lambda: client.query_points( + collection_name=collection, + query=v, + using=vec_name, + filter=None, + search_params=_get_search_params(ef), + limit=per_query, + with_payload=True, + ), ) return _coerce_points(getattr(qp, "points", qp)) except Exception as e2: + if _handle_timeout("dense", collection, e2): + return [] if os.environ.get("DEBUG_HYBRID_SEARCH"): try: logger.debug("QP_FILTER_DROP_FAILED", extra={"using": vec_name, "reason": str(e2)[:200]}) diff --git a/scripts/mcp_auth.py b/scripts/mcp_auth.py index 2b13c791..ad224705 100644 --- a/scripts/mcp_auth.py +++ b/scripts/mcp_auth.py @@ -1,3 +1,4 @@ +import contextvars import os from typing import Any, Dict, Optional @@ -8,6 +9,9 @@ class ValidationError(Exception): pass +# Context variable for Authorization header token (set by HTTP middleware) +AUTH_HEADER_TOKEN: 
contextvars.ContextVar[str] = contextvars.ContextVar("auth_header_token", default="") + try: from scripts.auth_backend import ( @@ -47,17 +51,51 @@ def _has_collection_access( in {"1", "true", "yes", "on"} ) +# Direct token auth: allow admin/shared tokens to bypass session lookup +_AUTH_ADMIN_TOKEN = (os.environ.get("CTXCE_AUTH_ADMIN_TOKEN") or "").strip() +_AUTH_SHARED_TOKEN = (os.environ.get("CTXCE_AUTH_SHARED_TOKEN") or "").strip() + +# Auto-fallback: when enabled, use shared token if no session/header provided +_AUTH_AUTO_SHARED = ( + str(os.environ.get("CTXCE_AUTH_AUTO_SHARED", "0")).strip().lower() + in {"1", "true", "yes", "on"} +) + def require_auth_session(session: Optional[str]) -> Optional[Dict[str, Any]]: if not AUTH_ENABLED_AUTH: return None sid = (session or "").strip() + if not sid: + sid = AUTH_HEADER_TOKEN.get() + + if sid and sid.lower().startswith("bearer "): + sid = sid[7:].strip() + + if _AUTH_ADMIN_TOKEN and sid == _AUTH_ADMIN_TOKEN: + return {"user_id": "admin", "role": "admin", "token_type": "admin"} + if _AUTH_SHARED_TOKEN and sid == _AUTH_SHARED_TOKEN: + return {"user_id": "shared", "role": "user", "token_type": "shared"} + + if not sid: + if _AUTH_AUTO_SHARED and _AUTH_SHARED_TOKEN: + return {"user_id": "shared", "role": "user", "token_type": "shared_auto"} raise ValidationError("Missing session for authorized operation") - info = _auth_validate_session(sid) - if not info: - raise ValidationError("Invalid or expired session") - return info + + # Try local session validation first + try: + info = _auth_validate_session(sid) + if info: + return info + except Exception: + pass + + # Session not found locally - fall back to shared token if auto-shared is enabled + if _AUTH_AUTO_SHARED and _AUTH_SHARED_TOKEN: + return {"user_id": "shared", "role": "user", "token_type": "shared_auto"} + + raise ValidationError("Invalid or expired session") def require_collection_access(user_id: Optional[str], collection: str, perm: str) -> None: diff --git 
a/scripts/mcp_impl/context_answer.py b/scripts/mcp_impl/context_answer.py index 272bb869..a7f434c5 100644 --- a/scripts/mcp_impl/context_answer.py +++ b/scripts/mcp_impl/context_answer.py @@ -57,6 +57,90 @@ logger = logging.getLogger(__name__) +# --------------------------------------------------------------------------- +# Auto-memory storage for successful answers +# --------------------------------------------------------------------------- + +# Minimum answer length to auto-store (default 200 chars) +_AUTO_MEMORY_MIN_CHARS = int(os.environ.get("CONTEXT_ANSWER_AUTO_MEMORY_MIN_CHARS", "200") or 200) +# Enable/disable auto-memory storage (default ON) +_AUTO_MEMORY_ENABLED = os.environ.get("CONTEXT_ANSWER_AUTO_MEMORY", "1").strip().lower() in {"1", "true", "yes", "on"} + + +def _maybe_store_answer_as_memory( + answer: str, + queries: List[str], + citations: List[Dict[str, Any]], + collection: Optional[str] = None, +) -> None: + """Fire-and-forget storage of successful context_answer responses as memories. + + Criteria for storage: + - Answer is not "insufficient context" + - Answer has at least one citation + - Answer length >= _AUTO_MEMORY_MIN_CHARS (default 200) + - _AUTO_MEMORY_ENABLED is True (default) + + Runs in a background thread to not block the response. 
+ """ + if not _AUTO_MEMORY_ENABLED: + return + + # Check criteria + ans_clean = (answer or "").strip() + if not ans_clean: + return + if ans_clean.lower() == "insufficient context": + return + if not citations: + return + if len(ans_clean) < _AUTO_MEMORY_MIN_CHARS: + return + + # Build memory content with query context + query_str = " | ".join(queries) if queries else "unknown query" + + # Build citation summary (paths only) + cite_paths = [] + for cit in citations[:5]: # Limit to first 5 citations + p = cit.get("path") or cit.get("rel_path") or "" + if p: + cite_paths.append(p) + cite_summary = ", ".join(cite_paths) if cite_paths else "no paths" + + # Format the memory content + memory_content = f"Q: {query_str}\n\nA: {ans_clean}\n\nSources: {cite_summary}" + + # Build metadata + metadata = { + "kind": "context_answer", + "source": "auto_memory", + "queries": queries, + "citation_count": len(citations), + "answer_length": len(ans_clean), + } + + # Fire-and-forget in background thread + import threading + + def _store(): + try: + # Import here to avoid circular imports + from scripts.mcp_memory_server import memory_store + memory_store( + information=memory_content, + metadata=metadata, + collection=collection, + ) + logger.debug("Auto-stored context_answer as memory (len=%d, cites=%d)", len(ans_clean), len(citations)) + except Exception as e: + # Silently fail - this is best-effort + logger.debug("Auto-memory storage failed: %s", e) + + t = threading.Thread(target=_store, daemon=True) + t.start() + + # Keys to strip from citations for slim MCP output (agents only need path + rel_path) _VERBOSE_PATH_KEYS = ("host_path", "container_path", "client_path") @@ -666,6 +750,14 @@ def _ca_prepare_filters_and_retrieve( "node_modules/", ".git/", ".git", + # CDK/deploy build artifacts (duplicates of source files) + "cdk.out/", + "**/cdk.out/**", + "deploy/eks-cdk-*/cdk.out/", + # Build artifacts + "dist/", + "build/", + "*.egg-info/", ] def _variants(p: str) -> list[str]: @@ 
-2947,7 +3039,7 @@ def safe_float(val, default=0.0, **kw): items=items, collection=coll, repo=repo, - max_neighbors=2, + max_neighbors=5, ) except Exception as e: logger.debug(f"Subgraph context injection failed: {e}") @@ -3373,4 +3465,13 @@ def _tok2(s: str) -> list[str]: } if answers_by_query: out["answers_by_query"] = answers_by_query + + # Auto-store successful answers as memories (fire-and-forget) + _maybe_store_answer_as_memory( + answer=answer.strip(), + queries=original_queries, + citations=citations, + collection=collection, + ) + return out diff --git a/scripts/mcp_impl/search.py b/scripts/mcp_impl/search.py index bf49664c..a2565c32 100644 --- a/scripts/mcp_impl/search.py +++ b/scripts/mcp_impl/search.py @@ -62,6 +62,7 @@ logger=logger, context="MCP_SNIPPET_MAX_BYTES", ) +SEARCH_COMPACT_DEFAULT = os.environ.get("SEARCH_COMPACT_DEFAULT", "0").lower() in {"1", "true", "yes", "on"} async def _repo_search_impl( @@ -323,6 +324,9 @@ def _to_str(x, default=""): rerank_timeout_ms = _to_int( rerank_timeout_ms, int(os.environ.get("RERANKER_TIMEOUT_MS", "3000") or 3000) ) + # Clamp rerank timeout to prevent unreasonably low deadlines + _MIN_RERANK_TIMEOUT_MS = int(os.environ.get("RERANK_TIMEOUT_MIN_MS", "10000") or 10000) + rerank_timeout_ms = max(rerank_timeout_ms, _MIN_RERANK_TIMEOUT_MS) highlight_snippet = _to_bool(highlight_snippet, True) # Resolve collection and related hints: explicit > per-connection defaults > token defaults > env @@ -454,7 +458,7 @@ def _to_str_list(x): repo_filter = [detected_repo] compact_raw = compact - compact = _to_bool(compact, False) + compact = _to_bool(compact, SEARCH_COMPACT_DEFAULT) # If snippets are requested, do not compact (we need snippet field in results) if include_snippet: compact = False @@ -795,6 +799,7 @@ def _match_glob(glob_pat: str, path_val: str) -> bool: # Optional rerank fallback path: if enabled, attempt; on timeout or error, keep hybrid used_rerank = False + learning_results = None # May hold learning reranker 
output for fallback rerank_counters = { "inproc_hybrid": 0, "inproc_dense": 0, @@ -870,6 +875,8 @@ def _match_glob(glob_pat: str, path_val: str) -> bool: tmp.append(item) if tmp: + # Store learning results separately; may be used as fallback + learning_results = tmp results = tmp used_rerank = True rerank_counters["learning"] += 1 @@ -1191,6 +1198,12 @@ def _doc_for(obj: dict) -> str: rerank_counters["error"] += 1 used_rerank = False + # Fallback to learning reranker results if subprocess failed but learning succeeded + if (not used_rerank) and learning_results: + results = learning_results + used_rerank = True + logger.debug("Falling back to learning reranker results after subprocess failure") + if not used_rerank: # Build results from hybrid JSON lines for obj in json_lines: diff --git a/scripts/mcp_indexer_server.py b/scripts/mcp_indexer_server.py index 6d0ff832..934f2274 100644 --- a/scripts/mcp_indexer_server.py +++ b/scripts/mcp_indexer_server.py @@ -136,6 +136,7 @@ def _json_dumps_bytes(obj) -> bytes: from scripts.mcp_auth import ( require_auth_session as _require_auth_session, require_collection_access as _require_collection_access, + AUTH_HEADER_TOKEN as _AUTH_HEADER_TOKEN, ) # --------------------------------------------------------------------------- @@ -295,6 +296,268 @@ def _highlight_snippet(snippet, tokens): # type: ignore _work_script, ) +TOOLS_METADATA: dict[str, dict] = { + "repo_search": { + "name": "repo_search", + "category": "search", + "primary_use": "Hybrid semantic + lexical code search", + "choose_when": [ + "Finding code related to a concept", + "Starting a search without knowing which tool", + "Need flexible filtering by language/path/symbol", + ], + "choose_instead": { + "symbol_graph": "Need precise caller/definition relationships", + "context_answer": "Need an explanation, not raw results", + "search_tests_for": "Specifically want test files", + }, + "parameters": { + "essential": ["query"], + "common": ["limit", "language", "under", 
"include_snippet"], + "advanced": ["rerank_enabled", "output_format", "compact", "mode"], + }, + "returns": { + "ok": "bool", + "results": "list[{score, path, symbol, start_line, end_line, snippet?}]", + "total": "int", + }, + "related_tools": ["code_search", "context_search", "info_request"], + "performance": { + "typical_latency_ms": (100, 2000), + "requires_index": True, + "requires_decoder": False, + }, + }, + "context_answer": { + "name": "context_answer", + "category": "answer", + "primary_use": "LLM-generated answers with code citations", + "choose_when": [ + "Need an explanation of how code works", + "Asking 'how does X work?' questions", + "Want synthesized answer with sources", + ], + "choose_instead": { + "repo_search": "Want raw code results, not explanation", + "symbol_graph": "Need precise relationships", + }, + "parameters": { + "essential": ["query"], + "common": ["limit", "language", "under", "include_snippet"], + "advanced": ["max_tokens", "temperature", "expand", "budget_tokens"], + }, + "returns": { + "ok": "bool", + "answer": "str", + "citations": "list[{id, path, start_line, end_line}]", + }, + "related_tools": ["repo_search", "context_search"], + "performance": { + "typical_latency_ms": (1000, 10000), + "requires_index": True, + "requires_decoder": True, + }, + }, + "symbol_graph": { + "name": "symbol_graph", + "category": "graph", + "primary_use": "AST-backed symbol relationship queries", + "choose_when": [ + "Need 'who calls function X'", + "Need 'where is X defined'", + "Need 'what imports module Y'", + "Doing refactoring impact analysis", + ], + "choose_instead": { + "repo_search": "Want conceptual search, not precise relationships", + "search_callers_for": "Quick text search is sufficient", + }, + "parameters": { + "essential": ["symbol", "query_type"], + "common": ["limit", "language", "under", "repo"], + "advanced": ["depth", "output_format"], + }, + "returns": { + "ok": "bool", + "results": "list[{path, start_line, end_line, symbol, 
snippet}]", + "count": "int", + }, + "related_tools": ["search_callers_for", "search_importers_for"], + "performance": { + "typical_latency_ms": (50, 500), + "requires_index": True, + "requires_decoder": False, + }, + }, + "context_search": { + "name": "context_search", + "category": "search", + "primary_use": "Blend code search with memory retrieval", + "choose_when": [ + "Want code AND stored memories together", + "Searching for documented decisions", + "Need context from team knowledge", + ], + "choose_instead": { + "repo_search": "Only want code, no memories", + "memory_find": "Only want memories, no code", + }, + "parameters": { + "essential": ["query"], + "common": ["include_memories", "memory_weight", "limit"], + "advanced": ["per_source_limits", "rerank_enabled"], + }, + "returns": { + "ok": "bool", + "results": "list[{source, score, path|content, ...}]", + "total": "int", + }, + "related_tools": ["repo_search", "memory_find"], + "performance": { + "typical_latency_ms": (200, 3000), + "requires_index": True, + "requires_decoder": False, + }, + }, + "info_request": { + "name": "info_request", + "category": "search", + "primary_use": "Simplified code discovery with explanations", + "choose_when": [ + "Want simple single-parameter search", + "Need human-readable result descriptions", + "Building minimal integrations", + ], + "choose_instead": { + "repo_search": "Need full control over parameters", + "context_answer": "Need LLM-generated explanation", + }, + "parameters": { + "essential": ["info_request"], + "common": ["limit", "language", "include_explanation"], + "advanced": ["include_relationships", "output_format"], + }, + "returns": { + "ok": "bool", + "results": "list[{information, relevance_score, path, ...}]", + "summary?": "str", + "related_concepts?": "list[str]", + }, + "related_tools": ["repo_search", "context_answer"], + "performance": { + "typical_latency_ms": (100, 2000), + "requires_index": True, + "requires_decoder": False, + }, + }, + 
"pattern_search": { + "name": "pattern_search", + "category": "search", + "primary_use": "Structural code pattern matching", + "choose_when": [ + "Have code example, find similar", + "Cross-language pattern search", + "Find structural duplicates", + ], + "choose_instead": { + "repo_search": "Searching by concept, not structure", + "symbol_graph": "Looking for relationships", + }, + "parameters": { + "essential": ["query"], + "common": ["language", "limit", "target_languages"], + "advanced": ["query_mode", "aroma_rerank", "min_score"], + }, + "returns": { + "ok": "bool", + "results": "list[{path, start_line, end_line, score, language}]", + "query_mode": "str", + }, + "related_tools": ["repo_search"], + "performance": { + "typical_latency_ms": (200, 3000), + "requires_index": True, + "requires_decoder": False, + }, + }, + "search_tests_for": { + "name": "search_tests_for", + "category": "specialized", + "primary_use": "Find test files for a feature/function", + "choose_when": ["Specifically want test files", "Looking for test coverage"], + "choose_instead": {"repo_search": "Want all code, not just tests"}, + "parameters": { + "essential": ["query"], + "common": ["limit", "language", "under"], + "advanced": ["include_snippet", "compact"], + }, + "returns": {"ok": "bool", "results": "list[...]", "total": "int"}, + "related_tools": ["repo_search"], + "performance": { + "typical_latency_ms": (100, 1500), + "requires_index": True, + "requires_decoder": False, + }, + }, + "search_config_for": { + "name": "search_config_for", + "category": "specialized", + "primary_use": "Find configuration files", + "choose_when": ["Looking for config files", "Finding settings/options"], + "choose_instead": {"repo_search": "Want all code, not just config"}, + "parameters": { + "essential": ["query"], + "common": ["limit", "under"], + "advanced": ["include_snippet", "compact"], + }, + "returns": {"ok": "bool", "results": "list[...]", "total": "int"}, + "related_tools": ["repo_search"], + 
"performance": { + "typical_latency_ms": (100, 1500), + "requires_index": True, + "requires_decoder": False, + }, + }, + "search_callers_for": { + "name": "search_callers_for", + "category": "specialized", + "primary_use": "Text-based search for symbol callers", + "choose_when": ["Quick caller search is sufficient", "No graph index available"], + "choose_instead": {"symbol_graph": "Need precise AST-backed callers"}, + "parameters": { + "essential": ["query"], + "common": ["limit", "language"], + "advanced": [], + }, + "returns": {"ok": "bool", "results": "list[...]", "total": "int"}, + "related_tools": ["symbol_graph"], + "performance": { + "typical_latency_ms": (100, 1500), + "requires_index": True, + "requires_decoder": False, + }, + }, + "search_importers_for": { + "name": "search_importers_for", + "category": "specialized", + "primary_use": "Text-based search for module importers", + "choose_when": ["Quick import search is sufficient", "No graph index available"], + "choose_instead": {"symbol_graph": "Need precise AST-backed importers"}, + "parameters": { + "essential": ["query"], + "common": ["limit", "language"], + "advanced": [], + }, + "returns": {"ok": "bool", "results": "list[...]", "total": "int"}, + "related_tools": ["symbol_graph"], + "performance": { + "typical_latency_ms": (100, 1500), + "requires_index": True, + "requires_decoder": False, + }, + }, +} + # Disable DNS rebinding protection - breaks Docker internal networking (Host: mcp:8000) _security_settings = ( TransportSecuritySettings(enable_dns_rebinding_protection=False) @@ -303,6 +566,53 @@ def _highlight_snippet(snippet, tokens): # type: ignore ) mcp = FastMCP(APP_NAME, transport_security=_security_settings) +class _AuthHeaderASGIMiddleware: + """Pure ASGI middleware that extracts Authorization header into context var.""" + def __init__(self, app): + self.app = app + + async def __call__(self, scope, receive, send): + if scope["type"] == "http": + headers = dict(scope.get("headers", [])) + 
auth_header = headers.get(b"authorization", b"").decode("utf-8", errors="ignore") + if auth_header.lower().startswith("bearer "): + token = auth_header[7:].strip() + else: + token = auth_header.strip() if auth_header else "" + _AUTH_HEADER_TOKEN.set(token) + return await self.app(scope, receive, send) + + +def _add_auth_middleware(): + """Wrap FastMCP's ASGI app with auth header extraction middleware. + + FastMCP calls streamable_http_app() or sse_app() to create the Starlette app. + We patch these methods to wrap the returned app with our middleware. + """ + logger.info("Setting up auth header middleware...") + try: + # Patch streamable_http_app + if hasattr(mcp, "streamable_http_app"): + _orig_streamable = mcp.streamable_http_app + def _patched_streamable(*args, **kwargs): + app = _orig_streamable(*args, **kwargs) + logger.info(f"Wrapping streamable_http_app with auth middleware") + return _AuthHeaderASGIMiddleware(app) + mcp.streamable_http_app = _patched_streamable + + # Patch sse_app for SSE transport + if hasattr(mcp, "sse_app"): + _orig_sse = mcp.sse_app + def _patched_sse(*args, **kwargs): + app = _orig_sse(*args, **kwargs) + logger.info(f"Wrapping sse_app with auth middleware") + return _AuthHeaderASGIMiddleware(app) + mcp.sse_app = _patched_sse + + logger.info("Patched FastMCP app factory methods for auth middleware injection") + except Exception as e: + logger.warning(f"Failed to patch FastMCP for auth middleware: {e}") + # Capture tool registry automatically by wrapping the decorator once _TOOLS_REGISTRY: list[dict] = [] @@ -417,7 +727,6 @@ def do_GET(self): self.send_response(200) self.send_header("Content-Type", "application/json") self.end_headers() - # Hide expand_query when decoder is disabled tools = _TOOLS_REGISTRY try: from scripts.refrag_llamacpp import is_decoder_enabled # type: ignore @@ -432,7 +741,12 @@ def do_GET(self): ] except Exception as e: logger.debug(f"Suppressed exception: {e}") - payload = {"ok": True, "tools": tools} + enriched = 
[] + for t in tools: + name = t.get("name", "") + meta = TOOLS_METADATA.get(name, {}) + enriched.append({**t, **meta}) + payload = {"ok": True, "tools": enriched, "metadata": TOOLS_METADATA} self.wfile.write(_json_dumps_bytes(payload)) else: self.send_response(404) @@ -979,6 +1293,12 @@ async def set_session_defaults( mode: Any = None, under: Any = None, language: Any = None, + repo: Any = None, + compact: Any = None, + output_format: Any = None, + include_snippet: Any = None, + rerank_enabled: Any = None, + limit: Any = None, session: Any = None, ctx: Context = None, **kwargs, @@ -989,6 +1309,19 @@ async def set_session_defaults( - If request Context is available, persist defaults per-connection so later calls on the same MCP session automatically use them (no token required). - Optionally also stores token-scoped defaults for cross-connection reuse. + + Parameters: + - collection: Default collection name + - mode: Search mode hint + - under: Default path prefix filter + - language: Default language filter + - repo: Default repo filter for multi-repo setups + - compact: Default compact response mode (bool) + - output_format: Default output format ("json" or "toon") + - include_snippet: Default snippet inclusion (bool) + - rerank_enabled: Default reranking toggle (bool) + - limit: Default result limit (int) + - session: Session token for cross-connection reuse """ try: _extra = _extract_kwargs_payload(kwargs) @@ -1003,6 +1336,18 @@ async def set_session_defaults( language = _extra.get("language") if (session is None or (isinstance(session, str) and str(session).strip() == "")) and _extra.get("session") is not None: session = _extra.get("session") + if repo is None and _extra.get("repo") is not None: + repo = _extra.get("repo") + if compact is None and _extra.get("compact") is not None: + compact = _extra.get("compact") + if output_format is None and _extra.get("output_format") is not None: + output_format = _extra.get("output_format") + if include_snippet is None 
and _extra.get("include_snippet") is not None: + include_snippet = _extra.get("include_snippet") + if rerank_enabled is None and _extra.get("rerank_enabled") is not None: + rerank_enabled = _extra.get("rerank_enabled") + if limit is None and _extra.get("limit") is not None: + limit = _extra.get("limit") except Exception as e: logger.debug(f"Suppressed exception: {e}") @@ -1015,6 +1360,23 @@ async def set_session_defaults( defaults[_key] = _s else: unset_keys.add(_key) + if isinstance(repo, str) and repo.strip(): + defaults["repo"] = repo.strip() + elif isinstance(repo, list): + defaults["repo"] = repo + if isinstance(output_format, str) and output_format.strip(): + defaults["output_format"] = output_format.strip() + if compact is not None: + defaults["compact"] = bool(compact) if not isinstance(compact, bool) else compact + if include_snippet is not None: + defaults["include_snippet"] = bool(include_snippet) if not isinstance(include_snippet, bool) else include_snippet + if rerank_enabled is not None: + defaults["rerank_enabled"] = bool(rerank_enabled) if not isinstance(rerank_enabled, bool) else rerank_enabled + if limit is not None: + try: + defaults["limit"] = int(limit) + except (ValueError, TypeError): + pass # Per-connection storage (preferred) try: @@ -1119,24 +1481,91 @@ async def repo_search( args: Any = None, kwargs: Any = None, ) -> Dict[str, Any]: - """Zero-config code search over repositories (hybrid: vector + lexical RRF, rerank ON by default). - - When to use: - - Find relevant code spans quickly; prefer this over embedding-only search. - - Use context_answer when you need a synthesized explanation; use context_search to blend with memory notes. - - Key parameters: - - query: str or list[str]. Multiple queries are fused; accepts "queries" alias. - - limit: int (default 10). Total results across files. - - per_path: int (default 2). Max results per file. - - include_snippet/context_lines: return inline snippets near hits when true. 
- - rerank_*: ONNX reranker is ON by default for best relevance; timeouts fall back to hybrid. - - output_format: "json" (default) or "toon" for token-efficient TOON format. - - collection: str. Target collection; defaults to workspace state or env COLLECTION_NAME. - - repo: str or list[str]. Filter by repo name(s). Use "*" to search all repos. + """Primary hybrid semantic + lexical code search across the repository. + + PRIMARY USE: Find code spans matching a natural language concept or topic. + + CHOOSE THIS WHEN: + - You need to find code related to a concept (e.g., "authentication", "caching") + - You want to locate implementations, not just definitions + - You need flexible filtering by language, path, or symbol + - You want the best balance of recall and precision + - You're starting a search and aren't sure which specific tool to use + + CHOOSE INSTEAD: + - symbol_graph -> when you need "who calls X" or "where is X defined" (AST-backed) + - context_answer -> when you need an EXPLANATION, not raw code results + - context_search -> when you want to blend code results with stored memories + - search_tests_for -> when specifically looking for test files + - search_config_for -> when specifically looking for config files + - pattern_search -> when searching by code structure/pattern across languages + + QUERY EXAMPLES: + Good queries (natural language, conceptual): + "authentication middleware" - finds auth-related code + "error handling with retry" - finds retry logic + "database connection pooling" - finds connection management + "user session management" - finds session-related code + "API rate limiting" - finds rate limit implementations + "caching layer implementation" - finds cache logic + "websocket message handling" - finds WS handlers + "file upload processing" - finds upload logic + + Bad queries (will return poor results): + "auth OR login OR session" - boolean operators NOT supported + "def.*authenticate" - regex NOT supported in query + "*.py with 
class User" - glob syntax NOT for query field + "function" - too vague, be more specific + "get" - too generic + "the code that handles the thing" - unclear intent + + ESSENTIAL PARAMETERS: + - query (str | list[str]): Natural language description of what you're looking for. + Multiple queries are fused for broader recall. + + COMMON PARAMETERS: + - limit (int, default=10): Maximum results to return. + - per_path (int, default=2): Max results per file. Increase for thorough search. + - include_snippet (bool, default=True): Include code snippets in results. + - language (str): Filter by language ("python", "typescript", "go", etc.) + - under (str): Restrict to directory path ("scripts/", "src/api/") + - symbol (str): Filter by symbol name (function, class, method) + - path_glob (str | list[str]): File pattern filter ("**/*.py", "src/**") + - repo (str | list[str]): Filter by repo name(s). Use "*" for all repos. + + ADVANCED PARAMETERS: + - rerank_enabled (bool, default=True): ONNX cross-encoder reranking for relevance. + - rerank_top_n (int, default=20): Candidates to rerank. Increase for benchmarks. + - output_format (str): "json" (default) or "toon" for token-efficient format. + - compact (bool, default=False): Strip verbose fields for minimal response. + - mode (str): "code_first", "docs_first", "balanced", or "dense" (pure embedding). + - not_glob (str | list[str]): Exclude paths matching pattern. + - not_ (str): Exclude results containing this text. + - case (str): "sensitive" for case-sensitive matching. + + RETURNS: + { + "ok": true, + "results": [ + { + "score": 0.85, // Relevance score (0-1+) + "path": "src/auth.py", // File path + "symbol": "authenticate", // Symbol name if available + "start_line": 42, // Start line number + "end_line": 67, // End line number + "snippet": "def auth..." 
// Code snippet (if include_snippet=true) + } + ], + "total": 5, // Total results returned + "used_rerank": true, // Whether reranking was applied + "rerank_counters": {...} // Reranking statistics + } - Returns: - - Dict with keys: results, total, used_rerank, rerank_counters + PERFORMANCE TIPS: + - Use language filter to reduce search space and improve relevance + - Use under filter when you know the general code area + - Set include_snippet=false if you only need file locations + - Set compact=true to reduce response size for large result sets """ return await _repo_search_impl( query=query, @@ -1305,14 +1734,77 @@ async def search_tests_for( ) -> Dict[str, Any]: """Find test files related to a query. - What it does: - - Presets common test file globs and forwards to repo_search - - Accepts extra filters via kwargs (e.g., language, under, case) + PRIMARY USE: Quickly find tests for a feature, function, or module. + Convenience wrapper that presets common test file patterns. + + CHOOSE THIS WHEN: + - You specifically want TEST files, not implementation code + - You're looking for tests related to a feature + - You want to find test coverage for a function/class + - You're exploring how something is tested + + CHOOSE INSTEAD: + - repo_search -> when you want ALL code, not just tests + - symbol_graph -> when you need "what tests call function X" + + QUERY EXAMPLES: + Good queries (feature/function focused): + "user authentication" - finds tests for auth features + "database connection" - finds DB connection tests + "API rate limiting" - finds rate limit tests + "email sending" - finds email-related tests + "input validation" - finds validation tests + "UserService" - finds tests for UserService class + + Bad queries: + "all tests" - too broad + "test_*.py" - glob pattern, use path_glob param + "pass" - assertion keyword, not meaningful + "def test_" - code fragment, use repo_search + + ESSENTIAL PARAMETERS: + - query (str | list[str]): Natural language description 
of what you want tests for. + + COMMON PARAMETERS: + - limit (int, default=10): Maximum results to return. + - include_snippet (bool, default=True): Include test code snippets. + - context_lines (int): Lines of context around matches. + - under (str): Restrict to directory path (e.g., "tests/unit/"). + - language (str): Filter by language. + - compact (bool): Minimal response fields. + + PRESET GLOBS (automatically applied): + - tests/** + - test/** + - **/*test*.* + - **/*_test.* + - **/Test*/** + + RETURNS: Same schema as repo_search. + { + "ok": true, + "results": [ + { + "score": 0.82, + "path": "tests/test_auth.py", + "symbol": "test_authenticate_valid_user", + "start_line": 45, + "end_line": 58, + "snippet": "def test_authenticate_valid_user():..." + } + ], + "total": 8 + } - Parameters: - - query: str or list[str]; limit; include_snippet/context_lines; under; language; compact + USAGE PATTERNS: + # Find tests for authentication + search_tests_for(query="authentication") - Returns: repo_search result shape. + # Find tests in a specific directory + search_tests_for(query="database", under="tests/integration/") + + # Find Python tests only + search_tests_for(query="caching", language="python") """ return await _search_tests_for_impl( query=query, @@ -1341,13 +1833,86 @@ async def search_config_for( kwargs: Any = None, ctx: Context = None, ) -> Dict[str, Any]: - """Find likely configuration files for a service/query. + """Find configuration files related to a query. + + PRIMARY USE: Quickly find config files for a service, feature, or setting. + Convenience wrapper that presets common config file patterns. 
+ + CHOOSE THIS WHEN: + - You need to find configuration for a service/feature + - You're looking for environment variables, settings, or options + - You want to find where something is configured + - You're debugging configuration issues + + CHOOSE INSTEAD: + - repo_search -> when you want ALL code, not just config files + - search_tests_for -> when looking for test files + + QUERY EXAMPLES: + Good queries (service/setting focused): + "database connection" - finds DB config files + "authentication settings" - finds auth config + "logging configuration" - finds logging setup + "API keys" - finds key config (careful with secrets!) + "environment variables" - finds env config + "redis cache" - finds Redis config + "docker compose" - finds Docker config + + Bad queries: + "*.yaml" - glob pattern, handled by presets + "config" - too vague + "settings" - too generic + "json" - file format, not a query + + ESSENTIAL PARAMETERS: + - query (str | list[str]): Natural language description of what config you need. + + COMMON PARAMETERS: + - limit (int, default=10): Maximum results to return. + - include_snippet (bool, default=True): Include config content snippets. + - context_lines (int): Lines of context around matches. + - under (str): Restrict to directory path. + - compact (bool): Minimal response fields. + + PRESET GLOBS (automatically applied): + - **/*.yml, **/*.yaml + - **/*.json + - **/*.toml + - **/*.ini + - **/*.env + - **/*.config, **/*.conf + - **/*.properties + - **/*.csproj, **/*.props, **/*.targets + - **/*.xml + - **/appsettings*.json + + RETURNS: Same schema as repo_search. + { + "ok": true, + "results": [ + { + "score": 0.85, + "path": "config/database.yml", + "start_line": 12, + "end_line": 25, + "snippet": "database:\\n host: localhost\\n port: 5432..." + } + ], + "total": 5 + } - What it does: - - Presets config file globs (yaml/json/toml/etc.) 
and forwards to repo_search - - Accepts extra filters via kwargs + USAGE PATTERNS: + # Find database config + search_config_for(query="database connection") + + # Find Docker configuration + search_config_for(query="docker service ports") + + # Find in specific directory + search_config_for(query="api settings", under="config/") - Returns: repo_search result shape. + WARNING: Config files may contain sensitive data (API keys, passwords). + Be cautious about exposing results that might contain secrets. """ return await _search_config_for_impl( query=query, @@ -1372,14 +1937,57 @@ async def search_callers_for( kwargs: Any = None, ctx: Context = None, ) -> Dict[str, Any]: - """Heuristic search for callers/usages of a symbol. - - When to use: - - You want files that reference/invoke a function/class - - Notes: - - Thin wrapper over repo_search today; pass language or path_glob to narrow - - Returns repo_search result shape + """Heuristic text-based search for callers/usages of a symbol. + + PRIMARY USE: Find files that likely call or reference a function/class. + Uses text search, not AST analysis - faster but less precise than symbol_graph. 
+ + CHOOSE THIS WHEN: + - You want a quick, broad search for symbol references + - You're okay with some false positives in exchange for speed + - The codebase doesn't have graph index built yet + - You want to find textual mentions, not just actual calls + + CHOOSE INSTEAD: + - symbol_graph with query_type="callers" -> for PRECISE AST-backed caller analysis + - repo_search -> when you want full control over search parameters + + QUERY EXAMPLES: + Good queries (symbol names): + "authenticate" - finds references to authenticate + "UserService" - finds references to UserService + "validate_input" - finds references to validate_input + "CacheManager.get" - finds references to CacheManager.get + + Bad queries: + "who calls authenticate" - use symbol_graph for this phrasing + "find all usages of X" - use symbol_graph + "authentication" - concept, not symbol name + + ESSENTIAL PARAMETERS: + - query (str): Symbol name to find callers/references for. + + COMMON PARAMETERS: + - limit (int, default=10): Maximum results to return. + - language (str): Filter by language for more relevant results. + + RETURNS: Same schema as repo_search. + + COMPARISON WITH symbol_graph: + | Aspect | search_callers_for | symbol_graph | + |--------|-------------------|--------------| + | Method | Text search | AST analysis | + | Speed | Faster | Slower | + | Precision | Lower (false positives) | Higher (actual calls) | + | Requires | Nothing special | Graph index | + | Use for | Quick exploration | Precise refactoring | + + USAGE PATTERNS: + # Quick reference search + search_callers_for(query="authenticate", language="python") + + # For precise caller analysis, prefer: + symbol_graph(symbol="authenticate", query_type="callers") """ return await _search_callers_for_impl( query=query, @@ -1401,13 +2009,61 @@ async def search_importers_for( kwargs: Any = None, ctx: Context = None, ) -> Dict[str, Any]: - """Find files likely importing or referencing a module/symbol. 
- - What it does: - - Presets code globs across common languages; forwards to repo_search - - Accepts additional filters via kwargs (e.g., under, case) - - Returns: repo_search result shape. + """Heuristic text-based search for files importing a module/symbol. + + PRIMARY USE: Find files that likely import a module or symbol. + Uses text search, not AST analysis - faster but less precise than symbol_graph. + + CHOOSE THIS WHEN: + - You want a quick search for import statements + - You're looking for textual import/require/use mentions + - The codebase doesn't have graph index built yet + - You want approximate results quickly + + CHOOSE INSTEAD: + - symbol_graph with query_type="importers" -> for PRECISE AST-backed import analysis + - repo_search -> when you want full control over search parameters + + QUERY EXAMPLES: + Good queries (module/symbol names): + "auth_utils" - finds imports of auth_utils + "CacheManager" - finds imports of CacheManager + "qdrant_client" - finds imports of qdrant_client + "express" - finds require('express') + "pandas" - finds import pandas + + Bad queries: + "what imports X" - use symbol_graph for this phrasing + "import statements" - too vague + "from ... import" - syntax, not a module name + + ESSENTIAL PARAMETERS: + - query (str): Module or symbol name to find importers for. + + COMMON PARAMETERS: + - limit (int, default=10): Maximum results to return. + - language (str): Filter by language for more relevant results. + + PRESET GLOBS (automatically applied): + Code files across all common languages (*.py, *.js, *.ts, *.go, etc.) + + RETURNS: Same schema as repo_search. 
+ + COMPARISON WITH symbol_graph: + | Aspect | search_importers_for | symbol_graph | + |--------|---------------------|--------------| + | Method | Text search | AST analysis | + | Speed | Faster | Slower | + | Precision | Lower (false positives) | Higher (actual imports) | + | Requires | Nothing special | Graph index | + | Use for | Quick exploration | Precise dependency analysis | + + USAGE PATTERNS: + # Quick import search + search_importers_for(query="qdrant_client", language="python") + + # For precise import analysis, prefer: + symbol_graph(symbol="qdrant_client", query_type="importers") """ return await _search_importers_for_impl( query=query, @@ -1433,35 +2089,102 @@ async def symbol_graph( depth: Any = None, ctx: Context = None, ) -> Dict[str, Any]: - """Query the symbol graph to find callers, definitions, or importers. + """AST-backed symbol graph queries for precise code relationships. + + PRIMARY USE: Find WHO CALLS a function, WHERE something is DEFINED, + or WHAT IMPORTS a module using the pre-built symbol graph. + + CHOOSE THIS WHEN: + - You need "who calls this function?" (callers) + - You need "where is this defined?" (definition) + - You need "what imports this module?" (importers) + - You need "what does this function call?" 
(callees) + - You want PRECISE relationships, not text-based fuzzy matches + - You're doing refactoring impact analysis + + CHOOSE INSTEAD: + - repo_search -> when you want CONCEPTUAL search, not precise relationships + - search_callers_for -> convenience wrapper, uses text search (less precise) + - search_importers_for -> convenience wrapper, uses text search (less precise) + + QUERY EXAMPLES: + + For "callers" query_type (who calls X?): + symbol="authenticate" - finds all callers of authenticate() + symbol="UserService.get_user" - finds callers of get_user method + symbol="validate_input" - finds where validate_input is called + + For "definition" query_type (where is X defined?): + symbol="CacheManager" - finds CacheManager class definition + symbol="run_hybrid_search" - finds function definition + symbol="USER_TIMEOUT" - finds constant definition + + For "importers" query_type (what imports X?): + symbol="auth_utils" - finds files importing auth_utils module + symbol="CacheManager" - finds files importing CacheManager + symbol="qdrant_client" - finds files importing qdrant_client + + For "callees" query_type (what does X call?): + symbol="authenticate" - finds functions called BY authenticate + symbol="process_request" - finds all functions process_request calls + + ESSENTIAL PARAMETERS: + - symbol (str): Symbol name to analyze. Can be: + - Simple name: "authenticate" + - Qualified path: "UserService.get_user" + - Module name: "auth_utils" + + - query_type (str, default="callers"): Type of relationship query: + - "callers": Find code that CALLS this symbol + - "definition": Find WHERE this symbol is DEFINED + - "importers": Find code that IMPORTS this symbol/module + - "callees": Find what this symbol CALLS (inverse of callers) + + COMMON PARAMETERS: + - limit (int, default=20): Maximum results to return. + - depth (int, default=1): Traversal depth for multi-hop queries. 
+ - depth=1: Direct relationships only + - depth=2: Callers of callers, callees of callees, etc. + - depth=3+: Use sparingly, can be expensive + - language (str): Filter by language. + - under (str): Filter by path prefix. + - repo (str): Filter by repository name. Use "*" for all repos. + - output_format (str): "json" or "toon" for token-efficient format. + + RETURNS: + { + "ok": true, + "results": [ + { + "path": "src/api/handlers.py", + "start_line": 142, + "end_line": 145, + "symbol": "handle_login", + "symbol_path": "handlers.handle_login", + "language": "python", + "snippet": " result = authenticate(username, password)", + "hop": 1, // For depth>1: which hop found this + "via": "authenticate" // For depth>1: intermediate symbol + } + ], + "symbol": "authenticate", + "query_type": "callers", + "count": 12, + "depth": 1, + "used_graph": true, // True if graph collection was used (fast) + "suggestions": [...] // Fuzzy matches if exact symbol not found + } - When to use: - - "Who calls X?" → query_type="callers" - - "Where is X defined?" → query_type="definition" - - "What imports Y?" → query_type="importers" - - "What does X call?" → query_type="callees" - - Key parameters: - - symbol: str. The function, class, or module name to search for. - - query_type: str. One of "callers", "definition", "importers". - - limit: int (default 20). Maximum results to return. - - language: str (optional). Filter by programming language. - - under: str (optional). Filter by path prefix. - - repo: str (optional). Filter by repository name. Use "*" to search all repos. - - output_format: "json" (default) or "toon" for token-efficient format. - - depth: int (default 1). Multi-hop traversal depth. 2 = callers of callers, etc. + MULTI-HOP EXAMPLE (depth=2): + # "Who calls the callers of authenticate?" 
+ symbol_graph(symbol="authenticate", query_type="callers", depth=2) + # Returns both direct callers (hop=1) and callers-of-callers (hop=2) - Returns: - - {"results": [...], "symbol": str, "query_type": str, "count": int, "depth": int} - - Each result includes path, start_line, end_line, symbol_path, and relevant context. - - Multi-hop results include "hop" (1, 2, ...) and "via" (intermediate symbol). - - Example: - - symbol_graph(symbol="get_embedding_model", query_type="callers") - - symbol_graph(symbol="ASTAnalyzer", query_type="definition") - - symbol_graph(symbol="qdrant_client", query_type="importers") - - symbol_graph(symbol="my_function", query_type="callers", repo="backend") - - symbol_graph(symbol="authenticate", query_type="callers", depth=2) + NOTES: + - Graph must be indexed (run qdrant_index_root first) + - For fuzzy matching, suggestions are returned if exact symbol not found + - Hydration adds code snippets and accurate line numbers automatically + - Use depth>1 carefully - exponential growth in results """ if not symbol or not str(symbol).strip(): return {"error": "symbol parameter is required", "results": []} @@ -1601,36 +2324,112 @@ async def context_answer( repo: Any = None, # str, list[str], or "*" to search all repos kwargs: Any = None, ) -> Dict[str, Any]: - """Natural-language Q&A over the repo using retrieval + local LLM (llama.cpp). - - What it does: - - Retrieves relevant code (hybrid vector+lexical with reranking enabled by default). - - Budgets/merges micro-spans, builds citations, and asks the LLM to answer. - - Returns a concise answer plus file/line citations. - - When to use: - - You need an explanation or "how to" grounded in code. - - Prefer repo_search for raw hits; prefer context_search to blend code + memory. - - Key parameters: - - query: str or list[str]; may be expanded if expand=true. - - budget_tokens: int. Token budget across code spans (defaults from MICRO_BUDGET_TOKENS). - - include_snippet: bool (default true). 
Include code snippets sent to the LLM and return them when requested. - - max_tokens, temperature: decoding controls. - - mode: "stitch" (default) or "pack" for prompt assembly. - - expand: bool. Use tiny local LLM to propose up to 2 alternate queries. - - Filters: language, under, kind, symbol, ext, path_regex, path_glob, not_glob, not_, case. - - repo: str or list[str]. Filter by repo name(s). Use "*" to search all repos (disable auto-filter). - By default, auto-detects current repo from CURRENT_REPO env and filters to it. + """Generate LLM-powered answers with citations grounded in retrieved code. + + PRIMARY USE: Get an EXPLANATION or ANSWER to a question, not raw search results. + Uses retrieval-augmented generation (RAG) with a local LLM decoder. + + CHOOSE THIS WHEN: + - You need an EXPLANATION ("How does X work?", "What is Y?") + - You want a synthesized answer with source citations + - You're asking a question that requires understanding, not just finding + - You want the system to READ code and EXPLAIN it to you + + CHOOSE INSTEAD: + - repo_search -> when you want RAW CODE RESULTS, not explanations + - symbol_graph -> when you need precise "who calls X" relationships + - context_search -> when you want code + memories without LLM synthesis + + QUERY EXAMPLES: + Good queries (questions requiring explanation): + "How does the authentication system validate tokens?" + "What is the purpose of the CacheManager class?" + "Explain the error handling strategy in the API layer" + "How are database connections pooled in this project?" + "What happens when a user session expires?" + "Describe the data flow for user registration" + "How does retry logic work in the HTTP client?" 
+ + Bad queries (not suited for LLM answers): + "find authentication code" - use repo_search for finding code + "list all Python files" - use repo_search with language filter + "UserController" - symbol name only, use symbol_graph + "src/auth.py" - file path, just read the file + "def authenticate" - code fragment, use repo_search + + ESSENTIAL PARAMETERS: + - query (str | list[str]): Question or topic requiring explanation. + Should be phrased as a question or request for explanation. + + RETRIEVAL PARAMETERS: + - limit (int, default=15): Code spans to retrieve for context. + - per_path (int, default=5): Max spans per file. + - budget_tokens (int): Token budget for code context. Default from env. + - include_snippet (bool, default=True): Include code in response. + - language (str): Filter retrieval by language. + - under (str): Restrict retrieval to directory path. + - repo (str | list[str]): Filter by repo. Use "*" for all repos. + + GENERATION PARAMETERS: + - max_tokens (int): Max tokens for generated answer. + - temperature (float): Sampling temperature (0.0-1.0). Lower = more focused. + - mode (str): Prompt assembly mode. "stitch" (default) or "pack". + - expand (bool): Use LLM to generate query expansions for better recall. + + COMMON FILTER PARAMETERS (same as repo_search): + - symbol (str): Filter by symbol name. + - path_glob (str | list[str]): Filter by file pattern. + - not_glob (str | list[str]): Exclude file patterns. + - ext (str): Filter by file extension. + + RETURNS: + { + "ok": true, + "answer": "The authentication system validates tokens by first checking + the JWT signature using the secret from config [1], then + verifying expiration time [2]. If valid, it extracts the + user ID and loads permissions from the database [3].", + "citations": [ + { + "id": 1, + "path": "src/auth/jwt.py", + "start_line": 45, + "end_line": 52, + "snippet": "def verify_token(token):..." 
// Optional + }, + { + "id": 2, + "path": "src/auth/jwt.py", + "start_line": 54, + "end_line": 58 + }, + { + "id": 3, + "path": "src/auth/permissions.py", + "start_line": 23, + "end_line": 31 + } + ], + "query": ["How does authentication validate tokens"], + "used": { + "spans": 5, + "tokens": 1842 + } + } - Returns: - - {"answer": str, "citations": [{"path": str, "start_line": int, "end_line": int}], "query": list[str], "used": {...}} - - On decoder disabled/error, returns {"error": "...", "citations": [...], "query": [...]} + // On insufficient context: + { + "answer": "insufficient context", + "citations": [], + "query": [...], + "hint": "Try broadening your query or checking if the feature exists" + } - Notes: - - Reranking is enabled by default for optimal retrieval quality. - - Honors env knobs such as REFRAG_MODE, REFRAG_GATE_FIRST, MICRO_BUDGET_TOKENS, DECODER_*. - - Keeps answers brief (2–4 sentences) and grounded; rejects ungrounded output. + NOTES: + - Answers include bracketed citations like [1], [2] referencing the citations array + - If context is insufficient, returns "insufficient context" as the answer + - Local LLM decoder must be available (llama.cpp or cloud fallback) + - Reranking is enabled by default for optimal retrieval quality """ return await _context_answer_impl( query=query, @@ -1685,14 +2484,84 @@ async def code_search( case: Any = None, session: Any = None, compact: Any = None, + # Memory blending (opt-in) + include_memories: Any = None, + memory_weight: Any = None, + per_source_limits: Any = None, kwargs: Any = None, ) -> Dict[str, Any]: - """Exact alias of repo_search (hybrid code search with reranking enabled by default). + """Alias of repo_search for discoverability. Use repo_search directly. + + PRIMARY USE: This is an EXACT ALIAS of repo_search. Exists for discoverability + in IDEs and agents that might search for "code_search" instead of "repo_search". 
+ + CHOOSE THIS WHEN: + - You would use repo_search (they are identical) + - Your tooling expects a "code_search" function name + + CHOOSE INSTEAD: + - repo_search -> same functionality, canonical name + - See repo_search docstring for full documentation + + QUERY EXAMPLES: + Good queries (natural language, conceptual): + "authentication middleware" - finds auth-related code + "error handling with retry" - finds retry logic + "database connection setup" - finds DB connection code + "user input validation" - finds validation logic + "async task processing" - finds async patterns + + Bad queries (will return poor results): + "auth AND login" - boolean operators NOT supported + "grep -r 'password'" - not a shell command + "class.*Controller" - regex NOT supported + "SELECT * FROM users" - SQL query, not code search + "https://github.com/..." - URL, not a search query + + ESSENTIAL PARAMETERS: + - query (str): Natural language description of code you're looking for. + + All parameters and return format are identical to repo_search. + See repo_search documentation for complete parameter reference. + + MEMORY BLENDING (opt-in, delegates to context_search): + - include_memories: bool. If true, blends memory results with code results. + - memory_weight: float (default 1.0). Scales memory scores relative to code. + - per_source_limits: dict, e.g. {"code": 5, "memory": 3} - Prefer repo_search; this name exists for discoverability in some IDEs/agents. - Same parameters and return shape as repo_search. - Reranking (rerank_enabled=true) is ON by default for optimal result quality. + RETURNS: Same schema as repo_search. 
""" + # If include_memories is requested, delegate to context_search for blending + if include_memories: + return await context_search( + query=query, + limit=limit, + per_path=per_path, + include_memories=include_memories, + memory_weight=memory_weight, + per_source_limits=per_source_limits, + include_snippet=include_snippet, + context_lines=context_lines, + rerank_enabled=rerank_enabled, + rerank_top_n=rerank_top_n, + rerank_return_m=rerank_return_m, + rerank_timeout_ms=rerank_timeout_ms, + highlight_snippet=highlight_snippet, + collection=collection, + language=language, + under=under, + kind=kind, + symbol=symbol, + path_regex=path_regex, + path_glob=path_glob, + not_glob=not_glob, + ext=ext, + not_=not_, + case=case, + session=session, + compact=compact, + kwargs=kwargs, + ) return await repo_search( query=query, limit=limit, @@ -1749,31 +2618,105 @@ async def info_request( output_format: Any = None, # "json" (default) or "toon" for token-efficient format kwargs: Any = None, ) -> Dict[str, Any]: - """Simplified codebase retrieval with optional explanation mode. + """Simplified codebase discovery with optional explanation mode. + + PRIMARY USE: Quick, single-parameter code search with human-readable results. + Designed as a drop-in replacement for basic "find code about X" queries. 
+ + CHOOSE THIS WHEN: + - You want a simple, one-parameter search interface + - You want results with human-readable "information" descriptions + - You want optional explanation mode for richer context + - You're building a simple integration and want minimal complexity + + CHOOSE INSTEAD: + - repo_search -> when you need full control over filtering and parameters + - context_answer -> when you need an LLM-generated ANSWER, not just results + - symbol_graph -> when you need precise call/definition relationships + + QUERY EXAMPLES: + Good queries (natural language descriptions): + "database connection pooling" - finds DB connection code + "authentication middleware" - finds auth-related code + "error handling patterns" - finds error handling logic + "user input validation" - finds validation code + "caching implementation" - finds cache logic + "logging configuration" - finds logging setup + "API endpoint handlers" - finds route handlers + + Bad queries (too vague or wrong format): + "code" - too vague + "the function" - unspecific + "*.py" - glob pattern, use path_glob param + "auth|login" - boolean syntax not supported + "line 42" - use file reading for specific lines + + ESSENTIAL PARAMETERS: + - info_request (str): Natural language description of code you're looking for. + - information_request (str): Alias for info_request. + + EXPLANATION MODE PARAMETERS: + - include_explanation (bool, default=False): When true, adds: + - summary: Brief overview of what was found + - primary_locations: Key file paths + - related_concepts: Technical concepts discovered + - query_understanding: How the query was interpreted + + - include_relationships (bool, default=False): When true, adds to each result: + - imports_from: Modules this code imports + - calls: Functions this code calls + - related_paths: Related files + + COMMON PARAMETERS: + - limit (int, default=10): Maximum results to return. + - language (str): Filter by language ("python", "typescript", etc.) 
+ - under (str): Restrict to directory path. + - repo (str | list[str]): Filter by repo. Use "*" for all repos. + - output_format (str): "json" or "toon" for token-efficient format. + - include_snippet (bool, default=True): Include code snippets. + + RETURNS (compact mode, default): + { + "ok": true, + "results": [ + { + "information": "Found function 'authenticate' in src/auth.py (lines 42-67)", + "relevance_score": 0.85, // Alias for score + "score": 0.85, + "path": "src/auth.py", + "symbol": "authenticate", + "start_line": 42, + "end_line": 67 + } + ], + "total": 5 + } - When to use: - - Simple, single-parameter code search with human-readable descriptions - - When you want optional explanation mode for richer context - - Drop-in replacement for basic codebase retrieval tools - - Key parameters: - - info_request: str. Natural language description of the code you're looking for. - - information_request: str. Alias for info_request. - - include_explanation: bool (default false). Add summary, primary_locations, related_concepts. - - include_relationships: bool (default false). Add imports_from, calls, related_paths to results. - - limit: int (default 10). Maximum results to return. - - language: str. Filter by programming language. - - under: str. Limit search to specific directory. - - repo: str or list[str]. Filter by repository name(s). - - output_format: "json" (default) or "toon" for token-efficient TOON format. 
+ RETURNS (explanation mode, include_explanation=True): + { + "ok": true, + "results": [...], + "summary": "Found 5 authentication-related functions across 3 files", + "primary_locations": ["src/auth.py", "src/middleware/auth.py"], + "related_concepts": ["jwt", "token", "session", "middleware"], + "query_understanding": "Looking for authentication implementation code", + "confidence": { + "level": "high", + "score": 0.82, + "symbol_matches": 3 + } + } - Returns: - - Compact mode (default): results with information field and relevance_score alias - - Explanation mode: adds summary, primary_locations, related_concepts, query_understanding + USAGE PATTERNS: + # Simple discovery: + info_request(info_request="database connection") - Example: - - {"info_request": "database connection pooling"} - - {"info_request": "authentication middleware", "include_explanation": true} + # With explanation: + info_request( + info_request="authentication flow", + include_explanation=True, + include_relationships=True + ) """ # Resolve query from either parameter query = info_request or information_request @@ -1964,29 +2907,90 @@ async def context_search( output_format: Any = None, kwargs: Any = None, ) -> Dict[str, Any]: - """Blend code search results with memory-store entries (notes, docs) for richer context. - - When to use: - - You want code spans plus relevant memories in one response. - - Prefer repo_search for code-only; use context_answer when you need an LLM-written answer. - - Key parameters: - - query: str or list[str] - - include_memories: bool (opt-in). If true, queries the memory collection and merges with code results. - - memory_weight: float (default 1.0). Scales memory scores relative to code. - - per_source_limits: dict, e.g. {"code": 5, "memory": 3} - - All repo_search filters are supported and passed through. - - output_format: "json" (default) or "toon" for token-efficient TOON format. - - rerank_enabled: bool (default true). 
ONNX reranker is ON by default for better relevance. - - repo: str or list[str]. Filter by repo name(s). Use "*" to search all repos (disable auto-filter). - By default, auto-detects current repo from CURRENT_REPO env and filters to it. + """Blend code search results with memory-store entries for richer context. + + PRIMARY USE: Search code AND retrieve relevant stored memories/notes in one call. + + CHOOSE THIS WHEN: + - You want code results PLUS relevant memories (notes, docs, decisions) + - You're searching for something where team knowledge might help + - You want to surface both implementation AND documentation/context + - You need to check if there are existing notes about a topic + + CHOOSE INSTEAD: + - repo_search -> when you ONLY want code results (faster, no memory overhead) + - context_answer -> when you need an LLM-generated EXPLANATION + - memory_find -> when you ONLY want memories (no code search) + + QUERY EXAMPLES: + Good queries (conceptual, topic-based): + "authentication design decisions" - finds code + stored auth decisions + "API versioning strategy" - finds API code + design notes + "database migration approach" - finds migration code + notes + "caching invalidation policy" - finds cache code + policy notes + "error handling conventions" - finds error code + team standards + + Bad queries (too narrow for memory blending): + "def authenticate(" - too specific, use repo_search + "class UserController" - exact match, use repo_search + "line 42 of auth.py" - specific location, just read the file + "git commit abc123" - not a search query + "npm install express" - command, not a query + + ESSENTIAL PARAMETERS: + - query (str | list[str]): Natural language description of what you're looking for. + + MEMORY BLENDING PARAMETERS: + - include_memories (bool, default=False): MUST SET TO TRUE to enable memory blending. + Without this, context_search behaves identically to repo_search. 
+ - memory_weight (float, default=1.0): Scale memory scores relative to code. + Values >1.0 boost memories, <1.0 favor code results. + - per_source_limits (dict): Control results per source. + Example: {"code": 6, "memory": 3} returns max 6 code + 3 memory results. + + COMMON PARAMETERS (same as repo_search): + - limit (int, default=10): Maximum total results. + - language (str): Filter code results by language. + - under (str): Restrict code search to directory path. + - include_snippet (bool, default=True): Include code snippets. + - rerank_enabled (bool, default=True): Cross-encoder reranking. + - output_format (str): "json" or "toon" for token-efficient format. + - repo (str | list[str]): Filter by repo. Use "*" for all repos. + + RETURNS: + { + "ok": true, + "results": [ + { + "source": "code", // "code" or "memory" + "score": 0.85, + "path": "src/auth.py", // For code results + "symbol": "authenticate", + "start_line": 42, + "end_line": 67, + "snippet": "def auth..." + }, + { + "source": "memory", // Memory results have different shape + "score": 0.78, + "content": "Auth uses JWT tokens with 24h expiry...", + "metadata": {"kind": "note", "created_at": "2024-..."} + } + ], + "total": 9, + "memory_note": "3 memories included" // Optional note about memory results + } - Returns: - - {"results": [{"source": "code"| "memory", ...}, ...], "total": N[, "memory_note": str]} - - In compact mode, results are reduced to lightweight records. 
+ USAGE PATTERN: + # To blend code + memories (recommended pattern): + context_search( + query="authentication architecture", + include_memories=True, + per_source_limits={"code": 5, "memory": 3} + ) - Example: - - include_memories=true, per_source_limits={"code": 6, "memory": 2}, path_glob="docs/**" + # To search code only (same as repo_search): + context_search(query="authentication", include_memories=False) """ return await _context_search_impl( query=query, @@ -2072,34 +3076,110 @@ async def pattern_search( ) -> Dict[str, Any]: """Find structurally similar code patterns across all languages. - Accepts EITHER code examples OR natural language descriptions - auto-detects which. - - When to use: - - Find code with similar control flow (retry loops, error handling, etc.) - - Cross-language pattern matching (Python pattern → Go/Rust/Java matches) - - Detect code duplication based on structure, not syntax - - Search by pattern description ("retry with backoff", "resource cleanup") + PRIMARY USE: Search by CODE STRUCTURE rather than text/semantics. + Finds code with similar control flow, API usage, or patterns. + + CHOOSE THIS WHEN: + - You have a CODE EXAMPLE and want to find similar patterns + - You want to find code STRUCTURALLY similar (not just textually) + - You're searching across languages (Python pattern -> find in Go/Rust/Java) + - You want to detect code duplication based on structure + - You're searching for patterns like "retry with backoff", "singleton" + + CHOOSE INSTEAD: + - repo_search -> when searching by CONCEPT, not structural pattern + - symbol_graph -> when looking for call/definition relationships + - context_answer -> when you need an EXPLANATION + + QUERY EXAMPLES: + + Code example mode (query_mode="code" or auto-detected): + "for i in range(3): try: ... except: time.sleep(2**i)" + "if err != nil { return err }" + "async function $NAME($$$) { await $EXPR; }" + "with open(file) as f: data = f.read()" + "try { ... 
} catch (e) { console.error(e); throw e; }" + + Description mode (query_mode="description" or auto-detected): + "retry with exponential backoff" + "resource cleanup pattern" + "singleton implementation" + "factory pattern" + "decorator wrapping function" + "error handling with logging" + "connection pooling" + "rate limiting implementation" + + Bad queries (wrong use case): + "authentication code" - use repo_search for concepts + "who calls authenticate" - use symbol_graph + "explain the auth flow" - use context_answer + "files in src/" - use glob/file tools + + ESSENTIAL PARAMETERS: + - query (str): EITHER a code example OR a natural language pattern description. + The mode is auto-detected, or you can force it with query_mode. + + MODE CONTROL PARAMETERS: + - query_mode (str, default="auto"): How to interpret the query. + - "auto": Auto-detect if query is code or description + - "code": Force interpretation as code example + - "description": Force interpretation as pattern description + - language (str): Language hint for code examples. Also triggers code mode + in auto-detection. Example: "python", "go", "rust", "typescript" + + COMMON PARAMETERS: + - limit (int, default=10): Maximum results to return. + - min_score (float, default=0.3): Minimum similarity score threshold. + - include_snippet (bool, default=True): Include code snippets in results. + - target_languages (list[str]): Filter results to specific languages. + Example: ["python", "go"] to find pattern only in Python and Go files. + - repo (str | list[str]): Filter by repo. Use "*" for all repos. + - output_format (str): "json" or "toon" for token-efficient format. + - compact (bool): Minimal response fields. + + AROMA RERANKING PARAMETERS: + - aroma_rerank (bool, default=True): Enable AROMA-style pruning/reranking. + Improves precision by penalizing partial matches. + - aroma_alpha (float, default=0.6): Weight for pruned similarity vs original. + Higher values trust pruning more. 
+ + RETURNS: + { + "ok": true, + "results": [ + { + "path": "src/client.py", + "start_line": 89, + "end_line": 102, + "score": 0.78, + "language": "python", + "snippet": "for attempt in range(max_retries):..." + } + ], + "total": 7, + "query_mode": "code", // or "description" + "query_signature": "...", // Internal: pattern signature used + "detection": { // Mode detection metadata + "confidence": 0.95, + "ast_validated": true, + "signals": {"ast_parsed": 1.0, "nl_similarity": 0.42} + } + } - Key parameters: - - query: str. Code snippet OR natural language description of pattern. - - query_mode: str. "code", "description", or "auto" (default). Explicit override for detection. - - language: str. Language hint for code examples (also triggers code mode in auto). - - limit: int (default 10). Maximum results to return. - - min_score: float (default 0.3). Minimum similarity score threshold. - - include_snippet: bool (default false). Include code snippets in results. - - target_languages: list[str]. Filter to specific target languages. - - output_format: "json" (default) or "toon" for token-efficient format. - - compact: bool. If true with TOON, use minimal fields. - - aroma_rerank: bool (default true). Enable AROMA-style pruning and reranking. - - aroma_alpha: float (default 0.6). Weight for pruned similarity vs original score. - - Returns: - - {ok, results: [{path, start_line, end_line, score, language, ...}], total, query_signature} + CROSS-LANGUAGE EXAMPLE: + # Find Python/Rust/Java code structurally similar to a Go error-handling pattern + pattern_search( + query="if err != nil { return err }", + language="go", + target_languages=["python", "rust", "java"] + ) - Examples: - - pattern_search(query="for i in range(3): try: ... 
except: time.sleep(2**i)") - - pattern_search(query="retry with exponential backoff", query_mode="description") - - pattern_search(query="if err != nil { return err }", language="go") + NOTES: + - Pattern vectors must be indexed (PATTERN_VECTORS=1 during indexing) + - Auto-detection uses AST parsing + NL embedder comparison + - Code mode uses structural pattern matching + - Description mode uses semantic search on pattern descriptions """ return await _pattern_search_impl( query=query, @@ -2280,6 +3360,11 @@ async def neo4j_graph_query( transport = os.environ.get("FASTMCP_TRANSPORT", "sse").strip().lower() # Enable stateless HTTP mode to avoid session handshake requirement stateless_http = str(os.environ.get("FASTMCP_STATELESS_HTTP", "1")).strip().lower() in {"1", "true", "yes", "on"} + + # Add auth header extraction middleware for HTTP transports + if transport != "stdio": + _add_auth_middleware() + if transport == "stdio": # Run over stdio (for clients that don't support network transports) mcp.run(transport="stdio") diff --git a/scripts/mcp_memory_server.py b/scripts/mcp_memory_server.py index b11607f4..61880669 100644 --- a/scripts/mcp_memory_server.py +++ b/scripts/mcp_memory_server.py @@ -43,6 +43,7 @@ from scripts.mcp_auth import ( require_auth_session as _require_auth_session, require_collection_access as _require_collection_access, + AUTH_HEADER_TOKEN as _AUTH_HEADER_TOKEN, ) from qdrant_client import QdrantClient, models @@ -68,6 +69,7 @@ LEX_VECTOR_NAME = os.environ.get("LEX_VECTOR_NAME", "lex") LEX_VECTOR_DIM = int(os.environ.get("LEX_VECTOR_DIM", "4096") or 4096) EMBEDDING_MODEL = os.environ.get("EMBEDDING_MODEL", "BAAI/bge-base-en-v1.5") +MEMORY_FIND_LIMIT_DEFAULT = int(os.environ.get("MEMORY_FIND_LIMIT_DEFAULT", "10") or 10) # Minimal embedding via fastembed (CPU) @@ -137,6 +139,58 @@ def _ensure_once(name: str) -> bool: return False # Disable DNS rebinding protection - breaks Docker internal networking (Host: mcp:8000) +TOOLS_METADATA: Dict[str, 
Dict] = { + "memory_store": { + "name": "memory_store", + "category": "memory", + "primary_use": "Store knowledge for later retrieval", + "choose_when": [ + "Storing team decisions/notes", + "Documenting conventions", + "Building institutional memory", + ], + "choose_instead": {}, + "parameters": { + "essential": ["information"], + "common": ["metadata"], + "advanced": ["collection", "session"], + }, + "returns": {"ok": "bool", "id": "str", "message": "str"}, + "related_tools": ["memory_find", "context_search"], + "performance": { + "typical_latency_ms": (50, 500), + "requires_index": False, + "requires_decoder": False, + }, + }, + "memory_find": { + "name": "memory_find", + "category": "memory", + "primary_use": "Retrieve stored memories by similarity", + "choose_when": [ + "Looking for stored notes/decisions", + "Recalling team knowledge", + ], + "choose_instead": {"context_search": "Want code + memories together"}, + "parameters": { + "essential": ["query"], + "common": ["limit", "kind", "topic", "tags"], + "advanced": ["priority_min", "collection"], + }, + "returns": { + "ok": "bool", + "results": "list[{id, information, metadata, score}]", + "total": "int", + }, + "related_tools": ["memory_store", "context_search"], + "performance": { + "typical_latency_ms": (50, 300), + "requires_index": False, + "requires_decoder": False, + }, + }, +} + _security_settings = ( TransportSecuritySettings(enable_dns_rebinding_protection=False) if TransportSecuritySettings @@ -144,7 +198,49 @@ def _ensure_once(name: str) -> bool: ) mcp = FastMCP(name="memory-server", transport_security=_security_settings) -# Capture tool registry automatically by wrapping the decorator once + +class _AuthHeaderASGIMiddleware: + """Pure ASGI middleware that extracts Authorization header into context var.""" + def __init__(self, app): + self.app = app + + async def __call__(self, scope, receive, send): + if scope["type"] == "http": + headers = dict(scope.get("headers", [])) + auth_header = 
headers.get(b"authorization", b"").decode("utf-8", errors="ignore") + if auth_header.lower().startswith("bearer "): + token = auth_header[7:].strip() + else: + token = auth_header.strip() if auth_header else "" + _AUTH_HEADER_TOKEN.set(token) + return await self.app(scope, receive, send) + + +def _add_auth_middleware(): + """Wrap FastMCP's ASGI app with auth header extraction middleware.""" + logger.info("Setting up auth header middleware...") + try: + if hasattr(mcp, "streamable_http_app"): + _orig_streamable = mcp.streamable_http_app + def _patched_streamable(*args, **kwargs): + app = _orig_streamable(*args, **kwargs) + logger.info(f"Wrapping streamable_http_app with auth middleware") + return _AuthHeaderASGIMiddleware(app) + mcp.streamable_http_app = _patched_streamable + + if hasattr(mcp, "sse_app"): + _orig_sse = mcp.sse_app + def _patched_sse(*args, **kwargs): + app = _orig_sse(*args, **kwargs) + logger.info(f"Wrapping sse_app with auth middleware") + return _AuthHeaderASGIMiddleware(app) + mcp.sse_app = _patched_sse + + logger.info("Patched FastMCP app factory methods for auth middleware injection") + except Exception as e: + logger.warning(f"Failed to patch FastMCP for auth middleware: {e}") + + _TOOLS_REGISTRY: list[dict] = [] try: _orig_tool = mcp.tool @@ -251,7 +347,12 @@ def do_GET(self): self.send_response(200) self.send_header("Content-Type", "application/json") self.end_headers() - payload = {"ok": True, "tools": _TOOLS_REGISTRY} + enriched = [] + for t in _TOOLS_REGISTRY: + name = t.get("name", "") + meta = TOOLS_METADATA.get(name, {}) + enriched.append({**t, **meta}) + payload = {"ok": True, "tools": enriched, "metadata": TOOLS_METADATA} self.wfile.write((json.dumps(payload)).encode("utf-8")) else: self.send_response(404) @@ -436,6 +537,12 @@ def set_session_defaults( mode: Optional[str] = None, language: Optional[str] = None, under: Optional[str] = None, + repo: Any = None, + compact: Any = None, + output_format: Optional[str] = None, + 
include_snippet: Any = None, + rerank_enabled: Any = None, + limit: Any = None, ctx: Context = None, kwargs: Any = None, ) -> Dict[str, Any]: @@ -447,6 +554,19 @@ def set_session_defaults( - Optionally, also supports a lightweight token for clients that prefer cross-connection reuse. Precedence everywhere: explicit collection > per-connection defaults > token defaults > env default. + + Parameters: + - collection: Default collection name + - mode: Search mode hint + - under: Default path prefix filter + - language: Default language filter + - repo: Default repo filter for multi-repo setups + - compact: Default compact response mode (bool) + - output_format: Default output format ("json" or "toon") + - include_snippet: Default snippet inclusion (bool) + - rerank_enabled: Default reranking toggle (bool) + - limit: Default result limit (int) + - session: Session token for cross-connection reuse """ # Handle kwargs payload from some clients try: @@ -467,6 +587,18 @@ def set_session_defaults( under = _extra["under"] if not session and _extra.get("session"): session = _extra["session"] + if repo is None and _extra.get("repo"): + repo = _extra["repo"] + if compact is None and _extra.get("compact") is not None: + compact = _extra["compact"] + if not output_format and _extra.get("output_format"): + output_format = _extra["output_format"] + if include_snippet is None and _extra.get("include_snippet") is not None: + include_snippet = _extra["include_snippet"] + if rerank_enabled is None and _extra.get("rerank_enabled") is not None: + rerank_enabled = _extra["rerank_enabled"] + if limit is None and _extra.get("limit") is not None: + limit = _extra["limit"] except Exception as e: logger.debug(f"Suppressed exception: {e}") @@ -480,6 +612,23 @@ def set_session_defaults( defaults["language"] = language.strip() if isinstance(under, str) and under.strip(): defaults["under"] = under.strip() + if isinstance(repo, str) and repo.strip(): + defaults["repo"] = repo.strip() + elif 
isinstance(repo, list): + defaults["repo"] = repo + if isinstance(output_format, str) and output_format.strip(): + defaults["output_format"] = output_format.strip() + if compact is not None: + defaults["compact"] = bool(compact) if not isinstance(compact, bool) else compact + if include_snippet is not None: + defaults["include_snippet"] = bool(include_snippet) if not isinstance(include_snippet, bool) else include_snippet + if rerank_enabled is not None: + defaults["rerank_enabled"] = bool(rerank_enabled) if not isinstance(rerank_enabled, bool) else rerank_enabled + if limit is not None: + try: + defaults["limit"] = int(limit) + except (ValueError, TypeError): + pass # Store per-connection (preferred, no token required) try: @@ -521,9 +670,107 @@ def memory_store( session: Optional[str] = None, ctx: Context = None, ) -> Dict[str, Any]: - """Store a memory entry into Qdrant (dual vectors consistent with indexer). + """Store knowledge/notes into the memory system for later retrieval. + + PRIMARY USE: Persist team knowledge, decisions, conventions, or notes + that should be retrievable alongside code search results. + + CHOOSE THIS WHEN: + - You want to store a decision or convention for future reference + - You're documenting why code works a certain way + - You want to persist knowledge that context_search can find + - You're building institutional memory for the codebase + + WHAT TO STORE: + Good candidates for memory storage: + - Architecture decisions: "We use JWT for auth because..." + - Conventions: "All API responses follow the envelope pattern..." + - Gotchas: "The cache has a 5-minute TTL, not configurable..." + - Debugging notes: "If X fails, check Y first..." + - Integration details: "External API requires header Z..." + - Performance notes: "This query is O(n^2), optimize for large N..." 
+ + Bad candidates (don't store these): + - Code itself (it's already indexed) + - Temporary debug output + - Personal notes not relevant to the codebase + - Sensitive data (passwords, keys, secrets) + + ESSENTIAL PARAMETERS: + - information (str): The knowledge/note to store. Should be clear, + self-contained text that will be useful when retrieved later. + + METADATA PARAMETERS: + - metadata (dict): Optional structured metadata for filtering. + Common keys: + - kind: "note", "decision", "convention", "gotcha", "policy" + - topic: Subject area ("auth", "caching", "api", "database") + - priority: Importance (1=low, 5=high) + - tags: List of tags for filtering + - author: Who wrote this note + + Auto-added if not provided: + - created_at: ISO timestamp + - kind: "memory" (default) + - source: "memory" (default) + + SESSION PARAMETERS: + - collection (str): Target collection. Defaults to workspace collection. + - session (str): Session token for multi-user scenarios. + + RETURNS: + { + "ok": true, + "id": "abc123...", // Unique ID for this memory + "message": "Successfully stored information", + "collection": "codebase", + "vector": "bge-base-en-v1-5" // Embedding model used + } - First call may be slower because the embedding model loads lazily. + USAGE PATTERNS: + + # Store an architecture decision + memory_store( + information="We chose FastAPI over Flask because we need async support + for the WebSocket handlers and automatic OpenAPI documentation.", + metadata={ + "kind": "decision", + "topic": "api", + "tags": ["framework", "architecture"] + } + ) + + # Store a debugging gotcha + memory_store( + information="If authentication fails silently, check that the JWT_SECRET + env var is set. The auth middleware swallows exceptions.", + metadata={ + "kind": "gotcha", + "topic": "auth", + "priority": 4 + } + ) + + # Store a convention + memory_store( + information="All database queries must use parameterized statements. 
+ Raw string interpolation is forbidden for security.", + metadata={ + "kind": "convention", + "topic": "database", + "tags": ["security", "sql"] + } + ) + + RETRIEVAL: + Stored memories can be retrieved via: + - memory_find(query="...") -> searches only memories + - context_search(query="...", include_memories=True) -> code + memories + + NOTES: + - First call may be slower due to embedding model loading + - Memories are embedded using the same model as code for consistent search + - Duplicate content is not deduplicated; avoid storing the same thing twice """ sess = _require_auth_session(session) coll = _resolve_collection(collection, session=session, ctx=ctx) @@ -587,10 +834,97 @@ def memory_find( priority_min: Optional[int] = None, ctx: Context = None, ) -> Dict[str, Any]: - """Find memory-like entries by vector similarity (dense + lexical fusion). + """Retrieve stored memories/notes by semantic similarity. + + PRIMARY USE: Find previously stored knowledge, decisions, or notes. + Searches ONLY the memory store, not code. 
+ + CHOOSE THIS WHEN: + - You want to find previously stored notes/decisions + - You're looking for team knowledge without code results + - You want to filter memories by metadata (kind, topic, tags) + - You need to recall specific documented information + + CHOOSE INSTEAD: + - context_search with include_memories=True -> when you want code + memories + - repo_search -> when you want code only, no memories + + QUERY EXAMPLES: + Good queries (conceptual, knowledge-seeking): + "authentication decisions" - finds auth-related notes + "why we chose this approach" - finds decision rationale + "database performance tips" - finds DB-related notes + "API design conventions" - finds API conventions + "deployment gotchas" - finds deployment notes + + Bad queries: + "def authenticate" - code fragment, use repo_search + "src/auth.py" - file path, not a memory query + "UserService" - class name, use repo_search + + ESSENTIAL PARAMETERS: + - query (str): Natural language description of what you're looking for. + + ALTERNATIVE QUERY PARAMETERS: + - q (str): Alias for query. + - top_k (int): Alias for limit. + + FILTER PARAMETERS: + - kind (str): Filter by memory kind. + Values: "note", "decision", "convention", "gotcha", "policy", "preference" + - topic (str): Filter by topic/subject area. + Example: "auth", "database", "api", "caching" + - tags (str | list[str]): Filter by tags. + Example: "security" or ["security", "sql"] + - language (str): Filter by programming language context. + - priority_min (int): Minimum priority (1-5). Higher = more important. + + COMMON PARAMETERS: + - limit (int, default=5): Maximum results to return. + - collection (str): Target collection. Defaults to workspace collection. + - session (str): Session token for multi-user scenarios. 
+ + RETURNS: + { + "ok": true, + "results": [ + { + "id": "abc123...", + "information": "We chose JWT for authentication because...", + "metadata": { + "kind": "decision", + "topic": "auth", + "created_at": "2024-01-15T10:30:00Z", + "tags": ["security", "architecture"] + }, + "score": 0.85, + "highlights": ["...chose <> for <>..."] + } + ], + "total": 3, + "count": 3, + "query": "authentication decisions" + } + + USAGE PATTERNS: + + # Find all authentication-related notes + memory_find(query="authentication", topic="auth") + + # Find high-priority gotchas + memory_find(query="common issues", kind="gotcha", priority_min=4) - Cold-start option: set MEMORY_COLD_SKIP_DENSE=1 to skip dense embedding until the - model is cached (useful on slow storage). + # Find security-related conventions + memory_find(query="security best practices", kind="convention", tags="security") + + # Find recent decisions + memory_find(query="recent architecture decisions", kind="decision", limit=10) + + NOTES: + - Cold start: First call may be slower if embedding model isn't cached + - Set MEMORY_COLD_SKIP_DENSE=1 to skip dense embedding on cold start + - Highlights show query term matches in context + - Results are ranked by hybrid similarity (dense + lexical fusion) """ # Handle 'q' alias for query if not query and q: @@ -612,7 +946,7 @@ def memory_find( lex = _lex_hash_vector_text(str(query), LEX_VECTOR_DIM) # Harmonize alias: top_k -> limit - lim = int(limit if limit is not None else (top_k if top_k is not None else 5)) + lim = int(limit if limit is not None else (top_k if top_k is not None else MEMORY_FIND_LIMIT_DEFAULT)) # Build Qdrant filter must = [] @@ -829,6 +1163,11 @@ def _resolve_collection( # Enable stateless HTTP mode to avoid session handshake requirement stateless_http = str(os.environ.get("FASTMCP_STATELESS_HTTP", "1")).strip().lower() in {"1", "true", "yes", "on"} + + # Add auth header extraction middleware for HTTP transports + if transport != "stdio": + 
_add_auth_middleware() + if transport == "stdio": # Run over stdio (for clients that don't support network transports) mcp.run(transport="stdio") diff --git a/scripts/qdrant_client_manager.py b/scripts/qdrant_client_manager.py index 78cc716b..3e901cef 100644 --- a/scripts/qdrant_client_manager.py +++ b/scripts/qdrant_client_manager.py @@ -9,7 +9,7 @@ import threading import time import weakref -from typing import Optional, Dict, List +from typing import Optional, Dict, List, Any from contextlib import contextmanager from qdrant_client import QdrantClient @@ -17,6 +17,33 @@ # Connection pool implementation logger = logging.getLogger(__name__) + + +def _get_qdrant_timeout() -> Optional[float]: + """Return the configured Qdrant HTTP timeout (seconds) if set.""" + raw = os.environ.get("QDRANT_TIMEOUT") or os.environ.get("QDRANT_CLIENT_TIMEOUT") + if not raw: + return None + try: + timeout = float(raw) + return timeout if timeout > 0 else None + except (TypeError, ValueError): + logger.debug("Invalid Qdrant timeout value '%s'; ignoring", raw) + return None + + +def _client_kwargs(url: str, api_key: Optional[str]) -> Dict[str, Any]: + """Build kwargs dict for QdrantClient constructor with optional timeout.""" + kwargs: Dict[str, Any] = { + "url": url, + "api_key": api_key if api_key else None, + } + timeout = _get_qdrant_timeout() + if timeout is not None: + kwargs["timeout"] = timeout + return kwargs + + class QdrantConnectionPool: """Thread-safe connection pool for QdrantClient instances.""" @@ -49,7 +76,7 @@ def get_client(self, url: str, api_key: Optional[str] = None) -> QdrantClient: # No suitable client found, create a new one if self._created_count < self.max_size: - client = QdrantClient(url=url, api_key=api_key) + client = QdrantClient(**_client_kwargs(url, api_key)) pool_entry = { 'client': client, 'url': url, @@ -66,7 +93,7 @@ def get_client(self, url: str, api_key: Optional[str] = None) -> QdrantClient: # Pool is full, create a temporary client (not pooled) # 
Mark it for tracking so return_client can close it self._misses += 1 - temp_client = QdrantClient(url=url, api_key=api_key) + temp_client = QdrantClient(**_client_kwargs(url, api_key)) # Track temporary clients with weakref so they auto-close self._temp_clients.add(temp_client) return temp_client @@ -197,13 +224,13 @@ def get_qdrant_client( # Fallback to singleton pattern for backward compatibility if force_new: - return QdrantClient(url=url, api_key=api_key if api_key else None) - + return QdrantClient(**_client_kwargs(url, api_key)) + global _client with _client_lock: if _client is None: - _client = QdrantClient(url=url, api_key=api_key if api_key else None) + _client = QdrantClient(**_client_kwargs(url, api_key)) return _client diff --git a/scripts/remote_upload_client.py b/scripts/remote_upload_client.py index a1a22279..0a065b22 100644 --- a/scripts/remote_upload_client.py +++ b/scripts/remote_upload_client.py @@ -580,7 +580,10 @@ def detect_file_changes(self, changed_paths: List[Path]) -> Dict[str, List]: # Skip paths that cannot be resolved continue - cached_hash = get_cached_file_hash(abs_path, self.repo_name) + # Translate to container path for cache lookup (cache stores container paths) + # This handles the case where bridge runs locally but cache was created in container + cache_key = self._translate_to_container_path(abs_path) + cached_hash = get_cached_file_hash(cache_key, self.repo_name) if not path.exists(): # File was deleted @@ -631,12 +634,12 @@ def detect_file_changes(self, changed_paths: List[Path]) -> Dict[str, List]: # Unchanged (content same despite stat change) changes["unchanged"].append(path) - # Update caches + # Update caches (use container path for cache consistency) try: self._stat_cache[abs_path] = (getattr(stat, "st_mtime_ns", int(stat.st_mtime * 1e9)), stat.st_size) except Exception as e: logger.debug(f"Suppressed exception: {e}") - set_cached_file_hash(abs_path, current_hash, self.repo_name) + set_cached_file_hash(cache_key, 
current_hash, self.repo_name) # Detect moves by looking for files with same content hash # but different paths (requires additional tracking) @@ -662,7 +665,9 @@ def _detect_moves(self, created_files: List[Path], deleted_files: List[Path]) -> for deleted_path in deleted_files: try: # Try to get cached hash first, fallback to file content - cached_hash = get_cached_file_hash(str(deleted_path), self.repo_name) + # Use container path for cache lookup (cache stores container paths) + cache_key = self._translate_to_container_path(str(deleted_path)) + cached_hash = get_cached_file_hash(cache_key, self.repo_name) if cached_hash: deleted_hashes[cached_hash] = deleted_path continue @@ -777,7 +782,9 @@ def create_delta_bundle( content = f.read() file_hash = hashlib.sha1(content).hexdigest() content_hash = f"sha1:{file_hash}" - previous_hash = get_cached_file_hash(str(path.resolve()), self.repo_name) + # Use container path for cache lookup (cache stores container paths) + cache_key = self._translate_to_container_path(str(path.resolve())) + previous_hash = get_cached_file_hash(cache_key, self.repo_name) # Write file to bundle bundle_file_path = files_dir / "updated" / rel_path @@ -853,7 +860,9 @@ def create_delta_bundle( for path in changes["deleted"]: rel_path = path.relative_to(Path(self.workspace_path)).as_posix() try: - previous_hash = get_cached_file_hash(str(path.resolve()), self.repo_name) + # Use container path for cache lookup (cache stores container paths) + cache_key = self._translate_to_container_path(str(path.resolve())) + previous_hash = get_cached_file_hash(cache_key, self.repo_name) operation = { "operation": "deleted", diff --git a/scripts/upload_service.py b/scripts/upload_service.py index 3c718ffd..fc8e66fa 100644 --- a/scripts/upload_service.py +++ b/scripts/upload_service.py @@ -303,6 +303,17 @@ class AuthUserCreateResponse(BaseModel): username: str +class AuthValidateRequest(BaseModel): + session_id: str + + +class AuthValidateResponse(BaseModel): + 
valid: bool + user_id: Optional[str] = None + expires_at: Optional[int] = None + metadata: Optional[Dict[str, Any]] = None + + class PasswordLoginRequest(BaseModel): username: str password: str @@ -599,6 +610,41 @@ async def auth_login(payload: AuthLoginRequest): ) +@app.post("/auth/validate", response_model=AuthValidateResponse) +async def auth_validate(payload: AuthValidateRequest): + """Validate a session ID and return session info if valid. + + This endpoint allows remote MCP servers to validate sessions against the + auth backend that issued them, enabling distributed auth validation. + """ + try: + if not AUTH_ENABLED: + # When auth is disabled, all sessions are considered valid + return AuthValidateResponse(valid=True, user_id=None, expires_at=None, metadata=None) + + sid = (payload.session_id or "").strip() + if not sid: + return AuthValidateResponse(valid=False) + + try: + record = validate_session(sid) + except AuthDisabledError: + return AuthValidateResponse(valid=True, user_id=None, expires_at=None, metadata=None) + + if record is None: + return AuthValidateResponse(valid=False) + + return AuthValidateResponse( + valid=True, + user_id=record.get("user_id"), + expires_at=record.get("expires_at"), + metadata=record.get("metadata"), + ) + except Exception as e: + logger.error(f"[upload_service] Failed to validate session: {e}") + raise HTTPException(status_code=500, detail="Failed to validate session") + + @app.get("/admin") async def admin_root(request: Request): if not AUTH_ENABLED: diff --git a/tests/test_context_answer.py b/tests/test_context_answer.py index 9aa74c4b..022c5d85 100644 --- a/tests/test_context_answer.py +++ b/tests/test_context_answer.py @@ -1,3 +1,4 @@ +import asyncio import importlib import types import pytest @@ -51,7 +52,7 @@ def generate_with_soft_embeddings(self, prompt: str, max_tokens: int = 256, **kw monkeypatch.setattr(ref, "LlamaCppRefragClient", FakeLlama) monkeypatch.setattr(ref, "is_decoder_enabled", lambda: True) - out = 
srv.asyncio.get_event_loop().run_until_complete( + out = asyncio.run( srv.context_answer(query="how to do x", limit=2, per_path=1) ) @@ -82,7 +83,7 @@ def generate_with_soft_embeddings(self, *a, **k): monkeypatch.setattr(ref, "LlamaCppRefragClient", FakeLlama) monkeypatch.setattr(ref, "is_decoder_enabled", lambda: False) - out = srv.asyncio.get_event_loop().run_until_complete( + out = asyncio.run( srv.context_answer(query="how to do y", limit=1) ) @@ -130,7 +131,7 @@ def generate_with_soft_embeddings(self, prompt: str, max_tokens: int = 256, **kw monkeypatch.setattr(ref, "LlamaCppRefragClient", FakeLlama) monkeypatch.setattr(ref, "is_decoder_enabled", lambda: True) - out = srv.asyncio.get_event_loop().run_until_complete( + out = asyncio.run( srv.context_answer(query="what is RRF_K in hybrid_search.py?", limit=1, per_path=1) ) @@ -179,7 +180,7 @@ def generate_with_soft_embeddings(self, *a, **kw): monkeypatch.setattr(ref, "LlamaCppRefragClient", FakeLlama) monkeypatch.setattr(ref, "is_decoder_enabled", lambda: True) - out = srv.asyncio.get_event_loop().run_until_complete( + out = asyncio.run( srv.context_answer(query="RRF_K", limit=1, per_path=1) ) @@ -214,7 +215,7 @@ def _raise_retrieval(*a, **k): monkeypatch.setattr(srv, "_ca_prepare_filters_and_retrieve", _raise_retrieval) - out = srv.asyncio.get_event_loop().run_until_complete( + out = asyncio.run( srv.context_answer(query="x", limit=1, per_path=1) ) assert "error" in out @@ -245,7 +246,7 @@ def _fake_retrieval(*a, **k): import scripts.refrag_llamacpp as ref monkeypatch.setattr(ref, "is_decoder_enabled", lambda: False) - out2 = srv.asyncio.get_event_loop().run_until_complete( + out2 = asyncio.run( srv.context_answer(query="x", limit=1, per_path=1) ) assert isinstance(out2, dict) diff --git a/tests/test_context_answer_path_mention.py b/tests/test_context_answer_path_mention.py index 59299b8d..b0e80f75 100644 --- a/tests/test_context_answer_path_mention.py +++ b/tests/test_context_answer_path_mention.py @@ -30,7 
+30,7 @@ def generate_with_soft_embeddings(self, prompt: str, max_tokens: int = 64, **kw) # Mention an actual file in this repo so fallback can find it q = "explain something in scripts/hybrid_search.py" - out = srv.asyncio.get_event_loop().run_until_complete( + out = srv.asyncio.run( srv.context_answer(query=q, limit=3, per_path=2) ) assert isinstance(out, dict) diff --git a/tests/test_env_behavior.py b/tests/test_env_behavior.py index b5daa6f3..3803f210 100644 --- a/tests/test_env_behavior.py +++ b/tests/test_env_behavior.py @@ -20,6 +20,8 @@ def test_rerank_timeout_floor_and_env_defaults(monkeypatch): monkeypatch.setenv("RERANK_TIMEOUT_FLOOR_MS", "1500") # Fix default timeout for test determinism (CI may set a higher value) monkeypatch.setenv("RERANKER_TIMEOUT_MS", "200") + # Override the min clamp so the floor takes effect + monkeypatch.setenv("RERANK_TIMEOUT_MIN_MS", "0") # Fake _run_async to capture calls calls = [] @@ -47,7 +49,7 @@ async def fake_run(cmd, env=None, timeout=None): monkeypatch.setattr(srv, "_run_async", fake_run) # Call repo_search with no rerank_enabled arg to pick env default - res = srv.asyncio.get_event_loop().run_until_complete( + res = srv.asyncio.run( srv.repo_search(query="foo", limit=3, per_path=1) ) diff --git a/tests/test_error_paths.py b/tests/test_error_paths.py index 0beb9d4b..ad4d04e4 100644 --- a/tests/test_error_paths.py +++ b/tests/test_error_paths.py @@ -17,7 +17,7 @@ async def fake_run(cmd, **kwargs): monkeypatch.setattr(srv, "_run_async", fake_run) - res = srv.asyncio.get_event_loop().run_until_complete( + res = asyncio.run( srv.repo_search(queries=["x"], limit=1, compact=False, lean=False) ) @@ -49,7 +49,7 @@ async def fake_run(cmd, **kwargs): monkeypatch.setattr(srv, "_run_async", fake_run) - res = srv.asyncio.get_event_loop().run_until_complete( + res = asyncio.run( srv.repo_search(queries=["x"], limit=1, compact=True, lean=False) ) diff --git a/tests/test_globs_and_snippet.py b/tests/test_globs_and_snippet.py index 
2486e1a0..9eb7fcbb 100644 --- a/tests/test_globs_and_snippet.py +++ b/tests/test_globs_and_snippet.py @@ -223,6 +223,6 @@ def run_hybrid_search(**kwargs): @pytest.mark.unit def test_repo_search_docstring_clean(): doc = srv.repo_search.__doc__ - assert doc and "Zero-config code search" in doc + assert doc and "Primary hybrid semantic" in doc # Ensure stray inline pseudo-code is not embedded in docstring assert "Accept common alias keys from clients" not in doc diff --git a/tests/test_qdrant_client_manager_pool.py b/tests/test_qdrant_client_manager_pool.py index 85408783..7cc4c9b2 100644 --- a/tests/test_qdrant_client_manager_pool.py +++ b/tests/test_qdrant_client_manager_pool.py @@ -18,7 +18,7 @@ class _DummyQdrantClient: - def __init__(self, url=None, api_key=None): + def __init__(self, url=None, api_key=None, **kwargs): self.url = url self.api_key = api_key self.closed = False