diff --git a/.env.example b/.env.example index 3da5e865..0e918238 100644 --- a/.env.example +++ b/.env.example @@ -24,6 +24,13 @@ EMBEDDING_PROVIDER=fastembed # Optional repo tag attached to each payload REPO_NAME=workspace +# Cross-Codebase Isolation (multi-repo search scoping) +# When enabled, search results are automatically filtered to the current repo +# Disable (=0) to search all repos by default (legacy behavior) +REPO_AUTO_FILTER=1 +# Explicitly set current repo (overrides auto-detection from git/directory) +# CURRENT_REPO=my-project + # MCP servers (SSE) FASTMCP_HOST=0.0.0.0 FASTMCP_PORT=8000 # search/store MCP (mcp-server-qdrant) @@ -56,6 +63,14 @@ RERANKER_TOPN=50 RERANKER_RETURN_M=12 RERANKER_TIMEOUT_MS=2000 +# Post-rerank symbol boost: ensures exact symbol matches rank highest even when +# the neural reranker disagrees. Applied after rerank blending as a direct score addition. +# Set to 0 to disable, >1.0 to boost symbol matches more aggressively. +POST_RERANK_SYMBOL_BOOST=1.0 +# Rerank blend weight: ratio of rerank score vs fusion score (0.0-1.0) +# Higher = more weight on neural reranker, lower = more weight on lexical/symbol boosts +RERANK_BLEND_WEIGHT=0.6 + # Safety: minimum rerank timeout floor (ms) to avoid cold-start timeouts RERANK_TIMEOUT_FLOOR_MS=1000 diff --git a/docs/MCP_API.md b/docs/MCP_API.md index ac12ea69..bd82157f 100644 --- a/docs/MCP_API.md +++ b/docs/MCP_API.md @@ -160,6 +160,13 @@ Perform hybrid code search combining dense semantic, lexical BM25, and optional - `limit` (int, default 10): Maximum total results to return - `per_path` (int, default 2): Maximum results per file path +**Cross-Codebase Isolation:** +- `repo` (str or list[str], optional): Filter results to specific repository(ies) + - Single repo: `"pathful-commons-app"` - Search only this repo + - Multiple repos: `["frontend", "backend"]` - Search related repos together + - All repos: `"*"` - Explicitly search all indexed repos (disable auto-filter) + - Default: Auto-detects current repo from `CURRENT_REPO` env when `REPO_AUTO_FILTER=1` + **Content Filters:** - `language` (str, optional): Filter by programming language - `path_glob` (str or list[str], optional): Glob patterns for path filtering @@ -185,6 +192,10 @@ Perform hybrid code search combining dense semantic, lexical BM25, and optional - `rerank_top_n` (int, default 50): Number of candidates to consider for reranking - `rerank_return_m` (int, default 12): Number of results to return after reranking +Reranking uses a blended scoring approach that preserves symbol match boosts: +- **Blend weight** (`RERANK_BLEND_WEIGHT`, default 0.6): Ratio of neural reranker score to fusion score +- **Post-rerank symbol boost** (`POST_RERANK_SYMBOL_BOOST`, default 1.0): Applied after blending to ensure exact symbol matches rank highest even when the neural reranker disagrees + **Response Format:** ```json { @@ -254,12 +265,30 @@ Perform hybrid code search combining dense semantic, lexical BM25, and optional } ``` +**Cross-Codebase Search (multi-repo):** +```json +{ + "query": "authentication middleware", + "repo": ["frontend", "backend"], + "limit": 15 +} +``` + +**Single Repo Search:** +```json +{ + "query": "user authentication", + "repo": "my-repo", + "include_snippet": true +} +``` + ### context_search() Blend code search results with memory entries for comprehensive context. **Parameters:** -All `repo_search` parameters plus: +All `repo_search` parameters (including `repo` for cross-codebase isolation) plus: - `include_memories` (bool, default true): Whether to include memory results - `memory_weight` (float, default 1.0): Weight for memory results vs code results - `per_source_limits` (dict, optional): Limits per source type: diff --git a/scripts/hybrid_search.py b/scripts/hybrid_search.py index 98f52588..9ff07bd8 100644 --- a/scripts/hybrid_search.py +++ b/scripts/hybrid_search.py @@ -1454,6 +1454,7 @@ def run_hybrid_search( expand: bool = True, model: TextEmbedding | None = None, collection: str | None = None, + repo: str | list[str] | None = None, # Filter by repo name(s); "*" to disable auto-filter ) -> List[Dict[str, Any]]: client = QdrantClient(url=os.environ.get("QDRANT_URL", QDRANT_URL), api_key=API_KEY) model_name = os.environ.get("EMBEDDING_MODEL", MODEL_NAME) @@ -1470,7 +1471,23 @@ def run_hybrid_search( eff_ext = ext or dsl.get("ext") eff_not = not_filter or dsl.get("not") eff_case = case or dsl.get("case") or os.environ.get("HYBRID_CASE", "insensitive") - eff_repo = dsl.get("repo") + # Repo filter: explicit param > DSL > auto-detect from env + eff_repo = repo or dsl.get("repo") + # Normalize repo to list for multi-repo support + if eff_repo and isinstance(eff_repo, str): + if eff_repo.strip() == "*": + eff_repo = None # "*" means search all repos + else: + eff_repo = [r.strip() for r in eff_repo.split(",") if r.strip()] + elif eff_repo and isinstance(eff_repo, (list, tuple)): + eff_repo = [str(r).strip() for r in eff_repo if str(r).strip() and str(r).strip() != "*"] + if not eff_repo: + eff_repo = None + # Auto-detect repo from env if not specified and auto-filter is enabled + if eff_repo is None and str(os.environ.get("REPO_AUTO_FILTER", "1")).strip().lower() in {"1", "true", "yes", "on"}: + auto_repo = os.environ.get("CURRENT_REPO") or os.environ.get("REPO_NAME") + if auto_repo and auto_repo.strip(): + eff_repo = [auto_repo.strip()] eff_path_regex = path_regex def _to_list(x): @@ -1595,12 +1612,27 @@ def _norm_under(u: str | None) -> str | None: key="metadata.language", match=models.MatchValue(value=eff_language) ) ) + # Repo filter: supports single repo or list of repos (for related codebases) if eff_repo: - must.append( - models.FieldCondition( - key="metadata.repo", match=models.MatchValue(value=eff_repo) + if isinstance(eff_repo, list) and len(eff_repo) == 1: + must.append( + models.FieldCondition( + key="metadata.repo", match=models.MatchValue(value=eff_repo[0]) + ) + ) + elif isinstance(eff_repo, list) and len(eff_repo) > 1: + # Multiple repos: use MatchAny for OR logic + must.append( + models.FieldCondition( + key="metadata.repo", match=models.MatchAny(any=eff_repo) + ) + ) + elif isinstance(eff_repo, str): + must.append( + models.FieldCondition( + key="metadata.repo", match=models.MatchValue(value=eff_repo) + ) ) - ) if eff_under: must.append( models.FieldCondition( diff --git a/scripts/ingest_code.py b/scripts/ingest_code.py index dc769dcd..6b9d9f06 100644 --- a/scripts/ingest_code.py +++ b/scripts/ingest_code.py @@ -23,6 +23,11 @@ def _detect_repo_name_from_path(path: Path) -> str: from pathlib import Path from typing import List, Dict, Iterable +try: + from tqdm import tqdm +except ImportError: + tqdm = None # type: ignore + # Ensure project root is on sys.path when run as a script (so 'scripts' package imports work) ROOT_DIR = Path(__file__).resolve().parent.parent if str(ROOT_DIR) not in sys.path: @@ -2024,8 +2029,25 @@ def index_single_file( *, dedupe: bool = True, skip_unchanged: bool = True, + trust_cache: bool | None = None, ) -> bool: - """Index a single file path. Returns True if indexed, False if skipped.""" + """Index a single file path. Returns True if indexed, False if skipped. + + When trust_cache is enabled (via argument or INDEX_TRUST_CACHE=1), rely solely on the + local .codebase/cache.json for unchanged detection and skip Qdrant per-file hash checks. + """ + # Resolve trust_cache from env when not explicitly provided + if trust_cache is None: + try: + trust_cache = os.environ.get("INDEX_TRUST_CACHE", "").strip().lower() in { + "1", + "true", + "yes", + "on", + } + except Exception: + trust_cache = False + try: text = file_path.read_text(encoding="utf-8", errors="ignore") except Exception as e: @@ -2092,16 +2114,19 @@ def index_single_file( return False except Exception: pass - prev = get_indexed_file_hash( - client, - collection, - str(file_path), - repo_id=repo_id, - repo_rel_path=repo_rel_path, - ) - if prev and prev == file_hash: - print(f"Skipping unchanged file: {file_path}") - return False + + # Optional Qdrant-backed unchanged detection; disabled when trust_cache is enabled + if not trust_cache: + prev = get_indexed_file_hash( + client, + collection, + str(file_path), + repo_id=repo_id, + repo_rel_path=repo_rel_path, + ) + if prev and prev == file_hash: + print(f"Skipping unchanged file: {file_path}") + return False if dedupe: delete_points_by_path(client, collection, str(file_path)) @@ -2443,7 +2468,9 @@ def index_repo( ) # Health check: detect cache/collection sync issues before indexing (single-collection mode only) - if not recreate and skip_unchanged and not use_per_repo_collections and collection: + # Skip with SKIP_HEALTH_CHECK=1 for large collections where scroll is slow + _skip_health = os.environ.get("SKIP_HEALTH_CHECK", "").strip().lower() in {"1", "true", "yes"} + if not _skip_health and not recreate and skip_unchanged and not use_per_repo_collections and collection: try: from scripts.collection_health import auto_heal_if_needed @@ -2457,6 +2484,8 @@ def index_repo( print("[health_check] Collection health OK") except Exception as e: print(f"[health_check] Warning: health check failed: {e}") + elif _skip_health: + print("[health_check] Skipped (SKIP_HEALTH_CHECK=1)") # Skip single collection setup in multi-repo mode if not use_per_repo_collections: @@ -2481,6 +2510,15 @@ def index_repo( CHUNK_LINES = int(os.environ.get("INDEX_CHUNK_LINES", "120") or 120) CHUNK_OVERLAP = int(os.environ.get("INDEX_CHUNK_OVERLAP", "20") or 20) PROGRESS_EVERY = int(os.environ.get("INDEX_PROGRESS_EVERY", "200") or 200) + # Trust-cache mode: skip Qdrant hash lookups when local cache says unchanged + _trust_cache = os.environ.get("INDEX_TRUST_CACHE", "").strip().lower() in { + "1", + "true", + "yes", + "on", + } + if _trust_cache: + print("[trust_cache] INDEX_TRUST_CACHE enabled - skipping Qdrant per-file hash checks") # Semantic chunking toggle use_semantic = os.environ.get("INDEX_SEMANTIC_CHUNKS", "1").lower() in { "1", @@ -2519,7 +2557,21 @@ def make_point(pid, dense_vec, lex_vec, payload): # Track per-file hashes across the entire run for cache updates on any flush batch_file_hashes = {} - for file_path in iter_files(root): + # Collect files for progress bar (fast: just list paths, no I/O) + all_files = list(iter_files(root)) + total_files = len(all_files) + print(f"Found {total_files} files to process") + + # Use tqdm progress bar if available, otherwise simple iteration + # When progress bar is active, suppress per-file skip messages + _use_progress_bar = tqdm is not None + if _use_progress_bar: + file_iter = tqdm(all_files, desc="Indexing", unit="file", ncols=100, + bar_format="{l_bar}{bar}| {n_fmt}/{total_fmt} [{elapsed}<{remaining}]") + else: + file_iter = all_files + + for file_path in file_iter: files_seen += 1 # Determine collection per-file in multi-repo mode (use watcher's exact logic) @@ -2537,6 +2589,11 @@ def make_point(pid, dense_vec, lex_vec, payload): except Exception as e: print(f"Skipping {file_path}: {e}") continue + + # Skip empty files + if not text or not text.strip(): + continue + language = detect_language(file_path) file_hash = hashlib.sha1(text.encode("utf-8", errors="ignore")).hexdigest() @@ -2588,79 +2645,40 @@ def make_point(pid, dense_vec, lex_vec, payload): if get_cached_file_hash: prev_local = get_cached_file_hash(str(file_path), per_file_repo) if prev_local and file_hash and prev_local == file_hash: - if PROGRESS_EVERY <= 0 and files_seen % 50 == 0: - print(f"... processed {files_seen} files (skipping unchanged, cache)") - try: - if update_indexing_status: - target_workspace = ( - ws_path if not use_per_repo_collections else str(file_path.parent) - ) - target_repo = ( - repo_tag if not use_per_repo_collections else per_file_repo - ) - update_indexing_status( - workspace_path=target_workspace, - status={ - "state": "indexing", - "progress": { - "files_processed": files_seen, - "total_files": None, - "current_file": str(file_path), - }, - }, - repo_name=target_repo, - ) - except Exception: - pass - else: - print(f"Skipping unchanged file (cache): {file_path}") + # Only print skip messages if no progress bar + if not _use_progress_bar: + if PROGRESS_EVERY <= 0 and files_seen % 50 == 0: + print(f"... processed {files_seen} files (skipping unchanged, cache)") + else: + print(f"Skipping unchanged file (cache): {file_path}") continue except Exception: pass # Check existing indexed hash in Qdrant (logical identity when available) - prev = get_indexed_file_hash( - client, - current_collection, - str(file_path), - repo_id=repo_id, - repo_rel_path=repo_rel_path, - ) - if prev and file_hash and prev == file_hash: - # File exists in Qdrant with same hash - cache it locally for next time - try: - if set_cached_file_hash: - set_cached_file_hash(str(file_path), file_hash, per_file_repo) - except Exception: - pass - if PROGRESS_EVERY <= 0 and files_seen % 50 == 0: - # minor heartbeat when no progress cadence configured - print(f"... processed {files_seen} files (skipping unchanged)") + # Skip this when INDEX_TRUST_CACHE is enabled - rely solely on local cache + if not _trust_cache: + prev = get_indexed_file_hash( + client, + current_collection, + str(file_path), + repo_id=repo_id, + repo_rel_path=repo_rel_path, + ) + if prev and file_hash and prev == file_hash: + # File exists in Qdrant with same hash - cache it locally for next time try: - if update_indexing_status: - target_workspace = ( - ws_path if not use_per_repo_collections else str(file_path.parent) - ) - target_repo = ( - repo_tag if not use_per_repo_collections else per_file_repo - ) - update_indexing_status( - workspace_path=target_workspace, - status={ - "state": "indexing", - "progress": { - "files_processed": files_seen, - "total_files": None, - "current_file": str(file_path), - }, - }, - repo_name=target_repo, - ) + if set_cached_file_hash: + set_cached_file_hash(str(file_path), file_hash, per_file_repo) except Exception: pass - else: - print(f"Skipping unchanged file: {file_path}") - continue + # Only print skip messages if no progress bar + if not _use_progress_bar: + if PROGRESS_EVERY <= 0 and files_seen % 50 == 0: + print(f"... processed {files_seen} files (skipping unchanged)") + else: + print(f"Skipping unchanged file: {file_path}") + continue # At this point, file content has changed vs previous index; attempt smart reindex when enabled if _smart_symbol_reindexing_enabled(): @@ -2696,6 +2714,8 @@ def make_point(pid, dense_vec, lex_vec, payload): delete_points_by_path(client, current_collection, str(file_path)) files_indexed += 1 + # Progress: show each file being indexed + print(f"Indexing [{files_indexed}]: {file_path}") symbols = _extract_symbols(language, text) imports, calls = _get_imports_calls(language, text) last_mod, churn_count, author_count = _git_metadata(file_path) diff --git a/scripts/mcp_indexer_server.py b/scripts/mcp_indexer_server.py index 8e20a7b0..af96c533 100644 --- a/scripts/mcp_indexer_server.py +++ b/scripts/mcp_indexer_server.py @@ -883,6 +883,59 @@ def _tokens_from_queries(qs): return out +def _detect_current_repo() -> str | None: + """Detect the current repository name from workspace/env. + + Priority: + 1. CURRENT_REPO env var (explicitly set) + 2. REPO_NAME env var + 3. Detect from /work directory structure (first subdirectory with .git) + 4. Git remote origin name + + Returns: repo name or None if detection fails + """ + # Check explicit env vars first + for env_key in ("CURRENT_REPO", "REPO_NAME"): + val = os.environ.get(env_key, "").strip() + if val: + return val + + # Try to detect from /work directory + work_path = Path("/work") + if work_path.exists(): + try: + # Check for .git in /work itself + if (work_path / ".git").exists(): + # Use git to get repo name from remote + try: + import subprocess + result = subprocess.run( + ["git", "-C", str(work_path), "config", "--get", "remote.origin.url"], + capture_output=True, text=True, timeout=5 + ) + if result.returncode == 0 and result.stdout.strip(): + url = result.stdout.strip() + # Extract repo name from URL (e.g., git@github.com:user/repo.git -> repo) + name = url.rstrip("/").rsplit("/", 1)[-1] + if name.endswith(".git"): + name = name[:-4] + if name: + return name + except Exception: + pass + # Fallback to directory name + return work_path.name + + # Check subdirectories for repos + for subdir in work_path.iterdir(): + if subdir.is_dir() and (subdir / ".git").exists(): + return subdir.name + except Exception: + pass + + return None + + @mcp.tool() async def qdrant_index_root( recreate: Optional[bool] = None, collection: Optional[str] = None @@ -1693,6 +1746,8 @@ async def repo_search( ext: Any = None, not_: Any = None, case: Any = None, + # Repo scoping (cross-codebase isolation) + repo: Any = None, # str, list[str], or "*" to search all repos # Response shaping compact: Any = None, args: Any = None, # Compatibility shim for mcp-remote/Claude wrappers that send args/kwargs @@ -1711,6 +1766,9 @@ async def repo_search( - include_snippet/context_lines: return inline snippets near hits when true. - rerank_*: optional ONNX reranker toggles; timeouts fall back to hybrid output. - collection: str. Target collection; defaults to workspace state or env COLLECTION_NAME. + - repo: str or list[str]. Filter by repo name(s). Use "*" to search all repos (disable auto-filter). + By default, auto-detects current repo from CURRENT_REPO env and filters to it. + Use repo=["frontend","backend"] to search related repos together. - Filters (optional): language, under (path prefix), kind, symbol, ext, path_regex, path_glob (str or list[str]), not_glob (str or list[str]), not_ (negative text), case. @@ -1957,6 +2015,29 @@ def _to_str_list(x): ext = _to_str(ext, "").strip() not_ = _to_str(not_, "").strip() case = _to_str(case, "").strip() + + # Normalize repo filter: str, list[str], or "*" (search all) + # Default: auto-detect current repo unless REPO_AUTO_FILTER=0 + repo_filter = None + if repo is not None: + if isinstance(repo, str): + r = repo.strip() + if r == "*": + repo_filter = "*" # Explicit "search all repos" + elif r: + # Support comma-separated list + repo_filter = [x.strip() for x in r.split(",") if x.strip()] + elif isinstance(repo, (list, tuple)): + repo_filter = [str(x).strip() for x in repo if str(x).strip() and str(x).strip() != "*"] + if not repo_filter: + repo_filter = "*" # Empty list after filtering means search all + + # Auto-detect current repo if not explicitly specified and auto-filter is enabled + if repo_filter is None and str(os.environ.get("REPO_AUTO_FILTER", "1")).strip().lower() in {"1", "true", "yes", "on"}: + detected_repo = _detect_current_repo() + if detected_repo: + repo_filter = [detected_repo] + compact_raw = compact compact = _to_bool(compact, False) # If snippets are requested, do not compact (we need snippet field in results) @@ -2039,6 +2120,7 @@ def _to_str_list(x): expand=str(os.environ.get("HYBRID_EXPAND", "1")).strip().lower() in {"1", "true", "yes", "on"}, model=model, + repo=repo_filter, # Cross-codebase isolation ) finally: if prev_coll is None: @@ -2128,6 +2210,7 @@ def _to_str_list(x): expand=str(os.environ.get("HYBRID_EXPAND", "0")).strip().lower() in {"1", "true", "yes", "on"}, model=model, + repo=repo_filter, # Cross-codebase isolation ) json_lines = items except Exception: @@ -2197,20 +2280,44 @@ def _doc_for(obj: dict) -> str: docs = list(ex.map(_doc_for, cand_objs)) pairs = [(rq, d) for d in docs] scores = _rr_local(pairs) - ranked = sorted( - zip(scores, cand_objs), key=lambda x: x[0], reverse=True - ) + # Blend rerank with fusion score to preserve pre-rerank boosts + # (symbol_exact, impl_boost, path boosts are otherwise lost) + _rerank_blend = float(os.environ.get("RERANK_BLEND_WEIGHT", "0.6") or 0.6) + _rerank_blend = max(0.0, min(1.0, _rerank_blend)) # clamp [0,1] + # Post-rerank symbol boost: apply symbol boosts directly to blended score + # This ensures exact symbol matches rank higher even when reranker disagrees + _post_symbol_boost = float(os.environ.get("POST_RERANK_SYMBOL_BOOST", "1.0") or 1.0) + blended = [] + for rr_score, obj in zip(scores, cand_objs): + fusion_score = float(obj.get("score", 0.0) or 0.0) + # Normalize fusion_score to similar scale as rerank (rough heuristic) + # Fusion scores are typically 0-3, rerank scores are -12 to 0 + # Shift fusion to negative range: fusion=2 -> -1, fusion=0 -> -3 + norm_fusion = fusion_score - 3.0 + blended_score = _rerank_blend * rr_score + (1.0 - _rerank_blend) * norm_fusion + # Apply post-rerank symbol boost: extract symbol boosts from components + # and add them directly to blended score (not diluted by blend weight) + comps = obj.get("components") or {} + sym_sub = float(comps.get("symbol_substr", 0.0) or 0.0) + sym_eq = float(comps.get("symbol_exact", 0.0) or 0.0) + post_boost = (sym_sub + sym_eq) * _post_symbol_boost + blended_score += post_boost + blended.append((blended_score, rr_score, obj, post_boost)) + ranked = sorted(blended, key=lambda x: x[0], reverse=True) tmp = [] - for s, obj in ranked[: int(rerank_return_m)]: + for blended_s, rr_s, obj, post_b in ranked[: int(rerank_return_m)]: + why_parts = obj.get("why", []) + [f"rerank_onnx:{float(rr_s):.3f}", f"blend:{float(blended_s):.3f}"] + if post_b > 0: + why_parts.append(f"post_sym:{float(post_b):.3f}") item = { - "score": float(s), + "score": float(blended_s), "path": obj.get("path", ""), "symbol": obj.get("symbol", ""), "start_line": int(obj.get("start_line") or 0), "end_line": int(obj.get("end_line") or 0), - "why": obj.get("why", []) + [f"rerank_onnx:{float(s):.3f}"], + "why": why_parts, "components": (obj.get("components") or {}) - | {"rerank_onnx": float(s)}, + | {"rerank_onnx": float(rr_s), "blended": float(blended_s), "post_symbol_boost": float(post_b)}, } # Preserve dual-path metadata when available so clients can prefer host paths _hostp = obj.get("host_path") @@ -2535,6 +2642,7 @@ async def repo_search_compat(**arguments) -> Dict[str, Any]: "not_": not_value, "case": args.get("case"), "compact": args.get("compact"), + "repo": args.get("repo"), # Cross-codebase isolation # Alias passthroughs captured by repo_search(**kwargs) "queries": queries, "q": args.get("q"), @@ -3093,6 +3201,8 @@ async def context_search( not_: Any = None, case: Any = None, compact: Any = None, + # Repo scoping (cross-codebase isolation) + repo: Any = None, # str, list[str], or "*" to search all repos kwargs: Any = None, ) -> Dict[str, Any]: """Blend code search results with memory-store entries (notes, docs) for richer context. @@ -3107,6 +3217,8 @@ async def context_search( - memory_weight: float (default 1.0). Scales memory scores relative to code. - per_source_limits: dict, e.g. {"code": 5, "memory": 3} - All repo_search filters are supported and passed through. + - repo: str or list[str]. Filter by repo name(s). Use "*" to search all repos (disable auto-filter). + By default, auto-detects current repo from CURRENT_REPO env and filters to it. Returns: - {"results": [{"source": "code"| "memory", ...}, ...], "total": N[, "memory_note": str]} @@ -3623,6 +3735,7 @@ def _maybe_dict(val: Any) -> Dict[str, Any]: not_=not_, case=case, compact=False, + repo=repo, # Cross-codebase isolation ) # Optional debug @@ -4619,6 +4732,7 @@ def _ca_prepare_filters_and_retrieve( model: Any, did_local_expand: bool, kwargs: Dict[str, Any], + repo: Any = None, # Cross-codebase isolation: str, list[str], or "*" ) -> Dict[str, Any]: """Build effective filters and run hybrid retrieval with identifier/usage augmentation. Returns a dict with: items, eff_language, eff_path_glob, eff_not_glob, override_under, @@ -4854,6 +4968,7 @@ def _add_query(q: str): in {"1", "true", "yes", "on"} ), model=model, + repo=repo, # Cross-codebase isolation ) if os.environ.get("DEBUG_CONTEXT_ANSWER"): try: @@ -5085,6 +5200,7 @@ def _ca_fallback_and_budget( cwd_root: str, include_snippet: bool, kwargs: Dict[str, Any], + repo: Any = None, # Cross-codebase isolation ) -> list[Dict[str, Any]]: """Run Tier2/Tier3 fallbacks, apply span budgeting, and select prioritized spans. Returns the final list of spans to use for citations/context. @@ -5163,6 +5279,7 @@ def _ok_lang(it: Dict[str, Any]) -> bool: in {"1", "true", "yes", "on"} ), model=model, + repo=repo, # Cross-codebase isolation ) # Ensure last call reflects tier-2 relaxed filters for introspection/testing _ = run_hybrid_search( @@ -5186,6 +5303,7 @@ def _ok_lang(it: Dict[str, Any]) -> bool: in {"1", "true", "yes", "on"} ), model=model, + repo=repo, # Cross-codebase isolation ) if os.environ.get("DEBUG_CONTEXT_ANSWER"): @@ -6542,6 +6660,8 @@ async def context_answer( not_glob: Any = None, case: Any = None, not_: Any = None, + # Repo scoping (cross-codebase isolation) + repo: Any = None, # str, list[str], or "*" to search all repos kwargs: Any = None, ) -> Dict[str, Any]: """Natural-language Q&A over the repo using retrieval + local LLM (llama.cpp). @@ -6563,6 +6683,8 @@ async def context_answer( - mode: "stitch" (default) or "pack" for prompt assembly. - expand: bool. Use tiny local LLM to propose up to 2 alternate queries. - Filters: language, under, kind, symbol, ext, path_regex, path_glob, not_glob, not_, case. + - repo: str or list[str]. Filter by repo name(s). Use "*" to search all repos (disable auto-filter). + By default, auto-detects current repo from CURRENT_REPO env and filters to it. Returns: - {"answer": str, "citations": [{"path": str, "start_line": int, "end_line": int}], "query": list[str], "used": {...}} @@ -6809,6 +6931,7 @@ async def context_answer( "case": _cfg["filters"].get("case"), "symbol": _cfg["filters"].get("symbol"), }, + repo=repo, # Cross-codebase isolation ) items = _retr["items"] eff_language = _retr["eff_language"] @@ -6856,6 +6979,7 @@ def _to_glob_list(val: Any) -> list[str]: cwd_root=cwd_root, include_snippet=bool(include_snippet), kwargs=fallback_kwargs, + repo=repo, # Cross-codebase isolation ) except Exception as e: err = str(e)