Skip to content
Merged
Show file tree
Hide file tree
Changes from all commits
Commits
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
15 changes: 15 additions & 0 deletions .env.example
Original file line number Diff line number Diff line change
Expand Up @@ -24,6 +24,13 @@ EMBEDDING_PROVIDER=fastembed
# Optional repo tag attached to each payload
REPO_NAME=workspace

# Cross-Codebase Isolation (multi-repo search scoping)
# When enabled, search results are automatically filtered to the current repo
# Disable (=0) to search all repos by default (legacy behavior)
REPO_AUTO_FILTER=1
# Explicitly set current repo (overrides auto-detection from git/directory; when unset, REPO_NAME is used for auto-filtering)
# CURRENT_REPO=my-project

# MCP servers (SSE)
FASTMCP_HOST=0.0.0.0
FASTMCP_PORT=8000 # search/store MCP (mcp-server-qdrant)
Expand Down Expand Up @@ -56,6 +63,14 @@ RERANKER_TOPN=50
RERANKER_RETURN_M=12
RERANKER_TIMEOUT_MS=2000

# Post-rerank symbol boost: ensures exact symbol matches rank highest even when
# the neural reranker disagrees. Applied after rerank blending as a direct score addition.
# Set to 0 to disable, >1.0 to boost symbol matches more aggressively.
POST_RERANK_SYMBOL_BOOST=1.0
# Rerank blend weight: ratio of rerank score vs fusion score (0.0-1.0)
# Higher = more weight on neural reranker, lower = more weight on lexical/symbol boosts
RERANK_BLEND_WEIGHT=0.6

# Safety: minimum rerank timeout floor (ms) to avoid cold-start timeouts
RERANK_TIMEOUT_FLOOR_MS=1000

Expand Down
31 changes: 30 additions & 1 deletion docs/MCP_API.md
Original file line number Diff line number Diff line change
Expand Up @@ -160,6 +160,13 @@ Perform hybrid code search combining dense semantic, lexical BM25, and optional
- `limit` (int, default 10): Maximum total results to return
- `per_path` (int, default 2): Maximum results per file path

**Cross-Codebase Isolation:**
- `repo` (str or list[str], optional): Filter results to specific repository(ies)
- Single repo: `"pathful-commons-app"` - Search only this repo
- Multiple repos: `["frontend", "backend"]` (or a comma-separated string such as `"frontend,backend"`) - Search related repos together
- All repos: `"*"` - Explicitly search all indexed repos (disable auto-filter)
- Default: When `REPO_AUTO_FILTER=1` and no `repo` is given, results are filtered to the current repo taken from the `CURRENT_REPO` env var (falling back to `REPO_NAME`)

**Content Filters:**
- `language` (str, optional): Filter by programming language
- `path_glob` (str or list[str], optional): Glob patterns for path filtering
Expand All @@ -185,6 +192,10 @@ Perform hybrid code search combining dense semantic, lexical BM25, and optional
- `rerank_top_n` (int, default 50): Number of candidates to consider for reranking
- `rerank_return_m` (int, default 12): Number of results to return after reranking

Reranking uses a blended scoring approach that preserves symbol match boosts:
- **Blend weight** (`RERANK_BLEND_WEIGHT`, default 0.6): Ratio of neural reranker score to fusion score
- **Post-rerank symbol boost** (`POST_RERANK_SYMBOL_BOOST`, default 1.0): Applied after blending to ensure exact symbol matches rank highest even when the neural reranker disagrees

**Response Format:**
```json
{
Expand Down Expand Up @@ -254,12 +265,30 @@ Perform hybrid code search combining dense semantic, lexical BM25, and optional
}
```

**Cross-Codebase Search (multi-repo):**
```json
{
"query": "authentication middleware",
"repo": ["frontend", "backend"],
"limit": 15
}
```

**Single Repo Search:**
```json
{
"query": "user authentication",
"repo": "my-repo",
"include_snippet": true
}
```

### context_search()

Blend code search results with memory entries for comprehensive context.

**Parameters:**
All `repo_search` parameters plus:
All `repo_search` parameters (including `repo` for cross-codebase isolation) plus:
- `include_memories` (bool, default true): Whether to include memory results
- `memory_weight` (float, default 1.0): Weight for memory results vs code results
- `per_source_limits` (dict, optional): Limits per source type:
Expand Down
42 changes: 37 additions & 5 deletions scripts/hybrid_search.py
Original file line number Diff line number Diff line change
Expand Up @@ -1454,6 +1454,7 @@ def run_hybrid_search(
expand: bool = True,
model: TextEmbedding | None = None,
collection: str | None = None,
repo: str | list[str] | None = None, # Filter by repo name(s); "*" to disable auto-filter
) -> List[Dict[str, Any]]:
client = QdrantClient(url=os.environ.get("QDRANT_URL", QDRANT_URL), api_key=API_KEY)
model_name = os.environ.get("EMBEDDING_MODEL", MODEL_NAME)
Expand All @@ -1470,7 +1471,23 @@ def run_hybrid_search(
eff_ext = ext or dsl.get("ext")
eff_not = not_filter or dsl.get("not")
eff_case = case or dsl.get("case") or os.environ.get("HYBRID_CASE", "insensitive")
eff_repo = dsl.get("repo")
# Repo filter: explicit param > DSL > auto-detect from env
eff_repo = repo or dsl.get("repo")
# Normalize repo to list for multi-repo support
if eff_repo and isinstance(eff_repo, str):
if eff_repo.strip() == "*":
eff_repo = None # "*" means search all repos
else:
eff_repo = [r.strip() for r in eff_repo.split(",") if r.strip()]
elif eff_repo and isinstance(eff_repo, (list, tuple)):
eff_repo = [str(r).strip() for r in eff_repo if str(r).strip() and str(r).strip() != "*"]
if not eff_repo:
eff_repo = None
# Auto-detect repo from env if not specified and auto-filter is enabled
if eff_repo is None and str(os.environ.get("REPO_AUTO_FILTER", "1")).strip().lower() in {"1", "true", "yes", "on"}:
auto_repo = os.environ.get("CURRENT_REPO") or os.environ.get("REPO_NAME")
if auto_repo and auto_repo.strip():
eff_repo = [auto_repo.strip()]
eff_path_regex = path_regex

def _to_list(x):
Expand Down Expand Up @@ -1595,12 +1612,27 @@ def _norm_under(u: str | None) -> str | None:
key="metadata.language", match=models.MatchValue(value=eff_language)
)
)
# Repo filter: supports single repo or list of repos (for related codebases)
if eff_repo:
must.append(
models.FieldCondition(
key="metadata.repo", match=models.MatchValue(value=eff_repo)
if isinstance(eff_repo, list) and len(eff_repo) == 1:
must.append(
models.FieldCondition(
key="metadata.repo", match=models.MatchValue(value=eff_repo[0])
)
)
elif isinstance(eff_repo, list) and len(eff_repo) > 1:
# Multiple repos: use MatchAny for OR logic
must.append(
models.FieldCondition(
key="metadata.repo", match=models.MatchAny(any=eff_repo)
)
)
elif isinstance(eff_repo, str):
must.append(
models.FieldCondition(
key="metadata.repo", match=models.MatchValue(value=eff_repo)
)
)
)
if eff_under:
must.append(
models.FieldCondition(
Expand Down
Loading
Loading