From d32345edeb6fff3cd8ce8861ea3497712a767d41 Mon Sep 17 00:00:00 2001 From: John Donalson <11264689+m1rl0k@users.noreply.github.com> Date: Tue, 2 Dec 2025 18:42:10 -0500 Subject: [PATCH] Info-Request --- .env | 17 ++ .env.example | 9 + .gitignore | 2 + scripts/mcp_indexer_server.py | 332 ++++++++++++++++++++++++++++++++++ 4 files changed, 360 insertions(+) diff --git a/.env b/.env index 802b6ea8..ee501184 100644 --- a/.env +++ b/.env @@ -169,3 +169,20 @@ USE_GPU_DECODER=0 # Development Remote Upload Configuration HOST_INDEX_PATH=./dev-workspace + +# Cross-Codebase Isolation (multi-repo search scoping) +# When enabled, search results are automatically filtered to the current repo +REPO_AUTO_FILTER=1 +# Explicitly set current repo (overrides auto-detection from git/directory) +# CURRENT_REPO= + +# Post-rerank symbol boost: ensures exact symbol matches rank highest +POST_RERANK_SYMBOL_BOOST=1.0 +# Rerank blend weight: ratio of rerank score vs fusion score (0.0-1.0) +RERANK_BLEND_WEIGHT=0.6 + +# info_request() tool settings (simplified codebase retrieval) +INFO_REQUEST_LIMIT=10 +INFO_REQUEST_CONTEXT_LINES=5 +# INFO_REQUEST_EXPLAIN_DEFAULT=0 +# INFO_REQUEST_RELATIONSHIPS=0 diff --git a/.env.example b/.env.example index 0e918238..cd8375c2 100644 --- a/.env.example +++ b/.env.example @@ -200,3 +200,12 @@ REFRAG_COMMIT_DESCRIBE=1 STRICT_MEMORY_RESTORE=0 +# info_request() tool settings (simplified codebase retrieval) +# Default result limit for info_request queries +INFO_REQUEST_LIMIT=10 +# Default context lines in snippets (richer than repo_search default) +INFO_REQUEST_CONTEXT_LINES=5 +# Enable explanation mode by default (summary, primary_locations, related_concepts) +# INFO_REQUEST_EXPLAIN_DEFAULT=0 +# Enable relationship mapping by default (imports_from, calls, related_paths) +# INFO_REQUEST_RELATIONSHIPS=0 diff --git a/.gitignore b/.gitignore index 4c548379..1e578035 100644 --- a/.gitignore +++ b/.gitignore @@ -32,3 +32,5 @@ CLAUDE.md /.augment /dev-workspace/* 
!/dev-workspace/.gitkeep
+docs/AUGMENT_COMPATIBILITY.md
+.serena/project.yml
diff --git a/scripts/mcp_indexer_server.py b/scripts/mcp_indexer_server.py
index af96c533..c912c721 100644
--- a/scripts/mcp_indexer_server.py
+++ b/scripts/mcp_indexer_server.py
@@ -7503,6 +7503,434 @@ async def code_search(
     )
 
 
+# ---------------------------------------------------------------------------
+# info_request: Simplified codebase retrieval with explanation mode
+# ---------------------------------------------------------------------------
+
+
+def _extract_symbols_from_query(query: str) -> list[str]:
+    """Extract up to five identifier-like tokens from a free-text query.
+
+    Recognizes CamelCase words, snake_case words, and the word that follows
+    an explicit keyword such as ``def`` or ``class``. Candidates of two
+    characters or fewer are ignored.
+
+    Args:
+        query: Natural-language search text.
+
+    Returns:
+        Unique symbols in first-seen order (pattern order, then position in
+        the query), so the same query always yields the same list.
+    """
+    import re
+
+    patterns = (
+        r'\b[A-Z][a-z]+(?:[A-Z][a-z]+)+\b',  # CamelCase
+        r'\b[a-z_][a-z0-9_]*(?:_[a-z0-9]+)+\b',  # snake_case
+        r'\b(?:def|class|function|method|async)\s+(\w+)',  # explicit mentions
+    )
+    # A dict keeps insertion order; slicing a set made the five-item cut
+    # arbitrary and different between runs under string hash randomization.
+    seen: dict[str, None] = {}
+    for pat in patterns:
+        for m in re.finditer(pat, query or ""):
+            sym = m.group(1) if m.lastindex else m.group(0)
+            if len(sym) > 2:
+                seen.setdefault(sym, None)
+    return list(seen)[:5]
+
+
+def _extract_related_concepts(query: str, results: list) -> list[str]:
+    """Derive up to ten related concept keywords (codebase-agnostic).
+
+    Concepts come from the first ten results (parts of each symbol name,
+    the last three path segments, the result kind) and from the
+    significant words of the query itself.
+
+    Args:
+        query: Original search text.
+        results: Result dicts; the ``symbol``, ``path`` and ``kind`` keys
+            are read when present.
+
+    Returns:
+        Lower-cased concepts, most frequent first; ties keep first-seen
+        order.
+    """
+    import re
+    from collections import Counter
+
+    counts = Counter()
+
+    for r in (results or [])[:10]:
+        # Symbol: split CamelCase/snake_case/kebab-case into parts.
+        sym = r.get("symbol", "") or ""
+        if len(sym) > 2:
+            parts = [p for p in re.split(r'(?=[A-Z])|_|-', sym) if len(p) > 2]
+            counts.update(p.lower() for p in parts[:3])
+
+        # Path: directory/module names from the last three segments.
+        path = r.get("path", "") or ""
+        for seg in path.replace("\\", "/").split("/")[-3:]:
+            name = seg.rsplit(".", 1)[0] if "." in seg else seg
+            if len(name) > 2 and not name.startswith("_"):
+                counts[name.lower()] += 1
+
+        # Kind: function, class, method, etc.
+        kind = r.get("kind", "") or ""
+        if len(kind) > 2:
+            counts[kind.lower()] += 1
+
+    # Query: significant words only (common filler words are skipped).
+    skip_words = {
+        "the", "is", "are", "how", "does", "what", "where",
+        "find", "get", "set", "for", "and", "with",
+    }
+    for word in re.split(r'\W+', (query or "").lower()):
+        if len(word) > 2 and word not in skip_words:
+            counts[word] += 1
+
+    # Rank by frequency. Slicing a set, as before, returned an arbitrary
+    # subset that changed between runs despite the promised sort.
+    return [concept for concept, _ in counts.most_common(10)]
+
+
+def _format_information_field(result: dict) -> str:
+    """Build the one-line, human-readable description of a result.
+
+    Args:
+        result: Result dict; ``path``, ``symbol``, ``kind``,
+            ``start_line`` and ``end_line`` are read. Missing or ``None``
+            values are tolerated.
+
+    Returns:
+        ``Found <kind> '<symbol>' in <file> (lines <start>-<end>)``, with
+        the kind and/or symbol omitted when they are unavailable.
+    """
+    # The "or" fallbacks cover keys that are present but None, which used
+    # to raise TypeError for the path and print "None" for line numbers.
+    path = result.get("path") or ""
+    symbol = result.get("symbol") or ""
+    kind = result.get("kind") or ""
+    start = result.get("start_line") or 0
+    end = result.get("end_line") or 0
+
+    # Basename only; backslashes are normalized so Windows-style paths
+    # are shortened too.
+    filename = path.replace("\\", "/").rsplit("/", 1)[-1]
+    location = f"{filename} (lines {start}-{end})"
+
+    if symbol and kind:
+        return f"Found {kind} '{symbol}' in {location}"
+    if symbol:
+        return f"Found '{symbol}' in {location}"
+    return f"Found match in {location}"
+
+
+def _extract_relationships(result: dict) -> dict:
+    """Collect import/call relationship metadata for a result.
+
+    Values come from the nested ``relations`` object first, falling back
+    to the top-level ``imports``/``calls`` keys.
+
+    Args:
+        result: Result dict as returned by the search.
+
+    Returns:
+        Dict with ``imports_from`` (max 10), ``calls`` (max 10),
+        ``symbol_path`` and ``related_paths`` (max 5). The list fields are
+        always lists, even when the source value is missing or malformed.
+    """
+
+    def _capped(value, cap):
+        # Slice real sequences only; a dict would raise on slicing and a
+        # string would leak a non-list value into the response.
+        return list(value[:cap]) if isinstance(value, (list, tuple)) else []
+
+    relations = result.get("relations")
+    if not isinstance(relations, dict):
+        relations = {}
+
+    imports = relations.get("imports") or result.get("imports")
+    calls = relations.get("calls") or result.get("calls")
+    return {
+        "imports_from": _capped(imports, 10),
+        "calls": _capped(calls, 10),
+        "symbol_path": relations.get("symbol_path") or "",
+        "related_paths": _capped(result.get("related_paths"), 5),
+    }
+
+
+def _calculate_confidence(query: str, results: list) -> dict:
+    """Estimate how well the results answer the query.
+
+    Args:
+        query: Original search text.
+        results: Result dicts in rank order; ``score`` and ``symbol`` are
+            read.
+
+    Returns:
+        Dict with ``level`` ("high", "medium", "low" or "none"), ``score``
+        (mean result score), ``top_score`` and ``symbol_matches`` (how many
+        of the top five results have a symbol sharing a token with the
+        query). The no-results case additionally carries ``reason``.
+    """
+    if not results:
+        return {
+            "level": "none",
+            "score": 0.0,
+            "top_score": 0.0,
+            "symbol_matches": 0,
+            "reason": "no_results",
+        }
+
+    # A score that is present but None counts as 0 instead of raising.
+    scores = [r.get("score") or 0 for r in results]
+    avg_score = sum(scores) / len(scores)
+    top_score = scores[0]
+
+    def _tokens(text):
+        # Split first, lower-case second: lower-casing the input up front
+        # erases the case boundaries an identifier splitter may rely on.
+        # NOTE(review): _split_ident lives elsewhere in this module and is
+        # assumed to return an iterable of string tokens -- confirm.
+        return {tok.lower() for tok in _split_ident(text) if tok}
+
+    query_tokens = _tokens(query)
+    symbol_matches = sum(
+        1 for r in results[:5] if query_tokens & _tokens(r.get("symbol") or "")
+    )
+
+    if top_score > 0.8 and symbol_matches > 0:
+        level = "high"
+    elif avg_score > 0.6:
+        level = "medium"
+    else:
+        level = "low"
+
+    return {
+        "level": level,
+        "score": round(avg_score, 3),
+        "top_score": round(top_score, 3),
+        "symbol_matches": symbol_matches,
+    }
+
+
+@mcp.tool()
+async def info_request(
+    # Primary parameter
+    info_request: str = None,
+    information_request: str = None,  # Alias
+    # Explanation mode
+    include_explanation: bool = None,
+    # Relationship mapping
+    include_relationships: bool = None,
+    # Optional filters (pass-through to repo_search)
+    limit: int = None,
+    language: str = None,
+    under: str = None,
+    repo: Any = None,
+    path_glob: Any = None,
+    # Additional options
+    include_snippet: bool = None,
+    context_lines: int = None,
+    kwargs: Any = None,
+) -> Dict[str, Any]:
+    """Simplified codebase retrieval with optional explanation mode.
+
+    When to use:
+    - Simple, single-parameter code search with human-readable descriptions
+    - When you want optional explanation mode for richer context
+    - Drop-in replacement for basic codebase retrieval tools
+
+    Key parameters:
+    - info_request: str. Natural language description of the code you're looking for.
+    - information_request: str. Alias for info_request.
+    - include_explanation: bool (default false). Add summary, primary_locations,
+      related_concepts, grouped_results, confidence, query_understanding.
+    - include_relationships: bool (default false). Add imports_from, calls, related_paths to results.
+    - limit: int (default 10). Maximum results to return. When omitted, the default is
+      raised for 1-2 word queries and lowered for "how does"/"what is" questions.
+    - language: str. Filter by programming language.
+    - under: str. Limit search to specific directory.
+    - repo: str or list[str]. Filter by repository name(s).
+    - path_glob: Passed through to repo_search unchanged.
+    - include_snippet: bool (default true). Include code snippets.
+    - context_lines: int (default 5). Context lines per snippet.
+
+    Defaults for limit, context_lines, include_explanation and include_relationships
+    come from INFO_REQUEST_LIMIT, INFO_REQUEST_CONTEXT_LINES,
+    INFO_REQUEST_EXPLAIN_DEFAULT and INFO_REQUEST_RELATIONSHIPS.
+
+    Returns:
+    - Compact mode (default): results with information field and relevance_score alias
+    - Explanation mode: adds summary, primary_locations, related_concepts,
+      grouped_results, confidence, query_understanding
+    - Failure: {"ok": false, "error": "...", "results": []}
+
+    Example:
+    - {"info_request": "database connection pooling"}
+    - {"info_request": "authentication middleware", "include_explanation": true}
+    """
+    # Resolve the query: first non-blank value of the primary parameter or its alias.
+    query = ""
+    for candidate in (info_request, information_request):
+        query = str(candidate).strip() if candidate is not None else ""
+        if query:
+            break
+    if not query:
+        return {"ok": False, "error": "info_request parameter is required", "results": []}
+
+    def _env_flag(name):
+        raw = str(os.environ.get(name, "0")).strip().lower()
+        return raw in {"1", "true", "yes", "on"}
+
+    # Resolve defaults from env
+    _default_limit = safe_int(
+        os.environ.get("INFO_REQUEST_LIMIT", "10"), default=10, logger=logger
+    )
+    _default_context = safe_int(
+        os.environ.get("INFO_REQUEST_CONTEXT_LINES", "5"), default=5, logger=logger
+    )
+    if _default_limit <= 0:
+        _default_limit = 10  # a non-positive configured limit is unusable
+
+    # Apply defaults
+    eff_context = context_lines if context_lines is not None else _default_context
+    eff_snippet = include_snippet if include_snippet is not None else True
+    eff_explain = (
+        include_explanation
+        if include_explanation is not None
+        else _env_flag("INFO_REQUEST_EXPLAIN_DEFAULT")
+    )
+    eff_relationships = (
+        include_relationships
+        if include_relationships is not None
+        else _env_flag("INFO_REQUEST_RELATIONSHIPS")
+    )
+
+    # Smart limits, applied only when the caller gave no limit. They scale
+    # the configured default (10 -> 15 and 8) rather than replacing it with
+    # fixed numbers, so INFO_REQUEST_LIMIT is honored for every query.
+    if limit is not None:
+        eff_limit = limit
+    elif len(query.split()) <= 2:  # Short query like "auth handler"
+        eff_limit = (_default_limit * 3 + 1) // 2  # broad: 1.5x, rounded up
+    elif "how does" in query.lower() or "what is" in query.lower():
+        eff_limit = max(1, (_default_limit * 4) // 5)  # focused: 0.8x
+    else:
+        eff_limit = _default_limit
+
+    # Call repo_search
+    search_result = await repo_search(
+        query=query,
+        limit=eff_limit,
+        per_path=3,  # Better default for info requests
+        include_snippet=eff_snippet,
+        context_lines=eff_context,
+        language=language,
+        under=under,
+        repo=repo,
+        path_glob=path_glob,
+        kwargs=kwargs,
+    )
+    if not isinstance(search_result, dict):
+        search_result = {"ok": False, "error": "repo_search returned no result mapping"}
+
+    # Extract results
+    results = search_result.get("results") or []
+    total = search_result.get("total", len(results))
+    used_rerank = search_result.get("used_rerank", False)
+
+    # Do not report a failed search as an empty success.
+    # NOTE(review): assumes repo_search signals failure via "ok": False or
+    # a truthy "error" key -- confirm against its implementation.
+    search_error = search_result.get("error")
+    if not results and (search_error or search_result.get("ok") is False):
+        return {
+            "ok": False,
+            "error": str(search_error or "repo_search failed"),
+            "results": [],
+        }
+
+    # Enhance each result with information field and optional relationships
+    enhanced_results = []
+    for r in results:
+        enhanced = dict(r)
+        enhanced["information"] = _format_information_field(r)
+        enhanced["relevance_score"] = r.get("score", 0.0)  # Alias
+        if eff_relationships:
+            enhanced["relationships"] = _extract_relationships(r)
+        enhanced_results.append(enhanced)
+
+    # Build the search strategy string
+    strategy_parts = ["hybrid"]
+    if used_rerank:
+        strategy_parts.append("rerank")
+    if repo:
+        strategy_parts.append("repo_filtered")
+    if language:
+        strategy_parts.append(f"lang:{language}")
+    if under or path_glob:  # both restrict results by path
+        strategy_parts.append("path_filtered")
+    search_strategy = "+".join(strategy_parts)
+
+    # Build response
+    response: Dict[str, Any] = {
+        "ok": True,
+        "results": enhanced_results,
+        "total": total,
+        "search_strategy": search_strategy,
+    }
+    if not eff_explain:
+        return response
+
+    # Distinct file paths in rank order. All of them are counted for the
+    # summary; only the first five are listed as primary locations. (The
+    # count used to be taken from the capped list, so it never exceeded 5.)
+    distinct_paths = list(
+        dict.fromkeys(p for p in (r.get("path") or "" for r in results) if p)
+    )
+    n_files = len(distinct_paths)
+    summary = (
+        f"Found {total} results related to '{query}' "
+        f"across {n_files} file{'s' if n_files != 1 else ''}"
+    )
+
+    # Group results by file as (score, symbol) pairs.
+    files_map: Dict[str, list] = {}
+    for r in enhanced_results:
+        files_map.setdefault(r.get("path") or "", []).append(
+            (r.get("score") or 0.0, r.get("symbol") or "")
+        )
+    grouped_results = {
+        "by_file": {
+            path: {
+                "count": len(items),
+                # Drop unnamed entries before taking the top three, so
+                # they no longer use up one of the slots.
+                "top_symbols": [
+                    sym for _, sym in sorted(items, key=lambda it: -it[0]) if sym
+                ][:3],
+            }
+            for path, items in files_map.items()
+        }
+    }
+
+    response["summary"] = summary
+    response["primary_locations"] = distinct_paths[:5]
+    response["related_concepts"] = _extract_related_concepts(query, results)
+    response["grouped_results"] = grouped_results
+    response["confidence"] = _calculate_confidence(query, enhanced_results)
+    response["query_understanding"] = {
+        "intent": "search_for_code",
+        "detected_language": language or None,
+        "detected_symbols": _extract_symbols_from_query(query),
+        "search_strategy": search_strategy,
+    }
+
+    return response
+
+
 _relax_var_kwarg_defaults()
 
 if __name__ == "__main__":