diff --git a/fastcode/loader.py b/fastcode/loader.py index 24bb03b..89bb714 100644 --- a/fastcode/loader.py +++ b/fastcode/loader.py @@ -15,6 +15,7 @@ is_supported_file, should_ignore_path, get_repo_name_from_url, + get_repo_name_from_path, normalize_path, ensure_dir, ) @@ -135,9 +136,12 @@ def load_from_path(self, path: str, target_dir: Optional[str] = None) -> str: if not os.path.isdir(path): raise ValueError(f"Path is not a directory: {path}") - source_path = os.path.abspath(path) - self.repo_name = os.path.basename(source_path) destination_root = os.path.abspath(target_dir) if target_dir else self.safe_repo_root + source_path = os.path.abspath(path) + self.repo_name = get_repo_name_from_path( + source_path, + workspace_root=destination_root, + ) destination_path = os.path.join(destination_root, self.repo_name) # If source is already in workspace destination, use it directly. @@ -381,4 +385,3 @@ def cleanup(self): def __del__(self): """Cleanup on deletion""" self.cleanup() - diff --git a/fastcode/main.py b/fastcode/main.py index 9d922c0..ef307ec 100644 --- a/fastcode/main.py +++ b/fastcode/main.py @@ -213,9 +213,9 @@ def index_repository(self, force: bool = False): # Index code elements with repository information elements = self.indexer.index_repository(repo_name=repo_name, repo_url=repo_url) - # Initialize vector store if not already done - if self.vector_store.dimension is None: - self.vector_store.initialize(self.embedder.embedding_dim) + # Single-repository indexing should always build a fresh in-memory + # index so saved artifacts never mix metadata from prior repos. + self.vector_store.initialize(self.embedder.embedding_dim) # Add embeddings to vector store vectors = [] @@ -267,9 +267,14 @@ def index_repository(self, force: bool = False): self.module_resolver = None self.symbol_resolver = None - # Build code graphs with resolvers - # This will now use the initialized resolvers to build precise graphs - self.graph_builder.build_graphs(elements, self.module_resolver, self.symbol_resolver) + # Build code graphs with a fresh builder so per-repo saves never + # retain nodes or edges from a previous repository. + fresh_graph_builder = CodeGraphBuilder(self.config) + fresh_graph_builder.build_graphs( + elements, self.module_resolver, self.symbol_resolver + ) + self.graph_builder = fresh_graph_builder + self.retriever.graph_builder = self.graph_builder # Index for BM25 self.retriever.index_for_bm25(elements) @@ -674,38 +679,46 @@ def _try_load_from_cache(self) -> bool: # Try to load vector store if self.vector_store.load(cache_name): self.logger.info(f"Loaded vector store from cache for {cache_name}") - + # Load BM25 index bm25_loaded = self.retriever.load_bm25(cache_name) if not bm25_loaded: self.logger.warning("Failed to load BM25 index, will need to rebuild") - + # Build separate repo overview BM25 index self.retriever.build_repo_overview_bm25() - - # Load graph data - graph_loaded = self.graph_builder.load(cache_name) + + # Load graph data into a fresh builder so stale state is never reused. 
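+            # The fresh builder is swapped into self.graph_builder (and into the
+            # retriever) only after loading finishes, so a failed or partial
+            # load cannot corrupt the graphs currently in use.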
+ fresh_graph_builder = CodeGraphBuilder(self.config) + graph_loaded = fresh_graph_builder.load(cache_name) if not graph_loaded: self.logger.warning("Failed to load graph data, will need to rebuild") - + # If BM25 or graph failed to load, reconstruct from metadata if not bm25_loaded or not graph_loaded: self.logger.info("Reconstructing missing components from metadata...") - elements = self._reconstruct_elements_from_metadata() - + elements = ( + self.retriever.full_bm25_elements + if bm25_loaded and self.retriever.full_bm25_elements + else self._reconstruct_elements_from_metadata() + ) + if elements: if not bm25_loaded: self.retriever.index_for_bm25(elements) self.logger.info(f"Rebuilt BM25 index with {len(elements)} elements") - + if not graph_loaded: # Note: Rebuilding graph from metadata is a fallback. # Precise linking might be limited if repo_root context is lost. - self.graph_builder.build_graphs(elements) + fresh_graph_builder.build_graphs(elements) + graph_loaded = True self.logger.info("Rebuilt code graph (fallback mode)") else: self.logger.warning("No elements reconstructed from metadata") - + + self.graph_builder = fresh_graph_builder + self.retriever.graph_builder = self.graph_builder self.logger.info("Cache loaded successfully") self._log_statistics() return True @@ -1039,6 +1052,10 @@ def load_multiple_repositories(self, sources: List[Dict[str, Any]]): temp_graph_builder.build_graphs(elements, temp_module_resolver, temp_symbol_resolver) temp_graph_builder.save(repo_name) + self._save_file_manifest( + repo_name, + self._build_file_manifest(elements, self.loader.repo_path), + ) self.logger.info(f"Saved graph data for {repo_name}") successfully_indexed.append(repo_name) @@ -1138,6 +1155,25 @@ def _load_multi_repo_cache(self, repo_names: Optional[List[str]] = None) -> bool Returns: True if successful, False otherwise """ + def invalidate_in_memory_state() -> None: + self.loaded_repositories = {} + self.repo_indexed = False + self.repo_loaded = False + self.multi_repo_mode = False + + retriever = getattr(self, "retriever", None) + if retriever is not None: + if hasattr(retriever, "current_loaded_repos"): + retriever.current_loaded_repos = None + if hasattr(retriever, "filtered_vector_store"): + retriever.filtered_vector_store = None + if hasattr(retriever, "filtered_bm25"): + retriever.filtered_bm25 = None + if hasattr(retriever, "filtered_bm25_corpus"): + retriever.filtered_bm25_corpus = [] + if hasattr(retriever, "filtered_bm25_elements"): + retriever.filtered_bm25_elements = [] + try: # Discover available repository indexes persist_dir = self.vector_store.persist_dir @@ -1153,6 +1189,7 @@ def _load_multi_repo_cache(self, repo_names: Optional[List[str]] = None) -> bool if not available_repos: self.logger.error("No repository indexes found") + invalidate_in_memory_state() return False # Filter repositories if specific ones are requested @@ -1160,6 +1197,14 @@ def _load_multi_repo_cache(self, repo_names: Optional[List[str]] = None) -> bool repos_to_load = [r for r in available_repos if r in repo_names] if not repos_to_load: self.logger.error(f"None of the requested repositories found: {repo_names}") + invalidate_in_memory_state() + return False + missing_repos = sorted(set(repo_names) - set(repos_to_load)) + if missing_repos: + self.logger.error( + f"Requested repositories are missing from cache: {', '.join(missing_repos)}" + ) + invalidate_in_memory_state() return False else: repos_to_load = available_repos @@ -1167,46 +1212,72 @@ def _load_multi_repo_cache(self, repo_names: 
Optional[List[str]] = None) -> bool self.logger.info(f"Found {len(repos_to_load)} repository indexes: {', '.join(repos_to_load)}") # Always reinitialize for clean merge + previously_loaded = dict(self.loaded_repositories) self.vector_store.initialize(self.embedder.embedding_dim) + # The previous in-memory state is no longer valid once the merged + # vector store is rebuilt. Clear it up front so partial reload + # failures cannot leave stale repos marked as loaded. + self.loaded_repositories = {} + self.repo_indexed = False + self.repo_loaded = False + self.multi_repo_mode = False + if hasattr(self.retriever, "current_loaded_repos"): + self.retriever.current_loaded_repos = None # Load each repository index and merge them + successfully_loaded = [] for repo_name in repos_to_load: self.logger.info(f"Loading index for {repo_name}...") try: # Merge this repository's index into the main vector store if self.vector_store.merge_from_index(repo_name): self.logger.info(f"Successfully merged {repo_name}") + successfully_loaded.append(repo_name) else: self.logger.warning(f"Failed to merge index for {repo_name}") except Exception as e: self.logger.error(f"Error loading {repo_name}: {e}") continue - + + if repo_names and set(successfully_loaded) != set(repos_to_load): + missing_repos = sorted(set(repos_to_load) - set(successfully_loaded)) + self.logger.error( + f"Failed to load all requested repositories: {', '.join(missing_repos)}" + ) + invalidate_in_memory_state() + return False + # Check if we successfully loaded any repositories if self.vector_store.get_count() == 0: self.logger.error("Failed to load any repository indexes") + invalidate_in_memory_state() return False - # Register loaded repositories - # We know which repos were successfully loaded from repos_to_load - for repo_name in repos_to_load: - if repo_name not in self.loaded_repositories: - self.loaded_repositories[repo_name] = { - "name": repo_name, - "file_count": 0, # Will be updated if needed - "total_size_mb": 0, - } + # Register only the repositories that are actually present in memory. 
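+            # Stats captured before the reload (file_count, total_size_mb,
+            # url/path) are carried over per repo where available rather than
+            # being reset to zero on every cache load.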
+ self.loaded_repositories = {} + for repo_name in successfully_loaded: + repo_info = previously_loaded.get(repo_name, {}) + self.loaded_repositories[repo_name] = { + "name": repo_name, + "file_count": repo_info.get("file_count", 0), + "total_size_mb": repo_info.get("total_size_mb", 0), + **({"url": repo_info["url"]} if "url" in repo_info else {}), + **({"path": repo_info["path"]} if "path" in repo_info else {}), + } # Try to load BM25 and graph data from saved files # For multi-repo, we merge BM25 data from all loaded repositories self.logger.info("Loading BM25 and graph data...") - + all_bm25_elements = [] all_bm25_corpus = [] + bm25_cache_complete = True + fresh_graph_builder = CodeGraphBuilder(self.config) graphs_loaded = False - - for repo_name in repos_to_load: + graph_cache_complete = True + + for repo_name in successfully_loaded: # Try loading BM25 for each repo bm25_path = os.path.join(self.retriever.persist_dir, f"{repo_name}_bm25.pkl") if os.path.exists(bm25_path): @@ -1222,41 +1293,63 @@ def _load_multi_repo_cache(self, repo_names: Optional[List[str]] = None) -> bool self.logger.info(f"Loaded BM25 data for {repo_name}") except Exception as e: self.logger.warning(f"Failed to load BM25 data for {repo_name}: {e}") + bm25_cache_complete = False + else: + self.logger.warning(f"BM25 data not found for {repo_name}") + bm25_cache_complete = False # Load graph data (merge into main graph) if not graphs_loaded: # Load the first repository's graph as base - if self.graph_builder.load(repo_name): + if fresh_graph_builder.load(repo_name): graphs_loaded = True self.logger.info(f"Loaded graph data from {repo_name} as base") + else: + graph_cache_complete = False else: # Merge additional repository graphs - if self.graph_builder.merge_from_file(repo_name): + if fresh_graph_builder.merge_from_file(repo_name): self.logger.info(f"Merged graph data from {repo_name}") else: self.logger.warning(f"Failed to merge graph data from {repo_name}") - # TODO: Merge additional repository graphs if needed + graph_cache_complete = False + # Rebuild FULL BM25 index with merged data (for repository selection) - if all_bm25_elements and all_bm25_corpus: + if all_bm25_elements and all_bm25_corpus and bm25_cache_complete: self.retriever.full_bm25_elements = all_bm25_elements self.retriever.full_bm25_corpus = all_bm25_corpus self.retriever.full_bm25 = BM25Okapi(all_bm25_corpus) self.logger.info(f"Rebuilt full BM25 index with {len(all_bm25_elements)} merged elements") else: # Fallback: reconstruct from metadata - self.logger.info("No BM25 data found, reconstructing from metadata...") + if bm25_cache_complete: + self.logger.info("No BM25 data found, reconstructing from metadata...") + else: + self.logger.info("BM25 cache incomplete, reconstructing from metadata...") elements = self._reconstruct_elements_from_metadata() - + if elements: self.retriever.index_for_bm25(elements) self.logger.info(f"Rebuilt BM25 index with {len(elements)} elements") - - if not graphs_loaded: - self.graph_builder.build_graphs(elements) - self.logger.info("Rebuilt code graph") else: self.logger.warning("No elements reconstructed from metadata") - + + if not graphs_loaded or not graph_cache_complete: + elements = ( + self.retriever.full_bm25_elements + if self.retriever.full_bm25_elements + else self._reconstruct_elements_from_metadata() + ) + if elements: + fresh_graph_builder = CodeGraphBuilder(self.config) + fresh_graph_builder.build_graphs(elements) + graphs_loaded = True + self.logger.info("Rebuilt code graph") + else: + 
self.logger.warning("No elements available to rebuild graph data") + + self.graph_builder = fresh_graph_builder + self.retriever.graph_builder = self.graph_builder # Build separate BM25 index for repository overviews self.retriever.build_repo_overview_bm25() self.logger.info("Built separate BM25 index for repository overviews") @@ -1272,6 +1365,7 @@ def _load_multi_repo_cache(self, repo_names: Optional[List[str]] = None) -> bool self.logger.error(f"Failed to load multi-repo cache: {e}") import traceback self.logger.error(traceback.format_exc()) + invalidate_in_memory_state() return False # ------------------------------------------------------------------ @@ -1330,20 +1424,19 @@ def _load_file_manifest(self, repo_name): self.logger.warning(f"Failed to load manifest for '{repo_name}': {e}") return None - def _load_existing_metadata(self, repo_name: str) -> list: - """Load existing vector store metadata for a repo directly from disk.""" + def _load_existing_metadata(self, repo_name: str) -> dict: + """Load an existing vector store payload for a repo directly from disk.""" meta_path = os.path.join( self.vector_store.persist_dir, f"{repo_name}_metadata.pkl" ) if not os.path.exists(meta_path): - return [] + return {} try: with open(meta_path, "rb") as f: - data = pickle.load(f) - return data.get("metadata", []) + return pickle.load(f) except Exception as e: self.logger.warning(f"Failed to load metadata for '{repo_name}': {e}") - return [] + return {} def _detect_file_changes(self, repo_name, current_files): """Compare current files against saved manifest to detect changes. @@ -1440,7 +1533,8 @@ def incremental_reindex(self, repo_name: str, repo_path: str = None) -> dict: return {"status": "path_not_found", "changes": 0} self.loader.load_from_path(repo_path) - self.config["repo_root"] = repo_path + effective_repo_path = self.loader.repo_path or repo_path + self.config["repo_root"] = effective_repo_path # 3. Scan current files and detect changes current_files = self.loader.scan_files() @@ -1462,11 +1556,33 @@ def incremental_reindex(self, repo_name: str, repo_path: str = None) -> dict: return {"status": "no_changes", "changes": 0} # 4. Load existing metadata from disk - existing_metadata = self._load_existing_metadata(repo_name) + existing_payload = self._load_existing_metadata(repo_name) + if isinstance(existing_payload, dict) and "metadata" in existing_payload: + existing_metadata = existing_payload.get("metadata", []) + persisted_dimension = existing_payload.get("dimension") + else: + existing_metadata = existing_payload or [] + persisted_dimension = None + if not existing_metadata: self.logger.warning(f"No existing metadata for '{repo_name}'") return {"status": "no_metadata", "changes": 0} + current_dimension = self.embedder.embedding_dim + if persisted_dimension is not None and persisted_dimension != current_dimension: + self.logger.warning( + "Embedding dimension mismatch for '%s': persisted=%s current=%s", + repo_name, + persisted_dimension, + current_dimension, + ) + return { + "status": "embedding_mismatch", + "changes": total_changes, + "persisted_dimension": persisted_dimension, + "current_dimension": current_dimension, + } + # 5. Collect unchanged elements (with pre-computed embeddings) unchanged_elements, _ = self._collect_unchanged_elements( changes["manifest"], unchanged, existing_metadata @@ -1520,13 +1636,28 @@ def incremental_reindex(self, repo_name: str, repo_path: str = None) -> dict: # 8. 
Rebuild FAISS (temporary store — main instance untouched) temp_store = VectorStore(self.config) - temp_store.initialize(self.embedder.embedding_dim) + temp_store.initialize(current_dimension) vectors, metadata_list = [], [] for elem in all_elements: embedding = elem.metadata.get("embedding") if embedding is not None: - vectors.append(embedding) + embedding_array = np.asarray(embedding, dtype=np.float32) + if embedding_array.ndim != 1 or embedding_array.shape[0] != current_dimension: + self.logger.warning( + "Embedding shape mismatch for '%s' element '%s': expected=%s got=%s", + repo_name, + elem.id, + current_dimension, + tuple(embedding_array.shape), + ) + return { + "status": "embedding_mismatch", + "changes": total_changes, + "persisted_dimension": persisted_dimension, + "current_dimension": current_dimension, + } + vectors.append(embedding_array) metadata_list.append(elem.to_dict()) if vectors: @@ -1535,7 +1666,7 @@ def incremental_reindex(self, repo_name: str, repo_path: str = None) -> dict: # 9. Rebuild BM25 (temporary retriever) temp_retriever = HybridRetriever( self.config, temp_store, self.embedder, - CodeGraphBuilder(self.config), repo_root=repo_path, + CodeGraphBuilder(self.config), repo_root=effective_repo_path, ) temp_retriever.index_for_bm25(all_elements) @@ -1544,7 +1675,7 @@ def incremental_reindex(self, repo_name: str, repo_path: str = None) -> dict: module_resolver, symbol_resolver = None, None try: gib = GlobalIndexBuilder(self.config) - gib.build_maps(all_elements, repo_path) + gib.build_maps(all_elements, effective_repo_path) module_resolver = ModuleResolver(gib) symbol_resolver = SymbolResolver(gib, module_resolver) except Exception as e: @@ -1552,12 +1683,34 @@ def incremental_reindex(self, repo_name: str, repo_path: str = None) -> dict: temp_graph.build_graphs(all_elements, module_resolver, symbol_resolver) + # Refresh repository overview so repo-level selection and summaries stay in sync. + overview_generator = getattr(self.indexer, "overview_generator", None) + if overview_generator and effective_repo_path: + try: + file_structure = overview_generator.parse_file_structure( + effective_repo_path, current_files + ) + repo_overview = overview_generator.generate_overview( + effective_repo_path, repo_name, file_structure + ) + repo_url = ( + self.loaded_repositories.get(repo_name, {}).get("url") + or self.loaded_repositories.get(repo_name, {}).get("remote_url") + ) + self.indexer.current_repo_name = repo_name + self.indexer.current_repo_url = repo_url + self.indexer._save_repository_overview(repo_overview) + except Exception as e: + self.logger.warning( + f"Failed to refresh repository overview for '{repo_name}': {e}" + ) + # 11. 
Save all artifacts if self._should_persist_indexes(): temp_store.save(repo_name) temp_retriever.save_bm25(repo_name) temp_graph.save(repo_name) - new_manifest = self._build_file_manifest(all_elements, repo_path) + new_manifest = self._build_file_manifest(all_elements, effective_repo_path) self._save_file_manifest(repo_name, new_manifest) self.logger.info(f"Saved all artifacts for '{repo_name}'") diff --git a/fastcode/query_processor.py b/fastcode/query_processor.py index 50dad7f..2d5403b 100644 --- a/fastcode/query_processor.py +++ b/fastcode/query_processor.py @@ -533,7 +533,7 @@ def _enhance_with_llm(self, query: str, intent: str, else: return {} - print(f"LLM response of _enhance_with_llm: {response}") + self.logger.debug("LLM enhancement response received") # Parse LLM response enhancements = self._parse_llm_response(response, intent) @@ -825,4 +825,3 @@ def _parse_rewritten_query(self, response: str) -> Optional[str]: rewritten = rewritten[1:-1] return rewritten if rewritten else None - diff --git a/fastcode/retriever.py b/fastcode/retriever.py index b150f05..ad76e31 100644 --- a/fastcode/retriever.py +++ b/fastcode/retriever.py @@ -501,7 +501,7 @@ def _select_relevant_repositories(self, query: Union[str, List[str]], keywords: # 2. get top k for repo_name, scores in sorted_repos[:top_k]: selected_repos.append(repo_name) - print( + self.logger.info( f"Selected repo: {repo_name} " f"(semantic: {scores['semantic_score']:.3f}, " f"bm25: {scores['bm25_score']:.3f}, " @@ -510,7 +510,9 @@ def _select_relevant_repositories(self, query: Union[str, List[str]], keywords: # 3. no repo selected if not selected_repos: - print(f"No repositories met the minimum score threshold of {MIN_SCORE_THRESHOLD}") + self.logger.info( + f"No repositories met the minimum score threshold of {MIN_SCORE_THRESHOLD}" + ) return selected_repos @@ -1440,4 +1442,3 @@ def _apply_agency_mode(self, query: str, results: List[Dict[str, Any]], self.logger.error(traceback.format_exc()) # Fallback to original results return results - diff --git a/fastcode/utils.py b/fastcode/utils.py index 2bc1aba..d9076c2 100644 --- a/fastcode/utils.py +++ b/fastcode/utils.py @@ -310,6 +310,27 @@ def get_repo_name_from_url(url: str) -> str: return parts[-1] if parts else "unknown_repo" +def get_repo_name_from_path(path: str, workspace_root: Optional[str] = None) -> str: + """Derive a stable local repository identifier from its absolute path.""" + normalized = normalize_path(os.path.abspath(path or "")) + base_name = os.path.basename(normalized.rstrip("/")) or "local_repo" + + if workspace_root: + normalized_root = normalize_path(os.path.abspath(workspace_root)) + parent_dir = normalize_path(os.path.dirname(normalized.rstrip("/"))) + suffix = base_name.rsplit("-", 1) + if ( + parent_dir == normalized_root + and len(suffix) == 2 + and len(suffix[1]) == 8 + and all(ch in "0123456789abcdef" for ch in suffix[1].lower()) + ): + return base_name + + digest = hashlib.md5(normalized.encode("utf-8")).hexdigest()[:8] + return f"{base_name}-{digest}" + + def clean_docstring(docstring: str) -> str: """Clean and format docstring""" if not docstring: @@ -335,4 +356,3 @@ def clean_docstring(docstring: str) -> str: for line in lines] return "\n".join(lines).strip() - diff --git a/fastcode/vector_store.py b/fastcode/vector_store.py index 5c9047f..723f624 100644 --- a/fastcode/vector_store.py +++ b/fastcode/vector_store.py @@ -336,7 +336,9 @@ def search_repository_overviews(self, query_vector: np.ndarray, k: int = 5, distance = float(np.linalg.norm(query_vector - 
embedding)) similarity = 1.0 / (1.0 + distance) - print(f"similarity: {similarity}, repo_name: {repo_name}") + self.logger.debug( + f"Repository overview similarity for {repo_name}: {similarity:.4f}" + ) # Apply minimum score filter if min_score is not None and similarity < min_score: @@ -752,4 +754,3 @@ def invalidate_scan_cache(self): """Invalidate the scan cache (call this when indexes change)""" self._index_scan_cache = None self.logger.debug("Invalidated index scan cache") - diff --git a/mcp_server.py b/mcp_server.py index 78a1f3a..4c6a7e8 100644 --- a/mcp_server.py +++ b/mcp_server.py @@ -69,11 +69,45 @@ def _get_fastcode(): def _repo_name_from_source(source: str, is_url: bool) -> str: """Derive a canonical repo name from a URL or local path.""" - from fastcode.utils import get_repo_name_from_url + from fastcode.utils import get_repo_name_from_path, get_repo_name_from_url + + def repo_index_files_exist(persist_dir: Optional[str], repo_name: Optional[str]) -> bool: + if not persist_dir or not repo_name: + return False + faiss_path = os.path.join(persist_dir, f"{repo_name}.faiss") + meta_path = os.path.join(persist_dir, f"{repo_name}_metadata.pkl") + return os.path.exists(faiss_path) and os.path.exists(meta_path) + + def strip_repo_hash_suffix(repo_name: str) -> Optional[str]: + base_name, sep, suffix = repo_name.rpartition("-") + if not sep or len(suffix) != 8: + return None + if not all(ch in "0123456789abcdef" for ch in suffix.lower()): + return None + return base_name or None + if is_url: return get_repo_name_from_url(source) - # Local path: use the directory basename - return os.path.basename(os.path.normpath(source)) + + workspace_root = None + persist_dir = None + try: + fc = _get_fastcode() + workspace_root = getattr(getattr(fc, "loader", None), "safe_repo_root", None) + persist_dir = getattr(getattr(fc, "vector_store", None), "persist_dir", None) + except Exception: + workspace_root = None + persist_dir = None + + repo_name = get_repo_name_from_path(source, workspace_root=workspace_root) + if repo_index_files_exist(persist_dir, repo_name): + return repo_name + + legacy_name = strip_repo_hash_suffix(repo_name) + if repo_index_files_exist(persist_dir, legacy_name): + return legacy_name + + return repo_name def _is_repo_indexed(repo_name: str) -> bool: @@ -127,6 +161,49 @@ def _apply_forced_env_excludes(fc) -> None: logger.info(f"Added forced ignore patterns: {added}") +def _invalidate_loaded_state(fc) -> None: + """Mark in-memory indexes as stale so the next request reloads from disk.""" + fc.repo_indexed = False + fc.repo_loaded = False + fc.multi_repo_mode = False + fc.loaded_repositories.clear() + + retriever = getattr(fc, "retriever", None) + if retriever is not None and hasattr(retriever, "current_loaded_repos"): + retriever.current_loaded_repos = None + + +def _run_full_reindex( + repo_source: str, resolved_is_url: bool, repo_name: Optional[str] = None +) -> dict: + """Reindex using a clean FastCode instance to avoid mixed in-memory artifacts.""" + from fastcode import FastCode + + reindex_fc = FastCode() + _apply_forced_env_excludes(reindex_fc) + + if resolved_is_url: + reindex_fc.load_repository(repo_source, is_url=True) + else: + abs_path = os.path.abspath(repo_source) + if not os.path.isdir(abs_path): + return {"status": "path_not_found", "count": 0, "path": abs_path} + reindex_fc.load_repository(abs_path, is_url=False) + + if repo_name: + reindex_fc.repo_info["name"] = repo_name + if hasattr(reindex_fc.loader, "repo_name"): + reindex_fc.loader.repo_name = repo_name + 
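+    # force=True rebuilds the index even when cached artifacts exist; the count
+    # is read from the throwaway instance's vector store, so the long-lived
+    # FastCode singleton is never touched mid-reindex.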
+ reindex_fc.index_repository(force=True) + count = reindex_fc.vector_store.get_count() + + if _fastcode_instance is not None: + _invalidate_loaded_state(_fastcode_instance) + + return {"status": "success", "count": count} + + def _ensure_repos_ready(repos: List[str], allow_incremental: bool = True, ctx=None) -> List[str]: """ For each repo source string: @@ -140,25 +217,100 @@ def _ensure_repos_ready(repos: List[str], allow_incremental: bool = True, ctx=No _apply_forced_env_excludes(fc) ready_names: List[str] = [] + def strip_repo_hash_suffix(repo_name: str) -> Optional[str]: + base_name, sep, suffix = repo_name.rpartition("-") + if not sep or len(suffix) != 8: + return None + if not all(ch in "0123456789abcdef" for ch in suffix.lower()): + return None + return base_name or None + + def find_legacy_local_collisions(source: str, resolved_name: str) -> List[str]: + try: + from fastcode.utils import get_repo_name_from_path + except Exception: + return [] + + workspace_root = getattr(getattr(fc, "loader", None), "safe_repo_root", None) + derived_name = get_repo_name_from_path(source, workspace_root=workspace_root) + legacy_name = strip_repo_hash_suffix(derived_name) + if not legacy_name or resolved_name != legacy_name or resolved_name == derived_name: + return [] + + source_path = os.path.abspath(source) + parent_dir = os.path.dirname(source_path) + search_root = os.path.dirname(parent_dir) + if not search_root or search_root == parent_dir or search_root == os.path.sep: + return [] + + basename = os.path.basename(source_path.rstrip(os.sep)) + candidates = set() + + direct_candidate = os.path.join(search_root, basename) + if os.path.isdir(direct_candidate): + candidates.add(os.path.abspath(direct_candidate)) + + try: + for child in os.listdir(search_root): + candidate = os.path.join(search_root, child, basename) + if os.path.isdir(candidate): + candidates.add(os.path.abspath(candidate)) + except OSError: + return [] + + return sorted(candidates) if len(candidates) > 1 else [] + for source in repos: resolved_is_url = fc._infer_is_url(source) name = _repo_name_from_source(source, resolved_is_url) + abs_path = os.path.abspath(source) if not resolved_is_url else None # Already indexed if _is_repo_indexed(name): + if not resolved_is_url and not os.path.isdir(abs_path): + logger.error(f"Local path does not exist: {abs_path}") + continue + + if not resolved_is_url: + collision_paths = find_legacy_local_collisions(abs_path, name) + if collision_paths: + logger.error( + "Legacy local index '%s' is ambiguous for '%s'; matching paths=%s", + name, + abs_path, + collision_paths, + ) + continue + # Try incremental update for local repos if not resolved_is_url and allow_incremental: - abs_path = os.path.abspath(source) - if os.path.isdir(abs_path): - try: - result = fc.incremental_reindex(name, repo_path=abs_path) - if result and result.get("changes", 0) > 0: - logger.info(f"Incremental update for '{name}': {result}") - # Force reload since on-disk data changed - fc.repo_indexed = False - fc.loaded_repositories.clear() - except Exception as e: - logger.warning(f"Incremental reindex failed for '{name}': {e}") + force_full_reindex = False + try: + result = fc.incremental_reindex(name, repo_path=abs_path) + status = (result or {}).get("status") + if status in { + "no_manifest", + "no_metadata", + "embedding_mismatch", + "embedding_dimension_mismatch", + "full_reindex_required", + }: + force_full_reindex = True + elif result and result.get("changes", 0) > 0: + logger.info(f"Incremental update for '{name}': 
{result}") + _invalidate_loaded_state(fc) + except Exception as e: + logger.warning(f"Incremental reindex failed for '{name}': {e}") + force_full_reindex = True + + if force_full_reindex: + logger.info(f"Falling back to full reindex for '{name}'") + result = _run_full_reindex( + abs_path, resolved_is_url=False, repo_name=name + ) + if result.get("status") != "success": + logger.error(f"Full reindex failed for '{name}': {result}") + continue logger.info(f"Repo '{name}' ready.") ready_names.append(name) continue @@ -190,7 +342,35 @@ def _ensure_loaded(fc, ready_names: List[str]) -> bool: """Ensure repos are loaded into memory (vectors + BM25 + graphs).""" if not fc.repo_indexed or set(ready_names) != set(fc.loaded_repositories.keys()): logger.info(f"Loading repos into memory: {ready_names}") - return fc._load_multi_repo_cache(repo_names=ready_names) + if not fc._load_multi_repo_cache(repo_names=ready_names): + return False + + loaded_names = set(fc.loaded_repositories.keys()) + expected_names = set(ready_names) + if loaded_names != expected_names: + logger.error( + "Loaded repository set mismatch. expected=%s loaded=%s", + sorted(expected_names), + sorted(loaded_names), + ) + return False + + vector_store = getattr(fc, "vector_store", None) + if vector_store is not None and hasattr(vector_store, "get_count"): + if vector_store.get_count() == 0: + logger.error("Repository load finished with an empty in-memory vector store") + return False + + if vector_store is not None and hasattr(vector_store, "get_repository_names"): + in_memory_names = set(vector_store.get_repository_names()) + if in_memory_names != expected_names: + logger.error( + "In-memory repository set mismatch. expected=%s loaded=%s", + sorted(expected_names), + sorted(in_memory_names), + ) + return False + return True @@ -439,7 +619,7 @@ def search_symbol( Matching definitions with file path, line range, and signature. """ fc = _get_fastcode() - ready_names = _ensure_repos_ready(repos, allow_incremental=False) + ready_names = _ensure_repos_ready(repos) if not ready_names: return "Error: None of the specified repositories could be loaded." if not _ensure_loaded(fc, ready_names): @@ -542,7 +722,7 @@ def get_file_summary(file_path: str, repos: list[str]) -> str: File structure: classes (with methods), top-level functions, and import count. """ fc = _get_fastcode() - ready_names = _ensure_repos_ready(repos, allow_incremental=False) + ready_names = _ensure_repos_ready(repos) if not ready_names: return "Error: None of the specified repositories could be loaded." if not _ensure_loaded(fc, ready_names): @@ -649,7 +829,7 @@ def get_call_chain( Formatted call chain showing callers and/or callees. """ fc = _get_fastcode() - ready_names = _ensure_repos_ready(repos, allow_incremental=False) + ready_names = _ensure_repos_ready(repos) if not ready_names: return "Error: None of the specified repositories could be loaded." if not _ensure_loaded(fc, ready_names): @@ -712,28 +892,16 @@ def reindex_repo(repo_source: str) -> str: Confirmation with element count. 
""" fc = _get_fastcode() - _apply_forced_env_excludes(fc) - resolved_is_url = fc._infer_is_url(repo_source) name = _repo_name_from_source(repo_source, resolved_is_url) logger.info(f"Force re-indexing '{name}' from {repo_source}") + result = _run_full_reindex(repo_source, resolved_is_url, repo_name=name) + if result.get("status") == "path_not_found": + return f"Error: Local path does not exist: {result['path']}" + if result.get("status") != "success": + return f"Error: Failed to re-index '{name}'." - if resolved_is_url: - fc.load_repository(repo_source, is_url=True) - else: - abs_path = os.path.abspath(repo_source) - if not os.path.isdir(abs_path): - return f"Error: Local path does not exist: {abs_path}" - fc.load_repository(abs_path, is_url=False) - - fc.index_repository(force=True) - count = fc.vector_store.get_count() - - # Reset in-memory state so next _ensure_loaded does a clean load - fc.repo_indexed = False - fc.loaded_repositories.clear() - - return f"Successfully re-indexed '{name}': {count} elements indexed." + return f"Successfully re-indexed '{name}': {result['count']} elements indexed." # --------------------------------------------------------------------------- diff --git a/tests/test_incremental_indexing_regressions.py b/tests/test_incremental_indexing_regressions.py new file mode 100644 index 0000000..895e271 --- /dev/null +++ b/tests/test_incremental_indexing_regressions.py @@ -0,0 +1,1849 @@ +import ast +import copy +import os +import pickle +import sys +import types +from pathlib import Path +from types import SimpleNamespace +from unittest.mock import Mock + +import numpy as np + + +ROOT = Path(__file__).resolve().parents[1] +FASTCODE_MAIN = ROOT / "fastcode" / "main.py" +MCP_SERVER = ROOT / "mcp_server.py" + + +def _null_logger(): + return SimpleNamespace( + info=lambda *args, **kwargs: None, + warning=lambda *args, **kwargs: None, + error=lambda *args, **kwargs: None, + debug=lambda *args, **kwargs: None, + ) + + +def _load_functions(path, names, *, class_name=None, global_ns=None): + source = path.read_text(encoding="utf-8") + tree = ast.parse(source, filename=str(path)) + + if class_name: + class_node = next( + node + for node in tree.body + if isinstance(node, ast.ClassDef) and node.name == class_name + ) + lookup = { + node.name: node + for node in class_node.body + if isinstance(node, (ast.FunctionDef, ast.AsyncFunctionDef)) + } + else: + lookup = { + node.name: node + for node in tree.body + if isinstance(node, (ast.FunctionDef, ast.AsyncFunctionDef)) + } + + selected = [] + for name in names: + node = copy.deepcopy(lookup[name]) + node.decorator_list = [] + selected.append(node) + + future_import = ast.parse("from __future__ import annotations").body + module = ast.Module(body=future_import + selected, type_ignores=[]) + ast.fix_missing_locations(module) + + namespace = {} + if global_ns: + namespace.update(global_ns) + exec(compile(module, str(path), "exec"), namespace) + return [namespace[name] for name in names] + + +class StubCodeElement: + def __init__(self, **kwargs): + self.__dict__.update(kwargs) + + def to_dict(self): + return dict(self.__dict__) + + +def _element_meta(element_id, file_path, relative_path, embedding): + return { + "id": element_id, + "type": "file", + "name": relative_path, + "file_path": file_path, + "relative_path": relative_path, + "language": "python", + "start_line": 1, + "end_line": 10, + "code": "print('hello')", + "signature": None, + "docstring": None, + "summary": None, + "metadata": {"embedding": embedding}, + "repo_name": 
"repo", + "repo_url": None, + } + + +def _make_incremental_reindex(globals_override=None): + base_globals = { + "os": os, + "np": np, + "CodeElement": StubCodeElement, + } + if globals_override: + base_globals.update(globals_override) + return _load_functions( + FASTCODE_MAIN, + ["incremental_reindex"], + class_name="FastCode", + global_ns=base_globals, + )[0] + + +def _get_function_node(path, name, *, class_name=None): + source = path.read_text(encoding="utf-8") + tree = ast.parse(source, filename=str(path)) + + if class_name: + class_node = next( + node + for node in tree.body + if isinstance(node, ast.ClassDef) and node.name == class_name + ) + return next( + node + for node in class_node.body + if isinstance(node, (ast.FunctionDef, ast.AsyncFunctionDef)) and node.name == name + ) + + return next( + node + for node in tree.body + if isinstance(node, (ast.FunctionDef, ast.AsyncFunctionDef)) and node.name == name + ) + + +def _assert_no_print_calls(path, name, *, class_name=None): + node = _get_function_node(path, name, class_name=class_name) + print_calls = [ + child + for child in ast.walk(node) + if isinstance(child, ast.Call) + and isinstance(child.func, ast.Name) + and child.func.id == "print" + ] + assert not print_calls, f"{path}:{name} writes to stdout via print()" + + +def test_incremental_reindex_uses_loader_repo_path_when_rebuilding_graphs(tmp_path): + captured = {} + + class FakeTempStore: + def __init__(self, config): + self.config = config + + def initialize(self, dimension): + self.dimension = dimension + + def add_vectors(self, vectors, metadata_list): + self.vectors = vectors + self.metadata_list = metadata_list + + class FakeRetriever: + def __init__(self, config, vector_store, embedder, graph_builder, repo_root=None): + self.repo_root = repo_root + + def index_for_bm25(self, elements): + self.elements = elements + + class FakeGraphBuilder: + def __init__(self, config): + self.config = config + + def build_graphs(self, elements, module_resolver, symbol_resolver): + self.elements = elements + + class FakeGlobalIndexBuilder: + def __init__(self, config): + self.config = config + + def build_maps(self, elements, repo_root): + captured["repo_root"] = repo_root + + incremental_reindex = _make_incremental_reindex( + { + "VectorStore": FakeTempStore, + "HybridRetriever": FakeRetriever, + "CodeGraphBuilder": FakeGraphBuilder, + "GlobalIndexBuilder": FakeGlobalIndexBuilder, + "ModuleResolver": lambda gib: ("module_resolver", gib), + "SymbolResolver": lambda gib, module_resolver: ( + "symbol_resolver", + gib, + module_resolver, + ), + } + ) + + original_repo = tmp_path / "source-repo" + copied_repo = tmp_path / "workspace-copy" / "repo" + original_repo.mkdir() + copied_repo.mkdir(parents=True) + + file_path = copied_repo / "a.py" + meta = _element_meta("elem-1", str(file_path), "a.py", [0.1, 0.2, 0.3]) + + class FakeLoader: + def __init__(self): + self.repo_path = None + + def load_from_path(self, path): + self.repo_path = str(copied_repo) + + def scan_files(self): + return [] + + fc = SimpleNamespace( + logger=_null_logger(), + loader=FakeLoader(), + config={}, + embedder=SimpleNamespace(embedding_dim=3), + loaded_repositories={}, + indexer=SimpleNamespace(index_files=lambda file_infos, repo_name, repo_url=None: []), + _load_file_manifest=lambda repo_name: {"files": {"a.py": {"element_ids": ["elem-1"]}}}, + _detect_file_changes=lambda repo_name, current_files: { + "added": [], + "modified": [], + "deleted": ["deleted.py"], + "unchanged": ["a.py"], + "manifest": {"files": {"a.py": 
{"element_ids": ["elem-1"]}}}, + "current_lookup": {}, + }, + _load_existing_metadata=lambda repo_name: [meta], + _collect_unchanged_elements=lambda manifest, unchanged_files, existing_metadata: ( + existing_metadata, + ["elem-1"], + ), + _should_persist_indexes=lambda: False, + ) + + incremental_reindex(fc, "repo", repo_path=str(original_repo)) + + assert captured["repo_root"] == str(copied_repo) + + +def test_incremental_reindex_regenerates_repository_overview_after_changes(tmp_path): + class FakeTempStore: + def __init__(self, config): + self.config = config + + def initialize(self, dimension): + self.dimension = dimension + + def add_vectors(self, vectors, metadata_list): + self.vectors = vectors + self.metadata_list = metadata_list + + def save(self, repo_name): + self.saved_repo = repo_name + + class FakeRetriever: + def __init__(self, config, vector_store, embedder, graph_builder, repo_root=None): + self.repo_root = repo_root + + def index_for_bm25(self, elements): + self.elements = elements + + def save_bm25(self, repo_name): + self.saved_repo = repo_name + + class FakeGraphBuilder: + def __init__(self, config): + self.config = config + + def build_graphs(self, elements, module_resolver, symbol_resolver): + self.elements = elements + + def save(self, repo_name): + self.saved_repo = repo_name + + class FakeGlobalIndexBuilder: + def __init__(self, config): + self.config = config + + def build_maps(self, elements, repo_root): + self.elements = elements + self.repo_root = repo_root + + incremental_reindex = _make_incremental_reindex( + { + "VectorStore": FakeTempStore, + "HybridRetriever": FakeRetriever, + "CodeGraphBuilder": FakeGraphBuilder, + "GlobalIndexBuilder": FakeGlobalIndexBuilder, + "ModuleResolver": lambda gib: ("module_resolver", gib), + "SymbolResolver": lambda gib, module_resolver: ( + "symbol_resolver", + gib, + module_resolver, + ), + } + ) + + repo_root = tmp_path / "repo" + repo_root.mkdir() + changed_file = repo_root / "a.py" + + current_file_info = { + "path": str(changed_file), + "relative_path": "a.py", + "size": 10, + "extension": ".py", + } + existing_meta = _element_meta( + "old-elem", + str(repo_root / "old.py"), + "old.py", + [0.1, 0.2, 0.3], + ) + new_element = StubCodeElement( + id="new-elem", + type="file", + name="a.py", + file_path=str(changed_file), + relative_path="a.py", + language="python", + start_line=1, + end_line=20, + code="print('updated')", + signature=None, + docstring=None, + summary=None, + metadata={"embedding": [0.4, 0.5, 0.6]}, + repo_name="repo", + repo_url="https://example.com/repo.git", + ) + + class FakeLoader: + def __init__(self): + self.repo_path = None + + def load_from_path(self, path): + self.repo_path = str(repo_root) + + def scan_files(self): + return [current_file_info] + + vector_store = SimpleNamespace( + persist_dir=str(tmp_path / "persist"), + save_repo_overview=Mock(), + ) + + class FakeIndexer: + def __init__(self): + self.overview_generator = SimpleNamespace( + parse_file_structure=lambda repo_path, files: {"languages": {"python": len(files)}}, + generate_overview=lambda repo_path, repo_name, file_structure: { + "repo_name": repo_name, + "summary": "updated summary", + "structure_text": "a.py", + "file_structure": file_structure, + "readme_content": "", + "has_readme": False, + }, + ) + + def index_files(self, file_infos, repo_name, repo_url=None): + return [new_element] + + def _save_repository_overview(self, overview): + vector_store.save_repo_overview( + overview["repo_name"], + overview["summary"], + 
np.array([0.1, 0.2, 0.3], dtype=np.float32), + { + "summary": overview["summary"], + "structure_text": overview["structure_text"], + "file_structure": overview["file_structure"], + }, + ) + + fc = SimpleNamespace( + logger=_null_logger(), + loader=FakeLoader(), + vector_store=vector_store, + indexer=FakeIndexer(), + config={}, + embedder=SimpleNamespace(embedding_dim=3), + loaded_repositories={"repo": {"url": "https://example.com/repo.git"}}, + _load_file_manifest=lambda repo_name: {"files": {"old.py": {"element_ids": ["old-elem"]}}}, + _detect_file_changes=lambda repo_name, current_files: { + "added": ["a.py"], + "modified": [], + "deleted": [], + "unchanged": [], + "manifest": {"files": {"old.py": {"element_ids": ["old-elem"]}}}, + "current_lookup": {"a.py": {"file_info": current_file_info}}, + }, + _load_existing_metadata=lambda repo_name: [existing_meta], + _collect_unchanged_elements=lambda manifest, unchanged_files, existing_metadata: ([], []), + _should_persist_indexes=lambda: True, + _build_file_manifest=lambda elements, repo_root: {"files": {}}, + _save_file_manifest=lambda repo_name, manifest: None, + ) + + incremental_reindex(fc, "repo", repo_path=str(repo_root)) + + assert vector_store.save_repo_overview.called + + +def test_incremental_reindex_rejects_incompatible_preserved_embeddings(tmp_path): + class GuardedTempStore: + def __init__(self, config): + self.config = config + self.dimension = None + + def initialize(self, dimension): + self.dimension = dimension + + def add_vectors(self, vectors, metadata_list): + if vectors.shape[1] != self.dimension: + raise AssertionError( + "incremental_reindex attempted to rebuild with incompatible " + "preserved embeddings" + ) + + class FakeRetriever: + def __init__(self, config, vector_store, embedder, graph_builder, repo_root=None): + self.repo_root = repo_root + + def index_for_bm25(self, elements): + self.elements = elements + + class FakeGraphBuilder: + def __init__(self, config): + self.config = config + + def build_graphs(self, elements, module_resolver, symbol_resolver): + self.elements = elements + + class FakeGlobalIndexBuilder: + def __init__(self, config): + self.config = config + + def build_maps(self, elements, repo_root): + self.elements = elements + + incremental_reindex = _make_incremental_reindex( + { + "VectorStore": GuardedTempStore, + "HybridRetriever": FakeRetriever, + "CodeGraphBuilder": FakeGraphBuilder, + "GlobalIndexBuilder": FakeGlobalIndexBuilder, + "ModuleResolver": lambda gib: ("module_resolver", gib), + "SymbolResolver": lambda gib, module_resolver: ( + "symbol_resolver", + gib, + module_resolver, + ), + } + ) + + repo_root = tmp_path / "repo" + repo_root.mkdir() + + existing_meta = _element_meta( + "elem-1", + str(repo_root / "a.py"), + "a.py", + [0.1, 0.2], + ) + + class FakeLoader: + def __init__(self): + self.repo_path = None + + def load_from_path(self, path): + self.repo_path = str(repo_root) + + def scan_files(self): + return [] + + fc = SimpleNamespace( + logger=_null_logger(), + loader=FakeLoader(), + config={}, + embedder=SimpleNamespace(embedding_dim=3), + loaded_repositories={}, + indexer=SimpleNamespace(index_files=lambda file_infos, repo_name, repo_url=None: []), + _load_file_manifest=lambda repo_name: {"files": {"a.py": {"element_ids": ["elem-1"]}}}, + _detect_file_changes=lambda repo_name, current_files: { + "added": [], + "modified": [], + "deleted": ["deleted.py"], + "unchanged": ["a.py"], + "manifest": {"files": {"a.py": {"element_ids": ["elem-1"]}}}, + "current_lookup": {}, + }, + 
_load_existing_metadata=lambda repo_name: [existing_meta], + _collect_unchanged_elements=lambda manifest, unchanged_files, existing_metadata: ( + existing_metadata, + ["elem-1"], + ), + _should_persist_indexes=lambda: False, + ) + + incremental_reindex(fc, "repo", repo_path=str(repo_root)) + + +def test_ensure_repos_ready_falls_back_to_full_reindex_when_manifest_is_missing(tmp_path): + ensure_repos_ready = _load_functions( + MCP_SERVER, + ["_ensure_repos_ready"], + global_ns={"os": os, "logger": _null_logger()}, + )[0] + + repo_dir = tmp_path / "repo" + repo_dir.mkdir() + + fc = SimpleNamespace( + _infer_is_url=lambda source: False, + incremental_reindex=Mock(return_value={"status": "no_manifest", "changes": 0}), + load_repository=Mock(), + index_repository=Mock(), + repo_indexed=True, + loaded_repositories={}, + ) + full_reindex = Mock(return_value={"status": "success", "count": 12}) + + ensure_repos_ready.__globals__.update( + { + "_get_fastcode": lambda: fc, + "_apply_forced_env_excludes": lambda fc: None, + "_repo_name_from_source": lambda source, is_url: "repo", + "_is_repo_indexed": lambda repo_name: True, + "_run_full_reindex": full_reindex, + "_invalidate_loaded_state": lambda fc: None, + } + ) + + ensure_repos_ready([str(repo_dir)]) + + assert full_reindex.called + assert not fc.load_repository.called + assert not fc.index_repository.called + + +def test_ensure_repos_ready_falls_back_to_full_reindex_on_embedding_mismatch(tmp_path): + ensure_repos_ready = _load_functions( + MCP_SERVER, + ["_ensure_repos_ready"], + global_ns={"os": os, "logger": _null_logger()}, + )[0] + + repo_dir = tmp_path / "repo" + repo_dir.mkdir() + + fc = SimpleNamespace( + _infer_is_url=lambda source: False, + incremental_reindex=Mock( + return_value={"status": "embedding_dimension_mismatch", "changes": 3} + ), + load_repository=Mock(), + index_repository=Mock(), + repo_indexed=True, + loaded_repositories={}, + ) + full_reindex = Mock(return_value={"status": "success", "count": 9}) + + ensure_repos_ready.__globals__.update( + { + "_get_fastcode": lambda: fc, + "_apply_forced_env_excludes": lambda fc: None, + "_repo_name_from_source": lambda source, is_url: "repo", + "_is_repo_indexed": lambda repo_name: True, + "_run_full_reindex": full_reindex, + "_invalidate_loaded_state": lambda fc: None, + } + ) + + ensure_repos_ready([str(repo_dir)]) + + assert full_reindex.called + + +def test_lookup_tools_do_not_disable_incremental_refresh(): + search_symbol, get_file_summary, get_call_chain = _load_functions( + MCP_SERVER, + ["search_symbol", "get_file_summary", "get_call_chain"], + global_ns={}, + ) + + calls = [] + + def fake_ensure_repos_ready(repos, allow_incremental=True, ctx=None): + calls.append(allow_incremental) + return ["repo"] + + fake_fc = SimpleNamespace( + vector_store=SimpleNamespace(metadata=[]), + graph_builder=SimpleNamespace( + element_by_name={}, + element_by_id={}, + get_callers=lambda element_id: [], + get_callees=lambda element_id: [], + ), + ) + + shared_globals = { + "_get_fastcode": lambda: fake_fc, + "_ensure_repos_ready": fake_ensure_repos_ready, + "_ensure_loaded": lambda fc, ready_names: True, + } + + search_symbol.__globals__.update(shared_globals) + get_file_summary.__globals__.update(shared_globals) + get_call_chain.__globals__.update(shared_globals) + + search_symbol("FastCode", ["/tmp/repo"]) + get_file_summary("fastcode/main.py", ["/tmp/repo"]) + get_call_chain("query", ["/tmp/repo"]) + + assert calls == [True, True, True] + + +def 
test_reindex_repo_uses_clean_full_reindex_helper(): + reindex_repo = _load_functions( + MCP_SERVER, + ["reindex_repo"], + global_ns={"os": os, "logger": _null_logger()}, + )[0] + + fc = SimpleNamespace(_infer_is_url=lambda source: False) + full_reindex = Mock(return_value={"status": "success", "count": 42}) + + reindex_repo.__globals__.update( + { + "_get_fastcode": lambda: fc, + "_repo_name_from_source": lambda source, is_url: "repo", + "_run_full_reindex": full_reindex, + } + ) + + message = reindex_repo("/tmp/repo") + + assert "42 elements indexed" in message + assert full_reindex.called + + +def test_load_multi_repo_cache_replaces_loaded_repository_set(tmp_path): + load_multi_repo_cache = _load_functions( + FASTCODE_MAIN, + ["_load_multi_repo_cache"], + class_name="FastCode", + global_ns={ + "os": os, + "pickle": __import__("pickle"), + "CodeGraphBuilder": lambda config: SimpleNamespace( + load=lambda repo_name: False, + merge_from_file=lambda repo_name: False, + build_graphs=lambda elements: None, + ), + }, + )[0] + + persist_dir = tmp_path / "persist" + persist_dir.mkdir() + for repo_name in ("repo_a", "repo_b"): + (persist_dir / f"{repo_name}.faiss").write_bytes(b"index") + (persist_dir / f"{repo_name}_metadata.pkl").write_bytes(b"meta") + + class FakeVectorStore: + def __init__(self): + self.persist_dir = str(persist_dir) + self.merged = [] + + def initialize(self, dimension): + self.dimension = dimension + + def merge_from_index(self, repo_name): + self.merged.append(repo_name) + return True + + def get_count(self): + return len(self.merged) + + fake_fc = SimpleNamespace( + logger=_null_logger(), + config={}, + vector_store=FakeVectorStore(), + embedder=SimpleNamespace(embedding_dim=3), + loaded_repositories={"repo_a": {"name": "repo_a"}, "repo_b": {"name": "repo_b"}}, + retriever=SimpleNamespace( + persist_dir=str(persist_dir), + build_repo_overview_bm25=lambda: None, + index_for_bm25=lambda elements: None, + full_bm25_elements=[], + full_bm25_corpus=[], + full_bm25=None, + ), + graph_builder=SimpleNamespace(load=lambda repo_name: False, merge_from_file=lambda repo_name: False), + _reconstruct_elements_from_metadata=lambda: [], + ) + + ok = load_multi_repo_cache(fake_fc, repo_names=["repo_a"]) + + assert ok is True + assert set(fake_fc.loaded_repositories) == {"repo_a"} + + +def test_load_multi_repo_cache_fails_when_any_requested_repo_cannot_be_merged(tmp_path): + load_multi_repo_cache = _load_functions( + FASTCODE_MAIN, + ["_load_multi_repo_cache"], + class_name="FastCode", + global_ns={"os": os, "pickle": pickle}, + )[0] + + persist_dir = tmp_path / "persist" + persist_dir.mkdir() + for repo_name in ("repo_a", "repo_b"): + (persist_dir / f"{repo_name}.faiss").write_bytes(b"index") + (persist_dir / f"{repo_name}_metadata.pkl").write_bytes(b"meta") + + class FakeVectorStore: + def __init__(self): + self.persist_dir = str(persist_dir) + self.merged = [] + + def initialize(self, dimension): + self.dimension = dimension + + def merge_from_index(self, repo_name): + if repo_name == "repo_a": + self.merged.append(repo_name) + return True + return False + + def get_count(self): + return len(self.merged) + + fake_fc = SimpleNamespace( + logger=_null_logger(), + vector_store=FakeVectorStore(), + embedder=SimpleNamespace(embedding_dim=3), + loaded_repositories={"repo_a": {"name": "repo_a"}, "repo_b": {"name": "repo_b"}}, + retriever=SimpleNamespace( + persist_dir=str(persist_dir), + build_repo_overview_bm25=lambda: None, + ), + graph_builder=SimpleNamespace(load=lambda repo_name: False, 
merge_from_file=lambda repo_name: False), + _reconstruct_elements_from_metadata=lambda: [], + ) + + ok = load_multi_repo_cache(fake_fc, repo_names=["repo_a", "repo_b"]) + + assert ok is False + + +def test_partial_multi_repo_load_does_not_keep_failed_repo_marked_as_loaded(tmp_path): + load_multi_repo_cache = _load_functions( + FASTCODE_MAIN, + ["_load_multi_repo_cache"], + class_name="FastCode", + global_ns={"os": os, "pickle": pickle}, + )[0] + + persist_dir = tmp_path / "persist" + persist_dir.mkdir() + for repo_name in ("repo_a", "repo_b"): + (persist_dir / f"{repo_name}.faiss").write_bytes(b"index") + (persist_dir / f"{repo_name}_metadata.pkl").write_bytes(b"meta") + + class FakeVectorStore: + def __init__(self): + self.persist_dir = str(persist_dir) + self.merged = [] + + def initialize(self, dimension): + self.dimension = dimension + + def merge_from_index(self, repo_name): + if repo_name == "repo_a": + self.merged.append(repo_name) + return True + return False + + def get_count(self): + return len(self.merged) + + fake_fc = SimpleNamespace( + logger=_null_logger(), + vector_store=FakeVectorStore(), + embedder=SimpleNamespace(embedding_dim=3), + loaded_repositories={"repo_a": {"name": "repo_a"}, "repo_b": {"name": "repo_b"}}, + retriever=SimpleNamespace( + persist_dir=str(persist_dir), + build_repo_overview_bm25=lambda: None, + ), + graph_builder=SimpleNamespace(load=lambda repo_name: False, merge_from_file=lambda repo_name: False), + _reconstruct_elements_from_metadata=lambda: [], + ) + + ok = load_multi_repo_cache(fake_fc, repo_names=["repo_a", "repo_b"]) + + assert ok is False + assert "repo_b" not in fake_fc.loaded_repositories + + +def test_ensure_loaded_rejects_partial_multi_repo_load(): + ensure_loaded = _load_functions( + MCP_SERVER, + ["_ensure_loaded"], + global_ns={"logger": _null_logger()}, + )[0] + + class FakeFC: + def __init__(self): + self.repo_indexed = False + self.loaded_repositories = {} + + def _load_multi_repo_cache(self, repo_names=None): + self.loaded_repositories = {"repo_a": {"name": "repo_a"}} + return True + + fc = FakeFC() + + ok = ensure_loaded(fc, ["repo_a", "repo_b"]) + + assert ok is False + + +def test_repo_name_from_source_disambiguates_local_paths_with_same_basename(monkeypatch): + repo_name_from_source = _load_functions( + MCP_SERVER, + ["_repo_name_from_source"], + global_ns={"os": os}, + )[0] + + fake_fastcode = types.ModuleType("fastcode") + fake_fastcode.__path__ = [] + fake_utils = types.ModuleType("fastcode.utils") + fake_utils.get_repo_name_from_url = lambda source: "repo-from-url" + fake_utils.get_repo_name_from_path = ( + lambda source, workspace_root=None: f"derived::{source}" + ) + fake_fastcode.utils = fake_utils + + monkeypatch.setitem(sys.modules, "fastcode", fake_fastcode) + monkeypatch.setitem(sys.modules, "fastcode.utils", fake_utils) + repo_name_from_source.__globals__["_get_fastcode"] = lambda: SimpleNamespace( + loader=SimpleNamespace(safe_repo_root="/repos") + ) + + repo_a = repo_name_from_source("/tmp/team-a/service", False) + repo_b = repo_name_from_source("/var/team-b/service", False) + + assert repo_a != repo_b + + +def test_repo_name_from_source_is_stable_for_workspace_copy(monkeypatch): + repo_name_from_source = _load_functions( + MCP_SERVER, + ["_repo_name_from_source"], + global_ns={"os": os}, + )[0] + get_repo_name_from_path = _load_functions( + ROOT / "fastcode" / "utils.py", + ["get_repo_name_from_path"], + global_ns={ + "os": os, + "hashlib": __import__("hashlib"), + "normalize_path": lambda path: 
os.path.normpath(path).replace("\\", "/"), + }, + )[0] + + fake_fastcode = types.ModuleType("fastcode") + fake_fastcode.__path__ = [] + fake_utils = types.ModuleType("fastcode.utils") + fake_utils.get_repo_name_from_url = lambda source: "repo-from-url" + fake_utils.get_repo_name_from_path = get_repo_name_from_path + fake_fastcode.utils = fake_utils + + monkeypatch.setitem(sys.modules, "fastcode", fake_fastcode) + monkeypatch.setitem(sys.modules, "fastcode.utils", fake_utils) + repo_name_from_source.__globals__["_get_fastcode"] = lambda: SimpleNamespace( + loader=SimpleNamespace(safe_repo_root="/repos") + ) + + original_path = "/tmp/team/service" + workspace_path = f"/repos/{get_repo_name_from_path(original_path)}" + + assert repo_name_from_source(workspace_path, False) == repo_name_from_source( + original_path, False + ) + + +def test_get_repo_name_from_path_is_stable_for_workspace_paths(): + get_repo_name_from_path = _load_functions( + ROOT / "fastcode" / "utils.py", + ["get_repo_name_from_path"], + global_ns={ + "os": os, + "hashlib": __import__("hashlib"), + "normalize_path": lambda path: os.path.normpath(path).replace("\\", "/"), + }, + )[0] + + original_path = "/tmp/team/service" + derived_name = get_repo_name_from_path(original_path) + workspace_path = f"/repos/{derived_name}" + + assert get_repo_name_from_path(workspace_path, workspace_root="/repos") == derived_name + + +def test_get_repo_name_from_path_keeps_disambiguation_for_hex_suffixed_names(): + get_repo_name_from_path = _load_functions( + ROOT / "fastcode" / "utils.py", + ["get_repo_name_from_path"], + global_ns={ + "os": os, + "hashlib": __import__("hashlib"), + "normalize_path": lambda path: os.path.normpath(path).replace("\\", "/"), + }, + )[0] + + repo_a = get_repo_name_from_path("/tmp/team-a/service-deadbeef", workspace_root="/repos") + repo_b = get_repo_name_from_path("/var/team-b/service-deadbeef", workspace_root="/repos") + + assert repo_a != repo_b + + +def test_load_from_path_reuses_existing_workspace_copy(tmp_path): + get_repo_name_from_path = _load_functions( + ROOT / "fastcode" / "utils.py", + ["get_repo_name_from_path"], + global_ns={ + "os": os, + "hashlib": __import__("hashlib"), + "normalize_path": lambda path: os.path.normpath(path).replace("\\", "/"), + }, + )[0] + load_from_path = _load_functions( + ROOT / "fastcode" / "loader.py", + ["load_from_path"], + class_name="RepositoryLoader", + global_ns={ + "os": os, + "shutil": SimpleNamespace(copytree=Mock()), + "get_repo_name_from_path": get_repo_name_from_path, + }, + )[0] + + safe_repo_root = tmp_path / "repos" + source_repo = tmp_path / "source" / "service" + source_repo.mkdir(parents=True) + + derived_name = get_repo_name_from_path(str(source_repo)) + workspace_repo = safe_repo_root / derived_name + workspace_repo.mkdir(parents=True) + + copytree = load_from_path.__globals__["shutil"].copytree + prepare_repo_path = Mock(return_value=str(tmp_path / "unexpected-copy")) + loader = SimpleNamespace( + logger=_null_logger(), + safe_repo_root=str(safe_repo_root), + _prepare_repo_path=prepare_repo_path, + repo_name=None, + repo_path=None, + ) + + repo_path = load_from_path(loader, str(workspace_repo)) + + assert repo_path == str(workspace_repo) + assert loader.repo_path == str(workspace_repo) + assert not prepare_repo_path.called + assert not copytree.called + + +def test_load_multi_repo_cache_rebuilds_graph_when_graph_cache_is_missing(tmp_path): + load_multi_repo_cache = _load_functions( + FASTCODE_MAIN, + ["_load_multi_repo_cache"], + class_name="FastCode", + 
global_ns={ + "os": os, + "pickle": pickle, + "BM25Okapi": lambda corpus: ("bm25", len(corpus)), + "CodeElement": StubCodeElement, + "CodeGraphBuilder": lambda config: FakeGraphBuilder(), + }, + )[0] + + persist_dir = tmp_path / "persist" + persist_dir.mkdir() + (persist_dir / "repo.faiss").write_bytes(b"index") + (persist_dir / "repo_metadata.pkl").write_bytes(b"meta") + with open(persist_dir / "repo_bm25.pkl", "wb") as f: + pickle.dump( + { + "bm25_corpus": [["query"]], + "bm25_elements": [ + { + "id": "file_repo_main", + "type": "file", + "name": "main.py", + "file_path": "main.py", + "relative_path": "main.py", + "language": "python", + "start_line": 1, + "end_line": 10, + "code": "print('hello')", + "signature": None, + "docstring": None, + "summary": None, + "metadata": {}, + "repo_name": "repo", + "repo_url": None, + } + ], + }, + f, + ) + + class FakeVectorStore: + def __init__(self): + self.persist_dir = str(persist_dir) + self.merged = [] + + def initialize(self, dimension): + self.dimension = dimension + + def merge_from_index(self, repo_name): + self.merged.append(repo_name) + return True + + def get_count(self): + return len(self.merged) + + class FakeGraphBuilder: + def __init__(self): + self.rebuilt = False + + def load(self, repo_name): + return False + + def merge_from_file(self, repo_name): + return False + + def build_graphs(self, elements): + self.rebuilt = True + + graph_builder = FakeGraphBuilder() + load_multi_repo_cache.__globals__["CodeGraphBuilder"] = lambda config: graph_builder + fake_fc = SimpleNamespace( + logger=_null_logger(), + config={}, + vector_store=FakeVectorStore(), + embedder=SimpleNamespace(embedding_dim=3), + loaded_repositories={"repo": {"name": "repo"}}, + retriever=SimpleNamespace( + persist_dir=str(persist_dir), + build_repo_overview_bm25=lambda: None, + full_bm25_elements=[], + full_bm25_corpus=[], + full_bm25=None, + ), + graph_builder=graph_builder, + _reconstruct_elements_from_metadata=lambda: [], + ) + + ok = load_multi_repo_cache(fake_fc, repo_names=["repo"]) + + assert ok is True + assert graph_builder.rebuilt is True + + +def test_failed_partial_reload_cannot_leave_ensure_loaded_with_stale_memory(tmp_path): + load_multi_repo_cache = _load_functions( + FASTCODE_MAIN, + ["_load_multi_repo_cache"], + class_name="FastCode", + global_ns={"os": os, "pickle": pickle}, + )[0] + ensure_loaded = _load_functions( + MCP_SERVER, + ["_ensure_loaded"], + global_ns={"logger": _null_logger()}, + )[0] + + persist_dir = tmp_path / "persist" + persist_dir.mkdir() + for repo_name in ("repo_a", "repo_b"): + (persist_dir / f"{repo_name}.faiss").write_bytes(b"index") + (persist_dir / f"{repo_name}_metadata.pkl").write_bytes(b"meta") + + class FakeVectorStore: + def __init__(self): + self.persist_dir = str(persist_dir) + self.merged = ["repo_a", "repo_b"] + + def initialize(self, dimension): + self.dimension = dimension + self.merged = [] + + def merge_from_index(self, repo_name): + if repo_name == "repo_a": + self.merged.append(repo_name) + return True + return False + + def get_count(self): + return len(self.merged) + + fake_fc = SimpleNamespace( + logger=_null_logger(), + config={}, + vector_store=FakeVectorStore(), + embedder=SimpleNamespace(embedding_dim=3), + retriever=SimpleNamespace( + persist_dir=str(persist_dir), + build_repo_overview_bm25=lambda: None, + ), + graph_builder=SimpleNamespace(load=lambda repo_name: False, merge_from_file=lambda repo_name: False), + loaded_repositories={"repo_a": {"name": "repo_a"}, "repo_b": {"name": "repo_b"}}, + 
repo_indexed=True, + _reconstruct_elements_from_metadata=lambda: [], + ) + + reload_calls = {"count": 0} + + def wrapped_load_multi_repo_cache(repo_names=None): + reload_calls["count"] += 1 + return load_multi_repo_cache(fake_fc, repo_names=repo_names) + + fake_fc._load_multi_repo_cache = wrapped_load_multi_repo_cache + + assert fake_fc._load_multi_repo_cache(repo_names=["repo_a", "repo_b"]) is False + assert ensure_loaded(fake_fc, ["repo_a", "repo_b"]) is False + assert reload_calls["count"] == 2 + + +def test_ensure_repos_ready_rejects_missing_local_path_even_if_index_exists(): + ensure_repos_ready = _load_functions( + MCP_SERVER, + ["_ensure_repos_ready"], + global_ns={"os": os, "logger": _null_logger()}, + )[0] + + fc = SimpleNamespace( + _infer_is_url=lambda source: False, + incremental_reindex=Mock(), + load_repository=Mock(), + index_repository=Mock(), + repo_indexed=True, + loaded_repositories={}, + ) + + ensure_repos_ready.__globals__.update( + { + "_get_fastcode": lambda: fc, + "_apply_forced_env_excludes": lambda fc: None, + "_repo_name_from_source": lambda source, is_url: "repo", + "_is_repo_indexed": lambda repo_name: True, + "_run_full_reindex": Mock(return_value={"status": "success", "count": 3}), + "_invalidate_loaded_state": lambda fc: None, + } + ) + + ready = ensure_repos_ready(["/tmp/path-that-does-not-exist"]) + + assert ready == [] + + +def test_sequential_index_repository_calls_do_not_save_mixed_repo_indexes(): + class FakeGlobalIndexBuilder: + def __init__(self, config): + self.config = config + self.file_map = {} + self.module_map = {} + + def build_maps(self, elements, repo_root): + self.elements = elements + self.repo_root = repo_root + + index_repository = _load_functions( + FASTCODE_MAIN, + ["index_repository"], + class_name="FastCode", + global_ns={ + "np": np, + "CodeGraphBuilder": lambda config: SimpleNamespace( + build_graphs=lambda elements, module_resolver, symbol_resolver: None, + save=lambda repo_name: None, + ), + "GlobalIndexBuilder": FakeGlobalIndexBuilder, + "ModuleResolver": lambda gib: ("module_resolver", gib), + "SymbolResolver": lambda gib, module_resolver: ( + "symbol_resolver", + gib, + module_resolver, + ), + }, + )[0] + + saved_indexes = {} + + def make_element(repo_name: str) -> StubCodeElement: + return StubCodeElement( + id=f"{repo_name}-file", + type="file", + name=f"{repo_name}.py", + file_path=f"/workspace/{repo_name}/{repo_name}.py", + relative_path=f"{repo_name}.py", + language="python", + start_line=1, + end_line=10, + code=f"print('{repo_name}')", + signature=None, + docstring=None, + summary=None, + metadata={"embedding": [0.1, 0.2, 0.3]}, + repo_name=repo_name, + repo_url=f"https://example.com/{repo_name}.git", + ) + + class FakeIndexer: + def index_repository(self, repo_name=None, repo_url=None): + return [make_element(repo_name)] + + class FakeVectorStore: + def __init__(self): + self.dimension = None + self.metadata = [] + + def initialize(self, dimension): + self.dimension = dimension + self.metadata = [] + + def add_vectors(self, vectors, metadata): + self.metadata.extend(metadata) + + fake_fc = SimpleNamespace( + logger=_null_logger(), + eval_config={}, + repo_loaded=True, + repo_indexed=False, + repo_info={"name": "repo_a", "url": "https://example.com/repo_a.git"}, + indexer=FakeIndexer(), + embedder=SimpleNamespace(embedding_dim=3), + vector_store=FakeVectorStore(), + retriever=SimpleNamespace( + index_for_bm25=lambda elements: None, + build_repo_overview_bm25=lambda: None, + save_bm25=lambda repo_name: None, + ), + 
graph_builder=SimpleNamespace( + build_graphs=lambda elements, module_resolver, symbol_resolver: None, + save=lambda repo_name: None, + ), + loader=SimpleNamespace(repo_path="/workspace/repo_a"), + config={}, + _should_use_cache=lambda: False, + _should_persist_indexes=lambda: True, + _build_file_manifest=lambda elements, repo_root: {"files": {}}, + _save_file_manifest=lambda repo_name, manifest: None, + _log_statistics=lambda: None, + ) + + def save_snapshot(cache_name=None): + saved_indexes[cache_name] = [dict(meta) for meta in fake_fc.vector_store.metadata] + + fake_fc._save_to_cache = save_snapshot + + index_repository(fake_fc) + + fake_fc.repo_info = {"name": "repo_b", "url": "https://example.com/repo_b.git"} + fake_fc.loader.repo_path = "/workspace/repo_b" + index_repository(fake_fc) + + assert {meta["repo_name"] for meta in saved_indexes["repo_a"]} == {"repo_a"} + assert {meta["repo_name"] for meta in saved_indexes["repo_b"]} == {"repo_b"} + + +def test_query_processor_llm_enhancement_does_not_print_to_stdout(): + _assert_no_print_calls( + ROOT / "fastcode" / "query_processor.py", + "_enhance_with_llm", + class_name="QueryProcessor", + ) + + +def test_vector_store_repo_overview_search_does_not_print_to_stdout(): + _assert_no_print_calls( + ROOT / "fastcode" / "vector_store.py", + "search_repository_overviews", + class_name="VectorStore", + ) + + +def test_retriever_repo_selection_does_not_print_to_stdout(): + _assert_no_print_calls( + ROOT / "fastcode" / "retriever.py", + "_select_relevant_repositories", + class_name="HybridRetriever", + ) + + +def test_load_multiple_repositories_persists_manifest_for_each_repo(): + class FakeTempVectorStore: + def __init__(self, config): + self.config = config + self.dimension = None + self.metadata = [] + + def initialize(self, dimension): + self.dimension = dimension + self.metadata = [] + + def add_vectors(self, vectors, metadata): + self.metadata.extend(metadata) + + def save(self, repo_name): + self.saved_repo = repo_name + + class FakeCodeIndexer: + def __init__(self, config, loader, parser, embedder, vector_store): + self.loader = loader + + def index_repository(self, repo_name=None, repo_url=None): + return [ + StubCodeElement( + id=f"{repo_name}-file", + type="file", + name=f"{repo_name}.py", + file_path=f"/workspace/{repo_name}/{repo_name}.py", + relative_path=f"{repo_name}.py", + language="python", + start_line=1, + end_line=10, + code=f"print('{repo_name}')", + signature=None, + docstring=None, + summary=None, + metadata={"embedding": [0.1, 0.2, 0.3]}, + repo_name=repo_name, + repo_url=repo_url, + ) + ] + + class FakeRetriever: + def __init__(self, config, vector_store, embedder, graph_builder, repo_root=None): + self.repo_root = repo_root + + def index_for_bm25(self, elements): + self.elements = elements + + def save_bm25(self, repo_name): + self.saved_repo = repo_name + + def build_repo_overview_bm25(self): + self.repo_overview_built = True + + class FakeGraphBuilder: + def __init__(self, config=None): + self.config = config + + def build_graphs(self, elements, module_resolver=None, symbol_resolver=None): + self.elements = elements + + def save(self, repo_name): + self.saved_repo = repo_name + + class FakeGlobalIndexBuilder: + def __init__(self, config): + self.config = config + + def build_maps(self, elements, repo_root): + self.elements = elements + self.repo_root = repo_root + + load_multiple_repositories = _load_functions( + FASTCODE_MAIN, + ["load_multiple_repositories"], + class_name="FastCode", + global_ns={ + "np": np, + 
"VectorStore": FakeTempVectorStore, + "CodeIndexer": FakeCodeIndexer, + "HybridRetriever": FakeRetriever, + "CodeGraphBuilder": FakeGraphBuilder, + "GlobalIndexBuilder": FakeGlobalIndexBuilder, + "ModuleResolver": lambda gib: ("module_resolver", gib), + "SymbolResolver": lambda gib, module_resolver: ( + "symbol_resolver", + gib, + module_resolver, + ), + }, + )[0] + + class FakeLoader: + def __init__(self): + self.repo_path = None + self.current_name = None + + def load_from_path(self, source): + self.current_name = os.path.basename(source) + self.repo_path = f"/workspace/{self.current_name}" + + def get_repository_info(self): + return { + "name": self.current_name, + "file_count": 1, + "total_size_mb": 0.01, + } + + class FakeMainVectorStore: + def __init__(self): + self.dimension = None + self.merged = [] + + def initialize(self, dimension): + self.dimension = dimension + + def merge_from_index(self, repo_name): + self.merged.append(repo_name) + return True + + saved_manifests = [] + + fake_fc = SimpleNamespace( + logger=_null_logger(), + config={}, + loader=FakeLoader(), + parser=None, + embedder=SimpleNamespace(embedding_dim=3), + vector_store=FakeMainVectorStore(), + graph_builder=FakeGraphBuilder(), + loaded_repositories={}, + multi_repo_mode=False, + repo_indexed=False, + repo_loaded=False, + _save_file_manifest=lambda repo_name, manifest: saved_manifests.append(repo_name), + _build_file_manifest=lambda elements, repo_root: {"files": {}}, + ) + + load_multiple_repositories( + fake_fc, + [ + {"source": "/tmp/repo_a", "is_url": False}, + {"source": "/tmp/repo_b", "is_url": False}, + ], + ) + + assert fake_fc.vector_store.merged == ["repo_a", "repo_b"] + assert saved_manifests == ["repo_a", "repo_b"] + + +def test_sequential_index_repository_calls_do_not_save_mixed_repo_graphs(): + class FakeGlobalIndexBuilder: + def __init__(self, config): + self.config = config + self.file_map = {} + self.module_map = {} + + def build_maps(self, elements, repo_root): + self.elements = elements + self.repo_root = repo_root + + saved_graphs = {} + + class FakeGraphBuilder: + def __init__(self): + self.repo_names = [] + + def build_graphs(self, elements, module_resolver, symbol_resolver): + self.repo_names.extend(elem.repo_name for elem in elements) + + def save(self, repo_name): + saved_graphs[repo_name] = list(self.repo_names) + + index_repository = _load_functions( + FASTCODE_MAIN, + ["index_repository"], + class_name="FastCode", + global_ns={ + "np": np, + "CodeGraphBuilder": lambda config: FakeGraphBuilder(), + "GlobalIndexBuilder": FakeGlobalIndexBuilder, + "ModuleResolver": lambda gib: ("module_resolver", gib), + "SymbolResolver": lambda gib, module_resolver: ( + "symbol_resolver", + gib, + module_resolver, + ), + }, + )[0] + + def make_element(repo_name: str) -> StubCodeElement: + return StubCodeElement( + id=f"{repo_name}-file", + type="file", + name=f"{repo_name}.py", + file_path=f"/workspace/{repo_name}/{repo_name}.py", + relative_path=f"{repo_name}.py", + language="python", + start_line=1, + end_line=10, + code=f"print('{repo_name}')", + signature=None, + docstring=None, + summary=None, + metadata={"embedding": [0.1, 0.2, 0.3]}, + repo_name=repo_name, + repo_url=f"https://example.com/{repo_name}.git", + ) + + class FakeIndexer: + def index_repository(self, repo_name=None, repo_url=None): + return [make_element(repo_name)] + + class FakeVectorStore: + def __init__(self): + self.dimension = None + self.metadata = [] + + def initialize(self, dimension): + self.dimension = dimension + 
self.metadata = [] + + def add_vectors(self, vectors, metadata): + self.metadata.extend(metadata) + + fake_fc = SimpleNamespace( + logger=_null_logger(), + eval_config={}, + repo_loaded=True, + repo_indexed=False, + repo_info={"name": "repo_a", "url": "https://example.com/repo_a.git"}, + indexer=FakeIndexer(), + embedder=SimpleNamespace(embedding_dim=3), + vector_store=FakeVectorStore(), + retriever=SimpleNamespace( + index_for_bm25=lambda elements: None, + build_repo_overview_bm25=lambda: None, + save_bm25=lambda repo_name: None, + ), + graph_builder=FakeGraphBuilder(), + loader=SimpleNamespace(repo_path="/workspace/repo_a"), + config={}, + _should_use_cache=lambda: False, + _should_persist_indexes=lambda: True, + _build_file_manifest=lambda elements, repo_root: {"files": {}}, + _save_file_manifest=lambda repo_name, manifest: None, + _save_to_cache=lambda cache_name=None: None, + _log_statistics=lambda: None, + ) + + index_repository(fake_fc) + + fake_fc.repo_info = {"name": "repo_b", "url": "https://example.com/repo_b.git"} + fake_fc.loader.repo_path = "/workspace/repo_b" + index_repository(fake_fc) + + assert set(saved_graphs["repo_a"]) == {"repo_a"} + assert set(saved_graphs["repo_b"]) == {"repo_b"} + + +def test_ensure_repos_ready_reuses_legacy_local_basename_indexes(monkeypatch, tmp_path): + ensure_repos_ready = _load_functions( + MCP_SERVER, + ["_ensure_repos_ready"], + global_ns={"os": os, "logger": _null_logger()}, + )[0] + repo_name_from_source = _load_functions( + MCP_SERVER, + ["_repo_name_from_source"], + global_ns={"os": os}, + )[0] + is_repo_indexed = _load_functions( + MCP_SERVER, + ["_is_repo_indexed"], + global_ns={"os": os}, + )[0] + get_repo_name_from_path = _load_functions( + ROOT / "fastcode" / "utils.py", + ["get_repo_name_from_path"], + global_ns={ + "os": os, + "hashlib": __import__("hashlib"), + "normalize_path": lambda path: os.path.normpath(path).replace("\\", "/"), + }, + )[0] + + fake_fastcode = types.ModuleType("fastcode") + fake_fastcode.__path__ = [] + fake_utils = types.ModuleType("fastcode.utils") + fake_utils.get_repo_name_from_url = lambda source: "repo-from-url" + fake_utils.get_repo_name_from_path = get_repo_name_from_path + fake_fastcode.utils = fake_utils + + monkeypatch.setitem(sys.modules, "fastcode", fake_fastcode) + monkeypatch.setitem(sys.modules, "fastcode.utils", fake_utils) + + persist_dir = tmp_path / "persist" + persist_dir.mkdir() + (persist_dir / "service.faiss").write_bytes(b"index") + (persist_dir / "service_metadata.pkl").write_bytes(b"meta") + + source_repo = tmp_path / "team" / "service" + source_repo.mkdir(parents=True) + + fake_fc = SimpleNamespace( + _infer_is_url=lambda source: False, + load_repository=Mock(), + index_repository=Mock(), + vector_store=SimpleNamespace(persist_dir=str(persist_dir)), + loader=SimpleNamespace(safe_repo_root="/repos"), + ) + + repo_name_from_source.__globals__["_get_fastcode"] = lambda: fake_fc + is_repo_indexed.__globals__["_get_fastcode"] = lambda: fake_fc + ensure_repos_ready.__globals__.update( + { + "_get_fastcode": lambda: fake_fc, + "_apply_forced_env_excludes": lambda fc: None, + "_repo_name_from_source": repo_name_from_source, + "_is_repo_indexed": is_repo_indexed, + "_run_full_reindex": Mock(return_value={"status": "success", "count": 1}), + "_invalidate_loaded_state": lambda fc: None, + } + ) + + ready = ensure_repos_ready([str(source_repo)], allow_incremental=False) + + assert ready + assert not fake_fc.load_repository.called + assert not fake_fc.index_repository.called + + +def 
test_ensure_repos_ready_errors_on_legacy_basename_collision(monkeypatch, tmp_path): + logged_errors = [] + logger = SimpleNamespace( + info=lambda *args, **kwargs: None, + warning=lambda *args, **kwargs: None, + error=lambda *args, **kwargs: logged_errors.append((args, kwargs)), + debug=lambda *args, **kwargs: None, + ) + + ensure_repos_ready = _load_functions( + MCP_SERVER, + ["_ensure_repos_ready"], + global_ns={"os": os, "logger": logger}, + )[0] + repo_name_from_source = _load_functions( + MCP_SERVER, + ["_repo_name_from_source"], + global_ns={"os": os}, + )[0] + is_repo_indexed = _load_functions( + MCP_SERVER, + ["_is_repo_indexed"], + global_ns={"os": os}, + )[0] + get_repo_name_from_path = _load_functions( + ROOT / "fastcode" / "utils.py", + ["get_repo_name_from_path"], + global_ns={ + "os": os, + "hashlib": __import__("hashlib"), + "normalize_path": lambda path: os.path.normpath(path).replace("\\", "/"), + }, + )[0] + + fake_fastcode = types.ModuleType("fastcode") + fake_fastcode.__path__ = [] + fake_utils = types.ModuleType("fastcode.utils") + fake_utils.get_repo_name_from_url = lambda source: "repo-from-url" + fake_utils.get_repo_name_from_path = get_repo_name_from_path + fake_fastcode.utils = fake_utils + + monkeypatch.setitem(sys.modules, "fastcode", fake_fastcode) + monkeypatch.setitem(sys.modules, "fastcode.utils", fake_utils) + + persist_dir = tmp_path / "persist" + persist_dir.mkdir() + (persist_dir / "service.faiss").write_bytes(b"index") + (persist_dir / "service_metadata.pkl").write_bytes(b"meta") + + original_repo = tmp_path / "team-a" / "service" + colliding_repo = tmp_path / "team-b" / "service" + original_repo.mkdir(parents=True) + colliding_repo.mkdir(parents=True) + + assert get_repo_name_from_path(str(original_repo)) != get_repo_name_from_path( + str(colliding_repo) + ) + + fake_fc = SimpleNamespace( + _infer_is_url=lambda source: False, + load_repository=Mock(), + index_repository=Mock(), + vector_store=SimpleNamespace(persist_dir=str(persist_dir)), + loader=SimpleNamespace(safe_repo_root="/repos"), + ) + + repo_name_from_source.__globals__["_get_fastcode"] = lambda: fake_fc + is_repo_indexed.__globals__["_get_fastcode"] = lambda: fake_fc + ensure_repos_ready.__globals__.update( + { + "_get_fastcode": lambda: fake_fc, + "_apply_forced_env_excludes": lambda fc: None, + "_repo_name_from_source": repo_name_from_source, + "_is_repo_indexed": is_repo_indexed, + "_run_full_reindex": Mock(return_value={"status": "success", "count": 1}), + "_invalidate_loaded_state": lambda fc: None, + } + ) + + ready = ensure_repos_ready([str(colliding_repo)], allow_incremental=False) + + assert ready == [] + assert logged_errors + assert not fake_fc.load_repository.called + assert not fake_fc.index_repository.called + + +def test_load_multi_repo_cache_rebuilds_graph_from_all_loaded_repositories_when_bm25_is_partial( + tmp_path, +): + load_multi_repo_cache = _load_functions( + FASTCODE_MAIN, + ["_load_multi_repo_cache"], + class_name="FastCode", + global_ns={ + "os": os, + "pickle": pickle, + "BM25Okapi": lambda corpus: ("bm25", len(corpus)), + "CodeElement": StubCodeElement, + "CodeGraphBuilder": lambda config: None, + }, + )[0] + + persist_dir = tmp_path / "persist" + persist_dir.mkdir() + for repo_name in ("repo_a", "repo_b"): + (persist_dir / f"{repo_name}.faiss").write_bytes(b"index") + (persist_dir / f"{repo_name}_metadata.pkl").write_bytes(b"meta") + + with open(persist_dir / "repo_a_bm25.pkl", "wb") as f: + pickle.dump( + { + "bm25_corpus": [["repo", "a"]], + "bm25_elements": [ 
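+                    # repo_a is the only repository with a BM25 cache on
+                    # disk; repo_b's cache is deliberately absent, so the
+                    # load path must fall back to
+                    # _reconstruct_elements_from_metadata and rebuild the
+                    # graph from every loaded repository, not just repo_a.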
+ { + "id": "repo_a-file", + "type": "file", + "name": "repo_a.py", + "file_path": "/workspace/repo_a/repo_a.py", + "relative_path": "repo_a.py", + "language": "python", + "start_line": 1, + "end_line": 10, + "code": "print('repo_a')", + "signature": None, + "docstring": None, + "summary": None, + "metadata": {}, + "repo_name": "repo_a", + "repo_url": None, + } + ], + }, + f, + ) + + def make_element(repo_name: str) -> StubCodeElement: + return StubCodeElement( + id=f"{repo_name}-file", + type="file", + name=f"{repo_name}.py", + file_path=f"/workspace/{repo_name}/{repo_name}.py", + relative_path=f"{repo_name}.py", + language="python", + start_line=1, + end_line=10, + code=f"print('{repo_name}')", + signature=None, + docstring=None, + summary=None, + metadata={}, + repo_name=repo_name, + repo_url=None, + ) + + class FakeVectorStore: + def __init__(self): + self.persist_dir = str(persist_dir) + self.merged = [] + + def initialize(self, dimension): + self.dimension = dimension + + def merge_from_index(self, repo_name): + self.merged.append(repo_name) + return True + + def get_count(self): + return len(self.merged) + + class FakeGraphBuilder: + def __init__(self): + self.rebuilt_repo_names = [] + + def load(self, repo_name): + return False + + def merge_from_file(self, repo_name): + return False + + def build_graphs(self, elements): + self.rebuilt_repo_names = [elem.repo_name for elem in elements] + + graph_builder = FakeGraphBuilder() + load_multi_repo_cache.__globals__["CodeGraphBuilder"] = lambda config: graph_builder + + fake_fc = SimpleNamespace( + logger=_null_logger(), + config={}, + vector_store=FakeVectorStore(), + embedder=SimpleNamespace(embedding_dim=3), + loaded_repositories={"repo_a": {"name": "repo_a"}, "repo_b": {"name": "repo_b"}}, + retriever=SimpleNamespace( + persist_dir=str(persist_dir), + build_repo_overview_bm25=lambda: None, + index_for_bm25=lambda elements: None, + full_bm25_elements=[], + full_bm25_corpus=[], + full_bm25=None, + ), + graph_builder=graph_builder, + _reconstruct_elements_from_metadata=lambda: [ + make_element("repo_a"), + make_element("repo_b"), + ], + ) + + ok = load_multi_repo_cache(fake_fc, repo_names=["repo_a", "repo_b"]) + + assert ok is True + assert set(graph_builder.rebuilt_repo_names) == {"repo_a", "repo_b"}
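+
+
+# NOTE: illustrative sketch, not a regression test for a specific fix above.
+# It pins down the property the preceding naming tests all rely on: with only
+# os, hashlib, and normalize_path injected, get_repo_name_from_path is a pure
+# function of its inputs, so the same source path always maps to the same
+# index name across sessions.
+def test_get_repo_name_from_path_is_deterministic():
+    get_repo_name_from_path = _load_functions(
+        ROOT / "fastcode" / "utils.py",
+        ["get_repo_name_from_path"],
+        global_ns={
+            "os": os,
+            "hashlib": __import__("hashlib"),
+            "normalize_path": lambda path: os.path.normpath(path).replace("\\", "/"),
+        },
+    )[0]
+
+    path = "/tmp/team/service"
+
+    # Deterministic naming is what lets a later process rediscover the
+    # "<name>.faiss" / "<name>_metadata.pkl" pair saved for this path.
+    assert get_repo_name_from_path(path) == get_repo_name_from_path(path)
+    assert get_repo_name_from_path(
+        path, workspace_root="/repos"
+    ) == get_repo_name_from_path(path, workspace_root="/repos")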