From 08e87c6de9c2b5ede2c338a82d4664e01f839fe4 Mon Sep 17 00:00:00 2001 From: shrutu0929 Date: Tue, 7 Apr 2026 11:58:57 +0530 Subject: [PATCH] perf: Batch embeddings in RAGIndexer instead of per-file encode --- refactron/rag/indexer.py | 16 ++++++++++++---- 1 file changed, 12 insertions(+), 4 deletions(-) diff --git a/refactron/rag/indexer.py b/refactron/rag/indexer.py index 690cfe3..a0b8890 100644 --- a/refactron/rag/indexer.py +++ b/refactron/rag/indexer.py @@ -91,7 +91,7 @@ def __init__( self.parser = CodeParser() def index_repository( - self, repo_path: Optional[Path] = None, summarize: bool = False + self, repo_path: Optional[Path] = None, summarize: bool = False, batch_size: int = 100 ) -> IndexStats: """Index an entire repository. @@ -127,6 +127,8 @@ def index_repository( total_chunks = 0 chunk_type_counts: Dict[str, int] = {} + + current_batch: List[CodeChunk] = [] # Index each file for py_file in python_files: @@ -139,11 +141,20 @@ def index_repository( chunk_type_counts[chunk.chunk_type] = ( chunk_type_counts.get(chunk.chunk_type, 0) + 1 ) + + current_batch.extend(chunks) + + if len(current_batch) >= batch_size: + self.add_chunks(current_batch) + current_batch = [] except Exception as e: # Skip files that can't be parsed print(f"Warning: Could not index {py_file}: {e}") continue + if current_batch: + self.add_chunks(current_batch) + # Save index metadata self._save_metadata( { @@ -188,9 +199,6 @@ def _index_file(self, file_path: Path, summarize: bool = False) -> List[CodeChun except Exception as e: print(f"Warning: AI summarization failed for chunk in {file_path}: {e}") - # Add chunks to the index - self.add_chunks(chunks) - return chunks def _summarize_chunk(self, chunk: CodeChunk) -> Optional[str]: