From 08e87c6de9c2b5ede2c338a82d4664e01f839fe4 Mon Sep 17 00:00:00 2001
From: shrutu0929 <your.github.email@example.com>
Date: Tue, 7 Apr 2026 11:58:57 +0530
Subject: [PATCH] perf: Batch embeddings in RAGIndexer instead of per-file
 encode

---
 refactron/rag/indexer.py | 16 ++++++++++++----
 1 file changed, 12 insertions(+), 4 deletions(-)

diff --git a/refactron/rag/indexer.py b/refactron/rag/indexer.py
index 690cfe3..a0b8890 100644
--- a/refactron/rag/indexer.py
+++ b/refactron/rag/indexer.py
@@ -91,7 +91,7 @@ def __init__(
         self.parser = CodeParser()
 
     def index_repository(
-        self, repo_path: Optional[Path] = None, summarize: bool = False
+        self, repo_path: Optional[Path] = None, summarize: bool = False, batch_size: int = 100
     ) -> IndexStats:
         """Index an entire repository.
 
@@ -127,6 +127,8 @@ def index_repository(
 
         total_chunks = 0
         chunk_type_counts: Dict[str, int] = {}
+        # Chunks pooled across files; passed to add_chunks() once batch_size is reached
+        current_batch: List[CodeChunk] = []
 
         # Index each file
         for py_file in python_files:
         # Index each file
         for py_file in python_files:
@@ -139,11 +141,29 @@ def index_repository(
                     chunk_type_counts[chunk.chunk_type] = (
                         chunk_type_counts.get(chunk.chunk_type, 0) + 1
                     )
+
+                # Only queue the chunks here; embedding happens after the try block so a
+                # batch failure is not reported as a failure to parse this particular file.
+                current_batch.extend(chunks)
             except Exception as e:
                 # Skip files that can't be parsed
                 print(f"Warning: Could not index {py_file}: {e}")
                 continue
 
+            if len(current_batch) >= batch_size:
+                try:
+                    self.add_chunks(current_batch)
+                except Exception as e:
+                    print(f"Warning: Could not index batch of {len(current_batch)} chunks: {e}")
+                # Always start a fresh batch so a failed one is not regrown and retried
+                current_batch = []
+
+        if current_batch:
+            try:
+                self.add_chunks(current_batch)
+            except Exception as e:
+                print(f"Warning: Could not index batch of {len(current_batch)} chunks: {e}")
+
         # Save index metadata
         self._save_metadata(
             {
         # Save index metadata
         self._save_metadata(
             {
@@ -188,9 +199,6 @@ def _index_file(self, file_path: Path, summarize: bool = False) -> List[CodeChun
                 except Exception as e:
                     print(f"Warning: AI summarization failed for chunk in {file_path}: {e}")
 
-        # Add chunks to the index
-        self.add_chunks(chunks)
-
         return chunks
 
     def _summarize_chunk(self, chunk: CodeChunk) -> Optional[str]: