Skip to content

Commit cf45221

Browse files
authored
feat: Better search tool (#739)
# Motivation <!-- Why is this change necessary? --> # Content <!-- Please include a summary of the change --> # Testing <!-- How was the change tested? --> # Please check the following before marking your PR as ready for review - [ ] I have added tests for my changes - [ ] I have updated the documentation or added new documentation as needed --------- Co-authored-by: kopekC <28070492+kopekC@users.noreply.github.com>
1 parent fc31867 commit cf45221

File tree

4 files changed

+373
-236
lines changed

4 files changed

+373
-236
lines changed

src/codegen/extensions/index/file_index.py

Lines changed: 156 additions & 7 deletions
Original file line numberDiff line numberDiff line change
@@ -2,7 +2,9 @@
22

33
import pickle
44
from pathlib import Path
5+
from typing import Optional
56

7+
import modal
68
import numpy as np
79
import tiktoken
810
from openai import OpenAI
@@ -26,6 +28,7 @@ class FileIndex(CodeIndex):
2628
EMBEDDING_MODEL = "text-embedding-3-small"
2729
MAX_TOKENS = 8000
2830
BATCH_SIZE = 100
31+
USE_MODAL_DICT = True # Flag to control whether to use Modal Dict
2932

3033
def __init__(self, codebase: Codebase):
3134
"""Initialize the file index.
@@ -37,10 +40,87 @@ def __init__(self, codebase: Codebase):
3740
self.client = OpenAI()
3841
self.encoding = tiktoken.get_encoding("cl100k_base")
3942

43+
def set_use_modal_dict(self, use_modal: bool) -> None:
    """Turn Modal Dict storage on or off for this index instance.

    Args:
        use_modal: True to use Modal Dict storage, False to disable it.
    """
    # Assigning through ``self`` shadows the class-level USE_MODAL_DICT default
    # on this instance only.
    self.USE_MODAL_DICT = use_modal
    state = "enabled" if use_modal else "disabled"
    logger.info(f"Modal Dict storage {state}")
51+
4052
@property
4153
def save_file_name(self) -> str:
4254
return "file_index_{commit}.pkl"
4355

56+
@property
def modal_dict_id(self) -> str:
    """Name of the Modal Dict that stores this index.

    Returns:
        ``file_index_<commit_hash>`` when a commit hash is set, otherwise
        ``file_index_latest``. The prefix mirrors the ``file_index_{commit}.pkl``
        pattern used for the local pickle file.
    """
    suffix = self.commit_hash or "latest"
    return f"file_index_{suffix}"
62+
63+
def delete_modal_dict(self) -> bool:
    """Delete the Modal Dict storage for this index.

    Returns:
        bool: True if the delete call succeeded. False if Modal Dict storage
        is disabled or the delete call raised (for example because the dict
        does not exist).
    """
    if not self.USE_MODAL_DICT:
        logger.warning("Modal Dict storage is disabled")
        return False

    dict_id = self.modal_dict_id
    logger.info(f"Deleting Modal Dict: {dict_id}")

    # A single handler is enough: the previous inner ``except Exception``
    # already caught everything the delete could raise, so the outer
    # handler could never run for it.
    try:
        modal.Dict.delete(dict_id)
    except Exception as e:
        # Best effort: a missing dict and a failed delete are both reported as False.
        logger.info(f"Modal Dict {dict_id} does not exist or cannot be deleted: {e}")
        return False

    logger.info(f"Successfully deleted Modal Dict: {dict_id}")
    return True
89+
90+
def modal_dict_exists(self, commit_hash: Optional[str] = None) -> bool:
    """Check whether a Modal Dict holding index data exists for a commit.

    Args:
        commit_hash: The commit hash to check, or None to use the current commit.

    Returns:
        bool: True if the Modal Dict can be accessed and contains an
        ``"index_data"`` entry. False otherwise, including when Modal Dict
        storage is disabled or the lookup raises.
    """
    if not self.USE_MODAL_DICT:
        return False

    try:
        # Resolve the dict name without temporarily overwriting
        # self.commit_hash: this is a read-only query and must not touch
        # instance state, even briefly.
        target_commit = self.commit_hash if commit_hash is None else commit_hash
        # Same naming rule as ``modal_dict_id``: a falsy hash maps to "latest".
        dict_id = f"file_index_{target_commit}" if target_commit else "file_index_latest"

        # The membership test fails if the dict cannot be accessed.
        modal_dict = modal.Dict.from_name(dict_id, create_if_missing=False)
        return "index_data" in modal_dict
    except Exception:
        return False
123+
44124
def _split_by_tokens(self, text: str) -> list[str]:
45125
"""Split text into chunks that fit within token limit."""
46126
tokens = self.encoding.encode(text)
@@ -135,17 +215,69 @@ def _get_changed_items(self) -> set[File]:
135215
return changed_files
136216

137217
def _save_index(self, path: Path) -> None:
138-
"""Save index data to disk."""
218+
"""Save index data to disk and optionally to Modal Dict."""
219+
# Save to local pickle file
139220
with open(path, "wb") as f:
140221
pickle.dump({"E": self.E, "items": self.items, "commit_hash": self.commit_hash}, f)
141222

223+
# Save to Modal Dict if enabled
224+
if self.USE_MODAL_DICT:
225+
try:
226+
dict_id = self.modal_dict_id
227+
logger.info(f"Saving index to Modal Dict: {dict_id}")
228+
229+
# Convert numpy arrays to lists for JSON serialization
230+
modal_data = {"E": self.E.tolist() if self.E is not None else None, "items": self.items.tolist() if self.items is not None else None, "commit_hash": self.commit_hash}
231+
232+
# Create or update Modal Dict
233+
# Note: from_name is lazy, so we need to explicitly set the data
234+
modal_dict = modal.Dict.from_name(dict_id, create_if_missing=True)
235+
modal_dict["index_data"] = modal_data
236+
237+
logger.info(f"Successfully saved index to Modal Dict: {dict_id}")
238+
except Exception as e:
239+
logger.exception(f"Failed to save index to Modal Dict: {e}")
240+
142241
def _load_index(self, path: Path) -> None:
    """Load index data from the Modal Dict (when enabled) or a local pickle file.

    The Modal Dict is tried first. If it is disabled, has no ``"index_data"``
    entry, or raises, the index is loaded from ``path`` instead.

    Args:
        path: Local pickle file used as the fallback source.

    Raises:
        Exception: Whatever opening, unpickling, or reading the expected keys
            from ``path`` raises; the error is logged and re-raised.
    """
    if self.USE_MODAL_DICT:
        dict_id = self.modal_dict_id
        logger.info(f"Attempting to load index from Modal Dict: {dict_id}")

        # One handler suffices: it catches everything the lookup can raise.
        try:
            modal_dict = modal.Dict.from_name(dict_id, create_if_missing=False)
            if "index_data" in modal_dict:
                data = modal_dict["index_data"]

                # Build every value before assigning, so a malformed entry
                # cannot leave the index half-updated when we fall back.
                embeddings = np.array(data["E"]) if data["E"] is not None else None
                items = np.array(data["items"]) if data["items"] is not None else None
                commit_hash = data["commit_hash"]

                self.E = embeddings
                self.items = items
                self.commit_hash = commit_hash

                logger.info(f"Successfully loaded index from Modal Dict: {dict_id}")
                return
            logger.info(f"No index data found in Modal Dict: {dict_id}")
        except Exception as e:
            logger.warning(f"Modal Dict {dict_id} not found or error accessing it: {e}")

    # Fall back to loading from local file
    try:
        with open(path, "rb") as f:
            # NOTE(security): pickle.load can execute arbitrary code; only
            # load index files from trusted locations.
            data = pickle.load(f)
        embeddings = data["E"]
        items = data["items"]
        commit_hash = data["commit_hash"]
    except Exception as e:
        logger.exception(f"Failed to load index from local file: {e}")
        raise

    self.E = embeddings
    self.items = items
    self.commit_hash = commit_hash
    logger.info(f"Loaded index from local file: {path}")
149281

150282
def similarity_search(self, query: str, k: int = 5) -> list[tuple[File, float]]:
151283
"""Find the k most similar files to a query.
@@ -216,3 +348,20 @@ def update(self) -> None:
216348

217349
# Update commit hash
218350
self.commit_hash = self._get_current_commit()
351+
352+
# Save updated index to Modal Dict if enabled
353+
if self.USE_MODAL_DICT and (num_updated > 0 or num_added > 0):
354+
try:
355+
dict_id = self.modal_dict_id
356+
logger.info(f"Updating index in Modal Dict: {dict_id}")
357+
358+
# Convert numpy arrays to lists for JSON serialization
359+
modal_data = {"E": self.E.tolist() if self.E is not None else None, "items": self.items.tolist() if self.items is not None else None, "commit_hash": self.commit_hash}
360+
361+
# Create or update Modal Dict
362+
modal_dict = modal.Dict.from_name(dict_id, create_if_missing=True)
363+
modal_dict["index_data"] = modal_data
364+
365+
logger.info(f"Successfully updated index in Modal Dict: {dict_id}")
366+
except Exception as e:
367+
logger.exception(f"Failed to update index in Modal Dict: {e}")

src/codegen/extensions/langchain/tools.py

Lines changed: 138 additions & 20 deletions
Original file line numberDiff line numberDiff line change
@@ -110,25 +110,111 @@ def _run(self, dirpath: str = "./", depth: int = 1) -> str:
110110

111111

112112
class SearchInput(BaseModel):
# Argument schema for the "search" tool (SearchTool sets args_schema = SearchInput).
# Each Field description below is the text shown for that argument.
113-
"""Input for searching the codebase."""
114-
115113
query: str = Field(
116114
...,
117-
description="The search query to find in the codebase. When ripgrep is available, this will be passed as a ripgrep pattern. "
118-
"For regex searches, set use_regex=True. Ripgrep is the preferred method.",
115+
description="""The text or pattern to search for in the codebase.
116+

117+
For simple text search (use_regex=False):
118+
- Uses ripgrep's fixed-strings mode (--fixed-strings)
119+
- Case-insensitive matching (--ignore-case)
120+
- All characters are treated literally, including special regex characters
121+
- Exact string matching (no regex interpretation)
122+

123+
For regex search (use_regex=True):
124+
- Full regex pattern support
125+
- Case-sensitive by default
126+
- Special characters have regex meaning and need proper escaping
127+
- Uses ripgrep's default regex mode
128+

129+
If no exact matches are found, automatically falls back to semantic search
130+
to find relevant code even without exact text matches.""",
131+
)
132+

133+
target_directories: Optional[list[str]] = Field(
134+
default=None,
135+
description="""Optional list of directories to limit the search scope.
136+

137+
- Paths should be relative to the workspace root
138+
- Multiple directories are searched in parallel
139+
- If None, searches the entire codebase
140+

141+
Example: ["src/frontend", "tests/unit"]""",
142+
)
143+

144+
file_extensions: Optional[list[str]] = Field(
145+
default=None,
146+
description="""Optional list of file extensions to filter the search.
147+

148+
- Include the dot in extensions (e.g. ['.py', '.ts'])
149+
- Multiple extensions are combined with OR logic
150+
- If None, searches all file types
151+
- Binary files are automatically excluded
152+

153+
Example: [".py", ".tsx", ".md"]""",
154+
)
155+

156+
page: int = Field(
157+
default=1,
158+
description="""Page number for paginated results (1-based indexing).
159+

160+
- Use with files_per_page to navigate large result sets
161+
- If page exceeds available pages, returns last available page
162+
- Note: When falling back to semantic search, pagination is not supported
163+

164+
Example: page=2 with files_per_page=10 shows files 11-20""",
165+
)
166+

167+
files_per_page: int = Field(
168+
default=10,
169+
description="""Number of files to show per page.
170+

171+
- Each file can contain multiple matching lines
172+
- Reasonable values are between 5 and 50
173+
- Larger values may impact performance
174+
- When falling back to semantic search, this becomes the number of semantic results
175+

176+
Example: files_per_page=20 shows up to 20 files with matches""",
177+
)
178+

179+
use_regex: bool = Field(
180+
default=False,
181+
description="""Whether to treat the query as a regex pattern.
182+

183+
- False (default): Simple text search, case-insensitive
184+
- True: Full regex syntax, case-sensitive
185+
- Invalid regex patterns will return an error
186+
- Note: Semantic fallback is used regardless of this setting when no matches found
187+

188+
Example: Set to True to use patterns like "test_.*_func.*" """,
119189
)
120-
target_directories: Optional[list[str]] = Field(default=None, description="Optional list of directories to search in")
121-
file_extensions: Optional[list[str]] = Field(default=None, description="Optional list of file extensions to search (e.g. ['.py', '.ts'])")
122-
page: int = Field(default=1, description="Page number to return (1-based, default: 1)")
123-
files_per_page: int = Field(default=10, description="Number of files to return per page (default: 10)")
124-
use_regex: bool = Field(default=False, description="Whether to treat query as a regex pattern (default: False)")
125190

126191

127192
class SearchTool(BaseTool):
128193
"""Tool for searching the codebase."""
129194

130195
name: ClassVar[str] = "search"
131-
description: ClassVar[str] = "Search the codebase using text search or regex pattern matching"
196+
description: ClassVar[str] = r"""Search the codebase using text search or regex pattern matching.
197+
198+
This tool provides powerful text-based search capabilities across your codebase,
199+
with support for both simple text matching and regular expressions. It uses ripgrep
200+
when available for high-performance searches.
201+
202+
If no exact matches are found, automatically falls back to semantic search to find
203+
relevant code even without exact text matches.
204+
205+
Features:
206+
- Plain text or regex pattern matching
207+
- Directory and file type filtering
208+
- Paginated results for large codebases
209+
- Case-insensitive by default for simple text searches
210+
- Semantic fallback for finding related code
211+
212+
Example queries:
213+
1. Simple text: "function calculateTotal" (matches exactly, case-insensitive)
214+
2. Regex: "def.*calculate.*\(.*\)" (with use_regex=True)
215+
3. File-specific: "TODO" with file_extensions=[".py", ".ts"]
216+
4. Directory-specific: "api" with target_directories=["src/backend"]
217+
"""
132218
args_schema: ClassVar[type[BaseModel]] = SearchInput
133219
codebase: Codebase = Field(exclude=True)
134220

@@ -151,7 +237,27 @@ class EditFileTool(BaseTool):
151237
"""Tool for editing files."""
152238

153239
name: ClassVar[str] = "edit_file"
154-
description: ClassVar[str] = "Edit a file by replacing its entire content. This tool should only be used for replacing entire file contents."
240+
# Describes the edit tool only. The previous text also carried a copy of the
# search tool's description (ripgrep, regex examples, pagination), which
# misdescribed what this tool does.
description: ClassVar[str] = "Edit a file by replacing its entire content. This tool should only be used for replacing entire file contents."
155261
args_schema: ClassVar[type[BaseModel]] = EditFileInput
156262
codebase: Codebase = Field(exclude=True)
157263

@@ -741,7 +847,7 @@ def get_workspace_tools(codebase: Codebase) -> list["BaseTool"]:
741847
RunBashCommandTool(), # Note: This tool doesn't need the codebase
742848
SearchTool(codebase),
743849
# SemanticEditTool(codebase),
744-
SemanticSearchTool(codebase),
850+
# SemanticSearchTool(codebase),
745851
ViewFileTool(codebase),
746852
RelaceEditTool(codebase),
747853
ReflectionTool(codebase),
@@ -761,14 +867,26 @@ def get_workspace_tools(codebase: Codebase) -> list["BaseTool"]:
761867

762868

763869
class ReplacementEditInput(BaseModel):
    # Argument schema for regex-based replacement editing.
    #
    # Descriptions that mention backreferences are raw strings: in a normal
    # string literal "\1" and "\2" are octal escapes and would put the control
    # characters chr(1) and chr(2) into the text instead of a backslash and digit.

    filepath: str = Field(..., description="Path to the file to edit relative to the workspace root. The file must exist and be a text file.")
    pattern: str = Field(
        ...,
        description=r"Regular expression pattern to match text that should be replaced. Supports all Python regex syntax including capture groups (\1, \2, etc). The pattern is compiled with re.MULTILINE flag by default.",
    )
    replacement: str = Field(
        ...,
        description=r"Text to replace matched patterns with. Can reference regex capture groups using \1, \2, etc. If using regex groups in pattern, make sure to preserve them in replacement if needed.",
    )
    start: int = Field(
        default=1, description="Starting line number (1-indexed, inclusive) to begin replacements from. Use this with 'end' to limit changes to a specific region. Default is 1 (start of file)."
    )
    end: int = Field(
        default=-1,
        description="Ending line number (1-indexed, inclusive) to stop replacements at. Use -1 to indicate end of file. Use this with 'start' to limit changes to a specific region. Default is -1 (end of file).",
    )
    count: Optional[int] = Field(
        default=None,
        description="Maximum number of replacements to make. Use None to replace all occurrences (default), or specify a number to limit replacements. Useful when you only want to replace the first N occurrences.",
    )
772890

773891

774892
class ReplacementEditTool(BaseTool):

0 commit comments

Comments
 (0)