-
Notifications
You must be signed in to change notification settings - Fork 3
⚡ Bolt: [performance improvement] Optimize keyword delimiter matching #281
New issue
Have a question about this project? Sign up for a free GitHub account to open an issue and contact its maintainers and the community.
By clicking “Sign up for GitHub”, you agree to our terms of service and privacy statement. We’ll occasionally send you account related emails.
Already on GitHub? Sign in to your account
Changes from all commits
File filter
Filter by extension
Conversations
Jump to
Diff view
Diff view
There are no files selected for viewing
| Original file line number | Diff line number | Diff line change |
|---|---|---|
|
|
@@ -451,6 +451,9 @@ def _match_keyword_delimiters( | |
| # Filter out delimiters with empty start strings - they match everywhere! | ||
| keyword_delimiters = [d for d in keyword_delimiters if d.start] | ||
|
|
||
| if not keyword_delimiters: | ||
| return matches | ||
|
|
||
| # Define structural delimiters that can complete keywords | ||
| # Map opening structural chars to their closing counterparts | ||
| structural_pairs = { | ||
|
|
@@ -459,49 +462,56 @@ def _match_keyword_delimiters( | |
| "=>": "", # Arrow functions often have expression bodies | ||
| } | ||
|
|
||
| for delimiter in keyword_delimiters: | ||
| # Find all keyword occurrences using word boundary matching | ||
| pattern = rf"\b{re.escape(delimiter.start)}\b" | ||
| # Optimization: Combine all keyword start strings into a single compiled regex pattern. | ||
| # This allows us to make a single pass over the content rather than iterating over | ||
| # `re.finditer` for each keyword delimiter individually, significantly reducing overhead. | ||
| start_strings = [d.start for d in keyword_delimiters] | ||
| combined_pattern = re.compile(rf"\b(?:{'|'.join(map(re.escape, start_strings))})\b") | ||
|
|
||
| for match in re.finditer(pattern, content): | ||
| keyword_pos = match.start() | ||
| # Create a mapping to quickly look up the delimiter by its matched start string | ||
| delimiter_map = {d.start: d for d in keyword_delimiters} | ||
|
Comment on lines
+465
to
+472
|
||
|
|
||
| # Skip if keyword is inside a string or comment | ||
| if self._is_inside_string_or_comment(content, keyword_pos): | ||
| continue | ||
| for match in combined_pattern.finditer(content): | ||
| matched_text = match.group(0) | ||
| delimiter = delimiter_map[matched_text] | ||
| keyword_pos = match.start() | ||
|
Contributor
There was a problem hiding this comment. Choose a reason for hiding this comment. The reason will be displayed to describe this comment to others. Learn more. suggestion (performance): The `set(structural_pairs.keys())` expression is evaluated inside the match loop, so the set is recreated on every iteration. Hoist it out of the loop and build it once. Suggested implementation: allowed_structurals = set(structural_pairs.keys())
for match in combined_pattern.finditer(content):
matched_text = match.group(0)
delimiter = delimiter_map[matched_text]
    keyword_pos = match.start()
Within the same method where this loop lives, find the call(s) to:
Comment on lines
+468
to
+477
|
||
|
|
||
| # Find the next structural opening after the keyword | ||
| struct_start, struct_char = self._find_next_structural_with_char( | ||
| content, | ||
| start=keyword_pos + len(delimiter.start), | ||
| allowed=set(structural_pairs.keys()), | ||
| ) | ||
| # Skip if keyword is inside a string or comment | ||
| if self._is_inside_string_or_comment(content, keyword_pos): | ||
| continue | ||
|
|
||
| if struct_start is None: | ||
| continue | ||
| # Find the next structural opening after the keyword | ||
| struct_start, struct_char = self._find_next_structural_with_char( | ||
| content, | ||
| start=keyword_pos + len(delimiter.start), | ||
| allowed=set(structural_pairs.keys()), | ||
| ) | ||
|
|
||
| # Find the matching closing delimiter for the structural character | ||
| struct_end = self._find_matching_close( | ||
| content, | ||
| struct_start, | ||
| struct_char or "", | ||
| structural_pairs.get(cast(str, struct_char), ""), | ||
| ) | ||
| if struct_start is None: | ||
| continue | ||
|
|
||
| if struct_end is not None: | ||
| # Calculate nesting level by counting parent structures | ||
| nesting_level = self._calculate_nesting_level(content, keyword_pos) | ||
| # Find the matching closing delimiter for the structural character | ||
| struct_end = self._find_matching_close( | ||
| content, | ||
| struct_start, | ||
| struct_char or "", | ||
| structural_pairs.get(cast(str, struct_char), ""), | ||
| ) | ||
|
|
||
| # Create a complete match from keyword to closing structure | ||
| # This represents the entire construct (e.g., function...}) | ||
| matches.append( | ||
| DelimiterMatch( | ||
| delimiter=delimiter, | ||
| start_pos=keyword_pos, | ||
| end_pos=struct_end, | ||
| nesting_level=nesting_level, | ||
| ) | ||
| if struct_end is not None: | ||
| # Calculate nesting level by counting parent structures | ||
| nesting_level = self._calculate_nesting_level(content, keyword_pos) | ||
|
|
||
| # Create a complete match from keyword to closing structure | ||
| # This represents the entire construct (e.g., function...}) | ||
| matches.append( | ||
| DelimiterMatch( | ||
| delimiter=delimiter, | ||
| start_pos=keyword_pos, | ||
| end_pos=struct_end, | ||
| nesting_level=nesting_level, | ||
| ) | ||
| ) | ||
|
|
||
| return matches | ||
|
|
||
|
|
@@ -1280,7 +1290,7 @@ def _load_custom_delimiters( | |
| patterns when merged with the full delimiter list. | ||
|
|
||
| Args: | ||
| normalized_language: Snake-case normalised language identifier. | ||
| normalized_language: Snake-case normalized language identifier. | ||
| language: Original language string (used for logging only). | ||
|
|
||
| Returns: | ||
|
|
||
There was a problem hiding this comment.
Choose a reason for hiding this comment
The reason will be displayed to describe this comment to others. Learn more.
issue (bug_risk): Handling duplicate keyword start strings may change behavior compared to the previous implementation.
Previously, each delimiter was processed independently with
`re.finditer`, so duplicate `start` values would each be applied (possibly with different end behavior/metadata). With `delimiter_map = {d.start: d for d in keyword_delimiters}`, only the last delimiter for a given `start` is ever used. If duplicate `start` values are valid in this context, this changes semantics. Consider either enforcing unique `start` values up front and failing fast, or mapping each `start` to a list of delimiters to preserve the prior behavior.