From b8ee8d4a3851c77558706596707053aba28dbab8 Mon Sep 17 00:00:00 2001 From: "google-labs-jules[bot]" <161369871+google-labs-jules[bot]@users.noreply.github.com> Date: Fri, 27 Mar 2026 12:40:04 +0000 Subject: [PATCH 1/3] =?UTF-8?q?=E2=9A=A1=20Bolt:=20[performance=20improvem?= =?UTF-8?q?ent]=20Optimize=20keyword=20delimiter=20matching?= MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit This commit modifies `_match_keyword_delimiters` in `src/codeweaver/engine/chunker/delimiter.py` to significantly improve chunking performance. Instead of calling `re.finditer` for every individual keyword delimiter, the optimization combines all start strings into a single compiled regex pattern. This reduces regex execution overhead and limits the algorithm to making a single pass over the content. Additionally, an early return checks for empty lists to prevent compiling a dangerous empty regex. Co-authored-by: bashandbone <89049923+bashandbone@users.noreply.github.com> --- src/codeweaver/engine/chunker/delimiter.py | 82 ++++++++++++---------- 1 file changed, 46 insertions(+), 36 deletions(-) diff --git a/src/codeweaver/engine/chunker/delimiter.py b/src/codeweaver/engine/chunker/delimiter.py index 07b00994..f9cd403d 100644 --- a/src/codeweaver/engine/chunker/delimiter.py +++ b/src/codeweaver/engine/chunker/delimiter.py @@ -451,6 +451,9 @@ def _match_keyword_delimiters( # Filter out delimiters with empty start strings - they match everywhere! 
keyword_delimiters = [d for d in keyword_delimiters if d.start] + if not keyword_delimiters: + return matches + # Define structural delimiters that can complete keywords # Map opening structural chars to their closing counterparts structural_pairs = { @@ -459,49 +462,56 @@ def _match_keyword_delimiters( "=>": "", # Arrow functions often have expression bodies } - for delimiter in keyword_delimiters: - # Find all keyword occurrences using word boundary matching - pattern = rf"\b{re.escape(delimiter.start)}\b" + # Optimization: Combine all keyword start strings into a single compiled regex pattern. + # This allows us to make a single pass over the content rather than iterating over + # `re.finditer` for each keyword delimiter individually, significantly reducing overhead. + start_strings = [d.start for d in keyword_delimiters] + combined_pattern = re.compile(rf"\b(?:{'|'.join(map(re.escape, start_strings))})\b") - for match in re.finditer(pattern, content): - keyword_pos = match.start() + # Create a mapping to quickly look up the delimiter by its matched start string + delimiter_map = {d.start: d for d in keyword_delimiters} - # Skip if keyword is inside a string or comment - if self._is_inside_string_or_comment(content, keyword_pos): - continue + for match in combined_pattern.finditer(content): + matched_text = match.group(0) + delimiter = delimiter_map[matched_text] + keyword_pos = match.start() - # Find the next structural opening after the keyword - struct_start, struct_char = self._find_next_structural_with_char( - content, - start=keyword_pos + len(delimiter.start), - allowed=set(structural_pairs.keys()), - ) + # Skip if keyword is inside a string or comment + if self._is_inside_string_or_comment(content, keyword_pos): + continue - if struct_start is None: - continue + # Find the next structural opening after the keyword + struct_start, struct_char = self._find_next_structural_with_char( + content, + start=keyword_pos + len(delimiter.start), + 
allowed=set(structural_pairs.keys()), + ) - # Find the matching closing delimiter for the structural character - struct_end = self._find_matching_close( - content, - struct_start, - struct_char or "", - structural_pairs.get(cast(str, struct_char), ""), - ) + if struct_start is None: + continue - if struct_end is not None: - # Calculate nesting level by counting parent structures - nesting_level = self._calculate_nesting_level(content, keyword_pos) + # Find the matching closing delimiter for the structural character + struct_end = self._find_matching_close( + content, + struct_start, + struct_char or "", + structural_pairs.get(cast(str, struct_char), ""), + ) - # Create a complete match from keyword to closing structure - # This represents the entire construct (e.g., function...}) - matches.append( - DelimiterMatch( - delimiter=delimiter, - start_pos=keyword_pos, - end_pos=struct_end, - nesting_level=nesting_level, - ) + if struct_end is not None: + # Calculate nesting level by counting parent structures + nesting_level = self._calculate_nesting_level(content, keyword_pos) + + # Create a complete match from keyword to closing structure + # This represents the entire construct (e.g., function...}) + matches.append( + DelimiterMatch( + delimiter=delimiter, + start_pos=keyword_pos, + end_pos=struct_end, + nesting_level=nesting_level, ) + ) return matches @@ -1280,7 +1290,7 @@ def _load_custom_delimiters( patterns when merged with the full delimiter list. Args: - normalized_language: Snake-case normalised language identifier. + normalized_language: Snake-case normalized language identifier. language: Original language string (used for logging only). 
Returns: From 196c3ad557f0d46a16f2d0b7c20dde9dead01adf Mon Sep 17 00:00:00 2001 From: Edward Yi <41576951+aiedwardyi@users.noreply.github.com> Date: Sat, 28 Mar 2026 17:16:00 +0900 Subject: [PATCH 2/3] fix: preserve duplicate keyword delimiters --- src/codeweaver/engine/chunker/delimiter.py | 65 ++++++++++--------- .../chunker/test_delimiter_edge_cases.py | 37 +++++++++++ 2 files changed, 71 insertions(+), 31 deletions(-) diff --git a/src/codeweaver/engine/chunker/delimiter.py b/src/codeweaver/engine/chunker/delimiter.py index f9cd403d..01d4bdb1 100644 --- a/src/codeweaver/engine/chunker/delimiter.py +++ b/src/codeweaver/engine/chunker/delimiter.py @@ -465,53 +465,56 @@ def _match_keyword_delimiters( # Optimization: Combine all keyword start strings into a single compiled regex pattern. # This allows us to make a single pass over the content rather than iterating over # `re.finditer` for each keyword delimiter individually, significantly reducing overhead. - start_strings = [d.start for d in keyword_delimiters] + start_strings = list(dict.fromkeys(d.start for d in keyword_delimiters)) combined_pattern = re.compile(rf"\b(?:{'|'.join(map(re.escape, start_strings))})\b") - # Create a mapping to quickly look up the delimiter by its matched start string - delimiter_map = {d.start: d for d in keyword_delimiters} + # Group delimiters by matched start string so duplicate keyword starts + # preserve the original "process each delimiter independently" behavior. 
+ delimiter_map: dict[str, list[Delimiter]] = {} + for delimiter in keyword_delimiters: + delimiter_map.setdefault(delimiter.start, []).append(delimiter) for match in combined_pattern.finditer(content): matched_text = match.group(0) - delimiter = delimiter_map[matched_text] keyword_pos = match.start() # Skip if keyword is inside a string or comment if self._is_inside_string_or_comment(content, keyword_pos): continue - # Find the next structural opening after the keyword - struct_start, struct_char = self._find_next_structural_with_char( - content, - start=keyword_pos + len(delimiter.start), - allowed=set(structural_pairs.keys()), - ) + for delimiter in delimiter_map[matched_text]: + # Find the next structural opening after the keyword + struct_start, struct_char = self._find_next_structural_with_char( + content, + start=keyword_pos + len(delimiter.start), + allowed=set(structural_pairs.keys()), + ) - if struct_start is None: - continue + if struct_start is None: + continue - # Find the matching closing delimiter for the structural character - struct_end = self._find_matching_close( - content, - struct_start, - struct_char or "", - structural_pairs.get(cast(str, struct_char), ""), - ) + # Find the matching closing delimiter for the structural character + struct_end = self._find_matching_close( + content, + struct_start, + struct_char or "", + structural_pairs.get(cast(str, struct_char), ""), + ) - if struct_end is not None: - # Calculate nesting level by counting parent structures - nesting_level = self._calculate_nesting_level(content, keyword_pos) + if struct_end is not None: + # Calculate nesting level by counting parent structures + nesting_level = self._calculate_nesting_level(content, keyword_pos) - # Create a complete match from keyword to closing structure - # This represents the entire construct (e.g., function...}) - matches.append( - DelimiterMatch( - delimiter=delimiter, - start_pos=keyword_pos, - end_pos=struct_end, - nesting_level=nesting_level, + # 
Create a complete match from keyword to closing structure + # This represents the entire construct (e.g., function...}) + matches.append( + DelimiterMatch( + delimiter=delimiter, + start_pos=keyword_pos, + end_pos=struct_end, + nesting_level=nesting_level, + ) ) - ) return matches diff --git a/tests/unit/engine/chunker/test_delimiter_edge_cases.py b/tests/unit/engine/chunker/test_delimiter_edge_cases.py index ca800938..12b26439 100644 --- a/tests/unit/engine/chunker/test_delimiter_edge_cases.py +++ b/tests/unit/engine/chunker/test_delimiter_edge_cases.py @@ -16,7 +16,9 @@ import pytest +from codeweaver.core.types import DelimiterKind from codeweaver.engine import ChunkGovernor, DelimiterChunker +from codeweaver.engine.chunker.delimiter_model import Delimiter pytestmark = [pytest.mark.unit] @@ -208,6 +210,41 @@ def test_take_whole_lines_expansion( class TestUnusualPatterns: """Test delimiter chunker with unusual patterns.""" + def test_duplicate_keyword_starts_preserve_all_delimiters( + self, delimiter_chunker: DelimiterChunker + ) -> None: + """Verify duplicate keyword starts keep all matching delimiters. + + Keyword delimiters with the same start string used to be processed + independently. The optimized single-regex path must preserve that + behavior instead of collapsing duplicates down to the last delimiter. 
+ """ + content = """type Person struct { + name string +}""" + keyword_delimiters = [ + Delimiter( + start="type", + end="", + kind=DelimiterKind.STRUCT, + priority=DelimiterKind.STRUCT.default_priority, + ), + Delimiter( + start="type", + end="", + kind=DelimiterKind.TYPE_ALIAS, + priority=DelimiterKind.TYPE_ALIAS.default_priority, + ), + ] + + matches = delimiter_chunker._match_keyword_delimiters(content, keyword_delimiters) + + assert len(matches) == 2, "Duplicate keyword starts should preserve all delimiters" + assert {match.delimiter.kind for match in matches} == { + DelimiterKind.STRUCT, + DelimiterKind.TYPE_ALIAS, + } + def test_nested_delimiter_structures( self, delimiter_chunker: DelimiterChunker, tmp_path: Path ) -> None: From bcce8bd842f5696dd42067a4361b0e18af44643b Mon Sep 17 00:00:00 2001 From: Adam Poulemanos <89049923+bashandbone@users.noreply.github.com> Date: Sat, 28 Mar 2026 16:47:17 -0400 Subject: [PATCH 3/3] Optimize allowed_keys assignment for structural pairs hoist construction of the structural_pairs key set above the loop, use frozenset for thread safety Signed-off-by: Adam Poulemanos <89049923+bashandbone@users.noreply.github.com> --- src/codeweaver/engine/chunker/delimiter.py | 6 +++--- 1 file changed, 3 insertions(+), 3 deletions(-) diff --git a/src/codeweaver/engine/chunker/delimiter.py b/src/codeweaver/engine/chunker/delimiter.py index 01d4bdb1..0d8f4d90 100644 --- a/src/codeweaver/engine/chunker/delimiter.py +++ b/src/codeweaver/engine/chunker/delimiter.py @@ -461,13 +461,13 @@ def _match_keyword_delimiters( ":": "\n", # Python uses : followed by indented block (simplified to newline) "=>": "", # Arrow functions often have expression bodies } - + allowed_keys = frozenset(structural_pairs.keys()) # Optimization: Combine all keyword start strings into a single compiled regex pattern. 
# This allows us to make a single pass over the content rather than iterating over # `re.finditer` for each keyword delimiter individually, significantly reducing overhead. start_strings = list(dict.fromkeys(d.start for d in keyword_delimiters)) combined_pattern = re.compile(rf"\b(?:{'|'.join(map(re.escape, start_strings))})\b") - + # Group delimiters by matched start string so duplicate keyword starts # preserve the original "process each delimiter independently" behavior. delimiter_map: dict[str, list[Delimiter]] = {} @@ -487,7 +487,7 @@ def _match_keyword_delimiters( struct_start, struct_char = self._find_next_structural_with_char( content, start=keyword_pos + len(delimiter.start), - allowed=set(structural_pairs.keys()), + allowed=allowed_keys, ) if struct_start is None: