From b8ee8d4a3851c77558706596707053aba28dbab8 Mon Sep 17 00:00:00 2001
From: "google-labs-jules[bot]"
 <161369871+google-labs-jules[bot]@users.noreply.github.com>
Date: Fri, 27 Mar 2026 12:40:04 +0000
Subject: [PATCH 1/3] =?UTF-8?q?=E2=9A=A1=20Bolt:=20[performance=20improvem?=
 =?UTF-8?q?ent]=20Optimize=20keyword=20delimiter=20matching?=
MIME-Version: 1.0
Content-Type: text/plain; charset=UTF-8
Content-Transfer-Encoding: 8bit

This commit modifies `_match_keyword_delimiters` in `src/codeweaver/engine/chunker/delimiter.py` to significantly improve chunking performance.

Instead of calling `re.finditer` once per keyword delimiter, the optimization combines all start strings into a single compiled regex alternation. This reduces regex execution overhead and lets the algorithm scan the content in a single pass. Additionally, an early return when no keyword delimiters remain prevents compiling an empty alternation (`\b(?:)\b`), which would match the empty string at every word boundary.

Also normalizes a docstring spelling ("normalised" -> "normalized") in `_load_custom_delimiters`.

Co-authored-by: bashandbone <89049923+bashandbone@users.noreply.github.com>
---
 src/codeweaver/engine/chunker/delimiter.py | 82 ++++++++++++----------
 1 file changed, 46 insertions(+), 36 deletions(-)

diff --git a/src/codeweaver/engine/chunker/delimiter.py b/src/codeweaver/engine/chunker/delimiter.py
index 07b00994..f9cd403d 100644
--- a/src/codeweaver/engine/chunker/delimiter.py
+++ b/src/codeweaver/engine/chunker/delimiter.py
@@ -451,6 +451,9 @@ def _match_keyword_delimiters(
         # Filter out delimiters with empty start strings - they match everywhere!
         keyword_delimiters = [d for d in keyword_delimiters if d.start]
 
+        if not keyword_delimiters:
+            return matches
+
         # Define structural delimiters that can complete keywords
         # Map opening structural chars to their closing counterparts
         structural_pairs = {
@@ -459,49 +462,56 @@ def _match_keyword_delimiters(
             "=>": "",  # Arrow functions often have expression bodies
         }
 
-        for delimiter in keyword_delimiters:
-            # Find all keyword occurrences using word boundary matching
-            pattern = rf"\b{re.escape(delimiter.start)}\b"
+        # Optimization: Combine all keyword start strings into a single compiled regex pattern.
+        # This allows us to make a single pass over the content rather than iterating over
+        # `re.finditer` for each keyword delimiter individually, significantly reducing overhead.
+        start_strings = [d.start for d in keyword_delimiters]
+        combined_pattern = re.compile(rf"\b(?:{'|'.join(map(re.escape, start_strings))})\b")
 
-            for match in re.finditer(pattern, content):
-                keyword_pos = match.start()
+        # Create a mapping to quickly look up the delimiter by its matched start string
+        delimiter_map = {d.start: d for d in keyword_delimiters}
 
-                # Skip if keyword is inside a string or comment
-                if self._is_inside_string_or_comment(content, keyword_pos):
-                    continue
+        for match in combined_pattern.finditer(content):
+            matched_text = match.group(0)
+            delimiter = delimiter_map[matched_text]
+            keyword_pos = match.start()
 
-                # Find the next structural opening after the keyword
-                struct_start, struct_char = self._find_next_structural_with_char(
-                    content,
-                    start=keyword_pos + len(delimiter.start),
-                    allowed=set(structural_pairs.keys()),
-                )
+            # Skip if keyword is inside a string or comment
+            if self._is_inside_string_or_comment(content, keyword_pos):
+                continue
 
-                if struct_start is None:
-                    continue
+            # Find the next structural opening after the keyword
+            struct_start, struct_char = self._find_next_structural_with_char(
+                content,
+                start=keyword_pos + len(delimiter.start),
+                allowed=set(structural_pairs.keys()),
+            )
 
-                # Find the matching closing delimiter for the structural character
-                struct_end = self._find_matching_close(
-                    content,
-                    struct_start,
-                    struct_char or "",
-                    structural_pairs.get(cast(str, struct_char), ""),
-                )
+            if struct_start is None:
+                continue
 
-                if struct_end is not None:
-                    # Calculate nesting level by counting parent structures
-                    nesting_level = self._calculate_nesting_level(content, keyword_pos)
+            # Find the matching closing delimiter for the structural character
+            struct_end = self._find_matching_close(
+                content,
+                struct_start,
+                struct_char or "",
+                structural_pairs.get(cast(str, struct_char), ""),
+            )
 
-                    # Create a complete match from keyword to closing structure
-                    # This represents the entire construct (e.g., function...})
-                    matches.append(
-                        DelimiterMatch(
-                            delimiter=delimiter,
-                            start_pos=keyword_pos,
-                            end_pos=struct_end,
-                            nesting_level=nesting_level,
-                        )
+            if struct_end is not None:
+                # Calculate nesting level by counting parent structures
+                nesting_level = self._calculate_nesting_level(content, keyword_pos)
+
+                # Create a complete match from keyword to closing structure
+                # This represents the entire construct (e.g., function...})
+                matches.append(
+                    DelimiterMatch(
+                        delimiter=delimiter,
+                        start_pos=keyword_pos,
+                        end_pos=struct_end,
+                        nesting_level=nesting_level,
                     )
+                )
 
         return matches
 
@@ -1280,7 +1290,7 @@ def _load_custom_delimiters(
         patterns when merged with the full delimiter list.
 
         Args:
-            normalized_language: Snake-case normalised language identifier.
+            normalized_language: Snake-case normalized language identifier.
             language: Original language string (used for logging only).
 
         Returns:

From 196c3ad557f0d46a16f2d0b7c20dde9dead01adf Mon Sep 17 00:00:00 2001
From: Edward Yi <41576951+aiedwardyi@users.noreply.github.com>
Date: Sat, 28 Mar 2026 17:16:00 +0900
Subject: [PATCH 2/3] fix: preserve duplicate keyword delimiters

---
 src/codeweaver/engine/chunker/delimiter.py    | 65 ++++++++++---------
 .../chunker/test_delimiter_edge_cases.py      | 37 +++++++++++
 2 files changed, 71 insertions(+), 31 deletions(-)

diff --git a/src/codeweaver/engine/chunker/delimiter.py b/src/codeweaver/engine/chunker/delimiter.py
index f9cd403d..01d4bdb1 100644
--- a/src/codeweaver/engine/chunker/delimiter.py
+++ b/src/codeweaver/engine/chunker/delimiter.py
@@ -465,53 +465,56 @@ def _match_keyword_delimiters(
         # Optimization: Combine all keyword start strings into a single compiled regex pattern.
         # This allows us to make a single pass over the content rather than iterating over
         # `re.finditer` for each keyword delimiter individually, significantly reducing overhead.
-        start_strings = [d.start for d in keyword_delimiters]
+        start_strings = list(dict.fromkeys(d.start for d in keyword_delimiters))
         combined_pattern = re.compile(rf"\b(?:{'|'.join(map(re.escape, start_strings))})\b")
 
-        # Create a mapping to quickly look up the delimiter by its matched start string
-        delimiter_map = {d.start: d for d in keyword_delimiters}
+        # Group delimiters by matched start string so duplicate keyword starts
+        # preserve the original "process each delimiter independently" behavior.
+        delimiter_map: dict[str, list[Delimiter]] = {}
+        for delimiter in keyword_delimiters:
+            delimiter_map.setdefault(delimiter.start, []).append(delimiter)
 
         for match in combined_pattern.finditer(content):
             matched_text = match.group(0)
-            delimiter = delimiter_map[matched_text]
             keyword_pos = match.start()
 
             # Skip if keyword is inside a string or comment
             if self._is_inside_string_or_comment(content, keyword_pos):
                 continue
 
-            # Find the next structural opening after the keyword
-            struct_start, struct_char = self._find_next_structural_with_char(
-                content,
-                start=keyword_pos + len(delimiter.start),
-                allowed=set(structural_pairs.keys()),
-            )
+            for delimiter in delimiter_map[matched_text]:
+                # Find the next structural opening after the keyword
+                struct_start, struct_char = self._find_next_structural_with_char(
+                    content,
+                    start=keyword_pos + len(delimiter.start),
+                    allowed=set(structural_pairs.keys()),
+                )
 
-            if struct_start is None:
-                continue
+                if struct_start is None:
+                    continue
 
-            # Find the matching closing delimiter for the structural character
-            struct_end = self._find_matching_close(
-                content,
-                struct_start,
-                struct_char or "",
-                structural_pairs.get(cast(str, struct_char), ""),
-            )
+                # Find the matching closing delimiter for the structural character
+                struct_end = self._find_matching_close(
+                    content,
+                    struct_start,
+                    struct_char or "",
+                    structural_pairs.get(cast(str, struct_char), ""),
+                )
 
-            if struct_end is not None:
-                # Calculate nesting level by counting parent structures
-                nesting_level = self._calculate_nesting_level(content, keyword_pos)
+                if struct_end is not None:
+                    # Calculate nesting level by counting parent structures
+                    nesting_level = self._calculate_nesting_level(content, keyword_pos)
 
-                # Create a complete match from keyword to closing structure
-                # This represents the entire construct (e.g., function...})
-                matches.append(
-                    DelimiterMatch(
-                        delimiter=delimiter,
-                        start_pos=keyword_pos,
-                        end_pos=struct_end,
-                        nesting_level=nesting_level,
+                    # Create a complete match from keyword to closing structure
+                    # This represents the entire construct (e.g., function...})
+                    matches.append(
+                        DelimiterMatch(
+                            delimiter=delimiter,
+                            start_pos=keyword_pos,
+                            end_pos=struct_end,
+                            nesting_level=nesting_level,
+                        )
                     )
-                )
 
         return matches
 
diff --git a/tests/unit/engine/chunker/test_delimiter_edge_cases.py b/tests/unit/engine/chunker/test_delimiter_edge_cases.py
index ca800938..12b26439 100644
--- a/tests/unit/engine/chunker/test_delimiter_edge_cases.py
+++ b/tests/unit/engine/chunker/test_delimiter_edge_cases.py
@@ -16,7 +16,9 @@
 
 import pytest
 
+from codeweaver.core.types import DelimiterKind
 from codeweaver.engine import ChunkGovernor, DelimiterChunker
+from codeweaver.engine.chunker.delimiter_model import Delimiter
 
 
 pytestmark = [pytest.mark.unit]
@@ -208,6 +210,41 @@ def test_take_whole_lines_expansion(
 class TestUnusualPatterns:
     """Test delimiter chunker with unusual patterns."""
 
+    def test_duplicate_keyword_starts_preserve_all_delimiters(
+        self, delimiter_chunker: DelimiterChunker
+    ) -> None:
+        """Verify duplicate keyword starts keep all matching delimiters.
+
+        Keyword delimiters with the same start string used to be processed
+        independently. The optimized single-regex path must preserve that
+        behavior instead of collapsing duplicates down to the last delimiter.
+        """
+        content = """type Person struct {
+    name string
+}"""
+        keyword_delimiters = [
+            Delimiter(
+                start="type",
+                end="",
+                kind=DelimiterKind.STRUCT,
+                priority=DelimiterKind.STRUCT.default_priority,
+            ),
+            Delimiter(
+                start="type",
+                end="",
+                kind=DelimiterKind.TYPE_ALIAS,
+                priority=DelimiterKind.TYPE_ALIAS.default_priority,
+            ),
+        ]
+
+        matches = delimiter_chunker._match_keyword_delimiters(content, keyword_delimiters)
+
+        assert len(matches) == 2, "Duplicate keyword starts should preserve all delimiters"
+        assert {match.delimiter.kind for match in matches} == {
+            DelimiterKind.STRUCT,
+            DelimiterKind.TYPE_ALIAS,
+        }
+
     def test_nested_delimiter_structures(
         self, delimiter_chunker: DelimiterChunker, tmp_path: Path
     ) -> None:

From bcce8bd842f5696dd42067a4361b0e18af44643b Mon Sep 17 00:00:00 2001
From: Adam Poulemanos <89049923+bashandbone@users.noreply.github.com>
Date: Sat, 28 Mar 2026 16:47:17 -0400
Subject: [PATCH 3/3] Optimize allowed_keys assignment for structural pairs

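Hoist the allowed-keys set (derived from structural_pairs) above the loop so it is built once per call instead of once per match, and make it a frozenset so the shared value is immutable (thread safety).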

Signed-off-by: Adam Poulemanos <89049923+bashandbone@users.noreply.github.com>
---
 src/codeweaver/engine/chunker/delimiter.py | 6 +++---
 1 file changed, 3 insertions(+), 3 deletions(-)

diff --git a/src/codeweaver/engine/chunker/delimiter.py b/src/codeweaver/engine/chunker/delimiter.py
index 01d4bdb1..0d8f4d90 100644
--- a/src/codeweaver/engine/chunker/delimiter.py
+++ b/src/codeweaver/engine/chunker/delimiter.py
@@ -461,13 +461,13 @@ def _match_keyword_delimiters(
             ":": "\n",  # Python uses : followed by indented block (simplified to newline)
             "=>": "",  # Arrow functions often have expression bodies
         }
-
+        allowed_keys = frozenset(structural_pairs.keys())
         # Optimization: Combine all keyword start strings into a single compiled regex pattern.
         # This allows us to make a single pass over the content rather than iterating over
         # `re.finditer` for each keyword delimiter individually, significantly reducing overhead.
         start_strings = list(dict.fromkeys(d.start for d in keyword_delimiters))
         combined_pattern = re.compile(rf"\b(?:{'|'.join(map(re.escape, start_strings))})\b")
-
+
         # Group delimiters by matched start string so duplicate keyword starts
         # preserve the original "process each delimiter independently" behavior.
         delimiter_map: dict[str, list[Delimiter]] = {}
@@ -487,7 +487,7 @@ def _match_keyword_delimiters(
                 struct_start, struct_char = self._find_next_structural_with_char(
                     content,
                     start=keyword_pos + len(delimiter.start),
-                    allowed=set(structural_pairs.keys()),
+                    allowed=allowed_keys,
                 )
 
                 if struct_start is None: