From cebf1c8686a164066582c38bb6f26698e5f9171f Mon Sep 17 00:00:00 2001 From: "google-labs-jules[bot]" <161369871+google-labs-jules[bot]@users.noreply.github.com> Date: Sat, 28 Mar 2026 12:37:25 +0000 Subject: [PATCH 1/3] =?UTF-8?q?=E2=9A=A1=20Bolt:=20Combine=20keyword=20del?= =?UTF-8?q?imiters=20into=20single=20regex=20pattern?= MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit Co-authored-by: bashandbone <89049923+bashandbone@users.noreply.github.com> --- src/codeweaver/engine/chunker/delimiter.py | 78 ++++++++++++---------- 1 file changed, 42 insertions(+), 36 deletions(-) diff --git a/src/codeweaver/engine/chunker/delimiter.py b/src/codeweaver/engine/chunker/delimiter.py index 07b00994..46a08f60 100644 --- a/src/codeweaver/engine/chunker/delimiter.py +++ b/src/codeweaver/engine/chunker/delimiter.py @@ -451,6 +451,9 @@ def _match_keyword_delimiters( # Filter out delimiters with empty start strings - they match everywhere! keyword_delimiters = [d for d in keyword_delimiters if d.start] + if not keyword_delimiters: + return matches + # Define structural delimiters that can complete keywords # Map opening structural chars to their closing counterparts structural_pairs = { @@ -459,49 +462,52 @@ def _match_keyword_delimiters( "=>": "", # Arrow functions often have expression bodies } - for delimiter in keyword_delimiters: - # Find all keyword occurrences using word boundary matching - pattern = rf"\b{re.escape(delimiter.start)}\b" + # Optimization: combining multiple keyword delimiters into a single compiled regex pattern + # using alternation (?:...) avoids looping over keywords with individual re.finditer calls. + delimiter_map = {d.start: d for d in keyword_delimiters} + combined_pattern = rf"\b(?:{'|'.join(re.escape(d.start) for d in keyword_delimiters)})\b" - for match in re.finditer(pattern, content): - keyword_pos = match.start() + for match in re.finditer(combined_pattern, content): + keyword_pos = match.start() + matched_text = match.group(0) + delimiter = delimiter_map[matched_text] - # Skip if keyword is inside a string or comment - if self._is_inside_string_or_comment(content, keyword_pos): - continue + # Skip if keyword is inside a string or comment + if self._is_inside_string_or_comment(content, keyword_pos): + continue - # Find the next structural opening after the keyword - struct_start, struct_char = self._find_next_structural_with_char( - content, - start=keyword_pos + len(delimiter.start), - allowed=set(structural_pairs.keys()), - ) + # Find the next structural opening after the keyword + struct_start, struct_char = self._find_next_structural_with_char( + content, + start=keyword_pos + len(delimiter.start), + allowed=set(structural_pairs.keys()), + ) - if struct_start is None: - continue + if struct_start is None: + continue - # Find the matching closing delimiter for the structural character - struct_end = self._find_matching_close( - content, - struct_start, - struct_char or "", - structural_pairs.get(cast(str, struct_char), ""), - ) + # Find the matching closing delimiter for the structural character + struct_end = self._find_matching_close( + content, + struct_start, + struct_char or "", + structural_pairs.get(cast(str, struct_char), ""), + ) - if struct_end is not None: - # Calculate nesting level by counting parent structures - nesting_level = self._calculate_nesting_level(content, keyword_pos) + if struct_end is not None: + # Calculate nesting level by counting parent structures + nesting_level = self._calculate_nesting_level(content, keyword_pos) - # Create a complete match from keyword to closing structure - # This represents the entire construct (e.g., function...}) - matches.append( - DelimiterMatch( - delimiter=delimiter, - start_pos=keyword_pos, - end_pos=struct_end, - nesting_level=nesting_level, - ) + # Create a complete match from keyword to closing structure + # This represents the entire construct (e.g., function...}) + matches.append( + DelimiterMatch( + delimiter=delimiter, + start_pos=keyword_pos, + end_pos=struct_end, + nesting_level=nesting_level, ) + ) return matches @@ -1280,7 +1286,7 @@ def _load_custom_delimiters( patterns when merged with the full delimiter list. Args: - normalized_language: Snake-case normalised language identifier. + normalized_language: Snake-case normalized language identifier. language: Original language string (used for logging only). Returns: From 33c7ffa623e3b2fb16764df057a8949c7129e564 Mon Sep 17 00:00:00 2001 From: "google-labs-jules[bot]" <161369871+google-labs-jules[bot]@users.noreply.github.com> Date: Sat, 28 Mar 2026 19:53:09 +0000 Subject: [PATCH 2/3] Fix dictionary collision mapping bug Co-authored-by: bashandbone <89049923+bashandbone@users.noreply.github.com> --- src/codeweaver/engine/chunker/delimiter.py | 73 ++++++++++++---------- 1 file changed, 40 insertions(+), 33 deletions(-) diff --git a/src/codeweaver/engine/chunker/delimiter.py b/src/codeweaver/engine/chunker/delimiter.py index 46a08f60..e318fd5c 100644 --- a/src/codeweaver/engine/chunker/delimiter.py +++ b/src/codeweaver/engine/chunker/delimiter.py @@ -464,50 +464,57 @@ def _match_keyword_delimiters( # Optimization: combining multiple keyword delimiters into a single compiled regex pattern # using alternation (?:...) avoids looping over keywords with individual re.finditer calls. - delimiter_map = {d.start: d for d in keyword_delimiters} - combined_pattern = rf"\b(?:{'|'.join(re.escape(d.start) for d in keyword_delimiters)})\b" + + # Build map of start strings to lists of delimiters to handle collisions + # e.g. 'type' or 'extension' being used for multiple structures + from collections import defaultdict + delimiter_map = defaultdict(list) + for d in keyword_delimiters: + delimiter_map[d.start].append(d) + + combined_pattern = rf"\b(?:{'|'.join(re.escape(start) for start in delimiter_map)})\b" for match in re.finditer(combined_pattern, content): keyword_pos = match.start() matched_text = match.group(0) - delimiter = delimiter_map[matched_text] - # Skip if keyword is inside a string or comment - if self._is_inside_string_or_comment(content, keyword_pos): - continue + for delimiter in delimiter_map[matched_text]: + # Skip if keyword is inside a string or comment + if self._is_inside_string_or_comment(content, keyword_pos): + continue - # Find the next structural opening after the keyword - struct_start, struct_char = self._find_next_structural_with_char( - content, - start=keyword_pos + len(delimiter.start), - allowed=set(structural_pairs.keys()), - ) + # Find the next structural opening after the keyword + struct_start, struct_char = self._find_next_structural_with_char( + content, + start=keyword_pos + len(delimiter.start), + allowed=set(structural_pairs.keys()), + ) - if struct_start is None: - continue + if struct_start is None: + continue - # Find the matching closing delimiter for the structural character - struct_end = self._find_matching_close( - content, - struct_start, - struct_char or "", - structural_pairs.get(cast(str, struct_char), ""), - ) + # Find the matching closing delimiter for the structural character + struct_end = self._find_matching_close( + content, + struct_start, + struct_char or "", + structural_pairs.get(cast(str, struct_char), ""), + ) - if struct_end is not None: - # Calculate nesting level by counting parent structures - nesting_level = self._calculate_nesting_level(content, keyword_pos) + if struct_end is not None: + # Calculate nesting level by counting parent structures + nesting_level = self._calculate_nesting_level(content, keyword_pos) - # Create a complete match from keyword to closing structure - # This represents the entire construct (e.g., function...}) - matches.append( - DelimiterMatch( - delimiter=delimiter, - start_pos=keyword_pos, - end_pos=struct_end, - nesting_level=nesting_level, + # Create a complete match from keyword to closing structure + # This represents the entire construct (e.g., function...}) + matches.append( + DelimiterMatch( + delimiter=delimiter, + start_pos=keyword_pos, + end_pos=struct_end, + nesting_level=nesting_level, + ) ) - ) return matches From 1a98b3914e84058ee065909b82c62d1e6772c9e9 Mon Sep 17 00:00:00 2001 From: "copilot-swe-agent[bot]" <198982749+Copilot@users.noreply.github.com> Date: Sat, 28 Mar 2026 20:00:39 +0000 Subject: [PATCH 3/3] refactor: move defaultdict import to module level and optimize per-position lookups - Move 'from collections import defaultdict' from inline function body to module-level imports - Add type annotation: defaultdict[str, list[Delimiter]] - Check _is_inside_string_or_comment once per match position (not once per delimiter) - Call _find_next_structural_with_char and _find_matching_close once per position - Use matches.extend() with a generator instead of an append loop --- src/codeweaver/engine/chunker/delimiter.py | 72 +++++++++++----------- 1 file changed, 36 insertions(+), 36 deletions(-) diff --git a/src/codeweaver/engine/chunker/delimiter.py b/src/codeweaver/engine/chunker/delimiter.py index e318fd5c..a13390a2 100644 --- a/src/codeweaver/engine/chunker/delimiter.py +++ b/src/codeweaver/engine/chunker/delimiter.py @@ -14,6 +14,7 @@ import logging import re +from collections import defaultdict from datetime import UTC, datetime from pathlib import Path from typing import TYPE_CHECKING, Any, NamedTuple, cast @@ -464,11 +465,9 @@ def _match_keyword_delimiters( # Optimization: combining multiple keyword delimiters into a single compiled regex pattern # using alternation (?:...) avoids looping over keywords with individual re.finditer calls. - - # Build map of start strings to lists of delimiters to handle collisions - # e.g. 'type' or 'extension' being used for multiple structures - from collections import defaultdict - delimiter_map = defaultdict(list) + # Use defaultdict(list) to support multiple delimiters sharing the same start string + # (e.g., "type" matches both STRUCT and TYPE_ALIAS patterns). + delimiter_map: defaultdict[str, list[Delimiter]] = defaultdict(list) for d in keyword_delimiters: delimiter_map[d.start].append(d) @@ -478,43 +477,44 @@ def _match_keyword_delimiters( keyword_pos = match.start() matched_text = match.group(0) - for delimiter in delimiter_map[matched_text]: - # Skip if keyword is inside a string or comment - if self._is_inside_string_or_comment(content, keyword_pos): - continue + # Skip if keyword is inside a string or comment (same position for all delimiters) + if self._is_inside_string_or_comment(content, keyword_pos): + continue - # Find the next structural opening after the keyword - struct_start, struct_char = self._find_next_structural_with_char( - content, - start=keyword_pos + len(delimiter.start), - allowed=set(structural_pairs.keys()), - ) + # Find the next structural opening after the keyword. + # All delimiters sharing this start string have the same length, so we compute once. + struct_start, struct_char = self._find_next_structural_with_char( + content, + start=keyword_pos + len(matched_text), + allowed=set(structural_pairs.keys()), + ) - if struct_start is None: - continue + if struct_start is None: + continue - # Find the matching closing delimiter for the structural character - struct_end = self._find_matching_close( - content, - struct_start, - struct_char or "", - structural_pairs.get(cast(str, struct_char), ""), - ) + # Find the matching closing delimiter for the structural character + struct_end = self._find_matching_close( + content, + struct_start, + struct_char or "", + structural_pairs.get(cast(str, struct_char), ""), + ) - if struct_end is not None: - # Calculate nesting level by counting parent structures - nesting_level = self._calculate_nesting_level(content, keyword_pos) + if struct_end is not None: + # Calculate nesting level by counting parent structures + nesting_level = self._calculate_nesting_level(content, keyword_pos) - # Create a complete match from keyword to closing structure - # This represents the entire construct (e.g., function...}) - matches.append( - DelimiterMatch( - delimiter=delimiter, - start_pos=keyword_pos, - end_pos=struct_end, - nesting_level=nesting_level, - ) + # Create a complete match for every delimiter that shares this start string + # (e.g., both STRUCT and TYPE_ALIAS for "type") + matches.extend( + DelimiterMatch( + delimiter=delimiter, + start_pos=keyword_pos, + end_pos=struct_end, + nesting_level=nesting_level, ) + for delimiter in delimiter_map[matched_text] + ) return matches