Skip to content
Closed
Changes from all commits
Commits
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
85 changes: 49 additions & 36 deletions src/codeweaver/engine/chunker/delimiter.py
Original file line number Diff line number Diff line change
Expand Up @@ -14,6 +14,7 @@
import logging
import re

from collections import defaultdict
from datetime import UTC, datetime
from pathlib import Path
from typing import TYPE_CHECKING, Any, NamedTuple, cast
Expand Down Expand Up @@ -451,6 +452,9 @@ def _match_keyword_delimiters(
# Filter out delimiters with empty start strings - they match everywhere!
keyword_delimiters = [d for d in keyword_delimiters if d.start]

if not keyword_delimiters:
return matches

# Define structural delimiters that can complete keywords
# Map opening structural chars to their closing counterparts
structural_pairs = {
Expand All @@ -459,49 +463,58 @@ def _match_keyword_delimiters(
"=>": "", # Arrow functions often have expression bodies
}

for delimiter in keyword_delimiters:
# Find all keyword occurrences using word boundary matching
pattern = rf"\b{re.escape(delimiter.start)}\b"
# Optimization: compile all keyword delimiters into one regex via alternation (?:...),
# so a single re.finditer pass over the content replaces a per-keyword finditer loop.
# Use defaultdict(list) to support multiple delimiters sharing the same start string
# (e.g., "type" matches both STRUCT and TYPE_ALIAS patterns).
delimiter_map: defaultdict[str, list[Delimiter]] = defaultdict(list)
for d in keyword_delimiters:
delimiter_map[d.start].append(d)

for match in re.finditer(pattern, content):
keyword_pos = match.start()
combined_pattern = rf"\b(?:{'|'.join(re.escape(start) for start in delimiter_map)})\b"

# Skip if keyword is inside a string or comment
if self._is_inside_string_or_comment(content, keyword_pos):
continue
for match in re.finditer(combined_pattern, content):
Comment on lines +466 to +476
Copy link

Copilot AI Mar 28, 2026

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

This optimization changes how keyword delimiters are selected when multiple delimiter kinds share the same keyword (e.g., type). There doesn’t appear to be a unit test exercising overlapping keyword starts for delimiter-based chunking; adding a regression test (e.g., a snippet containing type Foo struct { ... } and/or type Foo = ...) would help prevent future regressions when refactoring this matcher.

Copilot uses AI. Check for mistakes.
keyword_pos = match.start()
matched_text = match.group(0)

# Find the next structural opening after the keyword
struct_start, struct_char = self._find_next_structural_with_char(
content,
start=keyword_pos + len(delimiter.start),
allowed=set(structural_pairs.keys()),
)
# Skip if keyword is inside a string or comment (same position for all delimiters)
if self._is_inside_string_or_comment(content, keyword_pos):
continue

if struct_start is None:
continue
# Find the next structural opening after the keyword. Every delimiter mapped to
# this start string matches the same text, so the search offset is computed once.
struct_start, struct_char = self._find_next_structural_with_char(
content,
start=keyword_pos + len(matched_text),
allowed=set(structural_pairs.keys()),
)

# Find the matching closing delimiter for the structural character
struct_end = self._find_matching_close(
content,
struct_start,
struct_char or "",
structural_pairs.get(cast(str, struct_char), ""),
)
if struct_start is None:
continue

if struct_end is not None:
# Calculate nesting level by counting parent structures
nesting_level = self._calculate_nesting_level(content, keyword_pos)
# Find the matching closing delimiter for the structural character
struct_end = self._find_matching_close(
content,
struct_start,
struct_char or "",
structural_pairs.get(cast(str, struct_char), ""),
)

# Create a complete match from keyword to closing structure
# This represents the entire construct (e.g., function...})
matches.append(
DelimiterMatch(
delimiter=delimiter,
start_pos=keyword_pos,
end_pos=struct_end,
nesting_level=nesting_level,
)
if struct_end is not None:
# Calculate nesting level by counting parent structures
nesting_level = self._calculate_nesting_level(content, keyword_pos)

# Create a complete match for every delimiter that shares this start string
# (e.g., both STRUCT and TYPE_ALIAS for "type")
matches.extend(
DelimiterMatch(
delimiter=delimiter,
start_pos=keyword_pos,
end_pos=struct_end,
nesting_level=nesting_level,
)
for delimiter in delimiter_map[matched_text]
)

return matches

Expand Down Expand Up @@ -1280,7 +1293,7 @@ def _load_custom_delimiters(
patterns when merged with the full delimiter list.

Args:
normalized_language: Snake-case normalised language identifier.
normalized_language: Snake-case normalized language identifier.
language: Original language string (used for logging only).

Returns:
Expand Down
Loading