Skip to content
Merged
Show file tree
Hide file tree
Changes from all commits
Commits
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
33 changes: 23 additions & 10 deletions src/codeweaver/engine/chunker/delimiter.py
Original file line number Diff line number Diff line change
Expand Up @@ -451,30 +451,43 @@ def _match_keyword_delimiters(
# Filter out delimiters with empty start strings - they match everywhere!
keyword_delimiters = [d for d in keyword_delimiters if d.start]

if not keyword_delimiters:
return matches

# Define structural delimiters that can complete keywords
# Map opening structural chars to their closing counterparts
structural_pairs = {
"{": "}",
":": "\n", # Python uses : followed by indented block (simplified to newline)
"=>": "", # Arrow functions often have expression bodies
}

allowed_keys = frozenset(structural_pairs.keys())
# Optimization: Combine all keyword start strings into a single compiled regex pattern.
# This allows us to make a single pass over the content rather than iterating over
# `re.finditer` for each keyword delimiter individually, significantly reducing overhead.
start_strings = list(dict.fromkeys(d.start for d in keyword_delimiters))
combined_pattern = re.compile(rf"\b(?:{'|'.join(map(re.escape, start_strings))})\b")

# Group delimiters by matched start string so duplicate keyword starts
# preserve the original "process each delimiter independently" behavior.
delimiter_map: dict[str, list[Delimiter]] = {}
for delimiter in keyword_delimiters:
# Find all keyword occurrences using word boundary matching
pattern = rf"\b{re.escape(delimiter.start)}\b"
delimiter_map.setdefault(delimiter.start, []).append(delimiter)

for match in re.finditer(pattern, content):
keyword_pos = match.start()
for match in combined_pattern.finditer(content):
matched_text = match.group(0)
keyword_pos = match.start()

# Skip if keyword is inside a string or comment
if self._is_inside_string_or_comment(content, keyword_pos):
continue
# Skip if keyword is inside a string or comment
if self._is_inside_string_or_comment(content, keyword_pos):
continue

for delimiter in delimiter_map[matched_text]:
# Find the next structural opening after the keyword
struct_start, struct_char = self._find_next_structural_with_char(
content,
start=keyword_pos + len(delimiter.start),
allowed=set(structural_pairs.keys()),
allowed=allowed_keys,
)

if struct_start is None:
Expand Down Expand Up @@ -1280,7 +1293,7 @@ def _load_custom_delimiters(
patterns when merged with the full delimiter list.

Args:
normalized_language: Snake-case normalised language identifier.
normalized_language: Snake-case normalized language identifier.
language: Original language string (used for logging only).

Returns:
Expand Down
37 changes: 37 additions & 0 deletions tests/unit/engine/chunker/test_delimiter_edge_cases.py
Original file line number Diff line number Diff line change
Expand Up @@ -16,7 +16,9 @@

import pytest

from codeweaver.core.types import DelimiterKind
from codeweaver.engine import ChunkGovernor, DelimiterChunker
from codeweaver.engine.chunker.delimiter_model import Delimiter


pytestmark = [pytest.mark.unit]
Expand Down Expand Up @@ -208,6 +210,41 @@ def test_take_whole_lines_expansion(
class TestUnusualPatterns:
"""Test delimiter chunker with unusual patterns."""

def test_duplicate_keyword_starts_preserve_all_delimiters(
    self, delimiter_chunker: DelimiterChunker
) -> None:
    """Verify duplicate keyword starts keep all matching delimiters.

    Keyword delimiters with the same start string used to be processed
    independently. The optimized single-regex path must preserve that
    behavior instead of collapsing duplicates down to the last delimiter.
    """
    content = """type Person struct {
name string
}"""
    # Two distinct delimiter kinds sharing the identical start keyword "type".
    duplicate_kinds = (DelimiterKind.STRUCT, DelimiterKind.TYPE_ALIAS)
    keyword_delimiters = [
        Delimiter(
            start="type",
            end="",
            kind=kind,
            priority=kind.default_priority,
        )
        for kind in duplicate_kinds
    ]

    results = delimiter_chunker._match_keyword_delimiters(content, keyword_delimiters)

    # Both delimiters must survive: one match per kind, none collapsed away.
    assert len(results) == 2, "Duplicate keyword starts should preserve all delimiters"
    assert {match.delimiter.kind for match in results} == set(duplicate_kinds)

def test_nested_delimiter_structures(
self, delimiter_chunker: DelimiterChunker, tmp_path: Path
) -> None:
Expand Down