Skip to content
Merged
Show file tree
Hide file tree
Changes from all commits
Commits
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
34 changes: 18 additions & 16 deletions src/codeweaver/engine/chunker/delimiter.py
Original file line number Diff line number Diff line change
Expand Up @@ -11,13 +11,24 @@

from __future__ import annotations

import logging
import re

from datetime import UTC, datetime
from pathlib import Path
from typing import TYPE_CHECKING, Any, NamedTuple, cast

from codeweaver.core import CodeChunk, Metadata, Span, get_blake_hash, uuid7
import textcase

from codeweaver.core import (
CodeChunk,
ConfigLanguage,
Metadata,
SemanticSearchLanguage,
Span,
get_blake_hash,
uuid7,
)
from codeweaver.engine.chunker.base import AdaptiveChunkBehavior, BaseChunker, ChunkGovernor
from codeweaver.engine.chunker.delimiter_model import Boundary, Delimiter, DelimiterMatch
from codeweaver.engine.chunker.exceptions import (
Expand All @@ -32,6 +43,8 @@
from codeweaver.core import DiscoveredFile


logger = logging.getLogger(__name__)

PERFORMANCE_THRESHOLD_MS = 1000.0 # 1 second

# Token estimation: ~4 chars per token for code (conservative)
Expand Down Expand Up @@ -1256,7 +1269,7 @@ def _load_custom_delimiters(
self,
normalized_language: str,
language: str,
) -> list[object]:
) -> list[Delimiter]:
"""Load custom delimiter patterns from settings that match the given language.

Custom delimiters are returned first so they override built-in family
Expand All @@ -1269,28 +1282,19 @@ def _load_custom_delimiters(
Returns:
List of ``Delimiter`` objects converted from matching custom patterns.
"""
import logging

import textcase

from codeweaver.core import ConfigLanguage, SemanticSearchLanguage
from codeweaver.engine.chunker.delimiter_model import Delimiter

if (
self._governor.settings is None
or not hasattr(self._governor.settings, "custom_delimiters")
or not self._governor.settings.custom_delimiters
):
return []

logger = logging.getLogger(__name__)

def _normalize(lang: object) -> str:
    """Return the snake_case form of a language identifier.

    Known language enum members (``SemanticSearchLanguage`` or
    ``ConfigLanguage``) are keyed by their ``variable`` attribute; any
    other value is converted with ``str()`` first.
    """
    is_known_enum = isinstance(lang, SemanticSearchLanguage | ConfigLanguage)
    raw_name = lang.variable if is_known_enum else str(lang)
    return textcase.snake(raw_name)
Comment on lines 1292 to 1295
Copy link
Copy Markdown
Contributor

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

Suggestion: Consider centralizing the language-normalization logic shared by delimiter loading and selector matching.

This helper duplicates the language-normalization logic in `_match_custom_ext_pair`: both special-case `SemanticSearchLanguage | ConfigLanguage` (here via `lang.variable`) and otherwise fall back to `textcase.snake(str(lang))`. Please consider extracting a shared utility (e.g., in a small internal helper module) so the chunker and selector stay in sync if one of them changes.


delimiters: list[object] = []
delimiters: list[Delimiter] = []
for custom_delim in self._governor.settings.custom_delimiters:
lang_match = (
custom_delim.language is not None
Expand Down Expand Up @@ -1327,9 +1331,7 @@ def _load_delimiters_for_language(self, language: str) -> list[Delimiter]:
List of Delimiter objects for the language, with custom delimiters
prepended so they take priority over built-in family patterns.
"""
import textcase

from codeweaver.engine.chunker.delimiter_model import Delimiter, DelimiterKind
from codeweaver.engine.chunker.delimiter_model import DelimiterKind
from codeweaver.engine.chunker.delimiters.families import (
LanguageFamily,
get_family_patterns,
Expand All @@ -1339,7 +1341,7 @@ def _load_delimiters_for_language(self, language: str) -> list[Delimiter]:

# Custom entries are prepended so they override the built-in family
# patterns for known languages and are the sole source for new languages.
delimiters: list[Delimiter] = self._load_custom_delimiters( # type: ignore[assignment]
delimiters: list[Delimiter] = self._load_custom_delimiters(
normalized_language, language
)

Expand Down
14 changes: 7 additions & 7 deletions src/codeweaver/engine/chunker/selector.py
Original file line number Diff line number Diff line change
Expand Up @@ -21,6 +21,8 @@

from typing import TYPE_CHECKING, Any

import textcase

from codeweaver.core import ConfigLanguage, SemanticSearchLanguage
from codeweaver.engine.chunker.base import BaseChunker
from codeweaver.engine.chunker.delimiter import DelimiterChunker
Expand Down Expand Up @@ -216,29 +218,25 @@ def _detect_language_from_custom_ext(
if self.governor.settings is None:
return None

import textcase

custom_delimiters = getattr(self.governor.settings, "custom_delimiters", None)
if not custom_delimiters:
return None

for custom_delim in custom_delimiters:
if lang := self._match_custom_ext_pair(custom_delim, file_ext, textcase):
if lang := self._match_custom_ext_pair(custom_delim, file_ext):
return lang
return None

@staticmethod
def _match_custom_ext_pair(
custom_delim: object,
file_ext: str,
textcase: object,
) -> SemanticSearchLanguage | ConfigLanguage | str | None:
"""Return the language for a matching extension pair in *custom_delim*.

Args:
custom_delim: A ``CustomDelimiter`` instance from settings.
file_ext: File extension including the leading dot.
textcase: The textcase module (passed in to avoid repeated imports).

Returns:
Matching language, or ``None``.
Expand All @@ -256,12 +254,14 @@ def _match_custom_ext_pair(
continue
delim_lang = getattr(custom_delim, "language", None)
if delim_lang is not None:
return textcase.snake(str(delim_lang)) # type: ignore[attr-defined]
if isinstance(delim_lang, SemanticSearchLanguage | ConfigLanguage):
return delim_lang
return textcase.snake(str(delim_lang))
pair_lang = getattr(pair, "language", None)
if pair_lang is not None:
if isinstance(pair_lang, SemanticSearchLanguage | ConfigLanguage):
return pair_lang
return textcase.snake(str(pair_lang)) # type: ignore[attr-defined]
return textcase.snake(str(pair_lang))
return None

def _detect_language(
Expand Down
Loading