diff --git a/src/codeweaver/engine/chunker/delimiter.py b/src/codeweaver/engine/chunker/delimiter.py index 97a69ad3..2969bd6c 100644 --- a/src/codeweaver/engine/chunker/delimiter.py +++ b/src/codeweaver/engine/chunker/delimiter.py @@ -11,13 +11,24 @@ from __future__ import annotations +import logging import re from datetime import UTC, datetime from pathlib import Path from typing import TYPE_CHECKING, Any, NamedTuple, cast -from codeweaver.core import CodeChunk, Metadata, Span, get_blake_hash, uuid7 +import textcase + +from codeweaver.core import ( + CodeChunk, + ConfigLanguage, + Metadata, + SemanticSearchLanguage, + Span, + get_blake_hash, + uuid7, +) from codeweaver.engine.chunker.base import AdaptiveChunkBehavior, BaseChunker, ChunkGovernor from codeweaver.engine.chunker.delimiter_model import Boundary, Delimiter, DelimiterMatch from codeweaver.engine.chunker.exceptions import ( @@ -32,6 +43,8 @@ from codeweaver.core import DiscoveredFile +logger = logging.getLogger(__name__) + PERFORMANCE_THRESHOLD_MS = 1000.0 # 1 second # Token estimation: ~4 chars per token for code (conservative) @@ -1256,7 +1269,7 @@ def _load_custom_delimiters( self, normalized_language: str, language: str, - ) -> list[object]: + ) -> list[Delimiter]: """Load custom delimiter patterns from settings that match the given language. Custom delimiters are returned first so they override built-in family @@ -1269,13 +1282,6 @@ def _load_custom_delimiters( Returns: List of ``Delimiter`` objects converted from matching custom patterns. """ - import logging - - import textcase - - from codeweaver.core import ConfigLanguage, SemanticSearchLanguage - from codeweaver.engine.chunker.delimiter_model import Delimiter - if ( self._governor.settings is None or not hasattr(self._governor.settings, "custom_delimiters") @@ -1283,14 +1289,12 @@ def _load_custom_delimiters( ): return [] - logger = logging.getLogger(__name__) - def _normalize(lang: object) -> str: if isinstance(lang, SemanticSearchLanguage | ConfigLanguage): return textcase.snake(lang.variable) return textcase.snake(str(lang)) - delimiters: list[object] = [] + delimiters: list[Delimiter] = [] for custom_delim in self._governor.settings.custom_delimiters: lang_match = ( custom_delim.language is not None @@ -1327,9 +1331,7 @@ def _load_delimiters_for_language(self, language: str) -> list[Delimiter]: List of Delimiter objects for the language, with custom delimiters prepended so they take priority over built-in family patterns. """ - import textcase - - from codeweaver.engine.chunker.delimiter_model import Delimiter, DelimiterKind + from codeweaver.engine.chunker.delimiter_model import DelimiterKind from codeweaver.engine.chunker.delimiters.families import ( LanguageFamily, get_family_patterns, @@ -1339,7 +1341,7 @@ def _load_delimiters_for_language(self, language: str) -> list[Delimiter]: # Custom entries are prepended so they override the built-in family # patterns for known languages and are the sole source for new languages. - delimiters: list[Delimiter] = self._load_custom_delimiters( # type: ignore[assignment] + delimiters: list[Delimiter] = self._load_custom_delimiters( normalized_language, language ) diff --git a/src/codeweaver/engine/chunker/selector.py b/src/codeweaver/engine/chunker/selector.py index 661a958a..57b26707 100644 --- a/src/codeweaver/engine/chunker/selector.py +++ b/src/codeweaver/engine/chunker/selector.py @@ -21,6 +21,8 @@ from typing import TYPE_CHECKING, Any +import textcase + from codeweaver.core import ConfigLanguage, SemanticSearchLanguage from codeweaver.engine.chunker.base import BaseChunker from codeweaver.engine.chunker.delimiter import DelimiterChunker @@ -216,14 +218,12 @@ def _detect_language_from_custom_ext( if self.governor.settings is None: return None - import textcase - custom_delimiters = getattr(self.governor.settings, "custom_delimiters", None) if not custom_delimiters: return None for custom_delim in custom_delimiters: - if lang := self._match_custom_ext_pair(custom_delim, file_ext, textcase): + if lang := self._match_custom_ext_pair(custom_delim, file_ext): return lang return None @@ -231,14 +231,12 @@ def _detect_language_from_custom_ext( def _match_custom_ext_pair( custom_delim: object, file_ext: str, - textcase: object, ) -> SemanticSearchLanguage | ConfigLanguage | str | None: """Return the language for a matching extension pair in *custom_delim*. Args: custom_delim: A ``CustomDelimiter`` instance from settings. file_ext: File extension including the leading dot. - textcase: The textcase module (passed in to avoid repeated imports). Returns: Matching language, or ``None``. @@ -256,12 +254,14 @@ def _match_custom_ext_pair( continue delim_lang = getattr(custom_delim, "language", None) if delim_lang is not None: - return textcase.snake(str(delim_lang)) # type: ignore[attr-defined] + if isinstance(delim_lang, SemanticSearchLanguage | ConfigLanguage): + return delim_lang + return textcase.snake(str(delim_lang)) pair_lang = getattr(pair, "language", None) if pair_lang is not None: if isinstance(pair_lang, SemanticSearchLanguage | ConfigLanguage): return pair_lang - return textcase.snake(str(pair_lang)) # type: ignore[attr-defined] + return textcase.snake(str(pair_lang)) return None def _detect_language(