diff --git a/openviking/models/embedder/base.py b/openviking/models/embedder/base.py index bd178df8..b8270664 100644 --- a/openviking/models/embedder/base.py +++ b/openviking/models/embedder/base.py @@ -118,18 +118,19 @@ def _estimate_tokens(self, text: str) -> int: # CJK text (3 bytes per char in UTF-8) is not underestimated. return max(len(text) // 3, len(text.encode("utf-8")) // 4) - def _chunk_text(self, text: str) -> List[str]: + def _chunk_text(self, text: str, override_max_tokens: Optional[int] = None) -> List[str]: """Split text into chunks, each within max_tokens. Splitting priority: paragraphs (\\n\\n) > sentences (。.!?\\n) > fixed length. Args: text: Input text + override_max_tokens: If set, use this instead of self.max_tokens Returns: List of text chunks """ - max_tok = self.max_tokens + max_tok = override_max_tokens if override_max_tokens is not None else self.max_tokens if self._estimate_tokens(text) <= max_tok: return [text] @@ -188,7 +189,12 @@ def _fixed_length_split(self, text: str, max_tok: int) -> List[str]: start = end return chunks - def _chunk_and_embed(self, text: str, is_query: bool = False) -> EmbedResult: + def _chunk_and_embed( + self, + text: str, + is_query: bool = False, + override_max_tokens: Optional[int] = None, + ) -> EmbedResult: """Chunk text if it exceeds max_tokens, embed each chunk, and merge results. For text within limits, delegates to _embed_single directly. 
@@ -198,15 +204,17 @@ def _chunk_and_embed(self, text: str, is_query: bool = False) -> EmbedResult: Args: text: Input text is_query: Flag to indicate if this is a query embedding + override_max_tokens: If set, use this instead of self.max_tokens Returns: EmbedResult with merged embedding """ + max_tok = override_max_tokens if override_max_tokens is not None else self.max_tokens estimated = self._estimate_tokens(text) - if estimated <= self.max_tokens: + if estimated <= max_tok: return self._embed_single(text, is_query=is_query) - chunks = self._chunk_text(text) + chunks = self._chunk_text(text, override_max_tokens=override_max_tokens) logger.debug( "Chunking text: original ~%d tokens -> %d chunks", estimated, diff --git a/openviking/models/embedder/openai_embedders.py b/openviking/models/embedder/openai_embedders.py index 2924b98d..7b57d6ce 100644 @@ -277,6 +277,9 @@ def _embed_single(self, text: str, is_query: bool = False) -> EmbedResult: def embed(self, text: str, is_query: bool = False) -> EmbedResult: """Embed single text, with automatic chunking for oversized input. + If the initial embedding fails due to input length exceeding the model's + limit, retries once with a reduced chunk size. 
+ Args: text: Input text is_query: Flag to indicate if this is a query embedding @@ -285,14 +288,35 @@ def embed(self, text: str, is_query: bool = False) -> EmbedResult: EmbedResult: Result containing only dense_vector Raises: - RuntimeError: When API call fails + RuntimeError: When API call fails after all retry attempts """ if not text: return self._embed_single(text, is_query=is_query) if self._estimate_tokens(text) > self.max_tokens: return self._chunk_and_embed(text, is_query=is_query) - return self._embed_single(text, is_query=is_query) + + try: + return self._embed_single(text, is_query=is_query) + except RuntimeError as e: + error_msg = str(e).lower() + if ( + ("input" in error_msg and "too large" in error_msg) + or ("token" in error_msg and ("too long" in error_msg or "too many" in error_msg)) + or "context length" in error_msg + ): + # Token estimation was wrong (common with non-OpenAI models). + # Retry with chunking at half the current max_tokens. + reduced = max(self.max_tokens // 2, 128) + logger.warning( + "Embedding failed due to input length. " + "Retrying with chunk size %d tokens. " + "Set embedding.dense.max_tokens in ov.conf to match " + "your model's actual limit.", + reduced, + ) + return self._chunk_and_embed(text, is_query=is_query, override_max_tokens=reduced) + raise def embed_batch(self, texts: List[str], is_query: bool = False) -> List[EmbedResult]: """Batch embedding with automatic chunking for oversized inputs.