From b0b3261822e539749231d0bb034b1f0b2f7fd236 Mon Sep 17 00:00:00 2001
From: deepakdevp
Date: Wed, 18 Mar 2026 17:14:22 +0900
Subject: [PATCH 1/2] fix(embedder): auto-retry with smaller chunks when input exceeds model limit

When the embedding API rejects input as "too large" (common with
non-OpenAI models, where token estimation is inaccurate), retry with
chunking at half the current max_tokens instead of crashing.

Also log a warning guiding users to set embedding.dense.max_tokens in
ov.conf to match their model's actual limit.

Fixes #731.

Co-Authored-By: Claude Opus 4.6 (1M context)
---
 .../models/embedder/openai_embedders.py      | 32 ++++++++++++++++++++++++++++++--
 1 file changed, 30 insertions(+), 2 deletions(-)

diff --git a/openviking/models/embedder/openai_embedders.py b/openviking/models/embedder/openai_embedders.py
index 2924b98d..d7fd846f 100644
--- a/openviking/models/embedder/openai_embedders.py
+++ b/openviking/models/embedder/openai_embedders.py
@@ -277,6 +277,9 @@ def _embed_single(self, text: str, is_query: bool = False) -> EmbedResult:
     def embed(self, text: str, is_query: bool = False) -> EmbedResult:
         """Embed single text, with automatic chunking for oversized input.
 
+        If the initial embedding fails because the input exceeds the model's
+        length limit, retries once with chunking at half the configured size.
+
         Args:
             text: Input text
             is_query: Flag to indicate if this is a query embedding
@@ -285,14 +288,39 @@ def embed(self, text: str, is_query: bool = False) -> EmbedResult:
             EmbedResult: Result containing only dense_vector
 
         Raises:
-            RuntimeError: When API call fails
+            RuntimeError: When the API call fails, even after the chunked retry
         """
 
         if not text:
             return self._embed_single(text, is_query=is_query)
         if self._estimate_tokens(text) > self.max_tokens:
             return self._chunk_and_embed(text, is_query=is_query)
-        return self._embed_single(text, is_query=is_query)
+
+        try:
+            return self._embed_single(text, is_query=is_query)
+        except RuntimeError as e:
+            error_msg = str(e).lower()
+            if (
+                "too large" in error_msg
+                or "too long" in error_msg
+                or "maximum context length" in error_msg
+            ):
+                # Token estimation was wrong (common with non-OpenAI models).
+                # Retry with chunking at half the current max_tokens.
+                reduced = max(self.max_tokens // 2, 128)
+                logger.warning(
+                    f"Embedding failed due to input length. "
+                    f"Retrying with chunk size {reduced} tokens. "
+                    f"Set embedding.dense.max_tokens in ov.conf to match "
+                    f"your model's actual limit."
+                )
+                saved = self._max_tokens
+                self._max_tokens = reduced
+                try:
+                    return self._chunk_and_embed(text, is_query=is_query)
+                finally:
+                    self._max_tokens = saved
+            raise
 
     def embed_batch(self, texts: List[str], is_query: bool = False) -> List[EmbedResult]:
         """Batch embedding with automatic chunking for oversized inputs.
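
A minimal, self-contained model of the retry path PATCH 1 adds, to make the
control flow easier to follow before the review changes in PATCH 2.
StubEmbedder, its token limits, and the "input is too large" message are
invented for illustration; only the catch / halve / restore shape mirrors the
patched embed().

from typing import List


class StubEmbedder:
    def __init__(self, configured_max: int, real_limit: int) -> None:
        self._max_tokens = configured_max  # what ov.conf claims the model takes
        self.real_limit = real_limit       # what the model actually accepts

    @property
    def max_tokens(self) -> int:
        return self._max_tokens

    def _estimate_tokens(self, text: str) -> int:
        return len(text) // 4  # crude heuristic, so it can be wrong

    def _embed_single(self, text: str) -> List[float]:
        if self._estimate_tokens(text) > self.real_limit:
            raise RuntimeError("input is too large")
        return [0.0]  # placeholder vector

    def _chunk_and_embed(self, text: str) -> List[float]:
        # Split by the current (possibly reduced) budget, embed each piece.
        step = self._max_tokens * 4  # inverse of the estimator above
        chunks = [text[i:i + step] for i in range(0, len(text), step)]
        vectors = [self._embed_single(c) for c in chunks]
        return [sum(v[0] for v in vectors) / len(vectors)]  # trivial merge

    def embed(self, text: str) -> List[float]:
        if self._estimate_tokens(text) > self.max_tokens:
            return self._chunk_and_embed(text)  # pre-chunking path
        try:
            return self._embed_single(text)
        except RuntimeError as e:
            if "too large" in str(e).lower():
                reduced = max(self.max_tokens // 2, 128)
                saved = self._max_tokens
                self._max_tokens = reduced  # PATCH 1: mutate, retry, restore
                try:
                    return self._chunk_and_embed(text)
                finally:
                    self._max_tokens = saved
            raise


# ~800 estimated tokens: under the configured budget of 1000, so no
# pre-chunking; the model's real limit is 600, so the first attempt fails
# and the retry with a halved budget (500-token chunks) succeeds.
print(StubEmbedder(1000, 600).embed("x" * 3200))

The save/restore of _max_tokens here is exactly the shared-state mutation the
review flags, which PATCH 2 below replaces with a parameter.
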
From 5b92e834dc96c3ed5dafae74f3b2110389b23b38 Mon Sep 17 00:00:00 2001
From: deepakdevp
Date: Wed, 18 Mar 2026 17:34:36 +0900
Subject: [PATCH 2/2] fix(embedder): address review feedback - thread-safe retry + precise error matching

- Add override_max_tokens parameter to _chunk_and_embed() and _chunk_text()
  instead of mutating self._max_tokens (thread-safe)
- Tighten error matching: require "input" with "too large", or "token" with
  "too long"/"too many"; accept any "context length" wording (not only
  "maximum context length")

Co-Authored-By: Claude Opus 4.6 (1M context)
---
 openviking/models/embedder/base.py           | 18 +++++++++++++-----
 .../models/embedder/openai_embedders.py      | 22 +++++++++-------------
 2 files changed, 22 insertions(+), 18 deletions(-)

diff --git a/openviking/models/embedder/base.py b/openviking/models/embedder/base.py
index bd178df8..b8270664 100644
--- a/openviking/models/embedder/base.py
+++ b/openviking/models/embedder/base.py
@@ -118,18 +118,19 @@ def _estimate_tokens(self, text: str) -> int:
         # CJK text (3 bytes per char in UTF-8) is not underestimated.
         return max(len(text) // 3, len(text.encode("utf-8")) // 4)
 
-    def _chunk_text(self, text: str) -> List[str]:
+    def _chunk_text(self, text: str, override_max_tokens: Optional[int] = None) -> List[str]:
         """Split text into chunks, each within max_tokens.
 
         Splitting priority: paragraphs (\\n\\n) > sentences (。.!?\\n) > fixed length.
 
         Args:
             text: Input text
+            override_max_tokens: If set, use this instead of self.max_tokens
 
         Returns:
             List of text chunks
         """
-        max_tok = self.max_tokens
+        max_tok = override_max_tokens if override_max_tokens is not None else self.max_tokens
         if self._estimate_tokens(text) <= max_tok:
             return [text]
 
@@ -188,7 +189,12 @@ def _fixed_length_split(self, text: str, max_tok: int) -> List[str]:
             start = end
         return chunks
 
-    def _chunk_and_embed(self, text: str, is_query: bool = False) -> EmbedResult:
+    def _chunk_and_embed(
+        self,
+        text: str,
+        is_query: bool = False,
+        override_max_tokens: Optional[int] = None,
+    ) -> EmbedResult:
         """Chunk text if it exceeds max_tokens, embed each chunk, and merge results.
 
         For text within limits, delegates to _embed_single directly.
@@ -198,15 +204,17 @@ def _chunk_and_embed(self, text: str, is_query: bool = False) -> EmbedResult:
         Args:
             text: Input text
             is_query: Flag to indicate if this is a query embedding
+            override_max_tokens: If set, use this instead of self.max_tokens
 
         Returns:
             EmbedResult with merged embedding
         """
+        max_tok = override_max_tokens if override_max_tokens is not None else self.max_tokens
         estimated = self._estimate_tokens(text)
-        if estimated <= self.max_tokens:
+        if estimated <= max_tok:
             return self._embed_single(text, is_query=is_query)
 
-        chunks = self._chunk_text(text)
+        chunks = self._chunk_text(text, override_max_tokens=override_max_tokens)
         logger.debug(
             "Chunking text: original ~%d tokens -> %d chunks",
             estimated,
diff --git a/openviking/models/embedder/openai_embedders.py b/openviking/models/embedder/openai_embedders.py
index d7fd846f..7b57d6ce 100644
--- a/openviking/models/embedder/openai_embedders.py
+++ b/openviking/models/embedder/openai_embedders.py
@@ -301,25 +301,21 @@ def embed(self, text: str, is_query: bool = False) -> EmbedResult:
         except RuntimeError as e:
             error_msg = str(e).lower()
             if (
-                "too large" in error_msg
-                or "too long" in error_msg
-                or "maximum context length" in error_msg
+                ("input" in error_msg and "too large" in error_msg)
+                or ("token" in error_msg and ("too long" in error_msg or "too many" in error_msg))
+                or "context length" in error_msg
             ):
                 # Token estimation was wrong (common with non-OpenAI models).
                 # Retry with chunking at half the current max_tokens.
                 reduced = max(self.max_tokens // 2, 128)
                 logger.warning(
-                    f"Embedding failed due to input length. "
-                    f"Retrying with chunk size {reduced} tokens. "
-                    f"Set embedding.dense.max_tokens in ov.conf to match "
-                    f"your model's actual limit."
+                    "Embedding failed due to input length. "
+                    "Retrying with chunk size %d tokens. "
+                    "Set embedding.dense.max_tokens in ov.conf to match "
+                    "your model's actual limit.",
+                    reduced,
                 )
-                saved = self._max_tokens
-                self._max_tokens = reduced
-                try:
-                    return self._chunk_and_embed(text, is_query=is_query)
-                finally:
-                    self._max_tokens = saved
+                return self._chunk_and_embed(text, is_query=is_query, override_max_tokens=reduced)
             raise
 
     def embed_batch(self, texts: List[str], is_query: bool = False) -> List[EmbedResult]:
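
A short, self-contained sketch of the thread-safety point behind the review
change; the Embedder class and worker below are illustrative, not the
openviking API. Passing override_max_tokens keeps the reduced budget local to
the retrying call, whereas PATCH 1's save/mutate/restore of self._max_tokens
briefly changed the budget for every concurrent caller of the same instance.

import threading
from typing import Optional


class Embedder:
    def __init__(self) -> None:
        self.max_tokens = 1024  # one instance, shared by every thread

    def _chunk_and_embed(self, text: str, override_max_tokens: Optional[int] = None) -> int:
        # PATCH 2 shape: a reduced budget arrives as an argument, so a retry
        # in one thread cannot shrink the budget another thread reads.
        max_tok = override_max_tokens if override_max_tokens is not None else self.max_tokens
        return max_tok  # stand-in for "chunk budget actually used"


emb = Embedder()


def worker(name: str, override: Optional[int]) -> None:
    print(name, "chunk budget:", emb._chunk_and_embed("some text", override))


# The retrying caller halves its own budget; the concurrent caller still sees
# the configured 1024. Under PATCH 1, the second thread could observe 512
# during the first thread's retry window.
threading.Thread(target=worker, args=("retrying", 512)).start()
threading.Thread(target=worker, args=("normal", None)).start()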