Skip to content
Open
Show file tree
Hide file tree
Changes from all commits
Commits
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
18 changes: 13 additions & 5 deletions openviking/models/embedder/base.py
Original file line number Diff line number Diff line change
Expand Up @@ -118,18 +118,19 @@ def _estimate_tokens(self, text: str) -> int:
# CJK text (3 bytes per char in UTF-8) is not underestimated.
return max(len(text) // 3, len(text.encode("utf-8")) // 4)

def _chunk_text(self, text: str) -> List[str]:
def _chunk_text(self, text: str, override_max_tokens: Optional[int] = None) -> List[str]:
"""Split text into chunks, each within max_tokens.
Splitting priority: paragraphs (\\n\\n) > sentences (。.!?\\n) > fixed length.
Args:
text: Input text
override_max_tokens: If set, use this instead of self.max_tokens
Returns:
List of text chunks
"""
max_tok = self.max_tokens
max_tok = override_max_tokens if override_max_tokens is not None else self.max_tokens
if self._estimate_tokens(text) <= max_tok:
return [text]

Expand Down Expand Up @@ -188,7 +189,12 @@ def _fixed_length_split(self, text: str, max_tok: int) -> List[str]:
start = end
return chunks

def _chunk_and_embed(self, text: str, is_query: bool = False) -> EmbedResult:
def _chunk_and_embed(
self,
text: str,
is_query: bool = False,
override_max_tokens: Optional[int] = None,
) -> EmbedResult:
"""Chunk text if it exceeds max_tokens, embed each chunk, and merge results.
For text within limits, delegates to _embed_single directly.
Expand All @@ -198,15 +204,17 @@ def _chunk_and_embed(self, text: str, is_query: bool = False) -> EmbedResult:
Args:
text: Input text
is_query: Flag to indicate if this is a query embedding
override_max_tokens: If set, use this instead of self.max_tokens
Returns:
EmbedResult with merged embedding
"""
max_tok = override_max_tokens if override_max_tokens is not None else self.max_tokens
estimated = self._estimate_tokens(text)
if estimated <= self.max_tokens:
if estimated <= max_tok:
return self._embed_single(text, is_query=is_query)

chunks = self._chunk_text(text)
chunks = self._chunk_text(text, override_max_tokens=override_max_tokens)
logger.debug(
"Chunking text: original ~%d tokens -> %d chunks",
estimated,
Expand Down
28 changes: 26 additions & 2 deletions openviking/models/embedder/openai_embedders.py
Original file line number Diff line number Diff line change
Expand Up @@ -277,6 +277,9 @@ def _embed_single(self, text: str, is_query: bool = False) -> EmbedResult:
def embed(self, text: str, is_query: bool = False) -> EmbedResult:
    """Embed single text, with automatic chunking for oversized input.

    If the initial embedding fails due to input length exceeding the model's
    limit, retries with progressively smaller chunk sizes: the chunk budget
    is halved on each attempt, down to a floor of 128 tokens, so models whose
    true limit is far below the configured ``max_tokens`` still succeed.

    Args:
        text: Input text
        is_query: Flag to indicate if this is a query embedding

    Returns:
        EmbedResult: Result containing only dense_vector

    Raises:
        RuntimeError: When API call fails after all retry attempts, or when
            the failure is unrelated to input length.
    """

    def _is_length_error(err: RuntimeError) -> bool:
        # Heuristic substring match: OpenAI-compatible backends report
        # over-length input with differing messages and no shared error code.
        msg = str(err).lower()
        return (
            ("input" in msg and "too large" in msg)
            or ("token" in msg and ("too long" in msg or "too many" in msg))
            or "context length" in msg
        )

    # Empty input: delegate directly; provider decides how to handle it.
    if not text:
        return self._embed_single(text, is_query=is_query)

    # Estimated oversize: chunk up front without waiting for an API error.
    if self._estimate_tokens(text) > self.max_tokens:
        return self._chunk_and_embed(text, is_query=is_query)

    try:
        return self._embed_single(text, is_query=is_query)
    except RuntimeError as e:
        if not _is_length_error(e):
            raise
        # Token estimation was wrong (common with non-OpenAI models).
        # Progressively halve the chunk budget until the request fits.
        limit = self.max_tokens
        while limit > 128:
            limit = max(limit // 2, 128)
            logger.warning(
                "Embedding failed due to input length. "
                "Retrying with chunk size %d tokens. "
                "Set embedding.dense.max_tokens in ov.conf to match "
                "your model's actual limit.",
                limit,
            )
            try:
                return self._chunk_and_embed(
                    text, is_query=is_query, override_max_tokens=limit
                )
            except RuntimeError as retry_err:
                # Keep shrinking only while the failure is still
                # length-related and the floor has not been reached.
                if not _is_length_error(retry_err) or limit <= 128:
                    raise
        raise

def embed_batch(self, texts: List[str], is_query: bool = False) -> List[EmbedResult]:
"""Batch embedding with automatic chunking for oversized inputs.
Expand Down
Loading