From 7a83898239f7d73ff815cb91cba4003a968b3d2a Mon Sep 17 00:00:00 2001 From: ZaynJarvis Date: Wed, 18 Mar 2026 02:07:58 +0800 Subject: [PATCH 01/15] feat: add native Google/Gemini embedding support with Parts API - Replace OpenAI-compatible implementation with native Gemini API - Support task-specific embeddings (RETRIEVAL_QUERY, RETRIEVAL_DOCUMENT, etc.) - Add Matryoshka dimension reduction support - Include chunking for oversized texts - Add configuration examples and documentation - Support both simple and key=value parameter formats - Use Parts API for future multimodal capability --- docs/en/guides/01-configuration.md | 29 +- docs/zh/guides/01-configuration.md | 2 +- examples/mcp-query/server.pid | 1 + openviking/models/embedder/__init__.py | 6 +- .../models/embedder/google_embedders.py | 324 ++++++++++++++++++ 5 files changed, 359 insertions(+), 3 deletions(-) create mode 100644 examples/mcp-query/server.pid create mode 100644 openviking/models/embedder/google_embedders.py diff --git a/docs/en/guides/01-configuration.md b/docs/en/guides/01-configuration.md index 7c2779b6..e217c32f 100644 --- a/docs/en/guides/01-configuration.md +++ b/docs/en/guides/01-configuration.md @@ -115,7 +115,7 @@ Embedding model configuration for vector search, supporting dense, sparse, and h | Parameter | Type | Description | |-----------|------|-------------| | `max_concurrent` | int | Maximum concurrent embedding requests (`embedding.max_concurrent`, default: `10`) | -| `provider` | str | `"volcengine"`, `"openai"`, `"vikingdb"`, `"jina"`, or `"voyage"` | +| `provider` | str | `"volcengine"`, `"openai"`, `"vikingdb"`, `"jina"`, `"voyage"`, or `"google"` | | `api_key` | str | API key | | `model` | str | Model name | | `dimension` | int | Vector dimension. 
For Voyage, this maps to `output_dimension` | @@ -128,6 +128,9 @@ Embedding model configuration for vector search, supporting dense, sparse, and h |-------|-----------|------------|-------| | `doubao-embedding-vision-250615` | 1024 | multimodal | Recommended | | `doubao-embedding-250615` | 1024 | text | Text only | +| `gemini-embedding-2-preview` | 3072 | text | Google Gemini Embedding 2 with MRL | +| `text-embedding-004` | 768 | text | Google text embedding model | +| `text-embedding-005` | 768 | text | Latest Google text embedding model | With `input: "multimodal"`, OpenViking can embed text, images (PNG, JPG, etc.), and mixed content. @@ -137,6 +140,7 @@ With `input: "multimodal"`, OpenViking can embed text, images (PNG, JPG, etc.), - `vikingdb`: VikingDB Embedding API - `jina`: Jina AI Embedding API - `voyage`: Voyage AI Embedding API +- `google`: Google/Gemini AI Embedding API **vikingdb provider example:** @@ -192,6 +196,29 @@ Get your API key at https://jina.ai } ``` +**google provider example:** + +```json +{ + "embedding": { + "dense": { + "provider": "google", + "api_key": "your-google-api-key", + "model": "gemini-embedding-2-preview", + "dimension": 1024, + "query_param": "RETRIEVAL_QUERY", + "document_param": "RETRIEVAL_DOCUMENT" + } + } +} +``` + +For Google/Gemini embeddings: +- `query_param` and `document_param` support task-specific embeddings +- Valid task types: `RETRIEVAL_QUERY`, `RETRIEVAL_DOCUMENT`, `SEMANTIC_SIMILARITY`, `CLASSIFICATION`, `CLUSTERING` +- Enhanced format: `"task_type=RETRIEVAL_QUERY,output_dimensionality=1024"` +- Get your API key at https://aistudio.google.com/app/apikey + Supported Voyage text embedding models include: - `voyage-4-lite` - `voyage-4` diff --git a/docs/zh/guides/01-configuration.md b/docs/zh/guides/01-configuration.md index 889c737a..35b77b91 100644 --- a/docs/zh/guides/01-configuration.md +++ b/docs/zh/guides/01-configuration.md @@ -121,7 +121,7 @@ OpenViking 使用 JSON 配置文件(`ov.conf`)进行设置。配置文件支 | 参数 | 类型 | 说明 | 
|------|------|------| | `max_concurrent` | int | 最大并发 Embedding 请求数(`embedding.max_concurrent`,默认:`10`) | -| `provider` | str | `"volcengine"`、`"openai"`、`"vikingdb"` 或 `"jina"` | +| `provider` | str | `"volcengine"`、`"openai"`、`"vikingdb"`、`"jina"`、`"voyage"` 或 `"google"` | | `api_key` | str | API Key | | `model` | str | 模型名称 | | `dimension` | int | 向量维度 | diff --git a/examples/mcp-query/server.pid b/examples/mcp-query/server.pid new file mode 100644 index 00000000..5513e2f2 --- /dev/null +++ b/examples/mcp-query/server.pid @@ -0,0 +1 @@ +95167 diff --git a/openviking/models/embedder/__init__.py b/openviking/models/embedder/__init__.py index b418b809..d17e4a31 100644 --- a/openviking/models/embedder/__init__.py +++ b/openviking/models/embedder/__init__.py @@ -13,6 +13,7 @@ - Volcengine: Dense, Sparse, Hybrid - Jina AI: Dense only - Voyage AI: Dense only +- Google/Gemini: Dense only """ from openviking.models.embedder.base import ( @@ -23,9 +24,9 @@ HybridEmbedderBase, SparseEmbedderBase, ) +from openviking.models.embedder.google_embedders import GoogleDenseEmbedder from openviking.models.embedder.jina_embedders import JinaDenseEmbedder from openviking.models.embedder.openai_embedders import OpenAIDenseEmbedder -from openviking.models.embedder.voyage_embedders import VoyageDenseEmbedder from openviking.models.embedder.vikingdb_embedders import ( VikingDBDenseEmbedder, VikingDBHybridEmbedder, @@ -36,6 +37,7 @@ VolcengineHybridEmbedder, VolcengineSparseEmbedder, ) +from openviking.models.embedder.voyage_embedders import VoyageDenseEmbedder __all__ = [ # Base classes @@ -45,6 +47,8 @@ "SparseEmbedderBase", "HybridEmbedderBase", "CompositeHybridEmbedder", + # Google/Gemini implementations + "GoogleDenseEmbedder", # Jina AI implementations "JinaDenseEmbedder", # OpenAI implementations diff --git a/openviking/models/embedder/google_embedders.py b/openviking/models/embedder/google_embedders.py new file mode 100644 index 00000000..fdbf0e86 --- /dev/null +++ 
b/openviking/models/embedder/google_embedders.py @@ -0,0 +1,324 @@ +# Copyright (c) 2026 Beijing Volcano Engine Technology Co., Ltd. +# SPDX-License-Identifier: Apache-2.0 +"""Google/Gemini AI Embedder Implementation""" + +import logging +from typing import Any, Dict, List, Optional + +import requests + +from openviking.models.embedder.base import ( + DenseEmbedderBase, + EmbedResult, +) + +logger = logging.getLogger(__name__) + +# Default dimensions for Google/Gemini embedding models +GOOGLE_MODEL_DIMENSIONS = { + "gemini-embedding-2-preview": 3072, # Gemini Embedding 2 with MRL support + "text-embedding-004": 768, # Updated Google text embedding model + "text-embedding-005": 768, # Latest Google text embedding model +} + + +class GoogleDenseEmbedder(DenseEmbedderBase): + """Google/Gemini AI Dense Embedder Implementation + + Uses native Google Gemini embedding API with Parts format. + Supports task-specific embeddings and Matryoshka dimension reduction. + Supports both simple task_type values and key=value format for multiple parameters. + + Example: + >>> # Simple usage with query/document task types + >>> embedder = GoogleDenseEmbedder( + ... model_name="gemini-embedding-2-preview", + ... api_key="your-gemini-api-key", + ... dimension=1024, + ... query_param="RETRIEVAL_QUERY", + ... document_param="RETRIEVAL_DOCUMENT" + ... ) + >>> query_result = embedder.embed("Search query", is_query=True) + >>> doc_result = embedder.embed("Document content", is_query=False) + + >>> # Enhanced usage with key=value format + >>> advanced_embedder = GoogleDenseEmbedder( + ... model_name="gemini-embedding-2-preview", + ... api_key="your-gemini-api-key", + ... dimension=1024, + ... query_param="task_type=RETRIEVAL_QUERY,output_dimensionality=1024", + ... document_param="task_type=RETRIEVAL_DOCUMENT,output_dimensionality=1024" + ... 
) + """ + + def __init__( + self, + model_name: str = "gemini-embedding-2-preview", + api_key: Optional[str] = None, + api_base: Optional[str] = None, + dimension: Optional[int] = None, + query_param: Optional[str] = None, + document_param: Optional[str] = None, + config: Optional[Dict[str, Any]] = None, + max_tokens: Optional[int] = None, + extra_headers: Optional[Dict[str, str]] = None, + ): + """Initialize Google/Gemini AI Dense Embedder + + Args: + model_name: Google/Gemini model name, defaults to gemini-embedding-2-preview + api_key: API key, required + api_base: API base URL, defaults to https://generativelanguage.googleapis.com/v1/ + dimension: Dimension for Matryoshka reduction, optional + query_param: Parameter for query-side embeddings. Supports simple task_type + values (e.g., "RETRIEVAL_QUERY") or key=value format + (e.g., "task_type=RETRIEVAL_QUERY,output_dimensionality=1024"). + Valid task_type values: RETRIEVAL_QUERY, RETRIEVAL_DOCUMENT, + SEMANTIC_SIMILARITY, CLASSIFICATION, CLUSTERING + document_param: Parameter for document-side embeddings. Supports simple task_type + values or key=value format. 
+ config: Additional configuration dict + max_tokens: Maximum token count per embedding request, None to use default (8000) + extra_headers: Extra HTTP headers to include in API requests + + Raises: + ValueError: If api_key is not provided + """ + super().__init__(model_name, config) + self.api_key = api_key + self.api_base = api_base or "https://generativelanguage.googleapis.com/v1/" + self.dimension = dimension + self.query_param = query_param + self.document_param = document_param + self.max_tokens = max_tokens or 8000 + self.extra_headers = extra_headers or {} + + if not self.api_key: + raise ValueError("api_key is required") + + # Determine dimension + max_dim = GOOGLE_MODEL_DIMENSIONS.get(model_name, 3072) + if dimension is not None and dimension > max_dim: + raise ValueError( + f"Requested dimension {dimension} exceeds maximum {max_dim} for model '{model_name}'. " + f"Google/Gemini models support Matryoshka dimension reduction up to {max_dim}." + ) + self._dimension = dimension if dimension is not None else max_dim + + def _parse_param_string(self, param: Optional[str]) -> Dict[str, Any]: + """Parse parameter string to dictionary for key=value format + + Args: + param: Parameter string (e.g., "task_type=RETRIEVAL_QUERY,output_dimensionality=1024") + + Returns: + Dictionary of parsed parameters + """ + if not param: + return {} + + result = {} + # Split by comma for multiple parameters + parts = [p.strip() for p in param.split(",")] + + for part in parts: + if "=" in part: + key, value = part.split("=", 1) + key = key.strip() + value = value.strip() + + # Convert numeric values + if key == "output_dimensionality" and value.isdigit(): + result[key] = int(value) + else: + result[key] = value + + return result + + def _build_request_params(self, is_query: bool = False) -> Dict[str, Any]: + """Build request parameters for Google-specific settings + + Args: + is_query: Flag to indicate if this is for query embeddings + + Returns: + Dict containing Google-specific 
parameters + """ + params = {} + + # Determine which parameter to use based on is_query flag + active_param = None + if is_query and self.query_param is not None: + active_param = self.query_param + elif not is_query and self.document_param is not None: + active_param = self.document_param + + if active_param: + if "=" in active_param: + # Parse key=value format (e.g., "task_type=RETRIEVAL_QUERY,output_dimensionality=1024") + parsed = self._parse_param_string(active_param) + params.update(parsed) + else: + # Simple format (e.g., "RETRIEVAL_QUERY" -> {"task_type": "RETRIEVAL_QUERY"}) + params["task_type"] = active_param + + # Add dimension if specified + if self.dimension: + params["output_dimensionality"] = self.dimension + + return params + + def _update_telemetry_token_usage(self, response_data: Dict[str, Any]) -> None: + """Update telemetry with token usage from API response""" + # Google API doesn't return token usage in the same format as OpenAI + # We'll estimate based on text length for now + pass + + def _embed_single(self, text: str, is_query: bool = False) -> EmbedResult: + """Perform raw embedding without chunking logic. + + Args: + text: Input text + is_query: Flag to indicate if this is a query embedding + + Returns: + EmbedResult: Result containing only dense_vector + + Raises: + RuntimeError: When API call fails + """ + try: + # Build the URL for the embedding endpoint + url = f"{self.api_base}models/{self.model_name}:embedContent" + + # Build request headers + headers = {"Content-Type": "application/json", **self.extra_headers} + + # Add API key to headers + if "?" 
in url: + url += f"&key={self.api_key}" + else: + url += f"?key={self.api_key}" + + # Build request body using Parts API + request_body = { + "model": f"models/{self.model_name}", + "content": {"parts": [{"text": text}]}, + } + + # Add task-specific parameters + request_params = self._build_request_params(is_query=is_query) + if request_params: + request_body.update(request_params) + + # Make the API request + response = requests.post(url, json=request_body, headers=headers, timeout=30) + response.raise_for_status() + + response_data = response.json() + + # Extract the embedding vector + if "embedding" in response_data and "values" in response_data["embedding"]: + vector = response_data["embedding"]["values"] + else: + raise RuntimeError(f"Unexpected response format: {response_data}") + + self._update_telemetry_token_usage(response_data) + + return EmbedResult(dense_vector=vector) + + except requests.exceptions.RequestException as e: + raise RuntimeError(f"Google/Gemini API request error: {str(e)}") from e + except Exception as e: + raise RuntimeError(f"Embedding failed: {str(e)}") from e + + def embed(self, text: str, is_query: bool = False) -> EmbedResult: + """Embed single text, with automatic chunking for oversized input. + + Args: + text: Input text + is_query: Flag to indicate if this is a query embedding + + Returns: + EmbedResult: Result containing only dense_vector + + Raises: + RuntimeError: When API call fails + """ + if not text: + return self._embed_single(text, is_query=is_query) + + if self._estimate_tokens(text) > self.max_tokens: + return self._chunk_and_embed(text, is_query=is_query) + + return self._embed_single(text, is_query=is_query) + + def _chunk_and_embed(self, text: str, is_query: bool = False) -> EmbedResult: + """Chunk oversized text and average the embeddings. 
+ + Args: + text: Oversized input text + is_query: Flag to indicate if this is a query embedding + + Returns: + EmbedResult: Result containing only dense_vector (averaged from chunks) + """ + chunks = self._chunk_text(text, self.max_tokens) + chunk_vectors: List[List[float]] = [] + + for chunk in chunks: + result = self._embed_single(chunk, is_query=is_query) + chunk_vectors.append(result.dense_vector) + + # Average the chunk vectors + if not chunk_vectors: + return EmbedResult(dense_vector=[0.0] * self._dimension) + + avg_vector = [ + sum(v[i] for v in chunk_vectors) / len(chunk_vectors) + for i in range(len(chunk_vectors[0])) + ] + + return EmbedResult(dense_vector=avg_vector) + + def embed_batch(self, texts: List[str], is_query: bool = False) -> List[EmbedResult]: + """Batch embedding with automatic chunking for oversized inputs. + + Individual texts are processed sequentially since Google's native API + doesn't support batch requests in the same way as OpenAI-compatible. + Oversized texts are individually chunked and embedded. 
+ + Args: + texts: List of texts + is_query: Flag to indicate if these are query embeddings + + Returns: + List[EmbedResult]: List of embedding results + + Raises: + RuntimeError: When API call fails + """ + if not texts: + return [] + + results: List[EmbedResult] = [] + + # Process each text individually + for text in texts: + if not text or self._estimate_tokens(text) <= self.max_tokens: + result = self._embed_single(text if text else " ", is_query=is_query) + else: + # Handle oversized text with chunking + result = self._chunk_and_embed(text, is_query=is_query) + + results.append(result) + + return results + + def get_dimension(self) -> int: + """Get embedding dimension + + Returns: + int: Vector dimension + """ + return self._dimension From f38d4ec668099dbfcfba5afa8ceb9c16c0206132 Mon Sep 17 00:00:00 2001 From: ZaynJarvis Date: Wed, 18 Mar 2026 02:28:28 +0800 Subject: [PATCH 02/15] fix: correct Google API implementation - Remove stray server.pid file - Fix base URL to https://generativelanguage.googleapis.com/v1beta - Use x-goog-api-key header instead of URL parameter - Remove model field from request body (already in URL) - Follow official Google API format exactly --- examples/mcp-query/server.pid | 1 - .../models/embedder/google_embedders.py | 23 ++++++++----------- 2 files changed, 9 insertions(+), 15 deletions(-) delete mode 100644 examples/mcp-query/server.pid diff --git a/examples/mcp-query/server.pid b/examples/mcp-query/server.pid deleted file mode 100644 index 5513e2f2..00000000 --- a/examples/mcp-query/server.pid +++ /dev/null @@ -1 +0,0 @@ -95167 diff --git a/openviking/models/embedder/google_embedders.py b/openviking/models/embedder/google_embedders.py index fdbf0e86..280b36fb 100644 --- a/openviking/models/embedder/google_embedders.py +++ b/openviking/models/embedder/google_embedders.py @@ -68,7 +68,7 @@ def __init__( Args: model_name: Google/Gemini model name, defaults to gemini-embedding-2-preview api_key: API key, required - api_base: API base 
URL, defaults to https://generativelanguage.googleapis.com/v1/ + api_base: API base URL, defaults to https://generativelanguage.googleapis.com/v1beta dimension: Dimension for Matryoshka reduction, optional query_param: Parameter for query-side embeddings. Supports simple task_type values (e.g., "RETRIEVAL_QUERY") or key=value format @@ -86,7 +86,7 @@ def __init__( """ super().__init__(model_name, config) self.api_key = api_key - self.api_base = api_base or "https://generativelanguage.googleapis.com/v1/" + self.api_base = api_base or "https://generativelanguage.googleapis.com/v1beta" self.dimension = dimension self.query_param = query_param self.document_param = document_param @@ -189,22 +189,17 @@ def _embed_single(self, text: str, is_query: bool = False) -> EmbedResult: """ try: # Build the URL for the embedding endpoint - url = f"{self.api_base}models/{self.model_name}:embedContent" + url = f"{self.api_base}/models/{self.model_name}:embedContent" # Build request headers - headers = {"Content-Type": "application/json", **self.extra_headers} - - # Add API key to headers - if "?" 
in url: - url += f"&key={self.api_key}" - else: - url += f"?key={self.api_key}" + headers = { + "Content-Type": "application/json", + "x-goog-api-key": self.api_key, + **self.extra_headers, + } # Build request body using Parts API - request_body = { - "model": f"models/{self.model_name}", - "content": {"parts": [{"text": text}]}, - } + request_body = {"content": {"parts": [{"text": text}]}} # Add task-specific parameters request_params = self._build_request_params(is_query=is_query) From dee7fa77be700d12aab5a3159639e101cb1c4e84 Mon Sep 17 00:00:00 2001 From: ZaynJarvis Date: Wed, 18 Mar 2026 02:29:59 +0800 Subject: [PATCH 03/15] refactor: simplify to support only Gemini Embedding 2 - Remove support for text-embedding-004 and text-embedding-005 - Focus implementation on gemini-embedding-2-preview only - Add model validation to ensure only supported model is used - Update documentation to reflect single model support - Clarify that this is specifically for Gemini Embedding 2 --- docs/en/guides/01-configuration.md | 2 -- .../models/embedder/google_embedders.py | 29 ++++++++++--------- 2 files changed, 15 insertions(+), 16 deletions(-) diff --git a/docs/en/guides/01-configuration.md b/docs/en/guides/01-configuration.md index e217c32f..33f910eb 100644 --- a/docs/en/guides/01-configuration.md +++ b/docs/en/guides/01-configuration.md @@ -129,8 +129,6 @@ Embedding model configuration for vector search, supporting dense, sparse, and h | `doubao-embedding-vision-250615` | 1024 | multimodal | Recommended | | `doubao-embedding-250615` | 1024 | text | Text only | | `gemini-embedding-2-preview` | 3072 | text | Google Gemini Embedding 2 with MRL | -| `text-embedding-004` | 768 | text | Google text embedding model | -| `text-embedding-005` | 768 | text | Latest Google text embedding model | With `input: "multimodal"`, OpenViking can embed text, images (PNG, JPG, etc.), and mixed content. 
diff --git a/openviking/models/embedder/google_embedders.py b/openviking/models/embedder/google_embedders.py index 280b36fb..3e4b7b14 100644 --- a/openviking/models/embedder/google_embedders.py +++ b/openviking/models/embedder/google_embedders.py @@ -17,22 +17,19 @@ # Default dimensions for Google/Gemini embedding models GOOGLE_MODEL_DIMENSIONS = { "gemini-embedding-2-preview": 3072, # Gemini Embedding 2 with MRL support - "text-embedding-004": 768, # Updated Google text embedding model - "text-embedding-005": 768, # Latest Google text embedding model } class GoogleDenseEmbedder(DenseEmbedderBase): - """Google/Gemini AI Dense Embedder Implementation + """Google Gemini Embedding 2 Dense Embedder Implementation Uses native Google Gemini embedding API with Parts format. + Supports Gemini Embedding 2 (gemini-embedding-2-preview) only. Supports task-specific embeddings and Matryoshka dimension reduction. - Supports both simple task_type values and key=value format for multiple parameters. Example: >>> # Simple usage with query/document task types >>> embedder = GoogleDenseEmbedder( - ... model_name="gemini-embedding-2-preview", ... api_key="your-gemini-api-key", ... dimension=1024, ... query_param="RETRIEVAL_QUERY", @@ -43,7 +40,6 @@ class GoogleDenseEmbedder(DenseEmbedderBase): >>> # Enhanced usage with key=value format >>> advanced_embedder = GoogleDenseEmbedder( - ... model_name="gemini-embedding-2-preview", ... api_key="your-gemini-api-key", ... dimension=1024, ... 
query_param="task_type=RETRIEVAL_QUERY,output_dimensionality=1024", @@ -63,13 +59,13 @@ def __init__( max_tokens: Optional[int] = None, extra_headers: Optional[Dict[str, str]] = None, ): - """Initialize Google/Gemini AI Dense Embedder + """Initialize Google Gemini Embedding 2 Dense Embedder Args: - model_name: Google/Gemini model name, defaults to gemini-embedding-2-preview - api_key: API key, required + model_name: Must be "gemini-embedding-2-preview" (default and only supported model) + api_key: Google API key, required api_base: API base URL, defaults to https://generativelanguage.googleapis.com/v1beta - dimension: Dimension for Matryoshka reduction, optional + dimension: Dimension for Matryoshka reduction, optional (max 3072) query_param: Parameter for query-side embeddings. Supports simple task_type values (e.g., "RETRIEVAL_QUERY") or key=value format (e.g., "task_type=RETRIEVAL_QUERY,output_dimensionality=1024"). @@ -82,7 +78,7 @@ def __init__( extra_headers: Extra HTTP headers to include in API requests Raises: - ValueError: If api_key is not provided + ValueError: If api_key is not provided or unsupported model is specified """ super().__init__(model_name, config) self.api_key = api_key @@ -96,12 +92,17 @@ def __init__( if not self.api_key: raise ValueError("api_key is required") - # Determine dimension - max_dim = GOOGLE_MODEL_DIMENSIONS.get(model_name, 3072) + # Determine dimension - only support gemini-embedding-2-preview + if model_name not in GOOGLE_MODEL_DIMENSIONS: + raise ValueError( + f"Unsupported model '{model_name}'. Only 'gemini-embedding-2-preview' is supported." + ) + + max_dim = GOOGLE_MODEL_DIMENSIONS[model_name] if dimension is not None and dimension > max_dim: raise ValueError( f"Requested dimension {dimension} exceeds maximum {max_dim} for model '{model_name}'. " - f"Google/Gemini models support Matryoshka dimension reduction up to {max_dim}." + f"Gemini Embedding 2 supports Matryoshka dimension reduction up to {max_dim}." 
) self._dimension = dimension if dimension is not None else max_dim From ddd4b363e8071a11849e1e2719154e2d6dc2dd3b Mon Sep 17 00:00:00 2001 From: ZaynJarvis Date: Wed, 18 Mar 2026 09:32:02 +0800 Subject: [PATCH 04/15] docs: add comprehensive Google/Gemini Embedding 2 test guide - Covers basic functionality, advanced features, error handling - Includes 11 test scenarios with expected outcomes - Provides configuration examples and debug commands - Ready for real-world testing with provided API key --- GOOGLE_EMBEDDING_TEST_GUIDE.md | 289 +++++++++++++++++++++++++++++++++ 1 file changed, 289 insertions(+) create mode 100644 GOOGLE_EMBEDDING_TEST_GUIDE.md diff --git a/GOOGLE_EMBEDDING_TEST_GUIDE.md b/GOOGLE_EMBEDDING_TEST_GUIDE.md new file mode 100644 index 00000000..e0526253 --- /dev/null +++ b/GOOGLE_EMBEDDING_TEST_GUIDE.md @@ -0,0 +1,289 @@ +# Google/Gemini Embedding 2 Test Guide + +This guide provides step-by-step instructions to test the new Google/Gemini Embedding 2 implementation. + +## Prerequisites + +- Google API Key: `your-google-api-key` +- Branch: `feat/google-embedding-native-api` +- PR: https://github.com/volcengine/OpenViking/pull/718 + +## 1. Environment Setup + +```bash +# Navigate to OpenViking directory +cd ~/code/openviking + +# Ensure you're on the correct branch +git checkout feat/google-embedding-native-api +git pull fork feat/google-embedding-native-api + +# Verify the Google embedder implementation exists +ls -la openviking/models/embedder/google_embedders.py +``` + +## 2. Build and Install + +```bash +# Install OpenViking from source (development mode) +pip install -e . + +# Verify installation +ov --version +``` + +## 3. 
Configuration + +Create test configuration file `~/.openviking/test-google.conf`: + +```json +{ + "storage": { + "workspace": "./test-google-data", + "vectordb": { + "name": "test_google_context", + "backend": "local" + }, + "agfs": { + "port": 1834, + "log_level": "info", + "backend": "local" + } + }, + "embedding": { + "dense": { + "provider": "google", + "api_key": "your-google-api-key", + "model": "gemini-embedding-2-preview", + "dimension": 1024, + "query_param": "RETRIEVAL_QUERY", + "document_param": "RETRIEVAL_DOCUMENT" + } + }, + "vlm": { + "provider": "openai", + "api_key": "dummy-key", + "model": "gpt-4" + } +} +``` + +## 4. Basic Functionality Tests + +### Test 1: Configuration Validation + +```bash +# Test configuration loading +ov --config ~/.openviking/test-google.conf info + +# Expected: Should load without errors and show Google provider +``` + +### Test 2: Add Memory (Single Document) + +```bash +# Create test content +echo "The Google Gemini Embedding 2 model supports Matryoshka dimension reduction and task-specific embeddings for improved retrieval performance." > test-content.txt + +# Add to memory +ov --config ~/.openviking/test-google.conf add-memory test-content.txt + +# Expected: Should embed and store without errors +# Check for API calls in logs +``` + +### Test 3: Search Basic + +```bash +# Search for related content +ov --config ~/.openviking/test-google.conf search "Gemini embedding model" + +# Expected: Should find the added content with good relevance score +``` + +### Test 4: Add Multiple Documents + +```bash +# Create multiple test files +echo "Machine learning models require high-quality embeddings for semantic understanding." > ml-content.txt +echo "Vector databases store and retrieve embeddings efficiently for similarity search." > vector-content.txt +echo "Natural language processing uses embeddings to represent text in high-dimensional space." 
> nlp-content.txt + +# Add all to memory +ov --config ~/.openviking/test-google.conf add-memory ml-content.txt vector-content.txt nlp-content.txt + +# Expected: Should process all files successfully +``` + +### Test 5: Search with Different Queries + +```bash +# Test various search queries +ov --config ~/.openviking/test-google.conf search "machine learning" +ov --config ~/.openviking/test-google.conf search "vector search" +ov --config ~/.openviking/test-google.conf search "text representation" + +# Expected: Should return relevant results with proper ranking +``` + +## 5. Advanced Feature Tests + +### Test 6: Different Dimensions + +Create `~/.openviking/test-google-512.conf` with `"dimension": 512`: + +```bash +# Test with reduced dimensions (Matryoshka) +ov --config ~/.openviking/test-google-512.conf add-memory test-content.txt +ov --config ~/.openviking/test-google-512.conf search "Gemini" + +# Expected: Should work with smaller dimension vectors +``` + +### Test 7: Task-Specific Parameters + +Create `~/.openviking/test-google-enhanced.conf` with enhanced params: + +```json +{ + "embedding": { + "dense": { + "provider": "google", + "api_key": "your-google-api-key", + "model": "gemini-embedding-2-preview", + "dimension": 1024, + "query_param": "task_type=RETRIEVAL_QUERY,output_dimensionality=1024", + "document_param": "task_type=RETRIEVAL_DOCUMENT,output_dimensionality=1024" + } + } +} +``` + +```bash +# Test enhanced parameter format +ov --config ~/.openviking/test-google-enhanced.conf add-memory test-content.txt +ov --config ~/.openviking/test-google-enhanced.conf search "Gemini" + +# Expected: Should use enhanced parameter format successfully +``` + +### Test 8: Large Text Chunking + +```bash +# Create large text file (>8000 tokens) +python3 -c " +text = 'This is a test of chunking functionality for very long documents. 
' * 200 +with open('large-content.txt', 'w') as f: + f.write(text) +" + +# Test chunking +ov --config ~/.openviking/test-google.conf add-memory large-content.txt + +# Expected: Should handle chunking automatically without errors +``` + +## 6. Error Handling Tests + +### Test 9: Invalid API Key + +Create `~/.openviking/test-google-badkey.conf` with invalid API key: + +```bash +# Test with bad API key +ov --config ~/.openviking/test-google-badkey.conf add-memory test-content.txt + +# Expected: Should fail gracefully with clear error message +``` + +### Test 10: Invalid Model + +Create config with `"model": "invalid-model"`: + +```bash +# Test with unsupported model +ov --config ~/.openviking/test-google-badmodel.conf add-memory test-content.txt + +# Expected: Should fail with model validation error +``` + +## 7. Performance Tests + +### Test 11: Batch Processing + +```bash +# Create multiple files +for i in {1..10}; do + echo "Test document number $i with unique content about topic $i." > batch-test-$i.txt +done + +# Time the batch operation +time ov --config ~/.openviking/test-google.conf add-memory batch-test-*.txt + +# Expected: Should process efficiently without rate limiting issues +``` + +## 8. Verification Checklist + +- [ ] Configuration loads without errors +- [ ] Single document embedding works +- [ ] Search returns relevant results +- [ ] Multiple documents can be added +- [ ] Different search queries work properly +- [ ] Matryoshka dimension reduction (512 dims) works +- [ ] Enhanced parameter format works +- [ ] Large text chunking works automatically +- [ ] Invalid API key fails gracefully +- [ ] Invalid model fails with validation error +- [ ] Batch processing works efficiently +- [ ] No memory leaks or hanging processes +- [ ] API calls use correct Google endpoint format +- [ ] Vector dimensions match configuration + +## 9. 
Debug Commands + +If issues arise: + +```bash +# Check logs with debug level +ov --config ~/.openviking/test-google.conf --log-level debug add-memory test-content.txt + +# Check vector database +ls -la test-google-data/ + +# Check API calls (if logging enabled) +# Look for requests to generativelanguage.googleapis.com/v1beta +``` + +## 10. Expected API Format + +The implementation should make calls like: + +```bash +# Verify this format is used (check logs or network traffic) +curl "https://generativelanguage.googleapis.com/v1beta/models/gemini-embedding-2-preview:embedContent" \ + -H "Content-Type: application/json" \ + -H "x-goog-api-key: your-google-api-key" \ + -d '{ "content": { "parts": [ {"text": "test"} ] } }' +``` + +## Cleanup + +```bash +# Remove test data +rm -rf test-google-data/ +rm test-content.txt ml-content.txt vector-content.txt nlp-content.txt large-content.txt batch-test-*.txt +rm ~/.openviking/test-google*.conf +``` + +## Report Template + +**Test Results:** +- [ ] All basic tests passed +- [ ] Advanced features work correctly +- [ ] Error handling is appropriate +- [ ] Performance is acceptable +- [ ] Issues found: _[list any issues]_ +- [ ] Fixes needed: _[list any required fixes]_ + +**Notes:** _[Add any additional observations]_ \ No newline at end of file From 2a896ffcda199acf72bcd0a609f928f95407e23c Mon Sep 17 00:00:00 2001 From: "zhiheng.liu" Date: Wed, 18 Mar 2026 11:44:57 +0800 Subject: [PATCH 05/15] feat: complete local test and fix some issues --- .../models/embedder/google_embedders.py | 27 ++++++++++------ .../utils/config/embedding_config.py | 31 ++++++++++++++++--- 2 files changed, 45 insertions(+), 13 deletions(-) diff --git a/openviking/models/embedder/google_embedders.py b/openviking/models/embedder/google_embedders.py index 3e4b7b14..482b01d9 100644 --- a/openviking/models/embedder/google_embedders.py +++ b/openviking/models/embedder/google_embedders.py @@ -74,7 +74,7 @@ def __init__( document_param: 
Parameter for document-side embeddings. Supports simple task_type values or key=value format. config: Additional configuration dict - max_tokens: Maximum token count per embedding request, None to use default (8000) + max_tokens: Maximum token count per embedding request, None to use default (8192) extra_headers: Extra HTTP headers to include in API requests Raises: @@ -86,7 +86,7 @@ def __init__( self.dimension = dimension self.query_param = query_param self.document_param = document_param - self.max_tokens = max_tokens or 8000 + self._max_tokens = max_tokens or 8192 self.extra_headers = extra_headers or {} if not self.api_key: @@ -122,15 +122,21 @@ def _parse_param_string(self, param: Optional[str]) -> Dict[str, Any]: # Split by comma for multiple parameters parts = [p.strip() for p in param.split(",")] + # Map snake_case keys to camelCase as required by Google API + key_map = {"task_type": "taskType"} + for part in parts: if "=" in part: key, value = part.split("=", 1) key = key.strip() value = value.strip() + key = key_map.get(key, key) - # Convert numeric values + # Convert numeric values and uppercase task type if key == "output_dimensionality" and value.isdigit(): result[key] = int(value) + elif key == "taskType": + result[key] = value.upper() else: result[key] = value @@ -160,8 +166,8 @@ def _build_request_params(self, is_query: bool = False) -> Dict[str, Any]: parsed = self._parse_param_string(active_param) params.update(parsed) else: - # Simple format (e.g., "RETRIEVAL_QUERY" -> {"task_type": "RETRIEVAL_QUERY"}) - params["task_type"] = active_param + # Simple format (e.g., "retrieval_query" -> {"taskType": "RETRIEVAL_QUERY"}) + params["taskType"] = active_param.upper() # Add dimension if specified if self.dimension: @@ -241,8 +247,8 @@ def embed(self, text: str, is_query: bool = False) -> EmbedResult: Raises: RuntimeError: When API call fails """ - if not text: - return self._embed_single(text, is_query=is_query) + if not text or not text.strip(): + 
return EmbedResult() if self._estimate_tokens(text) > self.max_tokens: return self._chunk_and_embed(text, is_query=is_query) @@ -301,8 +307,11 @@ def embed_batch(self, texts: List[str], is_query: bool = False) -> List[EmbedRes # Process each text individually for text in texts: - if not text or self._estimate_tokens(text) <= self.max_tokens: - result = self._embed_single(text if text else " ", is_query=is_query) + if not text or not text.strip(): + results.append(EmbedResult()) + continue + if self._estimate_tokens(text) <= self.max_tokens: + result = self._embed_single(text, is_query=is_query) else: # Handle oversized text with chunking result = self._chunk_and_embed(text, is_query=is_query) diff --git a/openviking_cli/utils/config/embedding_config.py b/openviking_cli/utils/config/embedding_config.py index 3f504d40..40effe9a 100644 --- a/openviking_cli/utils/config/embedding_config.py +++ b/openviking_cli/utils/config/embedding_config.py @@ -37,7 +37,7 @@ class EmbeddingModelConfig(BaseModel): provider: Optional[str] = Field( default="volcengine", description=( - "Provider type: 'openai', 'volcengine', 'vikingdb', 'jina', 'ollama', 'voyage'. " + "Provider type: 'openai', 'volcengine', 'vikingdb', 'jina', 'ollama', 'voyage', 'google'. " "For OpenRouter or other OpenAI-compatible providers, use 'openai' with " "api_base and extra_headers." ), @@ -93,10 +93,10 @@ def validate_config(self): if not self.provider: raise ValueError("Embedding provider is required") - if self.provider not in ["openai", "volcengine", "vikingdb", "jina", "ollama", "voyage"]: + if self.provider not in ["openai", "volcengine", "vikingdb", "jina", "ollama", "voyage", "google"]: raise ValueError( f"Invalid embedding provider: '{self.provider}'. 
Must be one of: " - "'openai', 'volcengine', 'vikingdb', 'jina', 'ollama', 'voyage'" + "'openai', 'volcengine', 'vikingdb', 'jina', 'ollama', 'voyage', 'google'" ) # Provider-specific validation @@ -135,6 +135,10 @@ def validate_config(self): if not self.api_key: raise ValueError("Voyage provider requires 'api_key' to be set") + elif self.provider == "google": + if not self.api_key: + raise ValueError("Google provider requires 'api_key' to be set") + return self def get_effective_dimension(self) -> int: @@ -150,6 +154,11 @@ def get_effective_dimension(self) -> int: return get_voyage_model_default_dimension(self.model) + if provider == "google": + from openviking.models.embedder.google_embedders import GOOGLE_MODEL_DIMENSIONS + + return GOOGLE_MODEL_DIMENSIONS.get(self.model, 3072) + return 2048 @@ -193,7 +202,7 @@ def _create_embedder( """Factory method to create embedder instance based on provider and type. Args: - provider: Provider type ('openai', 'volcengine', 'vikingdb', 'jina', 'ollama', 'voyage') + provider: Provider type ('openai', 'volcengine', 'vikingdb', 'jina', 'ollama', 'voyage', 'google') embedder_type: Embedder type ('dense', 'sparse', 'hybrid') config: EmbeddingModelConfig instance @@ -204,6 +213,7 @@ def _create_embedder( ValueError: If provider/type combination is not supported """ from openviking.models.embedder import ( + GoogleDenseEmbedder, JinaDenseEmbedder, OpenAIDenseEmbedder, VikingDBDenseEmbedder, @@ -328,6 +338,19 @@ def _create_embedder( "dimension": cfg.dimension, }, ), + ("google", "dense"): ( + GoogleDenseEmbedder, + lambda cfg: { + "model_name": cfg.model, + "api_key": cfg.api_key, + "api_base": cfg.api_base, + "dimension": cfg.dimension, + **({"query_param": cfg.query_param} if cfg.query_param else {}), + **({"document_param": cfg.document_param} if cfg.document_param else {}), + "max_tokens": cfg.max_tokens, + **({"extra_headers": cfg.extra_headers} if cfg.extra_headers else {}), + }, + ), } key = (provider, embedder_type) From 
77643a768449dbe0d63818ac90a467913a6e11cd Mon Sep 17 00:00:00 2001 From: "zhiheng.liu" Date: Thu, 19 Mar 2026 11:53:11 +0800 Subject: [PATCH 06/15] fix: lint issues --- openviking_cli/utils/config/embedding_config.py | 10 +++++++++- 1 file changed, 9 insertions(+), 1 deletion(-) diff --git a/openviking_cli/utils/config/embedding_config.py b/openviking_cli/utils/config/embedding_config.py index 40effe9a..0286e756 100644 --- a/openviking_cli/utils/config/embedding_config.py +++ b/openviking_cli/utils/config/embedding_config.py @@ -93,7 +93,15 @@ def validate_config(self): if not self.provider: raise ValueError("Embedding provider is required") - if self.provider not in ["openai", "volcengine", "vikingdb", "jina", "ollama", "voyage", "google"]: + if self.provider not in [ + "openai", + "volcengine", + "vikingdb", + "jina", + "ollama", + "voyage", + "google", + ]: raise ValueError( f"Invalid embedding provider: '{self.provider}'. Must be one of: " "'openai', 'volcengine', 'vikingdb', 'jina', 'ollama', 'voyage', 'google'" From 66f09dabf0749bfcd14ea335fc4c7ff45c78004a Mon Sep 17 00:00:00 2001 From: "zhiheng.liu" Date: Thu, 19 Mar 2026 11:55:15 +0800 Subject: [PATCH 07/15] chore: remove redundant md --- GOOGLE_EMBEDDING_TEST_GUIDE.md | 289 --------------------------------- 1 file changed, 289 deletions(-) delete mode 100644 GOOGLE_EMBEDDING_TEST_GUIDE.md diff --git a/GOOGLE_EMBEDDING_TEST_GUIDE.md b/GOOGLE_EMBEDDING_TEST_GUIDE.md deleted file mode 100644 index e0526253..00000000 --- a/GOOGLE_EMBEDDING_TEST_GUIDE.md +++ /dev/null @@ -1,289 +0,0 @@ -# Google/Gemini Embedding 2 Test Guide - -This guide provides step-by-step instructions to test the new Google/Gemini Embedding 2 implementation. - -## Prerequisites - -- Google API Key: `AIzaSyDjMf_tdi8d3Gmbe0LNvJzlA-6ui9dCaio` -- Branch: `feat/google-embedding-native-api` -- PR: https://github.com/volcengine/OpenViking/pull/718 - -## 1. 
Environment Setup - -```bash -# Navigate to OpenViking directory -cd ~/code/openviking - -# Ensure you're on the correct branch -git checkout feat/google-embedding-native-api -git pull fork feat/google-embedding-native-api - -# Verify the Google embedder implementation exists -ls -la openviking/models/embedder/google_embedders.py -``` - -## 2. Build and Install - -```bash -# Install OpenViking from source (development mode) -pip install -e . - -# Verify installation -ov --version -``` - -## 3. Configuration - -Create test configuration file `~/.openviking/test-google.conf`: - -```json -{ - "storage": { - "workspace": "./test-google-data", - "vectordb": { - "name": "test_google_context", - "backend": "local" - }, - "agfs": { - "port": 1834, - "log_level": "info", - "backend": "local" - } - }, - "embedding": { - "dense": { - "provider": "google", - "api_key": "AIzaSyDjMf_tdi8d3Gmbe0LNvJzlA-6ui9dCaio", - "model": "gemini-embedding-2-preview", - "dimension": 1024, - "query_param": "RETRIEVAL_QUERY", - "document_param": "RETRIEVAL_DOCUMENT" - } - }, - "vlm": { - "provider": "openai", - "api_key": "dummy-key", - "model": "gpt-4" - } -} -``` - -## 4. Basic Functionality Tests - -### Test 1: Configuration Validation - -```bash -# Test configuration loading -ov --config ~/.openviking/test-google.conf info - -# Expected: Should load without errors and show Google provider -``` - -### Test 2: Add Memory (Single Document) - -```bash -# Create test content -echo "The Google Gemini Embedding 2 model supports Matryoshka dimension reduction and task-specific embeddings for improved retrieval performance." 
> test-content.txt - -# Add to memory -ov --config ~/.openviking/test-google.conf add-memory test-content.txt - -# Expected: Should embed and store without errors -# Check for API calls in logs -``` - -### Test 3: Search Basic - -```bash -# Search for related content -ov --config ~/.openviking/test-google.conf search "Gemini embedding model" - -# Expected: Should find the added content with good relevance score -``` - -### Test 4: Add Multiple Documents - -```bash -# Create multiple test files -echo "Machine learning models require high-quality embeddings for semantic understanding." > ml-content.txt -echo "Vector databases store and retrieve embeddings efficiently for similarity search." > vector-content.txt -echo "Natural language processing uses embeddings to represent text in high-dimensional space." > nlp-content.txt - -# Add all to memory -ov --config ~/.openviking/test-google.conf add-memory ml-content.txt vector-content.txt nlp-content.txt - -# Expected: Should process all files successfully -``` - -### Test 5: Search with Different Queries - -```bash -# Test various search queries -ov --config ~/.openviking/test-google.conf search "machine learning" -ov --config ~/.openviking/test-google.conf search "vector search" -ov --config ~/.openviking/test-google.conf search "text representation" - -# Expected: Should return relevant results with proper ranking -``` - -## 5. 
Advanced Feature Tests - -### Test 6: Different Dimensions - -Create `~/.openviking/test-google-512.conf` with `"dimension": 512`: - -```bash -# Test with reduced dimensions (Matryoshka) -ov --config ~/.openviking/test-google-512.conf add-memory test-content.txt -ov --config ~/.openviking/test-google-512.conf search "Gemini" - -# Expected: Should work with smaller dimension vectors -``` - -### Test 7: Task-Specific Parameters - -Create `~/.openviking/test-google-enhanced.conf` with enhanced params: - -```json -{ - "embedding": { - "dense": { - "provider": "google", - "api_key": "AIzaSyDjMf_tdi8d3Gmbe0LNvJzlA-6ui9dCaio", - "model": "gemini-embedding-2-preview", - "dimension": 1024, - "query_param": "task_type=RETRIEVAL_QUERY,output_dimensionality=1024", - "document_param": "task_type=RETRIEVAL_DOCUMENT,output_dimensionality=1024" - } - } -} -``` - -```bash -# Test enhanced parameter format -ov --config ~/.openviking/test-google-enhanced.conf add-memory test-content.txt -ov --config ~/.openviking/test-google-enhanced.conf search "Gemini" - -# Expected: Should use enhanced parameter format successfully -``` - -### Test 8: Large Text Chunking - -```bash -# Create large text file (>8000 tokens) -python3 -c " -text = 'This is a test of chunking functionality for very long documents. ' * 200 -with open('large-content.txt', 'w') as f: - f.write(text) -" - -# Test chunking -ov --config ~/.openviking/test-google.conf add-memory large-content.txt - -# Expected: Should handle chunking automatically without errors -``` - -## 6. 
Error Handling Tests - -### Test 9: Invalid API Key - -Create `~/.openviking/test-google-badkey.conf` with invalid API key: - -```bash -# Test with bad API key -ov --config ~/.openviking/test-google-badkey.conf add-memory test-content.txt - -# Expected: Should fail gracefully with clear error message -``` - -### Test 10: Invalid Model - -Create config with `"model": "invalid-model"`: - -```bash -# Test with unsupported model -ov --config ~/.openviking/test-google-badmodel.conf add-memory test-content.txt - -# Expected: Should fail with model validation error -``` - -## 7. Performance Tests - -### Test 11: Batch Processing - -```bash -# Create multiple files -for i in {1..10}; do - echo "Test document number $i with unique content about topic $i." > batch-test-$i.txt -done - -# Time the batch operation -time ov --config ~/.openviking/test-google.conf add-memory batch-test-*.txt - -# Expected: Should process efficiently without rate limiting issues -``` - -## 8. Verification Checklist - -- [ ] Configuration loads without errors -- [ ] Single document embedding works -- [ ] Search returns relevant results -- [ ] Multiple documents can be added -- [ ] Different search queries work properly -- [ ] Matryoshka dimension reduction (512 dims) works -- [ ] Enhanced parameter format works -- [ ] Large text chunking works automatically -- [ ] Invalid API key fails gracefully -- [ ] Invalid model fails with validation error -- [ ] Batch processing works efficiently -- [ ] No memory leaks or hanging processes -- [ ] API calls use correct Google endpoint format -- [ ] Vector dimensions match configuration - -## 9. Debug Commands - -If issues arise: - -```bash -# Check logs with debug level -ov --config ~/.openviking/test-google.conf --log-level debug add-memory test-content.txt - -# Check vector database -ls -la test-google-data/ - -# Check API calls (if logging enabled) -# Look for requests to generativelanguage.googleapis.com/v1beta -``` - -## 10. 
Expected API Format - -The implementation should make calls like: - -```bash -# Verify this format is used (check logs or network traffic) -curl "https://generativelanguage.googleapis.com/v1beta/models/gemini-embedding-2-preview:embedContent" \ - -H "Content-Type: application/json" \ - -H "x-goog-api-key: AIzaSyDjMf_tdi8d3Gmbe0LNvJzlA-6ui9dCaio" \ - -d '{ "content": { "parts": [ {"text": "test"} ] } }' -``` - -## Cleanup - -```bash -# Remove test data -rm -rf test-google-data/ -rm test-content.txt ml-content.txt vector-content.txt nlp-content.txt large-content.txt batch-test-*.txt -rm ~/.openviking/test-google*.conf -``` - -## Report Template - -**Test Results:** -- [ ] All basic tests passed -- [ ] Advanced features work correctly -- [ ] Error handling is appropriate -- [ ] Performance is acceptable -- [ ] Issues found: _[list any issues]_ -- [ ] Fixes needed: _[list any required fixes]_ - -**Notes:** _[Add any additional observations]_ \ No newline at end of file From 605bf2b982806bc4a4de99dd4f6f07e07eba8391 Mon Sep 17 00:00:00 2001 From: "zhiheng.liu" Date: Thu, 19 Mar 2026 12:31:33 +0800 Subject: [PATCH 08/15] fix: address code review issues in GoogleDenseEmbedder - Fix _chunk_text called with extra arg (real bug: base method only accepts text) - Fix inconsistent API key naming: output_dimensionality -> outputDimensionality - Add exponential_backoff_retry for transient network failures - Add Provider column to docs model table for clarity --- docs/en/guides/01-configuration.md | 10 ++++---- .../models/embedder/google_embedders.py | 24 +++++++++++++------ 2 files changed, 22 insertions(+), 12 deletions(-) diff --git a/docs/en/guides/01-configuration.md b/docs/en/guides/01-configuration.md index 33f910eb..a9d4c3ff 100644 --- a/docs/en/guides/01-configuration.md +++ b/docs/en/guides/01-configuration.md @@ -124,11 +124,11 @@ Embedding model configuration for vector search, supporting dense, sparse, and h **Available Models** -| Model | Dimension | Input Type | 
Notes | -|-------|-----------|------------|-------| -| `doubao-embedding-vision-250615` | 1024 | multimodal | Recommended | -| `doubao-embedding-250615` | 1024 | text | Text only | -| `gemini-embedding-2-preview` | 3072 | text | Google Gemini Embedding 2 with MRL | +| Provider | Model | Dimension | Input Type | Notes | +|----------|-------|-----------|------------|-------| +| `volcengine` | `doubao-embedding-vision-250615` | 1024 | multimodal | Recommended | +| `volcengine` | `doubao-embedding-250615` | 1024 | text | Text only | +| `google` | `gemini-embedding-2-preview` | 3072 | text | Google Gemini Embedding 2 with MRL | With `input: "multimodal"`, OpenViking can embed text, images (PNG, JPG, etc.), and mixed content. diff --git a/openviking/models/embedder/google_embedders.py b/openviking/models/embedder/google_embedders.py index 482b01d9..66fb6dcd 100644 --- a/openviking/models/embedder/google_embedders.py +++ b/openviking/models/embedder/google_embedders.py @@ -10,6 +10,7 @@ from openviking.models.embedder.base import ( DenseEmbedderBase, EmbedResult, + exponential_backoff_retry, ) logger = logging.getLogger(__name__) @@ -123,7 +124,7 @@ def _parse_param_string(self, param: Optional[str]) -> Dict[str, Any]: parts = [p.strip() for p in param.split(",")] # Map snake_case keys to camelCase as required by Google API - key_map = {"task_type": "taskType"} + key_map = {"task_type": "taskType", "output_dimensionality": "outputDimensionality"} for part in parts: if "=" in part: @@ -133,7 +134,7 @@ def _parse_param_string(self, param: Optional[str]) -> Dict[str, Any]: key = key_map.get(key, key) # Convert numeric values and uppercase task type - if key == "output_dimensionality" and value.isdigit(): + if key == "outputDimensionality" and value.isdigit(): result[key] = int(value) elif key == "taskType": result[key] = value.upper() @@ -171,7 +172,7 @@ def _build_request_params(self, is_query: bool = False) -> Dict[str, Any]: # Add dimension if specified if self.dimension: 
- params["output_dimensionality"] = self.dimension + params["outputDimensionality"] = self.dimension return params @@ -213,9 +214,18 @@ def _embed_single(self, text: str, is_query: bool = False) -> EmbedResult: if request_params: request_body.update(request_params) - # Make the API request - response = requests.post(url, json=request_body, headers=headers, timeout=30) - response.raise_for_status() + # Make the API request with retry on transient failures + def _do_request(): + resp = requests.post(url, json=request_body, headers=headers, timeout=30) + resp.raise_for_status() + return resp + + response = exponential_backoff_retry( + _do_request, + is_retryable=lambda e: isinstance(e, requests.exceptions.ConnectionError) + or isinstance(e, requests.exceptions.Timeout), + logger=logger, + ) response_data = response.json() @@ -265,7 +275,7 @@ def _chunk_and_embed(self, text: str, is_query: bool = False) -> EmbedResult: Returns: EmbedResult: Result containing only dense_vector (averaged from chunks) """ - chunks = self._chunk_text(text, self.max_tokens) + chunks = self._chunk_text(text) chunk_vectors: List[List[float]] = [] for chunk in chunks: From 561eb5225dc6a24281926e9f668b93f852662083 Mon Sep 17 00:00:00 2001 From: "zhiheng.liu" Date: Thu, 19 Mar 2026 13:17:47 +0800 Subject: [PATCH 09/15] revert: restore output_dimensionality per Google API spec --- openviking/models/embedder/google_embedders.py | 6 +++--- 1 file changed, 3 insertions(+), 3 deletions(-) diff --git a/openviking/models/embedder/google_embedders.py b/openviking/models/embedder/google_embedders.py index 66fb6dcd..21afb753 100644 --- a/openviking/models/embedder/google_embedders.py +++ b/openviking/models/embedder/google_embedders.py @@ -124,7 +124,7 @@ def _parse_param_string(self, param: Optional[str]) -> Dict[str, Any]: parts = [p.strip() for p in param.split(",")] # Map snake_case keys to camelCase as required by Google API - key_map = {"task_type": "taskType", "output_dimensionality": 
"outputDimensionality"} + key_map = {"task_type": "taskType"} for part in parts: if "=" in part: @@ -134,7 +134,7 @@ def _parse_param_string(self, param: Optional[str]) -> Dict[str, Any]: key = key_map.get(key, key) # Convert numeric values and uppercase task type - if key == "outputDimensionality" and value.isdigit(): + if key == "output_dimensionality" and value.isdigit(): result[key] = int(value) elif key == "taskType": result[key] = value.upper() @@ -172,7 +172,7 @@ def _build_request_params(self, is_query: bool = False) -> Dict[str, Any]: # Add dimension if specified if self.dimension: - params["outputDimensionality"] = self.dimension + params["output_dimensionality"] = self.dimension return params From bb626b6baca02e799a77e01b9e08331c243745e7 Mon Sep 17 00:00:00 2001 From: "zhiheng.liu" Date: Thu, 19 Mar 2026 13:25:05 +0800 Subject: [PATCH 10/15] fix: restore missing methods and field after main revert MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit - Add max_tokens property, _estimate_tokens, _chunk_text (+ helpers) to GoogleDenseEmbedder — these were removed from base class in main - Restore max_tokens field on EmbeddingModelConfig for google factory --- .../models/embedder/google_embedders.py | 72 +++++++++++++++++++ .../utils/config/embedding_config.py | 4 ++ 2 files changed, 76 insertions(+) diff --git a/openviking/models/embedder/google_embedders.py b/openviking/models/embedder/google_embedders.py index 21afb753..3f7e3414 100644 --- a/openviking/models/embedder/google_embedders.py +++ b/openviking/models/embedder/google_embedders.py @@ -3,6 +3,7 @@ """Google/Gemini AI Embedder Implementation""" import logging +import re from typing import Any, Dict, List, Optional import requests @@ -107,6 +108,77 @@ def __init__( ) self._dimension = dimension if dimension is not None else max_dim + @property + def max_tokens(self) -> int: + """Maximum token count per embedding request.""" + return self._max_tokens + + def 
_estimate_tokens(self, text: str) -> int: + """Estimate token count. Falls back to character-based heuristic if tiktoken unavailable.""" + try: + import tiktoken + + enc = tiktoken.encoding_for_model(self.model_name) + return len(enc.encode(text)) + except Exception: + return max(len(text) // 3, len(text.encode("utf-8")) // 4) + + def _chunk_text(self, text: str) -> List[str]: + """Split text into chunks each within max_tokens. + + Splitting priority: paragraphs (\\n\\n) > sentences (。.!?\\n) > fixed length. + """ + max_tok = self.max_tokens + if self._estimate_tokens(text) <= max_tok: + return [text] + + paragraphs = text.split("\n\n") + if len(paragraphs) > 1: + chunks = self._merge_segments(paragraphs, max_tok, "\n\n") + if all(self._estimate_tokens(c) <= max_tok for c in chunks): + return chunks + + sentences = re.split(r"(?<=[。.!?\n])", text) + sentences = [s for s in sentences if s] + if len(sentences) > 1: + chunks = self._merge_segments(sentences, max_tok, "") + if all(self._estimate_tokens(c) <= max_tok for c in chunks): + return chunks + + return self._fixed_length_split(text, max_tok) + + def _merge_segments(self, segments: List[str], max_tok: int, separator: str) -> List[str]: + chunks: List[str] = [] + current = "" + for seg in segments: + candidate = (current + separator + seg) if current else seg + if self._estimate_tokens(candidate) <= max_tok: + current = candidate + else: + if current: + chunks.append(current) + current = seg + if current: + chunks.append(current) + return chunks + + def _fixed_length_split(self, text: str, max_tok: int) -> List[str]: + total_tokens = self._estimate_tokens(text) + chars_per_token = len(text) / max(total_tokens, 1) + chunk_size = max(int(max_tok * chars_per_token * 0.9), 100) + + chunks: List[str] = [] + start = 0 + while start < len(text): + end = start + chunk_size + if end < len(text): + boundary = text.rfind(" ", start, end) + if boundary > start: + end = boundary + chunks.append(text[start:end]) + start = end 
+ return chunks + def _parse_param_string(self, param: Optional[str]) -> Dict[str, Any]: """Parse parameter string to dictionary for key=value format diff --git a/openviking_cli/utils/config/embedding_config.py b/openviking_cli/utils/config/embedding_config.py index 79d5e22a..212f0625 100644 --- a/openviking_cli/utils/config/embedding_config.py +++ b/openviking_cli/utils/config/embedding_config.py @@ -51,6 +51,10 @@ class EmbeddingModelConfig(BaseModel): sk: Optional[str] = Field(default=None, description="Access Key Secretfor VikingDB API") region: Optional[str] = Field(default=None, description="Region for VikingDB API") host: Optional[str] = Field(default=None, description="Host for VikingDB API") + max_tokens: Optional[int] = Field( + default=None, + description="Maximum token count per embedding request. If None, uses model default (e.g., 8192 for Google).", + ) extra_headers: Optional[dict[str, str]] = Field( default=None, description=( From 46c09eaef583e5d11dcdfe2f222cd470121b1f51 Mon Sep 17 00:00:00 2001 From: "zhiheng.liu" Date: Thu, 19 Mar 2026 13:34:20 +0800 Subject: [PATCH 11/15] test: add unit tests for GoogleDenseEmbedder --- pyproject.toml | 1 + tests/unit/test_google_embedder.py | 394 +++++++++++++++++++++++++++++ uv.lock | 6 +- 3 files changed, 400 insertions(+), 1 deletion(-) create mode 100644 tests/unit/test_google_embedder.py diff --git a/pyproject.toml b/pyproject.toml index c26aa7cf..60e55714 100644 --- a/pyproject.toml +++ b/pyproject.toml @@ -246,4 +246,5 @@ line-ending = "auto" [dependency-groups] dev = [ "pytest>=9.0.2", + "pytest-asyncio>=1.3.0", ] diff --git a/tests/unit/test_google_embedder.py b/tests/unit/test_google_embedder.py new file mode 100644 index 00000000..1e626582 --- /dev/null +++ b/tests/unit/test_google_embedder.py @@ -0,0 +1,394 @@ +# Copyright (c) 2026 Beijing Volcano Engine Technology Co., Ltd. 
+# SPDX-License-Identifier: Apache-2.0 +"""Tests for Google/Gemini Embedder""" + +from unittest.mock import MagicMock, patch + +import pytest +import requests + +from openviking.models.embedder import GoogleDenseEmbedder +from openviking.models.embedder.google_embedders import GOOGLE_MODEL_DIMENSIONS + + +def _make_response(values: list) -> MagicMock: + """Build a mock successful requests.Response with the given embedding values.""" + mock_resp = MagicMock() + mock_resp.json.return_value = {"embedding": {"values": values}} + return mock_resp + + +def _make_error_response(status_code: int = 400) -> MagicMock: + """Build a mock requests.Response that raises HTTPError on raise_for_status.""" + mock_resp = MagicMock() + mock_resp.raise_for_status.side_effect = requests.exceptions.HTTPError( + response=MagicMock(status_code=status_code) + ) + return mock_resp + + +class TestGoogleDenseEmbedderInit: + def test_requires_api_key(self): + with pytest.raises(ValueError, match="api_key is required"): + GoogleDenseEmbedder(model_name="gemini-embedding-2-preview") + + def test_rejects_unsupported_model(self): + with pytest.raises(ValueError, match="Unsupported model"): + GoogleDenseEmbedder(model_name="unknown-model", api_key="key") + + def test_rejects_dimension_exceeding_max(self): + with pytest.raises(ValueError, match="exceeds maximum"): + GoogleDenseEmbedder( + model_name="gemini-embedding-2-preview", + api_key="key", + dimension=9999, + ) + + def test_default_dimension(self): + embedder = GoogleDenseEmbedder( + model_name="gemini-embedding-2-preview", api_key="key" + ) + assert embedder.get_dimension() == GOOGLE_MODEL_DIMENSIONS["gemini-embedding-2-preview"] + + def test_custom_dimension(self): + embedder = GoogleDenseEmbedder( + model_name="gemini-embedding-2-preview", api_key="key", dimension=1024 + ) + assert embedder.get_dimension() == 1024 + + def test_default_api_base(self): + embedder = GoogleDenseEmbedder( + model_name="gemini-embedding-2-preview", api_key="key" + 
) + assert embedder.api_base == "https://generativelanguage.googleapis.com/v1beta" + + def test_custom_api_base(self): + embedder = GoogleDenseEmbedder( + model_name="gemini-embedding-2-preview", + api_key="key", + api_base="https://custom.endpoint/v1", + ) + assert embedder.api_base == "https://custom.endpoint/v1" + + def test_default_max_tokens(self): + embedder = GoogleDenseEmbedder( + model_name="gemini-embedding-2-preview", api_key="key" + ) + assert embedder.max_tokens == 8192 + + def test_custom_max_tokens(self): + embedder = GoogleDenseEmbedder( + model_name="gemini-embedding-2-preview", api_key="key", max_tokens=4096 + ) + assert embedder.max_tokens == 4096 + + def test_google_model_dimensions_constant(self): + assert "gemini-embedding-2-preview" in GOOGLE_MODEL_DIMENSIONS + assert GOOGLE_MODEL_DIMENSIONS["gemini-embedding-2-preview"] == 3072 + + +class TestGoogleDenseEmbedderEmbed: + @patch("openviking.models.embedder.google_embedders.requests.post") + def test_embed_returns_vector(self, mock_post): + mock_post.return_value = _make_response([0.1] * 3072) + + embedder = GoogleDenseEmbedder( + model_name="gemini-embedding-2-preview", api_key="test-key" + ) + result = embedder.embed("Hello world") + + assert result.dense_vector is not None + assert len(result.dense_vector) == 3072 + mock_post.assert_called_once() + + @patch("openviking.models.embedder.google_embedders.requests.post") + def test_embed_sends_correct_url(self, mock_post): + mock_post.return_value = _make_response([0.1] * 3072) + + embedder = GoogleDenseEmbedder( + model_name="gemini-embedding-2-preview", api_key="test-key" + ) + embedder.embed("Hello world") + + url = mock_post.call_args[0][0] + assert "gemini-embedding-2-preview:embedContent" in url + + @patch("openviking.models.embedder.google_embedders.requests.post") + def test_embed_sends_api_key_header(self, mock_post): + mock_post.return_value = _make_response([0.1] * 3072) + + embedder = GoogleDenseEmbedder( + 
model_name="gemini-embedding-2-preview", api_key="my-api-key" + ) + embedder.embed("Hello world") + + headers = mock_post.call_args[1]["headers"] + assert headers["x-goog-api-key"] == "my-api-key" + + @patch("openviking.models.embedder.google_embedders.requests.post") + def test_embed_sends_text_in_parts(self, mock_post): + mock_post.return_value = _make_response([0.1] * 3072) + + embedder = GoogleDenseEmbedder( + model_name="gemini-embedding-2-preview", api_key="test-key" + ) + embedder.embed("Hello world") + + body = mock_post.call_args[1]["json"] + assert body["content"]["parts"][0]["text"] == "Hello world" + + @patch("openviking.models.embedder.google_embedders.requests.post") + def test_embed_empty_text_returns_empty(self, mock_post): + embedder = GoogleDenseEmbedder( + model_name="gemini-embedding-2-preview", api_key="test-key" + ) + result = embedder.embed("") + assert result.dense_vector is None + mock_post.assert_not_called() + + @patch("openviking.models.embedder.google_embedders.requests.post") + def test_embed_whitespace_text_returns_empty(self, mock_post): + embedder = GoogleDenseEmbedder( + model_name="gemini-embedding-2-preview", api_key="test-key" + ) + result = embedder.embed(" ") + assert result.dense_vector is None + mock_post.assert_not_called() + + @patch("openviking.models.embedder.google_embedders.requests.post") + def test_embed_api_error_raises_runtime_error(self, mock_post): + mock_post.return_value = _make_error_response(400) + + embedder = GoogleDenseEmbedder( + model_name="gemini-embedding-2-preview", api_key="test-key" + ) + with pytest.raises(RuntimeError): + embedder.embed("Hello world") + + @patch("openviking.models.embedder.google_embedders.requests.post") + def test_embed_unexpected_response_format_raises(self, mock_post): + mock_resp = MagicMock() + mock_resp.json.return_value = {"unexpected": "format"} + mock_post.return_value = mock_resp + + embedder = GoogleDenseEmbedder( + model_name="gemini-embedding-2-preview", 
api_key="test-key" + ) + with pytest.raises(RuntimeError, match="Unexpected response format"): + embedder.embed("Hello world") + + +class TestGoogleDenseEmbedderTaskType: + @patch("openviking.models.embedder.google_embedders.requests.post") + def test_embed_with_simple_query_param(self, mock_post): + mock_post.return_value = _make_response([0.1] * 3072) + + embedder = GoogleDenseEmbedder( + model_name="gemini-embedding-2-preview", + api_key="test-key", + query_param="RETRIEVAL_QUERY", + ) + embedder.embed("Hello world", is_query=True) + + body = mock_post.call_args[1]["json"] + assert body["taskType"] == "RETRIEVAL_QUERY" + + @patch("openviking.models.embedder.google_embedders.requests.post") + def test_embed_with_simple_document_param(self, mock_post): + mock_post.return_value = _make_response([0.1] * 3072) + + embedder = GoogleDenseEmbedder( + model_name="gemini-embedding-2-preview", + api_key="test-key", + document_param="RETRIEVAL_DOCUMENT", + ) + embedder.embed("Hello world", is_query=False) + + body = mock_post.call_args[1]["json"] + assert body["taskType"] == "RETRIEVAL_DOCUMENT" + + @patch("openviking.models.embedder.google_embedders.requests.post") + def test_embed_query_param_not_sent_for_document(self, mock_post): + """query_param should not be applied when is_query=False.""" + mock_post.return_value = _make_response([0.1] * 3072) + + embedder = GoogleDenseEmbedder( + model_name="gemini-embedding-2-preview", + api_key="test-key", + query_param="RETRIEVAL_QUERY", + ) + embedder.embed("Hello world", is_query=False) + + body = mock_post.call_args[1]["json"] + assert "taskType" not in body + + @patch("openviking.models.embedder.google_embedders.requests.post") + def test_embed_with_keyvalue_query_param(self, mock_post): + mock_post.return_value = _make_response([0.1] * 1024) + + embedder = GoogleDenseEmbedder( + model_name="gemini-embedding-2-preview", + api_key="test-key", + query_param="task_type=RETRIEVAL_QUERY,output_dimensionality=1024", + ) + 
embedder.embed("Hello world", is_query=True) + + body = mock_post.call_args[1]["json"] + assert body["taskType"] == "RETRIEVAL_QUERY" + assert body["output_dimensionality"] == 1024 + + @patch("openviking.models.embedder.google_embedders.requests.post") + def test_embed_dimension_added_to_request(self, mock_post): + mock_post.return_value = _make_response([0.1] * 1024) + + embedder = GoogleDenseEmbedder( + model_name="gemini-embedding-2-preview", + api_key="test-key", + dimension=1024, + ) + embedder.embed("Hello world") + + body = mock_post.call_args[1]["json"] + assert body["output_dimensionality"] == 1024 + + @patch("openviking.models.embedder.google_embedders.requests.post") + def test_embed_no_dimension_no_output_dimensionality(self, mock_post): + mock_post.return_value = _make_response([0.1] * 3072) + + embedder = GoogleDenseEmbedder( + model_name="gemini-embedding-2-preview", api_key="test-key" + ) + embedder.embed("Hello world") + + body = mock_post.call_args[1]["json"] + assert "output_dimensionality" not in body + + +class TestGoogleDenseEmbedderExtraHeaders: + @patch("openviking.models.embedder.google_embedders.requests.post") + def test_extra_headers_sent(self, mock_post): + mock_post.return_value = _make_response([0.1] * 3072) + + embedder = GoogleDenseEmbedder( + model_name="gemini-embedding-2-preview", + api_key="test-key", + extra_headers={"X-Custom": "value"}, + ) + embedder.embed("Hello world") + + headers = mock_post.call_args[1]["headers"] + assert headers["X-Custom"] == "value" + + +class TestGoogleDenseEmbedderBatch: + @patch("openviking.models.embedder.google_embedders.requests.post") + def test_embed_batch_returns_results(self, mock_post): + mock_post.return_value = _make_response([0.1] * 3072) + + embedder = GoogleDenseEmbedder( + model_name="gemini-embedding-2-preview", api_key="test-key" + ) + results = embedder.embed_batch(["Hello", "World", "Test"]) + + assert len(results) == 3 + assert mock_post.call_count == 3 + for result in results: 
+ assert result.dense_vector is not None + + @patch("openviking.models.embedder.google_embedders.requests.post") + def test_embed_batch_empty_list(self, mock_post): + embedder = GoogleDenseEmbedder( + model_name="gemini-embedding-2-preview", api_key="test-key" + ) + results = embedder.embed_batch([]) + + assert results == [] + mock_post.assert_not_called() + + @patch("openviking.models.embedder.google_embedders.requests.post") + def test_embed_batch_skips_empty_texts(self, mock_post): + mock_post.return_value = _make_response([0.1] * 3072) + + embedder = GoogleDenseEmbedder( + model_name="gemini-embedding-2-preview", api_key="test-key" + ) + results = embedder.embed_batch(["Hello", "", "World"]) + + assert len(results) == 3 + assert results[1].dense_vector is None + assert mock_post.call_count == 2 + + @patch("openviking.models.embedder.google_embedders.requests.post") + def test_embed_batch_with_query_param(self, mock_post): + mock_post.return_value = _make_response([0.1] * 3072) + + embedder = GoogleDenseEmbedder( + model_name="gemini-embedding-2-preview", + api_key="test-key", + query_param="RETRIEVAL_QUERY", + ) + embedder.embed_batch(["Hello", "World"], is_query=True) + + for call in mock_post.call_args_list: + body = call[1]["json"] + assert body["taskType"] == "RETRIEVAL_QUERY" + + +class TestGoogleDenseEmbedderChunking: + @patch("openviking.models.embedder.google_embedders.requests.post") + def test_oversized_text_is_chunked(self, mock_post): + """Text exceeding max_tokens should be split and embeddings averaged.""" + mock_post.return_value = _make_response([0.5] * 3072) + + embedder = GoogleDenseEmbedder( + model_name="gemini-embedding-2-preview", + api_key="test-key", + max_tokens=5, + ) + # "word " * 100 will far exceed 5 tokens + result = embedder.embed("word " * 100) + + assert result.dense_vector is not None + assert mock_post.call_count > 1 + + @patch("openviking.models.embedder.google_embedders.requests.post") + def test_small_text_not_chunked(self, 
mock_post): + mock_post.return_value = _make_response([0.1] * 3072) + + embedder = GoogleDenseEmbedder( + model_name="gemini-embedding-2-preview", api_key="test-key" + ) + embedder.embed("Hello world") + + assert mock_post.call_count == 1 + + +class TestGoogleDenseEmbedderParseParam: + def _embedder(self): + return GoogleDenseEmbedder( + model_name="gemini-embedding-2-preview", api_key="test-key" + ) + + def test_parse_empty_param(self): + embedder = self._embedder() + assert embedder._parse_param_string(None) == {} + assert embedder._parse_param_string("") == {} + + def test_parse_task_type(self): + embedder = self._embedder() + result = embedder._parse_param_string("task_type=RETRIEVAL_QUERY") + assert result["taskType"] == "RETRIEVAL_QUERY" + + def test_parse_output_dimensionality_as_int(self): + embedder = self._embedder() + result = embedder._parse_param_string("output_dimensionality=1024") + assert result["output_dimensionality"] == 1024 + assert isinstance(result["output_dimensionality"], int) + + def test_parse_multiple_params(self): + embedder = self._embedder() + result = embedder._parse_param_string( + "task_type=RETRIEVAL_QUERY,output_dimensionality=512" + ) + assert result["taskType"] == "RETRIEVAL_QUERY" + assert result["output_dimensionality"] == 512 diff --git a/uv.lock b/uv.lock index 59e6638e..2e02b1eb 100644 --- a/uv.lock +++ b/uv.lock @@ -3406,6 +3406,7 @@ test = [ [package.dev-dependencies] dev = [ { name = "pytest" }, + { name = "pytest-asyncio" }, ] [package.metadata] @@ -3505,7 +3506,10 @@ requires-dist = [ provides-extras = ["test", "dev", "doc", "eval", "build", "bot", "bot-langfuse", "bot-telegram", "bot-feishu", "bot-dingtalk", "bot-slack", "bot-qq", "bot-sandbox", "bot-fuse", "bot-opencode", "bot-full"] [package.metadata.requires-dev] -dev = [{ name = "pytest", specifier = ">=9.0.2" }] +dev = [ + { name = "pytest", specifier = ">=9.0.2" }, + { name = "pytest-asyncio", specifier = ">=1.3.0" }, +] [[package]] name = "orjson" From 
f23ab1cdec4ac46ed888c87d88f025d883ca4481 Mon Sep 17 00:00:00 2001 From: "zhiheng.liu" Date: Thu, 19 Mar 2026 13:41:52 +0800 Subject: [PATCH 12/15] docs: document task_type behavior for gemini-embedding-2-preview - Both snake_case (task_type) and camelCase (taskType) are accepted by the API - All task type values produce identical embeddings in this model version - Parameter is forwarded for forward compatibility with future model versions --- .../models/embedder/google_embedders.py | 37 ++++++++++++++----- 1 file changed, 27 insertions(+), 10 deletions(-) diff --git a/openviking/models/embedder/google_embedders.py b/openviking/models/embedder/google_embedders.py index 3f7e3414..be44d00a 100644 --- a/openviking/models/embedder/google_embedders.py +++ b/openviking/models/embedder/google_embedders.py @@ -27,10 +27,35 @@ class GoogleDenseEmbedder(DenseEmbedderBase): Uses native Google Gemini embedding API with Parts format. Supports Gemini Embedding 2 (gemini-embedding-2-preview) only. - Supports task-specific embeddings and Matryoshka dimension reduction. + Supports Matryoshka dimension reduction via output_dimensionality. + + ## Task Type Behavior (gemini-embedding-2-preview) + + Tested 2026-03-19 against the live API (dim=128): + + The API accepts both snake_case (task_type) and camelCase (taskType) — they + are equivalent and both succeed without error. + + However, gemini-embedding-2-preview currently produces **identical vectors + regardless of task_type**. The parameter is silently ignored by this model + version. 
All task types below were verified to return bit-for-bit identical + embeddings: + + | Task type | Description | + |----------------------|-----------------------------------------------------| + | RETRIEVAL_QUERY | Optimized for search queries | + | RETRIEVAL_DOCUMENT | Optimized for indexed documents | + | SEMANTIC_SIMILARITY | Optimized for similarity assessment | + | CLASSIFICATION | Optimized for text classification | + | CLUSTERING | Optimized for clustering by similarity | + | CODE_RETRIEVAL_QUERY | Optimized for NL queries over code blocks | + | QUESTION_ANSWERING | Optimized for Q&A queries | + | FACT_VERIFICATION | Optimized for fact-checking statements | + + query_param / document_param are accepted and stored, and will be forwarded + to the API in case future model versions begin honouring task_type. Example: - >>> # Simple usage with query/document task types >>> embedder = GoogleDenseEmbedder( ... api_key="your-gemini-api-key", ... dimension=1024, @@ -39,14 +64,6 @@ class GoogleDenseEmbedder(DenseEmbedderBase): ... ) >>> query_result = embedder.embed("Search query", is_query=True) >>> doc_result = embedder.embed("Document content", is_query=False) - - >>> # Enhanced usage with key=value format - >>> advanced_embedder = GoogleDenseEmbedder( - ... api_key="your-gemini-api-key", - ... dimension=1024, - ... query_param="task_type=RETRIEVAL_QUERY,output_dimensionality=1024", - ... document_param="task_type=RETRIEVAL_DOCUMENT,output_dimensionality=1024" - ... ) """ def __init__( From ce8f411e4d3a0f403a309d5d2d1cb8573b9ee0c1 Mon Sep 17 00:00:00 2001 From: "zhiheng.liu" Date: Thu, 19 Mar 2026 13:53:14 +0800 Subject: [PATCH 13/15] refactor: remove taskType logic from GoogleDenseEmbedder MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit gemini-embedding-2-preview silently ignores taskType — verified 2026-03-19 at full 3072 dims, all task types return bit-for-bit identical vectors. 
Remove query_param, document_param, _parse_param_string, _build_request_params. Add note in docstring. Update factory and tests accordingly. --- .../models/embedder/google_embedders.py | 172 +++--------------- .../utils/config/embedding_config.py | 2 - tests/unit/test_google_embedder.py | 121 +++--------- 3 files changed, 48 insertions(+), 247 deletions(-) diff --git a/openviking/models/embedder/google_embedders.py b/openviking/models/embedder/google_embedders.py index be44d00a..4dfb6331 100644 --- a/openviking/models/embedder/google_embedders.py +++ b/openviking/models/embedder/google_embedders.py @@ -29,41 +29,24 @@ class GoogleDenseEmbedder(DenseEmbedderBase): Supports Gemini Embedding 2 (gemini-embedding-2-preview) only. Supports Matryoshka dimension reduction via output_dimensionality. - ## Task Type Behavior (gemini-embedding-2-preview) + ## Note: taskType not supported by gemini-embedding-2-preview - Tested 2026-03-19 against the live API (dim=128): - - The API accepts both snake_case (task_type) and camelCase (taskType) — they - are equivalent and both succeed without error. - - However, gemini-embedding-2-preview currently produces **identical vectors - regardless of task_type**. The parameter is silently ignored by this model - version. 
All task types below were verified to return bit-for-bit identical - embeddings: - - | Task type | Description | - |----------------------|-----------------------------------------------------| - | RETRIEVAL_QUERY | Optimized for search queries | - | RETRIEVAL_DOCUMENT | Optimized for indexed documents | - | SEMANTIC_SIMILARITY | Optimized for similarity assessment | - | CLASSIFICATION | Optimized for text classification | - | CLUSTERING | Optimized for clustering by similarity | - | CODE_RETRIEVAL_QUERY | Optimized for NL queries over code blocks | - | QUESTION_ANSWERING | Optimized for Q&A queries | - | FACT_VERIFICATION | Optimized for fact-checking statements | - - query_param / document_param are accepted and stored, and will be forwarded - to the API in case future model versions begin honouring task_type. + Tested 2026-03-19 against the live API at full 3072 dimensions: + the taskType parameter is accepted without error but produces bit-for-bit + identical vectors regardless of which task type is specified. All eight + documented task types (RETRIEVAL_QUERY, RETRIEVAL_DOCUMENT, + SEMANTIC_SIMILARITY, CLASSIFICATION, CLUSTERING, CODE_RETRIEVAL_QUERY, + QUESTION_ANSWERING, FACT_VERIFICATION) return the same embedding as the + default (no taskType). The parameter is therefore not sent. Example: >>> embedder = GoogleDenseEmbedder( ... api_key="your-gemini-api-key", ... dimension=1024, - ... query_param="RETRIEVAL_QUERY", - ... document_param="RETRIEVAL_DOCUMENT" ... 
) - >>> query_result = embedder.embed("Search query", is_query=True) - >>> doc_result = embedder.embed("Document content", is_query=False) + >>> result = embedder.embed("Hello world") + >>> print(len(result.dense_vector)) + 1024 """ def __init__( @@ -72,8 +55,6 @@ def __init__( api_key: Optional[str] = None, api_base: Optional[str] = None, dimension: Optional[int] = None, - query_param: Optional[str] = None, - document_param: Optional[str] = None, config: Optional[Dict[str, Any]] = None, max_tokens: Optional[int] = None, extra_headers: Optional[Dict[str, str]] = None, @@ -85,13 +66,6 @@ def __init__( api_key: Google API key, required api_base: API base URL, defaults to https://generativelanguage.googleapis.com/v1beta dimension: Dimension for Matryoshka reduction, optional (max 3072) - query_param: Parameter for query-side embeddings. Supports simple task_type - values (e.g., "RETRIEVAL_QUERY") or key=value format - (e.g., "task_type=RETRIEVAL_QUERY,output_dimensionality=1024"). - Valid task_type values: RETRIEVAL_QUERY, RETRIEVAL_DOCUMENT, - SEMANTIC_SIMILARITY, CLASSIFICATION, CLUSTERING - document_param: Parameter for document-side embeddings. Supports simple task_type - values or key=value format. config: Additional configuration dict max_tokens: Maximum token count per embedding request, None to use default (8192) extra_headers: Extra HTTP headers to include in API requests @@ -103,15 +77,12 @@ def __init__( self.api_key = api_key self.api_base = api_base or "https://generativelanguage.googleapis.com/v1beta" self.dimension = dimension - self.query_param = query_param - self.document_param = document_param self._max_tokens = max_tokens or 8192 self.extra_headers = extra_headers or {} if not self.api_key: raise ValueError("api_key is required") - # Determine dimension - only support gemini-embedding-2-preview if model_name not in GOOGLE_MODEL_DIMENSIONS: raise ValueError( f"Unsupported model '{model_name}'. Only 'gemini-embedding-2-preview' is supported." 
@@ -196,87 +167,15 @@ def _fixed_length_split(self, text: str, max_tok: int) -> List[str]: start = end return chunks - def _parse_param_string(self, param: Optional[str]) -> Dict[str, Any]: - """Parse parameter string to dictionary for key=value format - - Args: - param: Parameter string (e.g., "task_type=RETRIEVAL_QUERY,output_dimensionality=1024") - - Returns: - Dictionary of parsed parameters - """ - if not param: - return {} - - result = {} - # Split by comma for multiple parameters - parts = [p.strip() for p in param.split(",")] - - # Map snake_case keys to camelCase as required by Google API - key_map = {"task_type": "taskType"} - - for part in parts: - if "=" in part: - key, value = part.split("=", 1) - key = key.strip() - value = value.strip() - key = key_map.get(key, key) - - # Convert numeric values and uppercase task type - if key == "output_dimensionality" and value.isdigit(): - result[key] = int(value) - elif key == "taskType": - result[key] = value.upper() - else: - result[key] = value - - return result - - def _build_request_params(self, is_query: bool = False) -> Dict[str, Any]: - """Build request parameters for Google-specific settings - - Args: - is_query: Flag to indicate if this is for query embeddings - - Returns: - Dict containing Google-specific parameters - """ - params = {} - - # Determine which parameter to use based on is_query flag - active_param = None - if is_query and self.query_param is not None: - active_param = self.query_param - elif not is_query and self.document_param is not None: - active_param = self.document_param - - if active_param: - if "=" in active_param: - # Parse key=value format (e.g., "task_type=RETRIEVAL_QUERY,output_dimensionality=1024") - parsed = self._parse_param_string(active_param) - params.update(parsed) - else: - # Simple format (e.g., "retrieval_query" -> {"taskType": "RETRIEVAL_QUERY"}) - params["taskType"] = active_param.upper() - - # Add dimension if specified - if self.dimension: - 
params["output_dimensionality"] = self.dimension - - return params - def _update_telemetry_token_usage(self, response_data: Dict[str, Any]) -> None: """Update telemetry with token usage from API response""" - # Google API doesn't return token usage in the same format as OpenAI - # We'll estimate based on text length for now pass - def _embed_single(self, text: str, is_query: bool = False) -> EmbedResult: + def _embed_single(self, text: str) -> EmbedResult: """Perform raw embedding without chunking logic. Args: text: Input text - is_query: Flag to indicate if this is a query embedding Returns: EmbedResult: Result containing only dense_vector @@ -285,25 +184,18 @@ def _embed_single(self, text: str, is_query: bool = False) -> EmbedResult: RuntimeError: When API call fails """ try: - # Build the URL for the embedding endpoint url = f"{self.api_base}/models/{self.model_name}:embedContent" - # Build request headers headers = { "Content-Type": "application/json", "x-goog-api-key": self.api_key, **self.extra_headers, } - # Build request body using Parts API - request_body = {"content": {"parts": [{"text": text}]}} + request_body: Dict[str, Any] = {"content": {"parts": [{"text": text}]}} + if self.dimension: + request_body["output_dimensionality"] = self.dimension - # Add task-specific parameters - request_params = self._build_request_params(is_query=is_query) - if request_params: - request_body.update(request_params) - - # Make the API request with retry on transient failures def _do_request(): resp = requests.post(url, json=request_body, headers=headers, timeout=30) resp.raise_for_status() @@ -318,7 +210,6 @@ def _do_request(): response_data = response.json() - # Extract the embedding vector if "embedding" in response_data and "values" in response_data["embedding"]: vector = response_data["embedding"]["values"] else: @@ -338,7 +229,7 @@ def embed(self, text: str, is_query: bool = False) -> EmbedResult: Args: text: Input text - is_query: Flag to indicate if this is a query 
embedding + is_query: Ignored. gemini-embedding-2-preview does not support taskType. Returns: EmbedResult: Result containing only dense_vector @@ -350,28 +241,19 @@ def embed(self, text: str, is_query: bool = False) -> EmbedResult: return EmbedResult() if self._estimate_tokens(text) > self.max_tokens: - return self._chunk_and_embed(text, is_query=is_query) - - return self._embed_single(text, is_query=is_query) + return self._chunk_and_embed(text) - def _chunk_and_embed(self, text: str, is_query: bool = False) -> EmbedResult: - """Chunk oversized text and average the embeddings. - - Args: - text: Oversized input text - is_query: Flag to indicate if this is a query embedding + return self._embed_single(text) - Returns: - EmbedResult: Result containing only dense_vector (averaged from chunks) - """ + def _chunk_and_embed(self, text: str) -> EmbedResult: + """Chunk oversized text and average the embeddings.""" chunks = self._chunk_text(text) chunk_vectors: List[List[float]] = [] for chunk in chunks: - result = self._embed_single(chunk, is_query=is_query) + result = self._embed_single(chunk) chunk_vectors.append(result.dense_vector) - # Average the chunk vectors if not chunk_vectors: return EmbedResult(dense_vector=[0.0] * self._dimension) @@ -386,12 +268,11 @@ def embed_batch(self, texts: List[str], is_query: bool = False) -> List[EmbedRes """Batch embedding with automatic chunking for oversized inputs. Individual texts are processed sequentially since Google's native API - doesn't support batch requests in the same way as OpenAI-compatible. - Oversized texts are individually chunked and embedded. + does not support batch requests. Args: texts: List of texts - is_query: Flag to indicate if these are query embeddings + is_query: Ignored. gemini-embedding-2-preview does not support taskType. 
Returns: List[EmbedResult]: List of embedding results @@ -404,17 +285,14 @@ def embed_batch(self, texts: List[str], is_query: bool = False) -> List[EmbedRes results: List[EmbedResult] = [] - # Process each text individually for text in texts: if not text or not text.strip(): results.append(EmbedResult()) continue if self._estimate_tokens(text) <= self.max_tokens: - result = self._embed_single(text, is_query=is_query) + result = self._embed_single(text) else: - # Handle oversized text with chunking - result = self._chunk_and_embed(text, is_query=is_query) - + result = self._chunk_and_embed(text) results.append(result) return results diff --git a/openviking_cli/utils/config/embedding_config.py b/openviking_cli/utils/config/embedding_config.py index 212f0625..72d61297 100644 --- a/openviking_cli/utils/config/embedding_config.py +++ b/openviking_cli/utils/config/embedding_config.py @@ -351,8 +351,6 @@ def _create_embedder( "api_key": cfg.api_key, "api_base": cfg.api_base, "dimension": cfg.dimension, - **({"query_param": cfg.query_param} if cfg.query_param else {}), - **({"document_param": cfg.document_param} if cfg.document_param else {}), "max_tokens": cfg.max_tokens, **({"extra_headers": cfg.extra_headers} if cfg.extra_headers else {}), }, diff --git a/tests/unit/test_google_embedder.py b/tests/unit/test_google_embedder.py index 1e626582..1df0de04 100644 --- a/tests/unit/test_google_embedder.py +++ b/tests/unit/test_google_embedder.py @@ -137,6 +137,22 @@ def test_embed_sends_text_in_parts(self, mock_post): body = mock_post.call_args[1]["json"] assert body["content"]["parts"][0]["text"] == "Hello world" + @patch("openviking.models.embedder.google_embedders.requests.post") + def test_embed_does_not_send_task_type(self, mock_post): + """taskType must not be sent — gemini-embedding-2-preview ignores it.""" + mock_post.return_value = _make_response([0.1] * 3072) + + embedder = GoogleDenseEmbedder( + model_name="gemini-embedding-2-preview", api_key="test-key" + ) + 
embedder.embed("Hello world", is_query=True) + embedder.embed("Hello world", is_query=False) + + for call in mock_post.call_args_list: + body = call[1]["json"] + assert "taskType" not in body + assert "task_type" not in body + @patch("openviking.models.embedder.google_embedders.requests.post") def test_embed_empty_text_returns_empty(self, mock_post): embedder = GoogleDenseEmbedder( @@ -178,67 +194,9 @@ def test_embed_unexpected_response_format_raises(self, mock_post): embedder.embed("Hello world") -class TestGoogleDenseEmbedderTaskType: - @patch("openviking.models.embedder.google_embedders.requests.post") - def test_embed_with_simple_query_param(self, mock_post): - mock_post.return_value = _make_response([0.1] * 3072) - - embedder = GoogleDenseEmbedder( - model_name="gemini-embedding-2-preview", - api_key="test-key", - query_param="RETRIEVAL_QUERY", - ) - embedder.embed("Hello world", is_query=True) - - body = mock_post.call_args[1]["json"] - assert body["taskType"] == "RETRIEVAL_QUERY" - - @patch("openviking.models.embedder.google_embedders.requests.post") - def test_embed_with_simple_document_param(self, mock_post): - mock_post.return_value = _make_response([0.1] * 3072) - - embedder = GoogleDenseEmbedder( - model_name="gemini-embedding-2-preview", - api_key="test-key", - document_param="RETRIEVAL_DOCUMENT", - ) - embedder.embed("Hello world", is_query=False) - - body = mock_post.call_args[1]["json"] - assert body["taskType"] == "RETRIEVAL_DOCUMENT" - +class TestGoogleDenseEmbedderDimension: @patch("openviking.models.embedder.google_embedders.requests.post") - def test_embed_query_param_not_sent_for_document(self, mock_post): - """query_param should not be applied when is_query=False.""" - mock_post.return_value = _make_response([0.1] * 3072) - - embedder = GoogleDenseEmbedder( - model_name="gemini-embedding-2-preview", - api_key="test-key", - query_param="RETRIEVAL_QUERY", - ) - embedder.embed("Hello world", is_query=False) - - body = 
mock_post.call_args[1]["json"] - assert "taskType" not in body - - @patch("openviking.models.embedder.google_embedders.requests.post") - def test_embed_with_keyvalue_query_param(self, mock_post): - mock_post.return_value = _make_response([0.1] * 1024) - - embedder = GoogleDenseEmbedder( - model_name="gemini-embedding-2-preview", - api_key="test-key", - query_param="task_type=RETRIEVAL_QUERY,output_dimensionality=1024", - ) - embedder.embed("Hello world", is_query=True) - - body = mock_post.call_args[1]["json"] - assert body["taskType"] == "RETRIEVAL_QUERY" - assert body["output_dimensionality"] == 1024 - - @patch("openviking.models.embedder.google_embedders.requests.post") - def test_embed_dimension_added_to_request(self, mock_post): + def test_dimension_sent_as_output_dimensionality(self, mock_post): mock_post.return_value = _make_response([0.1] * 1024) embedder = GoogleDenseEmbedder( @@ -252,7 +210,7 @@ def test_embed_dimension_added_to_request(self, mock_post): assert body["output_dimensionality"] == 1024 @patch("openviking.models.embedder.google_embedders.requests.post") - def test_embed_no_dimension_no_output_dimensionality(self, mock_post): + def test_no_dimension_omits_output_dimensionality(self, mock_post): mock_post.return_value = _make_response([0.1] * 3072) embedder = GoogleDenseEmbedder( @@ -319,19 +277,18 @@ def test_embed_batch_skips_empty_texts(self, mock_post): assert mock_post.call_count == 2 @patch("openviking.models.embedder.google_embedders.requests.post") - def test_embed_batch_with_query_param(self, mock_post): + def test_embed_batch_does_not_send_task_type(self, mock_post): mock_post.return_value = _make_response([0.1] * 3072) embedder = GoogleDenseEmbedder( - model_name="gemini-embedding-2-preview", - api_key="test-key", - query_param="RETRIEVAL_QUERY", + model_name="gemini-embedding-2-preview", api_key="test-key" ) embedder.embed_batch(["Hello", "World"], is_query=True) for call in mock_post.call_args_list: body = call[1]["json"] - assert 
body["taskType"] == "RETRIEVAL_QUERY" + assert "taskType" not in body + assert "task_type" not in body class TestGoogleDenseEmbedderChunking: @@ -345,7 +302,6 @@ def test_oversized_text_is_chunked(self, mock_post): api_key="test-key", max_tokens=5, ) - # "word " * 100 will far exceed 5 tokens result = embedder.embed("word " * 100) assert result.dense_vector is not None @@ -361,34 +317,3 @@ def test_small_text_not_chunked(self, mock_post): embedder.embed("Hello world") assert mock_post.call_count == 1 - - -class TestGoogleDenseEmbedderParseParam: - def _embedder(self): - return GoogleDenseEmbedder( - model_name="gemini-embedding-2-preview", api_key="test-key" - ) - - def test_parse_empty_param(self): - embedder = self._embedder() - assert embedder._parse_param_string(None) == {} - assert embedder._parse_param_string("") == {} - - def test_parse_task_type(self): - embedder = self._embedder() - result = embedder._parse_param_string("task_type=RETRIEVAL_QUERY") - assert result["taskType"] == "RETRIEVAL_QUERY" - - def test_parse_output_dimensionality_as_int(self): - embedder = self._embedder() - result = embedder._parse_param_string("output_dimensionality=1024") - assert result["output_dimensionality"] == 1024 - assert isinstance(result["output_dimensionality"], int) - - def test_parse_multiple_params(self): - embedder = self._embedder() - result = embedder._parse_param_string( - "task_type=RETRIEVAL_QUERY,output_dimensionality=512" - ) - assert result["taskType"] == "RETRIEVAL_QUERY" - assert result["output_dimensionality"] == 512 From c6a5ef0bce255d4703307ee979805118eb74d156 Mon Sep 17 00:00:00 2001 From: "zhiheng.liu" Date: Thu, 19 Mar 2026 14:30:45 +0800 Subject: [PATCH 14/15] docs: note taskType instruction-prefix mechanism in gemini embedders --- openviking/models/embedder/google_embedders.py | 7 +++++++ 1 file changed, 7 insertions(+) diff --git a/openviking/models/embedder/google_embedders.py b/openviking/models/embedder/google_embedders.py index 
4dfb6331..04789913 100644 --- a/openviking/models/embedder/google_embedders.py +++ b/openviking/models/embedder/google_embedders.py @@ -39,6 +39,13 @@ class GoogleDenseEmbedder(DenseEmbedderBase): QUESTION_ANSWERING, FACT_VERIFICATION) return the same embedding as the default (no taskType). The parameter is therefore not sent. + By contrast, gemini-embedding-001 does produce distinct vectors per task + type. This is because taskType in Gemini embedding models is implemented as + an instruction prefix injected into the embedding input — effectively + "task: {task_type}, content: {text}" — rather than a separate model head or + fine-tuned adapter. gemini-embedding-2-preview appears to have dropped this + instruction-following behaviour. + Example: >>> embedder = GoogleDenseEmbedder( ... api_key="your-gemini-api-key", From e3b52d76331c73ffb4681a100d35193952fc11db Mon Sep 17 00:00:00 2001 From: "zhiheng.liu" Date: Thu, 19 Mar 2026 15:30:47 +0800 Subject: [PATCH 15/15] test: consolidate google embedder tests 28 -> 17 --- tests/unit/test_google_embedder.py | 284 ++++++----------------------- 1 file changed, 54 insertions(+), 230 deletions(-) diff --git a/tests/unit/test_google_embedder.py b/tests/unit/test_google_embedder.py index 1df0de04..50bd2229 100644 --- a/tests/unit/test_google_embedder.py +++ b/tests/unit/test_google_embedder.py @@ -8,23 +8,18 @@ import requests from openviking.models.embedder import GoogleDenseEmbedder -from openviking.models.embedder.google_embedders import GOOGLE_MODEL_DIMENSIONS def _make_response(values: list) -> MagicMock: - """Build a mock successful requests.Response with the given embedding values.""" mock_resp = MagicMock() mock_resp.json.return_value = {"embedding": {"values": values}} return mock_resp -def _make_error_response(status_code: int = 400) -> MagicMock: - """Build a mock requests.Response that raises HTTPError on raise_for_status.""" - mock_resp = MagicMock() - mock_resp.raise_for_status.side_effect = 
requests.exceptions.HTTPError( - response=MagicMock(status_code=status_code) +def _embedder(**kwargs) -> GoogleDenseEmbedder: + return GoogleDenseEmbedder( + model_name="gemini-embedding-2-preview", api_key="test-key", **kwargs ) - return mock_resp class TestGoogleDenseEmbedderInit: @@ -38,282 +33,111 @@ def test_rejects_unsupported_model(self): def test_rejects_dimension_exceeding_max(self): with pytest.raises(ValueError, match="exceeds maximum"): - GoogleDenseEmbedder( - model_name="gemini-embedding-2-preview", - api_key="key", - dimension=9999, - ) - - def test_default_dimension(self): - embedder = GoogleDenseEmbedder( - model_name="gemini-embedding-2-preview", api_key="key" - ) - assert embedder.get_dimension() == GOOGLE_MODEL_DIMENSIONS["gemini-embedding-2-preview"] - - def test_custom_dimension(self): - embedder = GoogleDenseEmbedder( - model_name="gemini-embedding-2-preview", api_key="key", dimension=1024 - ) - assert embedder.get_dimension() == 1024 - - def test_default_api_base(self): - embedder = GoogleDenseEmbedder( - model_name="gemini-embedding-2-preview", api_key="key" - ) - assert embedder.api_base == "https://generativelanguage.googleapis.com/v1beta" + _embedder(dimension=9999) - def test_custom_api_base(self): - embedder = GoogleDenseEmbedder( - model_name="gemini-embedding-2-preview", - api_key="key", - api_base="https://custom.endpoint/v1", - ) - assert embedder.api_base == "https://custom.endpoint/v1" + def test_defaults(self): + e = _embedder() + assert e.get_dimension() == 3072 + assert e.max_tokens == 8192 + assert e.api_base == "https://generativelanguage.googleapis.com/v1beta" - def test_default_max_tokens(self): - embedder = GoogleDenseEmbedder( - model_name="gemini-embedding-2-preview", api_key="key" - ) - assert embedder.max_tokens == 8192 - - def test_custom_max_tokens(self): - embedder = GoogleDenseEmbedder( - model_name="gemini-embedding-2-preview", api_key="key", max_tokens=4096 - ) - assert embedder.max_tokens == 4096 - - def 
test_google_model_dimensions_constant(self): - assert "gemini-embedding-2-preview" in GOOGLE_MODEL_DIMENSIONS - assert GOOGLE_MODEL_DIMENSIONS["gemini-embedding-2-preview"] == 3072 + def test_custom_values(self): + e = _embedder(dimension=1024, max_tokens=4096, api_base="https://custom/v1") + assert e.get_dimension() == 1024 + assert e.max_tokens == 4096 + assert e.api_base == "https://custom/v1" class TestGoogleDenseEmbedderEmbed: @patch("openviking.models.embedder.google_embedders.requests.post") - def test_embed_returns_vector(self, mock_post): + def test_embed_request_structure(self, mock_post): + """Single embed call — verify URL, auth header, body, and return value.""" mock_post.return_value = _make_response([0.1] * 3072) - embedder = GoogleDenseEmbedder( - model_name="gemini-embedding-2-preview", api_key="test-key" - ) - result = embedder.embed("Hello world") + result = _embedder().embed("Hello world") assert result.dense_vector is not None assert len(result.dense_vector) == 3072 mock_post.assert_called_once() - - @patch("openviking.models.embedder.google_embedders.requests.post") - def test_embed_sends_correct_url(self, mock_post): - mock_post.return_value = _make_response([0.1] * 3072) - - embedder = GoogleDenseEmbedder( - model_name="gemini-embedding-2-preview", api_key="test-key" - ) - embedder.embed("Hello world") - url = mock_post.call_args[0][0] assert "gemini-embedding-2-preview:embedContent" in url - - @patch("openviking.models.embedder.google_embedders.requests.post") - def test_embed_sends_api_key_header(self, mock_post): - mock_post.return_value = _make_response([0.1] * 3072) - - embedder = GoogleDenseEmbedder( - model_name="gemini-embedding-2-preview", api_key="my-api-key" - ) - embedder.embed("Hello world") - headers = mock_post.call_args[1]["headers"] - assert headers["x-goog-api-key"] == "my-api-key" + assert headers["x-goog-api-key"] == "test-key" + body = mock_post.call_args[1]["json"] + assert body["content"]["parts"][0]["text"] == "Hello 
world" + assert "taskType" not in body + assert "task_type" not in body @patch("openviking.models.embedder.google_embedders.requests.post") - def test_embed_sends_text_in_parts(self, mock_post): - mock_post.return_value = _make_response([0.1] * 3072) - - embedder = GoogleDenseEmbedder( - model_name="gemini-embedding-2-preview", api_key="test-key" - ) - embedder.embed("Hello world") - + def test_dimension_sent_as_output_dimensionality(self, mock_post): + mock_post.return_value = _make_response([0.1] * 1024) + _embedder(dimension=1024).embed("Hello world") body = mock_post.call_args[1]["json"] - assert body["content"]["parts"][0]["text"] == "Hello world" + assert body["output_dimensionality"] == 1024 @patch("openviking.models.embedder.google_embedders.requests.post") - def test_embed_does_not_send_task_type(self, mock_post): - """taskType must not be sent — gemini-embedding-2-preview ignores it.""" + def test_no_dimension_omits_output_dimensionality(self, mock_post): mock_post.return_value = _make_response([0.1] * 3072) - - embedder = GoogleDenseEmbedder( - model_name="gemini-embedding-2-preview", api_key="test-key" - ) - embedder.embed("Hello world", is_query=True) - embedder.embed("Hello world", is_query=False) - - for call in mock_post.call_args_list: - body = call[1]["json"] - assert "taskType" not in body - assert "task_type" not in body + _embedder().embed("Hello world") + assert "output_dimensionality" not in mock_post.call_args[1]["json"] @patch("openviking.models.embedder.google_embedders.requests.post") - def test_embed_empty_text_returns_empty(self, mock_post): - embedder = GoogleDenseEmbedder( - model_name="gemini-embedding-2-preview", api_key="test-key" - ) - result = embedder.embed("") - assert result.dense_vector is None - mock_post.assert_not_called() + def test_extra_headers_forwarded(self, mock_post): + mock_post.return_value = _make_response([0.1] * 3072) + _embedder(extra_headers={"X-Custom": "value"}).embed("Hello world") + assert 
mock_post.call_args[1]["headers"]["X-Custom"] == "value" + @pytest.mark.parametrize("text", ["", " "]) @patch("openviking.models.embedder.google_embedders.requests.post") - def test_embed_whitespace_text_returns_empty(self, mock_post): - embedder = GoogleDenseEmbedder( - model_name="gemini-embedding-2-preview", api_key="test-key" - ) - result = embedder.embed(" ") + def test_blank_text_returns_empty_without_request(self, mock_post, text): + result = _embedder().embed(text) assert result.dense_vector is None mock_post.assert_not_called() @patch("openviking.models.embedder.google_embedders.requests.post") - def test_embed_api_error_raises_runtime_error(self, mock_post): - mock_post.return_value = _make_error_response(400) - - embedder = GoogleDenseEmbedder( - model_name="gemini-embedding-2-preview", api_key="test-key" - ) + def test_api_error_raises_runtime_error(self, mock_post): + mock_resp = MagicMock() + mock_resp.raise_for_status.side_effect = requests.exceptions.HTTPError() + mock_post.return_value = mock_resp with pytest.raises(RuntimeError): - embedder.embed("Hello world") + _embedder().embed("Hello world") @patch("openviking.models.embedder.google_embedders.requests.post") - def test_embed_unexpected_response_format_raises(self, mock_post): + def test_unexpected_response_raises(self, mock_post): mock_resp = MagicMock() mock_resp.json.return_value = {"unexpected": "format"} mock_post.return_value = mock_resp - - embedder = GoogleDenseEmbedder( - model_name="gemini-embedding-2-preview", api_key="test-key" - ) with pytest.raises(RuntimeError, match="Unexpected response format"): - embedder.embed("Hello world") - - -class TestGoogleDenseEmbedderDimension: - @patch("openviking.models.embedder.google_embedders.requests.post") - def test_dimension_sent_as_output_dimensionality(self, mock_post): - mock_post.return_value = _make_response([0.1] * 1024) - - embedder = GoogleDenseEmbedder( - model_name="gemini-embedding-2-preview", - api_key="test-key", - 
dimension=1024, - ) - embedder.embed("Hello world") - - body = mock_post.call_args[1]["json"] - assert body["output_dimensionality"] == 1024 - - @patch("openviking.models.embedder.google_embedders.requests.post") - def test_no_dimension_omits_output_dimensionality(self, mock_post): - mock_post.return_value = _make_response([0.1] * 3072) - - embedder = GoogleDenseEmbedder( - model_name="gemini-embedding-2-preview", api_key="test-key" - ) - embedder.embed("Hello world") - - body = mock_post.call_args[1]["json"] - assert "output_dimensionality" not in body - - -class TestGoogleDenseEmbedderExtraHeaders: - @patch("openviking.models.embedder.google_embedders.requests.post") - def test_extra_headers_sent(self, mock_post): - mock_post.return_value = _make_response([0.1] * 3072) - - embedder = GoogleDenseEmbedder( - model_name="gemini-embedding-2-preview", - api_key="test-key", - extra_headers={"X-Custom": "value"}, - ) - embedder.embed("Hello world") - - headers = mock_post.call_args[1]["headers"] - assert headers["X-Custom"] == "value" + _embedder().embed("Hello world") class TestGoogleDenseEmbedderBatch: @patch("openviking.models.embedder.google_embedders.requests.post") - def test_embed_batch_returns_results(self, mock_post): + def test_batch_results_and_empty_skipped(self, mock_post): mock_post.return_value = _make_response([0.1] * 3072) - - embedder = GoogleDenseEmbedder( - model_name="gemini-embedding-2-preview", api_key="test-key" - ) - results = embedder.embed_batch(["Hello", "World", "Test"]) - - assert len(results) == 3 - assert mock_post.call_count == 3 - for result in results: - assert result.dense_vector is not None - - @patch("openviking.models.embedder.google_embedders.requests.post") - def test_embed_batch_empty_list(self, mock_post): - embedder = GoogleDenseEmbedder( - model_name="gemini-embedding-2-preview", api_key="test-key" - ) - results = embedder.embed_batch([]) - - assert results == [] - mock_post.assert_not_called() - - 
@patch("openviking.models.embedder.google_embedders.requests.post") - def test_embed_batch_skips_empty_texts(self, mock_post): - mock_post.return_value = _make_response([0.1] * 3072) - - embedder = GoogleDenseEmbedder( - model_name="gemini-embedding-2-preview", api_key="test-key" - ) - results = embedder.embed_batch(["Hello", "", "World"]) - + results = _embedder().embed_batch(["Hello", "", "World"]) assert len(results) == 3 + assert results[0].dense_vector is not None assert results[1].dense_vector is None + assert results[2].dense_vector is not None assert mock_post.call_count == 2 @patch("openviking.models.embedder.google_embedders.requests.post") - def test_embed_batch_does_not_send_task_type(self, mock_post): - mock_post.return_value = _make_response([0.1] * 3072) - - embedder = GoogleDenseEmbedder( - model_name="gemini-embedding-2-preview", api_key="test-key" - ) - embedder.embed_batch(["Hello", "World"], is_query=True) - - for call in mock_post.call_args_list: - body = call[1]["json"] - assert "taskType" not in body - assert "task_type" not in body + def test_batch_empty_list(self, mock_post): + assert _embedder().embed_batch([]) == [] + mock_post.assert_not_called() class TestGoogleDenseEmbedderChunking: @patch("openviking.models.embedder.google_embedders.requests.post") - def test_oversized_text_is_chunked(self, mock_post): - """Text exceeding max_tokens should be split and embeddings averaged.""" + def test_oversized_text_chunked_and_averaged(self, mock_post): mock_post.return_value = _make_response([0.5] * 3072) - - embedder = GoogleDenseEmbedder( - model_name="gemini-embedding-2-preview", - api_key="test-key", - max_tokens=5, - ) - result = embedder.embed("word " * 100) - + result = _embedder(max_tokens=5).embed("word " * 100) assert result.dense_vector is not None assert mock_post.call_count > 1 @patch("openviking.models.embedder.google_embedders.requests.post") - def test_small_text_not_chunked(self, mock_post): + def 
test_normal_text_single_request(self, mock_post): mock_post.return_value = _make_response([0.1] * 3072) - - embedder = GoogleDenseEmbedder( - model_name="gemini-embedding-2-preview", api_key="test-key" - ) - embedder.embed("Hello world") - + _embedder().embed("Hello world") assert mock_post.call_count == 1