From 7f34e7bf38ba423d4c4d34acee0ee0cd5de86263 Mon Sep 17 00:00:00 2001 From: Sergey Yedrikov Date: Thu, 16 Apr 2026 22:41:42 -0400 Subject: [PATCH 1/3] LCORE-1853: Add relevance cutoff score to BYOK RAG --- docs/byok_guide.md | 32 +++-- src/app/endpoints/rags.py | 2 +- src/client.py | 2 +- src/configuration.py | 5 +- src/constants.py | 3 + src/llama_stack_configuration.py | 25 +++- src/models/config.py | 61 +++++++++- src/utils/responses.py | 6 +- src/utils/vector_search.py | 51 +++++--- .../endpoints/test_query_byok_integration.py | 100 ++++++++++++++-- .../test_streaming_query_byok_integration.py | 47 ++++++-- tests/unit/app/endpoints/test_rags.py | 4 +- tests/unit/models/config/test_byok_rag.py | 90 +++++++++++++- .../models/config/test_dump_configuration.py | 46 +++++--- tests/unit/telemetry/conftest.py | 5 +- tests/unit/test_llama_stack_configuration.py | 41 +++++++ tests/unit/utils/test_responses.py | 38 ++++-- tests/unit/utils/test_vector_search.py | 110 ++++++++++++++++-- 18 files changed, 569 insertions(+), 99 deletions(-) diff --git a/docs/byok_guide.md b/docs/byok_guide.md index 5213a8d15..b98a84139 100644 --- a/docs/byok_guide.md +++ b/docs/byok_guide.md @@ -79,10 +79,11 @@ Both modes rely on: Inline RAG additionally supports: - **Score Multiplier**: Optional weight applied per BYOK vector store when mixing multiple sources. Allows custom prioritization of content. +- **Relevance cutoff (`relevance_cutoff_score`)**: Optional minimum **raw** similarity score from each BYOK vector store during **Inline RAG**. Chunks below the cutoff are dropped **before** `score_multiplier` is applied. It applies only to BYOK stores listed under `byok_rag`; it does not affect OKP/Solr inline RAG (which uses separate query defaults) and is not used for Tool RAG (`file_search`). The default matches `DEFAULT_BYOK_RAG_RELEVANCE_CUTOFF_SCORE` in `src/constants.py` (currently `0.3`). Set to `0.0` to disable filtering for BYOK inline retrieval. > [!NOTE] > OKP and BYOK scores are not directly comparable (different scoring systems), so -> `score_multiplier` does not apply to OKP results. To control the amount of retrieved +> `score_multiplier` and `relevance_cutoff_score` do not apply to OKP results. To control the amount of retrieved > context, set the `BYOK_RAG_MAX_CHUNKS` and `OKP_RAG_MAX_CHUNKS` constants in `src/constants.py` > (defaults: 10 and 5 respectively). For Tool RAG, use `TOOL_RAG_MAX_CHUNKS` (default: 10). @@ -280,19 +281,26 @@ registered_resources: > section of `lightspeed-stack.yaml`. The lightspeed-stack service automatically generates the required configuration > at startup. > +> Preferred shape: an object with `entries` (list of stores) and optional `relevance_cutoff_score`: +> > ```yaml > byok_rag: -> - rag_id: my-docs # Unique identifier for this knowledge source -> rag_type: inline::faiss -> embedding_model: sentence-transformers/all-mpnet-base-v2 -> embedding_dimension: 768 -> vector_db_id: your-index-id # Llama Stack vector store ID (from index generation) -> db_path: /path/to/vector_db/faiss_store.db -> score_multiplier: 1.0 # Optional: weight results when mixing multiple sources +> relevance_cutoff_score: 0.3 # Optional; min raw score per BYOK store before score_multiplier (BYOK only) +> entries: +> - rag_id: my-docs # Unique identifier for this knowledge source +> rag_type: inline::faiss +> embedding_model: sentence-transformers/all-mpnet-base-v2 +> embedding_dimension: 768 +> vector_db_id: your-index-id # Llama Stack vector store ID (from index generation) +> db_path: /path/to/vector_db/faiss_store.db +> score_multiplier: 1.0 # Optional: weight results when mixing multiple sources > ``` > +> Legacy: a bare list is still accepted and is treated as `entries` (same fields as each list item above). +> > When multiple BYOK sources are configured, `score_multiplier` adjusts the relative importance of > each store's results during Inline RAG retrieval. Values above 1.0 boost a store; below 1.0 reduce it. +> `relevance_cutoff_score` filters by raw retrieval score first; weighting applies only to chunks that pass the cutoff. ### Step 5: Configure RAG Strategy @@ -319,10 +327,10 @@ okp: Both modes can be enabled simultaneously. Choose based on your latency and control preferences: -| Mode | When context is fetched | Tool call needed | score_multiplier | -|------|------------------------|------------------|-----------------| -| Inline RAG | With every query | No | Yes (BYOK only) | -| Tool RAG | On LLM demand | Yes | No | +| Mode | When context is fetched | Tool call needed | score_multiplier | relevance_cutoff_score | +|------|------------------------|------------------|------------------|------------------------| +| Inline RAG | With every query | No | Yes (BYOK only) | Yes (BYOK only) | +| Tool RAG | On LLM demand | Yes | No | No | > [!TIP] > A ready-to-use example combining BYOK and OKP is available at diff --git a/src/app/endpoints/rags.py b/src/app/endpoints/rags.py index fdc988358..0e4b6c745 100644 --- a/src/app/endpoints/rags.py +++ b/src/app/endpoints/rags.py @@ -163,7 +163,7 @@ async def get_rag_endpoint_handler( # Resolve user-facing rag_id to llama-stack vector_db_id vector_db_id = _resolve_rag_id_to_vector_db_id( - rag_id, configuration.configuration.byok_rag + rag_id, configuration.configuration.byok_rag.entries ) try: diff --git a/src/client.py b/src/client.py index 0c77c2d49..1bc87ea0b 100644 --- a/src/client.py +++ b/src/client.py @@ -85,7 +85,7 @@ def _enrich_library_config(self, input_config_path: str) -> str: config = configuration.configuration # Enrichment: BYOK RAG - enrich_byok_rag(ls_config, [b.model_dump() for b in config.byok_rag]) + enrich_byok_rag(ls_config, [b.model_dump() for b in config.byok_rag.entries]) # Enrichment: Solr - enabled when "okp" appears in either inline or tool list enrich_solr(ls_config, config.rag.model_dump(), config.okp.model_dump()) diff --git a/src/configuration.py b/src/configuration.py index 4eeca0460..297523f9f 100644 --- a/src/configuration.py +++ b/src/configuration.py @@ -479,7 +479,8 @@ def rag_id_mapping(self) -> dict[str, str]: if self._configuration is None: raise LogicError("logic error: configuration is not loaded") byok_mapping = { - brag.vector_db_id: brag.rag_id for brag in self._configuration.byok_rag + brag.vector_db_id: brag.rag_id + for brag in self._configuration.byok_rag.entries } rag = self._configuration.rag @@ -505,7 +506,7 @@ def score_multiplier_mapping(self) -> dict[str, float]: raise LogicError("logic error: configuration is not loaded") return { brag.vector_db_id: brag.score_multiplier - for brag in self._configuration.byok_rag + for brag in self._configuration.byok_rag.entries } @property diff --git a/src/constants.py b/src/constants.py index 69fea3cf9..425840687 100644 --- a/src/constants.py +++ b/src/constants.py @@ -186,10 +186,13 @@ # Inline RAG constants BYOK_RAG_MAX_CHUNKS = 10 # retrieved from BYOK RAG +# Default minimum raw similarity for BYOK vector stores only (``byok_rag.relevance_cutoff_score``) +DEFAULT_BYOK_RAG_RELEVANCE_CUTOFF_SCORE = 0.3 OKP_RAG_MAX_CHUNKS = 5 # retrieved from OKP RAG # Solr OKP constants SOLR_VECTOR_SEARCH_DEFAULT_K = 5 +# Default score_threshold in vector_io.query params for the OKP/Solr vector store SOLR_VECTOR_SEARCH_DEFAULT_SCORE_THRESHOLD = 0.3 SOLR_VECTOR_SEARCH_DEFAULT_MODE = "hybrid" diff --git a/src/llama_stack_configuration.py b/src/llama_stack_configuration.py index 3ac4a8ec4..7fb12c1da 100644 --- a/src/llama_stack_configuration.py +++ b/src/llama_stack_configuration.py @@ -41,6 +41,29 @@ def increase_indent(self, flow: bool = False, indentless: bool = False) -> None: return super().increase_indent(flow, False) +def _raw_byok_rag_store_list(raw_byok_rag: Any) -> list[Any]: + """Return BYOK store definitions from raw Lightspeed YAML (list or section dict). + + Always returns a ``list`` suitable for :func:`enrich_byok_rag` (each item is + expected to be a mapping with ``.get``). + + For the section form ``byok_rag: { entries: ..., relevance_cutoff_score: ... }``, + only ``entries`` is used: a ``list`` is returned as-is, a single ``dict`` is + wrapped as a one-element list, and ``None``, strings, or other types yield + ``[]``. + """ + if isinstance(raw_byok_rag, list): + return raw_byok_rag + if isinstance(raw_byok_rag, dict): + entries = raw_byok_rag.get("entries") + if isinstance(entries, list): + return entries + if isinstance(entries, dict): + return [entries] + return [] + return [] + + # ============================================================================= # Enrichment: Azure Entra ID # ============================================================================= @@ -619,7 +642,7 @@ def generate_configuration( setup_azure_entra_id_token(config.get("azure_entra_id"), env_file) # Enrichment: BYOK RAG - enrich_byok_rag(ls_config, config.get("byok_rag", [])) + enrich_byok_rag(ls_config, _raw_byok_rag_store_list(config.get("byok_rag", []))) # Enrichment: Solr - enabled when "okp" appears in either inline or tool list enrich_solr(ls_config, config.get("rag", {}), config.get("okp", {})) diff --git a/src/models/config.py b/src/models/config.py index c3e8f6726..6ed0d0bd6 100644 --- a/src/models/config.py +++ b/src/models/config.py @@ -7,7 +7,7 @@ from functools import cached_property from pathlib import Path from re import Pattern -from typing import Any, Literal, Optional, Self +from typing import Annotated, Any, Literal, Optional, Self import jsonpath_ng import yaml @@ -15,6 +15,7 @@ from pydantic import ( AnyHttpUrl, BaseModel, + BeforeValidator, ConfigDict, Field, FilePath, @@ -1622,6 +1623,55 @@ class ByokRag(ConfigurationBase): ) +def _normalize_byok_rag_input(value: Any) -> Any: + """Allow legacy ``byok_rag: [ ... ]`` YAML alongside the section object form. + + Explicit YAML null (``byok_rag: null``) is normalized to an empty mapping so + :class:`ByokRagSection` field defaults apply instead of a type error. + """ + if value is None: + return {} + if isinstance(value, list): + return {"entries": value} + return value + + +class ByokRagSection(ConfigurationBase): + """BYOK RAG configuration: registered BYOK stores and optional raw-score cutoff. + + Settings here apply only to bring-your-own-knowledge vector stores listed in + ``entries``. They do not affect OKP (Solr) inline RAG, which uses separate + query parameters and defaults. + """ + + entries: list[ByokRag] = Field( + default_factory=list, + title="BYOK RAG stores", + description="Registered bring-your-own-knowledge vector stores.", + ) + + relevance_cutoff_score: float = Field( + constants.DEFAULT_BYOK_RAG_RELEVANCE_CUTOFF_SCORE, + ge=0, + title="BYOK inline RAG relevance cutoff", + description="Minimum raw similarity score from each **BYOK** vector store " + "before per-store score_multiplier weighting. Chunks below this threshold " + "are dropped immediately after retrieval from those stores only. Does not " + "apply to OKP/Solr. Set to 0.0 to disable filtering for BYOK.", + ) + + +def _default_byok_rag_section() -> ByokRagSection: + """Return default BYOK RAG section; delegates to :class:`ByokRagSection` field defaults.""" + return ByokRagSection.model_validate({}) + + +ByokRagSectionValidated = Annotated[ + ByokRagSection, + BeforeValidator(_normalize_byok_rag_input), +] + + class QuotaLimiterConfiguration(ConfigurationBase): """Configuration for one quota limiter. @@ -1908,11 +1958,14 @@ class Configuration(ConfigurationBase): description="Conversation history configuration.", ) - byok_rag: list[ByokRag] = Field( - default_factory=list, + byok_rag: ByokRagSectionValidated = Field( + default_factory=_default_byok_rag_section, title="BYOK RAG configuration", description="BYOK RAG configuration. This configuration can be used to " - "reconfigure Llama Stack through its run.yaml configuration file", + "reconfigure Llama Stack through its run.yaml configuration file. " + "You may use the legacy form ``byok_rag: [ ... ]`` (a list of stores) or " + "an object with ``entries`` and optional ``relevance_cutoff_score`` " + "(BYOK inline RAG only; not used for OKP/Solr).", ) a2a_state: A2AStateConfiguration = Field( diff --git a/src/utils/responses.py b/src/utils/responses.py index 3fa10b8e2..ba52cceac 100644 --- a/src/utils/responses.py +++ b/src/utils/responses.py @@ -236,7 +236,7 @@ async def prepare_tools( # pylint: disable=too-many-arguments,too-many-position # If rag.inline is configured, but not rag.tool, tool RAG is disabled. # 3. All registered vector DBs: fallback when neither rag.tool nor rag.inline are configured. # IDs fetched from llama-stack are already internal and need no translation. - byok_rags = configuration.configuration.byok_rag + byok_rags = configuration.configuration.byok_rag.entries is_tool_rag_enabled = len(configuration.configuration.rag.tool) > 0 is_inline_rag_enabled = len(configuration.configuration.rag.inline) > 0 @@ -1708,7 +1708,7 @@ async def _resolve_client_tools( # Per-request override of vector stores (user-facing rag_ids) vector_store_ids = extract_vector_store_ids_from_tools(tools) or None # Translate user-facing rag_ids to llama-stack vector_store_ids in each file_search tool - byok_rags = configuration.configuration.byok_rag + byok_rags = configuration.configuration.byok_rag.entries prepared_tools = translate_tools_vector_store_ids(tools, byok_rags) prepared_tools = apply_mcp_headers_to_explicit_tools( prepared_tools, token, mcp_headers, request_headers @@ -1803,7 +1803,7 @@ async def resolve_tool_choice( ) else: # Pass tools explicitly configured for this request - byok_rags = configuration.configuration.byok_rag + byok_rags = configuration.configuration.byok_rag.entries prepared_tools = translate_tools_vector_store_ids(tools, byok_rags) prepared_tools = apply_mcp_headers_to_explicit_tools( prepared_tools, token, mcp_headers, request_headers diff --git a/src/utils/vector_search.py b/src/utils/vector_search.py index 54ec16a49..bea5a5604 100644 --- a/src/utils/vector_search.py +++ b/src/utils/vector_search.py @@ -73,14 +73,21 @@ def _build_query_params(solr: Optional[dict[str, Any]] = None) -> dict[str, Any] def _extract_byok_rag_chunks( - search_response: Any, vector_store_id: str, weight: float + search_response: Any, + vector_store_id: str, + weight: float, + byok_raw_cutoff_score: float, ) -> list[dict[str, Any]]: - """Extract and weight result chunks from vector search for BYOK RAG. + """Extract and weight result chunks from vector search for BYOK RAG only. + + Chunks whose raw retrieval score is below ``byok_raw_cutoff_score`` are + dropped before applying the per-store weight. This is not used for OKP/Solr. Args: search_response: Response from vector_io.query - vector_store_id: ID of the vector store that produced these results - weight: Score multiplier to apply to this store's results + vector_store_id: ID of the BYOK vector store that produced these results + weight: Score multiplier to apply to this store's remaining results + byok_raw_cutoff_score: Minimum raw similarity score (``byok_rag.relevance_cutoff_score``) Returns: List of result dictionaries with weighted scores @@ -89,6 +96,14 @@ def _extract_byok_rag_chunks( for chunk, score in zip( search_response.chunks, search_response.scores, strict=True ): + if score < byok_raw_cutoff_score: + logger.debug( + " [%s] BYOK: dropping chunk: raw score=%.4f < cutoff=%.4f", + vector_store_id, + score, + byok_raw_cutoff_score, + ) + continue weighted_score = score * weight doc_id = ( chunk.metadata.get("document_id", chunk.chunk_id) @@ -164,14 +179,16 @@ async def _query_store_for_byok_rag( vector_store_id: str, query: str, weight: float, + byok_raw_cutoff_score: float, ) -> list[dict[str, Any]]: - """Query a single vector store for BYOK RAG. + """Query a single **BYOK** vector store for inline RAG (not OKP/Solr). Args: client: AsyncLlamaStackClient for vector_io queries - vector_store_id: ID of the vector store to query + vector_store_id: ID of the BYOK vector store to query query: Search query string weight: Score multiplier to apply + byok_raw_cutoff_score: Minimum raw score before weighting (BYOK config only) Returns: List of weighted result dictionaries, or empty list on error @@ -185,7 +202,9 @@ async def _query_store_for_byok_rag( "mode": "vector", }, ) - return _extract_byok_rag_chunks(search_response, vector_store_id, weight) + return _extract_byok_rag_chunks( + search_response, vector_store_id, weight, byok_raw_cutoff_score + ) except Exception as e: # pylint: disable=broad-exception-caught logger.warning("Failed to search '%s': %s", vector_store_id, e) return [] @@ -333,12 +352,15 @@ async def _fetch_byok_rag( query: str, vector_store_ids: Optional[list[str]] = None, # User-facing ) -> tuple[list[RAGChunk], list[ReferencedDocument]]: - """Fetch chunks and documents from BYOK RAG sources. + """Fetch chunks and documents from BYOK RAG sources only. + + Applies ``configuration.byok_rag.relevance_cutoff_score`` to raw scores from + BYOK vector stores. OKP/Solr inline RAG is implemented in ``_fetch_solr_rag`` + and does not use that setting. Args: client: The AsyncLlamaStackClient to use for the request query: The search query - configuration: Application configuration vector_store_ids: Optional list of vector store IDs to query. If provided, only these stores will be queried. If None, all stores (excluding Solr) will be queried. @@ -366,7 +388,7 @@ async def _fetch_byok_rag( # Translate user-facing rag_ids to llama-stack ids vector_store_ids_to_query: list[str] = resolve_vector_store_ids( - rag_ids_to_query, configuration.configuration.byok_rag + rag_ids_to_query, configuration.configuration.byok_rag.entries ) # Request-level override: filter out Solr store, use the rest @@ -386,7 +408,7 @@ async def _fetch_byok_rag( score_multiplier_mapping = configuration.score_multiplier_mapping rag_id_mapping = configuration.rag_id_mapping - # Query all vector stores in parallel + # BYOK stores only: byok_rag.relevance_cutoff_score (OKP/Solr does not use it). results_per_store = await asyncio.gather( *[ _query_store_for_byok_rag( @@ -394,12 +416,13 @@ async def _fetch_byok_rag( vector_store_id, query, score_multiplier_mapping.get(vector_store_id, 1.0), + configuration.configuration.byok_rag.relevance_cutoff_score, ) for vector_store_id in vector_store_ids_to_query ] ) - # Flatten, sort by weighted score, and take top results + # Flatten, sort by weighted score, take top results all_results: list[dict[str, Any]] = [] for store_results in results_per_store: all_results.extend(store_results) @@ -520,7 +543,9 @@ async def build_rag_context( ) -> RAGContext: """Build RAG context by fetching and merging chunks from all enabled sources. - Enabled sources can be BYOK and/or Solr OKP. + Enabled sources can be BYOK and/or Solr OKP. BYOK raw-score cutoff comes only + from ``byok_rag.relevance_cutoff_score``; OKP uses ``_build_query_params`` (e.g. + Solr ``score_threshold``), not the BYOK cutoff. Args: client: The AsyncLlamaStackClient to use for the request diff --git a/tests/integration/endpoints/test_query_byok_integration.py b/tests/integration/endpoints/test_query_byok_integration.py index 5e71c76c1..5a43bca17 100644 --- a/tests/integration/endpoints/test_query_byok_integration.py +++ b/tests/integration/endpoints/test_query_byok_integration.py @@ -265,7 +265,10 @@ def byok_config_fixture(test_config: AppConfig, mocker: MockerFixture) -> AppCon } # Patch the loaded configuration's byok_rag and rag.inline - test_config.configuration.byok_rag = [byok_entry] + _br = mocker.MagicMock() + _br.entries = [byok_entry] + _br.relevance_cutoff_score = 0.0 + test_config.configuration.byok_rag = _br test_config.configuration.rag.inline = ["test-knowledge"] return test_config @@ -294,7 +297,10 @@ def byok_tool_config_fixture( "score_multiplier": 1.0, } - test_config.configuration.byok_rag = [byok_entry] + _br = mocker.MagicMock() + _br.entries = [byok_entry] + _br.relevance_cutoff_score = 0.0 + test_config.configuration.byok_rag = _br test_config.configuration.rag.inline = [] test_config.configuration.rag.tool = ["test-knowledge"] @@ -425,7 +431,10 @@ async def test_query_byok_inline_rag_with_request_vector_store_ids( entry_b.vector_db_id = "vs-source-b" entry_b.score_multiplier = 1.0 - test_config.configuration.byok_rag = [entry_a, entry_b] + _br = mocker.MagicMock() + _br.entries = [entry_a, entry_b] + _br.relevance_cutoff_score = 0.0 + test_config.configuration.byok_rag = _br test_config.configuration.rag.inline = ["source-a"] mock_holder_class = mocker.patch("app.endpoints.query.AsyncLlamaStackClientHolder") @@ -486,7 +495,10 @@ async def test_query_byok_request_vector_store_ids_filters_configured_stores( entry_b.score_multiplier = 1.0 # Both sources are in config - test_config.configuration.byok_rag = [entry_a, entry_b] + _br = mocker.MagicMock() + _br.entries = [entry_a, entry_b] + _br.relevance_cutoff_score = 0.0 + test_config.configuration.byok_rag = _br test_config.configuration.rag.inline = ["source-a", "source-b"] mock_holder_class = mocker.patch("app.endpoints.query.AsyncLlamaStackClientHolder") @@ -684,7 +696,10 @@ async def test_query_byok_combined_inline_and_tool_rag( # pylint: disable=too-m byok_entry.rag_id = "test-knowledge" byok_entry.vector_db_id = "vs-byok-knowledge" byok_entry.score_multiplier = 1.0 - test_config.configuration.byok_rag = [byok_entry] + _br = mocker.MagicMock() + _br.entries = [byok_entry] + _br.relevance_cutoff_score = 0.0 + test_config.configuration.byok_rag = _br test_config.configuration.rag.inline = ["test-knowledge"] test_config.configuration.rag.tool = ["test-knowledge"] @@ -800,7 +815,10 @@ async def test_query_byok_inline_rag_only_configured_rag_id_is_queried( entry_b.vector_db_id = "vs-source-b" entry_b.score_multiplier = 1.0 - test_config.configuration.byok_rag = [entry_a, entry_b] + _br = mocker.MagicMock() + _br.entries = [entry_a, entry_b] + _br.relevance_cutoff_score = 0.0 + test_config.configuration.byok_rag = _br test_config.configuration.rag.inline = ["source-a"] mock_holder_class = mocker.patch("app.endpoints.query.AsyncLlamaStackClientHolder") @@ -871,7 +889,10 @@ async def test_query_byok_score_multiplier_shifts_chunk_priority( # pylint: dis entry_b.vector_db_id = "vs-source-b" entry_b.score_multiplier = 5.0 - test_config.configuration.byok_rag = [entry_a, entry_b] + _br = mocker.MagicMock() + _br.entries = [entry_a, entry_b] + _br.relevance_cutoff_score = 0.0 + test_config.configuration.byok_rag = _br test_config.configuration.rag.inline = ["source-a", "source-b"] mock_holder_class = mocker.patch("app.endpoints.query.AsyncLlamaStackClientHolder") @@ -926,6 +947,61 @@ async def _side_effect(**kwargs: Any) -> Any: assert first_chunk.score > second_chunk.score +@pytest.mark.asyncio +async def test_query_byok_cutoff_applied_before_multiplier( + test_config: AppConfig, + mocker: MockerFixture, + test_request: Request, + test_auth: AuthTuple, +) -> None: + """Chunks below relevance_cutoff_score are dropped before score_multiplier. + + With cutoff 0.5 and per-store multiplier 10.0, a raw score of 0.4 would + become 4.0 only after surviving the cutoff step; because 0.4 < 0.5, the + chunk must be removed and must not appear in the response even with a + large multiplier. + """ + entry = mocker.MagicMock() + entry.rag_id = "cutoff-test" + entry.vector_db_id = "vs-cutoff-test" + entry.score_multiplier = 10.0 + + _br = mocker.MagicMock() + _br.entries = [entry] + _br.relevance_cutoff_score = 0.5 + test_config.configuration.byok_rag = _br + test_config.configuration.rag.inline = ["cutoff-test"] + + mock_holder_class = mocker.patch("app.endpoints.query.AsyncLlamaStackClientHolder") + mock_client = _build_base_mock_client(mocker) + + low_score_resp = _make_vector_io_response( + mocker, + [ + ("Content below raw cutoff", "doc-below", 0.4), + ], + ) + mock_client.vector_io.query = mocker.AsyncMock(return_value=low_score_resp) + + mock_vs_resp = mocker.MagicMock() + mock_vs_resp.data = [] + mock_client.vector_stores.list.return_value = mock_vs_resp + + mock_holder_class.return_value.get_client.return_value = mock_client + + query_request = QueryRequest(query="test query") + + response = await query_endpoint_handler( + request=test_request, + query_request=query_request, + auth=test_auth, + mcp_headers={}, + ) + + assert response.rag_chunks is not None + assert len(response.rag_chunks) == 0 + + # ============================================================================== # BYOK_RAG_MAX_CHUNKS Capping Tests # ============================================================================== @@ -953,7 +1029,10 @@ async def test_query_byok_max_chunks_caps_retrieved_results( # pylint: disable= entry.vector_db_id = "vs-big-source" entry.score_multiplier = 1.0 - test_config.configuration.byok_rag = [entry] + _br = mocker.MagicMock() + _br.entries = [entry] + _br.relevance_cutoff_score = 0.0 + test_config.configuration.byok_rag = _br test_config.configuration.rag.inline = ["big-source"] mock_holder_class = mocker.patch("app.endpoints.query.AsyncLlamaStackClientHolder") @@ -1025,7 +1104,10 @@ async def test_query_byok_max_chunks_caps_across_multiple_sources( # pylint: di entry_b.vector_db_id = "vs-source-b" entry_b.score_multiplier = 1.0 - test_config.configuration.byok_rag = [entry_a, entry_b] + _br = mocker.MagicMock() + _br.entries = [entry_a, entry_b] + _br.relevance_cutoff_score = 0.0 + test_config.configuration.byok_rag = _br test_config.configuration.rag.inline = ["source-a", "source-b"] mock_holder_class = mocker.patch("app.endpoints.query.AsyncLlamaStackClientHolder") diff --git a/tests/integration/endpoints/test_streaming_query_byok_integration.py b/tests/integration/endpoints/test_streaming_query_byok_integration.py index fa18edceb..8f963be11 100644 --- a/tests/integration/endpoints/test_streaming_query_byok_integration.py +++ b/tests/integration/endpoints/test_streaming_query_byok_integration.py @@ -246,7 +246,10 @@ def byok_config_fixture(test_config: AppConfig, mocker: MockerFixture) -> AppCon "score_multiplier": 1.0, } - test_config.configuration.byok_rag = [byok_entry] + _br = mocker.MagicMock() + _br.entries = [byok_entry] + _br.relevance_cutoff_score = 0.0 + test_config.configuration.byok_rag = _br test_config.configuration.rag.inline = ["test-knowledge"] return test_config @@ -271,7 +274,10 @@ def byok_tool_config_fixture( "score_multiplier": 1.0, } - test_config.configuration.byok_rag = [byok_entry] + _br = mocker.MagicMock() + _br.entries = [byok_entry] + _br.relevance_cutoff_score = 0.0 + test_config.configuration.byok_rag = _br test_config.configuration.rag.inline = [] test_config.configuration.rag.tool = ["test-knowledge"] @@ -345,7 +351,10 @@ async def test_streaming_query_byok_inline_rag_with_request_vector_store_ids( entry_b.vector_db_id = "vs-source-b" entry_b.score_multiplier = 1.0 - test_config.configuration.byok_rag = [entry_a, entry_b] + _br = mocker.MagicMock() + _br.entries = [entry_a, entry_b] + _br.relevance_cutoff_score = 0.0 + test_config.configuration.byok_rag = _br test_config.configuration.rag.inline = ["source-a"] mock_holder_class = mocker.patch( @@ -409,7 +418,10 @@ async def test_streaming_query_byok_request_vector_store_ids_filters_configured_ entry_b.vector_db_id = "vs-source-b" entry_b.score_multiplier = 1.0 - test_config.configuration.byok_rag = [entry_a, entry_b] + _br = mocker.MagicMock() + _br.entries = [entry_a, entry_b] + _br.relevance_cutoff_score = 0.0 + test_config.configuration.byok_rag = _br test_config.configuration.rag.inline = ["source-a", "source-b"] mock_holder_class = mocker.patch( @@ -689,7 +701,10 @@ async def test_streaming_query_byok_combined_inline_and_tool_rag( byok_entry.rag_id = "test-knowledge" byok_entry.vector_db_id = "vs-byok-knowledge" byok_entry.score_multiplier = 1.0 - test_config.configuration.byok_rag = [byok_entry] + _br = mocker.MagicMock() + _br.entries = [byok_entry] + _br.relevance_cutoff_score = 0.0 + test_config.configuration.byok_rag = _br test_config.configuration.rag.inline = ["test-knowledge"] test_config.configuration.rag.tool = ["test-knowledge"] @@ -744,7 +759,7 @@ async def test_streaming_query_byok_combined_inline_and_tool_rag( @pytest.mark.asyncio -async def test_streaming_query_byok_only_configured_rag_id_is_queried( +async def test_streaming_query_byok_only_configured_rag_id_is_queried( # pylint: disable=too-many-locals test_config: AppConfig, mocker: MockerFixture, test_request: Request, @@ -772,7 +787,10 @@ async def test_streaming_query_byok_only_configured_rag_id_is_queried( entry_b.vector_db_id = "vs-source-b" entry_b.score_multiplier = 1.0 - test_config.configuration.byok_rag = [entry_a, entry_b] + _br = mocker.MagicMock() + _br.entries = [entry_a, entry_b] + _br.relevance_cutoff_score = 0.0 + test_config.configuration.byok_rag = _br test_config.configuration.rag.inline = ["source-a"] mock_holder_class = mocker.patch( @@ -851,7 +869,10 @@ async def test_streaming_query_byok_score_multiplier_shifts_priority( # pylint: entry_b.vector_db_id = "vs-source-b" entry_b.score_multiplier = 5.0 - test_config.configuration.byok_rag = [entry_a, entry_b] + _br = mocker.MagicMock() + _br.entries = [entry_a, entry_b] + _br.relevance_cutoff_score = 0.0 + test_config.configuration.byok_rag = _br test_config.configuration.rag.inline = ["source-a", "source-b"] mock_holder_class = mocker.patch( @@ -933,7 +954,10 @@ async def test_streaming_query_byok_max_chunks_caps_context( # pylint: disable= entry.vector_db_id = "vs-big-source" entry.score_multiplier = 1.0 - test_config.configuration.byok_rag = [entry] + _br = mocker.MagicMock() + _br.entries = [entry] + _br.relevance_cutoff_score = 0.0 + test_config.configuration.byok_rag = _br test_config.configuration.rag.inline = ["big-source"] mock_holder_class = mocker.patch( @@ -1009,7 +1033,10 @@ async def test_streaming_query_byok_max_chunks_caps_across_multiple_sources( # entry_b.vector_db_id = "vs-source-b" entry_b.score_multiplier = 1.0 - test_config.configuration.byok_rag = [entry_a, entry_b] + _br = mocker.MagicMock() + _br.entries = [entry_a, entry_b] + _br.relevance_cutoff_score = 0.0 + test_config.configuration.byok_rag = _br test_config.configuration.rag.inline = ["source-a", "source-b"] mock_holder_class = mocker.patch( diff --git a/tests/unit/app/endpoints/test_rags.py b/tests/unit/app/endpoints/test_rags.py index 563c223fe..f6247aefa 100644 --- a/tests/unit/app/endpoints/test_rags.py +++ b/tests/unit/app/endpoints/test_rags.py @@ -379,7 +379,7 @@ def __init__(self) -> None: def test_resolve_rag_id_to_vector_db_id_with_mapping(tmp_path: Path) -> None: """Test that _resolve_rag_id_to_vector_db_id maps rag_id to vector_db_id.""" byok_config = _make_byok_config(str(tmp_path)) - byok_rags = byok_config.configuration.byok_rag + byok_rags = byok_config.configuration.byok_rag.entries assert _resolve_rag_id_to_vector_db_id("ocp-4.18-docs", byok_rags) == "vs_abc123" assert _resolve_rag_id_to_vector_db_id("company-kb", byok_rags) == "vs_def456" @@ -387,5 +387,5 @@ def test_resolve_rag_id_to_vector_db_id_with_mapping(tmp_path: Path) -> None: def test_resolve_rag_id_to_vector_db_id_passthrough(tmp_path: Path) -> None: """Test that unmapped IDs are passed through unchanged.""" byok_config = _make_byok_config(str(tmp_path)) - byok_rags = byok_config.configuration.byok_rag + byok_rags = byok_config.configuration.byok_rag.entries assert _resolve_rag_id_to_vector_db_id("vs_unknown", byok_rags) == "vs_unknown" diff --git a/tests/unit/models/config/test_byok_rag.py b/tests/unit/models/config/test_byok_rag.py index 0a22b3a12..4083dd6b7 100644 --- a/tests/unit/models/config/test_byok_rag.py +++ b/tests/unit/models/config/test_byok_rag.py @@ -6,12 +6,13 @@ from pydantic import ValidationError from constants import ( + DEFAULT_BYOK_RAG_RELEVANCE_CUTOFF_SCORE, DEFAULT_EMBEDDING_DIMENSION, DEFAULT_EMBEDDING_MODEL, DEFAULT_RAG_TYPE, DEFAULT_SCORE_MULTIPLIER, ) -from models.config import ByokRag +from models.config import ByokRag, ByokRagSection, Configuration def test_byok_rag_configuration_default_values() -> None: @@ -189,3 +190,90 @@ def test_byok_rag_configuration_score_multiplier_must_be_positive() -> None: db_path="tests/configuration/rag.txt", score_multiplier=0.0, ) + + +def test_byok_rag_section_explicit_null_yields_defaults() -> None: + """``byok_rag: null`` in YAML normalizes to defaults (empty entries, default cutoff).""" + cfg = Configuration.model_validate( + { + "name": "t", + "service": {"host": "localhost", "port": 8080}, + "llama_stack": { + "api_key": "k", + "url": "http://x:1", + "use_as_library_client": False, + }, + "user_data_collection": {}, + "authentication": {"module": "noop"}, + "byok_rag": None, + } + ) + assert cfg.byok_rag.entries == [] + assert ( + cfg.byok_rag.relevance_cutoff_score == DEFAULT_BYOK_RAG_RELEVANCE_CUTOFF_SCORE + ) + + +def test_byok_rag_section_object_and_legacy_list_in_configuration() -> None: + """``byok_rag`` accepts a section object or a legacy list of stores.""" + db_path = "tests/configuration/rag.txt" + from_list = Configuration.model_validate( + { + "name": "t", + "service": {"host": "localhost", "port": 8080}, + "llama_stack": { + "api_key": "k", + "url": "http://x:1", + "use_as_library_client": False, + }, + "user_data_collection": {}, + "authentication": {"module": "noop"}, + "byok_rag": [ + { + "rag_id": "r1", + "vector_db_id": "vs1", + "db_path": db_path, + }, + ], + } + ) + assert from_list.byok_rag.entries[0].rag_id == "r1" + assert ( + from_list.byok_rag.relevance_cutoff_score + == DEFAULT_BYOK_RAG_RELEVANCE_CUTOFF_SCORE + ) + + from_obj = Configuration.model_validate( + { + "name": "t", + "service": {"host": "localhost", "port": 8080}, + "llama_stack": { + "api_key": "k", + "url": "http://x:1", + "use_as_library_client": False, + }, + "user_data_collection": {}, + "authentication": {"module": "noop"}, + "byok_rag": { + "entries": [ + { + "rag_id": "r2", + "vector_db_id": "vs2", + "db_path": db_path, + }, + ], + "relevance_cutoff_score": 0.42, + }, + } + ) + assert from_obj.byok_rag.entries[0].rag_id == "r2" + assert from_obj.byok_rag.relevance_cutoff_score == 0.42 + + +def test_byok_rag_section_rejects_negative_cutoff() -> None: + """relevance_cutoff_score must be non-negative.""" + with pytest.raises(ValidationError): + _ = ByokRagSection( + entries=[], + relevance_cutoff_score=-0.1, + ) diff --git a/tests/unit/models/config/test_dump_configuration.py b/tests/unit/models/config/test_dump_configuration.py index 06a3ef08c..49184cf31 100644 --- a/tests/unit/models/config/test_dump_configuration.py +++ b/tests/unit/models/config/test_dump_configuration.py @@ -8,6 +8,7 @@ from pydantic import SecretStr +import constants from models.config import ( ByokRag, Configuration, @@ -189,7 +190,10 @@ def test_dump_configuration(tmp_path: Path) -> None: "sqlite": None, "type": None, }, - "byok_rag": [], + "byok_rag": { + "entries": [], + "relevance_cutoff_score": constants.DEFAULT_BYOK_RAG_RELEVANCE_CUTOFF_SCORE, + }, "quota_handlers": { "sqlite": None, "postgres": None, @@ -531,7 +535,10 @@ def test_dump_configuration_with_quota_limiters(tmp_path: Path) -> None: "sqlite": None, "type": None, }, - "byok_rag": [], + "byok_rag": { + "entries": [], + "relevance_cutoff_score": constants.DEFAULT_BYOK_RAG_RELEVANCE_CUTOFF_SCORE, + }, "quota_handlers": { "sqlite": None, "postgres": None, @@ -765,7 +772,10 @@ def test_dump_configuration_with_quota_limiters_different_values( "sqlite": None, "type": None, }, - "byok_rag": [], + "byok_rag": { + "entries": [], + "relevance_cutoff_score": constants.DEFAULT_BYOK_RAG_RELEVANCE_CUTOFF_SCORE, + }, "quota_handlers": { "sqlite": None, "postgres": None, @@ -979,17 +989,20 @@ def test_dump_configuration_byok(tmp_path: Path) -> None: "sqlite": None, "type": None, }, - "byok_rag": [ - { - "db_path": "tests/configuration/rag.txt", - "embedding_dimension": 768, - "embedding_model": "sentence-transformers/all-mpnet-base-v2", - "rag_id": "rag_id", - "rag_type": "inline::faiss", - "vector_db_id": "vector_db_id", - "score_multiplier": 1.0, - }, - ], + "byok_rag": { + "entries": [ + { + "db_path": "tests/configuration/rag.txt", + "embedding_dimension": 768, + "embedding_model": "sentence-transformers/all-mpnet-base-v2", + "rag_id": "rag_id", + "rag_type": "inline::faiss", + "vector_db_id": "vector_db_id", + "score_multiplier": 1.0, + }, + ], + "relevance_cutoff_score": constants.DEFAULT_BYOK_RAG_RELEVANCE_CUTOFF_SCORE, + }, "quota_handlers": { "sqlite": None, "postgres": None, @@ -1183,7 +1196,10 @@ def test_dump_configuration_pg_namespace(tmp_path: Path) -> None: "sqlite": None, "type": None, }, - "byok_rag": [], + "byok_rag": { + "entries": [], + "relevance_cutoff_score": constants.DEFAULT_BYOK_RAG_RELEVANCE_CUTOFF_SCORE, + }, "quota_handlers": { "sqlite": None, "postgres": None, diff --git a/tests/unit/telemetry/conftest.py b/tests/unit/telemetry/conftest.py index 6b2db6a82..e7ef44255 100644 --- a/tests/unit/telemetry/conftest.py +++ b/tests/unit/telemetry/conftest.py @@ -12,6 +12,7 @@ Action, AuthenticationConfiguration, AuthorizationConfiguration, + ByokRagSection, Configuration, CORSConfiguration, Customization, @@ -287,7 +288,7 @@ def build_fully_populated_config() -> Configuration: ), ], conversation_cache=None, - byok_rag=[], + byok_rag=ByokRagSection(entries=[]), a2a_state=None, quota_handlers=None, azure_entra_id=None, @@ -363,7 +364,7 @@ def build_minimal_config() -> Configuration: ), mcp_servers=[], conversation_cache=None, - byok_rag=[], + byok_rag=ByokRagSection(entries=[]), a2a_state=None, quota_handlers=None, azure_entra_id=None, diff --git a/tests/unit/test_llama_stack_configuration.py b/tests/unit/test_llama_stack_configuration.py index d10f60580..6d14a5a8d 100644 --- a/tests/unit/test_llama_stack_configuration.py +++ b/tests/unit/test_llama_stack_configuration.py @@ -7,6 +7,7 @@ import yaml from llama_stack_configuration import ( + _raw_byok_rag_store_list, construct_models_section, construct_storage_backends_section, construct_vector_io_providers_section, @@ -24,6 +25,46 @@ UserDataCollection, ) +# ============================================================================= +# Test _raw_byok_rag_store_list +# ============================================================================= + + +def test_raw_byok_rag_store_list_legacy_list() -> None: + """Bare list form returns the same sequence.""" + raw = [{"rag_id": "a"}] + assert _raw_byok_rag_store_list(raw) is raw + + +def test_raw_byok_rag_store_list_section_with_entries_list() -> None: + """Section dict with entries list returns that list.""" + entries = [{"rag_id": "x"}] + assert ( + _raw_byok_rag_store_list({"relevance_cutoff_score": 0.3, "entries": entries}) + is entries + ) + + +def test_raw_byok_rag_store_list_section_with_single_entry_dict() -> None: + """Section dict may use a single mapping for entries (wrap as one-element list).""" + one = {"rag_id": "solo", "vector_db_id": "vs1", "db_path": "/d.db"} + out = _raw_byok_rag_store_list({"entries": one}) + assert out == [one] + + +def test_raw_byok_rag_store_list_section_entries_invalid_or_missing() -> None: + """None, strings, or missing entries yield an empty list.""" + assert _raw_byok_rag_store_list({"entries": None}) == [] + assert _raw_byok_rag_store_list({"entries": "not-a-list"}) == [] + assert _raw_byok_rag_store_list({}) == [] + + +def test_raw_byok_rag_store_list_non_sequence_top_level() -> None: + """Non-list, non-dict input yields [].""" + assert _raw_byok_rag_store_list(None) == [] + assert _raw_byok_rag_store_list("byok") == [] + + # ============================================================================= # Test construct_vector_stores_section # ============================================================================= diff --git a/tests/unit/utils/test_responses.py b/tests/unit/utils/test_responses.py index 485ecbf26..9ae98a225 100644 --- a/tests/unit/utils/test_responses.py +++ b/tests/unit/utils/test_responses.py @@ -90,6 +90,14 @@ ) +def _byok_rag_section_mock(mocker: MockerFixture, entries: list[Any]) -> Any: + """Mock ``configuration.byok_rag`` with ``entries`` and default cutoff.""" + section = mocker.Mock() + section.entries = entries + section.relevance_cutoff_score = 0.0 + return section + + class MockOutputItem: # pylint: disable=too-few-public-methods """Mock Responses API output item.""" @@ -1643,7 +1651,9 @@ async def test_translates_byok_ids_in_prepare_tools( mock_byok_rag.rag_id = "ocp_docs" mock_byok_rag.vector_db_id = "vs-001" mock_config = mocker.Mock() - mock_config.configuration.byok_rag = [mock_byok_rag] + mock_config.configuration.byok_rag = _byok_rag_section_mock( + mocker, [mock_byok_rag] + ) mock_config.configuration.rag.tool = [] mock_config.configuration.rag.inline = [] mocker.patch("utils.responses.configuration", mock_config) @@ -1664,7 +1674,7 @@ async def test_passes_through_unknown_ids_in_prepare_tools( # Configure empty BYOK RAG mock_config = mocker.Mock() - mock_config.configuration.byok_rag = [] + mock_config.configuration.byok_rag = _byok_rag_section_mock(mocker, []) mock_config.configuration.rag.tool = [] mock_config.configuration.rag.inline = [] mocker.patch("utils.responses.configuration", mock_config) @@ -1694,7 +1704,9 @@ async def test_does_not_translate_when_ids_fetched_from_llama_stack( mock_byok_rag.rag_id = "vs-internal" mock_byok_rag.vector_db_id = "vs-translated" mock_config = mocker.Mock() - mock_config.configuration.byok_rag = [mock_byok_rag] + mock_config.configuration.byok_rag = _byok_rag_section_mock( + mocker, [mock_byok_rag] + ) mock_config.configuration.rag.tool = [] mock_config.configuration.rag.inline = [] mocker.patch("utils.responses.configuration", mock_config) @@ -1717,7 +1729,7 @@ async def test_uses_rag_tool_config_when_no_per_request_ids( mocker.patch("utils.responses.get_mcp_tools", return_value=None) mock_config = mocker.Mock() - mock_config.configuration.byok_rag = [] + mock_config.configuration.byok_rag = _byok_rag_section_mock(mocker, []) mock_config.configuration.rag.tool = ["rag-tool-id-1", "rag-tool-id-2"] mock_config.configuration.rag.inline = [] mocker.patch("utils.responses.configuration", mock_config) @@ -1742,7 +1754,9 @@ async def test_rag_tool_config_ids_are_translated( mock_byok_rag.rag_id = "ocp_docs" mock_byok_rag.vector_db_id = "vs-001" mock_config = mocker.Mock() - mock_config.configuration.byok_rag = [mock_byok_rag] + mock_config.configuration.byok_rag = _byok_rag_section_mock( + mocker, [mock_byok_rag] + ) mock_config.configuration.rag.tool = ["ocp_docs"] mock_config.configuration.rag.inline = [] mocker.patch("utils.responses.configuration", mock_config) @@ -1760,7 +1774,7 @@ async def test_inline_rag_disables_tool_rag(self, mocker: MockerFixture) -> None mocker.patch("utils.responses.get_mcp_tools", return_value=None) mock_config = mocker.Mock() - mock_config.configuration.byok_rag = [] + mock_config.configuration.byok_rag = _byok_rag_section_mock(mocker, []) mock_config.configuration.rag.tool = [] mock_config.configuration.rag.inline = [ "inline-store-id" @@ -1782,7 +1796,7 @@ async def test_per_request_ids_override_rag_tool_config( mocker.patch("utils.responses.get_mcp_tools", return_value=None) mock_config = mocker.Mock() - mock_config.configuration.byok_rag = [] + mock_config.configuration.byok_rag = _byok_rag_section_mock(mocker, []) mock_config.configuration.rag.tool = ["config-id-1"] mock_config.configuration.rag.inline = [] mocker.patch("utils.responses.configuration", mock_config) @@ -1807,7 +1821,7 @@ async def test_all_registered_dbs_used_when_neither_tool_nor_inline_configured( mocker.patch("utils.responses.get_mcp_tools", return_value=None) mock_config = mocker.Mock() - mock_config.configuration.byok_rag = [] + mock_config.configuration.byok_rag = _byok_rag_section_mock(mocker, []) mock_config.configuration.rag.tool = [] mock_config.configuration.rag.inline = [] mocker.patch("utils.responses.configuration", mock_config) @@ -3355,7 +3369,7 @@ async def test_client_tools_without_merge_header( return_value=mock_holder, ) mock_config = mocker.Mock() - mock_config.configuration.byok_rag = [] + mock_config.configuration.byok_rag = _byok_rag_section_mock(mocker, []) mock_config.mcp_servers = [] mocker.patch("utils.responses.configuration", mock_config) @@ -3380,7 +3394,7 @@ async def test_client_tools_with_merge_header(self, mocker: MockerFixture) -> No return_value=mock_holder, ) mock_config = mocker.Mock() - mock_config.configuration.byok_rag = [] + mock_config.configuration.byok_rag = _byok_rag_section_mock(mocker, []) mock_config.mcp_servers = [] mocker.patch("utils.responses.configuration", mock_config) @@ -3417,7 +3431,7 @@ async def test_merge_header_conflict_raises_409( return_value=mock_holder, ) mock_config = mocker.Mock() - mock_config.configuration.byok_rag = [] + mock_config.configuration.byok_rag = _byok_rag_section_mock(mocker, []) mock_config.mcp_servers = [] mocker.patch("utils.responses.configuration", mock_config) @@ -3476,7 +3490,7 @@ async def test_merge_header_no_server_tools_returns_client_only( return_value=mock_holder, ) mock_config = mocker.Mock() - mock_config.configuration.byok_rag = [] + mock_config.configuration.byok_rag = _byok_rag_section_mock(mocker, []) mock_config.mcp_servers = [] mocker.patch("utils.responses.configuration", mock_config) mocker.patch( diff --git a/tests/unit/utils/test_vector_search.py b/tests/unit/utils/test_vector_search.py index 8bd5669e9..7e2752dfd 100644 --- a/tests/unit/utils/test_vector_search.py +++ b/tests/unit/utils/test_vector_search.py @@ -1,5 +1,7 @@ """Unit tests for vector search utilities.""" +from typing import Any + import pytest from pydantic import AnyUrl from pytest_mock import MockerFixture @@ -23,6 +25,14 @@ ) +def _byok_rag_section_mock(mocker: MockerFixture, entries: list[Any]) -> Any: + """Build a mock ``byok_rag`` section with ``entries`` and cutoff.""" + section = mocker.Mock() + section.entries = entries + section.relevance_cutoff_score = 0.0 + return section + + class TestIsSolrEnabled: """Tests for _is_solr_enabled function.""" @@ -97,7 +107,10 @@ def test_extract_chunks_with_metadata(self, mocker: MockerFixture) -> None: search_response.scores = [0.9, 0.8] result = _extract_byok_rag_chunks( - search_response, vector_store_id="test_store", weight=1.5 + search_response, + vector_store_id="test_store", + weight=1.5, + byok_raw_cutoff_score=0.0, ) assert len(result) == 2 @@ -119,13 +132,42 @@ def test_extract_chunks_without_metadata(self, mocker: MockerFixture) -> None: search_response.scores = [0.75] result = _extract_byok_rag_chunks( - search_response, vector_store_id="test_store", weight=1.0 + search_response, + vector_store_id="test_store", + weight=1.0, + byok_raw_cutoff_score=0.0, ) assert len(result) == 1 assert result[0]["doc_id"] == "chunk_id" assert result[0]["metadata"] == {} + def test_extract_skips_chunks_below_raw_cutoff(self, mocker: MockerFixture) -> None: + """Raw scores below the BYOK cutoff are not included.""" + chunk_hi = mocker.Mock() + chunk_hi.content = "Hi" + chunk_hi.chunk_id = "a" + chunk_hi.metadata = {} + chunk_lo = mocker.Mock() + chunk_lo.content = "Lo" + chunk_lo.chunk_id = "b" + chunk_lo.metadata = {} + + search_response = mocker.Mock() + search_response.chunks = [chunk_hi, chunk_lo] + search_response.scores = [0.9, 0.2] + + result = _extract_byok_rag_chunks( + search_response, + vector_store_id="vs", + weight=0.5, + byok_raw_cutoff_score=0.5, + ) + assert len(result) == 1 + assert result[0]["content"] == "Hi" + assert result[0]["score"] == 0.9 + assert result[0]["weighted_score"] == 0.45 + class TestFormatRagContext: """Tests for _format_rag_context function.""" @@ -382,7 +424,7 @@ async def test_byok_no_inline_ids(self, mocker: MockerFixture) -> None: """Test when no inline BYOK sources are configured.""" config_mock = mocker.Mock(spec=AppConfig) config_mock.configuration.rag.inline = [] - config_mock.configuration.byok_rag = [] + config_mock.configuration.byok_rag = _byok_rag_section_mock(mocker, []) mocker.patch("utils.vector_search.configuration", config_mock) client_mock = mocker.AsyncMock() @@ -401,7 +443,9 @@ async def test_byok_enabled_success(self, mocker: MockerFixture) -> None: byok_rag_mock.rag_id = "rag_1" byok_rag_mock.vector_db_id = "vs_1" config_mock.configuration.rag.inline = ["rag_1"] - config_mock.configuration.byok_rag = [byok_rag_mock] + config_mock.configuration.byok_rag = _byok_rag_section_mock( + mocker, [byok_rag_mock] + ) config_mock.score_multiplier_mapping = {"vs_1": 1.5} config_mock.rag_id_mapping = {"vs_1": "rag_1"} mocker.patch("utils.vector_search.configuration", config_mock) @@ -439,7 +483,9 @@ async def test_user_facing_ids_translated_to_internal_ids( byok_rag_mock = mocker.Mock() byok_rag_mock.rag_id = "my-kb" byok_rag_mock.vector_db_id = "vs-internal-001" - config_mock.configuration.byok_rag = [byok_rag_mock] + config_mock.configuration.byok_rag = _byok_rag_section_mock( + mocker, [byok_rag_mock] + ) config_mock.configuration.rag.inline = ["my-kb"] config_mock.score_multiplier_mapping = {"vs-internal-001": 1.0} config_mock.rag_id_mapping = {"vs-internal-001": "my-kb"} @@ -479,7 +525,9 @@ async def test_multiple_user_facing_ids_each_translated( byok_rag_2 = mocker.Mock() byok_rag_2.rag_id = "kb-part2" byok_rag_2.vector_db_id = "vs-bbb-222" - config_mock.configuration.byok_rag = [byok_rag_1, byok_rag_2] + config_mock.configuration.byok_rag = _byok_rag_section_mock( + mocker, [byok_rag_1, byok_rag_2] + ) config_mock.configuration.rag.inline = ["kb-part1", "kb-part2"] config_mock.score_multiplier_mapping = {"vs-aaa-111": 1.0, "vs-bbb-222": 1.0} config_mock.rag_id_mapping = { @@ -522,7 +570,7 @@ async def test_no_inline_rag_configured_skips_byok( """Test that BYOK inline RAG is skipped when rag.inline is empty.""" config_mock = mocker.Mock(spec=AppConfig) config_mock.configuration.rag.inline = [] - config_mock.configuration.byok_rag = [] + config_mock.configuration.byok_rag = _byok_rag_section_mock(mocker, []) mocker.patch("utils.vector_search.configuration", config_mock) client_mock = mocker.AsyncMock() @@ -542,7 +590,7 @@ async def test_request_id_not_in_inline_config_skips_byok( """Test that a request vector_store_id not registered in rag.inline is filtered out.""" config_mock = mocker.Mock(spec=AppConfig) config_mock.configuration.rag.inline = ["registered-id"] - config_mock.configuration.byok_rag = [] + config_mock.configuration.byok_rag = _byok_rag_section_mock(mocker, []) mocker.patch("utils.vector_search.configuration", config_mock) client_mock = mocker.AsyncMock() @@ -555,6 +603,45 @@ async def test_request_id_not_in_inline_config_skips_byok( assert referenced_docs == [] client_mock.vector_io.query.assert_not_called() + @pytest.mark.asyncio + async def test_byok_relevance_cutoff_drops_low_raw_scores( + self, mocker: MockerFixture + ) -> None: + """Chunks with raw retrieval score below BYOK relevance_cutoff_score are excluded.""" + config_mock = mocker.Mock(spec=AppConfig) + byok_rag_mock = mocker.Mock() + byok_rag_mock.rag_id = "rag_1" + byok_rag_mock.vector_db_id = "vs_1" + section = mocker.Mock() + section.entries = [byok_rag_mock] + section.relevance_cutoff_score = 0.5 + config_mock.configuration.rag.inline = ["rag_1"] + config_mock.configuration.byok_rag = section + config_mock.score_multiplier_mapping = {"vs_1": 1.0} + config_mock.rag_id_mapping = {"vs_1": "rag_1"} + mocker.patch("utils.vector_search.configuration", config_mock) + + chunk_hi = mocker.Mock() + chunk_hi.content = "Keep" + chunk_hi.chunk_id = "c1" + chunk_hi.metadata = {} + + chunk_lo = mocker.Mock() + chunk_lo.content = "Drop" + chunk_lo.chunk_id = "c2" + chunk_lo.metadata = {} + + search_response = mocker.Mock() + search_response.chunks = [chunk_hi, chunk_lo] + search_response.scores = [0.9, 0.3] + + client_mock = mocker.AsyncMock() + client_mock.vector_io.query.return_value = search_response + + rag_chunks, _referenced_docs = await _fetch_byok_rag(client_mock, "q") + assert len(rag_chunks) == 1 + assert rag_chunks[0].content == "Keep" + class TestFetchSolrRag: """Tests for _fetch_solr_rag async function.""" @@ -613,7 +700,7 @@ async def test_both_sources_disabled(self, mocker: MockerFixture) -> None: """Test when both BYOK inline and Solr inline are not configured.""" config_mock = mocker.Mock(spec=AppConfig) config_mock.configuration.rag.inline = [] - config_mock.configuration.byok_rag = [] + config_mock.configuration.byok_rag = _byok_rag_section_mock(mocker, []) config_mock.inline_solr_enabled = False mocker.patch("utils.vector_search.configuration", config_mock) @@ -633,7 +720,9 @@ async def test_byok_enabled_only(self, mocker: MockerFixture) -> None: byok_rag_mock.rag_id = "rag_1" byok_rag_mock.vector_db_id = "vs_1" config_mock.configuration.rag.inline = ["rag_1"] - config_mock.configuration.byok_rag = [byok_rag_mock] + config_mock.configuration.byok_rag = _byok_rag_section_mock( + mocker, [byok_rag_mock] + ) config_mock.inline_solr_enabled = False config_mock.score_multiplier_mapping = {"vs_1": 1.0} config_mock.rag_id_mapping = {"vs_1": "rag_1"} @@ -657,4 +746,3 @@ async def test_byok_enabled_only(self, mocker: MockerFixture) -> None: assert len(context.rag_chunks) > 0 assert "BYOK content" in context.context_text - assert "file_search found" in context.context_text From e371ef42ae6714bf1a7c9caf1c92e20b28a1c8b4 Mon Sep 17 00:00:00 2001 From: Sergey Yedrikov Date: Fri, 17 Apr 2026 17:21:58 -0400 Subject: [PATCH 2/3] Per BYOK database cutoff score --- docs/byok_guide.md | 23 ++-- docs/openapi.json | 19 ++-- examples/lightspeed-stack-byok-okp-rag.yaml | 2 + src/app/endpoints/a2a.py | 32 +++++- src/app/endpoints/rags.py | 2 +- src/client.py | 2 +- src/configuration.py | 27 ++++- src/constants.py | 2 +- src/llama_stack_configuration.py | 27 +++-- src/models/config.py | 95 +++++++++-------- src/utils/responses.py | 6 +- src/utils/vector_search.py | 29 ++--- .../endpoints/test_query_byok_integration.py | 70 +++++------- .../test_streaming_query_byok_integration.py | 61 +++++------ tests/unit/app/endpoints/test_rags.py | 4 +- tests/unit/models/config/test_byok_rag.py | 100 +++++++++++------- .../models/config/test_dump_configuration.py | 46 +++----- tests/unit/telemetry/conftest.py | 5 +- tests/unit/test_configuration.py | 39 +++++++ tests/unit/test_llama_stack_configuration.py | 31 +----- tests/unit/utils/test_responses.py | 44 ++++---- tests/unit/utils/test_vector_search.py | 43 ++++---- 22 files changed, 388 insertions(+), 321 deletions(-) diff --git a/docs/byok_guide.md b/docs/byok_guide.md index b98a84139..62a3e346e 100644 --- a/docs/byok_guide.md +++ b/docs/byok_guide.md @@ -79,7 +79,7 @@ Both modes rely on: Inline RAG additionally supports: - **Score Multiplier**: Optional weight applied per BYOK vector store when mixing multiple sources. Allows custom prioritization of content. -- **Relevance cutoff (`relevance_cutoff_score`)**: Optional minimum **raw** similarity score from each BYOK vector store during **Inline RAG**. Chunks below the cutoff are dropped **before** `score_multiplier` is applied. It applies only to BYOK stores listed under `byok_rag`; it does not affect OKP/Solr inline RAG (which uses separate query defaults) and is not used for Tool RAG (`file_search`). The default matches `DEFAULT_BYOK_RAG_RELEVANCE_CUTOFF_SCORE` in `src/constants.py` (currently `0.3`). Set to `0.0` to disable filtering for BYOK inline retrieval. +- **Relevance cutoff (`relevance_cutoff_score`)**: Optional minimum **raw** similarity score, configured **per BYOK store** (each `byok_rag` entry) during **Inline RAG**. Chunks below the cutoff are dropped **before** `score_multiplier` is applied. It applies only to BYOK stores listed under `byok_rag`; it does not affect OKP/Solr inline RAG (which uses separate query defaults) and is not used for Tool RAG (`file_search`). If an entry omits `relevance_cutoff_score`, it defaults to `DEFAULT_BYOK_RAG_RELEVANCE_CUTOFF_SCORE` in `src/constants.py` (currently `0.3`). Set to `0.0` to disable filtering for that store. > [!NOTE] > OKP and BYOK scores are not directly comparable (different scoring systems), so @@ -281,23 +281,20 @@ registered_resources: > section of `lightspeed-stack.yaml`. The lightspeed-stack service automatically generates the required configuration > at startup. > -> Preferred shape: an object with `entries` (list of stores) and optional `relevance_cutoff_score`: +> Preferred shape: a YAML list of stores. Set optional `relevance_cutoff_score` on **each** store, or rely on the default from `src/constants.py`: > > ```yaml > byok_rag: -> relevance_cutoff_score: 0.3 # Optional; min raw score per BYOK store before score_multiplier (BYOK only) -> entries: -> - rag_id: my-docs # Unique identifier for this knowledge source -> rag_type: inline::faiss -> embedding_model: sentence-transformers/all-mpnet-base-v2 -> embedding_dimension: 768 -> vector_db_id: your-index-id # Llama Stack vector store ID (from index generation) -> db_path: /path/to/vector_db/faiss_store.db -> score_multiplier: 1.0 # Optional: weight results when mixing multiple sources +> - rag_id: my-docs # Unique identifier for this knowledge source +> rag_type: inline::faiss +> embedding_model: sentence-transformers/all-mpnet-base-v2 +> embedding_dimension: 768 +> vector_db_id: your-index-id # Llama Stack vector store ID (from index generation) +> db_path: /path/to/vector_db/faiss_store.db +> relevance_cutoff_score: 0.3 # Optional per store; min raw score before score_multiplier (BYOK inline only) +> score_multiplier: 1.0 # Optional: weight results when mixing multiple sources > ``` > -> Legacy: a bare list is still accepted and is treated as `entries` (same fields as each list item above). -> > When multiple BYOK sources are configured, `score_multiplier` adjusts the relative importance of > each store's results during Inline RAG retrieval. Values above 1.0 boost a store; below 1.0 reduce it. > `relevance_cutoff_score` filters by raw retrieval score first; weighting applies only to chunks that pass the cutoff. diff --git a/docs/openapi.json b/docs/openapi.json index 8679b5a03..ff0c3d03f 100644 --- a/docs/openapi.json +++ b/docs/openapi.json @@ -6931,8 +6931,8 @@ "tags": [ "a2a" ], - "summary": "Handle A2A Jsonrpc", - "description": "Handle A2A JSON-RPC requests following the A2A protocol specification.\n\nThis endpoint uses the DefaultRequestHandler from the A2A SDK to handle\nall JSON-RPC requests including message/send, message/stream, etc.\n\nThe A2A SDK application is created per-request to include authentication\ncontext while still leveraging FastAPI's authorization middleware.\n\nAutomatically detects streaming requests (message/stream JSON-RPC method)\nand returns a StreamingResponse to enable real-time chunk delivery.\n\nArgs:\n request: FastAPI request object\n auth: Authentication tuple\n mcp_headers: MCP headers for context propagation\n\nReturns:\n JSON-RPC response or streaming response", + "summary": "Handle A2A Jsonrpc Get", + "description": "Handle GET on ``/a2a`` for A2A JSON-RPC (same handler as POST).", "operationId": "handle_a2a_jsonrpc_a2a_get", "responses": { "200": { @@ -6949,9 +6949,9 @@ "tags": [ "a2a" ], - "summary": "Handle A2A Jsonrpc", - "description": "Handle A2A JSON-RPC requests following the A2A protocol specification.\n\nThis endpoint uses the DefaultRequestHandler from the A2A SDK to handle\nall JSON-RPC requests including message/send, message/stream, etc.\n\nThe A2A SDK application is created per-request to include authentication\ncontext while still leveraging FastAPI's authorization middleware.\n\nAutomatically detects streaming requests (message/stream JSON-RPC method)\nand returns a StreamingResponse to enable real-time chunk delivery.\n\nArgs:\n request: FastAPI request object\n auth: Authentication tuple\n mcp_headers: MCP headers for context propagation\n\nReturns:\n JSON-RPC response or streaming response", - "operationId": "handle_a2a_jsonrpc_a2a_get", + "summary": "Handle A2A Jsonrpc Post", + "description": "Handle POST on ``/a2a`` for A2A JSON-RPC (same handler as GET).", + "operationId": "handle_a2a_jsonrpc_a2a_post", "responses": { "200": { "description": "Successful Response", @@ -8074,6 +8074,13 @@ "title": "DB path", "description": "Path to RAG database." }, + "relevance_cutoff_score": { + "type": "number", + "minimum": 0.0, + "title": "BYOK inline RAG relevance cutoff", + "description": "Minimum raw similarity score from this **BYOK** vector store before score_multiplier weighting. Chunks below this threshold are dropped immediately after retrieval from this store only. Does not apply to OKP/Solr. Set to 0.0 to disable filtering for this store.", + "default": 0.3 + }, "score_multiplier": { "type": "number", "exclusiveMinimum": 0.0, @@ -8251,7 +8258,7 @@ }, "type": "array", "title": "BYOK RAG configuration", - "description": "BYOK RAG configuration. This configuration can be used to reconfigure Llama Stack through its run.yaml configuration file" + "description": "BYOK RAG configuration. This configuration can be used to reconfigure Llama Stack through its run.yaml configuration file. Use ``byok_rag: [ ... ]`` (a list of stores). Each store may set ``relevance_cutoff_score`` (BYOK inline RAG only; not used for OKP/Solr)." }, "a2a_state": { "$ref": "#/components/schemas/A2AStateConfiguration", diff --git a/examples/lightspeed-stack-byok-okp-rag.yaml b/examples/lightspeed-stack-byok-okp-rag.yaml index 4a6b433b8..a160eaafd 100644 --- a/examples/lightspeed-stack-byok-okp-rag.yaml +++ b/examples/lightspeed-stack-byok-okp-rag.yaml @@ -40,12 +40,14 @@ byok_rag: embedding_dimension: 1024 vector_db_id: vs_123 # Llama-stack vector_store_id db_path: /tmp/ocp.faiss + relevance_cutoff_score: 0.3 # Optional per store; min raw score before score_multiplier (inline BYOK only) score_multiplier: 1.0 # Weight for this vector store's results (Inline RAG only) - rag_id: knowledge-base # referenced in rag.inline / rag.tool rag_type: inline::faiss embedding_dimension: 384 vector_db_id: vs_456 # Llama-stack vector_store_id db_path: /tmp/kb.faiss + relevance_cutoff_score: 0.3 score_multiplier: 1.2 # Weight for this vector store's results (Inline RAG only) # RAG configuration diff --git a/src/app/endpoints/a2a.py b/src/app/endpoints/a2a.py index 0c9fe2a75..b2b8b80f7 100644 --- a/src/app/endpoints/a2a.py +++ b/src/app/endpoints/a2a.py @@ -692,12 +692,40 @@ async def _create_a2a_app( return a2a_app.build() -@router.api_route("/a2a", methods=["GET", "POST"], response_model=None) +@router.get( + "/a2a", + response_model=None, + operation_id="handle_a2a_jsonrpc_a2a_get", +) +@authorize(Action.A2A_JSONRPC) +async def handle_a2a_jsonrpc_get( + request: Request, + auth: Annotated[AuthTuple, Depends(auth_dependency)], + mcp_headers: McpHeaders = Depends(mcp_headers_dependency), +) -> Response | StreamingResponse: + """Handle GET on ``/a2a`` for A2A JSON-RPC (same handler as POST).""" + return await _handle_a2a_jsonrpc(request, auth, mcp_headers) + + +@router.post( + "/a2a", + response_model=None, + operation_id="handle_a2a_jsonrpc_a2a_post", +) @authorize(Action.A2A_JSONRPC) -async def handle_a2a_jsonrpc( # pylint: disable=too-many-locals,too-many-statements +async def handle_a2a_jsonrpc_post( request: Request, auth: Annotated[AuthTuple, Depends(auth_dependency)], mcp_headers: McpHeaders = Depends(mcp_headers_dependency), +) -> Response | StreamingResponse: + """Handle POST on ``/a2a`` for A2A JSON-RPC (same handler as GET).""" + return await _handle_a2a_jsonrpc(request, auth, mcp_headers) + + +async def _handle_a2a_jsonrpc( # pylint: disable=too-many-locals,too-many-statements + request: Request, + auth: AuthTuple, + mcp_headers: McpHeaders, ) -> Response | StreamingResponse: """ Handle A2A JSON-RPC requests following the A2A protocol specification. diff --git a/src/app/endpoints/rags.py b/src/app/endpoints/rags.py index 0e4b6c745..fdc988358 100644 --- a/src/app/endpoints/rags.py +++ b/src/app/endpoints/rags.py @@ -163,7 +163,7 @@ async def get_rag_endpoint_handler( # Resolve user-facing rag_id to llama-stack vector_db_id vector_db_id = _resolve_rag_id_to_vector_db_id( - rag_id, configuration.configuration.byok_rag.entries + rag_id, configuration.configuration.byok_rag ) try: diff --git a/src/client.py b/src/client.py index 1bc87ea0b..0c77c2d49 100644 --- a/src/client.py +++ b/src/client.py @@ -85,7 +85,7 @@ def _enrich_library_config(self, input_config_path: str) -> str: config = configuration.configuration # Enrichment: BYOK RAG - enrich_byok_rag(ls_config, [b.model_dump() for b in config.byok_rag.entries]) + enrich_byok_rag(ls_config, [b.model_dump() for b in config.byok_rag]) # Enrichment: Solr - enabled when "okp" appears in either inline or tool list enrich_solr(ls_config, config.rag.model_dump(), config.okp.model_dump()) diff --git a/src/configuration.py b/src/configuration.py index 297523f9f..3689d30b9 100644 --- a/src/configuration.py +++ b/src/configuration.py @@ -479,8 +479,7 @@ def rag_id_mapping(self) -> dict[str, str]: if self._configuration is None: raise LogicError("logic error: configuration is not loaded") byok_mapping = { - brag.vector_db_id: brag.rag_id - for brag in self._configuration.byok_rag.entries + brag.vector_db_id: brag.rag_id for brag in self._configuration.byok_rag } rag = self._configuration.rag @@ -506,9 +505,31 @@ def score_multiplier_mapping(self) -> dict[str, float]: raise LogicError("logic error: configuration is not loaded") return { brag.vector_db_id: brag.score_multiplier - for brag in self._configuration.byok_rag.entries + for brag in self._configuration.byok_rag } + @property + def relevance_cutoff_mapping(self) -> dict[str, float]: + """Return mapping from vector_db_id to relevance_cutoff_score from BYOK RAG config. + + If ``byok_rag`` lists the same ``vector_db_id`` more than once, the first + occurrence wins. + + Returns: + dict[str, float]: Mapping where keys are llama-stack ``vector_db_id`` values + and values are per-store raw-score cutoffs before score_multiplier weighting. + + Raises: + LogicError: If the configuration has not been loaded. + """ + if self._configuration is None: + raise LogicError("logic error: configuration is not loaded") + mapping: dict[str, float] = {} + for brag in self._configuration.byok_rag: + if brag.vector_db_id not in mapping: + mapping[brag.vector_db_id] = brag.relevance_cutoff_score + return mapping + @property def inline_solr_enabled(self) -> bool: """Return whether OKP is included in the inline RAG list. diff --git a/src/constants.py b/src/constants.py index 425840687..f09305e10 100644 --- a/src/constants.py +++ b/src/constants.py @@ -186,7 +186,7 @@ # Inline RAG constants BYOK_RAG_MAX_CHUNKS = 10 # retrieved from BYOK RAG -# Default minimum raw similarity for BYOK vector stores only (``byok_rag.relevance_cutoff_score``) +# Default minimum raw similarity per BYOK store (``ByokRag.relevance_cutoff_score``) DEFAULT_BYOK_RAG_RELEVANCE_CUTOFF_SCORE = 0.3 OKP_RAG_MAX_CHUNKS = 5 # retrieved from OKP RAG diff --git a/src/llama_stack_configuration.py b/src/llama_stack_configuration.py index 7fb12c1da..106db656e 100644 --- a/src/llama_stack_configuration.py +++ b/src/llama_stack_configuration.py @@ -42,25 +42,24 @@ def increase_indent(self, flow: bool = False, indentless: bool = False) -> None: def _raw_byok_rag_store_list(raw_byok_rag: Any) -> list[Any]: - """Return BYOK store definitions from raw Lightspeed YAML (list or section dict). + """Return BYOK store definitions from raw Lightspeed YAML. - Always returns a ``list`` suitable for :func:`enrich_byok_rag` (each item is - expected to be a mapping with ``.get``). + Only a YAML list is accepted as BYOK stores. Each list item is expected to be + a mapping with ``.get`` for :func:`enrich_byok_rag`. - For the section form ``byok_rag: { entries: ..., relevance_cutoff_score: ... }``, - only ``entries`` is used: a ``list`` is returned as-is, a single ``dict`` is - wrapped as a one-element list, and ``None``, strings, or other types yield - ``[]``. + Parameters: + ---------- + raw_byok_rag (Any): + Raw parsed YAML value (may be a list, ``None``, or other types). + + Returns: + ------- + list[Any]: + The input list returned as-is when ``raw_byok_rag`` is a list; + otherwise an empty list (``[]`` for ``None`` or any non-list type). """ if isinstance(raw_byok_rag, list): return raw_byok_rag - if isinstance(raw_byok_rag, dict): - entries = raw_byok_rag.get("entries") - if isinstance(entries, list): - return entries - if isinstance(entries, dict): - return [entries] - return [] return [] diff --git a/src/models/config.py b/src/models/config.py index 6ed0d0bd6..882d4b448 100644 --- a/src/models/config.py +++ b/src/models/config.py @@ -2,12 +2,13 @@ # pylint: disable=too-many-lines +import math import re from enum import Enum from functools import cached_property from pathlib import Path from re import Pattern -from typing import Annotated, Any, Literal, Optional, Self +from typing import Annotated, Any, Literal, Optional import jsonpath_ng import yaml @@ -27,6 +28,7 @@ model_validator, ) from pydantic.dataclasses import dataclass +from typing_extensions import Self # noqa: UP035 import constants from log import get_logger @@ -1613,6 +1615,39 @@ class ByokRag(ConfigurationBase): description="Path to RAG database.", ) + relevance_cutoff_score: float = Field( + constants.DEFAULT_BYOK_RAG_RELEVANCE_CUTOFF_SCORE, + ge=0, + title="BYOK inline RAG relevance cutoff", + description="Minimum raw similarity score from this **BYOK** vector store " + "before score_multiplier weighting. Chunks below this threshold are dropped " + "immediately after retrieval from this store only. Does not apply to OKP/Solr. " + "Set to 0.0 to disable filtering for this store.", + ) + + @field_validator("relevance_cutoff_score") + @classmethod + def validate_relevance_cutoff_score(cls, value: float) -> float: + """Reject non-finite values (e.g. ``.inf`` or ``.nan`` from YAML). + + Parameters: + ---------- + value: Cutoff after coercion to ``float``. + + Returns: + ------- + The same finite value. + + Raises: + ------ + ValueError: If ``value`` is not finite. + """ + if not math.isfinite(value): + raise ValueError( + "relevance_cutoff_score must be a finite number (not inf, -inf, or nan)" + ) + return value + score_multiplier: float = Field( constants.DEFAULT_SCORE_MULTIPLIER, gt=0, @@ -1624,50 +1659,25 @@ class ByokRag(ConfigurationBase): def _normalize_byok_rag_input(value: Any) -> Any: - """Allow legacy ``byok_rag: [ ... ]`` YAML alongside the section object form. + """Normalize ``byok_rag`` YAML to a list of :class:`ByokRag` definitions. + + ``null`` becomes ``[]``. The value must otherwise be a YAML list of store + mappings (not a mapping at the top level). - Explicit YAML null (``byok_rag: null``) is normalized to an empty mapping so - :class:`ByokRagSection` field defaults apply instead of a type error. + Raises: + ValueError: If ``value`` is a dict or other non-list type (other than ``None``). """ if value is None: - return {} + return [] if isinstance(value, list): - return {"entries": value} - return value - - -class ByokRagSection(ConfigurationBase): - """BYOK RAG configuration: registered BYOK stores and optional raw-score cutoff. - - Settings here apply only to bring-your-own-knowledge vector stores listed in - ``entries``. They do not affect OKP (Solr) inline RAG, which uses separate - query parameters and defaults. - """ - - entries: list[ByokRag] = Field( - default_factory=list, - title="BYOK RAG stores", - description="Registered bring-your-own-knowledge vector stores.", - ) - - relevance_cutoff_score: float = Field( - constants.DEFAULT_BYOK_RAG_RELEVANCE_CUTOFF_SCORE, - ge=0, - title="BYOK inline RAG relevance cutoff", - description="Minimum raw similarity score from each **BYOK** vector store " - "before per-store score_multiplier weighting. Chunks below this threshold " - "are dropped immediately after retrieval from those stores only. Does not " - "apply to OKP/Solr. Set to 0.0 to disable filtering for BYOK.", + return value + raise ValueError( + "byok_rag must be a YAML list of BYOK store definitions, not a mapping." ) -def _default_byok_rag_section() -> ByokRagSection: - """Return default BYOK RAG section; delegates to :class:`ByokRagSection` field defaults.""" - return ByokRagSection.model_validate({}) - - -ByokRagSectionValidated = Annotated[ - ByokRagSection, +ByokRagListValidated = Annotated[ + list[ByokRag], BeforeValidator(_normalize_byok_rag_input), ] @@ -1958,14 +1968,13 @@ class Configuration(ConfigurationBase): description="Conversation history configuration.", ) - byok_rag: ByokRagSectionValidated = Field( - default_factory=_default_byok_rag_section, + byok_rag: ByokRagListValidated = Field( + default_factory=list, title="BYOK RAG configuration", description="BYOK RAG configuration. This configuration can be used to " "reconfigure Llama Stack through its run.yaml configuration file. " - "You may use the legacy form ``byok_rag: [ ... ]`` (a list of stores) or " - "an object with ``entries`` and optional ``relevance_cutoff_score`` " - "(BYOK inline RAG only; not used for OKP/Solr).", + "Use ``byok_rag: [ ... ]`` (a list of stores). Each store may set " + "``relevance_cutoff_score`` (BYOK inline RAG only; not used for OKP/Solr).", ) a2a_state: A2AStateConfiguration = Field( diff --git a/src/utils/responses.py b/src/utils/responses.py index ba52cceac..3fa10b8e2 100644 --- a/src/utils/responses.py +++ b/src/utils/responses.py @@ -236,7 +236,7 @@ async def prepare_tools( # pylint: disable=too-many-arguments,too-many-position # If rag.inline is configured, but not rag.tool, tool RAG is disabled. # 3. All registered vector DBs: fallback when neither rag.tool nor rag.inline are configured. # IDs fetched from llama-stack are already internal and need no translation. - byok_rags = configuration.configuration.byok_rag.entries + byok_rags = configuration.configuration.byok_rag is_tool_rag_enabled = len(configuration.configuration.rag.tool) > 0 is_inline_rag_enabled = len(configuration.configuration.rag.inline) > 0 @@ -1708,7 +1708,7 @@ async def _resolve_client_tools( # Per-request override of vector stores (user-facing rag_ids) vector_store_ids = extract_vector_store_ids_from_tools(tools) or None # Translate user-facing rag_ids to llama-stack vector_store_ids in each file_search tool - byok_rags = configuration.configuration.byok_rag.entries + byok_rags = configuration.configuration.byok_rag prepared_tools = translate_tools_vector_store_ids(tools, byok_rags) prepared_tools = apply_mcp_headers_to_explicit_tools( prepared_tools, token, mcp_headers, request_headers @@ -1803,7 +1803,7 @@ async def resolve_tool_choice( ) else: # Pass tools explicitly configured for this request - byok_rags = configuration.configuration.byok_rag.entries + byok_rags = configuration.configuration.byok_rag prepared_tools = translate_tools_vector_store_ids(tools, byok_rags) prepared_tools = apply_mcp_headers_to_explicit_tools( prepared_tools, token, mcp_headers, request_headers diff --git a/src/utils/vector_search.py b/src/utils/vector_search.py index bea5a5604..4d3007e2a 100644 --- a/src/utils/vector_search.py +++ b/src/utils/vector_search.py @@ -87,7 +87,8 @@ def _extract_byok_rag_chunks( search_response: Response from vector_io.query vector_store_id: ID of the BYOK vector store that produced these results weight: Score multiplier to apply to this store's remaining results - byok_raw_cutoff_score: Minimum raw similarity score (``byok_rag.relevance_cutoff_score``) + byok_raw_cutoff_score: Minimum raw similarity score for this store + (``ByokRag.relevance_cutoff_score``). Returns: List of result dictionaries with weighted scores @@ -347,16 +348,16 @@ def _process_solr_chunks_for_documents( return doc_ids_from_chunks -async def _fetch_byok_rag( +async def _fetch_byok_rag( # pylint: disable=too-many-locals client: AsyncLlamaStackClient, query: str, vector_store_ids: Optional[list[str]] = None, # User-facing ) -> tuple[list[RAGChunk], list[ReferencedDocument]]: """Fetch chunks and documents from BYOK RAG sources only. - Applies ``configuration.byok_rag.relevance_cutoff_score`` to raw scores from - BYOK vector stores. OKP/Solr inline RAG is implemented in ``_fetch_solr_rag`` - and does not use that setting. + Applies each entry's ``relevance_cutoff_score`` (via ``relevance_cutoff_mapping``) + to raw scores from BYOK vector stores. OKP/Solr inline RAG is implemented in + ``_fetch_solr_rag`` and does not use that setting. Args: client: The AsyncLlamaStackClient to use for the request @@ -388,7 +389,7 @@ async def _fetch_byok_rag( # Translate user-facing rag_ids to llama-stack ids vector_store_ids_to_query: list[str] = resolve_vector_store_ids( - rag_ids_to_query, configuration.configuration.byok_rag.entries + rag_ids_to_query, configuration.configuration.byok_rag ) # Request-level override: filter out Solr store, use the rest @@ -404,11 +405,12 @@ async def _fetch_byok_rag( return rag_chunks, referenced_documents try: - # Get score multiplier and rag_id mappings + # Get score multiplier, relevance cutoff, and rag_id mappings score_multiplier_mapping = configuration.score_multiplier_mapping + relevance_cutoff_mapping = configuration.relevance_cutoff_mapping rag_id_mapping = configuration.rag_id_mapping - # BYOK stores only: byok_rag.relevance_cutoff_score (OKP/Solr does not use it). + # BYOK stores only: per-entry relevance_cutoff_score (OKP/Solr does not use it). results_per_store = await asyncio.gather( *[ _query_store_for_byok_rag( @@ -416,7 +418,10 @@ async def _fetch_byok_rag( vector_store_id, query, score_multiplier_mapping.get(vector_store_id, 1.0), - configuration.configuration.byok_rag.relevance_cutoff_score, + relevance_cutoff_mapping.get( + vector_store_id, + constants.DEFAULT_BYOK_RAG_RELEVANCE_CUTOFF_SCORE, + ), ) for vector_store_id in vector_store_ids_to_query ] @@ -543,9 +548,9 @@ async def build_rag_context( ) -> RAGContext: """Build RAG context by fetching and merging chunks from all enabled sources. - Enabled sources can be BYOK and/or Solr OKP. BYOK raw-score cutoff comes only - from ``byok_rag.relevance_cutoff_score``; OKP uses ``_build_query_params`` (e.g. - Solr ``score_threshold``), not the BYOK cutoff. + Enabled sources can be BYOK and/or Solr OKP. BYOK raw-score cutoffs come from + each ``ByokRag.relevance_cutoff_score``; OKP uses ``_build_query_params`` (e.g. + Solr ``score_threshold``), not the BYOK cutoffs. Args: client: The AsyncLlamaStackClient to use for the request diff --git a/tests/integration/endpoints/test_query_byok_integration.py b/tests/integration/endpoints/test_query_byok_integration.py index 5a43bca17..93ea0271d 100644 --- a/tests/integration/endpoints/test_query_byok_integration.py +++ b/tests/integration/endpoints/test_query_byok_integration.py @@ -254,6 +254,7 @@ def byok_config_fixture(test_config: AppConfig, mocker: MockerFixture) -> AppCon byok_entry.rag_id = "test-knowledge" byok_entry.vector_db_id = "vs-byok-knowledge" byok_entry.score_multiplier = 1.0 + byok_entry.relevance_cutoff_score = 0.0 byok_entry.model_dump.return_value = { "rag_id": "test-knowledge", "rag_type": "inline::faiss", @@ -262,13 +263,11 @@ def byok_config_fixture(test_config: AppConfig, mocker: MockerFixture) -> AppCon "vector_db_id": "vs-byok-knowledge", "db_path": "/tmp/test-db", "score_multiplier": 1.0, + "relevance_cutoff_score": 0.0, } # Patch the loaded configuration's byok_rag and rag.inline - _br = mocker.MagicMock() - _br.entries = [byok_entry] - _br.relevance_cutoff_score = 0.0 - test_config.configuration.byok_rag = _br + test_config.configuration.byok_rag = [byok_entry] test_config.configuration.rag.inline = ["test-knowledge"] return test_config @@ -287,6 +286,7 @@ def byok_tool_config_fixture( byok_entry.rag_id = "test-knowledge" byok_entry.vector_db_id = "vs-byok-knowledge" byok_entry.score_multiplier = 1.0 + byok_entry.relevance_cutoff_score = 0.0 byok_entry.model_dump.return_value = { "rag_id": "test-knowledge", "rag_type": "inline::faiss", @@ -295,12 +295,10 @@ def byok_tool_config_fixture( "vector_db_id": "vs-byok-knowledge", "db_path": "/tmp/test-db", "score_multiplier": 1.0, + "relevance_cutoff_score": 0.0, } - _br = mocker.MagicMock() - _br.entries = [byok_entry] - _br.relevance_cutoff_score = 0.0 - test_config.configuration.byok_rag = _br + test_config.configuration.byok_rag = [byok_entry] test_config.configuration.rag.inline = [] test_config.configuration.rag.tool = ["test-knowledge"] @@ -425,16 +423,15 @@ async def test_query_byok_inline_rag_with_request_vector_store_ids( entry_a.rag_id = "source-a" entry_a.vector_db_id = "vs-source-a" entry_a.score_multiplier = 1.0 + entry_a.relevance_cutoff_score = 0.0 entry_b = mocker.MagicMock() entry_b.rag_id = "source-b" entry_b.vector_db_id = "vs-source-b" entry_b.score_multiplier = 1.0 + entry_b.relevance_cutoff_score = 0.0 - _br = mocker.MagicMock() - _br.entries = [entry_a, entry_b] - _br.relevance_cutoff_score = 0.0 - test_config.configuration.byok_rag = _br + test_config.configuration.byok_rag = [entry_a, entry_b] test_config.configuration.rag.inline = ["source-a"] mock_holder_class = mocker.patch("app.endpoints.query.AsyncLlamaStackClientHolder") @@ -493,12 +490,11 @@ async def test_query_byok_request_vector_store_ids_filters_configured_stores( entry_b.rag_id = "source-b" entry_b.vector_db_id = "vs-source-b" entry_b.score_multiplier = 1.0 + entry_a.relevance_cutoff_score = 0.0 + entry_b.relevance_cutoff_score = 0.0 # Both sources are in config - _br = mocker.MagicMock() - _br.entries = [entry_a, entry_b] - _br.relevance_cutoff_score = 0.0 - test_config.configuration.byok_rag = _br + test_config.configuration.byok_rag = [entry_a, entry_b] test_config.configuration.rag.inline = ["source-a", "source-b"] mock_holder_class = mocker.patch("app.endpoints.query.AsyncLlamaStackClientHolder") @@ -696,10 +692,8 @@ async def test_query_byok_combined_inline_and_tool_rag( # pylint: disable=too-m byok_entry.rag_id = "test-knowledge" byok_entry.vector_db_id = "vs-byok-knowledge" byok_entry.score_multiplier = 1.0 - _br = mocker.MagicMock() - _br.entries = [byok_entry] - _br.relevance_cutoff_score = 0.0 - test_config.configuration.byok_rag = _br + byok_entry.relevance_cutoff_score = 0.0 + test_config.configuration.byok_rag = [byok_entry] test_config.configuration.rag.inline = ["test-knowledge"] test_config.configuration.rag.tool = ["test-knowledge"] @@ -809,16 +803,15 @@ async def test_query_byok_inline_rag_only_configured_rag_id_is_queried( entry_a.rag_id = "source-a" entry_a.vector_db_id = "vs-source-a" entry_a.score_multiplier = 1.0 + entry_a.relevance_cutoff_score = 0.0 entry_b = mocker.MagicMock() entry_b.rag_id = "source-b" entry_b.vector_db_id = "vs-source-b" entry_b.score_multiplier = 1.0 + entry_b.relevance_cutoff_score = 0.0 - _br = mocker.MagicMock() - _br.entries = [entry_a, entry_b] - _br.relevance_cutoff_score = 0.0 - test_config.configuration.byok_rag = _br + test_config.configuration.byok_rag = [entry_a, entry_b] test_config.configuration.rag.inline = ["source-a"] mock_holder_class = mocker.patch("app.endpoints.query.AsyncLlamaStackClientHolder") @@ -883,16 +876,15 @@ async def test_query_byok_score_multiplier_shifts_chunk_priority( # pylint: dis entry_a.rag_id = "source-a" entry_a.vector_db_id = "vs-source-a" entry_a.score_multiplier = 1.0 + entry_a.relevance_cutoff_score = 0.0 entry_b = mocker.MagicMock() entry_b.rag_id = "source-b" entry_b.vector_db_id = "vs-source-b" entry_b.score_multiplier = 5.0 + entry_b.relevance_cutoff_score = 0.0 - _br = mocker.MagicMock() - _br.entries = [entry_a, entry_b] - _br.relevance_cutoff_score = 0.0 - test_config.configuration.byok_rag = _br + test_config.configuration.byok_rag = [entry_a, entry_b] test_config.configuration.rag.inline = ["source-a", "source-b"] mock_holder_class = mocker.patch("app.endpoints.query.AsyncLlamaStackClientHolder") @@ -965,11 +957,9 @@ async def test_query_byok_cutoff_applied_before_multiplier( entry.rag_id = "cutoff-test" entry.vector_db_id = "vs-cutoff-test" entry.score_multiplier = 10.0 + entry.relevance_cutoff_score = 0.5 - _br = mocker.MagicMock() - _br.entries = [entry] - _br.relevance_cutoff_score = 0.5 - test_config.configuration.byok_rag = _br + test_config.configuration.byok_rag = [entry] test_config.configuration.rag.inline = ["cutoff-test"] mock_holder_class = mocker.patch("app.endpoints.query.AsyncLlamaStackClientHolder") @@ -998,8 +988,7 @@ async def test_query_byok_cutoff_applied_before_multiplier( mcp_headers={}, ) - assert response.rag_chunks is not None - assert len(response.rag_chunks) == 0 + assert not response.rag_chunks # ============================================================================== @@ -1028,11 +1017,9 @@ async def test_query_byok_max_chunks_caps_retrieved_results( # pylint: disable= entry.rag_id = "big-source" entry.vector_db_id = "vs-big-source" entry.score_multiplier = 1.0 + entry.relevance_cutoff_score = 0.0 - _br = mocker.MagicMock() - _br.entries = [entry] - _br.relevance_cutoff_score = 0.0 - test_config.configuration.byok_rag = _br + test_config.configuration.byok_rag = [entry] test_config.configuration.rag.inline = ["big-source"] mock_holder_class = mocker.patch("app.endpoints.query.AsyncLlamaStackClientHolder") @@ -1103,11 +1090,10 @@ async def test_query_byok_max_chunks_caps_across_multiple_sources( # pylint: di entry_b.rag_id = "source-b" entry_b.vector_db_id = "vs-source-b" entry_b.score_multiplier = 1.0 + entry_a.relevance_cutoff_score = 0.0 + entry_b.relevance_cutoff_score = 0.0 - _br = mocker.MagicMock() - _br.entries = [entry_a, entry_b] - _br.relevance_cutoff_score = 0.0 - test_config.configuration.byok_rag = _br + test_config.configuration.byok_rag = [entry_a, entry_b] test_config.configuration.rag.inline = ["source-a", "source-b"] mock_holder_class = mocker.patch("app.endpoints.query.AsyncLlamaStackClientHolder") diff --git a/tests/integration/endpoints/test_streaming_query_byok_integration.py b/tests/integration/endpoints/test_streaming_query_byok_integration.py index 8f963be11..dd53f2da7 100644 --- a/tests/integration/endpoints/test_streaming_query_byok_integration.py +++ b/tests/integration/endpoints/test_streaming_query_byok_integration.py @@ -236,6 +236,7 @@ def byok_config_fixture(test_config: AppConfig, mocker: MockerFixture) -> AppCon byok_entry.rag_id = "test-knowledge" byok_entry.vector_db_id = "vs-byok-knowledge" byok_entry.score_multiplier = 1.0 + byok_entry.relevance_cutoff_score = 0.0 byok_entry.model_dump.return_value = { "rag_id": "test-knowledge", "rag_type": "inline::faiss", @@ -244,12 +245,10 @@ def byok_config_fixture(test_config: AppConfig, mocker: MockerFixture) -> AppCon "vector_db_id": "vs-byok-knowledge", "db_path": "/tmp/test-db", "score_multiplier": 1.0, + "relevance_cutoff_score": 0.0, } - _br = mocker.MagicMock() - _br.entries = [byok_entry] - _br.relevance_cutoff_score = 0.0 - test_config.configuration.byok_rag = _br + test_config.configuration.byok_rag = [byok_entry] test_config.configuration.rag.inline = ["test-knowledge"] return test_config @@ -264,6 +263,7 @@ def byok_tool_config_fixture( byok_entry.rag_id = "test-knowledge" byok_entry.vector_db_id = "vs-byok-knowledge" byok_entry.score_multiplier = 1.0 + byok_entry.relevance_cutoff_score = 0.0 byok_entry.model_dump.return_value = { "rag_id": "test-knowledge", "rag_type": "inline::faiss", @@ -272,12 +272,10 @@ def byok_tool_config_fixture( "vector_db_id": "vs-byok-knowledge", "db_path": "/tmp/test-db", "score_multiplier": 1.0, + "relevance_cutoff_score": 0.0, } - _br = mocker.MagicMock() - _br.entries = [byok_entry] - _br.relevance_cutoff_score = 0.0 - test_config.configuration.byok_rag = _br + test_config.configuration.byok_rag = [byok_entry] test_config.configuration.rag.inline = [] test_config.configuration.rag.tool = ["test-knowledge"] @@ -345,16 +343,15 @@ async def test_streaming_query_byok_inline_rag_with_request_vector_store_ids( entry_a.rag_id = "source-a" entry_a.vector_db_id = "vs-source-a" entry_a.score_multiplier = 1.0 + entry_a.relevance_cutoff_score = 0.0 entry_b = mocker.MagicMock() entry_b.rag_id = "source-b" entry_b.vector_db_id = "vs-source-b" entry_b.score_multiplier = 1.0 + entry_b.relevance_cutoff_score = 0.0 - _br = mocker.MagicMock() - _br.entries = [entry_a, entry_b] - _br.relevance_cutoff_score = 0.0 - test_config.configuration.byok_rag = _br + test_config.configuration.byok_rag = [entry_a, entry_b] test_config.configuration.rag.inline = ["source-a"] mock_holder_class = mocker.patch( @@ -412,16 +409,15 @@ async def test_streaming_query_byok_request_vector_store_ids_filters_configured_ entry_a.rag_id = "source-a" entry_a.vector_db_id = "vs-source-a" entry_a.score_multiplier = 1.0 + entry_a.relevance_cutoff_score = 0.0 entry_b = mocker.MagicMock() entry_b.rag_id = "source-b" entry_b.vector_db_id = "vs-source-b" entry_b.score_multiplier = 1.0 + entry_b.relevance_cutoff_score = 0.0 - _br = mocker.MagicMock() - _br.entries = [entry_a, entry_b] - _br.relevance_cutoff_score = 0.0 - test_config.configuration.byok_rag = _br + test_config.configuration.byok_rag = [entry_a, entry_b] test_config.configuration.rag.inline = ["source-a", "source-b"] mock_holder_class = mocker.patch( @@ -701,10 +697,8 @@ async def test_streaming_query_byok_combined_inline_and_tool_rag( byok_entry.rag_id = "test-knowledge" byok_entry.vector_db_id = "vs-byok-knowledge" byok_entry.score_multiplier = 1.0 - _br = mocker.MagicMock() - _br.entries = [byok_entry] - _br.relevance_cutoff_score = 0.0 - test_config.configuration.byok_rag = _br + byok_entry.relevance_cutoff_score = 0.0 + test_config.configuration.byok_rag = [byok_entry] test_config.configuration.rag.inline = ["test-knowledge"] test_config.configuration.rag.tool = ["test-knowledge"] @@ -781,16 +775,15 @@ async def test_streaming_query_byok_only_configured_rag_id_is_queried( # pylint entry_a.rag_id = "source-a" entry_a.vector_db_id = "vs-source-a" entry_a.score_multiplier = 1.0 + entry_a.relevance_cutoff_score = 0.0 entry_b = mocker.MagicMock() entry_b.rag_id = "source-b" entry_b.vector_db_id = "vs-source-b" entry_b.score_multiplier = 1.0 + entry_b.relevance_cutoff_score = 0.0 - _br = mocker.MagicMock() - _br.entries = [entry_a, entry_b] - _br.relevance_cutoff_score = 0.0 - test_config.configuration.byok_rag = _br + test_config.configuration.byok_rag = [entry_a, entry_b] test_config.configuration.rag.inline = ["source-a"] mock_holder_class = mocker.patch( @@ -863,16 +856,15 @@ async def test_streaming_query_byok_score_multiplier_shifts_priority( # pylint: entry_a.rag_id = "source-a" entry_a.vector_db_id = "vs-source-a" entry_a.score_multiplier = 1.0 + entry_a.relevance_cutoff_score = 0.0 entry_b = mocker.MagicMock() entry_b.rag_id = "source-b" entry_b.vector_db_id = "vs-source-b" entry_b.score_multiplier = 5.0 + entry_b.relevance_cutoff_score = 0.0 - _br = mocker.MagicMock() - _br.entries = [entry_a, entry_b] - _br.relevance_cutoff_score = 0.0 - test_config.configuration.byok_rag = _br + test_config.configuration.byok_rag = [entry_a, entry_b] test_config.configuration.rag.inline = ["source-a", "source-b"] mock_holder_class = mocker.patch( @@ -953,11 +945,9 @@ async def test_streaming_query_byok_max_chunks_caps_context( # pylint: disable= entry.rag_id = "big-source" entry.vector_db_id = "vs-big-source" entry.score_multiplier = 1.0 + entry.relevance_cutoff_score = 0.0 - _br = mocker.MagicMock() - _br.entries = [entry] - _br.relevance_cutoff_score = 0.0 - test_config.configuration.byok_rag = _br + test_config.configuration.byok_rag = [entry] test_config.configuration.rag.inline = ["big-source"] mock_holder_class = mocker.patch( @@ -1032,11 +1022,10 @@ async def test_streaming_query_byok_max_chunks_caps_across_multiple_sources( # entry_b.rag_id = "source-b" entry_b.vector_db_id = "vs-source-b" entry_b.score_multiplier = 1.0 + entry_a.relevance_cutoff_score = 0.0 + entry_b.relevance_cutoff_score = 0.0 - _br = mocker.MagicMock() - _br.entries = [entry_a, entry_b] - _br.relevance_cutoff_score = 0.0 - test_config.configuration.byok_rag = _br + test_config.configuration.byok_rag = [entry_a, entry_b] test_config.configuration.rag.inline = ["source-a", "source-b"] mock_holder_class = mocker.patch( diff --git a/tests/unit/app/endpoints/test_rags.py b/tests/unit/app/endpoints/test_rags.py index f6247aefa..563c223fe 100644 --- a/tests/unit/app/endpoints/test_rags.py +++ b/tests/unit/app/endpoints/test_rags.py @@ -379,7 +379,7 @@ def __init__(self) -> None: def test_resolve_rag_id_to_vector_db_id_with_mapping(tmp_path: Path) -> None: """Test that _resolve_rag_id_to_vector_db_id maps rag_id to vector_db_id.""" byok_config = _make_byok_config(str(tmp_path)) - byok_rags = byok_config.configuration.byok_rag.entries + byok_rags = byok_config.configuration.byok_rag assert _resolve_rag_id_to_vector_db_id("ocp-4.18-docs", byok_rags) == "vs_abc123" assert _resolve_rag_id_to_vector_db_id("company-kb", byok_rags) == "vs_def456" @@ -387,5 +387,5 @@ def test_resolve_rag_id_to_vector_db_id_with_mapping(tmp_path: Path) -> None: def test_resolve_rag_id_to_vector_db_id_passthrough(tmp_path: Path) -> None: """Test that unmapped IDs are passed through unchanged.""" byok_config = _make_byok_config(str(tmp_path)) - byok_rags = byok_config.configuration.byok_rag.entries + byok_rags = byok_config.configuration.byok_rag assert _resolve_rag_id_to_vector_db_id("vs_unknown", byok_rags) == "vs_unknown" diff --git a/tests/unit/models/config/test_byok_rag.py b/tests/unit/models/config/test_byok_rag.py index 4083dd6b7..d0fd88420 100644 --- a/tests/unit/models/config/test_byok_rag.py +++ b/tests/unit/models/config/test_byok_rag.py @@ -12,7 +12,7 @@ DEFAULT_RAG_TYPE, DEFAULT_SCORE_MULTIPLIER, ) -from models.config import ByokRag, ByokRagSection, Configuration +from models.config import ByokRag, Configuration def test_byok_rag_configuration_default_values() -> None: @@ -37,6 +37,7 @@ def test_byok_rag_configuration_default_values() -> None: assert byok_rag.embedding_dimension == DEFAULT_EMBEDDING_DIMENSION assert byok_rag.vector_db_id == "vector_db_id" assert byok_rag.db_path == "tests/configuration/rag.txt" + assert byok_rag.relevance_cutoff_score == DEFAULT_BYOK_RAG_RELEVANCE_CUTOFF_SCORE assert byok_rag.score_multiplier == DEFAULT_SCORE_MULTIPLIER @@ -192,8 +193,8 @@ def test_byok_rag_configuration_score_multiplier_must_be_positive() -> None: ) -def test_byok_rag_section_explicit_null_yields_defaults() -> None: - """``byok_rag: null`` in YAML normalizes to defaults (empty entries, default cutoff).""" +def test_byok_rag_explicit_null_yields_empty_list() -> None: + """``byok_rag: null`` in YAML normalizes to an empty list.""" cfg = Configuration.model_validate( { "name": "t", @@ -208,16 +209,13 @@ def test_byok_rag_section_explicit_null_yields_defaults() -> None: "byok_rag": None, } ) - assert cfg.byok_rag.entries == [] - assert ( - cfg.byok_rag.relevance_cutoff_score == DEFAULT_BYOK_RAG_RELEVANCE_CUTOFF_SCORE - ) + assert cfg.byok_rag == [] -def test_byok_rag_section_object_and_legacy_list_in_configuration() -> None: - """``byok_rag`` accepts a section object or a legacy list of stores.""" +def test_byok_rag_list_in_configuration() -> None: + """``byok_rag`` is a YAML list of stores.""" db_path = "tests/configuration/rag.txt" - from_list = Configuration.model_validate( + cfg = Configuration.model_validate( { "name": "t", "service": {"host": "localhost", "port": 8080}, @@ -237,43 +235,63 @@ def test_byok_rag_section_object_and_legacy_list_in_configuration() -> None: ], } ) - assert from_list.byok_rag.entries[0].rag_id == "r1" + assert cfg.byok_rag[0].rag_id == "r1" assert ( - from_list.byok_rag.relevance_cutoff_score + cfg.byok_rag[0].relevance_cutoff_score == DEFAULT_BYOK_RAG_RELEVANCE_CUTOFF_SCORE ) - from_obj = Configuration.model_validate( - { - "name": "t", - "service": {"host": "localhost", "port": 8080}, - "llama_stack": { - "api_key": "k", - "url": "http://x:1", - "use_as_library_client": False, - }, - "user_data_collection": {}, - "authentication": {"module": "noop"}, - "byok_rag": { - "entries": [ - { - "rag_id": "r2", - "vector_db_id": "vs2", - "db_path": db_path, - }, - ], - "relevance_cutoff_score": 0.42, - }, - } - ) - assert from_obj.byok_rag.entries[0].rag_id == "r2" - assert from_obj.byok_rag.relevance_cutoff_score == 0.42 +def test_byok_rag_rejects_top_level_mapping() -> None: + """A mapping (including ``entries``) at top level is not valid.""" + db_path = "tests/configuration/rag.txt" + with pytest.raises(ValidationError): + Configuration.model_validate( + { + "name": "t", + "service": {"host": "localhost", "port": 8080}, + "llama_stack": { + "api_key": "k", + "url": "http://x:1", + "use_as_library_client": False, + }, + "user_data_collection": {}, + "authentication": {"module": "noop"}, + "byok_rag": { + "entries": [ + { + "rag_id": "r2", + "vector_db_id": "vs2", + "db_path": db_path, + }, + ], + "relevance_cutoff_score": 0.42, + }, + } + ) -def test_byok_rag_section_rejects_negative_cutoff() -> None: - """relevance_cutoff_score must be non-negative.""" + +def test_byok_rag_rejects_negative_cutoff_on_entry() -> None: + """relevance_cutoff_score on each entry must be non-negative.""" with pytest.raises(ValidationError): - _ = ByokRagSection( - entries=[], + _ = ByokRag( + rag_id="rag_id", + vector_db_id="vector_db_id", + db_path="tests/configuration/rag.txt", relevance_cutoff_score=-0.1, ) + + +@pytest.mark.parametrize( + "non_finite", + [float("inf"), float("-inf"), float("nan")], +) +def test_byok_rag_rejects_non_finite_relevance_cutoff(non_finite: float) -> None: + """relevance_cutoff_score must be finite (reject inf and nan).""" + with pytest.raises(ValidationError): + _ = ByokRag( + rag_id="rag_id", + vector_db_id="vector_db_id", + db_path="tests/configuration/rag.txt", + relevance_cutoff_score=non_finite, + ) diff --git a/tests/unit/models/config/test_dump_configuration.py b/tests/unit/models/config/test_dump_configuration.py index 49184cf31..09a5cf79a 100644 --- a/tests/unit/models/config/test_dump_configuration.py +++ b/tests/unit/models/config/test_dump_configuration.py @@ -190,10 +190,7 @@ def test_dump_configuration(tmp_path: Path) -> None: "sqlite": None, "type": None, }, - "byok_rag": { - "entries": [], - "relevance_cutoff_score": constants.DEFAULT_BYOK_RAG_RELEVANCE_CUTOFF_SCORE, - }, + "byok_rag": [], "quota_handlers": { "sqlite": None, "postgres": None, @@ -535,10 +532,7 @@ def test_dump_configuration_with_quota_limiters(tmp_path: Path) -> None: "sqlite": None, "type": None, }, - "byok_rag": { - "entries": [], - "relevance_cutoff_score": constants.DEFAULT_BYOK_RAG_RELEVANCE_CUTOFF_SCORE, - }, + "byok_rag": [], "quota_handlers": { "sqlite": None, "postgres": None, @@ -772,10 +766,7 @@ def test_dump_configuration_with_quota_limiters_different_values( "sqlite": None, "type": None, }, - "byok_rag": { - "entries": [], - "relevance_cutoff_score": constants.DEFAULT_BYOK_RAG_RELEVANCE_CUTOFF_SCORE, - }, + "byok_rag": [], "quota_handlers": { "sqlite": None, "postgres": None, @@ -989,20 +980,18 @@ def test_dump_configuration_byok(tmp_path: Path) -> None: "sqlite": None, "type": None, }, - "byok_rag": { - "entries": [ - { - "db_path": "tests/configuration/rag.txt", - "embedding_dimension": 768, - "embedding_model": "sentence-transformers/all-mpnet-base-v2", - "rag_id": "rag_id", - "rag_type": "inline::faiss", - "vector_db_id": "vector_db_id", - "score_multiplier": 1.0, - }, - ], - "relevance_cutoff_score": constants.DEFAULT_BYOK_RAG_RELEVANCE_CUTOFF_SCORE, - }, + "byok_rag": [ + { + "db_path": "tests/configuration/rag.txt", + "embedding_dimension": 768, + "embedding_model": "sentence-transformers/all-mpnet-base-v2", + "rag_id": "rag_id", + "rag_type": "inline::faiss", + "vector_db_id": "vector_db_id", + "relevance_cutoff_score": constants.DEFAULT_BYOK_RAG_RELEVANCE_CUTOFF_SCORE, + "score_multiplier": 1.0, + }, + ], "quota_handlers": { "sqlite": None, "postgres": None, @@ -1196,10 +1185,7 @@ def test_dump_configuration_pg_namespace(tmp_path: Path) -> None: "sqlite": None, "type": None, }, - "byok_rag": { - "entries": [], - "relevance_cutoff_score": constants.DEFAULT_BYOK_RAG_RELEVANCE_CUTOFF_SCORE, - }, + "byok_rag": [], "quota_handlers": { "sqlite": None, "postgres": None, diff --git a/tests/unit/telemetry/conftest.py b/tests/unit/telemetry/conftest.py index e7ef44255..6b2db6a82 100644 --- a/tests/unit/telemetry/conftest.py +++ b/tests/unit/telemetry/conftest.py @@ -12,7 +12,6 @@ Action, AuthenticationConfiguration, AuthorizationConfiguration, - ByokRagSection, Configuration, CORSConfiguration, Customization, @@ -288,7 +287,7 @@ def build_fully_populated_config() -> Configuration: ), ], conversation_cache=None, - byok_rag=ByokRagSection(entries=[]), + byok_rag=[], a2a_state=None, quota_handlers=None, azure_entra_id=None, @@ -364,7 +363,7 @@ def build_minimal_config() -> Configuration: ), mcp_servers=[], conversation_cache=None, - byok_rag=ByokRagSection(entries=[]), + byok_rag=[], a2a_state=None, quota_handlers=None, azure_entra_id=None, diff --git a/tests/unit/test_configuration.py b/tests/unit/test_configuration.py index 798fe89f4..f0212fe46 100644 --- a/tests/unit/test_configuration.py +++ b/tests/unit/test_configuration.py @@ -1186,6 +1186,45 @@ def test_score_multiplier_mapping_with_custom_values(tmp_path: Path) -> None: assert cfg.score_multiplier_mapping == {"vs-001": 1.5, "vs-002": 0.75} +def test_relevance_cutoff_mapping_first_wins_duplicate_vector_db_id( + tmp_path: Path, +) -> None: + """First BYOK entry wins when two entries share the same vector_db_id.""" + db_file1 = tmp_path / "test1.db" + db_file1.touch() + db_file2 = tmp_path / "test2.db" + db_file2.touch() + cfg = AppConfig() + cfg.init_from_dict( + { + "name": "test", + "service": {"host": "localhost", "port": 8080}, + "llama_stack": { + "api_key": "k", + "url": "http://test.com:1234", + "use_as_library_client": False, + }, + "user_data_collection": {}, + "authentication": {"module": "noop"}, + "byok_rag": [ + { + "rag_id": "kb1", + "vector_db_id": "vs-dup", + "db_path": str(db_file1), + "relevance_cutoff_score": 0.2, + }, + { + "rag_id": "kb2", + "vector_db_id": "vs-dup", + "db_path": str(db_file2), + "relevance_cutoff_score": 0.9, + }, + ], + } + ) + assert cfg.relevance_cutoff_mapping == {"vs-dup": 0.2} + + def test_score_multiplier_mapping_not_loaded() -> None: """Test that score_multiplier_mapping raises when config not loaded.""" cfg = AppConfig() diff --git a/tests/unit/test_llama_stack_configuration.py b/tests/unit/test_llama_stack_configuration.py index 6d14a5a8d..cffa8f0da 100644 --- a/tests/unit/test_llama_stack_configuration.py +++ b/tests/unit/test_llama_stack_configuration.py @@ -30,37 +30,16 @@ # ============================================================================= -def test_raw_byok_rag_store_list_legacy_list() -> None: - """Bare list form returns the same sequence.""" +def test_raw_byok_rag_store_list_returns_list_as_is() -> None: + """YAML list form returns the same sequence.""" raw = [{"rag_id": "a"}] assert _raw_byok_rag_store_list(raw) is raw -def test_raw_byok_rag_store_list_section_with_entries_list() -> None: - """Section dict with entries list returns that list.""" - entries = [{"rag_id": "x"}] - assert ( - _raw_byok_rag_store_list({"relevance_cutoff_score": 0.3, "entries": entries}) - is entries - ) - - -def test_raw_byok_rag_store_list_section_with_single_entry_dict() -> None: - """Section dict may use a single mapping for entries (wrap as one-element list).""" - one = {"rag_id": "solo", "vector_db_id": "vs1", "db_path": "/d.db"} - out = _raw_byok_rag_store_list({"entries": one}) - assert out == [one] - - -def test_raw_byok_rag_store_list_section_entries_invalid_or_missing() -> None: - """None, strings, or missing entries yield an empty list.""" - assert _raw_byok_rag_store_list({"entries": None}) == [] - assert _raw_byok_rag_store_list({"entries": "not-a-list"}) == [] +def test_raw_byok_rag_store_list_non_list_yields_empty() -> None: + """Only a list is used; mappings and other types yield [].""" + assert _raw_byok_rag_store_list({"entries": [{"rag_id": "x"}]}) == [] assert _raw_byok_rag_store_list({}) == [] - - -def test_raw_byok_rag_store_list_non_sequence_top_level() -> None: - """Non-list, non-dict input yields [].""" assert _raw_byok_rag_store_list(None) == [] assert _raw_byok_rag_store_list("byok") == [] diff --git a/tests/unit/utils/test_responses.py b/tests/unit/utils/test_responses.py index 9ae98a225..add19be23 100644 --- a/tests/unit/utils/test_responses.py +++ b/tests/unit/utils/test_responses.py @@ -90,12 +90,14 @@ ) -def _byok_rag_section_mock(mocker: MockerFixture, entries: list[Any]) -> Any: - """Mock ``configuration.byok_rag`` with ``entries`` and default cutoff.""" - section = mocker.Mock() - section.entries = entries - section.relevance_cutoff_score = 0.0 - return section +def _byok_rag_list_mock(entries: list[Any]) -> list[Any]: + """Mock ``configuration.byok_rag`` as a list of entries.""" + for entry in entries: + if not isinstance(getattr(entry, "relevance_cutoff_score", None), (int, float)): + entry.relevance_cutoff_score = ( + constants.DEFAULT_BYOK_RAG_RELEVANCE_CUTOFF_SCORE + ) + return entries class MockOutputItem: # pylint: disable=too-few-public-methods @@ -1651,9 +1653,7 @@ async def test_translates_byok_ids_in_prepare_tools( mock_byok_rag.rag_id = "ocp_docs" mock_byok_rag.vector_db_id = "vs-001" mock_config = mocker.Mock() - mock_config.configuration.byok_rag = _byok_rag_section_mock( - mocker, [mock_byok_rag] - ) + mock_config.configuration.byok_rag = _byok_rag_list_mock([mock_byok_rag]) mock_config.configuration.rag.tool = [] mock_config.configuration.rag.inline = [] mocker.patch("utils.responses.configuration", mock_config) @@ -1674,7 +1674,7 @@ async def test_passes_through_unknown_ids_in_prepare_tools( # Configure empty BYOK RAG mock_config = mocker.Mock() - mock_config.configuration.byok_rag = _byok_rag_section_mock(mocker, []) + mock_config.configuration.byok_rag = _byok_rag_list_mock([]) mock_config.configuration.rag.tool = [] mock_config.configuration.rag.inline = [] mocker.patch("utils.responses.configuration", mock_config) @@ -1704,9 +1704,7 @@ async def test_does_not_translate_when_ids_fetched_from_llama_stack( mock_byok_rag.rag_id = "vs-internal" mock_byok_rag.vector_db_id = "vs-translated" mock_config = mocker.Mock() - mock_config.configuration.byok_rag = _byok_rag_section_mock( - mocker, [mock_byok_rag] - ) + mock_config.configuration.byok_rag = _byok_rag_list_mock([mock_byok_rag]) mock_config.configuration.rag.tool = [] mock_config.configuration.rag.inline = [] mocker.patch("utils.responses.configuration", mock_config) @@ -1729,7 +1727,7 @@ async def test_uses_rag_tool_config_when_no_per_request_ids( mocker.patch("utils.responses.get_mcp_tools", return_value=None) mock_config = mocker.Mock() - mock_config.configuration.byok_rag = _byok_rag_section_mock(mocker, []) + mock_config.configuration.byok_rag = _byok_rag_list_mock([]) mock_config.configuration.rag.tool = ["rag-tool-id-1", "rag-tool-id-2"] mock_config.configuration.rag.inline = [] mocker.patch("utils.responses.configuration", mock_config) @@ -1754,9 +1752,7 @@ async def test_rag_tool_config_ids_are_translated( mock_byok_rag.rag_id = "ocp_docs" mock_byok_rag.vector_db_id = "vs-001" mock_config = mocker.Mock() - mock_config.configuration.byok_rag = _byok_rag_section_mock( - mocker, [mock_byok_rag] - ) + mock_config.configuration.byok_rag = _byok_rag_list_mock([mock_byok_rag]) mock_config.configuration.rag.tool = ["ocp_docs"] mock_config.configuration.rag.inline = [] mocker.patch("utils.responses.configuration", mock_config) @@ -1774,7 +1770,7 @@ async def test_inline_rag_disables_tool_rag(self, mocker: MockerFixture) -> None mocker.patch("utils.responses.get_mcp_tools", return_value=None) mock_config = mocker.Mock() - mock_config.configuration.byok_rag = _byok_rag_section_mock(mocker, []) + mock_config.configuration.byok_rag = _byok_rag_list_mock([]) mock_config.configuration.rag.tool = [] mock_config.configuration.rag.inline = [ "inline-store-id" @@ -1796,7 +1792,7 @@ async def test_per_request_ids_override_rag_tool_config( mocker.patch("utils.responses.get_mcp_tools", return_value=None) mock_config = mocker.Mock() - mock_config.configuration.byok_rag = _byok_rag_section_mock(mocker, []) + mock_config.configuration.byok_rag = _byok_rag_list_mock([]) mock_config.configuration.rag.tool = ["config-id-1"] mock_config.configuration.rag.inline = [] mocker.patch("utils.responses.configuration", mock_config) @@ -1821,7 +1817,7 @@ async def test_all_registered_dbs_used_when_neither_tool_nor_inline_configured( mocker.patch("utils.responses.get_mcp_tools", return_value=None) mock_config = mocker.Mock() - mock_config.configuration.byok_rag = _byok_rag_section_mock(mocker, []) + mock_config.configuration.byok_rag = _byok_rag_list_mock([]) mock_config.configuration.rag.tool = [] mock_config.configuration.rag.inline = [] mocker.patch("utils.responses.configuration", mock_config) @@ -3369,7 +3365,7 @@ async def test_client_tools_without_merge_header( return_value=mock_holder, ) mock_config = mocker.Mock() - mock_config.configuration.byok_rag = _byok_rag_section_mock(mocker, []) + mock_config.configuration.byok_rag = _byok_rag_list_mock([]) mock_config.mcp_servers = [] mocker.patch("utils.responses.configuration", mock_config) @@ -3394,7 +3390,7 @@ async def test_client_tools_with_merge_header(self, mocker: MockerFixture) -> No return_value=mock_holder, ) mock_config = mocker.Mock() - mock_config.configuration.byok_rag = _byok_rag_section_mock(mocker, []) + mock_config.configuration.byok_rag = _byok_rag_list_mock([]) mock_config.mcp_servers = [] mocker.patch("utils.responses.configuration", mock_config) @@ -3431,7 +3427,7 @@ async def test_merge_header_conflict_raises_409( return_value=mock_holder, ) mock_config = mocker.Mock() - mock_config.configuration.byok_rag = _byok_rag_section_mock(mocker, []) + mock_config.configuration.byok_rag = _byok_rag_list_mock([]) mock_config.mcp_servers = [] mocker.patch("utils.responses.configuration", mock_config) @@ -3490,7 +3486,7 @@ async def test_merge_header_no_server_tools_returns_client_only( return_value=mock_holder, ) mock_config = mocker.Mock() - mock_config.configuration.byok_rag = _byok_rag_section_mock(mocker, []) + mock_config.configuration.byok_rag = _byok_rag_list_mock([]) mock_config.mcp_servers = [] mocker.patch("utils.responses.configuration", mock_config) mocker.patch( diff --git a/tests/unit/utils/test_vector_search.py b/tests/unit/utils/test_vector_search.py index 7e2752dfd..47dddbdd4 100644 --- a/tests/unit/utils/test_vector_search.py +++ b/tests/unit/utils/test_vector_search.py @@ -25,12 +25,14 @@ ) -def _byok_rag_section_mock(mocker: MockerFixture, entries: list[Any]) -> Any: - """Build a mock ``byok_rag`` section with ``entries`` and cutoff.""" - section = mocker.Mock() - section.entries = entries - section.relevance_cutoff_score = 0.0 - return section +def _byok_rag_list_mock(_mocker: MockerFixture, entries: list[Any]) -> list[Any]: + """Normalize mocked BYOK entries for ``configuration.byok_rag``.""" + for entry in entries: + if not isinstance(getattr(entry, "relevance_cutoff_score", None), (int, float)): + entry.relevance_cutoff_score = ( + constants.DEFAULT_BYOK_RAG_RELEVANCE_CUTOFF_SCORE + ) + return entries class TestIsSolrEnabled: @@ -424,7 +426,7 @@ async def test_byok_no_inline_ids(self, mocker: MockerFixture) -> None: """Test when no inline BYOK sources are configured.""" config_mock = mocker.Mock(spec=AppConfig) config_mock.configuration.rag.inline = [] - config_mock.configuration.byok_rag = _byok_rag_section_mock(mocker, []) + config_mock.configuration.byok_rag = _byok_rag_list_mock(mocker, []) mocker.patch("utils.vector_search.configuration", config_mock) client_mock = mocker.AsyncMock() @@ -443,10 +445,11 @@ async def test_byok_enabled_success(self, mocker: MockerFixture) -> None: byok_rag_mock.rag_id = "rag_1" byok_rag_mock.vector_db_id = "vs_1" config_mock.configuration.rag.inline = ["rag_1"] - config_mock.configuration.byok_rag = _byok_rag_section_mock( + config_mock.configuration.byok_rag = _byok_rag_list_mock( mocker, [byok_rag_mock] ) config_mock.score_multiplier_mapping = {"vs_1": 1.5} + config_mock.relevance_cutoff_mapping = {"vs_1": 0.0} config_mock.rag_id_mapping = {"vs_1": "rag_1"} mocker.patch("utils.vector_search.configuration", config_mock) @@ -483,11 +486,12 @@ async def test_user_facing_ids_translated_to_internal_ids( byok_rag_mock = mocker.Mock() byok_rag_mock.rag_id = "my-kb" byok_rag_mock.vector_db_id = "vs-internal-001" - config_mock.configuration.byok_rag = _byok_rag_section_mock( + config_mock.configuration.byok_rag = _byok_rag_list_mock( mocker, [byok_rag_mock] ) config_mock.configuration.rag.inline = ["my-kb"] config_mock.score_multiplier_mapping = {"vs-internal-001": 1.0} + config_mock.relevance_cutoff_mapping = {"vs-internal-001": 0.0} config_mock.rag_id_mapping = {"vs-internal-001": "my-kb"} mocker.patch("utils.vector_search.configuration", config_mock) @@ -525,11 +529,12 @@ async def test_multiple_user_facing_ids_each_translated( byok_rag_2 = mocker.Mock() byok_rag_2.rag_id = "kb-part2" byok_rag_2.vector_db_id = "vs-bbb-222" - config_mock.configuration.byok_rag = _byok_rag_section_mock( + config_mock.configuration.byok_rag = _byok_rag_list_mock( mocker, [byok_rag_1, byok_rag_2] ) config_mock.configuration.rag.inline = ["kb-part1", "kb-part2"] config_mock.score_multiplier_mapping = {"vs-aaa-111": 1.0, "vs-bbb-222": 1.0} + config_mock.relevance_cutoff_mapping = {"vs-aaa-111": 0.0, "vs-bbb-222": 0.0} config_mock.rag_id_mapping = { "vs-aaa-111": "kb-part1", "vs-bbb-222": "kb-part2", @@ -570,7 +575,7 @@ async def test_no_inline_rag_configured_skips_byok( """Test that BYOK inline RAG is skipped when rag.inline is empty.""" config_mock = mocker.Mock(spec=AppConfig) config_mock.configuration.rag.inline = [] - config_mock.configuration.byok_rag = _byok_rag_section_mock(mocker, []) + config_mock.configuration.byok_rag = _byok_rag_list_mock(mocker, []) mocker.patch("utils.vector_search.configuration", config_mock) client_mock = mocker.AsyncMock() @@ -590,7 +595,7 @@ async def test_request_id_not_in_inline_config_skips_byok( """Test that a request vector_store_id not registered in rag.inline is filtered out.""" config_mock = mocker.Mock(spec=AppConfig) config_mock.configuration.rag.inline = ["registered-id"] - config_mock.configuration.byok_rag = _byok_rag_section_mock(mocker, []) + config_mock.configuration.byok_rag = _byok_rag_list_mock(mocker, []) mocker.patch("utils.vector_search.configuration", config_mock) client_mock = mocker.AsyncMock() @@ -612,12 +617,13 @@ async def test_byok_relevance_cutoff_drops_low_raw_scores( byok_rag_mock = mocker.Mock() byok_rag_mock.rag_id = "rag_1" byok_rag_mock.vector_db_id = "vs_1" - section = mocker.Mock() - section.entries = [byok_rag_mock] - section.relevance_cutoff_score = 0.5 + byok_rag_mock.relevance_cutoff_score = 0.5 config_mock.configuration.rag.inline = ["rag_1"] - config_mock.configuration.byok_rag = section + config_mock.configuration.byok_rag = _byok_rag_list_mock( + mocker, [byok_rag_mock] + ) config_mock.score_multiplier_mapping = {"vs_1": 1.0} + config_mock.relevance_cutoff_mapping = {"vs_1": 0.5} config_mock.rag_id_mapping = {"vs_1": "rag_1"} mocker.patch("utils.vector_search.configuration", config_mock) @@ -700,7 +706,7 @@ async def test_both_sources_disabled(self, mocker: MockerFixture) -> None: """Test when both BYOK inline and Solr inline are not configured.""" config_mock = mocker.Mock(spec=AppConfig) config_mock.configuration.rag.inline = [] - config_mock.configuration.byok_rag = _byok_rag_section_mock(mocker, []) + config_mock.configuration.byok_rag = _byok_rag_list_mock(mocker, []) config_mock.inline_solr_enabled = False mocker.patch("utils.vector_search.configuration", config_mock) @@ -720,11 +726,12 @@ async def test_byok_enabled_only(self, mocker: MockerFixture) -> None: byok_rag_mock.rag_id = "rag_1" byok_rag_mock.vector_db_id = "vs_1" config_mock.configuration.rag.inline = ["rag_1"] - config_mock.configuration.byok_rag = _byok_rag_section_mock( + config_mock.configuration.byok_rag = _byok_rag_list_mock( mocker, [byok_rag_mock] ) config_mock.inline_solr_enabled = False config_mock.score_multiplier_mapping = {"vs_1": 1.0} + config_mock.relevance_cutoff_mapping = {"vs_1": 0.0} config_mock.rag_id_mapping = {"vs_1": "rag_1"} mocker.patch("utils.vector_search.configuration", config_mock) From 579c17e3712d151b8a21b011873cebda65de46c1 Mon Sep 17 00:00:00 2001 From: Sergey Yedrikov Date: Fri, 17 Apr 2026 21:20:24 -0400 Subject: [PATCH 3/3] Cutoff score passed as score_threshold --- src/utils/responses.py | 92 +++++++++-- src/utils/vector_search.py | 41 ++--- .../endpoints/test_query_byok_integration.py | 13 +- tests/unit/utils/test_responses.py | 156 ++++++++++++++++-- tests/unit/utils/test_vector_search.py | 33 ++-- 5 files changed, 264 insertions(+), 71 deletions(-) diff --git a/src/utils/responses.py b/src/utils/responses.py index 3fa10b8e2..bc0d8570e 100644 --- a/src/utils/responses.py +++ b/src/utils/responses.py @@ -7,7 +7,7 @@ from typing import Any, Optional, cast from fastapi import HTTPException -from llama_stack_api import OpenAIResponseObject +from llama_stack_api import OpenAIResponseObject, SearchRankingOptions from llama_stack_api.openai_responses import ( OpenAIResponseContentPartRefusal as ContentPartRefusal, ) @@ -251,7 +251,7 @@ async def prepare_tools( # pylint: disable=too-many-arguments,too-many-position effective_ids = await get_vector_store_ids(client, None) # Add RAG tools if vector stores are available - rag_tools = get_rag_tools(effective_ids) + rag_tools = get_rag_tools(effective_ids, byok_rags) if rag_tools: toolgroups.extend(rag_tools) @@ -639,11 +639,52 @@ def resolve_vector_store_ids( return [rag_id_to_vector_db_id.get(vs_id, vs_id) for vs_id in vector_store_ids] +def file_search_score_threshold_for_store( + vector_store_id: str, byok_rags: list[ByokRag] +) -> float: + """Return the configured relevance cutoff for one resolved vector store (tool RAG). + + Parameters: + ---------- + vector_store_id: Llama-stack vector store ID (e.g. BYOK ``vector_db_id``). + byok_rags: BYOK configuration entries. + + Returns: + ------- + Solr/OKP default for the Solr store ID; ``relevance_cutoff_score`` for + configured BYOK stores; otherwise ``DEFAULT_BYOK_RAG_RELEVANCE_CUTOFF_SCORE``. + """ + vector_db_to_cutoff = { + br.vector_db_id: br.relevance_cutoff_score for br in byok_rags + } + if vector_store_id == constants.SOLR_DEFAULT_VECTOR_STORE_ID: + return constants.SOLR_VECTOR_SEARCH_DEFAULT_SCORE_THRESHOLD + if vector_store_id in vector_db_to_cutoff: + return vector_db_to_cutoff[vector_store_id] + return constants.DEFAULT_BYOK_RAG_RELEVANCE_CUTOFF_SCORE + + +def file_search_ranking_options_for_store( + vector_store_id: str, byok_rags: list[ByokRag] +) -> SearchRankingOptions: + """Build ``SearchRankingOptions`` with ``score_threshold`` for a single store.""" + return SearchRankingOptions( + score_threshold=file_search_score_threshold_for_store( + vector_store_id, byok_rags + ) + ) + + def translate_tools_vector_store_ids( tools: list[InputTool], byok_rags: list[ByokRag] ) -> list[InputTool]: """Translate user-facing vector_store_ids to llama-stack IDs in each file_search tool. + Each resolved store becomes its own ``file_search`` tool with that store's + ``ranking_options.score_threshold``. If the client omits ``ranking_options``, + the store cutoff is used; if present, the effective threshold is + ``max(client score_threshold, store_cutoff)``. + Parameters: ---------- tools: List of request tools (may contain file_search with user-facing IDs). @@ -651,27 +692,56 @@ def translate_tools_vector_store_ids( Returns: ------- - New list of tools with file_search vector_store_ids translated; other tools + New list of tools with file_search entries expanded per store; other tools unchanged. """ result: list[InputTool] = [] for tool in tools: - if tool.type == "file_search": - resolved_ids = resolve_vector_store_ids(tool.vector_store_ids, byok_rags) - result.append(tool.model_copy(update={"vector_store_ids": resolved_ids})) - else: + if tool.type != "file_search": result.append(tool) + continue + + file_tool = cast(InputToolFileSearch, tool) + resolved_ids = resolve_vector_store_ids(file_tool.vector_store_ids, byok_rags) + + for vs_id in resolved_ids: + store_cutoff = file_search_score_threshold_for_store(vs_id, byok_rags) + if file_tool.ranking_options is None: + ranking = file_search_ranking_options_for_store(vs_id, byok_rags) + else: + client_ro = file_tool.ranking_options + client_st = client_ro.score_threshold + client_val = 0.0 if client_st is None else float(client_st) + ranking = client_ro.model_copy( + update={"score_threshold": max(client_val, store_cutoff)} + ) + result.append( + InputToolFileSearch( + type="file_search", + vector_store_ids=[vs_id], + max_num_results=file_tool.max_num_results, + filters=file_tool.filters, + ranking_options=ranking, + ) + ) + return result -def get_rag_tools(vector_store_ids: list[str]) -> Optional[list[InputToolFileSearch]]: +def get_rag_tools( + vector_store_ids: list[str], byok_rags: list[ByokRag] +) -> Optional[list[InputToolFileSearch]]: """Convert vector store IDs to tools format for Responses API. + Emits one ``file_search`` tool per store so each has its own + ``ranking_options.score_threshold``. + Args: vector_store_ids: List of vector store identifiers + byok_rags: BYOK RAG entries used to set ``ranking_options.score_threshold`` Returns: - List containing file_search tool configuration, or empty list if no stores available + List containing file_search tool configurations, or empty list if no stores available """ if vector_store_ids == []: return [] @@ -679,9 +749,11 @@ def get_rag_tools(vector_store_ids: list[str]) -> Optional[list[InputToolFileSea return [ InputToolFileSearch( type="file_search", - vector_store_ids=vector_store_ids, + vector_store_ids=[vs_id], max_num_results=constants.TOOL_RAG_MAX_CHUNKS, + ranking_options=file_search_ranking_options_for_store(vs_id, byok_rags), ) + for vs_id in vector_store_ids ] diff --git a/src/utils/vector_search.py b/src/utils/vector_search.py index 4d3007e2a..eec1c5ea0 100644 --- a/src/utils/vector_search.py +++ b/src/utils/vector_search.py @@ -76,19 +76,17 @@ def _extract_byok_rag_chunks( search_response: Any, vector_store_id: str, weight: float, - byok_raw_cutoff_score: float, ) -> list[dict[str, Any]]: """Extract and weight result chunks from vector search for BYOK RAG only. - Chunks whose raw retrieval score is below ``byok_raw_cutoff_score`` are - dropped before applying the per-store weight. This is not used for OKP/Solr. + Raw-score filtering uses ``score_threshold`` in :meth:`vector_io.query` + params (see :func:`_query_store_for_byok_rag`); this only maps chunks to + weighted result dicts. Not used for OKP/Solr. Args: search_response: Response from vector_io.query vector_store_id: ID of the BYOK vector store that produced these results weight: Score multiplier to apply to this store's remaining results - byok_raw_cutoff_score: Minimum raw similarity score for this store - (``ByokRag.relevance_cutoff_score``). Returns: List of result dictionaries with weighted scores @@ -97,14 +95,6 @@ def _extract_byok_rag_chunks( for chunk, score in zip( search_response.chunks, search_response.scores, strict=True ): - if score < byok_raw_cutoff_score: - logger.debug( - " [%s] BYOK: dropping chunk: raw score=%.4f < cutoff=%.4f", - vector_store_id, - score, - byok_raw_cutoff_score, - ) - continue weighted_score = score * weight doc_id = ( chunk.metadata.get("document_id", chunk.chunk_id) @@ -189,7 +179,8 @@ async def _query_store_for_byok_rag( vector_store_id: ID of the BYOK vector store to query query: Search query string weight: Score multiplier to apply - byok_raw_cutoff_score: Minimum raw score before weighting (BYOK config only) + byok_raw_cutoff_score: Passed as ``score_threshold`` in ``vector_io.query`` + params (``ByokRag.relevance_cutoff_score`` / mapping default). Returns: List of weighted result dictionaries, or empty list on error @@ -201,11 +192,10 @@ async def _query_store_for_byok_rag( params={ "max_chunks": constants.BYOK_RAG_MAX_CHUNKS, "mode": "vector", + "score_threshold": byok_raw_cutoff_score, }, ) - return _extract_byok_rag_chunks( - search_response, vector_store_id, weight, byok_raw_cutoff_score - ) + return _extract_byok_rag_chunks(search_response, vector_store_id, weight) except Exception as e: # pylint: disable=broad-exception-caught logger.warning("Failed to search '%s': %s", vector_store_id, e) return [] @@ -356,8 +346,9 @@ async def _fetch_byok_rag( # pylint: disable=too-many-locals """Fetch chunks and documents from BYOK RAG sources only. Applies each entry's ``relevance_cutoff_score`` (via ``relevance_cutoff_mapping``) - to raw scores from BYOK vector stores. OKP/Solr inline RAG is implemented in - ``_fetch_solr_rag`` and does not use that setting. + as ``score_threshold`` on ``vector_io.query`` for each BYOK store. OKP/Solr + inline RAG is implemented in ``_fetch_solr_rag`` and does not use that + setting. Args: client: The AsyncLlamaStackClient to use for the request @@ -418,10 +409,7 @@ async def _fetch_byok_rag( # pylint: disable=too-many-locals vector_store_id, query, score_multiplier_mapping.get(vector_store_id, 1.0), - relevance_cutoff_mapping.get( - vector_store_id, - constants.DEFAULT_BYOK_RAG_RELEVANCE_CUTOFF_SCORE, - ), + relevance_cutoff_mapping[vector_store_id], ) for vector_store_id in vector_store_ids_to_query ] @@ -548,9 +536,10 @@ async def build_rag_context( ) -> RAGContext: """Build RAG context by fetching and merging chunks from all enabled sources. - Enabled sources can be BYOK and/or Solr OKP. BYOK raw-score cutoffs come from - each ``ByokRag.relevance_cutoff_score``; OKP uses ``_build_query_params`` (e.g. - Solr ``score_threshold``), not the BYOK cutoffs. + Enabled sources can be BYOK and/or Solr OKP. BYOK uses ``score_threshold`` in + ``vector_io.query`` from each ``ByokRag.relevance_cutoff_score``; OKP uses + ``_build_query_params`` (e.g. Solr ``score_threshold``), not the BYOK + per-store config keys. Args: client: The AsyncLlamaStackClient to use for the request diff --git a/tests/integration/endpoints/test_query_byok_integration.py b/tests/integration/endpoints/test_query_byok_integration.py index 93ea0271d..5abf78e55 100644 --- a/tests/integration/endpoints/test_query_byok_integration.py +++ b/tests/integration/endpoints/test_query_byok_integration.py @@ -971,7 +971,18 @@ async def test_query_byok_cutoff_applied_before_multiplier( ("Content below raw cutoff", "doc-below", 0.4), ], ) - mock_client.vector_io.query = mocker.AsyncMock(return_value=low_score_resp) + + async def _vector_io_query_side_effect(*_args: Any, **kwargs: Any) -> Any: + params = kwargs.get("params") or {} + raw_st = params.get("score_threshold", 0.0) + score_threshold = float(raw_st) if raw_st is not None else 0.0 + if score_threshold <= 0.4: + return low_score_resp + return _make_vector_io_response(mocker, []) + + mock_client.vector_io.query = mocker.AsyncMock( + side_effect=_vector_io_query_side_effect + ) mock_vs_resp = mocker.MagicMock() mock_vs_resp.data = [] diff --git a/tests/unit/utils/test_responses.py b/tests/unit/utils/test_responses.py index add19be23..e860c6cf7 100644 --- a/tests/unit/utils/test_responses.py +++ b/tests/unit/utils/test_responses.py @@ -8,6 +8,7 @@ import pytest from fastapi import HTTPException +from llama_stack_api import SearchRankingOptions from llama_stack_api.openai_responses import ( AllowedToolsFilter, OpenAIResponseInputToolChoiceAllowedTools, @@ -87,6 +88,7 @@ resolve_client_tool_choice, resolve_tool_choice, resolve_vector_store_ids, + translate_tools_vector_store_ids, ) @@ -380,16 +382,49 @@ class TestGetRAGTools: def test_get_rag_tools_empty_list(self) -> None: """Test get_rag_tools returns empty list for empty vector store IDs.""" - assert not get_rag_tools([]) + assert not get_rag_tools([], []) def test_get_rag_tools_with_vector_stores(self) -> None: - """Test get_rag_tools returns correct tool format for vector stores.""" - tools = get_rag_tools(["db1", "db2"]) + """Test get_rag_tools returns one file_search tool per vector store.""" + tools = get_rag_tools(["db1", "db2"], []) assert isinstance(tools, list) - assert len(tools) == 1 - assert tools[0].type == "file_search" - assert tools[0].vector_store_ids == ["db1", "db2"] + assert len(tools) == 2 + assert all(t.type == "file_search" for t in tools) + assert tools[0].vector_store_ids == ["db1"] + assert tools[1].vector_store_ids == ["db2"] assert tools[0].max_num_results == 10 + assert tools[0].ranking_options is not None + assert tools[0].ranking_options.score_threshold == ( + constants.DEFAULT_BYOK_RAG_RELEVANCE_CUTOFF_SCORE + ) + assert tools[1].ranking_options is not None + assert tools[1].ranking_options.score_threshold == ( + constants.DEFAULT_BYOK_RAG_RELEVANCE_CUTOFF_SCORE + ) + + def test_get_rag_tools_per_store_cutoff(self) -> None: + """Each tool uses its store's relevance_cutoff_score as score_threshold.""" + byok_rags = [ + ByokRag( + rag_id="a", + vector_db_id="vs-low", + db_path="tests/configuration/rag.txt", + relevance_cutoff_score=0.2, + ), + ByokRag( + rag_id="b", + vector_db_id="vs-high", + db_path="tests/configuration/rag.txt", + relevance_cutoff_score=0.55, + ), + ] + tools = get_rag_tools(["vs-low", "vs-high"], byok_rags) + assert tools is not None + assert len(tools) == 2 + assert tools[0].ranking_options is not None + assert tools[0].ranking_options.score_threshold == 0.2 + assert tools[1].ranking_options is not None + assert tools[1].ranking_options.score_threshold == 0.55 class TestGetMCPTools: @@ -1487,9 +1522,10 @@ async def test_prepare_tools_with_vector_store_ids( result = await prepare_tools(mock_client, ["vs1", "vs2"], False, "token") assert result is not None - assert len(result) == 1 - assert result[0].type == "file_search" - assert result[0].vector_store_ids == ["vs1", "vs2"] + assert len(result) == 2 + assert all(t.type == "file_search" for t in result) + assert result[0].vector_store_ids == ["vs1"] + assert result[1].vector_store_ids == ["vs2"] @pytest.mark.asyncio async def test_prepare_tools_fetch_vector_stores( @@ -1510,8 +1546,9 @@ async def test_prepare_tools_fetch_vector_stores( result = await prepare_tools(mock_client, None, False, "token") assert result is not None - assert len(result) == 1 - assert result[0].vector_store_ids == ["vs1", "vs2"] + assert len(result) == 2 + assert result[0].vector_store_ids == ["vs1"] + assert result[1].vector_store_ids == ["vs2"] @pytest.mark.asyncio async def test_prepare_tools_connection_error(self, mocker: MockerFixture) -> None: @@ -1637,6 +1674,92 @@ def test_preserves_order(self) -> None: assert result == ["vs-a", "vs-b"] +class TestTranslateToolsVectorStoreIds: + """Tests for translate_tools_vector_store_ids.""" + + def test_sets_ranking_when_omitted(self) -> None: + """file_search tools get ranking_options from BYOK cutoffs when unset.""" + byok_rags = [ + ByokRag( + rag_id="x", + vector_db_id="vs1", + db_path="tests/configuration/rag.txt", + relevance_cutoff_score=0.4, + ) + ] + tools_in: list[InputTool] = [ + InputToolFileSearch(vector_store_ids=["x"]), + ] + out = translate_tools_vector_store_ids(tools_in, byok_rags) + fs = cast(InputToolFileSearch, out[0]) + assert fs.vector_store_ids == ["vs1"] + assert fs.ranking_options is not None + assert fs.ranking_options.score_threshold == 0.4 + + def test_preserves_client_ranking_options(self) -> None: + """Client score_threshold above store cutoff is kept (floor does not lower it).""" + byok_rags = [ + ByokRag( + rag_id="x", + vector_db_id="vs1", + db_path="tests/configuration/rag.txt", + ) + ] + custom = SearchRankingOptions(score_threshold=0.88) + tools_in: list[InputTool] = [ + InputToolFileSearch(vector_store_ids=["x"], ranking_options=custom), + ] + out = translate_tools_vector_store_ids(tools_in, byok_rags) + fs = cast(InputToolFileSearch, out[0]) + assert fs.ranking_options is not None + assert fs.ranking_options.score_threshold == 0.88 + + def test_client_ranking_floor_raises_below_store_cutoff(self) -> None: + """Effective score_threshold is max(client, store_cutoff).""" + byok_rags = [ + ByokRag( + rag_id="x", + vector_db_id="vs1", + db_path="tests/configuration/rag.txt", + relevance_cutoff_score=0.5, + ) + ] + custom = SearchRankingOptions(score_threshold=0.1) + tools_in: list[InputTool] = [ + InputToolFileSearch(vector_store_ids=["x"], ranking_options=custom), + ] + out = translate_tools_vector_store_ids(tools_in, byok_rags) + fs = cast(InputToolFileSearch, out[0]) + assert fs.ranking_options is not None + assert fs.ranking_options.score_threshold == 0.5 + + def test_splits_multiple_stores_into_separate_tools(self) -> None: + """One client file_search with multiple stores becomes one tool per store.""" + byok_rags = [ + ByokRag( + rag_id="a", + vector_db_id="vs-a", + db_path="tests/configuration/rag.txt", + relevance_cutoff_score=0.2, + ), + ByokRag( + rag_id="b", + vector_db_id="vs-b", + db_path="tests/configuration/rag.txt", + relevance_cutoff_score=0.6, + ), + ] + tools_in: list[InputTool] = [ + InputToolFileSearch(vector_store_ids=["a", "b"]), + ] + out = translate_tools_vector_store_ids(tools_in, byok_rags) + assert len(out) == 2 + assert cast(InputToolFileSearch, out[0]).vector_store_ids == ["vs-a"] + assert cast(InputToolFileSearch, out[0]).ranking_options.score_threshold == 0.2 + assert cast(InputToolFileSearch, out[1]).vector_store_ids == ["vs-b"] + assert cast(InputToolFileSearch, out[1]).ranking_options.score_threshold == 0.6 + + class TestPrepareToolsTranslatesVectorStoreIds: """Tests that prepare_tools translates BYOK IDs before building RAG tools.""" @@ -1735,9 +1858,10 @@ async def test_uses_rag_tool_config_when_no_per_request_ids( result = await prepare_tools(mock_client, None, False, "token") assert result is not None - assert len(result) == 1 - assert result[0].type == "file_search" - assert result[0].vector_store_ids == ["rag-tool-id-1", "rag-tool-id-2"] + assert len(result) == 2 + assert all(t.type == "file_search" for t in result) + assert result[0].vector_store_ids == ["rag-tool-id-1"] + assert result[1].vector_store_ids == ["rag-tool-id-2"] mock_client.vector_stores.list.assert_not_called() @pytest.mark.asyncio @@ -3173,11 +3297,11 @@ class TestGetRAGToolsWithConfig: def test_returns_empty_when_no_vector_store_ids(self) -> None: """Test get_rag_tools returns empty list when no vector store IDs are provided.""" # pylint: disable-next=use-implicit-booleaness-not-comparison - assert get_rag_tools([]) == [] + assert get_rag_tools([], []) == [] def test_returns_tools_when_stores_provided(self) -> None: """Test get_rag_tools returns tools when vector store IDs are provided.""" - tools = get_rag_tools(["vs1"]) + tools = get_rag_tools(["vs1"], []) assert tools is not None assert tools[0].type == constants.DEFAULT_RAG_TOOL assert tools[0].vector_store_ids == ["vs1"] diff --git a/tests/unit/utils/test_vector_search.py b/tests/unit/utils/test_vector_search.py index 47dddbdd4..5dee8495c 100644 --- a/tests/unit/utils/test_vector_search.py +++ b/tests/unit/utils/test_vector_search.py @@ -112,7 +112,6 @@ def test_extract_chunks_with_metadata(self, mocker: MockerFixture) -> None: search_response, vector_store_id="test_store", weight=1.5, - byok_raw_cutoff_score=0.0, ) assert len(result) == 2 @@ -137,15 +136,14 @@ def test_extract_chunks_without_metadata(self, mocker: MockerFixture) -> None: search_response, vector_store_id="test_store", weight=1.0, - byok_raw_cutoff_score=0.0, ) assert len(result) == 1 assert result[0]["doc_id"] == "chunk_id" assert result[0]["metadata"] == {} - def test_extract_skips_chunks_below_raw_cutoff(self, mocker: MockerFixture) -> None: - """Raw scores below the BYOK cutoff are not included.""" + def test_extract_maps_all_chunks_from_response(self, mocker: MockerFixture) -> None: + """Extraction maps every chunk returned by vector_io (server applies score_threshold).""" chunk_hi = mocker.Mock() chunk_hi.content = "Hi" chunk_hi.chunk_id = "a" @@ -163,12 +161,10 @@ def test_extract_skips_chunks_below_raw_cutoff(self, mocker: MockerFixture) -> N search_response, vector_store_id="vs", weight=0.5, - byok_raw_cutoff_score=0.5, ) - assert len(result) == 1 - assert result[0]["content"] == "Hi" - assert result[0]["score"] == 0.9 + assert len(result) == 2 assert result[0]["weighted_score"] == 0.45 + assert result[1]["weighted_score"] == 0.1 class TestFormatRagContext: @@ -514,7 +510,11 @@ async def test_user_facing_ids_translated_to_internal_ids( client_mock.vector_io.query.assert_called_once_with( vector_store_id="vs-internal-001", query="test query", - params={"max_chunks": constants.BYOK_RAG_MAX_CHUNKS, "mode": "vector"}, + params={ + "max_chunks": constants.BYOK_RAG_MAX_CHUNKS, + "mode": "vector", + "score_threshold": 0.0, + }, ) @pytest.mark.asyncio @@ -609,10 +609,10 @@ async def test_request_id_not_in_inline_config_skips_byok( client_mock.vector_io.query.assert_not_called() @pytest.mark.asyncio - async def test_byok_relevance_cutoff_drops_low_raw_scores( + async def test_byok_relevance_cutoff_passed_as_score_threshold( self, mocker: MockerFixture ) -> None: - """Chunks with raw retrieval score below BYOK relevance_cutoff_score are excluded.""" + """vector_io.query receives score_threshold from BYOK relevance_cutoff_mapping.""" config_mock = mocker.Mock(spec=AppConfig) byok_rag_mock = mocker.Mock() byok_rag_mock.rag_id = "rag_1" @@ -632,14 +632,9 @@ async def test_byok_relevance_cutoff_drops_low_raw_scores( chunk_hi.chunk_id = "c1" chunk_hi.metadata = {} - chunk_lo = mocker.Mock() - chunk_lo.content = "Drop" - chunk_lo.chunk_id = "c2" - chunk_lo.metadata = {} - search_response = mocker.Mock() - search_response.chunks = [chunk_hi, chunk_lo] - search_response.scores = [0.9, 0.3] + search_response.chunks = [chunk_hi] + search_response.scores = [0.9] client_mock = mocker.AsyncMock() client_mock.vector_io.query.return_value = search_response @@ -647,6 +642,8 @@ async def test_byok_relevance_cutoff_drops_low_raw_scores( rag_chunks, _referenced_docs = await _fetch_byok_rag(client_mock, "q") assert len(rag_chunks) == 1 assert rag_chunks[0].content == "Keep" + call_kwargs = client_mock.vector_io.query.call_args.kwargs + assert call_kwargs["params"]["score_threshold"] == 0.5 class TestFetchSolrRag: