From 7f34e7bf38ba423d4c4d34acee0ee0cd5de86263 Mon Sep 17 00:00:00 2001
From: Sergey Yedrikov <syedriko@redhat.com>
Date: Thu, 16 Apr 2026 22:41:42 -0400
Subject: [PATCH 1/3] LCORE-1853: Add relevance cutoff score to BYOK RAG

---
 docs/byok_guide.md                            |  32 +++--
 src/app/endpoints/rags.py                     |   2 +-
 src/client.py                                 |   2 +-
 src/configuration.py                          |   5 +-
 src/constants.py                              |   3 +
 src/llama_stack_configuration.py              |  25 +++-
 src/models/config.py                          |  61 +++++++++-
 src/utils/responses.py                        |   6 +-
 src/utils/vector_search.py                    |  51 +++++---
 .../endpoints/test_query_byok_integration.py  | 100 ++++++++++++++--
 .../test_streaming_query_byok_integration.py  |  47 ++++++--
 tests/unit/app/endpoints/test_rags.py         |   4 +-
 tests/unit/models/config/test_byok_rag.py     |  90 +++++++++++++-
 .../models/config/test_dump_configuration.py  |  46 +++++---
 tests/unit/telemetry/conftest.py              |   5 +-
 tests/unit/test_llama_stack_configuration.py  |  41 +++++++
 tests/unit/utils/test_responses.py            |  38 ++++--
 tests/unit/utils/test_vector_search.py        | 110 ++++++++++++++++--
 18 files changed, 569 insertions(+), 99 deletions(-)

diff --git a/docs/byok_guide.md b/docs/byok_guide.md
index 5213a8d15..b98a84139 100644
--- a/docs/byok_guide.md
+++ b/docs/byok_guide.md
@@ -79,10 +79,11 @@ Both modes rely on:
 
 Inline RAG additionally supports:
 - **Score Multiplier**: Optional weight applied per BYOK vector store when mixing multiple sources. Allows custom prioritization of content.
+- **Relevance cutoff (`relevance_cutoff_score`)**: Optional minimum **raw** similarity score from each BYOK vector store during **Inline RAG**. Chunks below the cutoff are dropped **before** `score_multiplier` is applied. It applies only to BYOK stores listed under `byok_rag`; it does not affect OKP/Solr inline RAG (which uses separate query defaults) and is not used for Tool RAG (`file_search`). The default matches `DEFAULT_BYOK_RAG_RELEVANCE_CUTOFF_SCORE` in `src/constants.py` (currently `0.3`). Set to `0.0` to disable filtering for BYOK inline retrieval.
 
 > [!NOTE]
 > OKP and BYOK scores are not directly comparable (different scoring systems), so
-> `score_multiplier` does not apply to OKP results. To control the amount of retrieved
+> `score_multiplier` and `relevance_cutoff_score` do not apply to OKP results. To control the amount of retrieved
 > context, set the `BYOK_RAG_MAX_CHUNKS` and `OKP_RAG_MAX_CHUNKS` constants in `src/constants.py`
 > (defaults: 10 and 5 respectively). For Tool RAG, use `TOOL_RAG_MAX_CHUNKS` (default: 10).
 
@@ -280,19 +281,26 @@ registered_resources:
 > section of `lightspeed-stack.yaml`. The lightspeed-stack service automatically generates the required configuration
 > at startup.
 >
+> Preferred shape: an object with `entries` (list of stores) and optional `relevance_cutoff_score`:
+>
 > ```yaml
 > byok_rag:
->   - rag_id: my-docs           # Unique identifier for this knowledge source
->     rag_type: inline::faiss
->     embedding_model: sentence-transformers/all-mpnet-base-v2
->     embedding_dimension: 768
->     vector_db_id: your-index-id  # Llama Stack vector store ID (from index generation)
->     db_path: /path/to/vector_db/faiss_store.db
->     score_multiplier: 1.0       # Optional: weight results when mixing multiple sources
+>   relevance_cutoff_score: 0.3   # Optional; min raw score per BYOK store before score_multiplier (BYOK only)
+>   entries:
+>     - rag_id: my-docs           # Unique identifier for this knowledge source
+>       rag_type: inline::faiss
+>       embedding_model: sentence-transformers/all-mpnet-base-v2
+>       embedding_dimension: 768
+>       vector_db_id: your-index-id  # Llama Stack vector store ID (from index generation)
+>       db_path: /path/to/vector_db/faiss_store.db
+>       score_multiplier: 1.0       # Optional: weight results when mixing multiple sources
 > ```
 >
+> Legacy: a bare list is still accepted and is treated as `entries` (same fields as each list item above).
+>
 > When multiple BYOK sources are configured, `score_multiplier` adjusts the relative importance of
 > each store's results during Inline RAG retrieval. Values above 1.0 boost a store; below 1.0 reduce it.
+> `relevance_cutoff_score` filters by raw retrieval score first; weighting applies only to chunks that pass the cutoff.
 
 ### Step 5: Configure RAG Strategy
 
@@ -319,10 +327,10 @@ okp:
 
 Both modes can be enabled simultaneously. Choose based on your latency and control preferences:
 
-| Mode | When context is fetched | Tool call needed | score_multiplier |
-|------|------------------------|------------------|-----------------|
-| Inline RAG | With every query | No | Yes (BYOK only) |
-| Tool RAG | On LLM demand | Yes | No |
+| Mode | When context is fetched | Tool call needed | score_multiplier | relevance_cutoff_score |
+|------|------------------------|------------------|------------------|------------------------|
+| Inline RAG | With every query | No | Yes (BYOK only) | Yes (BYOK only) |
+| Tool RAG | On LLM demand | Yes | No | No |
 
 > [!TIP]
 > A ready-to-use example combining BYOK and OKP is available at
diff --git a/src/app/endpoints/rags.py b/src/app/endpoints/rags.py
index fdc988358..0e4b6c745 100644
--- a/src/app/endpoints/rags.py
+++ b/src/app/endpoints/rags.py
@@ -163,7 +163,7 @@ async def get_rag_endpoint_handler(
 
     # Resolve user-facing rag_id to llama-stack vector_db_id
     vector_db_id = _resolve_rag_id_to_vector_db_id(
-        rag_id, configuration.configuration.byok_rag
+        rag_id, configuration.configuration.byok_rag.entries
     )
 
     try:
diff --git a/src/client.py b/src/client.py
index 0c77c2d49..1bc87ea0b 100644
--- a/src/client.py
+++ b/src/client.py
@@ -85,7 +85,7 @@ def _enrich_library_config(self, input_config_path: str) -> str:
         config = configuration.configuration
 
         # Enrichment: BYOK RAG
-        enrich_byok_rag(ls_config, [b.model_dump() for b in config.byok_rag])
+        enrich_byok_rag(ls_config, [b.model_dump() for b in config.byok_rag.entries])
 
         # Enrichment: Solr - enabled when "okp" appears in either inline or tool list
         enrich_solr(ls_config, config.rag.model_dump(), config.okp.model_dump())
diff --git a/src/configuration.py b/src/configuration.py
index 4eeca0460..297523f9f 100644
--- a/src/configuration.py
+++ b/src/configuration.py
@@ -479,7 +479,8 @@ def rag_id_mapping(self) -> dict[str, str]:
         if self._configuration is None:
             raise LogicError("logic error: configuration is not loaded")
         byok_mapping = {
-            brag.vector_db_id: brag.rag_id for brag in self._configuration.byok_rag
+            brag.vector_db_id: brag.rag_id
+            for brag in self._configuration.byok_rag.entries
         }
 
         rag = self._configuration.rag
@@ -505,7 +506,7 @@ def score_multiplier_mapping(self) -> dict[str, float]:
             raise LogicError("logic error: configuration is not loaded")
         return {
             brag.vector_db_id: brag.score_multiplier
-            for brag in self._configuration.byok_rag
+            for brag in self._configuration.byok_rag.entries
         }
 
     @property
diff --git a/src/constants.py b/src/constants.py
index 69fea3cf9..425840687 100644
--- a/src/constants.py
+++ b/src/constants.py
@@ -186,10 +186,13 @@
 
 # Inline RAG constants
 BYOK_RAG_MAX_CHUNKS = 10  # retrieved from BYOK RAG
+# Default minimum raw similarity for BYOK vector stores only (``byok_rag.relevance_cutoff_score``)
+DEFAULT_BYOK_RAG_RELEVANCE_CUTOFF_SCORE = 0.3
 OKP_RAG_MAX_CHUNKS = 5  # retrieved from OKP RAG
 
 # Solr OKP constants
 SOLR_VECTOR_SEARCH_DEFAULT_K = 5
+# Default score_threshold in vector_io.query params for the OKP/Solr vector store
 SOLR_VECTOR_SEARCH_DEFAULT_SCORE_THRESHOLD = 0.3
 SOLR_VECTOR_SEARCH_DEFAULT_MODE = "hybrid"
 
diff --git a/src/llama_stack_configuration.py b/src/llama_stack_configuration.py
index 3ac4a8ec4..7fb12c1da 100644
--- a/src/llama_stack_configuration.py
+++ b/src/llama_stack_configuration.py
@@ -41,6 +41,29 @@ def increase_indent(self, flow: bool = False, indentless: bool = False) -> None:
         return super().increase_indent(flow, False)
 
 
+def _raw_byok_rag_store_list(raw_byok_rag: Any) -> list[Any]:
+    """Return BYOK store definitions from raw Lightspeed YAML (list or section dict).
+
+    Always returns a ``list`` suitable for :func:`enrich_byok_rag` (each item is
+    expected to be a mapping with ``.get``).
+
+    For the section form ``byok_rag: { entries: ..., relevance_cutoff_score: ... }``,
+    only ``entries`` is used: a ``list`` is returned as-is, a single ``dict`` is
+    wrapped as a one-element list, and ``None``, strings, or other types yield
+    ``[]``.
+    """
+    if isinstance(raw_byok_rag, list):
+        return raw_byok_rag
+    if isinstance(raw_byok_rag, dict):
+        entries = raw_byok_rag.get("entries")
+        if isinstance(entries, list):
+            return entries
+        if isinstance(entries, dict):
+            return [entries]
+        return []
+    return []
+
+
 # =============================================================================
 # Enrichment: Azure Entra ID
 # =============================================================================
@@ -619,7 +642,7 @@ def generate_configuration(
     setup_azure_entra_id_token(config.get("azure_entra_id"), env_file)
 
     # Enrichment: BYOK RAG
-    enrich_byok_rag(ls_config, config.get("byok_rag", []))
+    enrich_byok_rag(ls_config, _raw_byok_rag_store_list(config.get("byok_rag", [])))
 
     # Enrichment: Solr - enabled when "okp" appears in either inline or tool list
     enrich_solr(ls_config, config.get("rag", {}), config.get("okp", {}))
diff --git a/src/models/config.py b/src/models/config.py
index c3e8f6726..6ed0d0bd6 100644
--- a/src/models/config.py
+++ b/src/models/config.py
@@ -7,7 +7,7 @@
 from functools import cached_property
 from pathlib import Path
 from re import Pattern
-from typing import Any, Literal, Optional, Self
+from typing import Annotated, Any, Literal, Optional, Self
 
 import jsonpath_ng
 import yaml
@@ -15,6 +15,7 @@
 from pydantic import (
     AnyHttpUrl,
     BaseModel,
+    BeforeValidator,
     ConfigDict,
     Field,
     FilePath,
@@ -1622,6 +1623,55 @@ class ByokRag(ConfigurationBase):
     )
 
 
+def _normalize_byok_rag_input(value: Any) -> Any:
+    """Allow legacy ``byok_rag: [ ... ]`` YAML alongside the section object form.
+
+    Explicit YAML null (``byok_rag: null``) is normalized to an empty mapping so
+    :class:`ByokRagSection` field defaults apply instead of a type error.
+    """
+    if value is None:
+        return {}
+    if isinstance(value, list):
+        return {"entries": value}
+    return value
+
+
+class ByokRagSection(ConfigurationBase):
+    """BYOK RAG configuration: registered BYOK stores and optional raw-score cutoff.
+
+    Settings here apply only to bring-your-own-knowledge vector stores listed in
+    ``entries``. They do not affect OKP (Solr) inline RAG, which uses separate
+    query parameters and defaults.
+    """
+
+    entries: list[ByokRag] = Field(
+        default_factory=list,
+        title="BYOK RAG stores",
+        description="Registered bring-your-own-knowledge vector stores.",
+    )
+
+    relevance_cutoff_score: float = Field(
+        constants.DEFAULT_BYOK_RAG_RELEVANCE_CUTOFF_SCORE,
+        ge=0,
+        title="BYOK inline RAG relevance cutoff",
+        description="Minimum raw similarity score from each **BYOK** vector store "
+        "before per-store score_multiplier weighting. Chunks below this threshold "
+        "are dropped immediately after retrieval from those stores only. Does not "
+        "apply to OKP/Solr. Set to 0.0 to disable filtering for BYOK.",
+    )
+
+
+def _default_byok_rag_section() -> ByokRagSection:
+    """Return default BYOK RAG section; delegates to :class:`ByokRagSection` field defaults."""
+    return ByokRagSection.model_validate({})
+
+
+ByokRagSectionValidated = Annotated[
+    ByokRagSection,
+    BeforeValidator(_normalize_byok_rag_input),
+]
+
+
 class QuotaLimiterConfiguration(ConfigurationBase):
     """Configuration for one quota limiter.
 
@@ -1908,11 +1958,14 @@ class Configuration(ConfigurationBase):
         description="Conversation history configuration.",
     )
 
-    byok_rag: list[ByokRag] = Field(
-        default_factory=list,
+    byok_rag: ByokRagSectionValidated = Field(
+        default_factory=_default_byok_rag_section,
         title="BYOK RAG configuration",
         description="BYOK RAG configuration. This configuration can be used to "
-        "reconfigure Llama Stack through its run.yaml configuration file",
+        "reconfigure Llama Stack through its run.yaml configuration file. "
+        "You may use the legacy form ``byok_rag: [ ... ]`` (a list of stores) or "
+        "an object with ``entries`` and optional ``relevance_cutoff_score`` "
+        "(BYOK inline RAG only; not used for OKP/Solr).",
     )
 
     a2a_state: A2AStateConfiguration = Field(
diff --git a/src/utils/responses.py b/src/utils/responses.py
index 3fa10b8e2..ba52cceac 100644
--- a/src/utils/responses.py
+++ b/src/utils/responses.py
@@ -236,7 +236,7 @@ async def prepare_tools(  # pylint: disable=too-many-arguments,too-many-position
     #      If rag.inline is configured, but not rag.tool, tool RAG is disabled.
     #   3. All registered vector DBs: fallback when neither rag.tool nor rag.inline are configured.
     #      IDs fetched from llama-stack are already internal and need no translation.
-    byok_rags = configuration.configuration.byok_rag
+    byok_rags = configuration.configuration.byok_rag.entries
 
     is_tool_rag_enabled = len(configuration.configuration.rag.tool) > 0
     is_inline_rag_enabled = len(configuration.configuration.rag.inline) > 0
@@ -1708,7 +1708,7 @@ async def _resolve_client_tools(
     # Per-request override of vector stores (user-facing rag_ids)
     vector_store_ids = extract_vector_store_ids_from_tools(tools) or None
     # Translate user-facing rag_ids to llama-stack vector_store_ids in each file_search tool
-    byok_rags = configuration.configuration.byok_rag
+    byok_rags = configuration.configuration.byok_rag.entries
     prepared_tools = translate_tools_vector_store_ids(tools, byok_rags)
     prepared_tools = apply_mcp_headers_to_explicit_tools(
         prepared_tools, token, mcp_headers, request_headers
@@ -1803,7 +1803,7 @@ async def resolve_tool_choice(
         )
     else:
         # Pass tools explicitly configured for this request
-        byok_rags = configuration.configuration.byok_rag
+        byok_rags = configuration.configuration.byok_rag.entries
         prepared_tools = translate_tools_vector_store_ids(tools, byok_rags)
         prepared_tools = apply_mcp_headers_to_explicit_tools(
             prepared_tools, token, mcp_headers, request_headers
diff --git a/src/utils/vector_search.py b/src/utils/vector_search.py
index 54ec16a49..bea5a5604 100644
--- a/src/utils/vector_search.py
+++ b/src/utils/vector_search.py
@@ -73,14 +73,21 @@ def _build_query_params(solr: Optional[dict[str, Any]] = None) -> dict[str, Any]
 
 
 def _extract_byok_rag_chunks(
-    search_response: Any, vector_store_id: str, weight: float
+    search_response: Any,
+    vector_store_id: str,
+    weight: float,
+    byok_raw_cutoff_score: float,
 ) -> list[dict[str, Any]]:
-    """Extract and weight result chunks from vector search for BYOK RAG.
+    """Extract and weight result chunks from vector search for BYOK RAG only.
+
+    Chunks whose raw retrieval score is below ``byok_raw_cutoff_score`` are
+    dropped before applying the per-store weight. This is not used for OKP/Solr.
 
     Args:
         search_response: Response from vector_io.query
-        vector_store_id: ID of the vector store that produced these results
-        weight: Score multiplier to apply to this store's results
+        vector_store_id: ID of the BYOK vector store that produced these results
+        weight: Score multiplier to apply to this store's remaining results
+        byok_raw_cutoff_score: Minimum raw similarity score (``byok_rag.relevance_cutoff_score``)
 
     Returns:
         List of result dictionaries with weighted scores
@@ -89,6 +96,14 @@ def _extract_byok_rag_chunks(
     for chunk, score in zip(
         search_response.chunks, search_response.scores, strict=True
     ):
+        if score < byok_raw_cutoff_score:
+            logger.debug(
+                "  [%s] BYOK: dropping chunk: raw score=%.4f < cutoff=%.4f",
+                vector_store_id,
+                score,
+                byok_raw_cutoff_score,
+            )
+            continue
         weighted_score = score * weight
         doc_id = (
             chunk.metadata.get("document_id", chunk.chunk_id)
@@ -164,14 +179,16 @@ async def _query_store_for_byok_rag(
     vector_store_id: str,
     query: str,
     weight: float,
+    byok_raw_cutoff_score: float,
 ) -> list[dict[str, Any]]:
-    """Query a single vector store for BYOK RAG.
+    """Query a single **BYOK** vector store for inline RAG (not OKP/Solr).
 
     Args:
         client: AsyncLlamaStackClient for vector_io queries
-        vector_store_id: ID of the vector store to query
+        vector_store_id: ID of the BYOK vector store to query
         query: Search query string
         weight: Score multiplier to apply
+        byok_raw_cutoff_score: Minimum raw score before weighting (BYOK config only)
 
     Returns:
         List of weighted result dictionaries, or empty list on error
@@ -185,7 +202,9 @@ async def _query_store_for_byok_rag(
                 "mode": "vector",
             },
         )
-        return _extract_byok_rag_chunks(search_response, vector_store_id, weight)
+        return _extract_byok_rag_chunks(
+            search_response, vector_store_id, weight, byok_raw_cutoff_score
+        )
     except Exception as e:  # pylint: disable=broad-exception-caught
         logger.warning("Failed to search '%s': %s", vector_store_id, e)
         return []
@@ -333,12 +352,15 @@ async def _fetch_byok_rag(
     query: str,
     vector_store_ids: Optional[list[str]] = None,  # User-facing
 ) -> tuple[list[RAGChunk], list[ReferencedDocument]]:
-    """Fetch chunks and documents from BYOK RAG sources.
+    """Fetch chunks and documents from BYOK RAG sources only.
+
+    Applies ``configuration.byok_rag.relevance_cutoff_score`` to raw scores from
+    BYOK vector stores. OKP/Solr inline RAG is implemented in ``_fetch_solr_rag``
+    and does not use that setting.
 
     Args:
         client: The AsyncLlamaStackClient to use for the request
         query: The search query
-        configuration: Application configuration
         vector_store_ids: Optional list of vector store IDs to query.
             If provided, only these stores will be queried. If None, all stores
             (excluding Solr) will be queried.
@@ -366,7 +388,7 @@ async def _fetch_byok_rag(
 
     # Translate user-facing rag_ids to llama-stack ids
     vector_store_ids_to_query: list[str] = resolve_vector_store_ids(
-        rag_ids_to_query, configuration.configuration.byok_rag
+        rag_ids_to_query, configuration.configuration.byok_rag.entries
     )
 
     # Request-level override: filter out Solr store, use the rest
@@ -386,7 +408,7 @@ async def _fetch_byok_rag(
         score_multiplier_mapping = configuration.score_multiplier_mapping
         rag_id_mapping = configuration.rag_id_mapping
 
-        # Query all vector stores in parallel
+        # BYOK stores only: byok_rag.relevance_cutoff_score (OKP/Solr does not use it).
         results_per_store = await asyncio.gather(
             *[
                 _query_store_for_byok_rag(
@@ -394,12 +416,13 @@ async def _fetch_byok_rag(
                     vector_store_id,
                     query,
                     score_multiplier_mapping.get(vector_store_id, 1.0),
+                    configuration.configuration.byok_rag.relevance_cutoff_score,
                 )
                 for vector_store_id in vector_store_ids_to_query
             ]
         )
 
-        # Flatten, sort by weighted score, and take top results
+        # Flatten, sort by weighted score, take top results
         all_results: list[dict[str, Any]] = []
         for store_results in results_per_store:
             all_results.extend(store_results)
@@ -520,7 +543,9 @@ async def build_rag_context(
 ) -> RAGContext:
     """Build RAG context by fetching and merging chunks from all enabled sources.
 
-    Enabled sources can be BYOK and/or Solr OKP.
+    Enabled sources can be BYOK and/or Solr OKP. BYOK raw-score cutoff comes only
+    from ``byok_rag.relevance_cutoff_score``; OKP uses ``_build_query_params`` (e.g.
+    Solr ``score_threshold``), not the BYOK cutoff.
 
     Args:
         client: The AsyncLlamaStackClient to use for the request
diff --git a/tests/integration/endpoints/test_query_byok_integration.py b/tests/integration/endpoints/test_query_byok_integration.py
index 5e71c76c1..5a43bca17 100644
--- a/tests/integration/endpoints/test_query_byok_integration.py
+++ b/tests/integration/endpoints/test_query_byok_integration.py
@@ -265,7 +265,10 @@ def byok_config_fixture(test_config: AppConfig, mocker: MockerFixture) -> AppCon
     }
 
     # Patch the loaded configuration's byok_rag and rag.inline
-    test_config.configuration.byok_rag = [byok_entry]
+    _br = mocker.MagicMock()
+    _br.entries = [byok_entry]
+    _br.relevance_cutoff_score = 0.0
+    test_config.configuration.byok_rag = _br
     test_config.configuration.rag.inline = ["test-knowledge"]
 
     return test_config
@@ -294,7 +297,10 @@ def byok_tool_config_fixture(
         "score_multiplier": 1.0,
     }
 
-    test_config.configuration.byok_rag = [byok_entry]
+    _br = mocker.MagicMock()
+    _br.entries = [byok_entry]
+    _br.relevance_cutoff_score = 0.0
+    test_config.configuration.byok_rag = _br
     test_config.configuration.rag.inline = []
     test_config.configuration.rag.tool = ["test-knowledge"]
 
@@ -425,7 +431,10 @@ async def test_query_byok_inline_rag_with_request_vector_store_ids(
     entry_b.vector_db_id = "vs-source-b"
     entry_b.score_multiplier = 1.0
 
-    test_config.configuration.byok_rag = [entry_a, entry_b]
+    _br = mocker.MagicMock()
+    _br.entries = [entry_a, entry_b]
+    _br.relevance_cutoff_score = 0.0
+    test_config.configuration.byok_rag = _br
     test_config.configuration.rag.inline = ["source-a"]
 
     mock_holder_class = mocker.patch("app.endpoints.query.AsyncLlamaStackClientHolder")
@@ -486,7 +495,10 @@ async def test_query_byok_request_vector_store_ids_filters_configured_stores(
     entry_b.score_multiplier = 1.0
 
     # Both sources are in config
-    test_config.configuration.byok_rag = [entry_a, entry_b]
+    _br = mocker.MagicMock()
+    _br.entries = [entry_a, entry_b]
+    _br.relevance_cutoff_score = 0.0
+    test_config.configuration.byok_rag = _br
     test_config.configuration.rag.inline = ["source-a", "source-b"]
 
     mock_holder_class = mocker.patch("app.endpoints.query.AsyncLlamaStackClientHolder")
@@ -684,7 +696,10 @@ async def test_query_byok_combined_inline_and_tool_rag(  # pylint: disable=too-m
     byok_entry.rag_id = "test-knowledge"
     byok_entry.vector_db_id = "vs-byok-knowledge"
     byok_entry.score_multiplier = 1.0
-    test_config.configuration.byok_rag = [byok_entry]
+    _br = mocker.MagicMock()
+    _br.entries = [byok_entry]
+    _br.relevance_cutoff_score = 0.0
+    test_config.configuration.byok_rag = _br
     test_config.configuration.rag.inline = ["test-knowledge"]
     test_config.configuration.rag.tool = ["test-knowledge"]
 
@@ -800,7 +815,10 @@ async def test_query_byok_inline_rag_only_configured_rag_id_is_queried(
     entry_b.vector_db_id = "vs-source-b"
     entry_b.score_multiplier = 1.0
 
-    test_config.configuration.byok_rag = [entry_a, entry_b]
+    _br = mocker.MagicMock()
+    _br.entries = [entry_a, entry_b]
+    _br.relevance_cutoff_score = 0.0
+    test_config.configuration.byok_rag = _br
     test_config.configuration.rag.inline = ["source-a"]
 
     mock_holder_class = mocker.patch("app.endpoints.query.AsyncLlamaStackClientHolder")
@@ -871,7 +889,10 @@ async def test_query_byok_score_multiplier_shifts_chunk_priority(  # pylint: dis
     entry_b.vector_db_id = "vs-source-b"
     entry_b.score_multiplier = 5.0
 
-    test_config.configuration.byok_rag = [entry_a, entry_b]
+    _br = mocker.MagicMock()
+    _br.entries = [entry_a, entry_b]
+    _br.relevance_cutoff_score = 0.0
+    test_config.configuration.byok_rag = _br
     test_config.configuration.rag.inline = ["source-a", "source-b"]
 
     mock_holder_class = mocker.patch("app.endpoints.query.AsyncLlamaStackClientHolder")
@@ -926,6 +947,61 @@ async def _side_effect(**kwargs: Any) -> Any:
     assert first_chunk.score > second_chunk.score
 
 
+@pytest.mark.asyncio
+async def test_query_byok_cutoff_applied_before_multiplier(
+    test_config: AppConfig,
+    mocker: MockerFixture,
+    test_request: Request,
+    test_auth: AuthTuple,
+) -> None:
+    """Chunks below relevance_cutoff_score are dropped before score_multiplier.
+
+    With cutoff 0.5 and per-store multiplier 10.0, a raw score of 0.4 would
+    become 4.0 only after surviving the cutoff step; because 0.4 < 0.5, the
+    chunk must be removed and must not appear in the response even with a
+    large multiplier.
+    """
+    entry = mocker.MagicMock()
+    entry.rag_id = "cutoff-test"
+    entry.vector_db_id = "vs-cutoff-test"
+    entry.score_multiplier = 10.0
+
+    _br = mocker.MagicMock()
+    _br.entries = [entry]
+    _br.relevance_cutoff_score = 0.5
+    test_config.configuration.byok_rag = _br
+    test_config.configuration.rag.inline = ["cutoff-test"]
+
+    mock_holder_class = mocker.patch("app.endpoints.query.AsyncLlamaStackClientHolder")
+    mock_client = _build_base_mock_client(mocker)
+
+    low_score_resp = _make_vector_io_response(
+        mocker,
+        [
+            ("Content below raw cutoff", "doc-below", 0.4),
+        ],
+    )
+    mock_client.vector_io.query = mocker.AsyncMock(return_value=low_score_resp)
+
+    mock_vs_resp = mocker.MagicMock()
+    mock_vs_resp.data = []
+    mock_client.vector_stores.list.return_value = mock_vs_resp
+
+    mock_holder_class.return_value.get_client.return_value = mock_client
+
+    query_request = QueryRequest(query="test query")
+
+    response = await query_endpoint_handler(
+        request=test_request,
+        query_request=query_request,
+        auth=test_auth,
+        mcp_headers={},
+    )
+
+    assert response.rag_chunks is not None
+    assert len(response.rag_chunks) == 0
+
+
 # ==============================================================================
 # BYOK_RAG_MAX_CHUNKS Capping Tests
 # ==============================================================================
@@ -953,7 +1029,10 @@ async def test_query_byok_max_chunks_caps_retrieved_results(  # pylint: disable=
     entry.vector_db_id = "vs-big-source"
     entry.score_multiplier = 1.0
 
-    test_config.configuration.byok_rag = [entry]
+    _br = mocker.MagicMock()
+    _br.entries = [entry]
+    _br.relevance_cutoff_score = 0.0
+    test_config.configuration.byok_rag = _br
     test_config.configuration.rag.inline = ["big-source"]
 
     mock_holder_class = mocker.patch("app.endpoints.query.AsyncLlamaStackClientHolder")
@@ -1025,7 +1104,10 @@ async def test_query_byok_max_chunks_caps_across_multiple_sources(  # pylint: di
     entry_b.vector_db_id = "vs-source-b"
     entry_b.score_multiplier = 1.0
 
-    test_config.configuration.byok_rag = [entry_a, entry_b]
+    _br = mocker.MagicMock()
+    _br.entries = [entry_a, entry_b]
+    _br.relevance_cutoff_score = 0.0
+    test_config.configuration.byok_rag = _br
     test_config.configuration.rag.inline = ["source-a", "source-b"]
 
     mock_holder_class = mocker.patch("app.endpoints.query.AsyncLlamaStackClientHolder")
diff --git a/tests/integration/endpoints/test_streaming_query_byok_integration.py b/tests/integration/endpoints/test_streaming_query_byok_integration.py
index fa18edceb..8f963be11 100644
--- a/tests/integration/endpoints/test_streaming_query_byok_integration.py
+++ b/tests/integration/endpoints/test_streaming_query_byok_integration.py
@@ -246,7 +246,10 @@ def byok_config_fixture(test_config: AppConfig, mocker: MockerFixture) -> AppCon
         "score_multiplier": 1.0,
     }
 
-    test_config.configuration.byok_rag = [byok_entry]
+    _br = mocker.MagicMock()
+    _br.entries = [byok_entry]
+    _br.relevance_cutoff_score = 0.0
+    test_config.configuration.byok_rag = _br
     test_config.configuration.rag.inline = ["test-knowledge"]
 
     return test_config
@@ -271,7 +274,10 @@ def byok_tool_config_fixture(
         "score_multiplier": 1.0,
     }
 
-    test_config.configuration.byok_rag = [byok_entry]
+    _br = mocker.MagicMock()
+    _br.entries = [byok_entry]
+    _br.relevance_cutoff_score = 0.0
+    test_config.configuration.byok_rag = _br
     test_config.configuration.rag.inline = []
     test_config.configuration.rag.tool = ["test-knowledge"]
 
@@ -345,7 +351,10 @@ async def test_streaming_query_byok_inline_rag_with_request_vector_store_ids(
     entry_b.vector_db_id = "vs-source-b"
     entry_b.score_multiplier = 1.0
 
-    test_config.configuration.byok_rag = [entry_a, entry_b]
+    _br = mocker.MagicMock()
+    _br.entries = [entry_a, entry_b]
+    _br.relevance_cutoff_score = 0.0
+    test_config.configuration.byok_rag = _br
     test_config.configuration.rag.inline = ["source-a"]
 
     mock_holder_class = mocker.patch(
@@ -409,7 +418,10 @@ async def test_streaming_query_byok_request_vector_store_ids_filters_configured_
     entry_b.vector_db_id = "vs-source-b"
     entry_b.score_multiplier = 1.0
 
-    test_config.configuration.byok_rag = [entry_a, entry_b]
+    _br = mocker.MagicMock()
+    _br.entries = [entry_a, entry_b]
+    _br.relevance_cutoff_score = 0.0
+    test_config.configuration.byok_rag = _br
     test_config.configuration.rag.inline = ["source-a", "source-b"]
 
     mock_holder_class = mocker.patch(
@@ -689,7 +701,10 @@ async def test_streaming_query_byok_combined_inline_and_tool_rag(
     byok_entry.rag_id = "test-knowledge"
     byok_entry.vector_db_id = "vs-byok-knowledge"
     byok_entry.score_multiplier = 1.0
-    test_config.configuration.byok_rag = [byok_entry]
+    _br = mocker.MagicMock()
+    _br.entries = [byok_entry]
+    _br.relevance_cutoff_score = 0.0
+    test_config.configuration.byok_rag = _br
     test_config.configuration.rag.inline = ["test-knowledge"]
     test_config.configuration.rag.tool = ["test-knowledge"]
 
@@ -744,7 +759,7 @@ async def test_streaming_query_byok_combined_inline_and_tool_rag(
 
 
 @pytest.mark.asyncio
-async def test_streaming_query_byok_only_configured_rag_id_is_queried(
+async def test_streaming_query_byok_only_configured_rag_id_is_queried(  # pylint: disable=too-many-locals
     test_config: AppConfig,
     mocker: MockerFixture,
     test_request: Request,
@@ -772,7 +787,10 @@ async def test_streaming_query_byok_only_configured_rag_id_is_queried(
     entry_b.vector_db_id = "vs-source-b"
     entry_b.score_multiplier = 1.0
 
-    test_config.configuration.byok_rag = [entry_a, entry_b]
+    _br = mocker.MagicMock()
+    _br.entries = [entry_a, entry_b]
+    _br.relevance_cutoff_score = 0.0
+    test_config.configuration.byok_rag = _br
     test_config.configuration.rag.inline = ["source-a"]
 
     mock_holder_class = mocker.patch(
@@ -851,7 +869,10 @@ async def test_streaming_query_byok_score_multiplier_shifts_priority(  # pylint:
     entry_b.vector_db_id = "vs-source-b"
     entry_b.score_multiplier = 5.0
 
-    test_config.configuration.byok_rag = [entry_a, entry_b]
+    _br = mocker.MagicMock()
+    _br.entries = [entry_a, entry_b]
+    _br.relevance_cutoff_score = 0.0
+    test_config.configuration.byok_rag = _br
     test_config.configuration.rag.inline = ["source-a", "source-b"]
 
     mock_holder_class = mocker.patch(
@@ -933,7 +954,10 @@ async def test_streaming_query_byok_max_chunks_caps_context(  # pylint: disable=
     entry.vector_db_id = "vs-big-source"
     entry.score_multiplier = 1.0
 
-    test_config.configuration.byok_rag = [entry]
+    _br = mocker.MagicMock()
+    _br.entries = [entry]
+    _br.relevance_cutoff_score = 0.0
+    test_config.configuration.byok_rag = _br
     test_config.configuration.rag.inline = ["big-source"]
 
     mock_holder_class = mocker.patch(
@@ -1009,7 +1033,10 @@ async def test_streaming_query_byok_max_chunks_caps_across_multiple_sources(  #
     entry_b.vector_db_id = "vs-source-b"
     entry_b.score_multiplier = 1.0
 
-    test_config.configuration.byok_rag = [entry_a, entry_b]
+    _br = mocker.MagicMock()
+    _br.entries = [entry_a, entry_b]
+    _br.relevance_cutoff_score = 0.0
+    test_config.configuration.byok_rag = _br
     test_config.configuration.rag.inline = ["source-a", "source-b"]
 
     mock_holder_class = mocker.patch(
diff --git a/tests/unit/app/endpoints/test_rags.py b/tests/unit/app/endpoints/test_rags.py
index 563c223fe..f6247aefa 100644
--- a/tests/unit/app/endpoints/test_rags.py
+++ b/tests/unit/app/endpoints/test_rags.py
@@ -379,7 +379,7 @@ def __init__(self) -> None:
 def test_resolve_rag_id_to_vector_db_id_with_mapping(tmp_path: Path) -> None:
     """Test that _resolve_rag_id_to_vector_db_id maps rag_id to vector_db_id."""
     byok_config = _make_byok_config(str(tmp_path))
-    byok_rags = byok_config.configuration.byok_rag
+    byok_rags = byok_config.configuration.byok_rag.entries
     assert _resolve_rag_id_to_vector_db_id("ocp-4.18-docs", byok_rags) == "vs_abc123"
     assert _resolve_rag_id_to_vector_db_id("company-kb", byok_rags) == "vs_def456"
 
@@ -387,5 +387,5 @@ def test_resolve_rag_id_to_vector_db_id_with_mapping(tmp_path: Path) -> None:
 def test_resolve_rag_id_to_vector_db_id_passthrough(tmp_path: Path) -> None:
     """Test that unmapped IDs are passed through unchanged."""
     byok_config = _make_byok_config(str(tmp_path))
-    byok_rags = byok_config.configuration.byok_rag
+    byok_rags = byok_config.configuration.byok_rag.entries
     assert _resolve_rag_id_to_vector_db_id("vs_unknown", byok_rags) == "vs_unknown"
diff --git a/tests/unit/models/config/test_byok_rag.py b/tests/unit/models/config/test_byok_rag.py
index 0a22b3a12..4083dd6b7 100644
--- a/tests/unit/models/config/test_byok_rag.py
+++ b/tests/unit/models/config/test_byok_rag.py
@@ -6,12 +6,13 @@
 from pydantic import ValidationError
 
 from constants import (
+    DEFAULT_BYOK_RAG_RELEVANCE_CUTOFF_SCORE,
     DEFAULT_EMBEDDING_DIMENSION,
     DEFAULT_EMBEDDING_MODEL,
     DEFAULT_RAG_TYPE,
     DEFAULT_SCORE_MULTIPLIER,
 )
-from models.config import ByokRag
+from models.config import ByokRag, ByokRagSection, Configuration
 
 
 def test_byok_rag_configuration_default_values() -> None:
@@ -189,3 +190,90 @@ def test_byok_rag_configuration_score_multiplier_must_be_positive() -> None:
             db_path="tests/configuration/rag.txt",
             score_multiplier=0.0,
         )
+
+
+def test_byok_rag_section_explicit_null_yields_defaults() -> None:
+    """``byok_rag: null`` in YAML normalizes to defaults (empty entries, default cutoff)."""
+    cfg = Configuration.model_validate(
+        {
+            "name": "t",
+            "service": {"host": "localhost", "port": 8080},
+            "llama_stack": {
+                "api_key": "k",
+                "url": "http://x:1",
+                "use_as_library_client": False,
+            },
+            "user_data_collection": {},
+            "authentication": {"module": "noop"},
+            "byok_rag": None,
+        }
+    )
+    assert cfg.byok_rag.entries == []
+    assert (
+        cfg.byok_rag.relevance_cutoff_score == DEFAULT_BYOK_RAG_RELEVANCE_CUTOFF_SCORE
+    )
+
+
+def test_byok_rag_section_object_and_legacy_list_in_configuration() -> None:
+    """``byok_rag`` accepts a section object or a legacy list of stores."""
+    db_path = "tests/configuration/rag.txt"
+    from_list = Configuration.model_validate(
+        {
+            "name": "t",
+            "service": {"host": "localhost", "port": 8080},
+            "llama_stack": {
+                "api_key": "k",
+                "url": "http://x:1",
+                "use_as_library_client": False,
+            },
+            "user_data_collection": {},
+            "authentication": {"module": "noop"},
+            "byok_rag": [
+                {
+                    "rag_id": "r1",
+                    "vector_db_id": "vs1",
+                    "db_path": db_path,
+                },
+            ],
+        }
+    )
+    assert from_list.byok_rag.entries[0].rag_id == "r1"
+    assert (
+        from_list.byok_rag.relevance_cutoff_score
+        == DEFAULT_BYOK_RAG_RELEVANCE_CUTOFF_SCORE
+    )
+
+    from_obj = Configuration.model_validate(
+        {
+            "name": "t",
+            "service": {"host": "localhost", "port": 8080},
+            "llama_stack": {
+                "api_key": "k",
+                "url": "http://x:1",
+                "use_as_library_client": False,
+            },
+            "user_data_collection": {},
+            "authentication": {"module": "noop"},
+            "byok_rag": {
+                "entries": [
+                    {
+                        "rag_id": "r2",
+                        "vector_db_id": "vs2",
+                        "db_path": db_path,
+                    },
+                ],
+                "relevance_cutoff_score": 0.42,
+            },
+        }
+    )
+    assert from_obj.byok_rag.entries[0].rag_id == "r2"
+    assert from_obj.byok_rag.relevance_cutoff_score == 0.42
+
+
+def test_byok_rag_section_rejects_negative_cutoff() -> None:
+    """relevance_cutoff_score must be non-negative."""
+    with pytest.raises(ValidationError):
+        _ = ByokRagSection(
+            entries=[],
+            relevance_cutoff_score=-0.1,
+        )
diff --git a/tests/unit/models/config/test_dump_configuration.py b/tests/unit/models/config/test_dump_configuration.py
index 06a3ef08c..49184cf31 100644
--- a/tests/unit/models/config/test_dump_configuration.py
+++ b/tests/unit/models/config/test_dump_configuration.py
@@ -8,6 +8,7 @@
 
 from pydantic import SecretStr
 
+import constants
 from models.config import (
     ByokRag,
     Configuration,
@@ -189,7 +190,10 @@ def test_dump_configuration(tmp_path: Path) -> None:
                 "sqlite": None,
                 "type": None,
             },
-            "byok_rag": [],
+            "byok_rag": {
+                "entries": [],
+                "relevance_cutoff_score": constants.DEFAULT_BYOK_RAG_RELEVANCE_CUTOFF_SCORE,
+            },
             "quota_handlers": {
                 "sqlite": None,
                 "postgres": None,
@@ -531,7 +535,10 @@ def test_dump_configuration_with_quota_limiters(tmp_path: Path) -> None:
                 "sqlite": None,
                 "type": None,
             },
-            "byok_rag": [],
+            "byok_rag": {
+                "entries": [],
+                "relevance_cutoff_score": constants.DEFAULT_BYOK_RAG_RELEVANCE_CUTOFF_SCORE,
+            },
             "quota_handlers": {
                 "sqlite": None,
                 "postgres": None,
@@ -765,7 +772,10 @@ def test_dump_configuration_with_quota_limiters_different_values(
                 "sqlite": None,
                 "type": None,
             },
-            "byok_rag": [],
+            "byok_rag": {
+                "entries": [],
+                "relevance_cutoff_score": constants.DEFAULT_BYOK_RAG_RELEVANCE_CUTOFF_SCORE,
+            },
             "quota_handlers": {
                 "sqlite": None,
                 "postgres": None,
@@ -979,17 +989,20 @@ def test_dump_configuration_byok(tmp_path: Path) -> None:
                 "sqlite": None,
                 "type": None,
             },
-            "byok_rag": [
-                {
-                    "db_path": "tests/configuration/rag.txt",
-                    "embedding_dimension": 768,
-                    "embedding_model": "sentence-transformers/all-mpnet-base-v2",
-                    "rag_id": "rag_id",
-                    "rag_type": "inline::faiss",
-                    "vector_db_id": "vector_db_id",
-                    "score_multiplier": 1.0,
-                },
-            ],
+            "byok_rag": {
+                "entries": [
+                    {
+                        "db_path": "tests/configuration/rag.txt",
+                        "embedding_dimension": 768,
+                        "embedding_model": "sentence-transformers/all-mpnet-base-v2",
+                        "rag_id": "rag_id",
+                        "rag_type": "inline::faiss",
+                        "vector_db_id": "vector_db_id",
+                        "score_multiplier": 1.0,
+                    },
+                ],
+                "relevance_cutoff_score": constants.DEFAULT_BYOK_RAG_RELEVANCE_CUTOFF_SCORE,
+            },
             "quota_handlers": {
                 "sqlite": None,
                 "postgres": None,
@@ -1183,7 +1196,10 @@ def test_dump_configuration_pg_namespace(tmp_path: Path) -> None:
                 "sqlite": None,
                 "type": None,
             },
-            "byok_rag": [],
+            "byok_rag": {
+                "entries": [],
+                "relevance_cutoff_score": constants.DEFAULT_BYOK_RAG_RELEVANCE_CUTOFF_SCORE,
+            },
             "quota_handlers": {
                 "sqlite": None,
                 "postgres": None,
diff --git a/tests/unit/telemetry/conftest.py b/tests/unit/telemetry/conftest.py
index 6b2db6a82..e7ef44255 100644
--- a/tests/unit/telemetry/conftest.py
+++ b/tests/unit/telemetry/conftest.py
@@ -12,6 +12,7 @@
     Action,
     AuthenticationConfiguration,
     AuthorizationConfiguration,
+    ByokRagSection,
     Configuration,
     CORSConfiguration,
     Customization,
@@ -287,7 +288,7 @@ def build_fully_populated_config() -> Configuration:
             ),
         ],
         conversation_cache=None,
-        byok_rag=[],
+        byok_rag=ByokRagSection(entries=[]),
         a2a_state=None,
         quota_handlers=None,
         azure_entra_id=None,
@@ -363,7 +364,7 @@ def build_minimal_config() -> Configuration:
         ),
         mcp_servers=[],
         conversation_cache=None,
-        byok_rag=[],
+        byok_rag=ByokRagSection(entries=[]),
         a2a_state=None,
         quota_handlers=None,
         azure_entra_id=None,
diff --git a/tests/unit/test_llama_stack_configuration.py b/tests/unit/test_llama_stack_configuration.py
index d10f60580..6d14a5a8d 100644
--- a/tests/unit/test_llama_stack_configuration.py
+++ b/tests/unit/test_llama_stack_configuration.py
@@ -7,6 +7,7 @@
 import yaml
 
 from llama_stack_configuration import (
+    _raw_byok_rag_store_list,
     construct_models_section,
     construct_storage_backends_section,
     construct_vector_io_providers_section,
@@ -24,6 +25,46 @@
     UserDataCollection,
 )
 
+# =============================================================================
+# Test _raw_byok_rag_store_list
+# =============================================================================
+
+
+def test_raw_byok_rag_store_list_legacy_list() -> None:
+    """Bare list form returns the same sequence."""
+    raw = [{"rag_id": "a"}]
+    assert _raw_byok_rag_store_list(raw) is raw
+
+
+def test_raw_byok_rag_store_list_section_with_entries_list() -> None:
+    """Section dict with entries list returns that list."""
+    entries = [{"rag_id": "x"}]
+    assert (
+        _raw_byok_rag_store_list({"relevance_cutoff_score": 0.3, "entries": entries})
+        is entries
+    )
+
+
+def test_raw_byok_rag_store_list_section_with_single_entry_dict() -> None:
+    """Section dict may use a single mapping for entries (wrap as one-element list)."""
+    one = {"rag_id": "solo", "vector_db_id": "vs1", "db_path": "/d.db"}
+    out = _raw_byok_rag_store_list({"entries": one})
+    assert out == [one]
+
+
+def test_raw_byok_rag_store_list_section_entries_invalid_or_missing() -> None:
+    """None, strings, or missing entries yield an empty list."""
+    assert _raw_byok_rag_store_list({"entries": None}) == []
+    assert _raw_byok_rag_store_list({"entries": "not-a-list"}) == []
+    assert _raw_byok_rag_store_list({}) == []
+
+
+def test_raw_byok_rag_store_list_non_sequence_top_level() -> None:
+    """Non-list, non-dict input yields []."""
+    assert _raw_byok_rag_store_list(None) == []
+    assert _raw_byok_rag_store_list("byok") == []
+
+
 # =============================================================================
 # Test construct_vector_stores_section
 # =============================================================================
diff --git a/tests/unit/utils/test_responses.py b/tests/unit/utils/test_responses.py
index 485ecbf26..9ae98a225 100644
--- a/tests/unit/utils/test_responses.py
+++ b/tests/unit/utils/test_responses.py
@@ -90,6 +90,14 @@
 )
 
 
+def _byok_rag_section_mock(mocker: MockerFixture, entries: list[Any]) -> Any:
+    """Mock ``configuration.byok_rag`` with ``entries`` and default cutoff."""
+    section = mocker.Mock()
+    section.entries = entries
+    section.relevance_cutoff_score = 0.0
+    return section
+
+
 class MockOutputItem:  # pylint: disable=too-few-public-methods
     """Mock Responses API output item."""
 
@@ -1643,7 +1651,9 @@ async def test_translates_byok_ids_in_prepare_tools(
         mock_byok_rag.rag_id = "ocp_docs"
         mock_byok_rag.vector_db_id = "vs-001"
         mock_config = mocker.Mock()
-        mock_config.configuration.byok_rag = [mock_byok_rag]
+        mock_config.configuration.byok_rag = _byok_rag_section_mock(
+            mocker, [mock_byok_rag]
+        )
         mock_config.configuration.rag.tool = []
         mock_config.configuration.rag.inline = []
         mocker.patch("utils.responses.configuration", mock_config)
@@ -1664,7 +1674,7 @@ async def test_passes_through_unknown_ids_in_prepare_tools(
 
         # Configure empty BYOK RAG
         mock_config = mocker.Mock()
-        mock_config.configuration.byok_rag = []
+        mock_config.configuration.byok_rag = _byok_rag_section_mock(mocker, [])
         mock_config.configuration.rag.tool = []
         mock_config.configuration.rag.inline = []
         mocker.patch("utils.responses.configuration", mock_config)
@@ -1694,7 +1704,9 @@ async def test_does_not_translate_when_ids_fetched_from_llama_stack(
         mock_byok_rag.rag_id = "vs-internal"
         mock_byok_rag.vector_db_id = "vs-translated"
         mock_config = mocker.Mock()
-        mock_config.configuration.byok_rag = [mock_byok_rag]
+        mock_config.configuration.byok_rag = _byok_rag_section_mock(
+            mocker, [mock_byok_rag]
+        )
         mock_config.configuration.rag.tool = []
         mock_config.configuration.rag.inline = []
         mocker.patch("utils.responses.configuration", mock_config)
@@ -1717,7 +1729,7 @@ async def test_uses_rag_tool_config_when_no_per_request_ids(
         mocker.patch("utils.responses.get_mcp_tools", return_value=None)
 
         mock_config = mocker.Mock()
-        mock_config.configuration.byok_rag = []
+        mock_config.configuration.byok_rag = _byok_rag_section_mock(mocker, [])
         mock_config.configuration.rag.tool = ["rag-tool-id-1", "rag-tool-id-2"]
         mock_config.configuration.rag.inline = []
         mocker.patch("utils.responses.configuration", mock_config)
@@ -1742,7 +1754,9 @@ async def test_rag_tool_config_ids_are_translated(
         mock_byok_rag.rag_id = "ocp_docs"
         mock_byok_rag.vector_db_id = "vs-001"
         mock_config = mocker.Mock()
-        mock_config.configuration.byok_rag = [mock_byok_rag]
+        mock_config.configuration.byok_rag = _byok_rag_section_mock(
+            mocker, [mock_byok_rag]
+        )
         mock_config.configuration.rag.tool = ["ocp_docs"]
         mock_config.configuration.rag.inline = []
         mocker.patch("utils.responses.configuration", mock_config)
@@ -1760,7 +1774,7 @@ async def test_inline_rag_disables_tool_rag(self, mocker: MockerFixture) -> None
         mocker.patch("utils.responses.get_mcp_tools", return_value=None)
 
         mock_config = mocker.Mock()
-        mock_config.configuration.byok_rag = []
+        mock_config.configuration.byok_rag = _byok_rag_section_mock(mocker, [])
         mock_config.configuration.rag.tool = []
         mock_config.configuration.rag.inline = [
             "inline-store-id"
@@ -1782,7 +1796,7 @@ async def test_per_request_ids_override_rag_tool_config(
         mocker.patch("utils.responses.get_mcp_tools", return_value=None)
 
         mock_config = mocker.Mock()
-        mock_config.configuration.byok_rag = []
+        mock_config.configuration.byok_rag = _byok_rag_section_mock(mocker, [])
         mock_config.configuration.rag.tool = ["config-id-1"]
         mock_config.configuration.rag.inline = []
         mocker.patch("utils.responses.configuration", mock_config)
@@ -1807,7 +1821,7 @@ async def test_all_registered_dbs_used_when_neither_tool_nor_inline_configured(
         mocker.patch("utils.responses.get_mcp_tools", return_value=None)
 
         mock_config = mocker.Mock()
-        mock_config.configuration.byok_rag = []
+        mock_config.configuration.byok_rag = _byok_rag_section_mock(mocker, [])
         mock_config.configuration.rag.tool = []
         mock_config.configuration.rag.inline = []
         mocker.patch("utils.responses.configuration", mock_config)
@@ -3355,7 +3369,7 @@ async def test_client_tools_without_merge_header(
             return_value=mock_holder,
         )
         mock_config = mocker.Mock()
-        mock_config.configuration.byok_rag = []
+        mock_config.configuration.byok_rag = _byok_rag_section_mock(mocker, [])
         mock_config.mcp_servers = []
         mocker.patch("utils.responses.configuration", mock_config)
 
@@ -3380,7 +3394,7 @@ async def test_client_tools_with_merge_header(self, mocker: MockerFixture) -> No
             return_value=mock_holder,
         )
         mock_config = mocker.Mock()
-        mock_config.configuration.byok_rag = []
+        mock_config.configuration.byok_rag = _byok_rag_section_mock(mocker, [])
         mock_config.mcp_servers = []
         mocker.patch("utils.responses.configuration", mock_config)
 
@@ -3417,7 +3431,7 @@ async def test_merge_header_conflict_raises_409(
             return_value=mock_holder,
         )
         mock_config = mocker.Mock()
-        mock_config.configuration.byok_rag = []
+        mock_config.configuration.byok_rag = _byok_rag_section_mock(mocker, [])
         mock_config.mcp_servers = []
         mocker.patch("utils.responses.configuration", mock_config)
 
@@ -3476,7 +3490,7 @@ async def test_merge_header_no_server_tools_returns_client_only(
             return_value=mock_holder,
         )
         mock_config = mocker.Mock()
-        mock_config.configuration.byok_rag = []
+        mock_config.configuration.byok_rag = _byok_rag_section_mock(mocker, [])
         mock_config.mcp_servers = []
         mocker.patch("utils.responses.configuration", mock_config)
         mocker.patch(
diff --git a/tests/unit/utils/test_vector_search.py b/tests/unit/utils/test_vector_search.py
index 8bd5669e9..7e2752dfd 100644
--- a/tests/unit/utils/test_vector_search.py
+++ b/tests/unit/utils/test_vector_search.py
@@ -1,5 +1,7 @@
 """Unit tests for vector search utilities."""
 
+from typing import Any
+
 import pytest
 from pydantic import AnyUrl
 from pytest_mock import MockerFixture
@@ -23,6 +25,14 @@
 )
 
 
+def _byok_rag_section_mock(mocker: MockerFixture, entries: list[Any]) -> Any:
+    """Build a mock ``byok_rag`` section with ``entries`` and cutoff."""
+    section = mocker.Mock()
+    section.entries = entries
+    section.relevance_cutoff_score = 0.0
+    return section
+
+
 class TestIsSolrEnabled:
     """Tests for _is_solr_enabled function."""
 
@@ -97,7 +107,10 @@ def test_extract_chunks_with_metadata(self, mocker: MockerFixture) -> None:
         search_response.scores = [0.9, 0.8]
 
         result = _extract_byok_rag_chunks(
-            search_response, vector_store_id="test_store", weight=1.5
+            search_response,
+            vector_store_id="test_store",
+            weight=1.5,
+            byok_raw_cutoff_score=0.0,
         )
 
         assert len(result) == 2
@@ -119,13 +132,42 @@ def test_extract_chunks_without_metadata(self, mocker: MockerFixture) -> None:
         search_response.scores = [0.75]
 
         result = _extract_byok_rag_chunks(
-            search_response, vector_store_id="test_store", weight=1.0
+            search_response,
+            vector_store_id="test_store",
+            weight=1.0,
+            byok_raw_cutoff_score=0.0,
         )
 
         assert len(result) == 1
         assert result[0]["doc_id"] == "chunk_id"
         assert result[0]["metadata"] == {}
 
+    def test_extract_skips_chunks_below_raw_cutoff(self, mocker: MockerFixture) -> None:
+        """Raw scores below the BYOK cutoff are not included."""
+        chunk_hi = mocker.Mock()
+        chunk_hi.content = "Hi"
+        chunk_hi.chunk_id = "a"
+        chunk_hi.metadata = {}
+        chunk_lo = mocker.Mock()
+        chunk_lo.content = "Lo"
+        chunk_lo.chunk_id = "b"
+        chunk_lo.metadata = {}
+
+        search_response = mocker.Mock()
+        search_response.chunks = [chunk_hi, chunk_lo]
+        search_response.scores = [0.9, 0.2]
+
+        result = _extract_byok_rag_chunks(
+            search_response,
+            vector_store_id="vs",
+            weight=0.5,
+            byok_raw_cutoff_score=0.5,
+        )
+        assert len(result) == 1
+        assert result[0]["content"] == "Hi"
+        assert result[0]["score"] == 0.9
+        assert result[0]["weighted_score"] == 0.45
+
 
 class TestFormatRagContext:
     """Tests for _format_rag_context function."""
@@ -382,7 +424,7 @@ async def test_byok_no_inline_ids(self, mocker: MockerFixture) -> None:
         """Test when no inline BYOK sources are configured."""
         config_mock = mocker.Mock(spec=AppConfig)
         config_mock.configuration.rag.inline = []
-        config_mock.configuration.byok_rag = []
+        config_mock.configuration.byok_rag = _byok_rag_section_mock(mocker, [])
         mocker.patch("utils.vector_search.configuration", config_mock)
 
         client_mock = mocker.AsyncMock()
@@ -401,7 +443,9 @@ async def test_byok_enabled_success(self, mocker: MockerFixture) -> None:
         byok_rag_mock.rag_id = "rag_1"
         byok_rag_mock.vector_db_id = "vs_1"
         config_mock.configuration.rag.inline = ["rag_1"]
-        config_mock.configuration.byok_rag = [byok_rag_mock]
+        config_mock.configuration.byok_rag = _byok_rag_section_mock(
+            mocker, [byok_rag_mock]
+        )
         config_mock.score_multiplier_mapping = {"vs_1": 1.5}
         config_mock.rag_id_mapping = {"vs_1": "rag_1"}
         mocker.patch("utils.vector_search.configuration", config_mock)
@@ -439,7 +483,9 @@ async def test_user_facing_ids_translated_to_internal_ids(
         byok_rag_mock = mocker.Mock()
         byok_rag_mock.rag_id = "my-kb"
         byok_rag_mock.vector_db_id = "vs-internal-001"
-        config_mock.configuration.byok_rag = [byok_rag_mock]
+        config_mock.configuration.byok_rag = _byok_rag_section_mock(
+            mocker, [byok_rag_mock]
+        )
         config_mock.configuration.rag.inline = ["my-kb"]
         config_mock.score_multiplier_mapping = {"vs-internal-001": 1.0}
         config_mock.rag_id_mapping = {"vs-internal-001": "my-kb"}
@@ -479,7 +525,9 @@ async def test_multiple_user_facing_ids_each_translated(
         byok_rag_2 = mocker.Mock()
         byok_rag_2.rag_id = "kb-part2"
         byok_rag_2.vector_db_id = "vs-bbb-222"
-        config_mock.configuration.byok_rag = [byok_rag_1, byok_rag_2]
+        config_mock.configuration.byok_rag = _byok_rag_section_mock(
+            mocker, [byok_rag_1, byok_rag_2]
+        )
         config_mock.configuration.rag.inline = ["kb-part1", "kb-part2"]
         config_mock.score_multiplier_mapping = {"vs-aaa-111": 1.0, "vs-bbb-222": 1.0}
         config_mock.rag_id_mapping = {
@@ -522,7 +570,7 @@ async def test_no_inline_rag_configured_skips_byok(
         """Test that BYOK inline RAG is skipped when rag.inline is empty."""
         config_mock = mocker.Mock(spec=AppConfig)
         config_mock.configuration.rag.inline = []
-        config_mock.configuration.byok_rag = []
+        config_mock.configuration.byok_rag = _byok_rag_section_mock(mocker, [])
         mocker.patch("utils.vector_search.configuration", config_mock)
 
         client_mock = mocker.AsyncMock()
@@ -542,7 +590,7 @@ async def test_request_id_not_in_inline_config_skips_byok(
         """Test that a request vector_store_id not registered in rag.inline is filtered out."""
         config_mock = mocker.Mock(spec=AppConfig)
         config_mock.configuration.rag.inline = ["registered-id"]
-        config_mock.configuration.byok_rag = []
+        config_mock.configuration.byok_rag = _byok_rag_section_mock(mocker, [])
         mocker.patch("utils.vector_search.configuration", config_mock)
 
         client_mock = mocker.AsyncMock()
@@ -555,6 +603,45 @@ async def test_request_id_not_in_inline_config_skips_byok(
         assert referenced_docs == []
         client_mock.vector_io.query.assert_not_called()
 
+    @pytest.mark.asyncio
+    async def test_byok_relevance_cutoff_drops_low_raw_scores(
+        self, mocker: MockerFixture
+    ) -> None:
+        """Chunks with raw retrieval score below BYOK relevance_cutoff_score are excluded."""
+        config_mock = mocker.Mock(spec=AppConfig)
+        byok_rag_mock = mocker.Mock()
+        byok_rag_mock.rag_id = "rag_1"
+        byok_rag_mock.vector_db_id = "vs_1"
+        section = mocker.Mock()
+        section.entries = [byok_rag_mock]
+        section.relevance_cutoff_score = 0.5
+        config_mock.configuration.rag.inline = ["rag_1"]
+        config_mock.configuration.byok_rag = section
+        config_mock.score_multiplier_mapping = {"vs_1": 1.0}
+        config_mock.rag_id_mapping = {"vs_1": "rag_1"}
+        mocker.patch("utils.vector_search.configuration", config_mock)
+
+        chunk_hi = mocker.Mock()
+        chunk_hi.content = "Keep"
+        chunk_hi.chunk_id = "c1"
+        chunk_hi.metadata = {}
+
+        chunk_lo = mocker.Mock()
+        chunk_lo.content = "Drop"
+        chunk_lo.chunk_id = "c2"
+        chunk_lo.metadata = {}
+
+        search_response = mocker.Mock()
+        search_response.chunks = [chunk_hi, chunk_lo]
+        search_response.scores = [0.9, 0.3]
+
+        client_mock = mocker.AsyncMock()
+        client_mock.vector_io.query.return_value = search_response
+
+        rag_chunks, _referenced_docs = await _fetch_byok_rag(client_mock, "q")
+        assert len(rag_chunks) == 1
+        assert rag_chunks[0].content == "Keep"
+
 
 class TestFetchSolrRag:
     """Tests for _fetch_solr_rag async function."""
@@ -613,7 +700,7 @@ async def test_both_sources_disabled(self, mocker: MockerFixture) -> None:
         """Test when both BYOK inline and Solr inline are not configured."""
         config_mock = mocker.Mock(spec=AppConfig)
         config_mock.configuration.rag.inline = []
-        config_mock.configuration.byok_rag = []
+        config_mock.configuration.byok_rag = _byok_rag_section_mock(mocker, [])
         config_mock.inline_solr_enabled = False
         mocker.patch("utils.vector_search.configuration", config_mock)
 
@@ -633,7 +720,9 @@ async def test_byok_enabled_only(self, mocker: MockerFixture) -> None:
         byok_rag_mock.rag_id = "rag_1"
         byok_rag_mock.vector_db_id = "vs_1"
         config_mock.configuration.rag.inline = ["rag_1"]
-        config_mock.configuration.byok_rag = [byok_rag_mock]
+        config_mock.configuration.byok_rag = _byok_rag_section_mock(
+            mocker, [byok_rag_mock]
+        )
         config_mock.inline_solr_enabled = False
         config_mock.score_multiplier_mapping = {"vs_1": 1.0}
         config_mock.rag_id_mapping = {"vs_1": "rag_1"}
@@ -657,4 +746,3 @@ async def test_byok_enabled_only(self, mocker: MockerFixture) -> None:
 
         assert len(context.rag_chunks) > 0
         assert "BYOK content" in context.context_text
-        assert "file_search found" in context.context_text

From e371ef42ae6714bf1a7c9caf1c92e20b28a1c8b4 Mon Sep 17 00:00:00 2001
From: Sergey Yedrikov <syedriko@redhat.com>
Date: Fri, 17 Apr 2026 17:21:58 -0400
Subject: [PATCH 2/3] Per BYOK database cutoff score

---
 docs/byok_guide.md                            |  23 ++--
 docs/openapi.json                             |  19 ++--
 examples/lightspeed-stack-byok-okp-rag.yaml   |   2 +
 src/app/endpoints/a2a.py                      |  32 +++++-
 src/app/endpoints/rags.py                     |   2 +-
 src/client.py                                 |   2 +-
 src/configuration.py                          |  27 ++++-
 src/constants.py                              |   2 +-
 src/llama_stack_configuration.py              |  27 +++--
 src/models/config.py                          |  95 +++++++++--------
 src/utils/responses.py                        |   6 +-
 src/utils/vector_search.py                    |  29 ++---
 .../endpoints/test_query_byok_integration.py  |  70 +++++-------
 .../test_streaming_query_byok_integration.py  |  61 +++++------
 tests/unit/app/endpoints/test_rags.py         |   4 +-
 tests/unit/models/config/test_byok_rag.py     | 100 +++++++++++-------
 .../models/config/test_dump_configuration.py  |  46 +++-----
 tests/unit/telemetry/conftest.py              |   5 +-
 tests/unit/test_configuration.py              |  39 +++++++
 tests/unit/test_llama_stack_configuration.py  |  31 +-----
 tests/unit/utils/test_responses.py            |  44 ++++----
 tests/unit/utils/test_vector_search.py        |  43 ++++----
 22 files changed, 388 insertions(+), 321 deletions(-)

diff --git a/docs/byok_guide.md b/docs/byok_guide.md
index b98a84139..62a3e346e 100644
--- a/docs/byok_guide.md
+++ b/docs/byok_guide.md
@@ -79,7 +79,7 @@ Both modes rely on:
 
 Inline RAG additionally supports:
 - **Score Multiplier**: Optional weight applied per BYOK vector store when mixing multiple sources. Allows custom prioritization of content.
-- **Relevance cutoff (`relevance_cutoff_score`)**: Optional minimum **raw** similarity score from each BYOK vector store during **Inline RAG**. Chunks below the cutoff are dropped **before** `score_multiplier` is applied. It applies only to BYOK stores listed under `byok_rag`; it does not affect OKP/Solr inline RAG (which uses separate query defaults) and is not used for Tool RAG (`file_search`). The default matches `DEFAULT_BYOK_RAG_RELEVANCE_CUTOFF_SCORE` in `src/constants.py` (currently `0.3`). Set to `0.0` to disable filtering for BYOK inline retrieval.
+- **Relevance cutoff (`relevance_cutoff_score`)**: Optional minimum **raw** similarity score, configured **per BYOK store** (each `byok_rag` entry) during **Inline RAG**. Chunks below the cutoff are dropped **before** `score_multiplier` is applied. It applies only to BYOK stores listed under `byok_rag`; it does not affect OKP/Solr inline RAG (which uses separate query defaults) and is not used for Tool RAG (`file_search`). If an entry omits `relevance_cutoff_score`, it defaults to `DEFAULT_BYOK_RAG_RELEVANCE_CUTOFF_SCORE` in `src/constants.py` (currently `0.3`). Set to `0.0` to disable filtering for that store.
 
 > [!NOTE]
 > OKP and BYOK scores are not directly comparable (different scoring systems), so
@@ -281,23 +281,20 @@ registered_resources:
 > section of `lightspeed-stack.yaml`. The lightspeed-stack service automatically generates the required configuration
 > at startup.
 >
-> Preferred shape: an object with `entries` (list of stores) and optional `relevance_cutoff_score`:
+> Preferred shape: a YAML list of stores. Set optional `relevance_cutoff_score` on **each** store, or rely on the default from `src/constants.py`:
 >
 > ```yaml
 > byok_rag:
->   relevance_cutoff_score: 0.3   # Optional; min raw score per BYOK store before score_multiplier (BYOK only)
->   entries:
->     - rag_id: my-docs           # Unique identifier for this knowledge source
->       rag_type: inline::faiss
->       embedding_model: sentence-transformers/all-mpnet-base-v2
->       embedding_dimension: 768
->       vector_db_id: your-index-id  # Llama Stack vector store ID (from index generation)
->       db_path: /path/to/vector_db/faiss_store.db
->       score_multiplier: 1.0       # Optional: weight results when mixing multiple sources
+>   - rag_id: my-docs           # Unique identifier for this knowledge source
+>     rag_type: inline::faiss
+>     embedding_model: sentence-transformers/all-mpnet-base-v2
+>     embedding_dimension: 768
+>     vector_db_id: your-index-id  # Llama Stack vector store ID (from index generation)
+>     db_path: /path/to/vector_db/faiss_store.db
+>     relevance_cutoff_score: 0.3   # Optional per store; min raw score before score_multiplier (BYOK inline only)
+>     score_multiplier: 1.0         # Optional: weight results when mixing multiple sources
 > ```
 >
-> Legacy: a bare list is still accepted and is treated as `entries` (same fields as each list item above).
->
 > When multiple BYOK sources are configured, `score_multiplier` adjusts the relative importance of
 > each store's results during Inline RAG retrieval. Values above 1.0 boost a store; below 1.0 reduce it.
 > `relevance_cutoff_score` filters by raw retrieval score first; weighting applies only to chunks that pass the cutoff.
diff --git a/docs/openapi.json b/docs/openapi.json
index 8679b5a03..ff0c3d03f 100644
--- a/docs/openapi.json
+++ b/docs/openapi.json
@@ -6931,8 +6931,8 @@
                 "tags": [
                     "a2a"
                 ],
-                "summary": "Handle A2A Jsonrpc",
-                "description": "Handle A2A JSON-RPC requests following the A2A protocol specification.\n\nThis endpoint uses the DefaultRequestHandler from the A2A SDK to handle\nall JSON-RPC requests including message/send, message/stream, etc.\n\nThe A2A SDK application is created per-request to include authentication\ncontext while still leveraging FastAPI's authorization middleware.\n\nAutomatically detects streaming requests (message/stream JSON-RPC method)\nand returns a StreamingResponse to enable real-time chunk delivery.\n\nArgs:\n    request: FastAPI request object\n    auth: Authentication tuple\n    mcp_headers: MCP headers for context propagation\n\nReturns:\n    JSON-RPC response or streaming response",
+                "summary": "Handle A2A Jsonrpc Get",
+                "description": "Handle GET on ``/a2a`` for A2A JSON-RPC (same handler as POST).",
                 "operationId": "handle_a2a_jsonrpc_a2a_get",
                 "responses": {
                     "200": {
@@ -6949,9 +6949,9 @@
                 "tags": [
                     "a2a"
                 ],
-                "summary": "Handle A2A Jsonrpc",
-                "description": "Handle A2A JSON-RPC requests following the A2A protocol specification.\n\nThis endpoint uses the DefaultRequestHandler from the A2A SDK to handle\nall JSON-RPC requests including message/send, message/stream, etc.\n\nThe A2A SDK application is created per-request to include authentication\ncontext while still leveraging FastAPI's authorization middleware.\n\nAutomatically detects streaming requests (message/stream JSON-RPC method)\nand returns a StreamingResponse to enable real-time chunk delivery.\n\nArgs:\n    request: FastAPI request object\n    auth: Authentication tuple\n    mcp_headers: MCP headers for context propagation\n\nReturns:\n    JSON-RPC response or streaming response",
-                "operationId": "handle_a2a_jsonrpc_a2a_get",
+                "summary": "Handle A2A Jsonrpc Post",
+                "description": "Handle POST on ``/a2a`` for A2A JSON-RPC (same handler as GET).",
+                "operationId": "handle_a2a_jsonrpc_a2a_post",
                 "responses": {
                     "200": {
                         "description": "Successful Response",
@@ -8074,6 +8074,13 @@
                         "title": "DB path",
                         "description": "Path to RAG database."
                     },
+                    "relevance_cutoff_score": {
+                        "type": "number",
+                        "minimum": 0.0,
+                        "title": "BYOK inline RAG relevance cutoff",
+                        "description": "Minimum raw similarity score from this **BYOK** vector store before score_multiplier weighting. Chunks below this threshold are dropped immediately after retrieval from this store only. Does not apply to OKP/Solr. Set to 0.0 to disable filtering for this store.",
+                        "default": 0.3
+                    },
                     "score_multiplier": {
                         "type": "number",
                         "exclusiveMinimum": 0.0,
@@ -8251,7 +8258,7 @@
                         },
                         "type": "array",
                         "title": "BYOK RAG configuration",
-                        "description": "BYOK RAG configuration. This configuration can be used to reconfigure Llama Stack through its run.yaml configuration file"
+                        "description": "BYOK RAG configuration. This configuration can be used to reconfigure Llama Stack through its run.yaml configuration file. Use ``byok_rag: [ ... ]`` (a list of stores). Each store may set ``relevance_cutoff_score`` (BYOK inline RAG only; not used for OKP/Solr)."
                     },
                     "a2a_state": {
                         "$ref": "#/components/schemas/A2AStateConfiguration",
diff --git a/examples/lightspeed-stack-byok-okp-rag.yaml b/examples/lightspeed-stack-byok-okp-rag.yaml
index 4a6b433b8..a160eaafd 100644
--- a/examples/lightspeed-stack-byok-okp-rag.yaml
+++ b/examples/lightspeed-stack-byok-okp-rag.yaml
@@ -40,12 +40,14 @@ byok_rag:
     embedding_dimension: 1024
     vector_db_id: vs_123       # Llama-stack vector_store_id
     db_path: /tmp/ocp.faiss
+    relevance_cutoff_score: 0.3   # Optional per store; min raw score before score_multiplier (inline BYOK only)
     score_multiplier: 1.0      # Weight for this vector store's results (Inline RAG only)
   - rag_id: knowledge-base     # referenced in rag.inline / rag.tool
     rag_type: inline::faiss
     embedding_dimension: 384
     vector_db_id: vs_456       # Llama-stack vector_store_id
     db_path: /tmp/kb.faiss
+    relevance_cutoff_score: 0.3
     score_multiplier: 1.2      # Weight for this vector store's results (Inline RAG only)
 
 # RAG configuration
diff --git a/src/app/endpoints/a2a.py b/src/app/endpoints/a2a.py
index 0c9fe2a75..b2b8b80f7 100644
--- a/src/app/endpoints/a2a.py
+++ b/src/app/endpoints/a2a.py
@@ -692,12 +692,40 @@ async def _create_a2a_app(
     return a2a_app.build()
 
 
-@router.api_route("/a2a", methods=["GET", "POST"], response_model=None)
+@router.get(
+    "/a2a",
+    response_model=None,
+    operation_id="handle_a2a_jsonrpc_a2a_get",
+)
+@authorize(Action.A2A_JSONRPC)
+async def handle_a2a_jsonrpc_get(
+    request: Request,
+    auth: Annotated[AuthTuple, Depends(auth_dependency)],
+    mcp_headers: McpHeaders = Depends(mcp_headers_dependency),
+) -> Response | StreamingResponse:
+    """Handle GET on ``/a2a`` for A2A JSON-RPC (same handler as POST)."""
+    return await _handle_a2a_jsonrpc(request, auth, mcp_headers)
+
+
+@router.post(
+    "/a2a",
+    response_model=None,
+    operation_id="handle_a2a_jsonrpc_a2a_post",
+)
 @authorize(Action.A2A_JSONRPC)
-async def handle_a2a_jsonrpc(  # pylint: disable=too-many-locals,too-many-statements
+async def handle_a2a_jsonrpc_post(
     request: Request,
     auth: Annotated[AuthTuple, Depends(auth_dependency)],
     mcp_headers: McpHeaders = Depends(mcp_headers_dependency),
+) -> Response | StreamingResponse:
+    """Handle POST on ``/a2a`` for A2A JSON-RPC (same handler as GET)."""
+    return await _handle_a2a_jsonrpc(request, auth, mcp_headers)
+
+
+async def _handle_a2a_jsonrpc(  # pylint: disable=too-many-locals,too-many-statements
+    request: Request,
+    auth: AuthTuple,
+    mcp_headers: McpHeaders,
 ) -> Response | StreamingResponse:
     """
     Handle A2A JSON-RPC requests following the A2A protocol specification.
diff --git a/src/app/endpoints/rags.py b/src/app/endpoints/rags.py
index 0e4b6c745..fdc988358 100644
--- a/src/app/endpoints/rags.py
+++ b/src/app/endpoints/rags.py
@@ -163,7 +163,7 @@ async def get_rag_endpoint_handler(
 
     # Resolve user-facing rag_id to llama-stack vector_db_id
     vector_db_id = _resolve_rag_id_to_vector_db_id(
-        rag_id, configuration.configuration.byok_rag.entries
+        rag_id, configuration.configuration.byok_rag
     )
 
     try:
diff --git a/src/client.py b/src/client.py
index 1bc87ea0b..0c77c2d49 100644
--- a/src/client.py
+++ b/src/client.py
@@ -85,7 +85,7 @@ def _enrich_library_config(self, input_config_path: str) -> str:
         config = configuration.configuration
 
         # Enrichment: BYOK RAG
-        enrich_byok_rag(ls_config, [b.model_dump() for b in config.byok_rag.entries])
+        enrich_byok_rag(ls_config, [b.model_dump() for b in config.byok_rag])
 
         # Enrichment: Solr - enabled when "okp" appears in either inline or tool list
         enrich_solr(ls_config, config.rag.model_dump(), config.okp.model_dump())
diff --git a/src/configuration.py b/src/configuration.py
index 297523f9f..3689d30b9 100644
--- a/src/configuration.py
+++ b/src/configuration.py
@@ -479,8 +479,7 @@ def rag_id_mapping(self) -> dict[str, str]:
         if self._configuration is None:
             raise LogicError("logic error: configuration is not loaded")
         byok_mapping = {
-            brag.vector_db_id: brag.rag_id
-            for brag in self._configuration.byok_rag.entries
+            brag.vector_db_id: brag.rag_id for brag in self._configuration.byok_rag
         }
 
         rag = self._configuration.rag
@@ -506,9 +505,31 @@ def score_multiplier_mapping(self) -> dict[str, float]:
             raise LogicError("logic error: configuration is not loaded")
         return {
             brag.vector_db_id: brag.score_multiplier
-            for brag in self._configuration.byok_rag.entries
+            for brag in self._configuration.byok_rag
         }
 
+    @property
+    def relevance_cutoff_mapping(self) -> dict[str, float]:
+        """Return mapping from vector_db_id to relevance_cutoff_score from BYOK RAG config.
+
+        If ``byok_rag`` lists the same ``vector_db_id`` more than once, the first
+        occurrence wins.
+
+        Returns:
+            dict[str, float]: Mapping where keys are llama-stack ``vector_db_id`` values
+            and values are per-store raw-score cutoffs before score_multiplier weighting.
+
+        Raises:
+            LogicError: If the configuration has not been loaded.
+        """
+        if self._configuration is None:
+            raise LogicError("logic error: configuration is not loaded")
+        mapping: dict[str, float] = {}
+        for brag in self._configuration.byok_rag:
+            if brag.vector_db_id not in mapping:
+                mapping[brag.vector_db_id] = brag.relevance_cutoff_score
+        return mapping
+
     @property
     def inline_solr_enabled(self) -> bool:
         """Return whether OKP is included in the inline RAG list.
diff --git a/src/constants.py b/src/constants.py
index 425840687..f09305e10 100644
--- a/src/constants.py
+++ b/src/constants.py
@@ -186,7 +186,7 @@
 
 # Inline RAG constants
 BYOK_RAG_MAX_CHUNKS = 10  # retrieved from BYOK RAG
-# Default minimum raw similarity for BYOK vector stores only (``byok_rag.relevance_cutoff_score``)
+# Default minimum raw similarity per BYOK store (``ByokRag.relevance_cutoff_score``)
 DEFAULT_BYOK_RAG_RELEVANCE_CUTOFF_SCORE = 0.3
 OKP_RAG_MAX_CHUNKS = 5  # retrieved from OKP RAG
 
diff --git a/src/llama_stack_configuration.py b/src/llama_stack_configuration.py
index 7fb12c1da..106db656e 100644
--- a/src/llama_stack_configuration.py
+++ b/src/llama_stack_configuration.py
@@ -42,25 +42,24 @@ def increase_indent(self, flow: bool = False, indentless: bool = False) -> None:
 
 
 def _raw_byok_rag_store_list(raw_byok_rag: Any) -> list[Any]:
-    """Return BYOK store definitions from raw Lightspeed YAML (list or section dict).
+    """Return BYOK store definitions from raw Lightspeed YAML.
 
-    Always returns a ``list`` suitable for :func:`enrich_byok_rag` (each item is
-    expected to be a mapping with ``.get``).
+    Only a YAML list is accepted as BYOK stores. Each list item is expected to be
+    a mapping with ``.get`` for :func:`enrich_byok_rag`.
 
-    For the section form ``byok_rag: { entries: ..., relevance_cutoff_score: ... }``,
-    only ``entries`` is used: a ``list`` is returned as-is, a single ``dict`` is
-    wrapped as a one-element list, and ``None``, strings, or other types yield
-    ``[]``.
+    Parameters:
+    ----------
+        raw_byok_rag (Any):
+            Raw parsed YAML value (may be a list, ``None``, or other types).
+
+    Returns:
+    -------
+        list[Any]:
+            The input list returned as-is when ``raw_byok_rag`` is a list;
+            otherwise an empty list (``[]`` for ``None`` or any non-list type).
     """
     if isinstance(raw_byok_rag, list):
         return raw_byok_rag
-    if isinstance(raw_byok_rag, dict):
-        entries = raw_byok_rag.get("entries")
-        if isinstance(entries, list):
-            return entries
-        if isinstance(entries, dict):
-            return [entries]
-        return []
     return []
 
 
diff --git a/src/models/config.py b/src/models/config.py
index 6ed0d0bd6..882d4b448 100644
--- a/src/models/config.py
+++ b/src/models/config.py
@@ -2,12 +2,13 @@
 
 # pylint: disable=too-many-lines
 
+import math
 import re
 from enum import Enum
 from functools import cached_property
 from pathlib import Path
 from re import Pattern
-from typing import Annotated, Any, Literal, Optional, Self
+from typing import Annotated, Any, Literal, Optional
 
 import jsonpath_ng
 import yaml
@@ -27,6 +28,7 @@
     model_validator,
 )
 from pydantic.dataclasses import dataclass
+from typing_extensions import Self  # noqa: UP035
 
 import constants
 from log import get_logger
@@ -1613,6 +1615,39 @@ class ByokRag(ConfigurationBase):
         description="Path to RAG database.",
     )
 
+    relevance_cutoff_score: float = Field(
+        constants.DEFAULT_BYOK_RAG_RELEVANCE_CUTOFF_SCORE,
+        ge=0,
+        title="BYOK inline RAG relevance cutoff",
+        description="Minimum raw similarity score from this **BYOK** vector store "
+        "before score_multiplier weighting. Chunks below this threshold are dropped "
+        "immediately after retrieval from this store only. Does not apply to OKP/Solr. "
+        "Set to 0.0 to disable filtering for this store.",
+    )
+
+    @field_validator("relevance_cutoff_score")
+    @classmethod
+    def validate_relevance_cutoff_score(cls, value: float) -> float:
+        """Reject non-finite values (e.g. ``.inf`` or ``.nan`` from YAML).
+
+        Parameters:
+        ----------
+            value: Cutoff after coercion to ``float``.
+
+        Returns:
+        -------
+            The same finite value.
+
+        Raises:
+        ------
+            ValueError: If ``value`` is not finite.
+        """
+        if not math.isfinite(value):
+            raise ValueError(
+                "relevance_cutoff_score must be a finite number (not inf, -inf, or nan)"
+            )
+        return value
+
     score_multiplier: float = Field(
         constants.DEFAULT_SCORE_MULTIPLIER,
         gt=0,
@@ -1624,50 +1659,25 @@ class ByokRag(ConfigurationBase):
 
 
 def _normalize_byok_rag_input(value: Any) -> Any:
-    """Allow legacy ``byok_rag: [ ... ]`` YAML alongside the section object form.
+    """Normalize ``byok_rag`` YAML to a list of :class:`ByokRag` definitions.
+
+    ``null`` becomes ``[]``. The value must otherwise be a YAML list of store
+    mappings (not a mapping at the top level).
 
-    Explicit YAML null (``byok_rag: null``) is normalized to an empty mapping so
-    :class:`ByokRagSection` field defaults apply instead of a type error.
+    Raises:
+        ValueError: If ``value`` is a dict or other non-list type (other than ``None``).
     """
     if value is None:
-        return {}
+        return []
     if isinstance(value, list):
-        return {"entries": value}
-    return value
-
-
-class ByokRagSection(ConfigurationBase):
-    """BYOK RAG configuration: registered BYOK stores and optional raw-score cutoff.
-
-    Settings here apply only to bring-your-own-knowledge vector stores listed in
-    ``entries``. They do not affect OKP (Solr) inline RAG, which uses separate
-    query parameters and defaults.
-    """
-
-    entries: list[ByokRag] = Field(
-        default_factory=list,
-        title="BYOK RAG stores",
-        description="Registered bring-your-own-knowledge vector stores.",
-    )
-
-    relevance_cutoff_score: float = Field(
-        constants.DEFAULT_BYOK_RAG_RELEVANCE_CUTOFF_SCORE,
-        ge=0,
-        title="BYOK inline RAG relevance cutoff",
-        description="Minimum raw similarity score from each **BYOK** vector store "
-        "before per-store score_multiplier weighting. Chunks below this threshold "
-        "are dropped immediately after retrieval from those stores only. Does not "
-        "apply to OKP/Solr. Set to 0.0 to disable filtering for BYOK.",
+        return value
+    raise ValueError(
+        "byok_rag must be a YAML list of BYOK store definitions, not a mapping."
     )
 
 
-def _default_byok_rag_section() -> ByokRagSection:
-    """Return default BYOK RAG section; delegates to :class:`ByokRagSection` field defaults."""
-    return ByokRagSection.model_validate({})
-
-
-ByokRagSectionValidated = Annotated[
-    ByokRagSection,
+ByokRagListValidated = Annotated[
+    list[ByokRag],
     BeforeValidator(_normalize_byok_rag_input),
 ]
 
@@ -1958,14 +1968,13 @@ class Configuration(ConfigurationBase):
         description="Conversation history configuration.",
     )
 
-    byok_rag: ByokRagSectionValidated = Field(
-        default_factory=_default_byok_rag_section,
+    byok_rag: ByokRagListValidated = Field(
+        default_factory=list,
         title="BYOK RAG configuration",
         description="BYOK RAG configuration. This configuration can be used to "
         "reconfigure Llama Stack through its run.yaml configuration file. "
-        "You may use the legacy form ``byok_rag: [ ... ]`` (a list of stores) or "
-        "an object with ``entries`` and optional ``relevance_cutoff_score`` "
-        "(BYOK inline RAG only; not used for OKP/Solr).",
+        "Use ``byok_rag: [ ... ]`` (a list of stores). Each store may set "
+        "``relevance_cutoff_score`` (BYOK inline RAG only; not used for OKP/Solr).",
     )
 
     a2a_state: A2AStateConfiguration = Field(
diff --git a/src/utils/responses.py b/src/utils/responses.py
index ba52cceac..3fa10b8e2 100644
--- a/src/utils/responses.py
+++ b/src/utils/responses.py
@@ -236,7 +236,7 @@ async def prepare_tools(  # pylint: disable=too-many-arguments,too-many-position
     #      If rag.inline is configured, but not rag.tool, tool RAG is disabled.
     #   3. All registered vector DBs: fallback when neither rag.tool nor rag.inline are configured.
     #      IDs fetched from llama-stack are already internal and need no translation.
-    byok_rags = configuration.configuration.byok_rag.entries
+    byok_rags = configuration.configuration.byok_rag
 
     is_tool_rag_enabled = len(configuration.configuration.rag.tool) > 0
     is_inline_rag_enabled = len(configuration.configuration.rag.inline) > 0
@@ -1708,7 +1708,7 @@ async def _resolve_client_tools(
     # Per-request override of vector stores (user-facing rag_ids)
     vector_store_ids = extract_vector_store_ids_from_tools(tools) or None
     # Translate user-facing rag_ids to llama-stack vector_store_ids in each file_search tool
-    byok_rags = configuration.configuration.byok_rag.entries
+    byok_rags = configuration.configuration.byok_rag
     prepared_tools = translate_tools_vector_store_ids(tools, byok_rags)
     prepared_tools = apply_mcp_headers_to_explicit_tools(
         prepared_tools, token, mcp_headers, request_headers
@@ -1803,7 +1803,7 @@ async def resolve_tool_choice(
         )
     else:
         # Pass tools explicitly configured for this request
-        byok_rags = configuration.configuration.byok_rag.entries
+        byok_rags = configuration.configuration.byok_rag
         prepared_tools = translate_tools_vector_store_ids(tools, byok_rags)
         prepared_tools = apply_mcp_headers_to_explicit_tools(
             prepared_tools, token, mcp_headers, request_headers
diff --git a/src/utils/vector_search.py b/src/utils/vector_search.py
index bea5a5604..4d3007e2a 100644
--- a/src/utils/vector_search.py
+++ b/src/utils/vector_search.py
@@ -87,7 +87,8 @@ def _extract_byok_rag_chunks(
         search_response: Response from vector_io.query
         vector_store_id: ID of the BYOK vector store that produced these results
         weight: Score multiplier to apply to this store's remaining results
-        byok_raw_cutoff_score: Minimum raw similarity score (``byok_rag.relevance_cutoff_score``)
+        byok_raw_cutoff_score: Minimum raw similarity score for this store
+            (``ByokRag.relevance_cutoff_score``).
 
     Returns:
         List of result dictionaries with weighted scores
@@ -347,16 +348,16 @@ def _process_solr_chunks_for_documents(
     return doc_ids_from_chunks
 
 
-async def _fetch_byok_rag(
+async def _fetch_byok_rag(  # pylint: disable=too-many-locals
     client: AsyncLlamaStackClient,
     query: str,
     vector_store_ids: Optional[list[str]] = None,  # User-facing
 ) -> tuple[list[RAGChunk], list[ReferencedDocument]]:
     """Fetch chunks and documents from BYOK RAG sources only.
 
-    Applies ``configuration.byok_rag.relevance_cutoff_score`` to raw scores from
-    BYOK vector stores. OKP/Solr inline RAG is implemented in ``_fetch_solr_rag``
-    and does not use that setting.
+    Applies each entry's ``relevance_cutoff_score`` (via ``relevance_cutoff_mapping``)
+    to raw scores from BYOK vector stores. OKP/Solr inline RAG is implemented in
+    ``_fetch_solr_rag`` and does not use that setting.
 
     Args:
         client: The AsyncLlamaStackClient to use for the request
@@ -388,7 +389,7 @@ async def _fetch_byok_rag(
 
     # Translate user-facing rag_ids to llama-stack ids
     vector_store_ids_to_query: list[str] = resolve_vector_store_ids(
-        rag_ids_to_query, configuration.configuration.byok_rag.entries
+        rag_ids_to_query, configuration.configuration.byok_rag
     )
 
     # Request-level override: filter out Solr store, use the rest
@@ -404,11 +405,12 @@ async def _fetch_byok_rag(
         return rag_chunks, referenced_documents
 
     try:
-        # Get score multiplier and rag_id mappings
+        # Get score multiplier, relevance cutoff, and rag_id mappings
         score_multiplier_mapping = configuration.score_multiplier_mapping
+        relevance_cutoff_mapping = configuration.relevance_cutoff_mapping
         rag_id_mapping = configuration.rag_id_mapping
 
-        # BYOK stores only: byok_rag.relevance_cutoff_score (OKP/Solr does not use it).
+        # BYOK stores only: per-entry relevance_cutoff_score (OKP/Solr does not use it).
         results_per_store = await asyncio.gather(
             *[
                 _query_store_for_byok_rag(
@@ -416,7 +418,10 @@ async def _fetch_byok_rag(
                     vector_store_id,
                     query,
                     score_multiplier_mapping.get(vector_store_id, 1.0),
-                    configuration.configuration.byok_rag.relevance_cutoff_score,
+                    relevance_cutoff_mapping.get(
+                        vector_store_id,
+                        constants.DEFAULT_BYOK_RAG_RELEVANCE_CUTOFF_SCORE,
+                    ),
                 )
                 for vector_store_id in vector_store_ids_to_query
             ]
@@ -543,9 +548,9 @@ async def build_rag_context(
 ) -> RAGContext:
     """Build RAG context by fetching and merging chunks from all enabled sources.
 
-    Enabled sources can be BYOK and/or Solr OKP. BYOK raw-score cutoff comes only
-    from ``byok_rag.relevance_cutoff_score``; OKP uses ``_build_query_params`` (e.g.
-    Solr ``score_threshold``), not the BYOK cutoff.
+    Enabled sources can be BYOK and/or Solr OKP. BYOK raw-score cutoffs come from
+    each ``ByokRag.relevance_cutoff_score``; OKP uses ``_build_query_params`` (e.g.
+    Solr ``score_threshold``), not the BYOK cutoffs.
 
     Args:
         client: The AsyncLlamaStackClient to use for the request
diff --git a/tests/integration/endpoints/test_query_byok_integration.py b/tests/integration/endpoints/test_query_byok_integration.py
index 5a43bca17..93ea0271d 100644
--- a/tests/integration/endpoints/test_query_byok_integration.py
+++ b/tests/integration/endpoints/test_query_byok_integration.py
@@ -254,6 +254,7 @@ def byok_config_fixture(test_config: AppConfig, mocker: MockerFixture) -> AppCon
     byok_entry.rag_id = "test-knowledge"
     byok_entry.vector_db_id = "vs-byok-knowledge"
     byok_entry.score_multiplier = 1.0
+    byok_entry.relevance_cutoff_score = 0.0
     byok_entry.model_dump.return_value = {
         "rag_id": "test-knowledge",
         "rag_type": "inline::faiss",
@@ -262,13 +263,11 @@ def byok_config_fixture(test_config: AppConfig, mocker: MockerFixture) -> AppCon
         "vector_db_id": "vs-byok-knowledge",
         "db_path": "/tmp/test-db",
         "score_multiplier": 1.0,
+        "relevance_cutoff_score": 0.0,
     }
 
     # Patch the loaded configuration's byok_rag and rag.inline
-    _br = mocker.MagicMock()
-    _br.entries = [byok_entry]
-    _br.relevance_cutoff_score = 0.0
-    test_config.configuration.byok_rag = _br
+    test_config.configuration.byok_rag = [byok_entry]
     test_config.configuration.rag.inline = ["test-knowledge"]
 
     return test_config
@@ -287,6 +286,7 @@ def byok_tool_config_fixture(
     byok_entry.rag_id = "test-knowledge"
     byok_entry.vector_db_id = "vs-byok-knowledge"
     byok_entry.score_multiplier = 1.0
+    byok_entry.relevance_cutoff_score = 0.0
     byok_entry.model_dump.return_value = {
         "rag_id": "test-knowledge",
         "rag_type": "inline::faiss",
@@ -295,12 +295,10 @@ def byok_tool_config_fixture(
         "vector_db_id": "vs-byok-knowledge",
         "db_path": "/tmp/test-db",
         "score_multiplier": 1.0,
+        "relevance_cutoff_score": 0.0,
     }
 
-    _br = mocker.MagicMock()
-    _br.entries = [byok_entry]
-    _br.relevance_cutoff_score = 0.0
-    test_config.configuration.byok_rag = _br
+    test_config.configuration.byok_rag = [byok_entry]
     test_config.configuration.rag.inline = []
     test_config.configuration.rag.tool = ["test-knowledge"]
 
@@ -425,16 +423,15 @@ async def test_query_byok_inline_rag_with_request_vector_store_ids(
     entry_a.rag_id = "source-a"
     entry_a.vector_db_id = "vs-source-a"
     entry_a.score_multiplier = 1.0
+    entry_a.relevance_cutoff_score = 0.0
 
     entry_b = mocker.MagicMock()
     entry_b.rag_id = "source-b"
     entry_b.vector_db_id = "vs-source-b"
     entry_b.score_multiplier = 1.0
+    entry_b.relevance_cutoff_score = 0.0
 
-    _br = mocker.MagicMock()
-    _br.entries = [entry_a, entry_b]
-    _br.relevance_cutoff_score = 0.0
-    test_config.configuration.byok_rag = _br
+    test_config.configuration.byok_rag = [entry_a, entry_b]
     test_config.configuration.rag.inline = ["source-a"]
 
     mock_holder_class = mocker.patch("app.endpoints.query.AsyncLlamaStackClientHolder")
@@ -493,12 +490,11 @@ async def test_query_byok_request_vector_store_ids_filters_configured_stores(
     entry_b.rag_id = "source-b"
     entry_b.vector_db_id = "vs-source-b"
     entry_b.score_multiplier = 1.0
+    entry_a.relevance_cutoff_score = 0.0
+    entry_b.relevance_cutoff_score = 0.0
 
     # Both sources are in config
-    _br = mocker.MagicMock()
-    _br.entries = [entry_a, entry_b]
-    _br.relevance_cutoff_score = 0.0
-    test_config.configuration.byok_rag = _br
+    test_config.configuration.byok_rag = [entry_a, entry_b]
     test_config.configuration.rag.inline = ["source-a", "source-b"]
 
     mock_holder_class = mocker.patch("app.endpoints.query.AsyncLlamaStackClientHolder")
@@ -696,10 +692,8 @@ async def test_query_byok_combined_inline_and_tool_rag(  # pylint: disable=too-m
     byok_entry.rag_id = "test-knowledge"
     byok_entry.vector_db_id = "vs-byok-knowledge"
     byok_entry.score_multiplier = 1.0
-    _br = mocker.MagicMock()
-    _br.entries = [byok_entry]
-    _br.relevance_cutoff_score = 0.0
-    test_config.configuration.byok_rag = _br
+    byok_entry.relevance_cutoff_score = 0.0
+    test_config.configuration.byok_rag = [byok_entry]
     test_config.configuration.rag.inline = ["test-knowledge"]
     test_config.configuration.rag.tool = ["test-knowledge"]
 
@@ -809,16 +803,15 @@ async def test_query_byok_inline_rag_only_configured_rag_id_is_queried(
     entry_a.rag_id = "source-a"
     entry_a.vector_db_id = "vs-source-a"
     entry_a.score_multiplier = 1.0
+    entry_a.relevance_cutoff_score = 0.0
 
     entry_b = mocker.MagicMock()
     entry_b.rag_id = "source-b"
     entry_b.vector_db_id = "vs-source-b"
     entry_b.score_multiplier = 1.0
+    entry_b.relevance_cutoff_score = 0.0
 
-    _br = mocker.MagicMock()
-    _br.entries = [entry_a, entry_b]
-    _br.relevance_cutoff_score = 0.0
-    test_config.configuration.byok_rag = _br
+    test_config.configuration.byok_rag = [entry_a, entry_b]
     test_config.configuration.rag.inline = ["source-a"]
 
     mock_holder_class = mocker.patch("app.endpoints.query.AsyncLlamaStackClientHolder")
@@ -883,16 +876,15 @@ async def test_query_byok_score_multiplier_shifts_chunk_priority(  # pylint: dis
     entry_a.rag_id = "source-a"
     entry_a.vector_db_id = "vs-source-a"
     entry_a.score_multiplier = 1.0
+    entry_a.relevance_cutoff_score = 0.0
 
     entry_b = mocker.MagicMock()
     entry_b.rag_id = "source-b"
     entry_b.vector_db_id = "vs-source-b"
     entry_b.score_multiplier = 5.0
+    entry_b.relevance_cutoff_score = 0.0
 
-    _br = mocker.MagicMock()
-    _br.entries = [entry_a, entry_b]
-    _br.relevance_cutoff_score = 0.0
-    test_config.configuration.byok_rag = _br
+    test_config.configuration.byok_rag = [entry_a, entry_b]
     test_config.configuration.rag.inline = ["source-a", "source-b"]
 
     mock_holder_class = mocker.patch("app.endpoints.query.AsyncLlamaStackClientHolder")
@@ -965,11 +957,9 @@ async def test_query_byok_cutoff_applied_before_multiplier(
     entry.rag_id = "cutoff-test"
     entry.vector_db_id = "vs-cutoff-test"
     entry.score_multiplier = 10.0
+    entry.relevance_cutoff_score = 0.5
 
-    _br = mocker.MagicMock()
-    _br.entries = [entry]
-    _br.relevance_cutoff_score = 0.5
-    test_config.configuration.byok_rag = _br
+    test_config.configuration.byok_rag = [entry]
     test_config.configuration.rag.inline = ["cutoff-test"]
 
     mock_holder_class = mocker.patch("app.endpoints.query.AsyncLlamaStackClientHolder")
@@ -998,8 +988,7 @@ async def test_query_byok_cutoff_applied_before_multiplier(
         mcp_headers={},
     )
 
-    assert response.rag_chunks is not None
-    assert len(response.rag_chunks) == 0
+    assert not response.rag_chunks
 
 
 # ==============================================================================
@@ -1028,11 +1017,9 @@ async def test_query_byok_max_chunks_caps_retrieved_results(  # pylint: disable=
     entry.rag_id = "big-source"
     entry.vector_db_id = "vs-big-source"
     entry.score_multiplier = 1.0
+    entry.relevance_cutoff_score = 0.0
 
-    _br = mocker.MagicMock()
-    _br.entries = [entry]
-    _br.relevance_cutoff_score = 0.0
-    test_config.configuration.byok_rag = _br
+    test_config.configuration.byok_rag = [entry]
     test_config.configuration.rag.inline = ["big-source"]
 
     mock_holder_class = mocker.patch("app.endpoints.query.AsyncLlamaStackClientHolder")
@@ -1103,11 +1090,10 @@ async def test_query_byok_max_chunks_caps_across_multiple_sources(  # pylint: di
     entry_b.rag_id = "source-b"
     entry_b.vector_db_id = "vs-source-b"
     entry_b.score_multiplier = 1.0
+    entry_a.relevance_cutoff_score = 0.0
+    entry_b.relevance_cutoff_score = 0.0
 
-    _br = mocker.MagicMock()
-    _br.entries = [entry_a, entry_b]
-    _br.relevance_cutoff_score = 0.0
-    test_config.configuration.byok_rag = _br
+    test_config.configuration.byok_rag = [entry_a, entry_b]
     test_config.configuration.rag.inline = ["source-a", "source-b"]
 
     mock_holder_class = mocker.patch("app.endpoints.query.AsyncLlamaStackClientHolder")
diff --git a/tests/integration/endpoints/test_streaming_query_byok_integration.py b/tests/integration/endpoints/test_streaming_query_byok_integration.py
index 8f963be11..dd53f2da7 100644
--- a/tests/integration/endpoints/test_streaming_query_byok_integration.py
+++ b/tests/integration/endpoints/test_streaming_query_byok_integration.py
@@ -236,6 +236,7 @@ def byok_config_fixture(test_config: AppConfig, mocker: MockerFixture) -> AppCon
     byok_entry.rag_id = "test-knowledge"
     byok_entry.vector_db_id = "vs-byok-knowledge"
     byok_entry.score_multiplier = 1.0
+    byok_entry.relevance_cutoff_score = 0.0
     byok_entry.model_dump.return_value = {
         "rag_id": "test-knowledge",
         "rag_type": "inline::faiss",
@@ -244,12 +245,10 @@ def byok_config_fixture(test_config: AppConfig, mocker: MockerFixture) -> AppCon
         "vector_db_id": "vs-byok-knowledge",
         "db_path": "/tmp/test-db",
         "score_multiplier": 1.0,
+        "relevance_cutoff_score": 0.0,
     }
 
-    _br = mocker.MagicMock()
-    _br.entries = [byok_entry]
-    _br.relevance_cutoff_score = 0.0
-    test_config.configuration.byok_rag = _br
+    test_config.configuration.byok_rag = [byok_entry]
     test_config.configuration.rag.inline = ["test-knowledge"]
 
     return test_config
@@ -264,6 +263,7 @@ def byok_tool_config_fixture(
     byok_entry.rag_id = "test-knowledge"
     byok_entry.vector_db_id = "vs-byok-knowledge"
     byok_entry.score_multiplier = 1.0
+    byok_entry.relevance_cutoff_score = 0.0
     byok_entry.model_dump.return_value = {
         "rag_id": "test-knowledge",
         "rag_type": "inline::faiss",
@@ -272,12 +272,10 @@ def byok_tool_config_fixture(
         "vector_db_id": "vs-byok-knowledge",
         "db_path": "/tmp/test-db",
         "score_multiplier": 1.0,
+        "relevance_cutoff_score": 0.0,
     }
 
-    _br = mocker.MagicMock()
-    _br.entries = [byok_entry]
-    _br.relevance_cutoff_score = 0.0
-    test_config.configuration.byok_rag = _br
+    test_config.configuration.byok_rag = [byok_entry]
     test_config.configuration.rag.inline = []
     test_config.configuration.rag.tool = ["test-knowledge"]
 
@@ -345,16 +343,15 @@ async def test_streaming_query_byok_inline_rag_with_request_vector_store_ids(
     entry_a.rag_id = "source-a"
     entry_a.vector_db_id = "vs-source-a"
     entry_a.score_multiplier = 1.0
+    entry_a.relevance_cutoff_score = 0.0
 
     entry_b = mocker.MagicMock()
     entry_b.rag_id = "source-b"
     entry_b.vector_db_id = "vs-source-b"
     entry_b.score_multiplier = 1.0
+    entry_b.relevance_cutoff_score = 0.0
 
-    _br = mocker.MagicMock()
-    _br.entries = [entry_a, entry_b]
-    _br.relevance_cutoff_score = 0.0
-    test_config.configuration.byok_rag = _br
+    test_config.configuration.byok_rag = [entry_a, entry_b]
     test_config.configuration.rag.inline = ["source-a"]
 
     mock_holder_class = mocker.patch(
@@ -412,16 +409,15 @@ async def test_streaming_query_byok_request_vector_store_ids_filters_configured_
     entry_a.rag_id = "source-a"
     entry_a.vector_db_id = "vs-source-a"
     entry_a.score_multiplier = 1.0
+    entry_a.relevance_cutoff_score = 0.0
 
     entry_b = mocker.MagicMock()
     entry_b.rag_id = "source-b"
     entry_b.vector_db_id = "vs-source-b"
     entry_b.score_multiplier = 1.0
+    entry_b.relevance_cutoff_score = 0.0
 
-    _br = mocker.MagicMock()
-    _br.entries = [entry_a, entry_b]
-    _br.relevance_cutoff_score = 0.0
-    test_config.configuration.byok_rag = _br
+    test_config.configuration.byok_rag = [entry_a, entry_b]
     test_config.configuration.rag.inline = ["source-a", "source-b"]
 
     mock_holder_class = mocker.patch(
@@ -701,10 +697,8 @@ async def test_streaming_query_byok_combined_inline_and_tool_rag(
     byok_entry.rag_id = "test-knowledge"
     byok_entry.vector_db_id = "vs-byok-knowledge"
     byok_entry.score_multiplier = 1.0
-    _br = mocker.MagicMock()
-    _br.entries = [byok_entry]
-    _br.relevance_cutoff_score = 0.0
-    test_config.configuration.byok_rag = _br
+    byok_entry.relevance_cutoff_score = 0.0
+    test_config.configuration.byok_rag = [byok_entry]
     test_config.configuration.rag.inline = ["test-knowledge"]
     test_config.configuration.rag.tool = ["test-knowledge"]
 
@@ -781,16 +775,15 @@ async def test_streaming_query_byok_only_configured_rag_id_is_queried(  # pylint
     entry_a.rag_id = "source-a"
     entry_a.vector_db_id = "vs-source-a"
     entry_a.score_multiplier = 1.0
+    entry_a.relevance_cutoff_score = 0.0
 
     entry_b = mocker.MagicMock()
     entry_b.rag_id = "source-b"
     entry_b.vector_db_id = "vs-source-b"
     entry_b.score_multiplier = 1.0
+    entry_b.relevance_cutoff_score = 0.0
 
-    _br = mocker.MagicMock()
-    _br.entries = [entry_a, entry_b]
-    _br.relevance_cutoff_score = 0.0
-    test_config.configuration.byok_rag = _br
+    test_config.configuration.byok_rag = [entry_a, entry_b]
     test_config.configuration.rag.inline = ["source-a"]
 
     mock_holder_class = mocker.patch(
@@ -863,16 +856,15 @@ async def test_streaming_query_byok_score_multiplier_shifts_priority(  # pylint:
     entry_a.rag_id = "source-a"
     entry_a.vector_db_id = "vs-source-a"
     entry_a.score_multiplier = 1.0
+    entry_a.relevance_cutoff_score = 0.0
 
     entry_b = mocker.MagicMock()
     entry_b.rag_id = "source-b"
     entry_b.vector_db_id = "vs-source-b"
     entry_b.score_multiplier = 5.0
+    entry_b.relevance_cutoff_score = 0.0
 
-    _br = mocker.MagicMock()
-    _br.entries = [entry_a, entry_b]
-    _br.relevance_cutoff_score = 0.0
-    test_config.configuration.byok_rag = _br
+    test_config.configuration.byok_rag = [entry_a, entry_b]
     test_config.configuration.rag.inline = ["source-a", "source-b"]
 
     mock_holder_class = mocker.patch(
@@ -953,11 +945,9 @@ async def test_streaming_query_byok_max_chunks_caps_context(  # pylint: disable=
     entry.rag_id = "big-source"
     entry.vector_db_id = "vs-big-source"
     entry.score_multiplier = 1.0
+    entry.relevance_cutoff_score = 0.0
 
-    _br = mocker.MagicMock()
-    _br.entries = [entry]
-    _br.relevance_cutoff_score = 0.0
-    test_config.configuration.byok_rag = _br
+    test_config.configuration.byok_rag = [entry]
     test_config.configuration.rag.inline = ["big-source"]
 
     mock_holder_class = mocker.patch(
@@ -1032,11 +1022,10 @@ async def test_streaming_query_byok_max_chunks_caps_across_multiple_sources(  #
     entry_b.rag_id = "source-b"
     entry_b.vector_db_id = "vs-source-b"
     entry_b.score_multiplier = 1.0
+    entry_a.relevance_cutoff_score = 0.0
+    entry_b.relevance_cutoff_score = 0.0
 
-    _br = mocker.MagicMock()
-    _br.entries = [entry_a, entry_b]
-    _br.relevance_cutoff_score = 0.0
-    test_config.configuration.byok_rag = _br
+    test_config.configuration.byok_rag = [entry_a, entry_b]
     test_config.configuration.rag.inline = ["source-a", "source-b"]
 
     mock_holder_class = mocker.patch(
diff --git a/tests/unit/app/endpoints/test_rags.py b/tests/unit/app/endpoints/test_rags.py
index f6247aefa..563c223fe 100644
--- a/tests/unit/app/endpoints/test_rags.py
+++ b/tests/unit/app/endpoints/test_rags.py
@@ -379,7 +379,7 @@ def __init__(self) -> None:
 def test_resolve_rag_id_to_vector_db_id_with_mapping(tmp_path: Path) -> None:
     """Test that _resolve_rag_id_to_vector_db_id maps rag_id to vector_db_id."""
     byok_config = _make_byok_config(str(tmp_path))
-    byok_rags = byok_config.configuration.byok_rag.entries
+    byok_rags = byok_config.configuration.byok_rag
     assert _resolve_rag_id_to_vector_db_id("ocp-4.18-docs", byok_rags) == "vs_abc123"
     assert _resolve_rag_id_to_vector_db_id("company-kb", byok_rags) == "vs_def456"
 
@@ -387,5 +387,5 @@ def test_resolve_rag_id_to_vector_db_id_with_mapping(tmp_path: Path) -> None:
 def test_resolve_rag_id_to_vector_db_id_passthrough(tmp_path: Path) -> None:
     """Test that unmapped IDs are passed through unchanged."""
     byok_config = _make_byok_config(str(tmp_path))
-    byok_rags = byok_config.configuration.byok_rag.entries
+    byok_rags = byok_config.configuration.byok_rag
     assert _resolve_rag_id_to_vector_db_id("vs_unknown", byok_rags) == "vs_unknown"
diff --git a/tests/unit/models/config/test_byok_rag.py b/tests/unit/models/config/test_byok_rag.py
index 4083dd6b7..d0fd88420 100644
--- a/tests/unit/models/config/test_byok_rag.py
+++ b/tests/unit/models/config/test_byok_rag.py
@@ -12,7 +12,7 @@
     DEFAULT_RAG_TYPE,
     DEFAULT_SCORE_MULTIPLIER,
 )
-from models.config import ByokRag, ByokRagSection, Configuration
+from models.config import ByokRag, Configuration
 
 
 def test_byok_rag_configuration_default_values() -> None:
@@ -37,6 +37,7 @@ def test_byok_rag_configuration_default_values() -> None:
     assert byok_rag.embedding_dimension == DEFAULT_EMBEDDING_DIMENSION
     assert byok_rag.vector_db_id == "vector_db_id"
     assert byok_rag.db_path == "tests/configuration/rag.txt"
+    assert byok_rag.relevance_cutoff_score == DEFAULT_BYOK_RAG_RELEVANCE_CUTOFF_SCORE
     assert byok_rag.score_multiplier == DEFAULT_SCORE_MULTIPLIER
 
 
@@ -192,8 +193,8 @@ def test_byok_rag_configuration_score_multiplier_must_be_positive() -> None:
         )
 
 
-def test_byok_rag_section_explicit_null_yields_defaults() -> None:
-    """``byok_rag: null`` in YAML normalizes to defaults (empty entries, default cutoff)."""
+def test_byok_rag_explicit_null_yields_empty_list() -> None:
+    """``byok_rag: null`` in YAML normalizes to an empty list."""
     cfg = Configuration.model_validate(
         {
             "name": "t",
@@ -208,16 +209,13 @@ def test_byok_rag_section_explicit_null_yields_defaults() -> None:
             "byok_rag": None,
         }
     )
-    assert cfg.byok_rag.entries == []
-    assert (
-        cfg.byok_rag.relevance_cutoff_score == DEFAULT_BYOK_RAG_RELEVANCE_CUTOFF_SCORE
-    )
+    assert cfg.byok_rag == []
 
 
-def test_byok_rag_section_object_and_legacy_list_in_configuration() -> None:
-    """``byok_rag`` accepts a section object or a legacy list of stores."""
+def test_byok_rag_list_in_configuration() -> None:
+    """``byok_rag`` is a YAML list of stores."""
     db_path = "tests/configuration/rag.txt"
-    from_list = Configuration.model_validate(
+    cfg = Configuration.model_validate(
         {
             "name": "t",
             "service": {"host": "localhost", "port": 8080},
@@ -237,43 +235,63 @@ def test_byok_rag_section_object_and_legacy_list_in_configuration() -> None:
             ],
         }
     )
-    assert from_list.byok_rag.entries[0].rag_id == "r1"
+    assert cfg.byok_rag[0].rag_id == "r1"
     assert (
-        from_list.byok_rag.relevance_cutoff_score
+        cfg.byok_rag[0].relevance_cutoff_score
         == DEFAULT_BYOK_RAG_RELEVANCE_CUTOFF_SCORE
     )
 
-    from_obj = Configuration.model_validate(
-        {
-            "name": "t",
-            "service": {"host": "localhost", "port": 8080},
-            "llama_stack": {
-                "api_key": "k",
-                "url": "http://x:1",
-                "use_as_library_client": False,
-            },
-            "user_data_collection": {},
-            "authentication": {"module": "noop"},
-            "byok_rag": {
-                "entries": [
-                    {
-                        "rag_id": "r2",
-                        "vector_db_id": "vs2",
-                        "db_path": db_path,
-                    },
-                ],
-                "relevance_cutoff_score": 0.42,
-            },
-        }
-    )
-    assert from_obj.byok_rag.entries[0].rag_id == "r2"
-    assert from_obj.byok_rag.relevance_cutoff_score == 0.42
 
+def test_byok_rag_rejects_top_level_mapping() -> None:
+    """A mapping (including ``entries``) at top level is not valid."""
+    db_path = "tests/configuration/rag.txt"
+    with pytest.raises(ValidationError):
+        Configuration.model_validate(
+            {
+                "name": "t",
+                "service": {"host": "localhost", "port": 8080},
+                "llama_stack": {
+                    "api_key": "k",
+                    "url": "http://x:1",
+                    "use_as_library_client": False,
+                },
+                "user_data_collection": {},
+                "authentication": {"module": "noop"},
+                "byok_rag": {
+                    "entries": [
+                        {
+                            "rag_id": "r2",
+                            "vector_db_id": "vs2",
+                            "db_path": db_path,
+                        },
+                    ],
+                    "relevance_cutoff_score": 0.42,
+                },
+            }
+        )
 
-def test_byok_rag_section_rejects_negative_cutoff() -> None:
-    """relevance_cutoff_score must be non-negative."""
+
+def test_byok_rag_rejects_negative_cutoff_on_entry() -> None:
+    """relevance_cutoff_score on each entry must be non-negative."""
     with pytest.raises(ValidationError):
-        _ = ByokRagSection(
-            entries=[],
+        _ = ByokRag(
+            rag_id="rag_id",
+            vector_db_id="vector_db_id",
+            db_path="tests/configuration/rag.txt",
             relevance_cutoff_score=-0.1,
         )
+
+
+@pytest.mark.parametrize(
+    "non_finite",
+    [float("inf"), float("-inf"), float("nan")],
+)
+def test_byok_rag_rejects_non_finite_relevance_cutoff(non_finite: float) -> None:
+    """relevance_cutoff_score must be finite (reject inf and nan)."""
+    with pytest.raises(ValidationError):
+        _ = ByokRag(
+            rag_id="rag_id",
+            vector_db_id="vector_db_id",
+            db_path="tests/configuration/rag.txt",
+            relevance_cutoff_score=non_finite,
+        )
diff --git a/tests/unit/models/config/test_dump_configuration.py b/tests/unit/models/config/test_dump_configuration.py
index 49184cf31..09a5cf79a 100644
--- a/tests/unit/models/config/test_dump_configuration.py
+++ b/tests/unit/models/config/test_dump_configuration.py
@@ -190,10 +190,7 @@ def test_dump_configuration(tmp_path: Path) -> None:
                 "sqlite": None,
                 "type": None,
             },
-            "byok_rag": {
-                "entries": [],
-                "relevance_cutoff_score": constants.DEFAULT_BYOK_RAG_RELEVANCE_CUTOFF_SCORE,
-            },
+            "byok_rag": [],
             "quota_handlers": {
                 "sqlite": None,
                 "postgres": None,
@@ -535,10 +532,7 @@ def test_dump_configuration_with_quota_limiters(tmp_path: Path) -> None:
                 "sqlite": None,
                 "type": None,
             },
-            "byok_rag": {
-                "entries": [],
-                "relevance_cutoff_score": constants.DEFAULT_BYOK_RAG_RELEVANCE_CUTOFF_SCORE,
-            },
+            "byok_rag": [],
             "quota_handlers": {
                 "sqlite": None,
                 "postgres": None,
@@ -772,10 +766,7 @@ def test_dump_configuration_with_quota_limiters_different_values(
                 "sqlite": None,
                 "type": None,
             },
-            "byok_rag": {
-                "entries": [],
-                "relevance_cutoff_score": constants.DEFAULT_BYOK_RAG_RELEVANCE_CUTOFF_SCORE,
-            },
+            "byok_rag": [],
             "quota_handlers": {
                 "sqlite": None,
                 "postgres": None,
@@ -989,20 +980,18 @@ def test_dump_configuration_byok(tmp_path: Path) -> None:
                 "sqlite": None,
                 "type": None,
             },
-            "byok_rag": {
-                "entries": [
-                    {
-                        "db_path": "tests/configuration/rag.txt",
-                        "embedding_dimension": 768,
-                        "embedding_model": "sentence-transformers/all-mpnet-base-v2",
-                        "rag_id": "rag_id",
-                        "rag_type": "inline::faiss",
-                        "vector_db_id": "vector_db_id",
-                        "score_multiplier": 1.0,
-                    },
-                ],
-                "relevance_cutoff_score": constants.DEFAULT_BYOK_RAG_RELEVANCE_CUTOFF_SCORE,
-            },
+            "byok_rag": [
+                {
+                    "db_path": "tests/configuration/rag.txt",
+                    "embedding_dimension": 768,
+                    "embedding_model": "sentence-transformers/all-mpnet-base-v2",
+                    "rag_id": "rag_id",
+                    "rag_type": "inline::faiss",
+                    "vector_db_id": "vector_db_id",
+                    "relevance_cutoff_score": constants.DEFAULT_BYOK_RAG_RELEVANCE_CUTOFF_SCORE,
+                    "score_multiplier": 1.0,
+                },
+            ],
             "quota_handlers": {
                 "sqlite": None,
                 "postgres": None,
@@ -1196,10 +1185,7 @@ def test_dump_configuration_pg_namespace(tmp_path: Path) -> None:
                 "sqlite": None,
                 "type": None,
             },
-            "byok_rag": {
-                "entries": [],
-                "relevance_cutoff_score": constants.DEFAULT_BYOK_RAG_RELEVANCE_CUTOFF_SCORE,
-            },
+            "byok_rag": [],
             "quota_handlers": {
                 "sqlite": None,
                 "postgres": None,
diff --git a/tests/unit/telemetry/conftest.py b/tests/unit/telemetry/conftest.py
index e7ef44255..6b2db6a82 100644
--- a/tests/unit/telemetry/conftest.py
+++ b/tests/unit/telemetry/conftest.py
@@ -12,7 +12,6 @@
     Action,
     AuthenticationConfiguration,
     AuthorizationConfiguration,
-    ByokRagSection,
     Configuration,
     CORSConfiguration,
     Customization,
@@ -288,7 +287,7 @@ def build_fully_populated_config() -> Configuration:
             ),
         ],
         conversation_cache=None,
-        byok_rag=ByokRagSection(entries=[]),
+        byok_rag=[],
         a2a_state=None,
         quota_handlers=None,
         azure_entra_id=None,
@@ -364,7 +363,7 @@ def build_minimal_config() -> Configuration:
         ),
         mcp_servers=[],
         conversation_cache=None,
-        byok_rag=ByokRagSection(entries=[]),
+        byok_rag=[],
         a2a_state=None,
         quota_handlers=None,
         azure_entra_id=None,
diff --git a/tests/unit/test_configuration.py b/tests/unit/test_configuration.py
index 798fe89f4..f0212fe46 100644
--- a/tests/unit/test_configuration.py
+++ b/tests/unit/test_configuration.py
@@ -1186,6 +1186,45 @@ def test_score_multiplier_mapping_with_custom_values(tmp_path: Path) -> None:
     assert cfg.score_multiplier_mapping == {"vs-001": 1.5, "vs-002": 0.75}
 
 
+def test_relevance_cutoff_mapping_first_wins_duplicate_vector_db_id(
+    tmp_path: Path,
+) -> None:
+    """First BYOK entry wins when two entries share the same vector_db_id."""
+    db_file1 = tmp_path / "test1.db"
+    db_file1.touch()
+    db_file2 = tmp_path / "test2.db"
+    db_file2.touch()
+    cfg = AppConfig()
+    cfg.init_from_dict(
+        {
+            "name": "test",
+            "service": {"host": "localhost", "port": 8080},
+            "llama_stack": {
+                "api_key": "k",
+                "url": "http://test.com:1234",
+                "use_as_library_client": False,
+            },
+            "user_data_collection": {},
+            "authentication": {"module": "noop"},
+            "byok_rag": [
+                {
+                    "rag_id": "kb1",
+                    "vector_db_id": "vs-dup",
+                    "db_path": str(db_file1),
+                    "relevance_cutoff_score": 0.2,
+                },
+                {
+                    "rag_id": "kb2",
+                    "vector_db_id": "vs-dup",
+                    "db_path": str(db_file2),
+                    "relevance_cutoff_score": 0.9,
+                },
+            ],
+        }
+    )
+    assert cfg.relevance_cutoff_mapping == {"vs-dup": 0.2}
+
+
 def test_score_multiplier_mapping_not_loaded() -> None:
     """Test that score_multiplier_mapping raises when config not loaded."""
     cfg = AppConfig()
diff --git a/tests/unit/test_llama_stack_configuration.py b/tests/unit/test_llama_stack_configuration.py
index 6d14a5a8d..cffa8f0da 100644
--- a/tests/unit/test_llama_stack_configuration.py
+++ b/tests/unit/test_llama_stack_configuration.py
@@ -30,37 +30,16 @@
 # =============================================================================
 
 
-def test_raw_byok_rag_store_list_legacy_list() -> None:
-    """Bare list form returns the same sequence."""
+def test_raw_byok_rag_store_list_returns_list_as_is() -> None:
+    """YAML list form returns the same sequence."""
     raw = [{"rag_id": "a"}]
     assert _raw_byok_rag_store_list(raw) is raw
 
 
-def test_raw_byok_rag_store_list_section_with_entries_list() -> None:
-    """Section dict with entries list returns that list."""
-    entries = [{"rag_id": "x"}]
-    assert (
-        _raw_byok_rag_store_list({"relevance_cutoff_score": 0.3, "entries": entries})
-        is entries
-    )
-
-
-def test_raw_byok_rag_store_list_section_with_single_entry_dict() -> None:
-    """Section dict may use a single mapping for entries (wrap as one-element list)."""
-    one = {"rag_id": "solo", "vector_db_id": "vs1", "db_path": "/d.db"}
-    out = _raw_byok_rag_store_list({"entries": one})
-    assert out == [one]
-
-
-def test_raw_byok_rag_store_list_section_entries_invalid_or_missing() -> None:
-    """None, strings, or missing entries yield an empty list."""
-    assert _raw_byok_rag_store_list({"entries": None}) == []
-    assert _raw_byok_rag_store_list({"entries": "not-a-list"}) == []
+def test_raw_byok_rag_store_list_non_list_yields_empty() -> None:
+    """Only a list is used; mappings and other types yield []."""
+    assert _raw_byok_rag_store_list({"entries": [{"rag_id": "x"}]}) == []
     assert _raw_byok_rag_store_list({}) == []
-
-
-def test_raw_byok_rag_store_list_non_sequence_top_level() -> None:
-    """Non-list, non-dict input yields []."""
     assert _raw_byok_rag_store_list(None) == []
     assert _raw_byok_rag_store_list("byok") == []
 
diff --git a/tests/unit/utils/test_responses.py b/tests/unit/utils/test_responses.py
index 9ae98a225..add19be23 100644
--- a/tests/unit/utils/test_responses.py
+++ b/tests/unit/utils/test_responses.py
@@ -90,12 +90,14 @@
 )
 
 
-def _byok_rag_section_mock(mocker: MockerFixture, entries: list[Any]) -> Any:
-    """Mock ``configuration.byok_rag`` with ``entries`` and default cutoff."""
-    section = mocker.Mock()
-    section.entries = entries
-    section.relevance_cutoff_score = 0.0
-    return section
+def _byok_rag_list_mock(entries: list[Any]) -> list[Any]:
+    """Mock ``configuration.byok_rag`` as a list of entries."""
+    for entry in entries:
+        if not isinstance(getattr(entry, "relevance_cutoff_score", None), (int, float)):
+            entry.relevance_cutoff_score = (
+                constants.DEFAULT_BYOK_RAG_RELEVANCE_CUTOFF_SCORE
+            )
+    return entries
 
 
 class MockOutputItem:  # pylint: disable=too-few-public-methods
@@ -1651,9 +1653,7 @@ async def test_translates_byok_ids_in_prepare_tools(
         mock_byok_rag.rag_id = "ocp_docs"
         mock_byok_rag.vector_db_id = "vs-001"
         mock_config = mocker.Mock()
-        mock_config.configuration.byok_rag = _byok_rag_section_mock(
-            mocker, [mock_byok_rag]
-        )
+        mock_config.configuration.byok_rag = _byok_rag_list_mock([mock_byok_rag])
         mock_config.configuration.rag.tool = []
         mock_config.configuration.rag.inline = []
         mocker.patch("utils.responses.configuration", mock_config)
@@ -1674,7 +1674,7 @@ async def test_passes_through_unknown_ids_in_prepare_tools(
 
         # Configure empty BYOK RAG
         mock_config = mocker.Mock()
-        mock_config.configuration.byok_rag = _byok_rag_section_mock(mocker, [])
+        mock_config.configuration.byok_rag = _byok_rag_list_mock([])
         mock_config.configuration.rag.tool = []
         mock_config.configuration.rag.inline = []
         mocker.patch("utils.responses.configuration", mock_config)
@@ -1704,9 +1704,7 @@ async def test_does_not_translate_when_ids_fetched_from_llama_stack(
         mock_byok_rag.rag_id = "vs-internal"
         mock_byok_rag.vector_db_id = "vs-translated"
         mock_config = mocker.Mock()
-        mock_config.configuration.byok_rag = _byok_rag_section_mock(
-            mocker, [mock_byok_rag]
-        )
+        mock_config.configuration.byok_rag = _byok_rag_list_mock([mock_byok_rag])
         mock_config.configuration.rag.tool = []
         mock_config.configuration.rag.inline = []
         mocker.patch("utils.responses.configuration", mock_config)
@@ -1729,7 +1727,7 @@ async def test_uses_rag_tool_config_when_no_per_request_ids(
         mocker.patch("utils.responses.get_mcp_tools", return_value=None)
 
         mock_config = mocker.Mock()
-        mock_config.configuration.byok_rag = _byok_rag_section_mock(mocker, [])
+        mock_config.configuration.byok_rag = _byok_rag_list_mock([])
         mock_config.configuration.rag.tool = ["rag-tool-id-1", "rag-tool-id-2"]
         mock_config.configuration.rag.inline = []
         mocker.patch("utils.responses.configuration", mock_config)
@@ -1754,9 +1752,7 @@ async def test_rag_tool_config_ids_are_translated(
         mock_byok_rag.rag_id = "ocp_docs"
         mock_byok_rag.vector_db_id = "vs-001"
         mock_config = mocker.Mock()
-        mock_config.configuration.byok_rag = _byok_rag_section_mock(
-            mocker, [mock_byok_rag]
-        )
+        mock_config.configuration.byok_rag = _byok_rag_list_mock([mock_byok_rag])
         mock_config.configuration.rag.tool = ["ocp_docs"]
         mock_config.configuration.rag.inline = []
         mocker.patch("utils.responses.configuration", mock_config)
@@ -1774,7 +1770,7 @@ async def test_inline_rag_disables_tool_rag(self, mocker: MockerFixture) -> None
         mocker.patch("utils.responses.get_mcp_tools", return_value=None)
 
         mock_config = mocker.Mock()
-        mock_config.configuration.byok_rag = _byok_rag_section_mock(mocker, [])
+        mock_config.configuration.byok_rag = _byok_rag_list_mock([])
         mock_config.configuration.rag.tool = []
         mock_config.configuration.rag.inline = [
             "inline-store-id"
@@ -1796,7 +1792,7 @@ async def test_per_request_ids_override_rag_tool_config(
         mocker.patch("utils.responses.get_mcp_tools", return_value=None)
 
         mock_config = mocker.Mock()
-        mock_config.configuration.byok_rag = _byok_rag_section_mock(mocker, [])
+        mock_config.configuration.byok_rag = _byok_rag_list_mock([])
         mock_config.configuration.rag.tool = ["config-id-1"]
         mock_config.configuration.rag.inline = []
         mocker.patch("utils.responses.configuration", mock_config)
@@ -1821,7 +1817,7 @@ async def test_all_registered_dbs_used_when_neither_tool_nor_inline_configured(
         mocker.patch("utils.responses.get_mcp_tools", return_value=None)
 
         mock_config = mocker.Mock()
-        mock_config.configuration.byok_rag = _byok_rag_section_mock(mocker, [])
+        mock_config.configuration.byok_rag = _byok_rag_list_mock([])
         mock_config.configuration.rag.tool = []
         mock_config.configuration.rag.inline = []
         mocker.patch("utils.responses.configuration", mock_config)
@@ -3369,7 +3365,7 @@ async def test_client_tools_without_merge_header(
             return_value=mock_holder,
         )
         mock_config = mocker.Mock()
-        mock_config.configuration.byok_rag = _byok_rag_section_mock(mocker, [])
+        mock_config.configuration.byok_rag = _byok_rag_list_mock([])
         mock_config.mcp_servers = []
         mocker.patch("utils.responses.configuration", mock_config)
 
@@ -3394,7 +3390,7 @@ async def test_client_tools_with_merge_header(self, mocker: MockerFixture) -> No
             return_value=mock_holder,
         )
         mock_config = mocker.Mock()
-        mock_config.configuration.byok_rag = _byok_rag_section_mock(mocker, [])
+        mock_config.configuration.byok_rag = _byok_rag_list_mock([])
         mock_config.mcp_servers = []
         mocker.patch("utils.responses.configuration", mock_config)
 
@@ -3431,7 +3427,7 @@ async def test_merge_header_conflict_raises_409(
             return_value=mock_holder,
         )
         mock_config = mocker.Mock()
-        mock_config.configuration.byok_rag = _byok_rag_section_mock(mocker, [])
+        mock_config.configuration.byok_rag = _byok_rag_list_mock([])
         mock_config.mcp_servers = []
         mocker.patch("utils.responses.configuration", mock_config)
 
@@ -3490,7 +3486,7 @@ async def test_merge_header_no_server_tools_returns_client_only(
             return_value=mock_holder,
         )
         mock_config = mocker.Mock()
-        mock_config.configuration.byok_rag = _byok_rag_section_mock(mocker, [])
+        mock_config.configuration.byok_rag = _byok_rag_list_mock([])
         mock_config.mcp_servers = []
         mocker.patch("utils.responses.configuration", mock_config)
         mocker.patch(
diff --git a/tests/unit/utils/test_vector_search.py b/tests/unit/utils/test_vector_search.py
index 7e2752dfd..47dddbdd4 100644
--- a/tests/unit/utils/test_vector_search.py
+++ b/tests/unit/utils/test_vector_search.py
@@ -25,12 +25,14 @@
 )
 
 
-def _byok_rag_section_mock(mocker: MockerFixture, entries: list[Any]) -> Any:
-    """Build a mock ``byok_rag`` section with ``entries`` and cutoff."""
-    section = mocker.Mock()
-    section.entries = entries
-    section.relevance_cutoff_score = 0.0
-    return section
+def _byok_rag_list_mock(_mocker: MockerFixture, entries: list[Any]) -> list[Any]:
+    """Normalize mocked BYOK entries for ``configuration.byok_rag``."""
+    for entry in entries:
+        if not isinstance(getattr(entry, "relevance_cutoff_score", None), (int, float)):
+            entry.relevance_cutoff_score = (
+                constants.DEFAULT_BYOK_RAG_RELEVANCE_CUTOFF_SCORE
+            )
+    return entries
 
 
 class TestIsSolrEnabled:
@@ -424,7 +426,7 @@ async def test_byok_no_inline_ids(self, mocker: MockerFixture) -> None:
         """Test when no inline BYOK sources are configured."""
         config_mock = mocker.Mock(spec=AppConfig)
         config_mock.configuration.rag.inline = []
-        config_mock.configuration.byok_rag = _byok_rag_section_mock(mocker, [])
+        config_mock.configuration.byok_rag = _byok_rag_list_mock(mocker, [])
         mocker.patch("utils.vector_search.configuration", config_mock)
 
         client_mock = mocker.AsyncMock()
@@ -443,10 +445,11 @@ async def test_byok_enabled_success(self, mocker: MockerFixture) -> None:
         byok_rag_mock.rag_id = "rag_1"
         byok_rag_mock.vector_db_id = "vs_1"
         config_mock.configuration.rag.inline = ["rag_1"]
-        config_mock.configuration.byok_rag = _byok_rag_section_mock(
+        config_mock.configuration.byok_rag = _byok_rag_list_mock(
             mocker, [byok_rag_mock]
         )
         config_mock.score_multiplier_mapping = {"vs_1": 1.5}
+        config_mock.relevance_cutoff_mapping = {"vs_1": 0.0}
         config_mock.rag_id_mapping = {"vs_1": "rag_1"}
         mocker.patch("utils.vector_search.configuration", config_mock)
 
@@ -483,11 +486,12 @@ async def test_user_facing_ids_translated_to_internal_ids(
         byok_rag_mock = mocker.Mock()
         byok_rag_mock.rag_id = "my-kb"
         byok_rag_mock.vector_db_id = "vs-internal-001"
-        config_mock.configuration.byok_rag = _byok_rag_section_mock(
+        config_mock.configuration.byok_rag = _byok_rag_list_mock(
             mocker, [byok_rag_mock]
         )
         config_mock.configuration.rag.inline = ["my-kb"]
         config_mock.score_multiplier_mapping = {"vs-internal-001": 1.0}
+        config_mock.relevance_cutoff_mapping = {"vs-internal-001": 0.0}
         config_mock.rag_id_mapping = {"vs-internal-001": "my-kb"}
         mocker.patch("utils.vector_search.configuration", config_mock)
 
@@ -525,11 +529,12 @@ async def test_multiple_user_facing_ids_each_translated(
         byok_rag_2 = mocker.Mock()
         byok_rag_2.rag_id = "kb-part2"
         byok_rag_2.vector_db_id = "vs-bbb-222"
-        config_mock.configuration.byok_rag = _byok_rag_section_mock(
+        config_mock.configuration.byok_rag = _byok_rag_list_mock(
             mocker, [byok_rag_1, byok_rag_2]
         )
         config_mock.configuration.rag.inline = ["kb-part1", "kb-part2"]
         config_mock.score_multiplier_mapping = {"vs-aaa-111": 1.0, "vs-bbb-222": 1.0}
+        config_mock.relevance_cutoff_mapping = {"vs-aaa-111": 0.0, "vs-bbb-222": 0.0}
         config_mock.rag_id_mapping = {
             "vs-aaa-111": "kb-part1",
             "vs-bbb-222": "kb-part2",
@@ -570,7 +575,7 @@ async def test_no_inline_rag_configured_skips_byok(
         """Test that BYOK inline RAG is skipped when rag.inline is empty."""
         config_mock = mocker.Mock(spec=AppConfig)
         config_mock.configuration.rag.inline = []
-        config_mock.configuration.byok_rag = _byok_rag_section_mock(mocker, [])
+        config_mock.configuration.byok_rag = _byok_rag_list_mock(mocker, [])
         mocker.patch("utils.vector_search.configuration", config_mock)
 
         client_mock = mocker.AsyncMock()
@@ -590,7 +595,7 @@ async def test_request_id_not_in_inline_config_skips_byok(
         """Test that a request vector_store_id not registered in rag.inline is filtered out."""
         config_mock = mocker.Mock(spec=AppConfig)
         config_mock.configuration.rag.inline = ["registered-id"]
-        config_mock.configuration.byok_rag = _byok_rag_section_mock(mocker, [])
+        config_mock.configuration.byok_rag = _byok_rag_list_mock(mocker, [])
         mocker.patch("utils.vector_search.configuration", config_mock)
 
         client_mock = mocker.AsyncMock()
@@ -612,12 +617,13 @@ async def test_byok_relevance_cutoff_drops_low_raw_scores(
         byok_rag_mock = mocker.Mock()
         byok_rag_mock.rag_id = "rag_1"
         byok_rag_mock.vector_db_id = "vs_1"
-        section = mocker.Mock()
-        section.entries = [byok_rag_mock]
-        section.relevance_cutoff_score = 0.5
+        byok_rag_mock.relevance_cutoff_score = 0.5
         config_mock.configuration.rag.inline = ["rag_1"]
-        config_mock.configuration.byok_rag = section
+        config_mock.configuration.byok_rag = _byok_rag_list_mock(
+            mocker, [byok_rag_mock]
+        )
         config_mock.score_multiplier_mapping = {"vs_1": 1.0}
+        config_mock.relevance_cutoff_mapping = {"vs_1": 0.5}
         config_mock.rag_id_mapping = {"vs_1": "rag_1"}
         mocker.patch("utils.vector_search.configuration", config_mock)
 
@@ -700,7 +706,7 @@ async def test_both_sources_disabled(self, mocker: MockerFixture) -> None:
         """Test when both BYOK inline and Solr inline are not configured."""
         config_mock = mocker.Mock(spec=AppConfig)
         config_mock.configuration.rag.inline = []
-        config_mock.configuration.byok_rag = _byok_rag_section_mock(mocker, [])
+        config_mock.configuration.byok_rag = _byok_rag_list_mock(mocker, [])
         config_mock.inline_solr_enabled = False
         mocker.patch("utils.vector_search.configuration", config_mock)
 
@@ -720,11 +726,12 @@ async def test_byok_enabled_only(self, mocker: MockerFixture) -> None:
         byok_rag_mock.rag_id = "rag_1"
         byok_rag_mock.vector_db_id = "vs_1"
         config_mock.configuration.rag.inline = ["rag_1"]
-        config_mock.configuration.byok_rag = _byok_rag_section_mock(
+        config_mock.configuration.byok_rag = _byok_rag_list_mock(
             mocker, [byok_rag_mock]
         )
         config_mock.inline_solr_enabled = False
         config_mock.score_multiplier_mapping = {"vs_1": 1.0}
+        config_mock.relevance_cutoff_mapping = {"vs_1": 0.0}
         config_mock.rag_id_mapping = {"vs_1": "rag_1"}
         mocker.patch("utils.vector_search.configuration", config_mock)
 

From 579c17e3712d151b8a21b011873cebda65de46c1 Mon Sep 17 00:00:00 2001
From: Sergey Yedrikov <syedriko@redhat.com>
Date: Fri, 17 Apr 2026 21:20:24 -0400
Subject: [PATCH 3/3] Cutoff score passed as score_threshold

---
 src/utils/responses.py                        |  92 +++++++++--
 src/utils/vector_search.py                    |  41 ++---
 .../endpoints/test_query_byok_integration.py  |  13 +-
 tests/unit/utils/test_responses.py            | 156 ++++++++++++++++--
 tests/unit/utils/test_vector_search.py        |  33 ++--
 5 files changed, 264 insertions(+), 71 deletions(-)

diff --git a/src/utils/responses.py b/src/utils/responses.py
index 3fa10b8e2..bc0d8570e 100644
--- a/src/utils/responses.py
+++ b/src/utils/responses.py
@@ -7,7 +7,7 @@
 from typing import Any, Optional, cast
 
 from fastapi import HTTPException
-from llama_stack_api import OpenAIResponseObject
+from llama_stack_api import OpenAIResponseObject, SearchRankingOptions
 from llama_stack_api.openai_responses import (
     OpenAIResponseContentPartRefusal as ContentPartRefusal,
 )
@@ -251,7 +251,7 @@ async def prepare_tools(  # pylint: disable=too-many-arguments,too-many-position
         effective_ids = await get_vector_store_ids(client, None)
 
     # Add RAG tools if vector stores are available
-    rag_tools = get_rag_tools(effective_ids)
+    rag_tools = get_rag_tools(effective_ids, byok_rags)
     if rag_tools:
         toolgroups.extend(rag_tools)
 
@@ -639,11 +639,52 @@ def resolve_vector_store_ids(
     return [rag_id_to_vector_db_id.get(vs_id, vs_id) for vs_id in vector_store_ids]
 
 
+def file_search_score_threshold_for_store(
+    vector_store_id: str, byok_rags: list[ByokRag]
+) -> float:
+    """Return the configured relevance cutoff for one resolved vector store (tool RAG).
+
+    Parameters:
+    ----------
+        vector_store_id: Llama-stack vector store ID (e.g. BYOK ``vector_db_id``).
+        byok_rags: BYOK configuration entries.
+
+    Returns:
+    -------
+        Solr/OKP default for the Solr store ID; ``relevance_cutoff_score`` for
+        configured BYOK stores; otherwise ``DEFAULT_BYOK_RAG_RELEVANCE_CUTOFF_SCORE``.
+    """
+    vector_db_to_cutoff = {
+        br.vector_db_id: br.relevance_cutoff_score for br in byok_rags
+    }
+    if vector_store_id == constants.SOLR_DEFAULT_VECTOR_STORE_ID:
+        return constants.SOLR_VECTOR_SEARCH_DEFAULT_SCORE_THRESHOLD
+    if vector_store_id in vector_db_to_cutoff:
+        return vector_db_to_cutoff[vector_store_id]
+    return constants.DEFAULT_BYOK_RAG_RELEVANCE_CUTOFF_SCORE
+
+
+def file_search_ranking_options_for_store(
+    vector_store_id: str, byok_rags: list[ByokRag]
+) -> SearchRankingOptions:
+    """Build ``SearchRankingOptions`` with ``score_threshold`` for a single store."""
+    return SearchRankingOptions(
+        score_threshold=file_search_score_threshold_for_store(
+            vector_store_id, byok_rags
+        )
+    )
+
+
 def translate_tools_vector_store_ids(
     tools: list[InputTool], byok_rags: list[ByokRag]
 ) -> list[InputTool]:
     """Translate user-facing vector_store_ids to llama-stack IDs in each file_search tool.
 
+    Each resolved store becomes its own ``file_search`` tool with that store's
+    ``ranking_options.score_threshold``. If the client omits ``ranking_options``,
+    the store cutoff is used; if present, the effective threshold is
+    ``max(client score_threshold, store_cutoff)``.
+
     Parameters:
     ----------
         tools: List of request tools (may contain file_search with user-facing IDs).
@@ -651,27 +692,56 @@ def translate_tools_vector_store_ids(
 
     Returns:
     -------
-        New list of tools with file_search vector_store_ids translated; other tools
+        New list of tools with file_search entries expanded per store; other tools
         unchanged.
     """
     result: list[InputTool] = []
     for tool in tools:
-        if tool.type == "file_search":
-            resolved_ids = resolve_vector_store_ids(tool.vector_store_ids, byok_rags)
-            result.append(tool.model_copy(update={"vector_store_ids": resolved_ids}))
-        else:
+        if tool.type != "file_search":
             result.append(tool)
+            continue
+
+        file_tool = cast(InputToolFileSearch, tool)
+        resolved_ids = resolve_vector_store_ids(file_tool.vector_store_ids, byok_rags)
+
+        for vs_id in resolved_ids:
+            store_cutoff = file_search_score_threshold_for_store(vs_id, byok_rags)
+            if file_tool.ranking_options is None:
+                ranking = file_search_ranking_options_for_store(vs_id, byok_rags)
+            else:
+                client_ro = file_tool.ranking_options
+                client_st = client_ro.score_threshold
+                client_val = 0.0 if client_st is None else float(client_st)
+                ranking = client_ro.model_copy(
+                    update={"score_threshold": max(client_val, store_cutoff)}
+                )
+            result.append(
+                InputToolFileSearch(
+                    type="file_search",
+                    vector_store_ids=[vs_id],
+                    max_num_results=file_tool.max_num_results,
+                    filters=file_tool.filters,
+                    ranking_options=ranking,
+                )
+            )
+
     return result
 
 
-def get_rag_tools(vector_store_ids: list[str]) -> Optional[list[InputToolFileSearch]]:
+def get_rag_tools(
+    vector_store_ids: list[str], byok_rags: list[ByokRag]
+) -> Optional[list[InputToolFileSearch]]:
     """Convert vector store IDs to tools format for Responses API.
 
+    Emits one ``file_search`` tool per store so each has its own
+    ``ranking_options.score_threshold``.
+
     Args:
         vector_store_ids: List of vector store identifiers
+        byok_rags: BYOK RAG entries used to set ``ranking_options.score_threshold``
 
     Returns:
-        List containing file_search tool configuration, or empty list if no stores available
+        List containing file_search tool configurations, or empty list if no stores available
     """
     if vector_store_ids == []:
         return []
@@ -679,9 +749,11 @@ def get_rag_tools(vector_store_ids: list[str]) -> Optional[list[InputToolFileSea
     return [
         InputToolFileSearch(
             type="file_search",
-            vector_store_ids=vector_store_ids,
+            vector_store_ids=[vs_id],
             max_num_results=constants.TOOL_RAG_MAX_CHUNKS,
+            ranking_options=file_search_ranking_options_for_store(vs_id, byok_rags),
         )
+        for vs_id in vector_store_ids
     ]
 
 
diff --git a/src/utils/vector_search.py b/src/utils/vector_search.py
index 4d3007e2a..eec1c5ea0 100644
--- a/src/utils/vector_search.py
+++ b/src/utils/vector_search.py
@@ -76,19 +76,17 @@ def _extract_byok_rag_chunks(
     search_response: Any,
     vector_store_id: str,
     weight: float,
-    byok_raw_cutoff_score: float,
 ) -> list[dict[str, Any]]:
     """Extract and weight result chunks from vector search for BYOK RAG only.
 
-    Chunks whose raw retrieval score is below ``byok_raw_cutoff_score`` are
-    dropped before applying the per-store weight. This is not used for OKP/Solr.
+    Raw-score filtering uses ``score_threshold`` in :meth:`vector_io.query`
+    params (see :func:`_query_store_for_byok_rag`); this only maps chunks to
+    weighted result dicts. Not used for OKP/Solr.
 
     Args:
         search_response: Response from vector_io.query
         vector_store_id: ID of the BYOK vector store that produced these results
         weight: Score multiplier to apply to this store's remaining results
-        byok_raw_cutoff_score: Minimum raw similarity score for this store
-            (``ByokRag.relevance_cutoff_score``).
 
     Returns:
         List of result dictionaries with weighted scores
@@ -97,14 +95,6 @@ def _extract_byok_rag_chunks(
     for chunk, score in zip(
         search_response.chunks, search_response.scores, strict=True
     ):
-        if score < byok_raw_cutoff_score:
-            logger.debug(
-                "  [%s] BYOK: dropping chunk: raw score=%.4f < cutoff=%.4f",
-                vector_store_id,
-                score,
-                byok_raw_cutoff_score,
-            )
-            continue
         weighted_score = score * weight
         doc_id = (
             chunk.metadata.get("document_id", chunk.chunk_id)
@@ -189,7 +179,8 @@ async def _query_store_for_byok_rag(
         vector_store_id: ID of the BYOK vector store to query
         query: Search query string
         weight: Score multiplier to apply
-        byok_raw_cutoff_score: Minimum raw score before weighting (BYOK config only)
+        byok_raw_cutoff_score: Passed as ``score_threshold`` in ``vector_io.query``
+            params (``ByokRag.relevance_cutoff_score`` / mapping default).
 
     Returns:
         List of weighted result dictionaries, or empty list on error
@@ -201,11 +192,10 @@ async def _query_store_for_byok_rag(
             params={
                 "max_chunks": constants.BYOK_RAG_MAX_CHUNKS,
                 "mode": "vector",
+                "score_threshold": byok_raw_cutoff_score,
             },
         )
-        return _extract_byok_rag_chunks(
-            search_response, vector_store_id, weight, byok_raw_cutoff_score
-        )
+        return _extract_byok_rag_chunks(search_response, vector_store_id, weight)
     except Exception as e:  # pylint: disable=broad-exception-caught
         logger.warning("Failed to search '%s': %s", vector_store_id, e)
         return []
@@ -356,8 +346,9 @@ async def _fetch_byok_rag(  # pylint: disable=too-many-locals
     """Fetch chunks and documents from BYOK RAG sources only.
 
     Applies each entry's ``relevance_cutoff_score`` (via ``relevance_cutoff_mapping``)
-    to raw scores from BYOK vector stores. OKP/Solr inline RAG is implemented in
-    ``_fetch_solr_rag`` and does not use that setting.
+    as ``score_threshold`` on ``vector_io.query`` for each BYOK store. OKP/Solr
+    inline RAG is implemented in ``_fetch_solr_rag`` and does not use that
+    setting.
 
     Args:
         client: The AsyncLlamaStackClient to use for the request
@@ -418,10 +409,7 @@ async def _fetch_byok_rag(  # pylint: disable=too-many-locals
                     vector_store_id,
                     query,
                     score_multiplier_mapping.get(vector_store_id, 1.0),
-                    relevance_cutoff_mapping.get(
-                        vector_store_id,
-                        constants.DEFAULT_BYOK_RAG_RELEVANCE_CUTOFF_SCORE,
-                    ),
+                    relevance_cutoff_mapping[vector_store_id],
                 )
                 for vector_store_id in vector_store_ids_to_query
             ]
@@ -548,9 +536,10 @@ async def build_rag_context(
 ) -> RAGContext:
     """Build RAG context by fetching and merging chunks from all enabled sources.
 
-    Enabled sources can be BYOK and/or Solr OKP. BYOK raw-score cutoffs come from
-    each ``ByokRag.relevance_cutoff_score``; OKP uses ``_build_query_params`` (e.g.
-    Solr ``score_threshold``), not the BYOK cutoffs.
+    Enabled sources can be BYOK and/or Solr OKP. BYOK uses ``score_threshold`` in
+    ``vector_io.query`` from each ``ByokRag.relevance_cutoff_score``; OKP uses
+    ``_build_query_params`` (e.g. Solr ``score_threshold``), not the BYOK
+    per-store config keys.
 
     Args:
         client: The AsyncLlamaStackClient to use for the request
diff --git a/tests/integration/endpoints/test_query_byok_integration.py b/tests/integration/endpoints/test_query_byok_integration.py
index 93ea0271d..5abf78e55 100644
--- a/tests/integration/endpoints/test_query_byok_integration.py
+++ b/tests/integration/endpoints/test_query_byok_integration.py
@@ -971,7 +971,18 @@ async def test_query_byok_cutoff_applied_before_multiplier(
             ("Content below raw cutoff", "doc-below", 0.4),
         ],
     )
-    mock_client.vector_io.query = mocker.AsyncMock(return_value=low_score_resp)
+
+    async def _vector_io_query_side_effect(*_args: Any, **kwargs: Any) -> Any:
+        params = kwargs.get("params") or {}
+        raw_st = params.get("score_threshold", 0.0)
+        score_threshold = float(raw_st) if raw_st is not None else 0.0
+        if score_threshold <= 0.4:
+            return low_score_resp
+        return _make_vector_io_response(mocker, [])
+
+    mock_client.vector_io.query = mocker.AsyncMock(
+        side_effect=_vector_io_query_side_effect
+    )
 
     mock_vs_resp = mocker.MagicMock()
     mock_vs_resp.data = []
diff --git a/tests/unit/utils/test_responses.py b/tests/unit/utils/test_responses.py
index add19be23..e860c6cf7 100644
--- a/tests/unit/utils/test_responses.py
+++ b/tests/unit/utils/test_responses.py
@@ -8,6 +8,7 @@
 
 import pytest
 from fastapi import HTTPException
+from llama_stack_api import SearchRankingOptions
 from llama_stack_api.openai_responses import (
     AllowedToolsFilter,
     OpenAIResponseInputToolChoiceAllowedTools,
@@ -87,6 +88,7 @@
     resolve_client_tool_choice,
     resolve_tool_choice,
     resolve_vector_store_ids,
+    translate_tools_vector_store_ids,
 )
 
 
@@ -380,16 +382,49 @@ class TestGetRAGTools:
 
     def test_get_rag_tools_empty_list(self) -> None:
         """Test get_rag_tools returns empty list for empty vector store IDs."""
-        assert not get_rag_tools([])
+        assert not get_rag_tools([], [])
 
     def test_get_rag_tools_with_vector_stores(self) -> None:
-        """Test get_rag_tools returns correct tool format for vector stores."""
-        tools = get_rag_tools(["db1", "db2"])
+        """Test get_rag_tools returns one file_search tool per vector store."""
+        tools = get_rag_tools(["db1", "db2"], [])
         assert isinstance(tools, list)
-        assert len(tools) == 1
-        assert tools[0].type == "file_search"
-        assert tools[0].vector_store_ids == ["db1", "db2"]
+        assert len(tools) == 2
+        assert all(t.type == "file_search" for t in tools)
+        assert tools[0].vector_store_ids == ["db1"]
+        assert tools[1].vector_store_ids == ["db2"]
         assert tools[0].max_num_results == 10
+        assert tools[0].ranking_options is not None
+        assert tools[0].ranking_options.score_threshold == (
+            constants.DEFAULT_BYOK_RAG_RELEVANCE_CUTOFF_SCORE
+        )
+        assert tools[1].ranking_options is not None
+        assert tools[1].ranking_options.score_threshold == (
+            constants.DEFAULT_BYOK_RAG_RELEVANCE_CUTOFF_SCORE
+        )
+
+    def test_get_rag_tools_per_store_cutoff(self) -> None:
+        """Each tool uses its store's relevance_cutoff_score as score_threshold."""
+        byok_rags = [
+            ByokRag(
+                rag_id="a",
+                vector_db_id="vs-low",
+                db_path="tests/configuration/rag.txt",
+                relevance_cutoff_score=0.2,
+            ),
+            ByokRag(
+                rag_id="b",
+                vector_db_id="vs-high",
+                db_path="tests/configuration/rag.txt",
+                relevance_cutoff_score=0.55,
+            ),
+        ]
+        tools = get_rag_tools(["vs-low", "vs-high"], byok_rags)
+        assert tools is not None
+        assert len(tools) == 2
+        assert tools[0].ranking_options is not None
+        assert tools[0].ranking_options.score_threshold == 0.2
+        assert tools[1].ranking_options is not None
+        assert tools[1].ranking_options.score_threshold == 0.55
 
 
 class TestGetMCPTools:
@@ -1487,9 +1522,10 @@ async def test_prepare_tools_with_vector_store_ids(
 
         result = await prepare_tools(mock_client, ["vs1", "vs2"], False, "token")
         assert result is not None
-        assert len(result) == 1
-        assert result[0].type == "file_search"
-        assert result[0].vector_store_ids == ["vs1", "vs2"]
+        assert len(result) == 2
+        assert all(t.type == "file_search" for t in result)
+        assert result[0].vector_store_ids == ["vs1"]
+        assert result[1].vector_store_ids == ["vs2"]
 
     @pytest.mark.asyncio
     async def test_prepare_tools_fetch_vector_stores(
@@ -1510,8 +1546,9 @@ async def test_prepare_tools_fetch_vector_stores(
 
         result = await prepare_tools(mock_client, None, False, "token")
         assert result is not None
-        assert len(result) == 1
-        assert result[0].vector_store_ids == ["vs1", "vs2"]
+        assert len(result) == 2
+        assert result[0].vector_store_ids == ["vs1"]
+        assert result[1].vector_store_ids == ["vs2"]
 
     @pytest.mark.asyncio
     async def test_prepare_tools_connection_error(self, mocker: MockerFixture) -> None:
@@ -1637,6 +1674,92 @@ def test_preserves_order(self) -> None:
         assert result == ["vs-a", "vs-b"]
 
 
+class TestTranslateToolsVectorStoreIds:
+    """Tests for translate_tools_vector_store_ids."""
+
+    def test_sets_ranking_when_omitted(self) -> None:
+        """file_search tools get ranking_options from BYOK cutoffs when unset."""
+        byok_rags = [
+            ByokRag(
+                rag_id="x",
+                vector_db_id="vs1",
+                db_path="tests/configuration/rag.txt",
+                relevance_cutoff_score=0.4,
+            )
+        ]
+        tools_in: list[InputTool] = [
+            InputToolFileSearch(vector_store_ids=["x"]),
+        ]
+        out = translate_tools_vector_store_ids(tools_in, byok_rags)
+        fs = cast(InputToolFileSearch, out[0])
+        assert fs.vector_store_ids == ["vs1"]
+        assert fs.ranking_options is not None
+        assert fs.ranking_options.score_threshold == 0.4
+
+    def test_preserves_client_ranking_options(self) -> None:
+        """Client score_threshold above store cutoff is kept (floor does not lower it)."""
+        byok_rags = [
+            ByokRag(
+                rag_id="x",
+                vector_db_id="vs1",
+                db_path="tests/configuration/rag.txt",
+            )
+        ]
+        custom = SearchRankingOptions(score_threshold=0.88)
+        tools_in: list[InputTool] = [
+            InputToolFileSearch(vector_store_ids=["x"], ranking_options=custom),
+        ]
+        out = translate_tools_vector_store_ids(tools_in, byok_rags)
+        fs = cast(InputToolFileSearch, out[0])
+        assert fs.ranking_options is not None
+        assert fs.ranking_options.score_threshold == 0.88
+
+    def test_client_ranking_floor_raises_below_store_cutoff(self) -> None:
+        """Effective score_threshold is max(client, store_cutoff)."""
+        byok_rags = [
+            ByokRag(
+                rag_id="x",
+                vector_db_id="vs1",
+                db_path="tests/configuration/rag.txt",
+                relevance_cutoff_score=0.5,
+            )
+        ]
+        custom = SearchRankingOptions(score_threshold=0.1)
+        tools_in: list[InputTool] = [
+            InputToolFileSearch(vector_store_ids=["x"], ranking_options=custom),
+        ]
+        out = translate_tools_vector_store_ids(tools_in, byok_rags)
+        fs = cast(InputToolFileSearch, out[0])
+        assert fs.ranking_options is not None
+        assert fs.ranking_options.score_threshold == 0.5
+
+    def test_splits_multiple_stores_into_separate_tools(self) -> None:
+        """One client file_search with multiple stores becomes one tool per store."""
+        byok_rags = [
+            ByokRag(
+                rag_id="a",
+                vector_db_id="vs-a",
+                db_path="tests/configuration/rag.txt",
+                relevance_cutoff_score=0.2,
+            ),
+            ByokRag(
+                rag_id="b",
+                vector_db_id="vs-b",
+                db_path="tests/configuration/rag.txt",
+                relevance_cutoff_score=0.6,
+            ),
+        ]
+        tools_in: list[InputTool] = [
+            InputToolFileSearch(vector_store_ids=["a", "b"]),
+        ]
+        out = translate_tools_vector_store_ids(tools_in, byok_rags)
+        assert len(out) == 2
+        assert cast(InputToolFileSearch, out[0]).vector_store_ids == ["vs-a"]
+        assert cast(InputToolFileSearch, out[0]).ranking_options.score_threshold == 0.2
+        assert cast(InputToolFileSearch, out[1]).vector_store_ids == ["vs-b"]
+        assert cast(InputToolFileSearch, out[1]).ranking_options.score_threshold == 0.6
+
+
 class TestPrepareToolsTranslatesVectorStoreIds:
     """Tests that prepare_tools translates BYOK IDs before building RAG tools."""
 
@@ -1735,9 +1858,10 @@ async def test_uses_rag_tool_config_when_no_per_request_ids(
         result = await prepare_tools(mock_client, None, False, "token")
 
         assert result is not None
-        assert len(result) == 1
-        assert result[0].type == "file_search"
-        assert result[0].vector_store_ids == ["rag-tool-id-1", "rag-tool-id-2"]
+        assert len(result) == 2
+        assert all(t.type == "file_search" for t in result)
+        assert result[0].vector_store_ids == ["rag-tool-id-1"]
+        assert result[1].vector_store_ids == ["rag-tool-id-2"]
         mock_client.vector_stores.list.assert_not_called()
 
     @pytest.mark.asyncio
@@ -3173,11 +3297,11 @@ class TestGetRAGToolsWithConfig:
     def test_returns_empty_when_no_vector_store_ids(self) -> None:
         """Test get_rag_tools returns empty list when no vector store IDs are provided."""
         # pylint: disable-next=use-implicit-booleaness-not-comparison
-        assert get_rag_tools([]) == []
+        assert get_rag_tools([], []) == []
 
     def test_returns_tools_when_stores_provided(self) -> None:
         """Test get_rag_tools returns tools when vector store IDs are provided."""
-        tools = get_rag_tools(["vs1"])
+        tools = get_rag_tools(["vs1"], [])
         assert tools is not None
         assert tools[0].type == constants.DEFAULT_RAG_TOOL
         assert tools[0].vector_store_ids == ["vs1"]
diff --git a/tests/unit/utils/test_vector_search.py b/tests/unit/utils/test_vector_search.py
index 47dddbdd4..5dee8495c 100644
--- a/tests/unit/utils/test_vector_search.py
+++ b/tests/unit/utils/test_vector_search.py
@@ -112,7 +112,6 @@ def test_extract_chunks_with_metadata(self, mocker: MockerFixture) -> None:
             search_response,
             vector_store_id="test_store",
             weight=1.5,
-            byok_raw_cutoff_score=0.0,
         )
 
         assert len(result) == 2
@@ -137,15 +136,14 @@ def test_extract_chunks_without_metadata(self, mocker: MockerFixture) -> None:
             search_response,
             vector_store_id="test_store",
             weight=1.0,
-            byok_raw_cutoff_score=0.0,
         )
 
         assert len(result) == 1
         assert result[0]["doc_id"] == "chunk_id"
         assert result[0]["metadata"] == {}
 
-    def test_extract_skips_chunks_below_raw_cutoff(self, mocker: MockerFixture) -> None:
-        """Raw scores below the BYOK cutoff are not included."""
+    def test_extract_maps_all_chunks_from_response(self, mocker: MockerFixture) -> None:
+        """Extraction maps every chunk returned by vector_io (server applies score_threshold)."""
         chunk_hi = mocker.Mock()
         chunk_hi.content = "Hi"
         chunk_hi.chunk_id = "a"
@@ -163,12 +161,10 @@ def test_extract_skips_chunks_below_raw_cutoff(self, mocker: MockerFixture) -> N
             search_response,
             vector_store_id="vs",
             weight=0.5,
-            byok_raw_cutoff_score=0.5,
         )
-        assert len(result) == 1
-        assert result[0]["content"] == "Hi"
-        assert result[0]["score"] == 0.9
+        assert len(result) == 2
         assert result[0]["weighted_score"] == 0.45
+        assert result[1]["weighted_score"] == 0.1
 
 
 class TestFormatRagContext:
@@ -514,7 +510,11 @@ async def test_user_facing_ids_translated_to_internal_ids(
         client_mock.vector_io.query.assert_called_once_with(
             vector_store_id="vs-internal-001",
             query="test query",
-            params={"max_chunks": constants.BYOK_RAG_MAX_CHUNKS, "mode": "vector"},
+            params={
+                "max_chunks": constants.BYOK_RAG_MAX_CHUNKS,
+                "mode": "vector",
+                "score_threshold": 0.0,
+            },
         )
 
     @pytest.mark.asyncio
@@ -609,10 +609,10 @@ async def test_request_id_not_in_inline_config_skips_byok(
         client_mock.vector_io.query.assert_not_called()
 
     @pytest.mark.asyncio
-    async def test_byok_relevance_cutoff_drops_low_raw_scores(
+    async def test_byok_relevance_cutoff_passed_as_score_threshold(
         self, mocker: MockerFixture
     ) -> None:
-        """Chunks with raw retrieval score below BYOK relevance_cutoff_score are excluded."""
+        """vector_io.query receives score_threshold from BYOK relevance_cutoff_mapping."""
         config_mock = mocker.Mock(spec=AppConfig)
         byok_rag_mock = mocker.Mock()
         byok_rag_mock.rag_id = "rag_1"
@@ -632,14 +632,9 @@ async def test_byok_relevance_cutoff_drops_low_raw_scores(
         chunk_hi.chunk_id = "c1"
         chunk_hi.metadata = {}
 
-        chunk_lo = mocker.Mock()
-        chunk_lo.content = "Drop"
-        chunk_lo.chunk_id = "c2"
-        chunk_lo.metadata = {}
-
         search_response = mocker.Mock()
-        search_response.chunks = [chunk_hi, chunk_lo]
-        search_response.scores = [0.9, 0.3]
+        search_response.chunks = [chunk_hi]
+        search_response.scores = [0.9]
 
         client_mock = mocker.AsyncMock()
         client_mock.vector_io.query.return_value = search_response
@@ -647,6 +642,8 @@ async def test_byok_relevance_cutoff_drops_low_raw_scores(
         rag_chunks, _referenced_docs = await _fetch_byok_rag(client_mock, "q")
         assert len(rag_chunks) == 1
         assert rag_chunks[0].content == "Keep"
+        call_kwargs = client_mock.vector_io.query.call_args.kwargs
+        assert call_kwargs["params"]["score_threshold"] == 0.5
 
 
 class TestFetchSolrRag: