lightspeed-core · syedriko · Apr 17, 2026 · Apr 17, 2026 · Apr 18, 2026
diff --git a/docs/byok_guide.md b/docs/byok_guide.md
@@ -79,10 +79,11 @@ Both modes rely on:
 
 Inline RAG additionally supports:
 - **Score Multiplier**: Optional weight applied per BYOK vector store when mixing multiple sources. Allows custom prioritization of content.
+- **Relevance cutoff (`relevance_cutoff_score`)**: Optional minimum **raw** similarity score, configured **per BYOK store** (each `byok_rag` entry) during **Inline RAG**. Chunks below the cutoff are dropped **before** `score_multiplier` is applied. It applies only to BYOK stores listed under `byok_rag`; it does not affect OKP/Solr inline RAG (which uses separate query defaults) and is not used for Tool RAG (`file_search`). If an entry omits `relevance_cutoff_score`, it defaults to `DEFAULT_BYOK_RAG_RELEVANCE_CUTOFF_SCORE` in `src/constants.py` (currently `0.3`). Set to `0.0` to disable filtering for that store.
 
 > [!NOTE]
 > OKP and BYOK scores are not directly comparable (different scoring systems), so
-> `score_multiplier` does not apply to OKP results. To control the amount of retrieved
+> `score_multiplier` and `relevance_cutoff_score` do not apply to OKP results. To control the amount of retrieved
 > context, set the `BYOK_RAG_MAX_CHUNKS` and `OKP_RAG_MAX_CHUNKS` constants in `src/constants.py`
 > (defaults: 10 and 5 respectively). For Tool RAG, use `TOOL_RAG_MAX_CHUNKS` (default: 10).
 
@@ -280,6 +281,8 @@ registered_resources:
 > section of `lightspeed-stack.yaml`. The lightspeed-stack service automatically generates the required configuration
 > at startup.
 >
+> Required shape: `byok_rag` must be a YAML list of stores (a top-level mapping is rejected when the configuration is loaded). Optionally set `relevance_cutoff_score` on **each** store, or rely on the default from `src/constants.py`:
+>
 > ```yaml
 > byok_rag:
 >   - rag_id: my-docs           # Unique identifier for this knowledge source
@@ -288,11 +291,13 @@ registered_resources:
 >     embedding_dimension: 768
 >     vector_db_id: your-index-id  # Llama Stack vector store ID (from index generation)
 >     db_path: /path/to/vector_db/faiss_store.db
->     score_multiplier: 1.0       # Optional: weight results when mixing multiple sources
+>     relevance_cutoff_score: 0.3   # Optional per store; min raw score before score_multiplier (BYOK inline only)
+>     score_multiplier: 1.0         # Optional: weight results when mixing multiple sources
 > ```
 >
 > When multiple BYOK sources are configured, `score_multiplier` adjusts the relative importance of
 > each store's results during Inline RAG retrieval. Values above 1.0 boost a store; below 1.0 reduce it.
+> `relevance_cutoff_score` filters by raw retrieval score first; weighting applies only to chunks that pass the cutoff.
 
 ### Step 5: Configure RAG Strategy
 
@@ -319,10 +324,10 @@ okp:
 
 Both modes can be enabled simultaneously. Choose based on your latency and control preferences:
 
-| Mode | When context is fetched | Tool call needed | score_multiplier |
-|------|------------------------|------------------|-----------------|
-| Inline RAG | With every query | No | Yes (BYOK only) |
-| Tool RAG | On LLM demand | Yes | No |
+| Mode | When context is fetched | Tool call needed | score_multiplier | relevance_cutoff_score |
+|------|------------------------|------------------|------------------|------------------------|
+| Inline RAG | With every query | No | Yes (BYOK only) | Yes (BYOK only) |
+| Tool RAG | On LLM demand | Yes | No | No |
 
 > [!TIP]
 > A ready-to-use example combining BYOK and OKP is available at

diff --git a/docs/openapi.json b/docs/openapi.json
@@ -6931,8 +6931,8 @@
                 "tags": [
                     "a2a"
                 ],
-                "summary": "Handle A2A Jsonrpc",
-                "description": "Handle A2A JSON-RPC requests following the A2A protocol specification.\n\nThis endpoint uses the DefaultRequestHandler from the A2A SDK to handle\nall JSON-RPC requests including message/send, message/stream, etc.\n\nThe A2A SDK application is created per-request to include authentication\ncontext while still leveraging FastAPI's authorization middleware.\n\nAutomatically detects streaming requests (message/stream JSON-RPC method)\nand returns a StreamingResponse to enable real-time chunk delivery.\n\nArgs:\n    request: FastAPI request object\n    auth: Authentication tuple\n    mcp_headers: MCP headers for context propagation\n\nReturns:\n    JSON-RPC response or streaming response",
+                "summary": "Handle A2A Jsonrpc Get",
+                "description": "Handle GET on ``/a2a`` for A2A JSON-RPC (same handler as POST).",
                 "operationId": "handle_a2a_jsonrpc_a2a_get",
                 "responses": {
                     "200": {
@@ -6949,9 +6949,9 @@
                 "tags": [
                     "a2a"
                 ],
-                "summary": "Handle A2A Jsonrpc",
-                "description": "Handle A2A JSON-RPC requests following the A2A protocol specification.\n\nThis endpoint uses the DefaultRequestHandler from the A2A SDK to handle\nall JSON-RPC requests including message/send, message/stream, etc.\n\nThe A2A SDK application is created per-request to include authentication\ncontext while still leveraging FastAPI's authorization middleware.\n\nAutomatically detects streaming requests (message/stream JSON-RPC method)\nand returns a StreamingResponse to enable real-time chunk delivery.\n\nArgs:\n    request: FastAPI request object\n    auth: Authentication tuple\n    mcp_headers: MCP headers for context propagation\n\nReturns:\n    JSON-RPC response or streaming response",
-                "operationId": "handle_a2a_jsonrpc_a2a_get",
+                "summary": "Handle A2A Jsonrpc Post",
+                "description": "Handle POST on ``/a2a`` for A2A JSON-RPC (same handler as GET).",
+                "operationId": "handle_a2a_jsonrpc_a2a_post",
                 "responses": {
                     "200": {
                         "description": "Successful Response",
@@ -8074,6 +8074,13 @@
                         "title": "DB path",
                         "description": "Path to RAG database."
                     },
+                    "relevance_cutoff_score": {
+                        "type": "number",
+                        "minimum": 0.0,
+                        "title": "BYOK inline RAG relevance cutoff",
+                        "description": "Minimum raw similarity score from this **BYOK** vector store before score_multiplier weighting. Chunks below this threshold are dropped immediately after retrieval from this store only. Does not apply to OKP/Solr. Set to 0.0 to disable filtering for this store.",
+                        "default": 0.3
+                    },
                     "score_multiplier": {
                         "type": "number",
                         "exclusiveMinimum": 0.0,
@@ -8251,7 +8258,7 @@
                         },
                         "type": "array",
                         "title": "BYOK RAG configuration",
-                        "description": "BYOK RAG configuration. This configuration can be used to reconfigure Llama Stack through its run.yaml configuration file"
+                        "description": "BYOK RAG configuration. This configuration can be used to reconfigure Llama Stack through its run.yaml configuration file. Use ``byok_rag: [ ... ]`` (a list of stores). Each store may set ``relevance_cutoff_score`` (BYOK inline RAG only; not used for OKP/Solr)."
                     },
                     "a2a_state": {
                         "$ref": "#/components/schemas/A2AStateConfiguration",

diff --git a/examples/lightspeed-stack-byok-okp-rag.yaml b/examples/lightspeed-stack-byok-okp-rag.yaml
@@ -40,12 +40,14 @@ byok_rag:
     embedding_dimension: 1024
     vector_db_id: vs_123       # Llama-stack vector_store_id
     db_path: /tmp/ocp.faiss
+    relevance_cutoff_score: 0.3   # Optional per store; min raw score before score_multiplier (inline BYOK only)
     score_multiplier: 1.0      # Weight for this vector store's results (Inline RAG only)
   - rag_id: knowledge-base     # referenced in rag.inline / rag.tool
     rag_type: inline::faiss
     embedding_dimension: 384
     vector_db_id: vs_456       # Llama-stack vector_store_id
     db_path: /tmp/kb.faiss
+    relevance_cutoff_score: 0.3
     score_multiplier: 1.2      # Weight for this vector store's results (Inline RAG only)
 
 # RAG configuration

diff --git a/src/app/endpoints/a2a.py b/src/app/endpoints/a2a.py
@@ -692,12 +692,40 @@ async def _create_a2a_app(
     return a2a_app.build()
 
 
-@router.api_route("/a2a", methods=["GET", "POST"], response_model=None)
+@router.get(
+    "/a2a",
+    response_model=None,
+    operation_id="handle_a2a_jsonrpc_a2a_get",
+)
+@authorize(Action.A2A_JSONRPC)
+async def handle_a2a_jsonrpc_get(
+    request: Request,
+    auth: Annotated[AuthTuple, Depends(auth_dependency)],
+    mcp_headers: McpHeaders = Depends(mcp_headers_dependency),
+) -> Response | StreamingResponse:
+    """Handle GET on ``/a2a`` for A2A JSON-RPC (same handler as POST)."""
+    # Thin route wrapper: FastAPI resolves ``auth`` and ``mcp_headers`` here and
+    # all request handling is delegated to the shared ``_handle_a2a_jsonrpc``.
+    # NOTE(review): GET is presumably registered separately from POST so each
+    # HTTP method gets its own explicit OpenAPI ``operation_id`` -- confirm.
+    return await _handle_a2a_jsonrpc(request, auth, mcp_headers)
+
+
+@router.post(
+    "/a2a",
+    response_model=None,
+    operation_id="handle_a2a_jsonrpc_a2a_post",
+)
 @authorize(Action.A2A_JSONRPC)
-async def handle_a2a_jsonrpc(  # pylint: disable=too-many-locals,too-many-statements
+async def handle_a2a_jsonrpc_post(
     request: Request,
     auth: Annotated[AuthTuple, Depends(auth_dependency)],
     mcp_headers: McpHeaders = Depends(mcp_headers_dependency),
+) -> Response | StreamingResponse:
+    """Handle POST on ``/a2a`` for A2A JSON-RPC (same handler as GET)."""
+    # Thin route wrapper: FastAPI resolves ``auth`` and ``mcp_headers`` here and
+    # all request handling is delegated to the shared ``_handle_a2a_jsonrpc``.
+    # NOTE(review): POST is presumably registered separately from GET so each
+    # HTTP method gets its own explicit OpenAPI ``operation_id`` -- confirm.
+    return await _handle_a2a_jsonrpc(request, auth, mcp_headers)
+
+
+async def _handle_a2a_jsonrpc(  # pylint: disable=too-many-locals,too-many-statements
+    request: Request,
+    auth: AuthTuple,
+    mcp_headers: McpHeaders,
 ) -> Response | StreamingResponse:
     """
     Handle A2A JSON-RPC requests following the A2A protocol specification.

diff --git a/src/configuration.py b/src/configuration.py
@@ -508,6 +508,28 @@ def score_multiplier_mapping(self) -> dict[str, float]:
             for brag in self._configuration.byok_rag
         }
 
+    @property
+    def relevance_cutoff_mapping(self) -> dict[str, float]:
+        """Map BYOK ``vector_db_id`` values to their ``relevance_cutoff_score``.
+
+        Entries are read from ``byok_rag`` in order. When a ``vector_db_id``
+        appears more than once, the cutoff of its first entry is kept and the
+        later ones are ignored.
+
+        Returns:
+            dict[str, float]: Keys are llama-stack ``vector_db_id`` values; each
+            value is that store's raw-score cutoff (applied before
+            score_multiplier weighting).
+
+        Raises:
+            LogicError: If the configuration has not been loaded.
+        """
+        if self._configuration is None:
+            raise LogicError("logic error: configuration is not loaded")
+        cutoffs: dict[str, float] = {}
+        for store in self._configuration.byok_rag:
+            # setdefault never overwrites, so the first entry per id wins.
+            cutoffs.setdefault(store.vector_db_id, store.relevance_cutoff_score)
+        return cutoffs
+
     @property
     def inline_solr_enabled(self) -> bool:
         """Return whether OKP is included in the inline RAG list.

diff --git a/src/constants.py b/src/constants.py
@@ -186,10 +186,13 @@
 
 # Inline RAG constants
 BYOK_RAG_MAX_CHUNKS = 10  # retrieved from BYOK RAG
+# Default minimum raw similarity per BYOK store (``ByokRag.relevance_cutoff_score``)
+DEFAULT_BYOK_RAG_RELEVANCE_CUTOFF_SCORE = 0.3
 OKP_RAG_MAX_CHUNKS = 5  # retrieved from OKP RAG
 
 # Solr OKP constants
 SOLR_VECTOR_SEARCH_DEFAULT_K = 5
+# Default score_threshold in vector_io.query params for the OKP/Solr vector store
 SOLR_VECTOR_SEARCH_DEFAULT_SCORE_THRESHOLD = 0.3
 SOLR_VECTOR_SEARCH_DEFAULT_MODE = "hybrid"
 

diff --git a/src/llama_stack_configuration.py b/src/llama_stack_configuration.py
@@ -41,6 +41,28 @@ def increase_indent(self, flow: bool = False, indentless: bool = False) -> None:
         return super().increase_indent(flow, False)
 
 
+def _raw_byok_rag_store_list(raw_byok_rag: Any) -> list[Any]:
+    """Extract the BYOK store list from the raw ``byok_rag`` YAML value.
+
+    A list is passed through untouched (same object; its items are not
+    inspected here). ``None`` and every other type yield an empty list, so the
+    result can always be handed to :func:`enrich_byok_rag`.
+
+    Parameters:
+    ----------
+        raw_byok_rag (Any):
+            Value parsed from the Lightspeed YAML under ``byok_rag``.
+
+    Returns:
+    -------
+        list[Any]:
+            ``raw_byok_rag`` itself when it is a list, otherwise ``[]``.
+    """
+    return raw_byok_rag if isinstance(raw_byok_rag, list) else []
+
+
 # =============================================================================
 # Enrichment: Azure Entra ID
 # =============================================================================
@@ -619,7 +641,7 @@ def generate_configuration(
     setup_azure_entra_id_token(config.get("azure_entra_id"), env_file)
 
     # Enrichment: BYOK RAG
-    enrich_byok_rag(ls_config, config.get("byok_rag", []))
+    enrich_byok_rag(ls_config, _raw_byok_rag_store_list(config.get("byok_rag", [])))
 
     # Enrichment: Solr - enabled when "okp" appears in either inline or tool list
     enrich_solr(ls_config, config.get("rag", {}), config.get("okp", {}))

diff --git a/src/models/config.py b/src/models/config.py
@@ -2,19 +2,21 @@
 
 # pylint: disable=too-many-lines
 
+import math
 import re
 from enum import Enum
 from functools import cached_property
 from pathlib import Path
 from re import Pattern
-from typing import Any, Literal, Optional, Self
+from typing import Annotated, Any, Literal, Optional
 
 import jsonpath_ng
 import yaml
 from jsonpath_ng.exceptions import JSONPathError
 from pydantic import (
     AnyHttpUrl,
     BaseModel,
+    BeforeValidator,
     ConfigDict,
     Field,
     FilePath,
@@ -26,6 +28,7 @@
     model_validator,
 )
 from pydantic.dataclasses import dataclass
+from typing_extensions import Self  # noqa: UP035
 
 import constants
 from log import get_logger
@@ -1612,6 +1615,39 @@ class ByokRag(ConfigurationBase):
         description="Path to RAG database.",
     )
 
+    relevance_cutoff_score: float = Field(
+        constants.DEFAULT_BYOK_RAG_RELEVANCE_CUTOFF_SCORE,
+        ge=0,
+        title="BYOK inline RAG relevance cutoff",
+        description="Minimum raw similarity score from this **BYOK** vector store "
+        "before score_multiplier weighting. Chunks below this threshold are dropped "
+        "immediately after retrieval from this store only. Does not apply to OKP/Solr. "
+        "Set to 0.0 to disable filtering for this store.",
+    )
+
+    @field_validator("relevance_cutoff_score")
+    @classmethod
+    def validate_relevance_cutoff_score(cls, value: float) -> float:
+        """Ensure the cutoff is a finite float.
+
+        YAML can spell ``.inf`` and ``.nan``; such values are rejected here.
+
+        Parameters:
+        ----------
+            value: Cutoff after coercion to ``float``.
+
+        Returns:
+        -------
+            ``value`` unchanged when it is finite.
+
+        Raises:
+        ------
+            ValueError: If ``value`` is ``inf``, ``-inf`` or ``nan``.
+        """
+        if math.isfinite(value):
+            return value
+        raise ValueError(
+            "relevance_cutoff_score must be a finite number (not inf, -inf, or nan)"
+        )
+
     score_multiplier: float = Field(
         constants.DEFAULT_SCORE_MULTIPLIER,
         gt=0,
@@ -1622,6 +1658,30 @@ class ByokRag(ConfigurationBase):
     )
 
 
+def _normalize_byok_rag_input(value: Any) -> Any:
+    """Normalize raw ``byok_rag`` input before it is validated as a list of stores.
+
+    ``None`` (YAML ``null``) becomes ``[]`` and a list is returned unchanged.
+    Any other value is rejected with a message that names what was found.
+
+    Parameters:
+        value: Raw value supplied for ``byok_rag``.
+
+    Returns:
+        ``[]`` for ``None``; otherwise the same list object that was passed in.
+
+    Raises:
+        ValueError: If ``value`` is neither ``None`` nor a list.
+    """
+    if value is None:
+        return []
+    if isinstance(value, list):
+        return value
+    # Keep the established wording for the common mistake (a top-level mapping);
+    # for any other type report what was actually given instead of calling it
+    # a mapping.
+    found = "a mapping" if isinstance(value, dict) else type(value).__name__
+    raise ValueError(
+        f"byok_rag must be a YAML list of BYOK store definitions, not {found}."
+    )
+
+
+# ``list[ByokRag]`` whose raw input is first passed through
+# ``_normalize_byok_rag_input`` (``None`` -> ``[]``; non-list values rejected)
+# before the list items themselves are validated.
+ByokRagListValidated = Annotated[
+    list[ByokRag],
+    BeforeValidator(_normalize_byok_rag_input),
+]
+
+
 class QuotaLimiterConfiguration(ConfigurationBase):
     """Configuration for one quota limiter.
 
@@ -1908,11 +1968,13 @@ class Configuration(ConfigurationBase):
         description="Conversation history configuration.",
     )
 
-    byok_rag: list[ByokRag] = Field(
+    byok_rag: ByokRagListValidated = Field(
         default_factory=list,
         title="BYOK RAG configuration",
         description="BYOK RAG configuration. This configuration can be used to "
-        "reconfigure Llama Stack through its run.yaml configuration file",
+        "reconfigure Llama Stack through its run.yaml configuration file. "
+        "Use ``byok_rag: [ ... ]`` (a list of stores). Each store may set "
+        "``relevance_cutoff_score`` (BYOK inline RAG only; not used for OKP/Solr).",
     )
 
     a2a_state: A2AStateConfiguration = Field(