2 changes: 1 addition & 1 deletion src/app/endpoints/query.py
@@ -23,6 +23,7 @@
from client import AsyncLlamaStackClientHolder
from configuration import configuration
from log import get_logger
from models.common.responses.responses_api_params import ResponsesApiParams
from models.config import Action
from models.requests import QueryRequest
from models.responses import (
@@ -65,7 +66,6 @@
from utils.shields import run_shield_moderation, validate_shield_ids_override
from utils.suid import normalize_conversation_id
from utils.types import (
ResponsesApiParams,
ShieldModerationResult,
TurnSummary,
)
422 changes: 169 additions & 253 deletions src/app/endpoints/responses.py

Large diffs are not rendered by default.

3 changes: 2 additions & 1 deletion src/app/endpoints/streaming_query.py
@@ -59,6 +59,7 @@
TOPIC_SUMMARY_INTERRUPT_TIMEOUT_SECONDS,
)
from log import get_logger
from models.common.responses.responses_api_params import ResponsesApiParams
from models.config import Action
from models.context import ResponseGeneratorContext
from models.requests import QueryRequest
@@ -115,7 +116,7 @@
from utils.stream_interrupts import get_stream_interrupt_registry
from utils.suid import get_suid, normalize_conversation_id
from utils.token_counter import TokenCounter
from utils.types import ReferencedDocument, ResponsesApiParams, TurnSummary
from utils.types import ReferencedDocument, TurnSummary
from utils.vector_search import build_rag_context

logger = get_logger(__name__)
175 changes: 175 additions & 0 deletions src/models/common/responses/responses_api_params.py
@@ -0,0 +1,175 @@
"""Request parameter model for Llama Stack responses API calls."""

from collections.abc import Mapping
from typing import Any, Final, Optional

from llama_stack_api.openai_responses import (
OpenAIResponseInputTool as InputTool,
)
from llama_stack_api.openai_responses import (
OpenAIResponseInputToolChoice as ToolChoice,
)
from llama_stack_api.openai_responses import (
OpenAIResponsePrompt as Prompt,
)
from llama_stack_api.openai_responses import (
OpenAIResponseReasoning as Reasoning,
)
from llama_stack_api.openai_responses import (
OpenAIResponseText as Text,
)
from llama_stack_api.openai_responses import (
OpenAIResponseToolMCP as OutputToolMCP,
)
from pydantic import BaseModel, Field

from utils.tool_formatter import translate_vector_store_ids_to_user_facing
from utils.types import IncludeParameter, ResponseInput

# Attribute names that are echoed back in the response.
_ECHOED_FIELDS: Final[set[str]] = {
    "instructions",
    "max_tool_calls",
    "max_output_tokens",
    "metadata",
    "model",
    "parallel_tool_calls",
    "previous_response_id",
    "prompt",
    "reasoning",
    "safety_identifier",
    "temperature",
    "top_p",
    "truncation",
    "text",
    "tool_choice",
    "store",
}


class ResponsesApiParams(BaseModel):
"""Parameters for a Llama Stack Responses API request.

    All fields accepted by the Llama Stack client responses.create() call are
    included, so the dumped model can be passed directly to responses.create().
"""

input: ResponseInput = Field(description="The input text or structured input items")
model: str = Field(description='The full model ID in format "provider/model"')
conversation: str = Field(description="The conversation ID in llama-stack format")
include: Optional[list[IncludeParameter]] = Field(
default=None,
description="Output item types to include in the response",
)
instructions: Optional[str] = Field(
default=None, description="The resolved system prompt"
)
max_infer_iters: Optional[int] = Field(
default=None,
description="Maximum number of inference iterations",
)
max_output_tokens: Optional[int] = Field(
default=None,
description="Maximum number of tokens allowed in the response",
)
max_tool_calls: Optional[int] = Field(
default=None,
description="Maximum tool calls allowed in a single response",
)
metadata: Optional[dict[str, str]] = Field(
default=None,
description="Custom metadata for tracking or logging",
)
parallel_tool_calls: Optional[bool] = Field(
default=None,
description="Whether the model can make multiple tool calls in parallel",
)
previous_response_id: Optional[str] = Field(
default=None,
description="Identifier of the previous response in a multi-turn conversation",
)
prompt: Optional[Prompt] = Field(
default=None,
description="Prompt template with variables for dynamic substitution",
)
reasoning: Optional[Reasoning] = Field(
default=None,
description="Reasoning configuration for the response",
)
safety_identifier: Optional[str] = Field(
default=None,
description="Stable identifier for safety monitoring and abuse detection",
)
store: bool = Field(description="Whether to store the response")
stream: bool = Field(description="Whether to stream the response")
temperature: Optional[float] = Field(
default=None,
description="Sampling temperature (e.g. 0.0-2.0)",
)
text: Optional[Text] = Field(
default=None,
description="Text response configuration (format constraints)",
)
tool_choice: Optional[ToolChoice] = Field(
default=None,
description="Tool selection strategy",
)
tools: Optional[list[InputTool]] = Field(
default=None,
description="Prepared tool groups for Responses API (same type as ResponsesRequest.tools)",
)
extra_headers: Optional[dict[str, str]] = Field(
default=None,
description="Extra HTTP headers to send with the request (e.g. x-llamastack-provider-data)",
)

def model_dump(self, *args: Any, **kwargs: Any) -> dict[str, Any]:
"""Serialize params, re-injecting MCP authorization stripped by exclude=True.

llama-stack-api marks ``InputToolMCP.authorization`` with
``Field(exclude=True)`` to prevent token leakage in API responses.
        The base ``model_dump()`` therefore strips the field, but we need it
        in the request payload so the llama-stack server can authenticate with
        MCP servers. See LCORE-1414 / GitHub issue #1269.
"""
result = super().model_dump(*args, **kwargs)
        # Only one context option is allowed; previous_response_id takes priority.
        # The turn is added to the conversation manually if previous_response_id is used.
if self.previous_response_id:
result.pop("conversation", None)
dumped_tools = result.get("tools")
if not self.tools or not isinstance(dumped_tools, list):
return result
if len(dumped_tools) != len(self.tools):
return result
for tool, dumped_tool in zip(self.tools, dumped_tools):
authorization = getattr(tool, "authorization", None)
if authorization is not None and isinstance(dumped_tool, dict):
dumped_tool["authorization"] = authorization
return result

def echoed_params(self, rag_id_mapping: Mapping[str, str]) -> dict[str, Any]:
"""Build kwargs echoed into synthetic OpenAI-style responses (e.g. moderation blocks).

Parameters:
            rag_id_mapping: Mapping from Llama Stack vector_db_id to user-facing RAG id (from app config).
Returns:
dict[str, Any]: Field names and values to merge into the response object.
"""
data = self.model_dump(include=_ECHOED_FIELDS)
if self.tools is not None:
tool_dicts: list[dict[str, Any]] = []
for t in self.tools:
if t.type == "mcp":
validated = OutputToolMCP.model_validate(t.model_dump())
tool_dicts.append(validated.model_dump())
else:
tool_dicts.append(t.model_dump())

data["tools"] = translate_vector_store_ids_to_user_facing(
tool_dicts, rag_id_mapping
)

return data
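
Note (not part of this diff): a minimal usage sketch of the new model. The model id, conversation id, and MCP server values below are hypothetical, and the MCP tool is supplied as a plain dict on the assumption that pydantic validates it against the llama-stack-api input-tool union; the field names (`server_label`, `server_url`, `authorization`) are assumed to mirror the upstream OpenAI-style schema.

```python
# Illustrative only: ids, URLs, and tokens below are placeholders.
from models.common.responses.responses_api_params import ResponsesApiParams

params = ResponsesApiParams(
    input="How do I restart the cluster?",
    model="provider/model-name",
    conversation="conv-0123456789abcdef",
    instructions="You are a helpful assistant.",
    store=True,
    stream=False,
    tools=[
        {
            "type": "mcp",
            "server_label": "docs",
            "server_url": "https://mcp.example.com",
            "authorization": "Bearer not-a-real-token",
        }
    ],
)

# model_dump() re-injects the MCP `authorization` value that pydantic would
# otherwise strip (the upstream field is declared with exclude=True), and it
# drops `conversation` whenever `previous_response_id` is set.
payload = params.model_dump(exclude_none=True)

# The dumped payload is meant to be passed straight to the Responses API:
#     response = await client.responses.create(**payload)

# echoed_params() builds the subset of fields echoed back into synthetic
# (e.g. moderation-blocked) responses, translating llama-stack vector store
# ids to user-facing RAG ids via the supplied mapping.
echoed = params.echoed_params({"vs-internal-id": "my-rag"})
```
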
45 changes: 45 additions & 0 deletions src/models/common/responses/responses_context.py
@@ -0,0 +1,45 @@
"""Request-scoped context model for the responses endpoint pipeline."""

from datetime import datetime
from typing import Optional

from fastapi import BackgroundTasks
from llama_stack_client import AsyncLlamaStackClient
from pydantic import BaseModel, ConfigDict, Field

from utils.types import RAGContext, ShieldModerationResult


class ResponsesContext(BaseModel):
"""Shared request-scoped context for the /responses endpoint pipeline."""

model_config = ConfigDict(arbitrary_types_allowed=True)

client: AsyncLlamaStackClient = Field(description="The Llama Stack client")
auth: tuple[str, str, bool, str] = Field(
description="Authentication tuple (user_id, username, skip_userid_check, token)",
)
input_text: str = Field(description="Extracted user input text for the turn")
started_at: datetime = Field(description="UTC timestamp when the request started")
moderation_result: ShieldModerationResult = Field(
description="Shield moderation outcome",
)
inline_rag_context: RAGContext = Field(
description="Inline RAG context for the turn"
)
filter_server_tools: bool = Field(
default=False,
description="Whether to filter server-deployed MCP tool events from output",
)
background_tasks: Optional[BackgroundTasks] = Field(
default=None,
description="Background tasks for telemetry, if enabled",
)
rh_identity_context: tuple[str, str] = Field(
default_factory=lambda: ("", ""),
description="RH identity (org_id, system_id) for Splunk events",
)
generate_topic_summary: bool = Field(
default=False,
description="Whether to generate a topic summary for new conversations",
)
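
Note (not part of this diff): `arbitrary_types_allowed=True` is what lets non-pydantic objects such as `AsyncLlamaStackClient` and FastAPI's `BackgroundTasks` be carried as fields on this model. A self-contained sketch of the pattern, with a hypothetical `_Client` stand-in:

```python
from pydantic import BaseModel, ConfigDict, Field


class _Client:
    """Stand-in for a non-pydantic dependency such as AsyncLlamaStackClient."""


class _Context(BaseModel):
    # Without arbitrary_types_allowed=True, pydantic rejects _Client as a field
    # type because it defines no validation schema of its own.
    model_config = ConfigDict(arbitrary_types_allowed=True)

    client: _Client = Field(description="Injected client instance")
    input_text: str = Field(description="Extracted user input text")


ctx = _Context(client=_Client(), input_text="How do I restart the cluster?")
```
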
27 changes: 0 additions & 27 deletions src/models/requests.py
@@ -21,12 +21,8 @@
from llama_stack_api.openai_responses import (
OpenAIResponseText as Text,
)
from llama_stack_api.openai_responses import (
OpenAIResponseToolMCP as OutputToolMCP,
)
from pydantic import BaseModel, ConfigDict, Field, field_validator, model_validator

from configuration import configuration
from constants import (
MCP_AUTH_CLIENT,
MCP_AUTH_KUBERNETES,
@@ -38,7 +34,6 @@
)
from log import get_logger
from utils import suid
from utils.tool_formatter import translate_vector_store_ids_to_user_facing
from utils.types import IncludeParameter, ResponseInput

logger = get_logger(__name__)
@@ -867,28 +862,6 @@ def check_previous_response_id(cls, value: Optional[str]) -> Optional[str]:
raise ValueError("You cannot provide context by moderation response.")
return value

def echoed_params(self) -> dict[str, Any]:
Contributor Author: Replaced under ResponsesApiParams instead of the request model.

"""Build kwargs echoed into synthetic OpenAI-style responses (e.g. moderation blocks).

Returns:
dict[str, Any]: Field names and values to merge into the response object.
"""
data = self.model_dump(include=_ECHOED_FIELDS)
if self.tools is not None:
tool_dicts: list[dict[str, Any]] = [
(
OutputToolMCP.model_validate(t.model_dump()).model_dump()
if t.type == "mcp"
else t.model_dump()
)
for t in self.tools
]
data["tools"] = translate_vector_store_ids_to_user_facing(
tool_dicts, configuration.rag_id_mapping
)

return data


class MCPServerRegistrationRequest(BaseModel):
"""Request model for dynamically registering an MCP server.
2 changes: 1 addition & 1 deletion src/utils/responses.py
@@ -91,6 +91,7 @@
from configuration import configuration
from constants import DEFAULT_RAG_TOOL
from log import get_logger
from models.common.responses.responses_api_params import ResponsesApiParams
from models.config import ByokRag
from models.database.conversations import UserConversation
from models.requests import QueryRequest
@@ -118,7 +119,6 @@
ReferencedDocument,
ResponseInput,
ResponseItem,
ResponsesApiParams,
ToolCallSummary,
ToolResultSummary,
TurnSummary,