Skip to content
Open
Show file tree
Hide file tree
Changes from all commits
Commits
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
9 changes: 9 additions & 0 deletions AGENTS.md
Original file line number Diff line number Diff line change
Expand Up @@ -193,6 +193,15 @@ uv run make test-e2e # End-to-end tests
- **SQLAlchemy**: Database ORM
- **Kubernetes**: K8s auth integration

## Pull Request Requirements

**PR titles MUST start with a JIRA issue key prefix.** CI enforces this via `pr-title-checker` (config: `.github/pr-title-checker-config.json`).

Allowed prefixes: `LCORE-`, `RSPEED-`, `MGTM-`, `OLS-`, `RHDHPAI-`, `LEADS-`

- ✅ `RSPEED-2849: add user_agent to ResponsesEventData`
- ❌ `feat(observability): add user_agent to ResponsesEventData`

## Development Workflow
1. Use `uv sync --group dev --group llslibdev` for dependencies
2. Always use `uv run` prefix for commands
Expand Down
14 changes: 11 additions & 3 deletions src/app/endpoints/query.py
Original file line number Diff line number Diff line change
Expand Up @@ -170,9 +170,10 @@ async def query_endpoint_handler(

# Moderation input is the raw user content (query + attachments) without injected RAG
# context, to avoid false positives from retrieved document content.
endpoint_path = "/v1/query"
moderation_input = prepare_input(query_request)
moderation_result = await run_shield_moderation(
client, moderation_input, query_request.shield_ids
client, moderation_input, endpoint_path, query_request.shield_ids
)

# Build RAG context from Inline RAG sources
Expand Down Expand Up @@ -207,7 +208,9 @@ async def query_endpoint_handler(
client = await update_azure_token(client)

# Retrieve response using Responses API
turn_summary = await retrieve_response(client, responses_params, moderation_result)
turn_summary = await retrieve_response(
client, responses_params, moderation_result, endpoint_path
)

if moderation_result.decision == "passed":
# Combine inline RAG results (BYOK + Solr) with tool-based RAG results for the transcript
Expand Down Expand Up @@ -280,6 +283,7 @@ async def retrieve_response(
client: AsyncLlamaStackClient,
responses_params: ResponsesApiParams,
moderation_result: ShieldModerationResult,
endpoint_path: str = "",
) -> TurnSummary:
"""
Retrieve response from LLMs and agents.
Expand Down Expand Up @@ -332,5 +336,9 @@ async def retrieve_response(
vector_store_ids = extract_vector_store_ids_from_tools(responses_params.tools)
rag_id_mapping = configuration.rag_id_mapping
return build_turn_summary(
response, responses_params.model, vector_store_ids, rag_id_mapping
response,
responses_params.model,
endpoint_path,
vector_store_ids,
rag_id_mapping,
)
61 changes: 58 additions & 3 deletions src/app/endpoints/responses.py
Original file line number Diff line number Diff line change
Expand Up @@ -6,7 +6,7 @@
import json
from collections.abc import AsyncIterator
from datetime import UTC, datetime
from typing import Annotated, Any, Optional, cast
from typing import Annotated, Any, Final, Optional, cast

from fastapi import APIRouter, BackgroundTasks, Depends, HTTPException, Request
from fastapi.responses import StreamingResponse
Expand Down Expand Up @@ -112,6 +112,30 @@
logger = get_logger(__name__)
router = APIRouter(tags=["responses"])

_USER_AGENT_MAX_LENGTH: Final[int] = 128


def _get_user_agent(request: Request) -> Optional[str]:
    """Extract and sanitize the User-Agent header from the request.

    Strips every ASCII control character — the C0 range (0x00-0x1F,
    which already includes ``\\r`` and ``\\n``) and DEL (0x7F) — and
    truncates the result to ``_USER_AGENT_MAX_LENGTH`` characters.
    Returns ``None`` when the header is absent, empty, or consists
    entirely of control characters.

    Note: the previous implementation's explicit ``\\r``/``\\n`` check
    was redundant (both are below 0x20) and it let DEL (0x7F) through.

    Args:
        request: The FastAPI request object.

    Returns:
        Sanitized User-Agent string, or None if nothing usable remains.
    """
    raw = request.headers.get("User-Agent", "")
    if not raw:
        return None
    # Keep only non-control characters: drop C0 controls and DEL.
    sanitized = "".join(c for c in raw if ord(c) >= 32 and ord(c) != 127)
    # Truncate, and collapse an all-control-character header to None.
    return sanitized[:_USER_AGENT_MAX_LENGTH] or None


responses_response: dict[int | str, dict[str, Any]] = {
200: ResponsesResponse.openapi_response(),
401: UnauthorizedResponse.openapi_response(
Expand Down Expand Up @@ -153,6 +177,7 @@ def _queue_responses_splunk_event( # pylint: disable=too-many-arguments,too-man
input_tokens: int = 0,
output_tokens: int = 0,
fire_and_forget: bool = False,
user_agent: Optional[str] = None,
) -> None:
"""Build and queue a Splunk telemetry event for the responses endpoint.

Expand All @@ -173,6 +198,7 @@ def _queue_responses_splunk_event( # pylint: disable=too-many-arguments,too-man
fire_and_forget: When True, dispatch via asyncio.create_task() instead
of background_tasks. Use for error paths where an HTTPException
follows, since FastAPI discards BackgroundTasks on non-2xx responses.
user_agent: Sanitized User-Agent string from the request header, or None.
"""
if not fire_and_forget and background_tasks is None:
return
Expand All @@ -187,6 +213,7 @@ def _queue_responses_splunk_event( # pylint: disable=too-many-arguments,too-man
inference_time=inference_time,
input_tokens=input_tokens,
output_tokens=output_tokens,
user_agent=user_agent,
)
event = build_responses_event(event_data)
if fire_and_forget:
Expand Down Expand Up @@ -304,9 +331,11 @@ async def responses_endpoint_handler(
)
attachments_text = extract_attachments_text(original_request.input)

endpoint_path = "/v1/responses"
moderation_result = await run_shield_moderation(
client,
input_text + "\n\n" + attachments_text,
endpoint_path,
original_request.shield_ids,
)

Expand Down Expand Up @@ -360,6 +389,8 @@ async def responses_endpoint_handler(
filter_server_tools=filter_server_tools,
background_tasks=background_tasks,
rh_identity_context=rh_identity_context,
user_agent=_get_user_agent(request),
endpoint_path=endpoint_path,
)


Expand All @@ -375,6 +406,8 @@ async def handle_streaming_response(
filter_server_tools: bool = False,
background_tasks: Optional[BackgroundTasks] = None,
rh_identity_context: tuple[str, str] = ("", ""),
user_agent: Optional[str] = None,
endpoint_path: str = "",
) -> StreamingResponse:
"""Handle streaming response from Responses API.

Expand All @@ -389,6 +422,7 @@ async def handle_streaming_response(
filter_server_tools: Whether to filter server-deployed MCP tool events from the stream
background_tasks: FastAPI background task manager for telemetry events
rh_identity_context: Tuple of (org_id, system_id) from RH identity
user_agent: Sanitized User-Agent string from request headers, or None.
Returns:
StreamingResponse with SSE-formatted events
"""
Expand Down Expand Up @@ -424,6 +458,7 @@ async def handle_streaming_response(
rh_identity_context=rh_identity_context,
inference_time=(datetime.now(UTC) - started_at).total_seconds(),
sourcetype="responses_shield_blocked",
user_agent=user_agent,
)
else:
try:
Expand All @@ -439,6 +474,7 @@ async def handle_streaming_response(
turn_summary=turn_summary,
inline_rag_context=inline_rag_context,
filter_server_tools=filter_server_tools,
endpoint_path=endpoint_path,
)
except RuntimeError as e: # library mode wraps 413 into runtime error
if is_context_length_error(str(e)):
Expand All @@ -452,6 +488,7 @@ async def handle_streaming_response(
inference_time=(datetime.now(UTC) - started_at).total_seconds(),
sourcetype="responses_error",
fire_and_forget=True,
user_agent=user_agent,
)
error_response = PromptTooLongResponse(model=api_params.model)
raise HTTPException(**error_response.model_dump()) from e
Expand All @@ -467,6 +504,7 @@ async def handle_streaming_response(
inference_time=(datetime.now(UTC) - started_at).total_seconds(),
sourcetype="responses_error",
fire_and_forget=True,
user_agent=user_agent,
)
error_response = ServiceUnavailableResponse(
backend_name="Llama Stack",
Expand All @@ -484,6 +522,7 @@ async def handle_streaming_response(
inference_time=(datetime.now(UTC) - started_at).total_seconds(),
sourcetype="responses_error",
fire_and_forget=True,
user_agent=user_agent,
)
error_response = handle_known_apistatus_errors(e, api_params.model)
raise HTTPException(**error_response.model_dump()) from e
Expand All @@ -501,6 +540,7 @@ async def handle_streaming_response(
background_tasks=background_tasks,
rh_identity_context=rh_identity_context,
shield_blocked=(moderation_result.decision == "blocked"),
user_agent=user_agent,
),
media_type="text/event-stream",
)
Expand Down Expand Up @@ -763,6 +803,7 @@ async def response_generator(
turn_summary: TurnSummary,
inline_rag_context: RAGContext,
filter_server_tools: bool = False,
endpoint_path: str = "",
) -> AsyncIterator[str]:
"""Generate SSE-formatted streaming response with LCORE-enriched events.

Expand Down Expand Up @@ -838,7 +879,7 @@ async def response_generator(

# Extract and consume tokens if any were used
turn_summary.token_usage = extract_token_usage(
latest_response_object.usage, api_params.model
latest_response_object.usage, api_params.model, endpoint_path
)
consume_query_tokens(
user_id=user_id,
Expand Down Expand Up @@ -894,6 +935,7 @@ async def generate_response(
background_tasks: Optional[BackgroundTasks] = None,
rh_identity_context: tuple[str, str] = ("", ""),
shield_blocked: bool = False,
user_agent: Optional[str] = None,
) -> AsyncIterator[str]:
"""Stream the response from the generator and persist conversation details.

Expand All @@ -911,6 +953,7 @@ async def generate_response(
background_tasks: FastAPI background task manager for telemetry events
rh_identity_context: Tuple of (org_id, system_id) from RH identity
shield_blocked: Whether the request was blocked by a shield
user_agent: Sanitized User-Agent string from request headers, or None.
Yields:
SSE-formatted strings from the generator
"""
Expand Down Expand Up @@ -956,6 +999,7 @@ async def generate_response(
if turn_summary.token_usage
else 0
),
user_agent=user_agent,
)


Expand All @@ -971,6 +1015,8 @@ async def handle_non_streaming_response(
filter_server_tools: bool = False,
background_tasks: Optional[BackgroundTasks] = None,
rh_identity_context: tuple[str, str] = ("", ""),
user_agent: Optional[str] = None,
endpoint_path: str = "",
) -> ResponsesResponse:
"""Handle non-streaming response from Responses API.

Expand All @@ -986,6 +1032,7 @@ async def handle_non_streaming_response(
filter_server_tools: Whether to filter server-deployed MCP tool output
background_tasks: FastAPI background task manager for telemetry events
rh_identity_context: Tuple of (org_id, system_id) from RH identity
user_agent: Sanitized User-Agent string from request headers, or None.
Returns:
ResponsesResponse with the completed response
"""
Expand Down Expand Up @@ -1019,6 +1066,7 @@ async def handle_non_streaming_response(
rh_identity_context=rh_identity_context,
inference_time=(datetime.now(UTC) - started_at).total_seconds(),
sourcetype="responses_shield_blocked",
user_agent=user_agent,
)
else:
try:
Expand All @@ -1028,7 +1076,9 @@ async def handle_non_streaming_response(
**api_params.model_dump(exclude_none=True)
),
)
token_usage = extract_token_usage(api_response.usage, api_params.model)
token_usage = extract_token_usage(
api_response.usage, api_params.model, endpoint_path
)
logger.info("Consuming tokens")
consume_query_tokens(
user_id=user_id,
Expand Down Expand Up @@ -1057,6 +1107,7 @@ async def handle_non_streaming_response(
inference_time=(datetime.now(UTC) - started_at).total_seconds(),
sourcetype="responses_error",
fire_and_forget=True,
user_agent=user_agent,
)
error_response = PromptTooLongResponse(model=api_params.model)
raise HTTPException(**error_response.model_dump()) from e
Expand All @@ -1072,6 +1123,7 @@ async def handle_non_streaming_response(
inference_time=(datetime.now(UTC) - started_at).total_seconds(),
sourcetype="responses_error",
fire_and_forget=True,
user_agent=user_agent,
)
error_response = ServiceUnavailableResponse(
backend_name="Llama Stack",
Expand All @@ -1089,6 +1141,7 @@ async def handle_non_streaming_response(
inference_time=(datetime.now(UTC) - started_at).total_seconds(),
sourcetype="responses_error",
fire_and_forget=True,
user_agent=user_agent,
)
error_response = handle_known_apistatus_errors(e, api_params.model)
raise HTTPException(**error_response.model_dump()) from e
Expand All @@ -1108,6 +1161,7 @@ async def handle_non_streaming_response(
turn_summary = build_turn_summary(
api_response,
api_params.model,
endpoint_path,
vector_store_ids,
configuration.rag_id_mapping,
filter_server_tools=filter_server_tools,
Expand Down Expand Up @@ -1135,6 +1189,7 @@ async def handle_non_streaming_response(
if turn_summary.token_usage
else 0
),
user_agent=user_agent,
)
if api_params.store:
store_query_results(
Expand Down
Loading
Loading