Skip to content
Open
Show file tree
Hide file tree
Changes from all commits
Commits
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
9 changes: 9 additions & 0 deletions AGENTS.md
Original file line number Diff line number Diff line change
Expand Up @@ -193,6 +193,15 @@ uv run make test-e2e # End-to-end tests
- **SQLAlchemy**: Database ORM
- **Kubernetes**: K8s auth integration

## Pull Request Requirements

**PR titles MUST start with a JIRA issue key prefix.** CI enforces this via `pr-title-checker` (config: `.github/pr-title-checker-config.json`).

Allowed prefixes: `LCORE-`, `RSPEED-`, `MGTM-`, `OLS-`, `RHDHPAI-`, `LEADS-`

- ✅ `RSPEED-2849: add user_agent to ResponsesEventData`
- ❌ `feat(observability): add user_agent to ResponsesEventData`

## Development Workflow
1. Use `uv sync --group dev --group llslibdev` for dependencies
2. Always use `uv run` prefix for commands
Expand Down
14 changes: 11 additions & 3 deletions src/app/endpoints/query.py
Original file line number Diff line number Diff line change
Expand Up @@ -170,9 +170,10 @@ async def query_endpoint_handler(

# Moderation input is the raw user content (query + attachments) without injected RAG
# context, to avoid false positives from retrieved document content.
endpoint_path = "/v1/query"
moderation_input = prepare_input(query_request)
moderation_result = await run_shield_moderation(
client, moderation_input, query_request.shield_ids
client, moderation_input, endpoint_path, query_request.shield_ids
)

# Build RAG context from Inline RAG sources
Expand Down Expand Up @@ -207,7 +208,9 @@ async def query_endpoint_handler(
client = await update_azure_token(client)

# Retrieve response using Responses API
turn_summary = await retrieve_response(client, responses_params, moderation_result)
turn_summary = await retrieve_response(
client, responses_params, moderation_result, endpoint_path
)

if moderation_result.decision == "passed":
# Combine inline RAG results (BYOK + Solr) with tool-based RAG results for the transcript
Expand Down Expand Up @@ -280,6 +283,7 @@ async def retrieve_response(
client: AsyncLlamaStackClient,
responses_params: ResponsesApiParams,
moderation_result: ShieldModerationResult,
endpoint_path: str = "",
) -> TurnSummary:
"""
Retrieve response from LLMs and agents.
Expand Down Expand Up @@ -332,5 +336,9 @@ async def retrieve_response(
vector_store_ids = extract_vector_store_ids_from_tools(responses_params.tools)
rag_id_mapping = configuration.rag_id_mapping
return build_turn_summary(
response, responses_params.model, vector_store_ids, rag_id_mapping
response,
responses_params.model,
endpoint_path,
vector_store_ids,
rag_id_mapping,
)
61 changes: 58 additions & 3 deletions src/app/endpoints/responses.py
Original file line number Diff line number Diff line change
Expand Up @@ -6,7 +6,7 @@
import json
from collections.abc import AsyncIterator
from datetime import UTC, datetime
from typing import Annotated, Any, Optional, cast
from typing import Annotated, Any, Final, Optional, cast

from fastapi import APIRouter, BackgroundTasks, Depends, HTTPException, Request
from fastapi.responses import StreamingResponse
Expand Down Expand Up @@ -112,6 +112,30 @@
logger = get_logger(__name__)
router = APIRouter(tags=["responses"])

_USER_AGENT_MAX_LENGTH: Final[int] = 128


def _get_user_agent(request: Request) -> Optional[str]:
    """Extract and sanitize the User-Agent header from the request.

    Strips every ASCII control character — the C0 range (0x00-0x1F,
    which already includes ``\\r`` and ``\\n``) and DEL (0x7F) — and
    truncates the result to ``_USER_AGENT_MAX_LENGTH`` characters.
    Returns ``None`` when the header is absent, empty, or consists
    entirely of control characters.

    Note: the previous implementation's explicit ``\\r``/``\\n`` check
    was redundant (both are below 0x20) and it let DEL (0x7F) through.

    Args:
        request: The FastAPI request object.

    Returns:
        Sanitized User-Agent string, or None if nothing usable remains.
    """
    raw = request.headers.get("User-Agent", "")
    if not raw:
        return None
    # Keep only non-control characters: drop C0 controls and DEL.
    sanitized = "".join(c for c in raw if ord(c) >= 32 and ord(c) != 127)
    # Truncate, and collapse an all-control-character header to None.
    return sanitized[:_USER_AGENT_MAX_LENGTH] or None


responses_response: dict[int | str, dict[str, Any]] = {
200: ResponsesResponse.openapi_response(),
401: UnauthorizedResponse.openapi_response(
Expand Down Expand Up @@ -153,6 +177,7 @@ def _queue_responses_splunk_event( # pylint: disable=too-many-arguments,too-man
input_tokens: int = 0,
output_tokens: int = 0,
fire_and_forget: bool = False,
user_agent: Optional[str] = None,
) -> None:
"""Build and queue a Splunk telemetry event for the responses endpoint.

Expand All @@ -173,6 +198,7 @@ def _queue_responses_splunk_event( # pylint: disable=too-many-arguments,too-man
fire_and_forget: When True, dispatch via asyncio.create_task() instead
of background_tasks. Use for error paths where an HTTPException
follows, since FastAPI discards BackgroundTasks on non-2xx responses.
user_agent: Sanitized User-Agent string from the request header, or None.
"""
if not fire_and_forget and background_tasks is None:
return
Expand All @@ -187,6 +213,7 @@ def _queue_responses_splunk_event( # pylint: disable=too-many-arguments,too-man
inference_time=inference_time,
input_tokens=input_tokens,
output_tokens=output_tokens,
user_agent=user_agent,
)
event = build_responses_event(event_data)
if fire_and_forget:
Expand Down Expand Up @@ -304,9 +331,11 @@ async def responses_endpoint_handler(
)
attachments_text = extract_attachments_text(original_request.input)

endpoint_path = "/v1/responses"
moderation_result = await run_shield_moderation(
client,
input_text + "\n\n" + attachments_text,
endpoint_path,
original_request.shield_ids,
)

Expand Down Expand Up @@ -360,6 +389,8 @@ async def responses_endpoint_handler(
filter_server_tools=filter_server_tools,
background_tasks=background_tasks,
rh_identity_context=rh_identity_context,
user_agent=_get_user_agent(request),
endpoint_path=endpoint_path,
)


Expand All @@ -375,6 +406,8 @@ async def handle_streaming_response(
filter_server_tools: bool = False,
background_tasks: Optional[BackgroundTasks] = None,
rh_identity_context: tuple[str, str] = ("", ""),
user_agent: Optional[str] = None,
endpoint_path: str = "",
) -> StreamingResponse:
"""Handle streaming response from Responses API.

Expand All @@ -389,6 +422,7 @@ async def handle_streaming_response(
filter_server_tools: Whether to filter server-deployed MCP tool events from the stream
background_tasks: FastAPI background task manager for telemetry events
rh_identity_context: Tuple of (org_id, system_id) from RH identity
user_agent: Sanitized User-Agent string from request headers, or None.
Returns:
StreamingResponse with SSE-formatted events
"""
Expand Down Expand Up @@ -424,6 +458,7 @@ async def handle_streaming_response(
rh_identity_context=rh_identity_context,
inference_time=(datetime.now(UTC) - started_at).total_seconds(),
sourcetype="responses_shield_blocked",
user_agent=user_agent,
)
else:
try:
Expand All @@ -439,6 +474,7 @@ async def handle_streaming_response(
turn_summary=turn_summary,
inline_rag_context=inline_rag_context,
filter_server_tools=filter_server_tools,
endpoint_path=endpoint_path,
)
except RuntimeError as e: # library mode wraps 413 into runtime error
if is_context_length_error(str(e)):
Expand All @@ -452,6 +488,7 @@ async def handle_streaming_response(
inference_time=(datetime.now(UTC) - started_at).total_seconds(),
sourcetype="responses_error",
fire_and_forget=True,
user_agent=user_agent,
)
error_response = PromptTooLongResponse(model=api_params.model)
raise HTTPException(**error_response.model_dump()) from e
Expand All @@ -467,6 +504,7 @@ async def handle_streaming_response(
inference_time=(datetime.now(UTC) - started_at).total_seconds(),
sourcetype="responses_error",
fire_and_forget=True,
user_agent=user_agent,
)
error_response = ServiceUnavailableResponse(
backend_name="Llama Stack",
Expand All @@ -484,6 +522,7 @@ async def handle_streaming_response(
inference_time=(datetime.now(UTC) - started_at).total_seconds(),
sourcetype="responses_error",
fire_and_forget=True,
user_agent=user_agent,
)
error_response = handle_known_apistatus_errors(e, api_params.model)
raise HTTPException(**error_response.model_dump()) from e
Expand All @@ -501,6 +540,7 @@ async def handle_streaming_response(
background_tasks=background_tasks,
rh_identity_context=rh_identity_context,
shield_blocked=(moderation_result.decision == "blocked"),
user_agent=user_agent,
),
media_type="text/event-stream",
)
Expand Down Expand Up @@ -763,6 +803,7 @@ async def response_generator(
turn_summary: TurnSummary,
inline_rag_context: RAGContext,
filter_server_tools: bool = False,
endpoint_path: str = "",
) -> AsyncIterator[str]:
"""Generate SSE-formatted streaming response with LCORE-enriched events.

Expand Down Expand Up @@ -838,7 +879,7 @@ async def response_generator(

# Extract and consume tokens if any were used
turn_summary.token_usage = extract_token_usage(
latest_response_object.usage, api_params.model
latest_response_object.usage, api_params.model, endpoint_path
)
consume_query_tokens(
user_id=user_id,
Expand Down Expand Up @@ -894,6 +935,7 @@ async def generate_response(
background_tasks: Optional[BackgroundTasks] = None,
rh_identity_context: tuple[str, str] = ("", ""),
shield_blocked: bool = False,
user_agent: Optional[str] = None,
) -> AsyncIterator[str]:
"""Stream the response from the generator and persist conversation details.

Expand All @@ -911,6 +953,7 @@ async def generate_response(
background_tasks: FastAPI background task manager for telemetry events
rh_identity_context: Tuple of (org_id, system_id) from RH identity
shield_blocked: Whether the request was blocked by a shield
user_agent: Sanitized User-Agent string from request headers, or None.
Yields:
SSE-formatted strings from the generator
"""
Expand Down Expand Up @@ -956,6 +999,7 @@ async def generate_response(
if turn_summary.token_usage
else 0
),
user_agent=user_agent,
)


Expand All @@ -971,6 +1015,8 @@ async def handle_non_streaming_response(
filter_server_tools: bool = False,
background_tasks: Optional[BackgroundTasks] = None,
rh_identity_context: tuple[str, str] = ("", ""),
user_agent: Optional[str] = None,
endpoint_path: str = "",
) -> ResponsesResponse:
"""Handle non-streaming response from Responses API.

Expand All @@ -986,6 +1032,7 @@ async def handle_non_streaming_response(
filter_server_tools: Whether to filter server-deployed MCP tool output
background_tasks: FastAPI background task manager for telemetry events
rh_identity_context: Tuple of (org_id, system_id) from RH identity
user_agent: Sanitized User-Agent string from request headers, or None.
Returns:
ResponsesResponse with the completed response
"""
Expand Down Expand Up @@ -1019,6 +1066,7 @@ async def handle_non_streaming_response(
rh_identity_context=rh_identity_context,
inference_time=(datetime.now(UTC) - started_at).total_seconds(),
sourcetype="responses_shield_blocked",
user_agent=user_agent,
)
else:
try:
Expand All @@ -1028,7 +1076,9 @@ async def handle_non_streaming_response(
**api_params.model_dump(exclude_none=True)
),
)
token_usage = extract_token_usage(api_response.usage, api_params.model)
token_usage = extract_token_usage(
api_response.usage, api_params.model, endpoint_path
)
logger.info("Consuming tokens")
consume_query_tokens(
user_id=user_id,
Expand Down Expand Up @@ -1057,6 +1107,7 @@ async def handle_non_streaming_response(
inference_time=(datetime.now(UTC) - started_at).total_seconds(),
sourcetype="responses_error",
fire_and_forget=True,
user_agent=user_agent,
)
error_response = PromptTooLongResponse(model=api_params.model)
raise HTTPException(**error_response.model_dump()) from e
Expand All @@ -1072,6 +1123,7 @@ async def handle_non_streaming_response(
inference_time=(datetime.now(UTC) - started_at).total_seconds(),
sourcetype="responses_error",
fire_and_forget=True,
user_agent=user_agent,
)
error_response = ServiceUnavailableResponse(
backend_name="Llama Stack",
Expand All @@ -1089,6 +1141,7 @@ async def handle_non_streaming_response(
inference_time=(datetime.now(UTC) - started_at).total_seconds(),
sourcetype="responses_error",
fire_and_forget=True,
user_agent=user_agent,
)
error_response = handle_known_apistatus_errors(e, api_params.model)
raise HTTPException(**error_response.model_dump()) from e
Expand All @@ -1108,6 +1161,7 @@ async def handle_non_streaming_response(
turn_summary = build_turn_summary(
api_response,
api_params.model,
endpoint_path,
vector_store_ids,
configuration.rag_id_mapping,
filter_server_tools=filter_server_tools,
Expand Down Expand Up @@ -1135,6 +1189,7 @@ async def handle_non_streaming_response(
if turn_summary.token_usage
else 0
),
user_agent=user_agent,
)
if api_params.store:
store_query_results(
Expand Down
Loading
Loading