From 69ee15bd6388a8e5fd9e0c0f7d3cd89d2eba2093 Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?J=C3=B3n=20Levy?= Date: Wed, 26 Nov 2025 15:09:45 +0000 Subject: [PATCH 1/8] fix(agentcore): simplify agentcore streaming --- .../bedrock/chat/agentcore/sse_iterator.py | 252 ---------------- .../bedrock/chat/agentcore/transformation.py | 281 ++++++++++++++---- requirements.txt | 4 +- .../llm_translation/test_bedrock_agentcore.py | 207 ++++++++++++- 4 files changed, 431 insertions(+), 313 deletions(-) delete mode 100644 litellm/llms/bedrock/chat/agentcore/sse_iterator.py diff --git a/litellm/llms/bedrock/chat/agentcore/sse_iterator.py b/litellm/llms/bedrock/chat/agentcore/sse_iterator.py deleted file mode 100644 index 90c5ada769f0..000000000000 --- a/litellm/llms/bedrock/chat/agentcore/sse_iterator.py +++ /dev/null @@ -1,252 +0,0 @@ -""" -SSE Stream Iterator for Bedrock AgentCore. - -Handles Server-Sent Events (SSE) streaming responses from AgentCore. -""" - -import json -from typing import TYPE_CHECKING, Any, Optional - -import httpx - -from litellm._logging import verbose_logger -from litellm._uuid import uuid -from litellm.types.llms.bedrock_agentcore import AgentCoreUsage -from litellm.types.utils import Delta, ModelResponse, StreamingChoices, Usage - -if TYPE_CHECKING: - pass - - -class AgentCoreSSEStreamIterator: - """ - Iterator for AgentCore SSE streaming responses. - Supports both sync and async iteration. - - CRITICAL: The line iterators are created lazily on first access and reused. - We must NOT create new iterators in __aiter__/__iter__ because - CustomStreamWrapper calls __aiter__ on every call to its __anext__, - which would create new iterators and cause StreamConsumed errors. - """ - - def __init__(self, response: httpx.Response, model: str): - self.response = response - self.model = model - self.finished = False - self._sync_iter: Any = None - self._async_iter: Any = None - self._sync_iter_initialized = False - self._async_iter_initialized = False - - def __iter__(self): - """Initialize sync iteration - create iterator lazily on first call only.""" - if not self._sync_iter_initialized: - self._sync_iter = iter(self.response.iter_lines()) - self._sync_iter_initialized = True - return self - - def __aiter__(self): - """Initialize async iteration - create iterator lazily on first call only.""" - if not self._async_iter_initialized: - self._async_iter = self.response.aiter_lines().__aiter__() - self._async_iter_initialized = True - return self - - def _parse_sse_line(self, line: str) -> Optional[ModelResponse]: - """ - Parse a single SSE line and return a ModelResponse chunk if applicable. 
- - AgentCore SSE format: - - data: {"event": {"contentBlockDelta": {"delta": {"text": "..."}}}} - - data: {"event": {"metadata": {"usage": {...}}}} - - data: {"message": {...}} - """ - line = line.strip() - if not line or not line.startswith("data:"): - return None - - json_str = line[5:].strip() - if not json_str: - return None - - try: - data = json.loads(json_str) - - # Skip non-dict data (some lines contain Python repr strings) - if not isinstance(data, dict): - return None - - # Process content delta events - if "event" in data and isinstance(data["event"], dict): - event_payload = data["event"] - content_block_delta = event_payload.get("contentBlockDelta") - - if content_block_delta: - delta = content_block_delta.get("delta", {}) - text = delta.get("text", "") - - if text: - # Return chunk with text - chunk = ModelResponse( - id=f"chatcmpl-{uuid.uuid4()}", - created=0, - model=self.model, - object="chat.completion.chunk", - ) - - chunk.choices = [ - StreamingChoices( - finish_reason=None, - index=0, - delta=Delta(content=text, role="assistant"), - ) - ] - - return chunk - - # Check for metadata/usage - this signals the end - metadata = event_payload.get("metadata") - if metadata and "usage" in metadata: - chunk = ModelResponse( - id=f"chatcmpl-{uuid.uuid4()}", - created=0, - model=self.model, - object="chat.completion.chunk", - ) - - chunk.choices = [ - StreamingChoices( - finish_reason="stop", - index=0, - delta=Delta(), - ) - ] - - usage_data: AgentCoreUsage = metadata["usage"] # type: ignore - setattr( - chunk, - "usage", - Usage( - prompt_tokens=usage_data.get("inputTokens", 0), - completion_tokens=usage_data.get("outputTokens", 0), - total_tokens=usage_data.get("totalTokens", 0), - ), - ) - - self.finished = True - return chunk - - # Check for final message (alternative finish signal) - if "message" in data and isinstance(data["message"], dict): - if not self.finished: - chunk = ModelResponse( - id=f"chatcmpl-{uuid.uuid4()}", - created=0, - model=self.model, - object="chat.completion.chunk", - ) - - chunk.choices = [ - StreamingChoices( - finish_reason="stop", - index=0, - delta=Delta(), - ) - ] - - self.finished = True - return chunk - - except json.JSONDecodeError: - verbose_logger.debug(f"Skipping non-JSON SSE line: {line[:100]}") - - return None - - def _create_final_chunk(self) -> ModelResponse: - """Create a final chunk to signal stream completion.""" - chunk = ModelResponse( - id=f"chatcmpl-{uuid.uuid4()}", - created=0, - model=self.model, - object="chat.completion.chunk", - ) - - chunk.choices = [ - StreamingChoices( - finish_reason="stop", - index=0, - delta=Delta(), - ) - ] - - return chunk - - def __next__(self) -> ModelResponse: - """ - Sync iteration - parse SSE events and yield ModelResponse chunks. - - Uses next() on the stored iterator to properly resume between calls. 
- """ - try: - if self._sync_iter is None: - raise StopIteration - - # Keep getting lines until we have a result to return - while True: - try: - line = next(self._sync_iter) - except StopIteration: - # Stream ended - send final chunk if not already finished - if not self.finished: - self.finished = True - return self._create_final_chunk() - raise - - result = self._parse_sse_line(line) - if result is not None: - return result - - except StopIteration: - raise - except httpx.StreamConsumed: - raise StopIteration - except httpx.StreamClosed: - raise StopIteration - except Exception as e: - verbose_logger.error(f"Error in AgentCore SSE stream: {str(e)}") - raise StopIteration - - async def __anext__(self) -> ModelResponse: - """ - Async iteration - parse SSE events and yield ModelResponse chunks. - - Uses __anext__() on the stored iterator to properly resume between calls. - """ - try: - if self._async_iter is None: - raise StopAsyncIteration - - # Keep getting lines until we have a result to return - while True: - try: - line = await self._async_iter.__anext__() - except StopAsyncIteration: - # Stream ended - send final chunk if not already finished - if not self.finished: - self.finished = True - return self._create_final_chunk() - raise - - result = self._parse_sse_line(line) - if result is not None: - return result - - except StopAsyncIteration: - raise - except httpx.StreamConsumed: - raise StopAsyncIteration - except httpx.StreamClosed: - raise StopAsyncIteration - except Exception as e: - verbose_logger.error(f"Error in AgentCore SSE stream: {str(e)}") - raise StopAsyncIteration diff --git a/litellm/llms/bedrock/chat/agentcore/transformation.py b/litellm/llms/bedrock/chat/agentcore/transformation.py index 7c65cad94df0..c04e416cfc8c 100644 --- a/litellm/llms/bedrock/chat/agentcore/transformation.py +++ b/litellm/llms/bedrock/chat/agentcore/transformation.py @@ -5,6 +5,7 @@ """ import json +from collections.abc import AsyncGenerator from typing import TYPE_CHECKING, Any, Dict, List, Optional, Tuple, Union, cast from urllib.parse import quote @@ -17,7 +18,6 @@ ) from litellm.llms.base_llm.chat.transformation import BaseConfig, BaseLLMException from litellm.llms.bedrock.base_aws_llm import BaseAWSLLM -from litellm.llms.bedrock.chat.agentcore.sse_iterator import AgentCoreSSEStreamIterator from litellm.llms.bedrock.common_utils import BedrockError from litellm.types.llms.bedrock_agentcore import ( AgentCoreMessage, @@ -25,7 +25,7 @@ AgentCoreUsage, ) from litellm.types.llms.openai import AllMessageValues -from litellm.types.utils import Choices, Message, ModelResponse, Usage +from litellm.types.utils import Choices, Delta, Message, ModelResponse, StreamingChoices, Usage if TYPE_CHECKING: from litellm.litellm_core_utils.litellm_logging import Logging as _LiteLLMLoggingObj @@ -116,7 +116,8 @@ def sign_request( fake_stream: Optional[bool] = None, ) -> Tuple[dict, Optional[bytes]]: # Check if api_key (bearer token) is provided for Cognito authentication - jwt_token = optional_params.get("api_key") + # Priority: api_key parameter first, then optional_params + jwt_token = api_key or optional_params.get("api_key") if jwt_token: verbose_logger.debug( f"AgentCore: Using Bearer token authentication (Cognito/JWT) - token: {jwt_token[:50]}..." 
@@ -437,22 +438,104 @@ def _parse_sse_stream(self, response_text: str) -> AgentCoreParsedResponse: content=content, usage=usage_data, final_message=final_message ) - def get_streaming_response( + def _stream_agentcore_response_sync( self, + response: httpx.Response, model: str, - raw_response: httpx.Response, - ) -> AgentCoreSSEStreamIterator: - """ - Return a streaming iterator for SSE responses. - - Args: - model: The model name - raw_response: Raw HTTP response with streaming data - - Returns: - AgentCoreSSEStreamIterator: Iterator that yields ModelResponse chunks - """ - return AgentCoreSSEStreamIterator(response=raw_response, model=model) + ): + """ + Internal sync generator that parses SSE and yields ModelResponse chunks. + """ + buffer = "" + for text_chunk in response.iter_text(): + buffer += text_chunk + + # Process complete lines + while '\n' in buffer: + line, buffer = buffer.split('\n', 1) + line = line.strip() + + if not line or not line.startswith('data:'): + continue + + json_str = line[5:].strip() + if not json_str: + continue + + try: + data_obj = json.loads(json_str) + if not isinstance(data_obj, dict): + continue + + # Process contentBlockDelta events + if "event" in data_obj and isinstance(data_obj["event"], dict): + event_payload = data_obj["event"] + content_block_delta = event_payload.get("contentBlockDelta") + + if content_block_delta: + delta = content_block_delta.get("delta", {}) + text = delta.get("text", "") + + if text: + chunk = ModelResponse( + id=f"chatcmpl-{uuid.uuid4()}", + created=0, + model=model, + object="chat.completion.chunk", + ) + chunk.choices = [ + StreamingChoices( + finish_reason=None, + index=0, + delta=Delta(content=text, role="assistant"), + ) + ] + yield chunk + + # Process metadata/usage + metadata = event_payload.get("metadata") + if metadata and "usage" in metadata: + chunk = ModelResponse( + id=f"chatcmpl-{uuid.uuid4()}", + created=0, + model=model, + object="chat.completion.chunk", + ) + chunk.choices = [ + StreamingChoices( + finish_reason="stop", + index=0, + delta=Delta(), + ) + ] + usage_data: AgentCoreUsage = metadata["usage"] # type: ignore + setattr(chunk, "usage", Usage( + prompt_tokens=usage_data.get("inputTokens", 0), + completion_tokens=usage_data.get("outputTokens", 0), + total_tokens=usage_data.get("totalTokens", 0), + )) + yield chunk + + # Process final message + if "message" in data_obj and isinstance(data_obj["message"], dict): + chunk = ModelResponse( + id=f"chatcmpl-{uuid.uuid4()}", + created=0, + model=model, + object="chat.completion.chunk", + ) + chunk.choices = [ + StreamingChoices( + finish_reason="stop", + index=0, + delta=Delta(), + ) + ] + yield chunk + + except json.JSONDecodeError: + verbose_logger.debug(f"Skipping non-JSON SSE line: {line[:100]}") + continue def get_sync_custom_stream_wrapper( self, @@ -466,17 +549,14 @@ def get_sync_custom_stream_wrapper( client: Optional[Union[HTTPHandler, "AsyncHTTPHandler"]] = None, json_mode: Optional[bool] = None, signed_json_body: Optional[bytes] = None, - ) -> CustomStreamWrapper: + ) -> "CustomStreamWrapper": """ - Get a CustomStreamWrapper for synchronous streaming. - - This is called when stream=True is passed to completion(). + Simplified sync streaming - returns a generator that yields ModelResponse chunks. 
""" from litellm.llms.custom_httpx.http_handler import ( HTTPHandler, _get_httpx_client, ) - from litellm.utils import CustomStreamWrapper if client is None or not isinstance(client, HTTPHandler): client = _get_httpx_client(params={}) @@ -488,7 +568,7 @@ def get_sync_custom_stream_wrapper( api_base, headers=headers, data=signed_json_body if signed_json_body else json.dumps(data), - stream=True, # THIS IS KEY - tells httpx to not buffer + stream=True, logging_obj=logging_obj, ) @@ -497,18 +577,6 @@ def get_sync_custom_stream_wrapper( status_code=response.status_code, message=str(response.read()) ) - # Create iterator for SSE stream - completion_stream = self.get_streaming_response( - model=model, raw_response=response - ) - - streaming_response = CustomStreamWrapper( - completion_stream=completion_stream, - model=model, - custom_llm_provider=custom_llm_provider, - logging_obj=logging_obj, - ) - # LOGGING logging_obj.post_call( input=messages, @@ -517,7 +585,114 @@ def get_sync_custom_stream_wrapper( additional_args={"complete_input_dict": data}, ) - return streaming_response + # Wrap the generator in CustomStreamWrapper + from litellm.litellm_core_utils.streaming_handler import CustomStreamWrapper + + return CustomStreamWrapper( + completion_stream=self._stream_agentcore_response_sync(response, model), + model=model, + custom_llm_provider="bedrock", + logging_obj=logging_obj, + ) + + async def _stream_agentcore_response( + self, + response: httpx.Response, + model: str, + ) -> AsyncGenerator[ModelResponse, None]: + """ + Internal async generator that parses SSE and yields ModelResponse chunks. + """ + buffer = "" + async for text_chunk in response.aiter_text(): + buffer += text_chunk + + # Process complete lines + while '\n' in buffer: + line, buffer = buffer.split('\n', 1) + line = line.strip() + + if not line or not line.startswith('data:'): + continue + + json_str = line[5:].strip() + if not json_str: + continue + + try: + data_obj = json.loads(json_str) + if not isinstance(data_obj, dict): + continue + + # Process contentBlockDelta events + if "event" in data_obj and isinstance(data_obj["event"], dict): + event_payload = data_obj["event"] + content_block_delta = event_payload.get("contentBlockDelta") + + if content_block_delta: + delta = content_block_delta.get("delta", {}) + text = delta.get("text", "") + + if text: + chunk = ModelResponse( + id=f"chatcmpl-{uuid.uuid4()}", + created=0, + model=model, + object="chat.completion.chunk", + ) + chunk.choices = [ + StreamingChoices( + finish_reason=None, + index=0, + delta=Delta(content=text, role="assistant"), + ) + ] + yield chunk + + # Process metadata/usage + metadata = event_payload.get("metadata") + if metadata and "usage" in metadata: + chunk = ModelResponse( + id=f"chatcmpl-{uuid.uuid4()}", + created=0, + model=model, + object="chat.completion.chunk", + ) + chunk.choices = [ + StreamingChoices( + finish_reason="stop", + index=0, + delta=Delta(), + ) + ] + usage_data: AgentCoreUsage = metadata["usage"] # type: ignore + setattr(chunk, "usage", Usage( + prompt_tokens=usage_data.get("inputTokens", 0), + completion_tokens=usage_data.get("outputTokens", 0), + total_tokens=usage_data.get("totalTokens", 0), + )) + yield chunk + + # Process final message + if "message" in data_obj and isinstance(data_obj["message"], dict): + chunk = ModelResponse( + id=f"chatcmpl-{uuid.uuid4()}", + created=0, + model=model, + object="chat.completion.chunk", + ) + chunk.choices = [ + StreamingChoices( + finish_reason="stop", + index=0, + delta=Delta(), + ) + ] 
+ yield chunk + + except json.JSONDecodeError: + verbose_logger.debug(f"Skipping non-JSON SSE line: {line[:100]}") + continue async def get_async_custom_stream_wrapper( self, @@ -531,17 +706,14 @@ async def get_async_custom_stream_wrapper( client: Optional["AsyncHTTPHandler"] = None, json_mode: Optional[bool] = None, signed_json_body: Optional[bytes] = None, - ) -> CustomStreamWrapper: + ) -> "CustomStreamWrapper": """ - Get a CustomStreamWrapper for asynchronous streaming. - - This is called when stream=True is passed to acompletion(). + Simplified async streaming - returns an async generator that yields ModelResponse chunks. """ from litellm.llms.custom_httpx.http_handler import ( AsyncHTTPHandler, get_async_httpx_client, ) - from litellm.utils import CustomStreamWrapper if client is None or not isinstance(client, AsyncHTTPHandler): client = get_async_httpx_client( @@ -555,7 +727,7 @@ async def get_async_custom_stream_wrapper( api_base, headers=headers, data=signed_json_body if signed_json_body else json.dumps(data), - stream=True, # THIS IS KEY - tells httpx to not buffer + stream=True, logging_obj=logging_obj, ) @@ -564,18 +736,6 @@ async def get_async_custom_stream_wrapper( status_code=response.status_code, message=str(await response.aread()) ) - # Create iterator for SSE stream - completion_stream = self.get_streaming_response( - model=model, raw_response=response - ) - - streaming_response = CustomStreamWrapper( - completion_stream=completion_stream, - model=model, - custom_llm_provider=custom_llm_provider, - logging_obj=logging_obj, - ) - # LOGGING logging_obj.post_call( input=messages, @@ -584,7 +744,15 @@ async def get_async_custom_stream_wrapper( additional_args={"complete_input_dict": data}, ) - return streaming_response + # Wrap the async generator in CustomStreamWrapper + from litellm.litellm_core_utils.streaming_handler import CustomStreamWrapper + + return CustomStreamWrapper( + completion_stream=self._stream_agentcore_response(response, model), + model=model, + custom_llm_provider="bedrock", + logging_obj=logging_obj, + ) @property def has_custom_stream_wrapper(self) -> bool: @@ -692,4 +860,5 @@ def should_fake_stream( stream: Optional[bool], custom_llm_provider: Optional[str] = None, ) -> bool: - return True + # AgentCore supports true streaming - don't buffer + return False diff --git a/requirements.txt b/requirements.txt index 5f00a269a7c4..3e1c510ec046 100644 --- a/requirements.txt +++ b/requirements.txt @@ -10,7 +10,7 @@ uvicorn==0.31.1 # server dep gunicorn==23.0.0 # server dep fastuuid==0.13.5 # for uuid4 uvloop==0.21.0 # uvicorn dep, gives us much better performance under load -boto3==1.36.0 # aws bedrock/sagemaker calls +boto3==1.40.53 # aws bedrock/sagemaker calls (has bedrock-agentcore-control, compatible with aioboto3) redis==5.2.1 # redis caching prisma==0.11.0 # for db nodejs-wheel-binaries==24.12.0 ## required by prisma for migrations, prevents runtime download (updated from nodejs-bin for security fixes) @@ -58,7 +58,7 @@ click==8.1.7 # for proxy cli rich==13.7.1 # for litellm proxy cli jinja2==3.1.6 # for prompt templates aiohttp==3.13.3 # for network calls -aioboto3==13.4.0 # for async sagemaker calls +aioboto3==15.5.0 # for async sagemaker calls (updated to match boto3 1.40.73) tenacity==8.5.0 # for retrying requests, when litellm.num_retries set pydantic>=2.11,<3 # proxy + openai req. 
+ mcp jsonschema>=4.23.0,<5.0.0 # validating json schema - aligned with openapi-core + mcp diff --git a/tests/llm_translation/test_bedrock_agentcore.py b/tests/llm_translation/test_bedrock_agentcore.py index 3afb01482ac3..3ea5b9bac064 100644 --- a/tests/llm_translation/test_bedrock_agentcore.py +++ b/tests/llm_translation/test_bedrock_agentcore.py @@ -12,10 +12,9 @@ ) import litellm -from unittest.mock import MagicMock, patch -import pytest - +from unittest.mock import MagicMock, Mock, patch import pytest +import httpx @pytest.mark.parametrize( "model", [ @@ -367,3 +366,205 @@ def test_bedrock_agentcore_without_api_key_uses_sigv4(): assert "X-Amzn-Bedrock-AgentCore-Runtime-Session-Id" in headers assert headers["X-Amzn-Bedrock-AgentCore-Runtime-Session-Id"] == "sigv4-test-session" + +def test_agentcore_parse_json_response(): + """ + Unit test for JSON response parsing (non-streaming) + Verifies that content-type: application/json responses are parsed correctly + """ + from litellm.llms.bedrock.chat.agentcore.transformation import AmazonAgentCoreConfig + + config = AmazonAgentCoreConfig() + + # Create a mock JSON response + mock_response = Mock(spec=httpx.Response) + mock_response.headers = {"content-type": "application/json"} + mock_response.json.return_value = { + "result": { + "role": "assistant", + "content": [{"text": "Hello from JSON response"}] + } + } + + # Parse the response + parsed = config._get_parsed_response(mock_response) + + # Verify content extraction + assert parsed["content"] == "Hello from JSON response" + # JSON responses don't include usage data + assert parsed["usage"] is None + # Final message should be the result object + assert parsed["final_message"] == mock_response.json.return_value["result"] + + +def test_agentcore_parse_sse_response(): + """ + Unit test for SSE response parsing (streaming response consumed as text) + Verifies that text/event-stream responses are parsed correctly + """ + from litellm.llms.bedrock.chat.agentcore.transformation import AmazonAgentCoreConfig + + config = AmazonAgentCoreConfig() + + # Create a mock SSE response with multiple events + sse_data = """data: {"event":{"contentBlockDelta":{"delta":{"text":"Hello "}}}} + +data: {"event":{"contentBlockDelta":{"delta":{"text":"from SSE"}}}} + +data: {"event":{"metadata":{"usage":{"inputTokens":10,"outputTokens":5,"totalTokens":15}}}} + +data: {"message":{"role":"assistant","content":[{"text":"Hello from SSE"}]}} +""" + + mock_response = Mock(spec=httpx.Response) + mock_response.headers = {"content-type": "text/event-stream"} + mock_response.text = sse_data + + # Parse the response + parsed = config._get_parsed_response(mock_response) + + # Verify content extraction from final message + assert parsed["content"] == "Hello from SSE" + # SSE responses can include usage data + assert parsed["usage"] is not None + assert parsed["usage"]["inputTokens"] == 10 + assert parsed["usage"]["outputTokens"] == 5 + assert parsed["usage"]["totalTokens"] == 15 + # Final message should be present + assert parsed["final_message"] is not None + assert parsed["final_message"]["role"] == "assistant" + + +def test_agentcore_parse_sse_response_without_final_message(): + """ + Unit test for SSE response parsing when only deltas are present (no final message) + """ + from litellm.llms.bedrock.chat.agentcore.transformation import AmazonAgentCoreConfig + + config = AmazonAgentCoreConfig() + + # Create a mock SSE response with only content deltas + sse_data = """data: {"event":{"contentBlockDelta":{"delta":{"text":"First 
"}}}} + +data: {"event":{"contentBlockDelta":{"delta":{"text":"second "}}}} + +data: {"event":{"contentBlockDelta":{"delta":{"text":"third"}}}} +""" + + mock_response = Mock(spec=httpx.Response) + mock_response.headers = {"content-type": "text/event-stream"} + mock_response.text = sse_data + + # Parse the response + parsed = config._get_parsed_response(mock_response) + + # Content should be concatenated from deltas + assert parsed["content"] == "First second third" + # No final message + assert parsed["final_message"] is None + + +def test_agentcore_transform_response_json(): + """ + Integration test for transform_response with JSON response + Verifies end-to-end transformation of JSON responses to ModelResponse + """ + from litellm.llms.bedrock.chat.agentcore.transformation import AmazonAgentCoreConfig + from litellm.types.utils import ModelResponse + + config = AmazonAgentCoreConfig() + + # Create mock JSON response + mock_response = Mock(spec=httpx.Response) + mock_response.headers = {"content-type": "application/json"} + mock_response.json.return_value = { + "result": { + "role": "assistant", + "content": [{"text": "Response from transform_response"}] + } + } + mock_response.status_code = 200 + + # Create model response + model_response = ModelResponse() + + # Mock logging object + mock_logging = MagicMock() + + # Transform the response + result = config.transform_response( + model="bedrock/agentcore/arn:aws:bedrock-agentcore:us-west-2:123456789012:runtime/test", + raw_response=mock_response, + model_response=model_response, + logging_obj=mock_logging, + request_data={}, + messages=[{"role": "user", "content": "test"}], + optional_params={}, + litellm_params={}, + encoding=None, + ) + + # Verify ModelResponse structure + assert len(result.choices) == 1 + assert result.choices[0].message.content == "Response from transform_response" + assert result.choices[0].message.role == "assistant" + assert result.choices[0].finish_reason == "stop" + assert result.choices[0].index == 0 + + +def test_agentcore_transform_response_sse(): + """ + Integration test for transform_response with SSE response + Verifies end-to-end transformation of SSE responses to ModelResponse + """ + from litellm.llms.bedrock.chat.agentcore.transformation import AmazonAgentCoreConfig + from litellm.types.utils import ModelResponse + + config = AmazonAgentCoreConfig() + + # Create mock SSE response + sse_data = """data: {"event":{"contentBlockDelta":{"delta":{"text":"SSE "}}}} + +data: {"event":{"contentBlockDelta":{"delta":{"text":"response"}}}} + +data: {"event":{"metadata":{"usage":{"inputTokens":20,"outputTokens":10,"totalTokens":30}}}} + +data: {"message":{"role":"assistant","content":[{"text":"SSE response"}]}} +""" + + mock_response = Mock(spec=httpx.Response) + mock_response.headers = {"content-type": "text/event-stream"} + mock_response.text = sse_data + mock_response.status_code = 200 + + # Create model response + model_response = ModelResponse() + + # Mock logging object + mock_logging = MagicMock() + + # Transform the response + result = config.transform_response( + model="bedrock/agentcore/arn:aws:bedrock-agentcore:us-west-2:123456789012:runtime/test", + raw_response=mock_response, + model_response=model_response, + logging_obj=mock_logging, + request_data={}, + messages=[{"role": "user", "content": "test"}], + optional_params={}, + litellm_params={}, + encoding=None, + ) + + # Verify ModelResponse structure + assert len(result.choices) == 1 + assert result.choices[0].message.content == "SSE response" + assert 
result.choices[0].message.role == "assistant" + assert result.choices[0].finish_reason == "stop" + + # Verify usage data from SSE metadata + assert hasattr(result, "usage") + assert result.usage.prompt_tokens == 20 + assert result.usage.completion_tokens == 10 + assert result.usage.total_tokens == 30 + From 91fa1163aa1b507e3ebc62bd043ad50ff03cf1f4 Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?J=C3=B3n=20Levy?= Date: Thu, 4 Dec 2025 10:55:44 +0000 Subject: [PATCH 2/8] fix(agentcore): move CustomStreamWrapper import to module level MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit The deferred imports inside streaming methods caused initialization delays during health check requests, leading to timeouts in ECS deployments. - Move CustomStreamWrapper import to module-level (line 19) - Remove deferred imports from get_sync_custom_stream_wrapper (line 588) - Remove deferred import from get_async_custom_stream_wrapper (line 747) - Remove from TYPE_CHECKING block to use actual import This ensures the import happens at module load time rather than during first request processing, preventing health check endpoint blocking. 🤖 Generated with [Claude Code](https://claude.com/claude-code) Co-Authored-By: Claude --- litellm/llms/bedrock/chat/agentcore/transformation.py | 7 +------ 1 file changed, 1 insertion(+), 6 deletions(-) diff --git a/litellm/llms/bedrock/chat/agentcore/transformation.py b/litellm/llms/bedrock/chat/agentcore/transformation.py index c04e416cfc8c..94e845e30958 100644 --- a/litellm/llms/bedrock/chat/agentcore/transformation.py +++ b/litellm/llms/bedrock/chat/agentcore/transformation.py @@ -16,6 +16,7 @@ from litellm.litellm_core_utils.prompt_templates.common_utils import ( convert_content_list_to_str, ) +from litellm.litellm_core_utils.streaming_handler import CustomStreamWrapper from litellm.llms.base_llm.chat.transformation import BaseConfig, BaseLLMException from litellm.llms.bedrock.base_aws_llm import BaseAWSLLM from litellm.llms.bedrock.common_utils import BedrockError @@ -30,14 +31,12 @@ if TYPE_CHECKING: from litellm.litellm_core_utils.litellm_logging import Logging as _LiteLLMLoggingObj from litellm.llms.custom_httpx.http_handler import AsyncHTTPHandler, HTTPHandler - from litellm.utils import CustomStreamWrapper LiteLLMLoggingObj = _LiteLLMLoggingObj else: LiteLLMLoggingObj = Any HTTPHandler = Any AsyncHTTPHandler = Any - CustomStreamWrapper = Any class AmazonAgentCoreConfig(BaseConfig, BaseAWSLLM): @@ -586,8 +585,6 @@ def get_sync_custom_stream_wrapper( ) # Wrap the generator in CustomStreamWrapper - from litellm.litellm_core_utils.streaming_handler import CustomStreamWrapper - return CustomStreamWrapper( completion_stream=self._stream_agentcore_response_sync(response, model), model=model, @@ -745,8 +742,6 @@ async def get_async_custom_stream_wrapper( ) # Wrap the async generator in CustomStreamWrapper - from litellm.litellm_core_utils.streaming_handler import CustomStreamWrapper - return CustomStreamWrapper( completion_stream=self._stream_agentcore_response(response, model), model=model, From 148f50d990cba11576a48d32a9704767f42c2562 Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?J=C3=B3n=20Levy?= Date: Thu, 4 Dec 2025 12:12:54 +0000 Subject: [PATCH 3/8] test(agentcore): ensure sync response --- .../llm_translation/test_bedrock_agentcore.py | 74 +++++++++++++++++++ 1 file changed, 74 insertions(+) diff --git a/tests/llm_translation/test_bedrock_agentcore.py b/tests/llm_translation/test_bedrock_agentcore.py index 3ea5b9bac064..f00ceb8eab4e 100644 
--- a/tests/llm_translation/test_bedrock_agentcore.py +++ b/tests/llm_translation/test_bedrock_agentcore.py @@ -568,3 +568,77 @@ def test_agentcore_transform_response_sse(): assert result.usage.completion_tokens == 10 assert result.usage.total_tokens == 30 + +def test_agentcore_synchronous_non_streaming_response(): + """ + Test that synchronous (non-streaming) AgentCore calls still work correctly + after streaming simplification changes. + + This test verifies: + 1. Synchronous completion calls work (stream=False or no stream param) + 2. Response is properly parsed and returned as ModelResponse + 3. Content is extracted correctly + 4. Usage data is calculated when not provided by API + + This is a regression test for the streaming simplification changes + to ensure we didn't break the non-streaming code path. + """ + from litellm.llms.custom_httpx.http_handler import HTTPHandler + + litellm._turn_on_debug() + client = HTTPHandler() + + # Mock a JSON response (typical for synchronous AgentCore calls) + mock_json_response = { + "result": { + "role": "assistant", + "content": [{"text": "This is a synchronous response from AgentCore."}] + } + } + + # Create a mock response object + mock_response = Mock(spec=httpx.Response) + mock_response.status_code = 200 + mock_response.headers = {"content-type": "application/json"} + mock_response.json.return_value = mock_json_response + + with patch.object(client, "post", return_value=mock_response) as mock_post: + # Make a synchronous (non-streaming) completion call + response = litellm.completion( + model="bedrock/agentcore/arn:aws:bedrock-agentcore:us-west-2:888602223428:runtime/hosted_agent_r9jvp-3ySZuRHjLC", + messages=[ + { + "role": "user", + "content": "Test synchronous response", + } + ], + stream=False, # Explicitly disable streaming + client=client, + ) + + # Verify the response structure + assert response is not None + assert hasattr(response, "choices") + assert len(response.choices) > 0 + + # Verify content + message = response.choices[0].message + assert message is not None + assert message.content == "This is a synchronous response from AgentCore." 
+ assert message.role == "assistant" + + # Verify completion metadata + assert response.choices[0].finish_reason == "stop" + assert response.choices[0].index == 0 + + # Verify usage data exists (either from API or calculated) + assert hasattr(response, "usage") + assert response.usage is not None + assert response.usage.prompt_tokens > 0 + assert response.usage.completion_tokens > 0 + assert response.usage.total_tokens > 0 + + print(f"Synchronous response: {response}") + print(f"Content: {message.content}") + print(f"Usage: prompt={response.usage.prompt_tokens}, completion={response.usage.completion_tokens}, total={response.usage.total_tokens}") + From b5163c3c83e6f73b818fe120dcac5c941416def5 Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?J=C3=B3n=20Levy?= Date: Thu, 4 Dec 2025 16:13:09 +0000 Subject: [PATCH 4/8] chore: upgrade boto3 to 1.40.76 in pyproject.toml --- pyproject.toml | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/pyproject.toml b/pyproject.toml index aa8e6fd97be4..61bb7a8d802d 100644 --- a/pyproject.toml +++ b/pyproject.toml @@ -56,7 +56,7 @@ google-cloud-iam = {version = "^2.19.1", optional = true} resend = {version = ">=0.8.0", optional = true} pynacl = {version = "^1.5.0", optional = true} websockets = {version = "^15.0.1", optional = true} -boto3 = {version = "1.36.0", optional = true} +boto3 = { version = "1.40.76", optional = true } redisvl = {version = "^0.4.1", optional = true, markers = "python_version >= '3.9' and python_version < '3.14'"} mcp = {version = "^1.21.2", optional = true, python = ">=3.10"} litellm-proxy-extras = {version = "0.4.21", optional = true} From 81e1789d43279aa89cc5f758813d4dfc5566783a Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?J=C3=B3n=20Levy?= Date: Thu, 4 Dec 2025 16:17:20 +0000 Subject: [PATCH 5/8] chore: added taplo.toml --- taplo.toml | 23 +++++++++++++++++++++++ 1 file changed, 23 insertions(+) create mode 100644 taplo.toml diff --git a/taplo.toml b/taplo.toml new file mode 100644 index 000000000000..e5065a696162 --- /dev/null +++ b/taplo.toml @@ -0,0 +1,23 @@ +[formatting] + +# Keep your table/key order as written (project, project.scripts, dependency-groups, tool.uv, ...) 
+reorder_keys = false + +# Force arrays to stay multiline once expanded +array_auto_expand = true +array_auto_collapse = false + +# Keep nice spacing inside arrays and inline tables +compact_arrays = false +compact_inline_tables = true + +# Don’t align `=` vertically (matches your example) +align_entries = true + +# Reasonable defaults +align_comments = true +trailing_newline = true +reorder_arrays = true +reorder_inline_tables = false +allowed_blank_lines = 2 +crlf = false From e8f7b2e697456b9da0115019e6f3318b083a716d Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?Benedikt=20=C3=93skarsson?= Date: Thu, 8 Jan 2026 00:44:59 +0000 Subject: [PATCH 6/8] fix(bedrock): handle thinking with tool calls for Claude 4 models --- .../bedrock/chat/converse_transformation.py | 32 +- litellm/llms/bedrock/chat/invoke_handler.py | 36 +- litellm/utils.py | 62 ++- .../chat/test_converse_transformation.py | 506 +++++++++++------- 4 files changed, 407 insertions(+), 229 deletions(-) diff --git a/litellm/llms/bedrock/chat/converse_transformation.py b/litellm/llms/bedrock/chat/converse_transformation.py index 59590e464fca..fd8c0d754730 100644 --- a/litellm/llms/bedrock/chat/converse_transformation.py +++ b/litellm/llms/bedrock/chat/converse_transformation.py @@ -53,7 +53,12 @@ PromptTokensDetailsWrapper, Usage, ) -from litellm.utils import add_dummy_tool, has_tool_call_blocks, supports_reasoning +from litellm.utils import ( + add_dummy_tool, + has_tool_call_blocks, + last_assistant_with_tool_calls_has_no_thinking_blocks, + supports_reasoning, +) from ..common_utils import ( BedrockError, @@ -773,7 +778,7 @@ def _translate_response_format_param( return optional_params """ - Follow similar approach to anthropic - translate to a single tool call. + Follow similar approach to anthropic - translate to a single tool call. When using tools in this way: - https://docs.anthropic.com/en/docs/build-with-claude/tool-use#json-mode - You usually want to provide a single tool @@ -1070,9 +1075,24 @@ def _transform_request_helper( llm_provider="bedrock", ) + # Drop thinking param if thinking is enabled but thinking_blocks are missing + # This prevents the error: "Expected thinking or redacted_thinking, but found tool_use" + # Related issues: https://github.com/BerriAI/litellm/issues/14194 + if ( + optional_params.get("thinking") is not None + and messages is not None + and last_assistant_with_tool_calls_has_no_thinking_blocks(messages) + ): + if litellm.modify_params: + optional_params.pop("thinking", None) + litellm.verbose_logger.warning( + "Dropping 'thinking' param because the last assistant message with tool_calls " + "has no thinking_blocks. The model won't use extended thinking for this turn." 
+ ) + # Prepare and separate parameters - inference_params, additional_request_params, request_metadata = ( - self._prepare_request_params(optional_params, model) + inference_params, additional_request_params, request_metadata = self._prepare_request_params( + optional_params, model ) original_tools = inference_params.pop("tools", []) @@ -1459,11 +1479,11 @@ def _transform_response( ) """ - Bedrock Response Object has optional message block + Bedrock Response Object has optional message block completion_response["output"].get("message", None) - A message block looks like this (Example 1): + A message block looks like this (Example 1): "output": { "message": { "role": "assistant", diff --git a/litellm/llms/bedrock/chat/invoke_handler.py b/litellm/llms/bedrock/chat/invoke_handler.py index 49292545208e..ea612da52c60 100644 --- a/litellm/llms/bedrock/chat/invoke_handler.py +++ b/litellm/llms/bedrock/chat/invoke_handler.py @@ -374,6 +374,29 @@ class BedrockLLM(BaseAWSLLM): def __init__(self) -> None: super().__init__() + @staticmethod + def is_claude_messages_api_model(model: str) -> bool: + """ + Check if the model uses the Claude Messages API (Claude 3+). + + Handles: + - Regional prefixes: eu.anthropic.claude-*, us.anthropic.claude-* + - Claude 3 models: claude-3-haiku, claude-3-sonnet, claude-3-opus, claude-3-5-*, claude-3-7-* + - Claude 4 models: claude-opus-4, claude-sonnet-4, claude-haiku-4 + """ + # Normalize model string to lowercase for matching + model_lower = model.lower() + + # Claude 3+ indicators (all use Messages API) + messages_api_indicators = [ + "claude-3", # Claude 3.x models + "claude-opus-4", # Claude Opus 4 + "claude-sonnet-4", # Claude Sonnet 4 + "claude-haiku-4", # Claude Haiku 4 + ] + + return any(indicator in model_lower for indicator in messages_api_indicators) + def convert_messages_to_prompt( self, model, messages, provider, custom_prompt_dict ) -> Tuple[str, Optional[list]]: @@ -465,7 +488,7 @@ def process_response( # noqa: PLR0915 completion_response["generations"][0]["finish_reason"] ) elif provider == "anthropic": - if model.startswith("anthropic.claude-3"): + if self.is_claude_messages_api_model(model): json_schemas: dict = {} _is_function_call = False ## Handle Tool Calling @@ -595,13 +618,12 @@ def process_response( # noqa: PLR0915 outputText = choice["message"].get("content") elif "text" in choice: # fallback for completion format outputText = choice["text"] - # Set finish reason if "finish_reason" in choice: model_response.choices[0].finish_reason = map_finish_reason( choice["finish_reason"] ) - + # Set usage if available if "usage" in completion_response: usage = completion_response["usage"] @@ -842,7 +864,7 @@ def completion( # noqa: PLR0915 ] = True # cohere requires stream = True in inference params data = json.dumps({"prompt": prompt, **inference_params}) elif provider == "anthropic": - if model.startswith("anthropic.claude-3"): + if self.is_claude_messages_api_model(model): # Separate system prompt from rest of message system_prompt_idx: list[int] = [] system_messages: list[str] = [] @@ -940,13 +962,13 @@ def completion( # noqa: PLR0915 # Use AmazonBedrockOpenAIConfig for proper OpenAI transformation openai_config = AmazonBedrockOpenAIConfig() supported_params = openai_config.get_supported_openai_params(model=model) - + # Filter to only supported OpenAI params filtered_params = { - k: v for k, v in inference_params.items() + k: v for k, v in inference_params.items() if k in supported_params } - + # OpenAI uses messages format, not prompt data = 
json.dumps({"messages": messages, **filtered_params}) else: diff --git a/litellm/utils.py b/litellm/utils.py index 7c7591cdbabf..61829f1c0004 100644 --- a/litellm/utils.py +++ b/litellm/utils.py @@ -619,7 +619,7 @@ def load_credentials_from_list(kwargs: dict): """ # Access CredentialAccessor via module to trigger lazy loading if needed CredentialAccessor = getattr(sys.modules[__name__], 'CredentialAccessor') - + credential_name = kwargs.get("litellm_credential_name") if credential_name and litellm.credential_list: credential_accessor = CredentialAccessor.get_credential_values(credential_name) @@ -646,7 +646,7 @@ def _is_gemini_model(model: Optional[str], custom_llm_provider: Optional[str]) - if custom_llm_provider in ["vertex_ai", "vertex_ai_beta"]: return model is not None and "gemini" in model.lower() return True - + # Check if model name contains gemini return model is not None and "gemini" in model.lower() @@ -668,7 +668,7 @@ def _process_assistant_message_tool_calls( """ role = msg_copy.get("role") tool_calls = msg_copy.get("tool_calls") - + if role == "assistant" and isinstance(tool_calls, list): new_tool_calls = [] for tc in tool_calls: @@ -681,17 +681,17 @@ def _process_assistant_message_tool_calls( else: new_tool_calls.append(tc) continue - + # Remove thought signature from ID if present if isinstance(tc_dict.get("id"), str): if thought_signature_separator in tc_dict["id"]: tc_dict["id"] = _remove_thought_signature_from_id( tc_dict["id"], thought_signature_separator ) - + new_tool_calls.append(tc_dict) msg_copy["tool_calls"] = new_tool_calls - + return msg_copy @@ -706,7 +706,7 @@ def _process_tool_message_id(msg_copy: dict, thought_signature_separator: str) - msg_copy["tool_call_id"] = _remove_thought_signature_from_id( msg_copy["tool_call_id"], thought_signature_separator ) - + return msg_copy @@ -717,7 +717,7 @@ def _remove_thought_signatures_from_messages( Remove thought signatures from tool call IDs in all messages. """ processed_messages = [] - + for msg in messages: # Handle Pydantic models (convert to dict) if hasattr(msg, "model_dump"): @@ -728,17 +728,17 @@ def _remove_thought_signatures_from_messages( # Unknown type, keep as is processed_messages.append(msg) continue - + # Process assistant messages with tool_calls msg_dict = _process_assistant_message_tool_calls( msg_dict, thought_signature_separator ) - + # Process tool messages with tool_call_id msg_dict = _process_tool_message_id(msg_dict, thought_signature_separator) - + processed_messages.append(msg_dict) - + return processed_messages @@ -958,7 +958,7 @@ def function_setup( # noqa: PLR0915 input=buffer.getvalue(), model=model, ) - + ### REMOVE THOUGHT SIGNATURES FROM TOOL CALL IDS FOR NON-GEMINI MODELS ### # Gemini models embed thought signatures in tool call IDs. 
When sending # messages with tool calls to non-Gemini providers, we need to remove these @@ -974,7 +974,7 @@ def function_setup( # noqa: PLR0915 # Get custom_llm_provider to determine target provider custom_llm_provider = kwargs.get("custom_llm_provider") - + # If custom_llm_provider not in kwargs, try to determine it from the model if not custom_llm_provider and model: try: @@ -985,18 +985,18 @@ def function_setup( # noqa: PLR0915 except Exception: # If we can't determine the provider, skip this processing pass - + # Only process if target is NOT a Gemini model if not _is_gemini_model(model, custom_llm_provider): verbose_logger.debug( "Removing thought signatures from tool call IDs for non-Gemini model" ) - + # Process messages to remove thought signatures processed_messages = _remove_thought_signatures_from_messages( messages, THOUGHT_SIGNATURE_SEPARATOR ) - + # Update messages in kwargs or args if "messages" in kwargs: kwargs["messages"] = processed_messages @@ -2976,7 +2976,7 @@ def get_optional_params_embeddings( # noqa: PLR0915 ): # Lazy load get_supported_openai_params get_supported_openai_params = getattr(sys.modules[__name__], 'get_supported_openai_params') - + # retrieve all parameters passed to the function passed_params = locals() custom_llm_provider = passed_params.pop("custom_llm_provider", None) @@ -4062,7 +4062,14 @@ def _check_valid_arg(supported_params: List[str]): ), ) elif "anthropic" in bedrock_base_model and bedrock_route == "invoke": - if bedrock_base_model.startswith("anthropic.claude-3"): + # Check for Claude 3+ models (Messages API) including regional prefixes and Claude 4 + # Models like eu.anthropic.claude-opus-4-5, us.anthropic.claude-3-5-sonnet, etc. + bedrock_base_model_lower = bedrock_base_model.lower() + is_messages_api_model = any( + indicator in bedrock_base_model_lower + for indicator in ["claude-3", "claude-opus-4", "claude-sonnet-4", "claude-haiku-4"] + ) + if is_messages_api_model: optional_params = ( litellm.AmazonAnthropicClaudeConfig().map_openai_params( non_default_params=non_default_params, @@ -6936,7 +6943,7 @@ def get_valid_models( # init litellm_params ################################# from litellm.types.router import LiteLLM_Params - + if litellm_params is None: litellm_params = LiteLLM_Params(model="") if api_key is not None: @@ -7470,7 +7477,7 @@ class ProviderConfigManager: @staticmethod def _build_provider_config_map() -> dict[LlmProviders, tuple[Callable, bool]]: """Build the provider-to-config mapping dictionary. - + Returns a dict mapping provider to (factory_function, needs_model_parameter). This avoids expensive inspect.signature() calls at runtime. """ @@ -7619,7 +7626,7 @@ def _get_bedrock_config(model: str) -> BaseConfig: def _get_cohere_config(model: str) -> BaseConfig: """Get Cohere config based on route.""" CohereModelInfo = getattr(sys.modules[__name__], 'CohereModelInfo') - route = CohereModelInfo.get_cohere_route(model) + route = CohereModelInfo.get_cohere_route(model) if route == "v2": return litellm.CohereV2ChatConfig() return litellm.CohereChatConfig() @@ -7636,7 +7643,7 @@ def get_provider_chat_config( # noqa: PLR0915 ) -> Optional[BaseConfig]: """ Returns the provider config for a given provider. - + Uses O(1) dictionary lookup for fast provider resolution. 
""" # Check JSON providers FIRST (these override standard mappings) @@ -8333,8 +8340,7 @@ def get_provider_ocr_config( from litellm.llms.vertex_ai.ocr.common_utils import get_vertex_ai_ocr_config return get_vertex_ai_ocr_config(model=model) - - MistralOCRConfig = getattr(sys.modules[__name__], 'MistralOCRConfig') +MistralOCRConfig = getattr(sys.modules[__name__], 'MistralOCRConfig') PROVIDER_TO_CONFIG_MAP = { litellm.LlmProviders.MISTRAL: MistralOCRConfig, } @@ -8771,12 +8777,12 @@ def __getattr__(name: str) -> Any: """Lazy import handler for utils module with cached registry for improved performance.""" # Use cached registry from _lazy_imports instead of importing tuples every time from litellm._lazy_imports import _get_lazy_import_registry - + registry = _get_lazy_import_registry() - + # Check if name is in registry and call the cached handler function if name in registry: handler_func = registry[name] return handler_func(name) - + raise AttributeError(f"module {__name__!r} has no attribute {name!r}") diff --git a/tests/test_litellm/llms/bedrock/chat/test_converse_transformation.py b/tests/test_litellm/llms/bedrock/chat/test_converse_transformation.py index 692866f85529..3361245f3cf3 100644 --- a/tests/test_litellm/llms/bedrock/chat/test_converse_transformation.py +++ b/tests/test_litellm/llms/bedrock/chat/test_converse_transformation.py @@ -275,10 +275,10 @@ def test_get_supported_openai_params(): def test_get_supported_openai_params_bedrock_converse(): """ - Test that all documented bedrock converse models have the same set of supported openai params when using + Test that all documented bedrock converse models have the same set of supported openai params when using `bedrock/converse/` or `bedrock/` prefix. - Note: This test is critical for routing, if we ever remove `litellm.BEDROCK_CONVERSE_MODELS`, + Note: This test is critical for routing, if we ever remove `litellm.BEDROCK_CONVERSE_MODELS`, please update this test to read `bedrock_converse` models from the model cost map. """ for model in litellm.BEDROCK_CONVERSE_MODELS: @@ -380,7 +380,7 @@ def json(self): @property def text(self): return json.dumps(response_json) - + config = AmazonConverseConfig() model_response = ModelResponse() optional_params = { @@ -471,7 +471,7 @@ def json(self): @property def text(self): return json.dumps(response_json) - + config = AmazonConverseConfig() model_response = ModelResponse() optional_params = { @@ -525,7 +525,7 @@ def test_transform_response_with_structured_response_being_called(): "toolUseId": "tooluse_456", "name": "json_tool_call", "input": { - "Current_Temperature": 62, + "Current_Temperature": 62, "Weather_Explanation": "San Francisco typically has mild, cool weather year-round due to its coastal location and marine influence. The city is known for its fog, moderate temperatures, and relatively stable climate with little seasonal variation."}, } } @@ -550,51 +550,51 @@ def json(self): @property def text(self): return json.dumps(response_json) - + config = AmazonConverseConfig() model_response = ModelResponse() optional_params = { "json_mode": True, "tools": [ { - 'type': 'function', + 'type': 'function', 'function': { - 'name': 'get_weather', - 'description': 'Get the current weather in a given location', + 'name': 'get_weather', + 'description': 'Get the current weather in a given location', 'parameters': { - 'type': 'object', + 'type': 'object', 'properties': { 'location': { - 'type': 'string', + 'type': 'string', 'description': 'The city and state, e.g. 
San Francisco, CA' - }, + }, 'unit': { - 'type': 'string', + 'type': 'string', 'enum': ['celsius', 'fahrenheit'] } - }, + }, 'required': ['location'] } } - }, + }, { - 'type': 'function', + 'type': 'function', 'function': { - 'name': 'json_tool_call', + 'name': 'json_tool_call', 'parameters': { - '$schema': 'http://json-schema.org/draft-07/schema#', - 'type': 'object', - 'required': ['Weather_Explanation', 'Current_Temperature'], + '$schema': 'http://json-schema.org/draft-07/schema#', + 'type': 'object', + 'required': ['Weather_Explanation', 'Current_Temperature'], 'properties': { 'Weather_Explanation': { - 'type': ['string', 'null'], + 'type': ['string', 'null'], 'description': '1-2 sentences explaining the weather in the location' - }, + }, 'Current_Temperature': { - 'type': ['number', 'null'], + 'type': ['number', 'null'], 'description': 'Current temperature in the location' } - }, + }, 'additionalProperties': False } } @@ -629,36 +629,36 @@ def test_transform_response_with_structured_response_calling_tool(): response_json = { "metrics": { "latencyMs": 1148 - }, + }, "output": { - "message": + "message": { "content": [ { "text": "I\'ll check the current weather in San Francisco for you." - }, + }, { "toolUse": { "input": { "location": "San Francisco, CA", "unit": "celsius" - }, - "name": "get_weather", + }, + "name": "get_weather", "toolUseId": "tooluse_oKk__QrqSUmufMw3Q7vGaQ" } } - ], + ], "role": "assistant" } - }, - "stopReason": "tool_use", + }, + "stopReason": "tool_use", "usage": { - "cacheReadInputTokenCount": 0, - "cacheReadInputTokens": 0, - "cacheWriteInputTokenCount": 0, - "cacheWriteInputTokens": 0, - "inputTokens": 534, - "outputTokens": 69, + "cacheReadInputTokenCount": 0, + "cacheReadInputTokens": 0, + "cacheWriteInputTokenCount": 0, + "cacheWriteInputTokens": 0, + "inputTokens": 534, + "outputTokens": 69, "totalTokens": 603 } } @@ -669,51 +669,51 @@ def json(self): @property def text(self): return json.dumps(response_json) - + config = AmazonConverseConfig() model_response = ModelResponse() optional_params = { "json_mode": True, "tools": [ { - 'type': 'function', + 'type': 'function', 'function': { - 'name': 'get_weather', - 'description': 'Get the current weather in a given location', + 'name': 'get_weather', + 'description': 'Get the current weather in a given location', 'parameters': { - 'type': 'object', + 'type': 'object', 'properties': { 'location': { - 'type': 'string', + 'type': 'string', 'description': 'The city and state, e.g. 
San Francisco, CA' - }, + }, 'unit': { - 'type': 'string', + 'type': 'string', 'enum': ['celsius', 'fahrenheit'] } - }, + }, 'required': ['location'] } } - }, + }, { - 'type': 'function', + 'type': 'function', 'function': { - 'name': 'json_tool_call', + 'name': 'json_tool_call', 'parameters': { - '$schema': 'http://json-schema.org/draft-07/schema#', - 'type': 'object', - 'required': ['Weather_Explanation', 'Current_Temperature'], + '$schema': 'http://json-schema.org/draft-07/schema#', + 'type': 'object', + 'required': ['Weather_Explanation', 'Current_Temperature'], 'properties': { 'Weather_Explanation': { - 'type': ['string', 'null'], + 'type': ['string', 'null'], 'description': '1-2 sentences explaining the weather in the location' - }, + }, 'Current_Temperature': { - 'type': ['number', 'null'], + 'type': ['number', 'null'], 'description': 'Current temperature in the location' } - }, + }, 'additionalProperties': False } } @@ -743,7 +743,7 @@ def text(self): @pytest.mark.asyncio async def test_bedrock_bash_tool_acompletion(): """Test Bedrock with bash tool for ls command using acompletion.""" - + # Test with bash tool instead of computer tool tools = [ { @@ -751,14 +751,14 @@ async def test_bedrock_bash_tool_acompletion(): "name": "bash", } ] - + messages = [ { - "role": "user", + "role": "user", "content": "run ls command and find all python files" } ] - + try: response = await litellm.acompletion( model="bedrock/anthropic.claude-3-5-sonnet-20241022-v2:0", @@ -771,13 +771,13 @@ async def test_bedrock_bash_tool_acompletion(): assert False, "Expected authentication error but got successful response" except Exception as e: error_str = str(e).lower() - + # Check if it's an expected authentication/credentials error auth_error_indicators = [ - "credentials", "authentication", "unauthorized", "access denied", + "credentials", "authentication", "unauthorized", "access denied", "aws", "region", "profile", "token", "invalid", "signature" ] - + if any(auth_error in error_str for auth_error in auth_error_indicators): # This is expected - request formatting succeeded, auth failed as expected assert True @@ -789,7 +789,7 @@ async def test_bedrock_bash_tool_acompletion(): @pytest.mark.asyncio async def test_bedrock_computer_use_acompletion(): """Test Bedrock computer use with acompletion function.""" - + # Test with computer use tool tools = [ { @@ -800,10 +800,10 @@ async def test_bedrock_computer_use_acompletion(): "display_number": 0, } ] - + messages = [ { - "role": "user", + "role": "user", "content": [ { "type": "text", @@ -818,7 +818,7 @@ async def test_bedrock_computer_use_acompletion(): ] } ] - + try: response = await litellm.acompletion( model="bedrock/anthropic.claude-3-5-sonnet-20241022-v2:0", @@ -831,13 +831,13 @@ async def test_bedrock_computer_use_acompletion(): assert False, "Expected authentication error but got successful response" except Exception as e: error_str = str(e).lower() - + # Check if it's an expected authentication/credentials error auth_error_indicators = [ - "credentials", "authentication", "unauthorized", "access denied", + "credentials", "authentication", "unauthorized", "access denied", "aws", "region", "profile", "token", "invalid", "signature" ] - + if any(auth_error in error_str for auth_error in auth_error_indicators): # This is expected - request formatting succeeded, auth failed as expected assert True @@ -849,9 +849,9 @@ async def test_bedrock_computer_use_acompletion(): @pytest.mark.asyncio async def test_transformation_directly(): """Test the transformation 
directly to verify the request structure.""" - + config = AmazonConverseConfig() - + tools = [ { "type": "computer_20241022", @@ -865,14 +865,14 @@ async def test_transformation_directly(): "name": "bash", } ] - + messages = [ { "role": "user", "content": "run ls command and find all python files" } ] - + # Transform request request_data = config.transform_request( model="anthropic.claude-3-5-sonnet-20241022-v2:0", @@ -881,19 +881,19 @@ async def test_transformation_directly(): litellm_params={}, headers={} ) - + # Verify the structure assert "additionalModelRequestFields" in request_data additional_fields = request_data["additionalModelRequestFields"] - + # Check that anthropic_beta is set correctly for computer use assert "anthropic_beta" in additional_fields assert additional_fields["anthropic_beta"] == ["computer-use-2024-10-22"] - + # Check that tools are present assert "tools" in additional_fields assert len(additional_fields["tools"]) == 2 - + # Verify tool types tool_types = [tool.get("type") for tool in additional_fields["tools"]] assert "computer_20241022" in tool_types @@ -933,7 +933,7 @@ def test_transform_request_helper_includes_anthropic_beta_and_tools_bash(): def test_transform_request_with_multiple_tools(): """Test transformation with multiple tools including computer, bash, and function tools.""" config = AmazonConverseConfig() - + # Use the exact payload from the user's error tools = [ { @@ -974,14 +974,14 @@ def test_transform_request_with_multiple_tools(): } } ] - + messages = [ { "role": "user", "content": "run ls command and find all python files" } ] - + # Transform request request_data = config.transform_request( model="anthropic.claude-3-5-sonnet-20241022-v2:0", @@ -990,25 +990,25 @@ def test_transform_request_with_multiple_tools(): litellm_params={}, headers={} ) - + # Verify the structure assert "additionalModelRequestFields" in request_data additional_fields = request_data["additionalModelRequestFields"] - + # Check that anthropic_beta is set correctly for computer use assert "anthropic_beta" in additional_fields assert additional_fields["anthropic_beta"] == ["computer-use-2024-10-22"] - + # Check that tools are present assert "tools" in additional_fields assert len(additional_fields["tools"]) == 3 # computer, bash, text_editor tools - + # Verify tool types tool_types = [tool.get("type") for tool in additional_fields["tools"]] assert "computer_20241022" in tool_types assert "bash_20241022" in tool_types assert "text_editor_20241022" in tool_types - + # Function tools are processed separately and not included in computer use tools # They would be in toolConfig if present @@ -1016,7 +1016,7 @@ def test_transform_request_with_multiple_tools(): def test_transform_request_with_computer_tool_only(): """Test transformation with only computer tool.""" config = AmazonConverseConfig() - + tools = [ { "type": "computer_20241022", @@ -1026,10 +1026,10 @@ def test_transform_request_with_computer_tool_only(): "display_number": 0, } ] - + messages = [ { - "role": "user", + "role": "user", "content": [ { "type": "text", @@ -1044,7 +1044,7 @@ def test_transform_request_with_computer_tool_only(): ] } ] - + # Transform request request_data = config.transform_request( model="anthropic.claude-3-5-sonnet-20241022-v2:0", @@ -1053,15 +1053,15 @@ def test_transform_request_with_computer_tool_only(): litellm_params={}, headers={} ) - + # Verify the structure assert "additionalModelRequestFields" in request_data additional_fields = request_data["additionalModelRequestFields"] - + # Check 
that anthropic_beta is set correctly for computer use assert "anthropic_beta" in additional_fields assert additional_fields["anthropic_beta"] == ["computer-use-2024-10-22"] - + # Check that tools are present assert "tools" in additional_fields assert len(additional_fields["tools"]) == 1 @@ -1071,21 +1071,21 @@ def test_transform_request_with_computer_tool_only(): def test_transform_request_with_bash_tool_only(): """Test transformation with only bash tool.""" config = AmazonConverseConfig() - + tools = [ { "type": "bash_20241022", "name": "bash", } ] - + messages = [ { - "role": "user", + "role": "user", "content": "run ls command and find all python files" } ] - + # Transform request request_data = config.transform_request( model="anthropic.claude-3-5-sonnet-20241022-v2:0", @@ -1094,15 +1094,15 @@ def test_transform_request_with_bash_tool_only(): litellm_params={}, headers={} ) - + # Verify the structure assert "additionalModelRequestFields" in request_data additional_fields = request_data["additionalModelRequestFields"] - + # Check that anthropic_beta is set correctly for computer use assert "anthropic_beta" in additional_fields assert additional_fields["anthropic_beta"] == ["computer-use-2024-10-22"] - + # Check that tools are present assert "tools" in additional_fields assert len(additional_fields["tools"]) == 1 @@ -1112,21 +1112,21 @@ def test_transform_request_with_bash_tool_only(): def test_transform_request_with_text_editor_tool(): """Test transformation with text editor tool.""" config = AmazonConverseConfig() - + tools = [ { "type": "text_editor_20241022", "name": "str_replace_editor", } ] - + messages = [ { "role": "user", "content": "Edit this text file" } ] - + # Transform request request_data = config.transform_request( model="anthropic.claude-3-5-sonnet-20241022-v2:0", @@ -1135,15 +1135,15 @@ def test_transform_request_with_text_editor_tool(): litellm_params={}, headers={} ) - + # Verify the structure assert "additionalModelRequestFields" in request_data additional_fields = request_data["additionalModelRequestFields"] - + # Check that anthropic_beta is set correctly for computer use assert "anthropic_beta" in additional_fields assert additional_fields["anthropic_beta"] == ["computer-use-2024-10-22"] - + # Check that tools are present assert "tools" in additional_fields assert len(additional_fields["tools"]) == 1 @@ -1153,7 +1153,7 @@ def test_transform_request_with_text_editor_tool(): def test_transform_request_with_function_tool(): """Test transformation with function tool.""" config = AmazonConverseConfig() - + tools = [ { "type": "function", @@ -1174,14 +1174,14 @@ def test_transform_request_with_function_tool(): } } ] - + messages = [ { "role": "user", "content": "What's the weather like in San Francisco?" 
} ] - + # Transform request request_data = config.transform_request( model="anthropic.claude-3-5-sonnet-20241022-v2:0", @@ -1190,11 +1190,11 @@ def test_transform_request_with_function_tool(): litellm_params={}, headers={} ) - + # Verify the structure assert "additionalModelRequestFields" in request_data additional_fields = request_data["additionalModelRequestFields"] - + # Function tools are not computer use tools, so they don't get anthropic_beta # They are processed through the regular tool config assert "toolConfig" in request_data @@ -1206,7 +1206,7 @@ def test_transform_request_with_function_tool(): def test_map_openai_params_with_response_format(): """Test map_openai_params with response_format.""" config = AmazonConverseConfig() - + tools = [ { "type": "function", @@ -1277,12 +1277,12 @@ async def test_assistant_message_cache_control(): messages = [ {"role": "user", "content": "Hello"}, { - "role": "assistant", + "role": "assistant", "content": "Hi there!", "cache_control": {"type": "ephemeral"} } ] - + result = _bedrock_converse_messages_pt( messages=messages, model="bedrock/anthropic.claude-3-5-sonnet-20240620-v1:0", @@ -1294,7 +1294,7 @@ async def test_assistant_message_cache_control(): model="bedrock/anthropic.claude-3-5-sonnet-20240620-v1:0", llm_provider="bedrock_converse" ) - + assert result == async_result async_result = await BedrockConverseMessagesProcessor._bedrock_converse_messages_pt_async( @@ -1302,14 +1302,14 @@ async def test_assistant_message_cache_control(): model="bedrock/anthropic.claude-3-5-sonnet-20240620-v1:0", llm_provider="bedrock_converse" ) - + assert result == async_result - + # Should have user message and assistant message assert len(result) == 2 assert result[0]["role"] == "user" assert result[1]["role"] == "assistant" - + # Assistant message should have text content and cachePoint assistant_content = result[1]["content"] assert len(assistant_content) == 2 @@ -1325,7 +1325,7 @@ async def test_assistant_message_list_content_cache_control(): BedrockConverseMessagesProcessor, _bedrock_converse_messages_pt, ) - + messages = [ {"role": "user", "content": "Hello"}, { @@ -1339,7 +1339,7 @@ async def test_assistant_message_list_content_cache_control(): ] } ] - + result = _bedrock_converse_messages_pt( messages=messages, model="bedrock/anthropic.claude-3-5-sonnet-20240620-v1:0", @@ -1351,9 +1351,9 @@ async def test_assistant_message_list_content_cache_control(): model="bedrock/anthropic.claude-3-5-sonnet-20240620-v1:0", llm_provider="bedrock_converse" ) - + assert result == async_result - + # Assistant message should have text content and cachePoint assistant_content = result[1]["content"] assert len(assistant_content) == 2 @@ -1369,7 +1369,7 @@ async def test_tool_message_cache_control(): BedrockConverseMessagesProcessor, _bedrock_converse_messages_pt, ) - + messages = [ {"role": "user", "content": "What's the weather?"}, { @@ -1395,7 +1395,7 @@ async def test_tool_message_cache_control(): ] } ] - + result = _bedrock_converse_messages_pt( messages=messages, model="bedrock/anthropic.claude-3-5-sonnet-20240620-v1:0", @@ -1407,20 +1407,20 @@ async def test_tool_message_cache_control(): model="bedrock/anthropic.claude-3-5-sonnet-20240620-v1:0", llm_provider="bedrock_converse" ) - + assert result == async_result - + # Should have user, assistant, and user (tool results) messages assert len(result) == 3 - + # Last message should contain tool result and cachePoint tool_message_content = result[2]["content"] assert len(tool_message_content) == 2 - + # First should be 
tool result assert "toolResult" in tool_message_content[0] assert tool_message_content[0]["toolResult"]["content"][0]["text"] == "Weather data: sunny, 25°C" - + # Second should be cachePoint assert "cachePoint" in tool_message_content[1] assert tool_message_content[1]["cachePoint"]["type"] == "default" @@ -1433,7 +1433,7 @@ async def test_tool_message_string_content_cache_control(): BedrockConverseMessagesProcessor, _bedrock_converse_messages_pt, ) - + messages = [ {"role": "user", "content": "What's the weather?"}, { @@ -1442,7 +1442,7 @@ async def test_tool_message_string_content_cache_control(): "tool_calls": [ { "id": "call_123", - "type": "function", + "type": "function", "function": {"name": "get_weather", "arguments": "{}"} } ] @@ -1454,7 +1454,7 @@ async def test_tool_message_string_content_cache_control(): "cache_control": {"type": "ephemeral"} } ] - + result = _bedrock_converse_messages_pt( messages=messages, model="bedrock/anthropic.claude-3-5-sonnet-20240620-v1:0", @@ -1466,17 +1466,17 @@ async def test_tool_message_string_content_cache_control(): model="bedrock/anthropic.claude-3-5-sonnet-20240620-v1:0", llm_provider="bedrock_converse" ) - + assert result == async_result - + # Last message should contain tool result and cachePoint tool_message_content = result[2]["content"] assert len(tool_message_content) == 2 - + # First should be tool result assert "toolResult" in tool_message_content[0] assert tool_message_content[0]["toolResult"]["content"][0]["text"] == "Weather: sunny, 25°C" - + # Second should be cachePoint assert "cachePoint" in tool_message_content[1] assert tool_message_content[1]["cachePoint"]["type"] == "default" @@ -1489,7 +1489,7 @@ async def test_assistant_tool_calls_cache_control(): BedrockConverseMessagesProcessor, _bedrock_converse_messages_pt, ) - + messages = [ {"role": "user", "content": "Calculate 2+2"}, { @@ -1505,7 +1505,7 @@ async def test_assistant_tool_calls_cache_control(): ] } ] - + result = _bedrock_converse_messages_pt( messages=messages, model="bedrock/anthropic.claude-3-5-sonnet-20240620-v1:0", @@ -1517,18 +1517,18 @@ async def test_assistant_tool_calls_cache_control(): model="bedrock/anthropic.claude-3-5-sonnet-20240620-v1:0", llm_provider="bedrock_converse" ) - + assert result == async_result - + # Assistant message should have tool use and cachePoint assistant_content = result[1]["content"] assert len(assistant_content) == 2 - + # First should be tool use assert "toolUse" in assistant_content[0] assert assistant_content[0]["toolUse"]["name"] == "calc" assert assistant_content[0]["toolUse"]["toolUseId"] == "call_proxy_123" - + # Second should be cachePoint assert "cachePoint" in assistant_content[1] assert assistant_content[1]["cachePoint"]["type"] == "default" @@ -1541,7 +1541,7 @@ async def test_multiple_tool_calls_with_mixed_cache_control(): BedrockConverseMessagesProcessor, _bedrock_converse_messages_pt, ) - + messages = [ {"role": "user", "content": "Do multiple calculations"}, { @@ -1563,7 +1563,7 @@ async def test_multiple_tool_calls_with_mixed_cache_control(): ] } ] - + result = _bedrock_converse_messages_pt( messages=messages, model="bedrock/anthropic.claude-3-5-sonnet-20240620-v1:0", @@ -1575,21 +1575,21 @@ async def test_multiple_tool_calls_with_mixed_cache_control(): model="bedrock/anthropic.claude-3-5-sonnet-20240620-v1:0", llm_provider="bedrock_converse" ) - + assert result == async_result - + # Assistant message should have: toolUse1, cachePoint, toolUse2 assistant_content = result[1]["content"] assert len(assistant_content) 
== 3 - + # First tool use with cache assert "toolUse" in assistant_content[0] assert assistant_content[0]["toolUse"]["toolUseId"] == "call_1" - + # Cache point for first tool assert "cachePoint" in assistant_content[1] assert assistant_content[1]["cachePoint"]["type"] == "default" - + # Second tool use without cache assert "toolUse" in assistant_content[2] assert assistant_content[2]["toolUse"]["toolUseId"] == "call_2" @@ -1602,7 +1602,7 @@ async def test_no_cache_control_no_cache_point(): BedrockConverseMessagesProcessor, _bedrock_converse_messages_pt, ) - + messages = [ {"role": "user", "content": "Hello"}, {"role": "assistant", "content": "Hi there!"}, # No cache_control @@ -1612,7 +1612,7 @@ async def test_no_cache_control_no_cache_point(): "content": "Tool result" # No cache_control } ] - + result = _bedrock_converse_messages_pt( messages=messages, model="bedrock/anthropic.claude-3-5-sonnet-20240620-v1:0", @@ -1624,14 +1624,14 @@ async def test_no_cache_control_no_cache_point(): model="bedrock/anthropic.claude-3-5-sonnet-20240620-v1:0", llm_provider="bedrock_converse" ) - + assert result == async_result - + # Assistant message should only have text content, no cachePoint assistant_content = result[1]["content"] assert len(assistant_content) == 1 assert assistant_content[0]["text"] == "Hi there!" - + # Tool message should only have tool result, no cachePoint tool_content = result[2]["content"] assert len(tool_content) == 1 @@ -1867,11 +1867,11 @@ def test_guarded_text_with_tool_calls(): # First should be regular text assert "text" in content[0] assert content[0]["text"] == "What's the weather?" - + # Second should be guardContent assert "guardContent" in content[1] assert content[1]["guardContent"]["text"]["text"] == "Please be careful with sensitive information" - + # Other messages should not have guardContent for i in range(1, 3): content = result[i]["content"] @@ -2115,7 +2115,7 @@ def test_auto_convert_in_full_transformation(): # Verify the transformation worked assert "messages" in result assert len(result["messages"]) == 1 - + # The message should have guardContent message = result["messages"][0] assert "content" in message @@ -2626,79 +2626,79 @@ def test_empty_assistant_message_handling(): {"role": "assistant", "content": ""}, # Empty content {"role": "user", "content": "How are you?"} ] - + # Enable modify_params to prevent consecutive user message merging original_modify_params = litellm.modify_params litellm.modify_params = True - + try: result = _bedrock_converse_messages_pt( messages=messages, model="anthropic.claude-3-5-sonnet-20240620-v1:0", llm_provider="bedrock_converse" ) - + # Should have 3 messages: user, assistant (with placeholder), user assert len(result) == 3 assert result[0]["role"] == "user" assert result[1]["role"] == "assistant" assert result[2]["role"] == "user" - + # Assistant message should have placeholder text instead of empty content assert len(result[1]["content"]) == 1 assert result[1]["content"][0]["text"] == "Please continue." - + # Test case 2: Whitespace-only content messages = [ {"role": "user", "content": "Hello"}, {"role": "assistant", "content": " "}, # Whitespace-only content {"role": "user", "content": "How are you?"} ] - + result = _bedrock_converse_messages_pt( messages=messages, model="anthropic.claude-3-5-sonnet-20240620-v1:0", llm_provider="bedrock_converse" ) - + # Assistant message should have placeholder text instead of whitespace assert len(result[1]["content"]) == 1 assert result[1]["content"][0]["text"] == "Please continue." 
- + # Test case 3: Empty list content messages = [ {"role": "user", "content": "Hello"}, {"role": "assistant", "content": [{"type": "text", "text": ""}]}, # Empty text in list {"role": "user", "content": "How are you?"} ] - + result = _bedrock_converse_messages_pt( messages=messages, model="anthropic.claude-3-5-sonnet-20240620-v1:0", llm_provider="bedrock_converse" ) - + # Assistant message should have placeholder text instead of empty text assert len(result[1]["content"]) == 1 assert result[1]["content"][0]["text"] == "Please continue." - + # Test case 4: Normal content should not be affected messages = [ {"role": "user", "content": "Hello"}, {"role": "assistant", "content": "I'm doing well, thank you!"}, # Normal content {"role": "user", "content": "How are you?"} ] - + result = _bedrock_converse_messages_pt( messages=messages, model="anthropic.claude-3-5-sonnet-20240620-v1:0", llm_provider="bedrock_converse" ) - + # Assistant message should keep original content assert len(result[1]["content"]) == 1 assert result[1]["content"][0]["text"] == "I'm doing well, thank you!" - + finally: # Restore original modify_params setting litellm.modify_params = original_modify_params @@ -2707,54 +2707,184 @@ def test_empty_assistant_message_handling(): def test_is_nova_lite_2_model(): """Test the _is_nova_lite_2_model() method for detecting Nova 2 models.""" config = AmazonConverseConfig() - + # Test with amazon.nova-2-lite-v1:0 assert config._is_nova_lite_2_model("amazon.nova-2-lite-v1:0") is True - + # Test with regional variants assert config._is_nova_lite_2_model("us.amazon.nova-2-lite-v1:0") is True assert config._is_nova_lite_2_model("eu.amazon.nova-2-lite-v1:0") is True assert config._is_nova_lite_2_model("apac.amazon.nova-2-lite-v1:0") is True - + # Test with other Nova 2 variants (pro, micro) assert config._is_nova_lite_2_model("amazon.nova-pro-1-5-v1:0") is False assert config._is_nova_lite_2_model("amazon.nova-micro-1-5-v1:0") is False assert config._is_nova_lite_2_model("us.amazon.nova-pro-1-5-v1:0") is False assert config._is_nova_lite_2_model("eu.amazon.nova-micro-1-5-v1:0") is False - + # Test with non-Nova-1.5 lite models (should return False) assert config._is_nova_lite_2_model("amazon.nova-lite-v1:0") is False assert config._is_nova_lite_2_model("amazon.nova-pro-v1:0") is False assert config._is_nova_lite_2_model("amazon.nova-micro-v1:0") is False - + # Test with Nova v1:0 models (should return False) assert config._is_nova_lite_2_model("us.amazon.nova-lite-v1:0") is False assert config._is_nova_lite_2_model("eu.amazon.nova-pro-v1:0") is False - + # Test with completely different models (should return False) assert config._is_nova_lite_2_model("anthropic.claude-3-5-sonnet-20240620-v1:0") is False assert config._is_nova_lite_2_model("meta.llama3-70b-instruct-v1:0") is False assert config._is_nova_lite_2_model("mistral.mistral-7b-instruct-v0:2") is False +def test_drop_thinking_param_when_thinking_blocks_missing(): + """ + Test that thinking param is dropped when modify_params=True and + thinking_blocks are missing from assistant message with tool_calls. 
+ + This prevents the Anthropic/Bedrock error: + "Expected thinking or redacted_thinking, but found tool_use" + + Related issue: https://github.com/BerriAI/litellm/issues/14194 + """ + from litellm.utils import last_assistant_with_tool_calls_has_no_thinking_blocks + + # Save original modify_params setting + original_modify_params = litellm.modify_params + + try: + # Test case 1: thinking should be dropped when modify_params=True + # and assistant message has tool_calls but no thinking_blocks + litellm.modify_params = True + + messages_without_thinking_blocks = [ + {"role": "user", "content": "Search for weather"}, + { + "role": "assistant", + "content": "", + "tool_calls": [ + { + "id": "call_123", + "type": "function", + "function": {"name": "search", "arguments": "{}"}, + } + ], + # No thinking_blocks - simulates OpenAI-compatible client + }, + {"role": "tool", "content": "Weather is sunny", "tool_call_id": "call_123"}, + ] + + optional_params = {"thinking": {"type": "enabled", "budget_tokens": 1000}} + + # Verify the condition is detected + assert last_assistant_with_tool_calls_has_no_thinking_blocks( + messages_without_thinking_blocks + ), "Should detect missing thinking_blocks" + + # Simulate what _transform_request_helper does + if ( + optional_params.get("thinking") is not None + and messages_without_thinking_blocks is not None + and last_assistant_with_tool_calls_has_no_thinking_blocks( + messages_without_thinking_blocks + ) + ): + if litellm.modify_params: + optional_params.pop("thinking", None) + + assert "thinking" not in optional_params, ( + "thinking param should be dropped when modify_params=True " + "and thinking_blocks are missing" + ) + + # Test case 2: thinking should NOT be dropped when thinking_blocks are present + messages_with_thinking_blocks = [ + {"role": "user", "content": "Search for weather"}, + { + "role": "assistant", + "content": "", + "tool_calls": [ + { + "id": "call_123", + "type": "function", + "function": {"name": "search", "arguments": "{}"}, + } + ], + "thinking_blocks": [ + {"type": "thinking", "thinking": "Let me search for weather..."} + ], + }, + {"role": "tool", "content": "Weather is sunny", "tool_call_id": "call_123"}, + ] + + optional_params_with_thinking = { + "thinking": {"type": "enabled", "budget_tokens": 1000} + } + + # Verify the condition is NOT detected when thinking_blocks are present + assert not last_assistant_with_tool_calls_has_no_thinking_blocks( + messages_with_thinking_blocks + ), "Should NOT detect missing thinking_blocks when they are present" + + # Simulate what _transform_request_helper does + if ( + optional_params_with_thinking.get("thinking") is not None + and messages_with_thinking_blocks is not None + and last_assistant_with_tool_calls_has_no_thinking_blocks( + messages_with_thinking_blocks + ) + ): + if litellm.modify_params: + optional_params_with_thinking.pop("thinking", None) + + assert "thinking" in optional_params_with_thinking, ( + "thinking param should NOT be dropped when thinking_blocks are present" + ) + + # Test case 3: thinking should NOT be dropped when modify_params=False + litellm.modify_params = False + + optional_params_no_modify = { + "thinking": {"type": "enabled", "budget_tokens": 1000} + } + + # Simulate what _transform_request_helper does + if ( + optional_params_no_modify.get("thinking") is not None + and messages_without_thinking_blocks is not None + and last_assistant_with_tool_calls_has_no_thinking_blocks( + messages_without_thinking_blocks + ) + ): + if litellm.modify_params: + 
optional_params_no_modify.pop("thinking", None) + + assert "thinking" in optional_params_no_modify, ( + "thinking param should NOT be dropped when modify_params=False" + ) + + finally: + # Restore original modify_params setting + litellm.modify_params = original_modify_params + def test_thinking_with_max_completion_tokens(): """Test that thinking respects max_completion_tokens parameter.""" config = AmazonConverseConfig() - + # Test case 1: max_completion_tokens is specified - should NOT set maxTokens automatically non_default_params_with_max_completion = { "thinking": {"type": "enabled", "budget_tokens": 5000}, "max_completion_tokens": 10000, } optional_params = {} - + result = config.map_openai_params( non_default_params=non_default_params_with_max_completion, optional_params=optional_params, model="us.anthropic.claude-3-7-sonnet-20250219-v1:0", drop_params=False, ) - + # Should have maxTokens set to max_completion_tokens value assert "maxTokens" in result assert result["maxTokens"] == 10000 @@ -2762,21 +2892,21 @@ def test_thinking_with_max_completion_tokens(): assert "thinking" in result assert result["thinking"]["type"] == "enabled" assert result["thinking"]["budget_tokens"] == 5000 - + # Test case 2: max_tokens is specified - should NOT set maxTokens automatically non_default_params_with_max_tokens = { "thinking": {"type": "enabled", "budget_tokens": 5000}, "max_tokens": 8000, } optional_params = {} - + result = config.map_openai_params( non_default_params=non_default_params_with_max_tokens, optional_params=optional_params, model="us.anthropic.claude-3-7-sonnet-20250219-v1:0", drop_params=False, ) - + # Should have maxTokens set to max_tokens value assert "maxTokens" in result assert result["maxTokens"] == 8000 @@ -2784,22 +2914,22 @@ def test_thinking_with_max_completion_tokens(): assert "thinking" in result assert result["thinking"]["type"] == "enabled" assert result["thinking"]["budget_tokens"] == 5000 - + # Test case 3: Neither max_tokens nor max_completion_tokens specified - should set maxTokens automatically from litellm.constants import DEFAULT_MAX_TOKENS - + non_default_params_without_max = { "thinking": {"type": "enabled", "budget_tokens": 5000}, } optional_params = {} - + result = config.map_openai_params( non_default_params=non_default_params_without_max, optional_params=optional_params, model="us.anthropic.claude-3-7-sonnet-20250219-v1:0", drop_params=False, ) - + # Should have maxTokens set to budget_tokens + DEFAULT_MAX_TOKENS assert "maxTokens" in result assert result["maxTokens"] == 5000 + DEFAULT_MAX_TOKENS From afc7378261b3118cccd9ab308c0472708f49b0ac Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?Benedikt=20=C3=93skarsson?= Date: Thu, 8 Jan 2026 02:04:52 +0000 Subject: [PATCH 7/8] chore: fix formatting error --- litellm/utils.py | 3 ++- 1 file changed, 2 insertions(+), 1 deletion(-) diff --git a/litellm/utils.py b/litellm/utils.py index 61829f1c0004..9cb5b41aacbd 100644 --- a/litellm/utils.py +++ b/litellm/utils.py @@ -8340,7 +8340,8 @@ def get_provider_ocr_config( from litellm.llms.vertex_ai.ocr.common_utils import get_vertex_ai_ocr_config return get_vertex_ai_ocr_config(model=model) -MistralOCRConfig = getattr(sys.modules[__name__], 'MistralOCRConfig') + + MistralOCRConfig = getattr(sys.modules[__name__], 'MistralOCRConfig') PROVIDER_TO_CONFIG_MAP = { litellm.LlmProviders.MISTRAL: MistralOCRConfig, } From 143d5682665c9198be5a1723bcde0303695d030e Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?Benedikt=20=C3=93skarsson?= Date: Tue, 13 Jan 2026 14:23:43 +0000 Subject: [PATCH 
8/8] fix: indent error

---
 litellm/utils.py | 2 +-
 1 file changed, 1 insertion(+), 1 deletion(-)

diff --git a/litellm/utils.py b/litellm/utils.py
index 9cb5b41aacbd..1ea0e4387b58 100644
--- a/litellm/utils.py
+++ b/litellm/utils.py
@@ -7626,7 +7626,7 @@ def _get_bedrock_config(model: str) -> BaseConfig:
 def _get_cohere_config(model: str) -> BaseConfig:
     """Get Cohere config based on route."""
     CohereModelInfo = getattr(sys.modules[__name__], 'CohereModelInfo')
-        route = CohereModelInfo.get_cohere_route(model)
+    route = CohereModelInfo.get_cohere_route(model)
     if route == "v2":
         return litellm.CohereV2ChatConfig()
     return litellm.CohereChatConfig()
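
Both fixups above (patches 7/8 and 8/8) adjust lines that rely on the same lazy, by-name lookup: getattr(sys.modules[__name__], 'SomeName') resolves a module-level attribute at call time rather than import time. A minimal sketch of that pattern follows; the DEFAULT_ROUTE name and resolve_default_route helper are invented here purely for illustration and are not part of litellm.

import sys

DEFAULT_ROUTE = "v1"


def resolve_default_route() -> str:
    # Look the name up on this module when the function runs, so the
    # attribute only needs to exist by call time, not at import time.
    return getattr(sys.modules[__name__], "DEFAULT_ROUTE")


print(resolve_default_route())  # -> "v1"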
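
For reference, the behaviour asserted by test_drop_thinking_param_when_thinking_blocks_missing (added earlier in this series) reduces to the guard sketched below. The helper import mirrors the test; wrapping the check in a standalone _maybe_drop_thinking function is an assumption made here for readability, since the test simulates the same logic inline as done in _transform_request_helper.

import litellm
from litellm.utils import last_assistant_with_tool_calls_has_no_thinking_blocks


def _maybe_drop_thinking(optional_params: dict, messages: list) -> dict:
    """Drop the `thinking` param when the last assistant tool-call turn has
    no thinking_blocks and litellm.modify_params is enabled, avoiding the
    "Expected thinking or redacted_thinking, but found tool_use" error."""
    if (
        optional_params.get("thinking") is not None
        and messages is not None
        and last_assistant_with_tool_calls_has_no_thinking_blocks(messages)
        and litellm.modify_params
    ):
        optional_params.pop("thinking", None)
    return optional_params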