
Create LiteLLMModel to fix thinking parts not being sent to Anthropic on Vertex via LiteLLM with OpenAIChatModel #3113

@ExtraBB

Description

Hi,

I'm using a LiteLLM proxy that forwards requests for Claude models to Google Vertex AI. However, I'm hitting errors as soon as tool calls and extended thinking are combined:

ERROR:src.agents.excel.excel_agent_message_processor:Exception cause: BadRequestError: Error code: 400 - {'error': {'message': 'litellm.BadRequestError: Vertex_aiException BadRequestError - b\'{"type":"error","error":{"type":"invalid_request_error","message":"messages.1.content.0.type: Expected `thinking` or `redacted_thinking`, but found `text`. When `thinking` is enabled, a final `assistant` message must start with a thinking block (preceeding the lastmost set of `tool_use` and `tool_result` blocks). We recommend you include thinking blocks from previous turns. To avoid this requirement, disable `thinking`. Please consult our documentation at https://docs.claude.com/en/docs/build-with-claude/extended-thinking"},"request_id":"req_vrtx_011CTupmw9UhTioFUQDwomKa"}\'. Received Model Group=anthropic/claude\nAvailable Model Group Fallbacks=None', 'type': None, 'param': None, 'code': '400'}}

This looks similar to the issues:

I'm at a bit of a loss as to what I can do here; any ideas or suggestions? I would expect Pydantic AI to send the thinking blocks back when the message history contains ThinkingParts. I've also added an example of the plain OpenAI SDK working normally against the same endpoint.
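For reference, this is roughly the assistant turn shape the Anthropic API is asking for in the error above (the content block types come from Anthropic's extended-thinking docs; the ids and values below are placeholders):

# Sketch of the assistant turn Anthropic expects when extended thinking is on:
# the turn that issued the tool call must start with the original thinking block
# (including its signature), followed by the tool_use block.
expected_assistant_turn = {
    "role": "assistant",
    "content": [
        {
            "type": "thinking",
            "thinking": "The user wants the weather in Paris, so I should call get_weather...",
            "signature": "<signature returned by the model, must be sent back verbatim>",
        },
        {
            "type": "tool_use",
            "id": "toolu_123",  # placeholder id
            "name": "get_weather",
            "input": {"location": "Paris", "unit": "celsius"},
        },
    ],
}
# The 400 above is raised because the first content block of that turn arrives
# as {"type": "text", ...} instead of a thinking/redacted_thinking block.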

PS: @DouweM Long time no see! We worked together on Convex at Utrecht University in another lifetime :D

Example Code

Not working:

#!/usr/bin/env python3
"""
Standalone test script for litellm server with Pydantic AI.
Tests extended thinking and tool calling.
"""

from pydantic_ai import Agent, ModelSettings, RunContext
from pydantic_ai.models.openai import OpenAIChatModel
from pydantic_ai.providers.litellm import LiteLLMProvider
from pydantic import BaseModel
import pprint


# Define weather data structure
class WeatherData(BaseModel):
    temperature: float
    conditions: str
    unit: str


# Configure the model
model = OpenAIChatModel(
    "anthropic/claude-sonnet-4-5-20250929",
    provider=LiteLLMProvider(
        api_base="<endpoint>",
        api_key="<key>",
    ),
    settings=ModelSettings(
        max_tokens=4096,
        extra_body={
            "thinking": {
                "type": "enabled",
                "budget_tokens": 1024
            }
        }
    )
)

# Create the agent
agent = Agent(
    model,
    system_prompt="You are a helpful weather assistant. Use the get_weather tool to retrieve weather information."
)


# Define the tool
@agent.tool
def get_weather(ctx: RunContext, location: str, unit: str = "celsius") -> WeatherData:
    """
    Get the current weather for a given location.

    Args:
        location: The city and state, e.g. San Francisco, CA
        unit: The temperature unit to use (celsius or fahrenheit)
    """
    print(f"\n--- Tool Called: get_weather ---")
    print(f"Location: {location}")
    print(f"Unit: {unit}")

    # Return dummy weather data
    return WeatherData(
        temperature=18,
        conditions="partly cloudy",
        unit=unit
    )


async def main():
    try:
        # Run the agent
        result = await agent.run(
            "What's the weather like in Paris? Think through your answer carefully."
        )

        print("\n--- Messages ---")
        pprint.pp(result.all_messages())

        print("\n Test completed successfully!")

    except Exception as e:
        print(f"\n Error occurred: {type(e).__name__}")
        print(f"Message: {str(e)}")
        raise


if __name__ == "__main__":
    import asyncio
    asyncio.run(main())
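
For what it's worth, the thinking does appear to be captured on the Pydantic AI side as ThinkingParts in the message history; it just doesn't make it back to the proxy on the follow-up request. A quick way to check (dump_thinking is just a throwaway helper I'm sketching here, not a Pydantic AI API):

from pydantic_ai.messages import ModelResponse, ThinkingPart

def dump_thinking(messages) -> None:
    """Print any ThinkingParts captured in a run's message history."""
    for message in messages:
        if isinstance(message, ModelResponse):
            for part in message.parts:
                if isinstance(part, ThinkingPart):
                    print("ThinkingPart:", part.content[:80])

# e.g. dump_thinking(result.all_messages()) on a run that completes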

Working with the plain OpenAI SDK:

#!/usr/bin/env python3
"""
Standalone test script for litellm server with OpenAI SDK.
Tests extended thinking and tool calling.
"""

from openai import OpenAI
import pprint

# Configure client for litellm server
client = OpenAI(
    base_url="<endpoint>",
    api_key="<key>",
)

# Define a test tool
tools = [
    {
        "type": "function",
        "function": {
            "name": "get_weather",
            "description": "Get the current weather for a given location",
            "parameters": {
                "type": "object",
                "properties": {
                    "location": {
                        "type": "string",
                        "description": "The city and state, e.g. San Francisco, CA",
                    },
                    "unit": {
                        "type": "string",
                        "enum": ["celsius", "fahrenheit"],
                        "description": "The temperature unit to use",
                    },
                },
                "required": ["location"],
            },
        },
    }
]

def main():
    print("Testing litellm server with OpenAI SDK...")

    try:
        # Initialize messages
        messages = [
            {
                "role": "user",
                "content": "What's the weather like in Paris? Think through your answer carefully.",
            }
        ]

        # Make a request with extended thinking and tool calling
        response = client.chat.completions.create(
            model="anthropic/claude-sonnet-4-5-20250929",
            messages=messages,
            tools=tools,
            # Enable extended thinking for Claude
            extra_body={
                "thinking": {
                    "type": "enabled",
                    "budget_tokens": 1024
                }
            }
        )

        pprint.pp(response.choices[0].message)
        # Check for thinking content
        message = response.choices[0].message
        if hasattr(message, 'content') and message.content:
            print(f"\n--- Content ---")
            print(message.content)

        # Add assistant's response to messages
        messages.append(message.model_dump(exclude_unset=True))

        # Check for tool calls
        if hasattr(message, 'tool_calls') and message.tool_calls:
            print(f"\n--- Tool Calls ---")
            for tool_call in message.tool_calls:
                print(f"Tool: {tool_call.function.name}")
                print(f"Arguments: {tool_call.function.arguments}")

            # Respond to tool call with dummy value
            print("\n--- Responding to tool call with dummy data ---")

            # Add tool result to messages
            messages.append({
                "role": "tool",
                "tool_call_id": message.tool_calls[0].id,
                "content": '{"temperature": 18, "conditions": "partly cloudy", "unit": "celsius"}'
            })

            print("SENDING MESSAGES:")
            pprint.pp(messages)

            # Get final answer
            final_response = client.chat.completions.create(
                model="anthropic/claude",
                messages=messages,
                tools=tools,
                extra_body={
                    "thinking": {
                        "type": "enabled",
                        "budget_tokens": 1024
                    }
                }
            )

            print("\n--- Final Answer ---")
            pprint.pp(final_response.choices[0].message)
            final_message = final_response.choices[0].message
            if hasattr(final_message, 'content') and final_message.content:
                print(final_message.content)

        print("\n Test completed successfully!")

    except Exception as e:
        print(f"\n Error occurred: {type(e).__name__}")
        print(f"Message: {str(e)}")
        raise

if __name__ == "__main__":
    main()
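
The crucial difference in the working script is that the assistant message is appended back verbatim via message.model_dump(exclude_unset=True), so whatever extra fields the LiteLLM proxy attaches to carry the thinking survive the round trip. Roughly what that appended dict looks like (the reasoning_content / thinking_blocks field names are what LiteLLM documents for Anthropic thinking; treat them as an assumption about the response shape, and the values are placeholders):

# Assistant turn as appended by the working script via model_dump(exclude_unset=True).
# Everything beyond role/content/tool_calls is an assumption about what the
# LiteLLM proxy returns for Anthropic extended thinking.
assistant_turn = {
    "role": "assistant",
    "content": None,
    "tool_calls": [
        {
            "id": "call_abc123",  # placeholder id
            "type": "function",
            "function": {
                "name": "get_weather",
                "arguments": '{"location": "Paris"}',
            },
        }
    ],
    "reasoning_content": "The user wants the weather in Paris...",
    "thinking_blocks": [
        {
            "type": "thinking",
            "thinking": "The user wants the weather in Paris...",
            "signature": "<signature from the model>",
        }
    ],
}
# Because this dict goes back to the proxy untouched, LiteLLM can rebuild the
# required leading thinking block for Vertex. OpenAIChatModel presumably rebuilds
# the assistant message from its own parts, which is where those fields get lost.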

Python, Pydantic AI & LLM client version

Python 3.12.7
pydantic-ai 1.0.15
