Skip to content
Draft
Show file tree
Hide file tree
Changes from all commits
Commits
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
35 changes: 33 additions & 2 deletions README.md
Original file line number Diff line number Diff line change
Expand Up @@ -232,9 +232,40 @@ general_settings:

See [docs/configuration.md](docs/configuration.md) for more information on how to customize your Claude Code experience using `ccproxy`.

<!-- ## Extended Thinking -->
## Extended Thinking Budget Hook

<!-- Normally, when you send a message, Claude Code does a simple keyword scan for words/phrases like "think deeply" to determine whether or not to enable thinking, as well as the size of the thinking token budget. [Simply including the word "ultrathink"](https://claudelog.com/mechanics/ultrathink-plus-plus/) sets the thinking token budget to the maximum of `31999`. -->
`ccproxy` includes a `thinking_budget` hook that manages Claude's extended thinking `budget_tokens` parameter:

- Injects default `budget_tokens` when thinking is enabled but budget is missing
- Overrides `budget_tokens` when below a configurable minimum
- Ensures `budget_tokens < max_tokens` (API constraint)
- Optionally injects thinking for thinking-capable models

Note: `thinking` is Anthropic-specific. Non-Anthropic providers will ignore it.

### Configuration

```yaml
ccproxy:
  hooks:
    - ccproxy.hooks.rule_evaluator
    - ccproxy.hooks.model_router
    - ccproxy.hooks.forward_oauth
    - ccproxy.hooks.thinking_budget  # Simple form - uses defaults

    # OR with custom parameters:
    # - hook: ccproxy.hooks.thinking_budget
    #   params:
    #     budget_default: 16000      # Default budget (default: 10000)
    #     budget_min: 4000           # Minimum threshold (default: 1024)
    #     inject_if_missing: false   # Inject thinking if absent (default: false)
    #     log_modifications: true    # Log changes (default: true)
```

### Behavior

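- **Existing thinking**: If the request already has `thinking` enabled, the hook only adjusts `budget_tokens` as needed; it trusts the caller's model choice and does not check whether the model supports thinking.
- **inject_if_missing**: When enabled and the request has no `thinking`, thinking is injected only for Claude 3.7+ and Claude 4 models.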

## Routing Rules

Expand Down
117 changes: 117 additions & 0 deletions src/ccproxy/hooks.py
Original file line number Diff line number Diff line change
Expand Up @@ -13,6 +13,17 @@
# Set up structured logging
logger = logging.getLogger(__name__)

# Minimum budget_tokens allowed by the Anthropic API for extended thinking.
# Used as a hard floor for both the configured minimum and the default budget.
API_MIN_BUDGET_TOKENS = 1024

# Models that support extended thinking (regex patterns).
# Each pattern is applied with re.search() to the lower-cased model name, so a
# match anywhere in the name (e.g. behind a provider prefix) counts.
# Note: thinking is Anthropic-specific; non-Anthropic providers will ignore it
THINKING_CAPABLE_PATTERNS = [
    r"claude-3-7",  # Claude 3.7 (e.g., claude-3-7-sonnet-20250219)
    r"claude-[a-z]+-4",  # Claude 4.x (e.g., claude-sonnet-4-*, claude-opus-4-*)
    r"claude-4",  # Future claude-4-* models
]

# Global storage for request metadata, keyed by litellm_call_id
# Required because LiteLLM doesn't preserve custom metadata from async_pre_call_hook
# to logging callbacks - only internal fields like user_id and hidden_params survive.
Expand Down Expand Up @@ -429,3 +440,109 @@ def forward_apikey(data: dict[str, Any], user_api_key_dict: dict[str, Any], **kw
)

return data


def _is_thinking_capable_model(model: str | None) -> bool:
    """Return True when *model* matches any known thinking-capable pattern.

    The model name is lower-cased and tested against every regex in
    ``THINKING_CAPABLE_PATTERNS`` using ``re.search``. A missing or empty
    model name never matches.
    """
    if not model:
        return False

    normalized = model.lower()
    for candidate in THINKING_CAPABLE_PATTERNS:
        if re.search(candidate, normalized):
            return True
    return False


def _first_not_none(key: str, *sources: dict[str, Any]) -> Any:
    """Return the first non-None value stored under *key* in *sources*, else None."""
    for source in sources:
        value = source.get(key)
        if value is not None:
            return value
    return None


def thinking_budget(data: dict[str, Any], user_api_key_dict: dict[str, Any], **kwargs: Any) -> dict[str, Any]:
    """Manage thinking budget_tokens for Anthropic extended thinking requests.

    Adjusts budget_tokens when thinking is enabled:
    - Injects default budget when missing
    - Overrides budget below minimum threshold
    - Ensures budget < max_tokens (API constraint)

    Optionally injects thinking config when not present (for thinking-capable models only).
    When thinking is injected and the request carries a temperature other than 1
    (including 0), the temperature is reset to 1.

    Note: thinking is Anthropic-specific. Non-Anthropic providers will ignore it.

    Args:
        data: Request data from LiteLLM. Mutated in place and returned.
        user_api_key_dict: User API key dictionary (unused by this hook).
        **kwargs: Hook parameters:
            - budget_default: Default budget (default: 10000)
            - budget_min: Minimum threshold (default: 1024)
            - inject_if_missing: Inject thinking if absent (default: False)
            - log_modifications: Log changes (default: True)

    Returns:
        The same ``data`` dict, possibly with ``thinking`` and ``temperature`` updated.
    """
    budget_default = kwargs.get("budget_default", 10000)
    budget_min = kwargs.get("budget_min", API_MIN_BUDGET_TOKENS)
    inject_if_missing = kwargs.get("inject_if_missing", False)
    log_modifications = kwargs.get("log_modifications", True)

    # Ensure minimums respect API constraint
    budget_min = max(budget_min, API_MIN_BUDGET_TOKENS)
    # The default doubles as the replacement for too-small budgets, so it must
    # itself satisfy the minimum; otherwise an "increase" could end up below
    # (or even lower than) the value it replaces.
    budget_default = max(budget_default, budget_min)

    # Get fields from request; tolerate a missing/None/non-dict request or body
    request = data.get("proxy_server_request")
    body = request.get("body") if isinstance(request, dict) else None
    if not isinstance(body, dict):
        body = {}

    thinking = data.get("thinking") or body.get("thinking")
    model = data.get("model") or body.get("model")
    # Numeric fields use an explicit None check so that a legitimate 0 is not
    # mistaken for "absent".
    max_tokens = _first_not_none("max_tokens", data, body)

    # Human-readable descriptions of every change made, joined for logging
    reasons: list[str] = []

    if thinking is not None:
        # Request has thinking - adjust budget if needed (trust caller on model choice)
        if isinstance(thinking, dict) and thinking.get("type") == "enabled":
            current_budget = thinking.get("budget_tokens")

            if current_budget is None:
                thinking["budget_tokens"] = budget_default
                reasons.append(f"injected budget_tokens={budget_default}")
            elif current_budget < budget_min:
                thinking["budget_tokens"] = budget_default
                reasons.append(f"increased budget_tokens {current_budget} -> {budget_default} (min: {budget_min})")

            # Ensure budget < max_tokens
            if max_tokens is not None and thinking.get("budget_tokens", 0) >= max_tokens:
                if max_tokens <= API_MIN_BUDGET_TOKENS:
                    # No valid budget exists below max_tokens; leave the request untouched
                    logger.warning(f"max_tokens={max_tokens} too low for thinking (min budget: {API_MIN_BUDGET_TOKENS})")
                else:
                    thinking["budget_tokens"] = max_tokens - 1
                    reasons.append(f"capped at {max_tokens - 1} (max_tokens constraint)")

            # The config may have come from the nested body; surface it on the
            # top-level request so the adjusted value is what gets used.
            data["thinking"] = thinking

    elif inject_if_missing:
        # No thinking present - inject only for thinking-capable models
        if not _is_thinking_capable_model(model):
            logger.debug(f"Skipping thinking injection for model: {model}")
            return data

        if max_tokens is not None and max_tokens <= API_MIN_BUDGET_TOKENS:
            logger.debug(f"Skipping thinking injection: max_tokens={max_tokens} too low")
            return data

        budget = budget_default
        if max_tokens is not None and budget >= max_tokens:
            budget = max_tokens - 1

        data["thinking"] = {"type": "enabled", "budget_tokens": budget}
        reasons.append(f"injected thinking with budget_tokens={budget}")

        # Thinking requires temperature=1. Use a None-aware lookup: temperature=0
        # is a common explicit setting and must be overridden as well.
        current_temp = _first_not_none("temperature", data, body)
        if current_temp is not None and current_temp != 1:
            data["temperature"] = 1
            reasons.append("set temperature=1")

    if reasons and log_modifications:
        reason = "; ".join(reasons)
        logger.info(
            f"Adjusted thinking budget for {model}: {reason}",
            extra={"event": "thinking_budget_modified", "model": model, "reason": reason},
        )

    return data
Loading