From 09bb99411a52eb7a4fd613d5123def176027cde4 Mon Sep 17 00:00:00 2001 From: nmarasoiu Date: Wed, 10 Dec 2025 10:33:32 +0000 Subject: [PATCH] feat(hooks): add thinking_budget hook for extended thinking management Add a new hook that manages Claude's extended thinking budget_tokens parameter: - Inject default budget_tokens when thinking is enabled but budget is missing - Override budget_tokens when below configurable minimum threshold - Ensure budget < max_tokens (API constraint) - Optionally inject thinking configuration for thinking-capable models Key design decisions: - Trust caller: if request has thinking, adjust budget regardless of model - Model filter only applies to inject_if_missing mode - Anthropic-specific: non-Anthropic providers will ignore thinking field - Simple config via hook params (no env var complexity) Configuration: - budget_default: Default budget (10000) - budget_min: Minimum threshold (1024) - inject_if_missing: Auto-inject thinking (false) - log_modifications: Log changes (true) --- README.md | 35 +++- src/ccproxy/hooks.py | 117 +++++++++++++ tests/test_hooks.py | 406 +++++++++++++++++++++++++++++++++++++++++++ 3 files changed, 556 insertions(+), 2 deletions(-) diff --git a/README.md b/README.md index b46fcee..5bc10cc 100644 --- a/README.md +++ b/README.md @@ -232,9 +232,40 @@ general_settings: See [docs/configuration.md](docs/configuration.md) for more information on how to customize your Claude Code experience using `ccproxy`. - +## Extended Thinking Budget Hook - +`ccproxy` includes a `thinking_budget` hook that manages Claude's extended thinking `budget_tokens` parameter: + +- Injects default `budget_tokens` when thinking is enabled but budget is missing +- Overrides `budget_tokens` when below a configurable minimum +- Ensures `budget_tokens < max_tokens` (API constraint) +- Optionally injects thinking for thinking-capable models + +Note: `thinking` is Anthropic-specific. Non-Anthropic providers will ignore it. 
+ +### Configuration + +```yaml +ccproxy: + hooks: + - ccproxy.hooks.rule_evaluator + - ccproxy.hooks.model_router + - ccproxy.hooks.forward_oauth + - ccproxy.hooks.thinking_budget # Simple form - uses defaults + + # OR with custom parameters: + # - hook: ccproxy.hooks.thinking_budget + # params: + # budget_default: 16000 # Default budget (default: 10000) + # budget_min: 4000 # Minimum threshold (default: 1024) + # inject_if_missing: false # Inject thinking if absent (default: false) + # log_modifications: true # Log changes (default: true) +``` + +### Behavior + +- **Existing thinking**: If request has `thinking`, adjusts budget as needed (trusts caller on model choice) +- **inject_if_missing**: Only injects thinking for Claude 3.7+ and Claude 4 models ## Routing Rules diff --git a/src/ccproxy/hooks.py b/src/ccproxy/hooks.py index 5515365..b849088 100644 --- a/src/ccproxy/hooks.py +++ b/src/ccproxy/hooks.py @@ -13,6 +13,17 @@ # Set up structured logging logger = logging.getLogger(__name__) +# Minimum budget_tokens allowed by the Anthropic API for extended thinking +API_MIN_BUDGET_TOKENS = 1024 + +# Models that support extended thinking (regex patterns) +# Note: thinking is Anthropic-specific; non-Anthropic providers will ignore it +THINKING_CAPABLE_PATTERNS = [ + r"claude-3-7", # Claude 3.7 (e.g., claude-3-7-sonnet-20250219) + r"claude-[a-z]+-4", # Claude 4.x (e.g., claude-sonnet-4-*, claude-opus-4-*) + r"claude-4", # Future claude-4-* models +] + # Global storage for request metadata, keyed by litellm_call_id # Required because LiteLLM doesn't preserve custom metadata from async_pre_call_hook # to logging callbacks - only internal fields like user_id and hidden_params survive. 
@@ -429,3 +440,109 @@ def forward_apikey(data: dict[str, Any], user_api_key_dict: dict[str, Any], **kw ) return data + + +def _is_thinking_capable_model(model: str | None) -> bool: + """Check if the model supports extended thinking.""" + if not model: + return False + model_lower = model.lower() + return any(re.search(pattern, model_lower) for pattern in THINKING_CAPABLE_PATTERNS) + + +def thinking_budget(data: dict[str, Any], user_api_key_dict: dict[str, Any], **kwargs: Any) -> dict[str, Any]: + """Manage thinking budget_tokens for Anthropic extended thinking requests. + + Adjusts budget_tokens when thinking is enabled: + - Injects default budget when missing + - Overrides budget below minimum threshold + - Ensures budget < max_tokens (API constraint) + + Optionally injects thinking config when not present (for thinking-capable models only). + + Note: thinking is Anthropic-specific. Non-Anthropic providers will ignore it. + + Args: + data: Request data from LiteLLM + user_api_key_dict: User API key dictionary + **kwargs: Hook parameters: + - budget_default: Default budget (default: 10000) + - budget_min: Minimum threshold (default: 1024) + - inject_if_missing: Inject thinking if absent (default: False) + - log_modifications: Log changes (default: True) + """ + budget_default = kwargs.get("budget_default", 10000) + budget_min = kwargs.get("budget_min", API_MIN_BUDGET_TOKENS) + inject_if_missing = kwargs.get("inject_if_missing", False) + log_modifications = kwargs.get("log_modifications", True) + + # Ensure minimums respect API constraint + budget_min = max(budget_min, API_MIN_BUDGET_TOKENS) + budget_default = max(budget_default, API_MIN_BUDGET_TOKENS) + + # Get fields from request + request = data.get("proxy_server_request", {}) + body = request.get("body", {}) + thinking = data.get("thinking") or body.get("thinking") + model = data.get("model") or body.get("model") + max_tokens = data.get("max_tokens") or body.get("max_tokens") + + modified = False + reason = "" + 
+ if thinking is not None: + # Request has thinking - adjust budget if needed (trust caller on model choice) + if isinstance(thinking, dict) and thinking.get("type") == "enabled": + current_budget = thinking.get("budget_tokens") + + if current_budget is None: + thinking["budget_tokens"] = budget_default + modified = True + reason = f"injected budget_tokens={budget_default}" + elif current_budget < budget_min: + thinking["budget_tokens"] = budget_default + modified = True + reason = f"increased budget_tokens {current_budget} -> {budget_default} (min: {budget_min})" + + # Ensure budget < max_tokens + if max_tokens is not None and thinking.get("budget_tokens", 0) >= max_tokens: + if max_tokens <= API_MIN_BUDGET_TOKENS: + logger.warning(f"max_tokens={max_tokens} too low for thinking (min budget: {API_MIN_BUDGET_TOKENS})") + else: + thinking["budget_tokens"] = max_tokens - 1 + modified = True + reason += f"; capped at {max_tokens - 1} (max_tokens constraint)" + + data["thinking"] = thinking + + elif inject_if_missing: + # No thinking present - inject only for thinking-capable models + if not _is_thinking_capable_model(model): + logger.debug(f"Skipping thinking injection for model: {model}") + return data + + if max_tokens is not None and max_tokens <= API_MIN_BUDGET_TOKENS: + logger.debug(f"Skipping thinking injection: max_tokens={max_tokens} too low") + return data + + budget = budget_default + if max_tokens is not None and budget >= max_tokens: + budget = max_tokens - 1 + + data["thinking"] = {"type": "enabled", "budget_tokens": budget} + modified = True + reason = f"injected thinking with budget_tokens={budget}" + + # Thinking requires temperature=1 + current_temp = data.get("temperature") or body.get("temperature") + if current_temp is not None and current_temp != 1: + data["temperature"] = 1 + reason += "; set temperature=1" + + if modified and log_modifications: + logger.info( + f"Adjusted thinking budget for {model}: {reason}", + extra={"event": 
"thinking_budget_modified", "model": model, "reason": reason}, + ) + + return data diff --git a/tests/test_hooks.py b/tests/test_hooks.py index dbc58da..773c669 100644 --- a/tests/test_hooks.py +++ b/tests/test_hooks.py @@ -9,12 +9,16 @@ from ccproxy.classifier import RequestClassifier from ccproxy.config import clear_config_instance from ccproxy.hooks import ( + API_MIN_BUDGET_TOKENS, + THINKING_CAPABLE_PATTERNS, + _is_thinking_capable_model, capture_headers, extract_session_id, forward_apikey, forward_oauth, model_router, rule_evaluator, + thinking_budget, ) from ccproxy.router import ModelRouter, clear_router @@ -1258,3 +1262,405 @@ def test_extract_session_id_preserves_existing_trace_metadata(self, user_api_key assert trace_meta["existing_trace_key"] == "existing_trace_value" assert trace_meta["claude_user_hash"] == "hash123" assert trace_meta["claude_account_id"] == "acct456" + + +class TestIsThinkingCapableModel: + """Test the _is_thinking_capable_model helper function.""" + + def test_claude_3_7_sonnet(self): + """Test claude-3-7-sonnet is recognized.""" + assert _is_thinking_capable_model("claude-3-7-sonnet-20250219") is True + + def test_claude_4_models(self): + """Test claude-4 models are recognized.""" + assert _is_thinking_capable_model("claude-4-opus-20250101") is True + assert _is_thinking_capable_model("claude-4-sonnet-20250101") is True + + def test_claude_sonnet_4(self): + """Test claude-sonnet-4 models are recognized.""" + assert _is_thinking_capable_model("claude-sonnet-4-5-20250929") is True + assert _is_thinking_capable_model("claude-sonnet-4-20250514") is True + + def test_claude_opus_4(self): + """Test claude-opus-4 models are recognized.""" + assert _is_thinking_capable_model("claude-opus-4-5-20251101") is True + assert _is_thinking_capable_model("claude-opus-4-20250514") is True + + def test_claude_haiku_4(self): + """Test claude-haiku-4 models are recognized.""" + assert _is_thinking_capable_model("claude-haiku-4-5-20251001") is True + + 
# Model names shared by the thinking_budget tests below.
_SONNET_45 = "claude-sonnet-4-5-20250929"
_LEGACY_SONNET = "claude-3-5-sonnet-20241022"


def _payload(model=_SONNET_45, **fields):
    """Build a request dict holding ``model`` plus any extra top-level fields."""
    return {"model": model, **fields}


def _messages(text):
    """Return a fresh single-turn user message list."""
    return [{"role": "user", "content": text}]


def _enabled(budget_tokens=None):
    """Return an enabled thinking config, with ``budget_tokens`` only when given."""
    config = {"type": "enabled"}
    if budget_tokens is not None:
        config["budget_tokens"] = budget_tokens
    return config


class TestIsThinkingCapableModel:
    """Exercise the _is_thinking_capable_model helper."""

    def test_claude_3_7_sonnet(self):
        """claude-3-7-sonnet is recognized."""
        assert _is_thinking_capable_model("claude-3-7-sonnet-20250219") is True

    def test_claude_4_models(self):
        """claude-4-* names are recognized."""
        for name in ("claude-4-opus-20250101", "claude-4-sonnet-20250101"):
            assert _is_thinking_capable_model(name) is True

    def test_claude_sonnet_4(self):
        """claude-sonnet-4 names are recognized."""
        for name in ("claude-sonnet-4-5-20250929", "claude-sonnet-4-20250514"):
            assert _is_thinking_capable_model(name) is True

    def test_claude_opus_4(self):
        """claude-opus-4 names are recognized."""
        for name in ("claude-opus-4-5-20251101", "claude-opus-4-20250514"):
            assert _is_thinking_capable_model(name) is True

    def test_claude_haiku_4(self):
        """claude-haiku-4 names are recognized."""
        assert _is_thinking_capable_model("claude-haiku-4-5-20251001") is True

    def test_older_models_not_capable(self):
        """Pre-3.7 Claude models are rejected."""
        older = (
            "claude-3-5-sonnet-20241022",
            "claude-3-5-haiku-20241022",
            "claude-3-opus-20240229",
        )
        for name in older:
            assert _is_thinking_capable_model(name) is False

    def test_non_claude_models(self):
        """Non-Claude models are rejected."""
        for name in ("gpt-4", "gemini-pro"):
            assert _is_thinking_capable_model(name) is False

    def test_none_model(self):
        """None is rejected."""
        assert _is_thinking_capable_model(None) is False

    def test_empty_model(self):
        """The empty string is rejected."""
        assert _is_thinking_capable_model("") is False

    def test_case_insensitive(self):
        """Matching ignores letter case."""
        for name in ("CLAUDE-SONNET-4-5-20250929", "Claude-Opus-4-5-20251101"):
            assert _is_thinking_capable_model(name) is True


class TestThinkingBudgetHookBasic:
    """Baseline behavior of the thinking_budget hook."""

    def test_no_modification_without_thinking(self, user_api_key_dict):
        """A request without thinking stays without thinking."""
        request = _payload(messages=_messages("test message"), max_tokens=16000)
        outcome = thinking_budget(request, user_api_key_dict)
        assert "thinking" not in outcome

    def test_preserves_existing_thinking(self, user_api_key_dict):
        """A thinking config above the minimum is left alone."""
        request = _payload(
            messages=_messages("test message"),
            max_tokens=16000,
            thinking=_enabled(5000),
        )
        outcome = thinking_budget(request, user_api_key_dict)
        assert outcome["thinking"]["type"] == "enabled"
        assert outcome["thinking"]["budget_tokens"] == 5000

    def test_returns_data_dict(self, user_api_key_dict):
        """The hook returns a dict."""
        request = _payload(messages=_messages("test message"), max_tokens=16000)
        outcome = thinking_budget(request, user_api_key_dict)
        assert isinstance(outcome, dict)


class TestThinkingBudgetHookModification:
    """Budget adjustments applied to an existing thinking config."""

    def test_inject_missing_budget_tokens(self, user_api_key_dict):
        """A missing budget_tokens is filled with the default."""
        request = _payload(max_tokens=16000, thinking=_enabled())
        outcome = thinking_budget(request, user_api_key_dict)
        assert outcome["thinking"]["budget_tokens"] == 10000

    def test_override_low_budget(self, user_api_key_dict):
        """A budget below the minimum is replaced by the default."""
        request = _payload(max_tokens=16000, thinking=_enabled(500))
        outcome = thinking_budget(request, user_api_key_dict)
        assert outcome["thinking"]["budget_tokens"] == 10000

    def test_custom_budget_min(self, user_api_key_dict):
        """Custom budget_min and budget_default parameters are honored."""
        request = _payload(max_tokens=16000, thinking=_enabled(3000))
        outcome = thinking_budget(request, user_api_key_dict, budget_min=5000, budget_default=8000)
        assert outcome["thinking"]["budget_tokens"] == 8000

    def test_respects_budget_above_min(self, user_api_key_dict):
        """A budget above the minimum is not modified."""
        request = _payload(max_tokens=16000, thinking=_enabled(8000))
        outcome = thinking_budget(request, user_api_key_dict)
        assert outcome["thinking"]["budget_tokens"] == 8000


class TestThinkingBudgetHookMaxTokensConstraint:
    """Handling of the budget < max_tokens constraint."""

    def test_adjust_budget_when_exceeds_max_tokens(self, user_api_key_dict):
        """A budget above max_tokens is capped at max_tokens - 1."""
        request = _payload(max_tokens=5000, thinking=_enabled(6000))
        outcome = thinking_budget(request, user_api_key_dict)
        assert outcome["thinking"]["budget_tokens"] == 4999

    def test_adjust_budget_equals_max_tokens(self, user_api_key_dict):
        """A budget equal to max_tokens is capped at max_tokens - 1."""
        request = _payload(max_tokens=5000, thinking=_enabled(5000))
        outcome = thinking_budget(request, user_api_key_dict)
        assert outcome["thinking"]["budget_tokens"] == 4999

    def test_no_adjustment_when_budget_below_max_tokens(self, user_api_key_dict):
        """A budget already below max_tokens is kept."""
        request = _payload(max_tokens=10000, thinking=_enabled(5000))
        outcome = thinking_budget(request, user_api_key_dict)
        assert outcome["thinking"]["budget_tokens"] == 5000

    def test_max_tokens_too_low_warning(self, user_api_key_dict, caplog):
        """A warning is logged when max_tokens leaves no room for any budget."""
        request = _payload(max_tokens=500, thinking=_enabled(2000))
        with caplog.at_level(logging.WARNING):
            thinking_budget(request, user_api_key_dict)
        assert "max_tokens=500 too low for thinking" in caplog.text


class TestThinkingBudgetHookInjection:
    """Behavior of the inject_if_missing option."""

    def test_inject_thinking_when_missing(self, user_api_key_dict):
        """Thinking is injected when the option is on."""
        request = _payload(max_tokens=16000, messages=_messages("test"))
        outcome = thinking_budget(request, user_api_key_dict, inject_if_missing=True)
        assert "thinking" in outcome
        assert outcome["thinking"]["type"] == "enabled"
        assert outcome["thinking"]["budget_tokens"] == 10000

    def test_no_inject_by_default(self, user_api_key_dict):
        """Thinking is not injected unless asked for."""
        request = _payload(messages=_messages("test message"), max_tokens=16000)
        outcome = thinking_budget(request, user_api_key_dict)
        assert "thinking" not in outcome

    def test_inject_adjusts_for_max_tokens(self, user_api_key_dict):
        """An injected budget respects the max_tokens constraint."""
        request = _payload(max_tokens=5000, messages=_messages("test"))
        outcome = thinking_budget(request, user_api_key_dict, inject_if_missing=True, budget_default=10000)
        assert outcome["thinking"]["budget_tokens"] == 4999

    def test_inject_skipped_when_max_tokens_too_low(self, user_api_key_dict, caplog):
        """Injection is skipped when max_tokens is too low."""
        request = _payload(max_tokens=500, messages=_messages("test"))
        with caplog.at_level(logging.DEBUG):
            outcome = thinking_budget(request, user_api_key_dict, inject_if_missing=True)
        assert "thinking" not in outcome
        assert "Skipping thinking injection" in caplog.text

    def test_inject_sets_temperature(self, user_api_key_dict):
        """Injection forces an explicit temperature to 1."""
        request = _payload(max_tokens=16000, temperature=0.7, messages=_messages("test"))
        outcome = thinking_budget(request, user_api_key_dict, inject_if_missing=True)
        assert outcome["temperature"] == 1


class TestThinkingBudgetHookModelFilter:
    """The model filter applies to inject_if_missing only."""

    def test_existing_thinking_processed_regardless_of_model(self, user_api_key_dict):
        """Existing thinking is adjusted even for a non-thinking model (trust caller)."""
        request = _payload(_LEGACY_SONNET, max_tokens=16000, thinking=_enabled(500))
        outcome = thinking_budget(request, user_api_key_dict)
        # The caller explicitly enabled thinking, so the budget is still raised.
        assert outcome["thinking"]["budget_tokens"] == 10000

    def test_inject_skipped_for_non_thinking_model(self, user_api_key_dict, caplog):
        """inject_if_missing is skipped for models that are not thinking-capable."""
        request = _payload(_LEGACY_SONNET, max_tokens=16000, messages=_messages("test"))
        with caplog.at_level(logging.DEBUG):
            outcome = thinking_budget(request, user_api_key_dict, inject_if_missing=True)
        assert "thinking" not in outcome
        assert "Skipping thinking injection for model" in caplog.text

    def test_inject_allowed_for_thinking_model(self, user_api_key_dict):
        """inject_if_missing works for thinking-capable models."""
        request = _payload(max_tokens=16000, messages=_messages("test"))
        outcome = thinking_budget(request, user_api_key_dict, inject_if_missing=True)
        assert "thinking" in outcome
        assert outcome["thinking"]["budget_tokens"] == 10000


class TestThinkingBudgetHookLogging:
    """Logging of modifications."""

    def test_logs_modification(self, user_api_key_dict, caplog):
        """A modification is logged at INFO level."""
        request = _payload(max_tokens=16000, thinking=_enabled())
        with caplog.at_level(logging.INFO):
            thinking_budget(request, user_api_key_dict)
        assert "Adjusted thinking budget" in caplog.text
        assert "injected budget_tokens" in caplog.text

    def test_disable_logging(self, user_api_key_dict, caplog):
        """Nothing is logged when log_modifications is off."""
        request = _payload(max_tokens=16000, thinking=_enabled())
        with caplog.at_level(logging.INFO):
            thinking_budget(request, user_api_key_dict, log_modifications=False)
        assert "Adjusted thinking budget" not in caplog.text


class TestThinkingBudgetHookEdgeCases:
    """Edge cases and malformed input."""

    def test_thinking_type_not_enabled(self, user_api_key_dict):
        """A thinking config whose type is not 'enabled' is left alone."""
        request = _payload(max_tokens=16000, thinking={"type": "disabled"})
        outcome = thinking_budget(request, user_api_key_dict)
        assert outcome["thinking"]["type"] == "disabled"
        assert "budget_tokens" not in outcome["thinking"]

    def test_thinking_not_dict(self, user_api_key_dict):
        """A non-dict thinking value is passed through."""
        request = _payload(max_tokens=16000, thinking="enabled")
        outcome = thinking_budget(request, user_api_key_dict)
        assert outcome["thinking"] == "enabled"

    def test_no_max_tokens(self, user_api_key_dict):
        """A missing max_tokens does not prevent the budget adjustment."""
        request = _payload(thinking=_enabled(500))
        outcome = thinking_budget(request, user_api_key_dict)
        assert outcome["thinking"]["budget_tokens"] == 10000

    def test_empty_data(self, user_api_key_dict):
        """Empty request data is returned unchanged."""
        outcome = thinking_budget({}, user_api_key_dict)
        assert outcome == {}

    def test_none_model_with_thinking(self, user_api_key_dict):
        """Existing thinking is adjusted even when model is None (trust caller)."""
        request = _payload(None, max_tokens=16000, thinking=_enabled(500))
        outcome = thinking_budget(request, user_api_key_dict)
        # The caller explicitly enabled thinking, so the budget is still raised.
        assert outcome["thinking"]["budget_tokens"] == 10000

    def test_none_model_inject_skipped(self, user_api_key_dict):
        """inject_if_missing is skipped when model is None."""
        request = _payload(None, max_tokens=16000)
        outcome = thinking_budget(request, user_api_key_dict, inject_if_missing=True)
        # A None model is not recognized as thinking-capable.
        assert "thinking" not in outcome