starbaser · nmarasoiu · Dec 10, 2025
diff --git a/README.md b/README.md
@@ -232,9 +232,40 @@ general_settings:
 
 See [docs/configuration.md](docs/configuration.md) for more information on how to customize your Claude Code experience using `ccproxy`.
 
-<!-- ## Extended Thinking -->
+## Extended Thinking Budget Hook
 
-<!-- Normally, when you send a message, Claude Code does a simple keyword scan for words/phrases like "think deeply" to determine whether or not to enable thinking, as well the size of the thinking token budget. [Simply including the word "ultrathink](https://claudelog.com/mechanics/ultrathink-plus-plus/) sets the thinking token budget to the maximum of `31999`. -->
+`ccproxy` includes a `thinking_budget` hook that manages Claude's extended thinking `budget_tokens` parameter:
+
+- Injects default `budget_tokens` when thinking is enabled but budget is missing
+- Raises `budget_tokens` to the default when it is below a configurable minimum
+- Caps `budget_tokens` at `max_tokens - 1` so that `budget_tokens < max_tokens` (API constraint)
+- Optionally injects thinking for thinking-capable models
+
+Note: `thinking` is Anthropic-specific. Non-Anthropic providers will ignore it.
+
+### Configuration
+
+```yaml
+ccproxy:
+  hooks:
+    - ccproxy.hooks.rule_evaluator
+    - ccproxy.hooks.model_router
+    - ccproxy.hooks.forward_oauth
+    - ccproxy.hooks.thinking_budget  # Simple form - uses defaults
+
+    # OR with custom parameters:
+    # - hook: ccproxy.hooks.thinking_budget
+    #   params:
+    #     budget_default: 16000      # Default budget (default: 10000)
+    #     budget_min: 4000           # Minimum threshold (default: 1024)
+    #     inject_if_missing: false   # Inject thinking if absent (default: false)
+    #     log_modifications: true    # Log changes (default: true)
+```
+
+### Behavior
+
+- **Existing thinking**: If the request already has `thinking` enabled, adjusts the budget as needed (trusts the caller on model choice)
+- **inject_if_missing**: Only injects thinking for Claude 3.7+ and Claude 4 models, and only when `max_tokens` (if set) is above 1024; a `temperature` other than 1 is reset to 1
 
 ## Routing Rules
 

diff --git a/src/ccproxy/hooks.py b/src/ccproxy/hooks.py
@@ -13,6 +13,17 @@
 # Set up structured logging
 logger = logging.getLogger(__name__)
 
+# Minimum budget_tokens allowed by the Anthropic API for extended thinking; floor for budget_min
+API_MIN_BUDGET_TOKENS = 1024
+
+# Lowercase regex patterns, searched (re.search) in the lowercased model name, for thinking-capable models
+# Note: thinking is Anthropic-specific; non-Anthropic providers will ignore it
+THINKING_CAPABLE_PATTERNS = [
+    r"claude-3-7",        # Claude 3.7 (e.g., claude-3-7-sonnet-20250219)
+    r"claude-[a-z]+-4",   # Claude 4.x (e.g., claude-sonnet-4-*, claude-opus-4-*)
+    r"claude-4",          # Future claude-4-* models
+]
+
 # Global storage for request metadata, keyed by litellm_call_id
 # Required because LiteLLM doesn't preserve custom metadata from async_pre_call_hook
 # to logging callbacks - only internal fields like user_id and hidden_params survive.
@@ -429,3 +440,109 @@ def forward_apikey(data: dict[str, Any], user_api_key_dict: dict[str, Any], **kw
         )
 
     return data
+
+
+def _is_thinking_capable_model(model: str | None) -> bool:
+    """Return True when ``model`` matches a pattern in THINKING_CAPABLE_PATTERNS (case-insensitive)."""
+    if not model:  # None or empty string can never name a thinking-capable model
+        return False
+    lowered = model.lower()  # the patterns are written in lowercase
+    return any(re.search(regex, lowered) is not None for regex in THINKING_CAPABLE_PATTERNS)
+
+
+def thinking_budget(data: dict[str, Any], user_api_key_dict: dict[str, Any], **kwargs: Any) -> dict[str, Any]:
+    """Manage thinking budget_tokens for Anthropic extended thinking requests.
+
+    Adjusts budget_tokens when thinking is enabled:
+    - Injects default budget when missing
+    - Raises a budget below the minimum threshold to the default
+    - Caps budget at max_tokens - 1 (API requires budget < max_tokens)
+
+    Optionally injects thinking config when not present (for thinking-capable models only).
+    Note: thinking is Anthropic-specific. Non-Anthropic providers will ignore it.
+
+    Args:
+        data: Request data from LiteLLM (updated in place)
+        user_api_key_dict: User API key dictionary (unused)
+        **kwargs: Hook parameters:
+            - budget_default: Default budget (default: 10000; raised to budget_min if lower)
+            - budget_min: Minimum threshold (default: 1024; raised to the API minimum if lower)
+            - inject_if_missing: Inject thinking if absent, resetting a non-1 temperature to 1 (default: False)
+            - log_modifications: Log changes (default: True)
+
+    Returns:
+        The same ``data`` dict, with ``thinking`` / ``temperature`` adjusted where needed.
+    """
+    budget_default = kwargs.get("budget_default", 10000)
+    budget_min = kwargs.get("budget_min", API_MIN_BUDGET_TOKENS)
+    inject_if_missing = kwargs.get("inject_if_missing", False)
+    log_modifications = kwargs.get("log_modifications", True)
+
+    # Respect the API minimum, and never let the replacement default fall below the configured minimum
+    budget_min = max(budget_min, API_MIN_BUDGET_TOKENS)
+    budget_default = max(budget_default, budget_min)
+
+    # Get fields from request (top-level values win; the proxied request body is the fallback)
+    request = data.get("proxy_server_request", {})
+    body = request.get("body", {})
+    thinking = data.get("thinking") or body.get("thinking")
+    model = data.get("model") or body.get("model")
+    max_tokens = data.get("max_tokens") or body.get("max_tokens")
+
+    modified = False
+    reason = ""
+
+    if thinking is not None:
+        # Request has thinking - adjust budget if needed (trust caller on model choice)
+        if isinstance(thinking, dict) and thinking.get("type") == "enabled":
+            current_budget = thinking.get("budget_tokens")
+            if current_budget is None:
+                thinking["budget_tokens"] = budget_default
+                modified = True
+                reason = f"injected budget_tokens={budget_default}"
+            elif current_budget < budget_min:
+                thinking["budget_tokens"] = budget_default
+                modified = True
+                reason = f"increased budget_tokens {current_budget} -> {budget_default} (min: {budget_min})"
+
+            # Ensure budget < max_tokens (can only warn when max_tokens leaves no room for a valid budget)
+            if max_tokens is not None and thinking.get("budget_tokens", 0) >= max_tokens:
+                if max_tokens <= API_MIN_BUDGET_TOKENS:
+                    logger.warning(f"max_tokens={max_tokens} too low for thinking (min budget: {API_MIN_BUDGET_TOKENS})")
+                else:
+                    thinking["budget_tokens"] = max_tokens - 1
+                    modified = True
+                    reason += ("; " if reason else "") + f"capped at {max_tokens - 1} (max_tokens constraint)"
+
+            data["thinking"] = thinking
+
+    elif inject_if_missing:
+        # No thinking present - inject only for thinking-capable models
+        if not _is_thinking_capable_model(model):
+            logger.debug(f"Skipping thinking injection for model: {model}")
+            return data
+
+        if max_tokens is not None and max_tokens <= API_MIN_BUDGET_TOKENS:
+            logger.debug(f"Skipping thinking injection: max_tokens={max_tokens} too low")
+            return data
+
+        budget = budget_default
+        if max_tokens is not None and budget >= max_tokens:
+            budget = max_tokens - 1
+        data["thinking"] = {"type": "enabled", "budget_tokens": budget}
+        modified = True
+        reason = f"injected thinking with budget_tokens={budget}"
+
+        # Thinking requires temperature=1; test against None so an explicit temperature=0 is not skipped
+        current_temp = body.get("temperature") if data.get("temperature") is None else data["temperature"]
+        if current_temp is not None and current_temp != 1:
+            data["temperature"] = 1
+            reason += "; set temperature=1"
+
+    if modified and log_modifications:
+        logger.info(
+            f"Adjusted thinking budget for {model}: {reason}",
+            extra={"event": "thinking_budget_modified", "model": model, "reason": reason},
+        )
+
+    return data