diff --git a/pkg/runtime/fallback.go b/pkg/runtime/fallback.go index 329824f38..068a478ef 100644 --- a/pkg/runtime/fallback.go +++ b/pkg/runtime/fallback.go @@ -40,6 +40,79 @@ const ( DefaultFallbackCooldown = 1 * time.Minute ) +// ContextOverflowError wraps an underlying error to indicate that the failure +// was caused by the conversation context exceeding the model's context window. +// This is used to trigger auto-compaction in the runtime loop instead of +// surfacing raw HTTP errors to the user. +type ContextOverflowError struct { + Underlying error +} + +func (e *ContextOverflowError) Error() string { + return fmt.Sprintf("context window overflow: %s", e.Underlying.Error()) +} + +func (e *ContextOverflowError) Unwrap() error { + return e.Underlying +} + +// contextOverflowPatterns contains error message substrings that indicate the +// prompt/context exceeds the model's context window. These patterns are checked +// case-insensitively against error messages from various providers. +var contextOverflowPatterns = []string{ + "prompt is too long", + "maximum context length", + "context length exceeded", + "context_length_exceeded", + "max_tokens must be greater than", + "maximum number of tokens", + "content length exceeds", + "request too large", + "payload too large", + "input is too long", + "exceeds the model's max token", + "token limit", + "reduce your prompt", + "reduce the length", +} + +// isContextOverflowError checks whether the error indicates the conversation +// context has exceeded the model's context window. It inspects both structured +// SDK error types and raw error message patterns. 
+// +// Recognised patterns include: +// - Anthropic 400 "prompt is too long: N tokens > M maximum" +// - Anthropic 400 "max_tokens must be greater than thinking.budget_tokens" +// (emitted when the prompt is so large that max_tokens can't accommodate +// the thinking budget — a proxy for context overflow) +// - OpenAI 400 "maximum context length" / "context_length_exceeded" +// - Anthropic 500 that is actually a context overflow (heuristic: the error +// message is opaque but the conversation was already near the limit) +// +// This function intentionally does NOT match generic 500 errors; callers +// that want to treat an opaque 500 as overflow must check separately with +// additional context (e.g., session token counts). +func isContextOverflowError(err error) bool { + if err == nil { + return false + } + + // Already wrapped + var ctxErr *ContextOverflowError + if errors.As(err, &ctxErr) { + return true + } + + errMsg := strings.ToLower(err.Error()) + for _, pattern := range contextOverflowPatterns { + if strings.Contains(errMsg, pattern) { + return true + } + } + + return false +} + // fallbackCooldownState tracks when we should stick with a fallback model // instead of retrying the primary after a non-retryable error (e.g., 429). type fallbackCooldownState struct { @@ -144,6 +217,14 @@ func isRetryableModelError(err error) bool { return false } + // Context overflow errors are never retryable — the context hasn't changed + // between attempts, so retrying the same oversized payload will always fail. + // This avoids wasting time on 3 attempts + exponential backoff. 
+ if isContextOverflowError(err) { + slog.Debug("Context overflow error, not retryable", "error", err) + return false + } + // First, try to extract HTTP status code from known SDK error types if statusCode := extractHTTPStatusCode(err); statusCode != 0 { retryable := isRetryableStatusCode(statusCode) @@ -587,9 +668,15 @@ func (r *LocalRuntime) tryModelWithFallback( } } - // All models and retries exhausted + // All models and retries exhausted. + // If the last error (or any error in the chain) was a context overflow, + // wrap it in a ContextOverflowError so the caller can auto-compact. if lastErr != nil { - return streamResult{}, nil, fmt.Errorf("all models failed: %w", lastErr) + wrapped := fmt.Errorf("all models failed: %w", lastErr) + if isContextOverflowError(lastErr) { + return streamResult{}, nil, &ContextOverflowError{Underlying: wrapped} + } + return streamResult{}, nil, wrapped } return streamResult{}, nil, errors.New("all models failed with unknown error") } diff --git a/pkg/runtime/fallback_test.go b/pkg/runtime/fallback_test.go index fba4240a7..7a5a6c30b 100644 --- a/pkg/runtime/fallback_test.go +++ b/pkg/runtime/fallback_test.go @@ -201,6 +201,21 @@ func TestIsRetryableModelError(t *testing.T) { err: errors.New("upstream connect error"), expected: true, }, + { + name: "context overflow - prompt too long", + err: errors.New("prompt is too long: 226360 tokens > 200000 maximum"), + expected: false, // Context overflow should not be retried + }, + { + name: "context overflow - thinking budget", + err: errors.New("max_tokens must be greater than thinking.budget_tokens"), + expected: false, // Context overflow should not be retried + }, + { + name: "context overflow - wrapped", + err: &ContextOverflowError{Underlying: errors.New("test")}, + expected: false, // Context overflow should not be retried + }, { name: "unknown error", err: errors.New("something weird happened"), @@ -904,6 +919,113 @@ func TestFallbackModelsClonedWithThinkingEnabled(t *testing.T) 
{ }) } +func TestIsContextOverflowError(t *testing.T) { + t.Parallel() + + tests := []struct { + name string + err error + expected bool + }{ + {name: "nil error", err: nil, expected: false}, + {name: "generic error", err: errors.New("something went wrong"), expected: false}, + {name: "anthropic prompt too long", err: errors.New(`prompt is too long: 226360 tokens > 200000 maximum`), expected: true}, + {name: "openai context length exceeded", err: errors.New(`maximum context length is 128000 tokens`), expected: true}, + {name: "context_length_exceeded code", err: errors.New(`error code: context_length_exceeded`), expected: true}, + {name: "thinking budget error", err: errors.New(`max_tokens must be greater than thinking.budget_tokens`), expected: true}, + {name: "request too large", err: errors.New(`request too large for model`), expected: true}, + {name: "input is too long", err: errors.New(`input is too long`), expected: true}, + {name: "reduce your prompt", err: errors.New(`please reduce your prompt`), expected: true}, + {name: "reduce the length", err: errors.New(`please reduce the length of the messages`), expected: true}, + {name: "token limit", err: errors.New(`token limit exceeded`), expected: true}, + {name: "wrapped ContextOverflowError", err: &ContextOverflowError{Underlying: errors.New("test")}, expected: true}, + {name: "errors.As wrapped", err: fmt.Errorf("all models failed: %w", &ContextOverflowError{Underlying: errors.New("test")}), expected: true}, + {name: "500 internal server error (not overflow)", err: errors.New(`500 Internal Server Error`), expected: false}, + {name: "429 rate limit (not overflow)", err: errors.New(`429 too many requests`), expected: false}, + {name: "network timeout (not overflow)", err: errors.New(`connection timeout`), expected: false}, + } + + for _, tt := range tests { + t.Run(tt.name, func(t *testing.T) { + t.Parallel() + result := isContextOverflowError(tt.err) + assert.Equal(t, tt.expected, result, 
"isContextOverflowError(%v)", tt.err) + }) + } +} + +func TestContextOverflowError(t *testing.T) { + t.Parallel() + + t.Run("wraps underlying error", func(t *testing.T) { + t.Parallel() + underlying := errors.New("prompt is too long: 226360 tokens > 200000 maximum") + ctxErr := &ContextOverflowError{Underlying: underlying} + + assert.Contains(t, ctxErr.Error(), "context window overflow") + assert.Contains(t, ctxErr.Error(), "prompt is too long") + assert.ErrorIs(t, ctxErr, underlying) + }) + + t.Run("errors.As works", func(t *testing.T) { + t.Parallel() + underlying := errors.New("test error") + wrapped := fmt.Errorf("all models failed: %w", &ContextOverflowError{Underlying: underlying}) + + var ctxErr *ContextOverflowError + assert.ErrorAs(t, wrapped, &ctxErr) + }) +} + +func TestIsRetryableModelError_ContextOverflow(t *testing.T) { + t.Parallel() + + // Context overflow errors should NOT be retryable — the context hasn't changed, + // so retrying the same oversized payload will always fail. 
+ tests := []struct { + name string + err error + }{ + {name: "prompt too long", err: errors.New(`prompt is too long: 226360 tokens > 200000 maximum`)}, + {name: "thinking budget cascade", err: errors.New(`max_tokens must be greater than thinking.budget_tokens`)}, + {name: "context length exceeded", err: errors.New(`maximum context length is 128000 tokens`)}, + {name: "wrapped ContextOverflowError", err: &ContextOverflowError{Underlying: errors.New("test")}}, + } + + for _, tt := range tests { + t.Run(tt.name, func(t *testing.T) { + t.Parallel() + assert.False(t, isRetryableModelError(tt.err), + "context overflow errors should not be retryable: %v", tt.err) + }) + } +} + +func TestFormatModelError(t *testing.T) { + t.Parallel() + + t.Run("nil error", func(t *testing.T) { + t.Parallel() + assert.Empty(t, formatModelError(nil)) + }) + + t.Run("context overflow shows user-friendly message", func(t *testing.T) { + t.Parallel() + err := &ContextOverflowError{Underlying: errors.New("prompt is too long")} + msg := formatModelError(err) + assert.Contains(t, msg, "context window") + assert.Contains(t, msg, "/compact") + assert.NotContains(t, msg, "prompt is too long") + }) + + t.Run("generic error preserves message", func(t *testing.T) { + t.Parallel() + err := errors.New("authentication failed") + msg := formatModelError(err) + assert.Equal(t, "authentication failed", msg) + }) +} + // Verify interface compliance var ( _ provider.Provider = (*mockProvider)(nil) diff --git a/pkg/runtime/runtime.go b/pkg/runtime/runtime.go index ad7601701..951a27d79 100644 --- a/pkg/runtime/runtime.go +++ b/pkg/runtime/runtime.go @@ -1169,12 +1169,38 @@ func (r *LocalRuntime) RunStream(ctx context.Context, sess *session.Session) <-c streamSpan.End() return } + + // Auto-recovery: if the error is a context overflow and + // session compaction is enabled, compact the conversation + // and retry the request instead of surfacing raw errors. 
+ var ctxOverflow *ContextOverflowError + if errors.As(err, &ctxOverflow) && r.sessionCompaction { + slog.Warn("Context window overflow detected, attempting auto-compaction", + "agent", a.Name(), + "session_id", sess.ID, + "input_tokens", sess.InputTokens, + "output_tokens", sess.OutputTokens, + "context_limit", contextLimit, + ) + events <- Warning( + "The conversation has exceeded the model's context window. Automatically compacting the conversation history...", + r.CurrentAgentName(), + ) + r.Summarize(ctx, sess, "", events) + + // After compaction, loop back to retry with the + // compacted context. The next iteration will re-fetch + // messages from the (now compacted) session. + streamSpan.End() + continue + } + streamSpan.RecordError(err) streamSpan.SetStatus(codes.Error, "error handling stream") slog.Error("All models failed", "agent", a.Name(), "error", err) // Track error in telemetry telemetry.RecordError(ctx, err.Error()) - events <- Error(err.Error()) + events <- Error(formatModelError(err)) streamSpan.End() return } @@ -1258,12 +1284,43 @@ func (r *LocalRuntime) RunStream(ctx context.Context, sess *session.Session) <-c usage.LastMessage = msgUsage events <- NewTokenUsageEvent(sess.ID, r.CurrentAgentName(), usage) + // Record the message count before tool calls so we can + // measure how much content was added by tool results. + messageCountBeforeTools := len(sess.GetAllMessages()) + r.processToolCalls(ctx, sess, res.Calls, agentTools, events) if res.Stopped { slog.Debug("Conversation stopped", "agent", a.Name()) break } + + // Root-cause fix for stale token counts (issue #1750): + // After tool calls, sess.InputTokens still reflects the + // *previous* API response and doesn't account for the + // (potentially large) tool results just added. Estimate + // the additional tokens and compact proactively to prevent + // the oversized request from ever being sent. 
+ if m != nil && r.sessionCompaction && contextLimit > 0 { + newMessages := sess.GetAllMessages()[messageCountBeforeTools:] + var addedTokens int64 + for _, msg := range newMessages { + addedTokens += estimateMessageTokens(&msg.Message) + } + + estimatedTotal := sess.InputTokens + sess.OutputTokens + addedTokens + if estimatedTotal > int64(float64(contextLimit)*0.9) { + slog.Info("Proactive compaction: tool results pushed estimated context past 90% threshold", + "agent", a.Name(), + "input_tokens", sess.InputTokens, + "output_tokens", sess.OutputTokens, + "added_estimated_tokens", addedTokens, + "estimated_total", estimatedTotal, + "context_limit", contextLimit, + ) + r.Summarize(ctx, sess, "", events) + } + } } }() @@ -2389,3 +2446,65 @@ func stripImageContent(messages []chat.Message) []chat.Message { } return result } + +// charsPerToken is the average number of characters per token used for +// estimation. A value of 4 is a widely-used heuristic for English text; +// it slightly overestimates token counts for code/JSON (which is ~3.5), +// making compaction trigger earlier — the safe direction. +const charsPerToken = 4 + +// estimateMessageTokens returns a rough token-count estimate for a single +// chat message based on its text length. This is intentionally conservative +// (overestimates) so that proactive compaction fires before we hit the limit. +// The estimate includes the message content, multi-content text parts, and +// a small overhead per message for role/metadata tokens. +func estimateMessageTokens(msg *chat.Message) int64 { + var chars int + + // Primary text content. + chars += len(msg.Content) + + // Multi-content parts (e.g., tool results with image descriptions). + for _, part := range msg.MultiContent { + chars += len(part.Text) + } + + // Reasoning / thinking content. + chars += len(msg.ReasoningContent) + + // Tool call arguments (they count toward input tokens on the next turn). 
+ for _, tc := range msg.ToolCalls { + chars += len(tc.Function.Arguments) + chars += len(tc.Function.Name) + } + + // Per-message overhead: role, ToolCallID, delimiters, etc. + // Models typically use 3-7 tokens for message framing. + const perMessageOverhead = 5 + + if chars == 0 { + return perMessageOverhead + } + + return int64(chars/charsPerToken) + perMessageOverhead +} + +// formatModelError produces a user-friendly error message from a model error. +// Raw HTTP errors with request IDs, JSON payloads, and API URLs are replaced +// with actionable guidance. Context overflow errors receive a dedicated +// message; other errors are cleaned up to remove noise while preserving the +// essential failure reason. +func formatModelError(err error) string { + if err == nil { + return "" + } + + // Context overflow gets a dedicated, actionable message. + var ctxOverflow *ContextOverflowError + if errors.As(err, &ctxOverflow) { + return "The conversation has exceeded the model's context window and automatic compaction is not enabled. " + + "Try running /compact to reduce the conversation size, or start a new session." 
+ } + + return err.Error() +} diff --git a/pkg/runtime/runtime_test.go b/pkg/runtime/runtime_test.go index a8ae82d09..f3c519c3c 100644 --- a/pkg/runtime/runtime_test.go +++ b/pkg/runtime/runtime_test.go @@ -1823,3 +1823,88 @@ func TestStripImageContent(t *testing.T) { }) } } + +func TestEstimateMessageTokens(t *testing.T) { + t.Parallel() + + tests := []struct { + name string + msg chat.Message + expected int64 + }{ + { + name: "empty message returns overhead only", + msg: chat.Message{}, + expected: 5, // perMessageOverhead + }, + { + name: "text-only message", + msg: chat.Message{Content: "Hello, world!"}, // 13 chars → 13/4 = 3 + 5 overhead = 8 + expected: 8, + }, + { + name: "multi-content text parts", + msg: chat.Message{ + MultiContent: []chat.MessagePart{ + {Type: chat.MessagePartTypeText, Text: "first part"}, // 10 chars + {Type: chat.MessagePartTypeText, Text: "second part"}, // 11 chars + }, + }, + // 21 total chars → 21/4 = 5 + 5 overhead = 10 + expected: 10, + }, + { + name: "message with tool calls", + msg: chat.Message{ + ToolCalls: []tools.ToolCall{ + { + Function: tools.FunctionCall{ + Name: "read_file", // 9 chars + Arguments: `{"path":"/tmp/test.txt"}`, // 24 chars + }, + }, + }, + }, + // 33 chars → 33/4 = 8 + 5 overhead = 13 + expected: 13, + }, + { + name: "message with reasoning content", + msg: chat.Message{ + Content: "answer", // 6 chars + ReasoningContent: "Let me think about this carefully step by step", // 47 chars + }, + // 53 chars → 53/4 = 13 + 5 overhead = 18 + expected: 18, + }, + { + name: "combined content types", + msg: chat.Message{ + Content: "result", // 6 chars + ReasoningContent: "thinking", // 8 chars + MultiContent: []chat.MessagePart{{Text: "extra detail"}}, // 12 chars + ToolCalls: []tools.ToolCall{ + {Function: tools.FunctionCall{Name: "cmd", Arguments: `{"x":"y"}`}}, // 3 + 9 = 12 chars + }, + }, + // 38 chars → 38/4 = 9 + 5 overhead = 14 + expected: 14, + }, + { + name: "large tool result", + msg: chat.Message{ + 
Content: string(make([]byte, 40000)), // 40000 null bytes + }, + // 40000/4 = 10000 + 5 overhead = 10005 + expected: 10005, + }, + } + + for _, tt := range tests { + t.Run(tt.name, func(t *testing.T) { + t.Parallel() + got := estimateMessageTokens(&tt.msg) + assert.Equal(t, tt.expected, got, "estimateMessageTokens mismatch") + }) + } +}