diff --git a/examples/01-single-agent.ts b/examples/01-single-agent.ts
index b7ac765..af20d90 100644
--- a/examples/01-single-agent.ts
+++ b/examples/01-single-agent.ts
@@ -114,6 +114,8 @@ const conversationAgent = new Agent(
     model: 'claude-sonnet-4-6',
     systemPrompt: 'You are a TypeScript tutor. Give short, direct answers.',
     maxTurns: 2,
+    // Keep only the most recent turn in long prompt() conversations.
+    contextStrategy: { type: 'sliding-window', maxTurns: 1 },
   },
   new ToolRegistry(), // no tools needed for this conversation
   new ToolExecutor(new ToolRegistry()),
diff --git a/src/agent/agent.ts b/src/agent/agent.ts
index 8c1007c..7270e93 100644
--- a/src/agent/agent.ts
+++ b/src/agent/agent.ts
@@ -153,6 +153,7 @@ export class Agent {
       agentRole: this.config.systemPrompt?.slice(0, 50) ?? 'assistant',
       loopDetection: this.config.loopDetection,
       maxTokenBudget: this.config.maxTokenBudget,
+      contextStrategy: this.config.contextStrategy,
     }
 
     this.runner = new AgentRunner(
diff --git a/src/agent/runner.ts b/src/agent/runner.ts
index 81155e8..2b8fbce 100644
--- a/src/agent/runner.ts
+++ b/src/agent/runner.ts
@@ -29,10 +29,12 @@ import type {
   LoopDetectionConfig,
   LoopDetectionInfo,
   LLMToolDef,
+  ContextStrategy,
 } from '../types.js'
 import { TokenBudgetExceededError } from '../errors.js'
 import { LoopDetector } from './loop-detector.js'
 import { emitTrace } from '../utils/trace.js'
+import { estimateTokens } from '../utils/tokens.js'
 import type { ToolRegistry } from '../tool/framework.js'
 import type { ToolExecutor } from '../tool/executor.js'
 
@@ -94,6 +96,8 @@ export interface RunnerOptions {
   readonly loopDetection?: LoopDetectionConfig
   /** Maximum cumulative tokens (input + output) allowed for this run. */
   readonly maxTokenBudget?: number
+  /** Optional context compression strategy for long multi-turn runs. */
+  readonly contextStrategy?: ContextStrategy
 }
 
 /**
@@ -172,6 +176,31 @@ function addTokenUsage(a: TokenUsage, b: TokenUsage): TokenUsage {
 
 const ZERO_USAGE: TokenUsage = { input_tokens: 0, output_tokens: 0 }
 
+/**
+ * Prepends synthetic framing text to the first user message so we never emit
+ * consecutive `user` turns (Bedrock) and summaries do not concatenate onto
+ * the original user prompt (direct API). If there is no user message yet,
+ * inserts a single assistant text preamble.
+ */
+function prependSyntheticPrefixToFirstUser(
+  messages: LLMMessage[],
+  prefix: string,
+): LLMMessage[] {
+  const userIdx = messages.findIndex(m => m.role === 'user')
+  if (userIdx < 0) {
+    return [{
+      role: 'assistant',
+      content: [{ type: 'text', text: prefix.trimEnd() }],
+    }, ...messages]
+  }
+  const target = messages[userIdx]!
+  const merged: LLMMessage = {
+    role: 'user',
+    content: [{ type: 'text', text: prefix }, ...target.content],
+  }
+  return [...messages.slice(0, userIdx), merged, ...messages.slice(userIdx + 1)]
+}
+
 // ---------------------------------------------------------------------------
 // AgentRunner
 // ---------------------------------------------------------------------------
@@ -191,6 +220,10 @@ const ZERO_USAGE: TokenUsage = { input_tokens: 0, output_tokens: 0 }
  */
 export class AgentRunner {
   private readonly maxTurns: number
+  private summarizeCache: {
+    oldSignature: string
+    summaryPrefix: string
+  } | null = null
 
   constructor(
     private readonly adapter: LLMAdapter,
@@ -201,6 +234,172 @@ export class AgentRunner {
     this.maxTurns = options.maxTurns ?? 10
   }
 
+  private serializeMessage(message: LLMMessage): string {
+    return JSON.stringify(message)
+  }
+
+  private truncateToSlidingWindow(messages: LLMMessage[], maxTurns: number): LLMMessage[] {
+    if (maxTurns <= 0) {
+      return messages
+    }
+
+    const firstUserIndex = messages.findIndex(m => m.role === 'user')
+    const firstUser = firstUserIndex >= 0 ? messages[firstUserIndex]!
+      : null
+    const afterFirst = firstUserIndex >= 0
+      ? messages.slice(firstUserIndex + 1)
+      : messages.slice()
+
+    if (afterFirst.length <= maxTurns * 2) {
+      return messages
+    }
+
+    const kept = afterFirst.slice(-maxTurns * 2)
+    const result: LLMMessage[] = []
+
+    if (firstUser !== null) {
+      result.push(firstUser)
+    }
+
+    const droppedPairs = Math.floor((afterFirst.length - kept.length) / 2)
+    if (droppedPairs > 0) {
+      const notice =
+        `[Earlier conversation history truncated — ${droppedPairs} turn(s) removed]\n\n`
+      result.push(...prependSyntheticPrefixToFirstUser(kept, notice))
+      return result
+    }
+
+    result.push(...kept)
+    return result
+  }
+
+  private async summarizeMessages(
+    messages: LLMMessage[],
+    maxTokens: number,
+    summaryModel: string | undefined,
+    baseChatOptions: LLMChatOptions,
+    turns: number,
+    options: RunOptions,
+  ): Promise<{ messages: LLMMessage[]; usage: TokenUsage }> {
+    const estimated = estimateTokens(messages)
+    if (estimated <= maxTokens || messages.length < 4) {
+      return { messages, usage: ZERO_USAGE }
+    }
+
+    const firstUserIndex = messages.findIndex(m => m.role === 'user')
+    if (firstUserIndex < 0 || firstUserIndex === messages.length - 1) {
+      return { messages, usage: ZERO_USAGE }
+    }
+
+    const firstUser = messages[firstUserIndex]!
+    const rest = messages.slice(firstUserIndex + 1)
+    if (rest.length < 2) {
+      return { messages, usage: ZERO_USAGE }
+    }
+
+    // Split on an even boundary so we never separate a tool_use assistant turn
+    // from its tool_result user message (rest is user/assistant pairs).
+    const splitAt = Math.max(2, Math.floor(rest.length / 4) * 2)
+    const oldPortion = rest.slice(0, splitAt)
+    const recentPortion = rest.slice(splitAt)
+
+    const oldSignature = oldPortion.map(m => this.serializeMessage(m)).join('\n')
+    if (this.summarizeCache !== null && this.summarizeCache.oldSignature === oldSignature) {
+      const mergedRecent = prependSyntheticPrefixToFirstUser(
+        recentPortion,
+        `${this.summarizeCache.summaryPrefix}\n\n`,
+      )
+      return { messages: [firstUser, ...mergedRecent], usage: ZERO_USAGE }
+    }
+
+    const summaryPrompt = [
+      'Summarize the following conversation history for an LLM.',
+      '- Preserve user goals, constraints, and decisions.',
+      '- Keep key tool outputs and unresolved questions.',
+      '- Use concise bullets.',
+      '- Do not fabricate details.',
+    ].join('\n')
+
+    const summaryInput: LLMMessage[] = [
+      {
+        role: 'user',
+        content: [
+          { type: 'text', text: summaryPrompt },
+          { type: 'text', text: `\n\nConversation:\n${oldSignature}` },
+        ],
+      },
+    ]
+
+    const summaryOptions: LLMChatOptions = {
+      ...baseChatOptions,
+      model: summaryModel ?? this.options.model,
+      tools: undefined,
+    }
+
+    const summaryStartMs = Date.now()
+    const summaryResponse = await this.adapter.chat(summaryInput, summaryOptions)
+    if (options.onTrace) {
+      const summaryEndMs = Date.now()
+      emitTrace(options.onTrace, {
+        type: 'llm_call',
+        runId: options.runId ?? '',
+        taskId: options.taskId,
+        agent: options.traceAgent ?? this.options.agentName ?? 'unknown',
+        model: summaryOptions.model,
+        phase: 'summary',
+        turn: turns,
+        tokens: summaryResponse.usage,
+        startMs: summaryStartMs,
+        endMs: summaryEndMs,
+        durationMs: summaryEndMs - summaryStartMs,
+      })
+    }
+
+    const summaryText = extractText(summaryResponse.content).trim()
+    const summaryPrefix = summaryText.length > 0
+      ? `[Conversation summary]\n${summaryText}`
+      : '[Conversation summary unavailable]'
+
+    this.summarizeCache = { oldSignature, summaryPrefix }
+    const mergedRecent = prependSyntheticPrefixToFirstUser(
+      recentPortion,
+      `${summaryPrefix}\n\n`,
+    )
+    return {
+      messages: [firstUser, ...mergedRecent],
+      usage: summaryResponse.usage,
+    }
+  }
+
+  private async applyContextStrategy(
+    messages: LLMMessage[],
+    strategy: ContextStrategy,
+    baseChatOptions: LLMChatOptions,
+    turns: number,
+    options: RunOptions,
+  ): Promise<{ messages: LLMMessage[]; usage: TokenUsage }> {
+    if (strategy.type === 'sliding-window') {
+      return { messages: this.truncateToSlidingWindow(messages, strategy.maxTurns), usage: ZERO_USAGE }
+    }
+
+    if (strategy.type === 'summarize') {
+      return this.summarizeMessages(
+        messages,
+        strategy.maxTokens,
+        strategy.summaryModel,
+        baseChatOptions,
+        turns,
+        options,
+      )
+    }
+
+    const estimated = estimateTokens(messages)
+    const compressed = await strategy.compress(messages, estimated)
+    if (!Array.isArray(compressed) || compressed.length === 0) {
+      throw new Error('contextStrategy.custom.compress must return a non-empty LLMMessage[]')
+    }
+    return { messages: compressed, usage: ZERO_USAGE }
+  }
+
   // -------------------------------------------------------------------------
   // Tool resolution
   // -------------------------------------------------------------------------
@@ -313,7 +512,7 @@
     options: RunOptions = {},
   ): AsyncGenerator {
     // Working copy of the conversation — mutated as turns progress.
-    const conversationMessages: LLMMessage[] = [...initialMessages]
+    let conversationMessages: LLMMessage[] = [...initialMessages]
 
     // Accumulated state across all turns.
     let totalUsage: TokenUsage = ZERO_USAGE
@@ -363,6 +562,19 @@
 
       turns++
 
+      // Optionally compact context before each LLM call after the first turn.
+      if (this.options.contextStrategy && turns > 1) {
+        const compacted = await this.applyContextStrategy(
+          conversationMessages,
+          this.options.contextStrategy,
+          baseChatOptions,
+          turns,
+          options,
+        )
+        conversationMessages = compacted.messages
+        totalUsage = addTokenUsage(totalUsage, compacted.usage)
+      }
+
       // ------------------------------------------------------------------
       // Step 1: Call the LLM and collect the full response for this turn.
       // ------------------------------------------------------------------
@@ -376,6 +588,7 @@
           taskId: options.taskId,
           agent: options.traceAgent ?? this.options.agentName ?? 'unknown',
           model: this.options.model,
+          phase: 'turn',
           turn: turns,
           tokens: response.usage,
           startMs: llmStartMs,
diff --git a/src/index.ts b/src/index.ts
index bc28b96..0d1b16d 100644
--- a/src/index.ts
+++ b/src/index.ts
@@ -153,6 +153,7 @@ export type {
   ToolCallRecord,
   LoopDetectionConfig,
   LoopDetectionInfo,
+  ContextStrategy,
 
   // Team
   TeamConfig,
diff --git a/src/types.ts b/src/types.ts
index 98f0397..2864400 100644
--- a/src/types.ts
+++ b/src/types.ts
@@ -65,6 +65,18 @@ export interface LLMMessage {
   readonly content: ContentBlock[]
 }
 
+/** Context management strategy for long-running agent conversations. */
+export type ContextStrategy =
+  | { type: 'sliding-window'; maxTurns: number }
+  | { type: 'summarize'; maxTokens: number; summaryModel?: string }
+  | {
+      type: 'custom'
+      compress: (
+        messages: LLMMessage[],
+        estimatedTokens: number,
+      ) => Promise<LLMMessage[]> | LLMMessage[]
+    }
+
 /** Token accounting for a single API call. */
 export interface TokenUsage {
   readonly input_tokens: number
@@ -215,6 +227,8 @@ export interface AgentConfig {
   readonly maxTokens?: number
   /** Maximum cumulative tokens (input + output) allowed for this run. */
   readonly maxTokenBudget?: number
+  /** Optional context compression policy to control input growth across turns. */
+  readonly contextStrategy?: ContextStrategy
   readonly temperature?: number
   /**
    * Maximum wall-clock time (in milliseconds) for the entire agent run.
@@ -487,6 +501,8 @@ export interface TraceEventBase {
 export interface LLMCallTrace extends TraceEventBase {
   readonly type: 'llm_call'
   readonly model: string
+  /** Distinguishes normal turn calls from context-summary calls. */
+  readonly phase?: 'turn' | 'summary'
   readonly turn: number
   readonly tokens: TokenUsage
 }
diff --git a/src/utils/tokens.ts b/src/utils/tokens.ts
new file mode 100644
index 0000000..1dc57cf
--- /dev/null
+++ b/src/utils/tokens.ts
@@ -0,0 +1,27 @@
+import type { LLMMessage } from '../types.js'
+
+/**
+ * Estimate token count using a lightweight character heuristic.
+ * This intentionally avoids model-specific tokenizer dependencies.
+ */
+export function estimateTokens(messages: LLMMessage[]): number {
+  let chars = 0
+
+  for (const message of messages) {
+    for (const block of message.content) {
+      if (block.type === 'text') {
+        chars += block.text.length
+      } else if (block.type === 'tool_result') {
+        chars += block.content.length
+      } else if (block.type === 'tool_use') {
+        chars += JSON.stringify(block.input).length
+      } else if (block.type === 'image') {
+        // Account for non-text payloads with a small fixed cost.
+        chars += 64
+      }
+    }
+  }
+
+  // Conservative English heuristic: ~4 chars per token.
+  return Math.ceil(chars / 4)
+}
diff --git a/tests/context-strategy.test.ts b/tests/context-strategy.test.ts
new file mode 100644
index 0000000..7c847b0
--- /dev/null
+++ b/tests/context-strategy.test.ts
@@ -0,0 +1,202 @@
+import { describe, it, expect, vi } from 'vitest'
+import { z } from 'zod'
+import { AgentRunner } from '../src/agent/runner.js'
+import { ToolRegistry, defineTool } from '../src/tool/framework.js'
+import { ToolExecutor } from '../src/tool/executor.js'
+import type { LLMAdapter, LLMChatOptions, LLMMessage, LLMResponse, TraceEvent } from '../src/types.js'
+
+function textResponse(text: string): LLMResponse {
+  return {
+    id: `resp-${Math.random().toString(36).slice(2)}`,
+    content: [{ type: 'text', text }],
+    model: 'mock-model',
+    stop_reason: 'end_turn',
+    usage: { input_tokens: 10, output_tokens: 20 },
+  }
+}
+
+function toolUseResponse(toolName: string, input: Record<string, unknown>): LLMResponse {
+  return {
+    id: `resp-${Math.random().toString(36).slice(2)}`,
+    content: [{
+      type: 'tool_use',
+      id: `tu-${Math.random().toString(36).slice(2)}`,
+      name: toolName,
+      input,
+    }],
+    model: 'mock-model',
+    stop_reason: 'tool_use',
+    usage: { input_tokens: 15, output_tokens: 25 },
+  }
+}
+
+function buildRegistryAndExecutor(): { registry: ToolRegistry; executor: ToolExecutor } {
+  const registry = new ToolRegistry()
+  registry.register(
+    defineTool({
+      name: 'echo',
+      description: 'Echo input',
+      inputSchema: z.object({ message: z.string() }),
+      async execute({ message }) {
+        return { data: message }
+      },
+    }),
+  )
+  return { registry, executor: new ToolExecutor(registry) }
+}
+
+describe('AgentRunner contextStrategy', () => {
+  it('keeps baseline behavior when contextStrategy is not set', async () => {
+    const calls: LLMMessage[][] = []
+    const adapter: LLMAdapter = {
+      name: 'mock',
+      async chat(messages) {
+        calls.push(messages.map(m => ({ role: m.role, content: m.content })))
+        return calls.length === 1
+          ? toolUseResponse('echo', { message: 'hello' })
+          : textResponse('done')
+      },
+      async *stream() {
+        /* unused */
+      },
+    }
+    const { registry, executor } = buildRegistryAndExecutor()
+    const runner = new AgentRunner(adapter, registry, executor, {
+      model: 'mock-model',
+      allowedTools: ['echo'],
+      maxTurns: 4,
+    })
+
+    await runner.run([{ role: 'user', content: [{ type: 'text', text: 'start' }] }])
+    expect(calls).toHaveLength(2)
+    expect(calls[0]).toHaveLength(1)
+    expect(calls[1]!.length).toBeGreaterThan(calls[0]!.length)
+  })
+
+  it('sliding-window truncates old turns and preserves the first user message', async () => {
+    const calls: LLMMessage[][] = []
+    const responses = [
+      toolUseResponse('echo', { message: 't1' }),
+      toolUseResponse('echo', { message: 't2' }),
+      toolUseResponse('echo', { message: 't3' }),
+      textResponse('done'),
+    ]
+    let idx = 0
+    const adapter: LLMAdapter = {
+      name: 'mock',
+      async chat(messages) {
+        calls.push(messages.map(m => ({ role: m.role, content: m.content })))
+        return responses[idx++]!
+      },
+      async *stream() {
+        /* unused */
+      },
+    }
+    const { registry, executor } = buildRegistryAndExecutor()
+    const runner = new AgentRunner(adapter, registry, executor, {
+      model: 'mock-model',
+      allowedTools: ['echo'],
+      maxTurns: 8,
+      contextStrategy: { type: 'sliding-window', maxTurns: 1 },
+    })
+
+    await runner.run([{ role: 'user', content: [{ type: 'text', text: 'original prompt' }] }])
+
+    const laterCall = calls[calls.length - 1]!
+    const firstUserText = laterCall[0]!.content[0]
+    expect(firstUserText).toMatchObject({ type: 'text', text: 'original prompt' })
+    const flattenedText = laterCall.flatMap(m => m.content.filter(c => c.type === 'text'))
+    expect(flattenedText.some(c => c.type === 'text' && c.text.includes('truncated'))).toBe(true)
+  })
+
+  it('summarize strategy replaces old context and emits summary trace call', async () => {
+    const calls: Array<{ messages: LLMMessage[]; options: LLMChatOptions }> = []
+    const traces: TraceEvent[] = []
+    const responses = [
+      toolUseResponse('echo', { message: 'first turn payload '.repeat(20) }),
+      toolUseResponse('echo', { message: 'second turn payload '.repeat(20) }),
+      textResponse('This is a concise summary.'),
+      textResponse('final answer'),
+    ]
+    let idx = 0
+    const adapter: LLMAdapter = {
+      name: 'mock',
+      async chat(messages, options) {
+        calls.push({ messages: messages.map(m => ({ role: m.role, content: m.content })), options })
+        return responses[idx++]!
+      },
+      async *stream() {
+        /* unused */
+      },
+    }
+    const { registry, executor } = buildRegistryAndExecutor()
+    const runner = new AgentRunner(adapter, registry, executor, {
+      model: 'mock-model',
+      allowedTools: ['echo'],
+      maxTurns: 8,
+      contextStrategy: { type: 'summarize', maxTokens: 20 },
+    })
+
+    const result = await runner.run(
+      [{ role: 'user', content: [{ type: 'text', text: 'start' }] }],
+      { onTrace: (e) => { traces.push(e) }, runId: 'run-summary', traceAgent: 'context-agent' },
+    )
+
+    const summaryCall = calls.find(c => c.messages.length === 1 && c.options.tools === undefined)
+    expect(summaryCall).toBeDefined()
+    const llmTraces = traces.filter(t => t.type === 'llm_call')
+    expect(llmTraces.some(t => t.type === 'llm_call' && t.phase === 'summary')).toBe(true)
+
+    // Summary adapter usage must count toward RunResult.tokenUsage (maxTokenBudget).
+    expect(result.tokenUsage.input_tokens).toBe(15 + 15 + 10 + 10)
+    expect(result.tokenUsage.output_tokens).toBe(25 + 25 + 20 + 20)
+
+    // After compaction, summary text is folded into the next user turn (not a
+    // standalone user message), preserving user/assistant alternation.
+    const turnAfterSummary = calls.find(
+      c => c.messages.some(
+        m => m.role === 'user' && m.content.some(
+          b => b.type === 'text' && b.text.includes('[Conversation summary]'),
+        ),
+      ),
+    )
+    expect(turnAfterSummary).toBeDefined()
+    const rolesAfterFirstUser = turnAfterSummary!.messages.map(m => m.role).join(',')
+    expect(rolesAfterFirstUser).not.toMatch(/^user,user/)
+  })
+
+  it('custom strategy calls compress callback and uses returned messages', async () => {
+    const compress = vi.fn((messages: LLMMessage[]) => messages.slice(-1))
+    const calls: LLMMessage[][] = []
+    const responses = [
+      toolUseResponse('echo', { message: 'hello' }),
+      textResponse('done'),
+    ]
+    let idx = 0
+    const adapter: LLMAdapter = {
+      name: 'mock',
+      async chat(messages) {
+        calls.push(messages.map(m => ({ role: m.role, content: m.content })))
+        return responses[idx++]!
+      },
+      async *stream() {
+        /* unused */
+      },
+    }
+    const { registry, executor } = buildRegistryAndExecutor()
+    const runner = new AgentRunner(adapter, registry, executor, {
+      model: 'mock-model',
+      allowedTools: ['echo'],
+      maxTurns: 4,
+      contextStrategy: {
+        type: 'custom',
+        compress,
+      },
+    })
+
+    await runner.run([{ role: 'user', content: [{ type: 'text', text: 'custom prompt' }] }])
+
+    expect(compress).toHaveBeenCalledOnce()
+    expect(calls[1]).toHaveLength(1)
+  })
+})