diff --git a/config/evolution.yaml b/config/evolution.yaml index dcb325b..89cc4fd 100644 --- a/config/evolution.yaml +++ b/config/evolution.yaml @@ -28,6 +28,17 @@ reflection: effort: "high" max_budget_usd: 0.50 +# LLM Judge Configuration +judges: + # "auto" enables when ANTHROPIC_API_KEY is available + # "always" enables unconditionally + # "never" disables unconditionally + enabled: "auto" + # Safety net against runaway costs (daily reset) + cost_cap_usd_per_day: 50.0 + # Maximum golden suite entries (prune oldest when exceeded) + max_golden_suite_size: 50 + # Directory paths (relative to project root) paths: config_dir: "phantom-config" diff --git a/src/evolution/__tests__/application.test.ts b/src/evolution/__tests__/application.test.ts index b1b33b0..9ac42b9 100644 --- a/src/evolution/__tests__/application.test.ts +++ b/src/evolution/__tests__/application.test.ts @@ -12,6 +12,7 @@ function testConfig(): EvolutionConfig { cadence: { reflection_interval: 1, consolidation_interval: 10, full_review_interval: 50, drift_check_interval: 20 }, gates: { drift_threshold: 0.7, max_file_lines: 200, auto_rollback_threshold: 0.1, auto_rollback_window: 5 }, reflection: { model: "claude-sonnet-4-20250514", effort: "high", max_budget_usd: 0.5 }, + judges: { enabled: "auto", cost_cap_usd_per_day: 50.0, max_golden_suite_size: 50 }, paths: { config_dir: TEST_DIR, constitution: `${TEST_DIR}/constitution.md`, diff --git a/src/evolution/__tests__/constitution.test.ts b/src/evolution/__tests__/constitution.test.ts index 3e5c0a5..1cdfc71 100644 --- a/src/evolution/__tests__/constitution.test.ts +++ b/src/evolution/__tests__/constitution.test.ts @@ -11,6 +11,7 @@ function testConfig(): EvolutionConfig { cadence: { reflection_interval: 1, consolidation_interval: 10, full_review_interval: 50, drift_check_interval: 20 }, gates: { drift_threshold: 0.7, max_file_lines: 200, auto_rollback_threshold: 0.1, auto_rollback_window: 5 }, reflection: { model: "claude-sonnet-4-20250514", effort: 
"high", max_budget_usd: 0.5 }, + judges: { enabled: "auto", cost_cap_usd_per_day: 50.0, max_golden_suite_size: 50 }, paths: { config_dir: TEST_DIR, constitution: `${TEST_DIR}/constitution.md`, diff --git a/src/evolution/__tests__/cost-cap.test.ts b/src/evolution/__tests__/cost-cap.test.ts new file mode 100644 index 0000000..24135db --- /dev/null +++ b/src/evolution/__tests__/cost-cap.test.ts @@ -0,0 +1,199 @@ +import { afterEach, beforeEach, describe, expect, test } from "bun:test"; +import { mkdirSync, readFileSync, rmSync, writeFileSync } from "node:fs"; +import { EvolutionEngine } from "../engine.ts"; +import type { SessionSummary } from "../types.ts"; + +const TEST_DIR = "/tmp/phantom-test-cost-cap"; +const CONFIG_PATH = `${TEST_DIR}/config/evolution.yaml`; + +let savedApiKey: string | undefined; + +function setupTestEnv(costCap: number): void { + mkdirSync(`${TEST_DIR}/config`, { recursive: true }); + mkdirSync(`${TEST_DIR}/phantom-config/meta`, { recursive: true }); + mkdirSync(`${TEST_DIR}/phantom-config/strategies`, { recursive: true }); + mkdirSync(`${TEST_DIR}/phantom-config/memory`, { recursive: true }); + + writeFileSync( + CONFIG_PATH, + [ + "cadence:", + " reflection_interval: 1", + " consolidation_interval: 10", + "gates:", + " drift_threshold: 0.7", + " max_file_lines: 200", + " auto_rollback_threshold: 0.1", + " auto_rollback_window: 5", + "judges:", + ' enabled: "never"', + ` cost_cap_usd_per_day: ${costCap}`, + " max_golden_suite_size: 50", + "paths:", + ` config_dir: "${TEST_DIR}/phantom-config"`, + ` constitution: "${TEST_DIR}/phantom-config/constitution.md"`, + ` version_file: "${TEST_DIR}/phantom-config/meta/version.json"`, + ` metrics_file: "${TEST_DIR}/phantom-config/meta/metrics.json"`, + ` evolution_log: "${TEST_DIR}/phantom-config/meta/evolution-log.jsonl"`, + ` golden_suite: "${TEST_DIR}/phantom-config/meta/golden-suite.jsonl"`, + ` session_log: "${TEST_DIR}/phantom-config/memory/session-log.jsonl"`, + ].join("\n"), + "utf-8", + ); + + 
/**
 * Build a `SessionSummary` fixture for the cost-cap tests.
 *
 * The defaults describe a short, successful CLI session whose only user message is
 * "No, use TypeScript not JavaScript". Any field can be replaced through `overrides`.
 *
 * @param overrides - Fields that replace the defaults. They are spread last, so they always win.
 * @returns A complete session summary.
 */
function makeSession(overrides: Partial<SessionSummary> = {}): SessionSummary {
	return {
		// Timestamp-based id: two fixtures created within the same millisecond share an id.
		session_id: `session-${Date.now()}`,
		session_key: "cli:main",
		user_id: "user-1",
		user_messages: ["No, use TypeScript not JavaScript"],
		assistant_messages: ["Got it."],
		tools_used: [],
		files_tracked: [],
		outcome: "success",
		cost_usd: 0.05,
		started_at: "2026-03-25T10:00:00Z",
		ended_at: "2026-03-25T10:05:00Z",
		...overrides,
	};
}
=> { + savedApiKey = process.env.ANTHROPIC_API_KEY; + }); + + afterEach(() => { + if (savedApiKey !== undefined) { + process.env.ANTHROPIC_API_KEY = savedApiKey; + } else { + process.env.ANTHROPIC_API_KEY = undefined; + } + rmSync(TEST_DIR, { recursive: true, force: true }); + }); + + test("cost cap config is parsed from YAML", () => { + setupTestEnv(10.0); + const engine = new EvolutionEngine(CONFIG_PATH); + const config = engine.getEvolutionConfig(); + expect(config.judges.cost_cap_usd_per_day).toBe(10.0); + }); + + test("cost cap defaults to 50 when not configured", () => { + mkdirSync(`${TEST_DIR}/config`, { recursive: true }); + mkdirSync(`${TEST_DIR}/phantom-config/meta`, { recursive: true }); + mkdirSync(`${TEST_DIR}/phantom-config/strategies`, { recursive: true }); + mkdirSync(`${TEST_DIR}/phantom-config/memory`, { recursive: true }); + + writeFileSync( + CONFIG_PATH, + [ + "paths:", + ` config_dir: "${TEST_DIR}/phantom-config"`, + ` constitution: "${TEST_DIR}/phantom-config/constitution.md"`, + ` version_file: "${TEST_DIR}/phantom-config/meta/version.json"`, + ` metrics_file: "${TEST_DIR}/phantom-config/meta/metrics.json"`, + ` evolution_log: "${TEST_DIR}/phantom-config/meta/evolution-log.jsonl"`, + ` golden_suite: "${TEST_DIR}/phantom-config/meta/golden-suite.jsonl"`, + ` session_log: "${TEST_DIR}/phantom-config/memory/session-log.jsonl"`, + ].join("\n"), + "utf-8", + ); + writeFileSync(`${TEST_DIR}/phantom-config/constitution.md`, "# Constitution\n", "utf-8"); + writeFileSync(`${TEST_DIR}/phantom-config/persona.md`, "", "utf-8"); + writeFileSync(`${TEST_DIR}/phantom-config/user-profile.md`, "", "utf-8"); + writeFileSync(`${TEST_DIR}/phantom-config/domain-knowledge.md`, "", "utf-8"); + writeFileSync(`${TEST_DIR}/phantom-config/strategies/task-patterns.md`, "", "utf-8"); + writeFileSync(`${TEST_DIR}/phantom-config/strategies/tool-preferences.md`, "", "utf-8"); + writeFileSync(`${TEST_DIR}/phantom-config/strategies/error-recovery.md`, "", "utf-8"); + 
writeFileSync(`${TEST_DIR}/phantom-config/memory/session-log.jsonl`, "", "utf-8"); + writeFileSync( + `${TEST_DIR}/phantom-config/meta/version.json`, + JSON.stringify({ + version: 0, + parent: null, + timestamp: new Date().toISOString(), + changes: [], + metrics_at_change: { session_count: 0, success_rate_7d: 0, correction_rate_7d: 0 }, + }), + "utf-8", + ); + writeFileSync( + `${TEST_DIR}/phantom-config/meta/metrics.json`, + JSON.stringify({ + session_count: 0, + success_count: 0, + failure_count: 0, + correction_count: 0, + evolution_count: 0, + rollback_count: 0, + last_session_at: null, + last_evolution_at: null, + success_rate_7d: 0, + correction_rate_7d: 0, + sessions_since_consolidation: 0, + }), + "utf-8", + ); + writeFileSync(`${TEST_DIR}/phantom-config/meta/evolution-log.jsonl`, "", "utf-8"); + writeFileSync(`${TEST_DIR}/phantom-config/meta/golden-suite.jsonl`, "", "utf-8"); + + const engine = new EvolutionEngine(CONFIG_PATH); + expect(engine.getEvolutionConfig().judges.cost_cap_usd_per_day).toBe(50.0); + }); + + test("engine uses heuristic path when judges are disabled", async () => { + setupTestEnv(50.0); + const engine = new EvolutionEngine(CONFIG_PATH); + + // judges.enabled: "never" means heuristics + expect(engine.usesLLMJudges()).toBe(false); + + const result = await engine.afterSession(makeSession()); + // Should still work with heuristics + expect(result.changes_applied.length).toBeGreaterThan(0); + + const userProfile = readFileSync(`${TEST_DIR}/phantom-config/user-profile.md`, "utf-8"); + expect(userProfile).toContain("TypeScript"); + }); +}); diff --git a/src/evolution/__tests__/engine.test.ts b/src/evolution/__tests__/engine.test.ts index 47fef28..7af4012 100644 --- a/src/evolution/__tests__/engine.test.ts +++ b/src/evolution/__tests__/engine.test.ts @@ -27,6 +27,8 @@ function setupTestEnvironment(): void { " auto_rollback_window: 5", "reflection:", ' model: "claude-sonnet-4-20250514"', + "judges:", + ' enabled: "never"', "paths:", ` 
/**
 * Create a golden-suite fixture entry.
 *
 * @param index - Number embedded in the id, description, lesson and session id.
 * @param daysAgo - How many calendar days before "now" the entry is dated (default 0).
 * @returns A golden case whose `created_at` is an ISO-8601 timestamp.
 */
function makeGoldenCase(index: number, daysAgo = 0): GoldenCase {
	// Shift the day-of-month on a local-time Date; Date#setDate rolls over month
	// and year boundaries for out-of-range values.
	const createdAt = new Date();
	const shiftedDay = createdAt.getDate() - daysAgo;
	createdAt.setDate(shiftedDay);

	// Key order is kept stable so the serialized JSONL stays identical.
	const goldenCase: GoldenCase = {
		id: `golden-${index}`,
		description: `Correction ${index}`,
		lesson: `Lesson for correction ${index}`,
		session_id: `session-${index}`,
		created_at: createdAt.toISOString(),
	};
	return goldenCase;
}
writeFileSync(`${TEST_DIR}/meta/golden-suite.jsonl`, "", "utf-8"); + }); + + afterEach(() => { + rmSync(TEST_DIR, { recursive: true, force: true }); + }); + + test("pruneSuite is a no-op when suite is under the cap", () => { + const config = testConfig(); + for (let i = 0; i < 5; i++) { + addCase(config, makeGoldenCase(i)); + } + const removed = pruneSuite(config, 50); + expect(removed).toBe(0); + expect(loadSuite(config)).toHaveLength(5); + }); + + test("pruneSuite removes oldest entries when suite exceeds cap", () => { + const config = testConfig(); + // Add 10 cases with decreasing age (0 = newest, 9 = oldest) + for (let i = 0; i < 10; i++) { + addCase(config, makeGoldenCase(i, i)); + } + expect(loadSuite(config)).toHaveLength(10); + + const removed = pruneSuite(config, 5); + expect(removed).toBe(5); + + const remaining = loadSuite(config); + expect(remaining).toHaveLength(5); + + // Remaining should be the 5 newest (days ago 0-4) + for (const entry of remaining) { + const id = Number.parseInt(entry.id.replace("golden-", ""), 10); + expect(id).toBeLessThan(5); + } + }); + + test("pruneSuite with max_golden_suite_size defaults to 50", () => { + const config = testConfig(); + // Default cap is 50 from the config + expect(config.judges.max_golden_suite_size).toBe(50); + }); + + test("pruneSuite handles empty suite", () => { + const config = testConfig(); + const removed = pruneSuite(config, 50); + expect(removed).toBe(0); + }); + + test("pruneSuite handles suite at exactly the cap", () => { + const config = testConfig(); + for (let i = 0; i < 5; i++) { + addCase(config, makeGoldenCase(i)); + } + const removed = pruneSuite(config, 5); + expect(removed).toBe(0); + expect(loadSuite(config)).toHaveLength(5); + }); + + test("pruneSuite keeps newest entries when exceeding cap by 1", () => { + const config = testConfig(); + // oldest first, then newest + addCase(config, makeGoldenCase(0, 10)); + addCase(config, makeGoldenCase(1, 0)); + + const removed = pruneSuite(config, 
1); + expect(removed).toBe(1); + + const remaining = loadSuite(config); + expect(remaining).toHaveLength(1); + // The newest entry (days ago 0) should remain + expect(remaining[0].id).toBe("golden-1"); + }); +}); diff --git a/src/evolution/__tests__/judge-activation.test.ts b/src/evolution/__tests__/judge-activation.test.ts new file mode 100644 index 0000000..59935ce --- /dev/null +++ b/src/evolution/__tests__/judge-activation.test.ts @@ -0,0 +1,205 @@ +import { afterEach, beforeEach, describe, expect, test } from "bun:test"; +import { mkdirSync, rmSync, writeFileSync } from "node:fs"; +import { EvolutionEngine } from "../engine.ts"; + +const TEST_DIR = "/tmp/phantom-test-judge-activation"; +const CONFIG_PATH = `${TEST_DIR}/config/evolution.yaml`; + +function setupWithJudgeMode(enabled: "auto" | "always" | "never"): void { + mkdirSync(`${TEST_DIR}/config`, { recursive: true }); + mkdirSync(`${TEST_DIR}/phantom-config/meta`, { recursive: true }); + mkdirSync(`${TEST_DIR}/phantom-config/strategies`, { recursive: true }); + mkdirSync(`${TEST_DIR}/phantom-config/memory`, { recursive: true }); + + writeFileSync( + CONFIG_PATH, + [ + "cadence:", + " reflection_interval: 1", + " consolidation_interval: 10", + "gates:", + " drift_threshold: 0.7", + " max_file_lines: 200", + " auto_rollback_threshold: 0.1", + " auto_rollback_window: 5", + "reflection:", + ' model: "claude-sonnet-4-20250514"', + "judges:", + ` enabled: "${enabled}"`, + "paths:", + ` config_dir: "${TEST_DIR}/phantom-config"`, + ` constitution: "${TEST_DIR}/phantom-config/constitution.md"`, + ` version_file: "${TEST_DIR}/phantom-config/meta/version.json"`, + ` metrics_file: "${TEST_DIR}/phantom-config/meta/metrics.json"`, + ` evolution_log: "${TEST_DIR}/phantom-config/meta/evolution-log.jsonl"`, + ` golden_suite: "${TEST_DIR}/phantom-config/meta/golden-suite.jsonl"`, + ` session_log: "${TEST_DIR}/phantom-config/memory/session-log.jsonl"`, + ].join("\n"), + "utf-8", + ); + + 
/**
 * Remove ANTHROPIC_API_KEY from the environment entirely.
 *
 * Assigning `undefined` is not a reliable way to unset a variable: Node.js converts
 * values written to `process.env` to strings, so the key would become the truthy
 * string "undefined" and the "key is missing" cases would silently test the opposite.
 * Deleting the property behaves the same on every runtime.
 */
function clearApiKey(): void {
	Reflect.deleteProperty(process.env, "ANTHROPIC_API_KEY");
}

/**
 * Verifies how `judges.enabled` ("auto" | "always" | "never") combines with the
 * presence of ANTHROPIC_API_KEY to decide whether the engine uses LLM judges.
 */
describe("Judge Activation", () => {
	beforeEach(() => {
		// Remember the caller's key so each test can freely set or clear it.
		savedApiKey = process.env.ANTHROPIC_API_KEY;
	});

	afterEach(() => {
		// Restore the exact pre-test state, including "not set at all".
		if (savedApiKey !== undefined) {
			process.env.ANTHROPIC_API_KEY = savedApiKey;
		} else {
			clearApiKey();
		}
		rmSync(TEST_DIR, { recursive: true, force: true });
	});

	test("auto mode enables judges when ANTHROPIC_API_KEY is set", () => {
		process.env.ANTHROPIC_API_KEY = "sk-test-key";
		setupWithJudgeMode("auto");
		const engine = new EvolutionEngine(CONFIG_PATH);
		expect(engine.usesLLMJudges()).toBe(true);
	});

	test("auto mode disables judges when ANTHROPIC_API_KEY is missing", () => {
		clearApiKey();
		setupWithJudgeMode("auto");
		const engine = new EvolutionEngine(CONFIG_PATH);
		expect(engine.usesLLMJudges()).toBe(false);
	});

	test("never mode disables judges even when API key is set", () => {
		process.env.ANTHROPIC_API_KEY = "sk-test-key";
		setupWithJudgeMode("never");
		const engine = new EvolutionEngine(CONFIG_PATH);
		expect(engine.usesLLMJudges()).toBe(false);
	});

	test("always mode enables judges regardless of API key", () => {
		clearApiKey();
		setupWithJudgeMode("always");
		const engine = new EvolutionEngine(CONFIG_PATH);
		expect(engine.usesLLMJudges()).toBe(true);
	});

	test("usesLLMJudges accessor matches resolved state", () => {
		process.env.ANTHROPIC_API_KEY = "sk-test-key";
		setupWithJudgeMode("auto");
		const engine = new EvolutionEngine(CONFIG_PATH);
		expect(engine.usesLLMJudges()).toBe(true);

		// The mode is resolved at construction time, so a second engine is needed
		// to observe the environment change.
		clearApiKey();
		setupWithJudgeMode("auto");
		const engine2 = new EvolutionEngine(CONFIG_PATH);
		expect(engine2.usesLLMJudges()).toBe(false);
	});

	test("missing judges section defaults to auto mode", () => {
		clearApiKey();

		// Reuse the shared fixture for directories and state files, then replace
		// only the config with one that has no `judges` section.
		setupWithJudgeMode("auto");
		writeFileSync(
			CONFIG_PATH,
			[
				"cadence:",
				"  reflection_interval: 1",
				"paths:",
				`  config_dir: "${TEST_DIR}/phantom-config"`,
				`  constitution: "${TEST_DIR}/phantom-config/constitution.md"`,
				`  version_file: "${TEST_DIR}/phantom-config/meta/version.json"`,
				`  metrics_file: "${TEST_DIR}/phantom-config/meta/metrics.json"`,
				`  evolution_log: "${TEST_DIR}/phantom-config/meta/evolution-log.jsonl"`,
				`  golden_suite: "${TEST_DIR}/phantom-config/meta/golden-suite.jsonl"`,
				`  session_log: "${TEST_DIR}/phantom-config/memory/session-log.jsonl"`,
			].join("\n"),
			"utf-8",
		);

		// No API key + implicit "auto" = disabled
		const engine = new EvolutionEngine(CONFIG_PATH);
		expect(engine.usesLLMJudges()).toBe(false);
	});
});
auto_rollback_threshold: 0.1, auto_rollback_window: 5 }, reflection: { model: "claude-sonnet-4-20250514", effort: "high", max_budget_usd: 0.5 }, + judges: { enabled: "auto", cost_cap_usd_per_day: 50.0, max_golden_suite_size: 50 }, paths: { config_dir: TEST_DIR, constitution: `${TEST_DIR}/constitution.md`, diff --git a/src/evolution/config.ts b/src/evolution/config.ts index abbcd89..ea2f6d2 100644 --- a/src/evolution/config.ts +++ b/src/evolution/config.ts @@ -26,6 +26,13 @@ export const EvolutionConfigSchema = z.object({ max_budget_usd: z.number().positive().default(0.5), }) .default({}), + judges: z + .object({ + enabled: z.enum(["auto", "always", "never"]).default("auto"), + cost_cap_usd_per_day: z.number().positive().default(50.0), + max_golden_suite_size: z.number().int().positive().default(50), + }) + .default({}), paths: z .object({ config_dir: z.string().default("phantom-config"), diff --git a/src/evolution/engine.ts b/src/evolution/engine.ts index 87bd38d..c32b4df 100644 --- a/src/evolution/engine.ts +++ b/src/evolution/engine.ts @@ -4,8 +4,7 @@ import { applyApproved } from "./application.ts"; import { type EvolutionConfig, loadEvolutionConfig } from "./config.ts"; import { recordObservations, runConsolidation } from "./consolidation.ts"; import { ConstitutionChecker } from "./constitution.ts"; -import { addCase } from "./golden-suite.ts"; -import { loadSuite } from "./golden-suite.ts"; +import { addCase, loadSuite, pruneSuite } from "./golden-suite.ts"; import { runQualityJudge } from "./judges/quality-judge.ts"; import { type JudgeCosts, emptyJudgeCosts } from "./judges/types.ts"; import { @@ -30,24 +29,45 @@ import { getHistory, readVersion, rollback as versionRollback } from "./versioni export class EvolutionEngine { private config: EvolutionConfig; private checker: ConstitutionChecker; - private useLLMJudges: boolean; + private llmJudgesEnabled: boolean; + private dailyCostUsd = 0; + private dailyCostResetDate = ""; - constructor(configPath?: string, 
useLLMJudges = false) { + constructor(configPath?: string) { this.config = loadEvolutionConfig(configPath); this.checker = new ConstitutionChecker(this.config); - this.useLLMJudges = useLLMJudges; + this.llmJudgesEnabled = this.resolveJudgeMode(); + if (this.llmJudgesEnabled) { + console.log("[evolution] LLM judges enabled (API key detected)"); + } else { + console.log("[evolution] LLM judges disabled (no API key or config override)"); + } } - getEvolutionConfig(): EvolutionConfig { - return this.config; + private resolveJudgeMode(): boolean { + const setting = this.config.judges?.enabled ?? "auto"; + if (setting === "never") return false; + if (setting === "always") return true; + return !!process.env.ANTHROPIC_API_KEY; } - enableLLMJudges(): void { - this.useLLMJudges = true; + usesLLMJudges(): boolean { + return this.llmJudgesEnabled; } - disableLLMJudges(): void { - this.useLLMJudges = false; + /** Memory consolidation runs outside afterSession() but still needs to respect the daily cap. */ + isWithinCostCap(): boolean { + return !this.isDailyCostCapReached(); + } + + /** Consolidation judge costs happen outside the evolution pipeline but count toward the daily cap. 
*/ + trackExternalJudgeCost(cost: { totalUsd: number }): void { + this.resetDailyCostIfNewDay(); + this.dailyCostUsd += cost.totalUsd; + } + + getEvolutionConfig(): EvolutionConfig { + return this.config; } /** @@ -62,12 +82,13 @@ export class EvolutionEngine { // Step 1: Observation Extraction (LLM or heuristic) let observations: import("./types.ts").SessionObservation[]; - if (this.useLLMJudges) { + if (this.llmJudgesEnabled && !this.isDailyCostCapReached()) { const currentConfig = this.getConfig(); const result = await extractObservationsWithLLM(session, currentConfig); observations = result.observations; if (result.judgeCost) { addCost(judgeCosts.observation_extraction, result.judgeCost); + this.incrementDailyCost(result.judgeCost.totalUsd); } } else { observations = extractObservations(session); @@ -98,10 +119,11 @@ export class EvolutionEngine { const goldenSuite = loadSuite(this.config); let validationResults: import("./types.ts").ValidationResult[]; - if (this.useLLMJudges) { + if (this.llmJudgesEnabled && !this.isDailyCostCapReached()) { const judgeResult = await validateAllWithJudges(deltas, this.checker, goldenSuite, this.config, currentConfig); validationResults = judgeResult.results; mergeCosts(judgeCosts, judgeResult.judgeCosts); + this.incrementDailyCost(totalCostFromJudgeCosts(judgeResult.judgeCosts)); } else { validationResults = validateAll(deltas, this.checker, goldenSuite, this.config); } @@ -139,13 +161,14 @@ export class EvolutionEngine { } // Quality Assessment (LLM only, non-blocking) - if (this.useLLMJudges) { + if (this.llmJudgesEnabled && !this.isDailyCostCapReached()) { try { const qualityResult = await runQualityJudge(session, currentConfig); judgeCosts.quality_assessment.calls++; judgeCosts.quality_assessment.totalUsd += qualityResult.costUsd; judgeCosts.quality_assessment.totalInputTokens += qualityResult.inputTokens; judgeCosts.quality_assessment.totalOutputTokens += qualityResult.outputTokens; + 
this.incrementDailyCost(qualityResult.costUsd); if (qualityResult.data.regression_signal) { console.warn( @@ -184,11 +207,14 @@ export class EvolutionEngine { this.rollback(this.getCurrentVersion() - 1); } - // Record judge costs - if (this.useLLMJudges) { + // Record judge costs to persistent metrics (daily tracking already done incrementally above) + if (this.llmJudgesEnabled) { this.recordJudgeCosts(judgeCosts); } + // Enforce golden suite cap + this.pruneGoldenSuite(); + return { version: this.getCurrentVersion(), changes_applied: applied, @@ -236,6 +262,39 @@ export class EvolutionEngine { console.log(`[evolution] Rolled back to version ${toVersion}`); } + private resetDailyCostIfNewDay(): void { + const today = new Date().toISOString().slice(0, 10); + if (this.dailyCostResetDate !== today) { + this.dailyCostUsd = 0; + this.dailyCostResetDate = today; + } + } + + private isDailyCostCapReached(): boolean { + this.resetDailyCostIfNewDay(); + const cap = this.config.judges?.cost_cap_usd_per_day ?? 50.0; + if (this.dailyCostUsd >= cap) { + console.warn( + `[evolution] Daily cost cap reached ($${this.dailyCostUsd.toFixed(2)} >= $${cap}), using heuristics`, + ); + return true; + } + return false; + } + + private incrementDailyCost(usd: number): void { + this.resetDailyCostIfNewDay(); + this.dailyCostUsd += usd; + } + + private pruneGoldenSuite(): void { + const maxSize = this.config.judges?.max_golden_suite_size ?? 
/**
 * Prune the golden suite to at most `maxSize` entries, keeping the newest ones
 * (ordered by `created_at`, descending) and rewriting the suite file.
 * No-op if the suite is already within the limit.
 *
 * @param config - Evolution config; `config.paths.golden_suite` is the file that is rewritten.
 * @param maxSize - Maximum number of entries to keep. Must be a non-negative integer.
 * @returns The number of entries removed (0 when nothing was pruned).
 * @throws RangeError if `maxSize` is negative, fractional, NaN or infinite.
 */
export function pruneSuite(config: EvolutionConfig, maxSize: number): number {
	// Refuse invalid caps up front: `slice(0, -k)` would drop entries from the tail
	// and NaN would wipe the whole suite, both with a wrong return value.
	if (!Number.isInteger(maxSize) || maxSize < 0) {
		throw new RangeError(`pruneSuite: maxSize must be a non-negative integer, got ${maxSize}`);
	}

	const suite = loadSuite(config);
	if (suite.length <= maxSize) return 0;

	// Sort a copy so the array returned by loadSuite is left untouched.
	const newestFirst = [...suite].sort((a, b) => b.created_at.localeCompare(a.created_at));
	const kept = newestFirst.slice(0, maxSize);

	// One JSON document per line; an empty result produces an empty file rather
	// than a single blank line.
	const content = kept.map((c) => `${JSON.stringify(c)}\n`).join("");
	writeFileSync(config.paths.golden_suite, content, "utf-8");

	return suite.length - kept.length;
}
/**
 * Report whether an Anthropic API key is configured for the LLM judges.
 *
 * Only the presence of the `ANTHROPIC_API_KEY` environment variable is checked;
 * the key itself is not validated.
 *
 * @returns `true` when the variable is set to a non-blank value.
 */
export function isJudgeAvailable(): boolean {
	const apiKey = process.env.ANTHROPIC_API_KEY;
	// A whitespace-only value (e.g. a trailing space after `ANTHROPIC_API_KEY=`
	// in an env file) is truthy but is not a usable key.
	return typeof apiKey === "string" && apiKey.trim().length > 0;
}
JUDGE_MODEL_SONNET = "claude-sonnet-4-6"; +export const JUDGE_MODEL_HAIKU = "claude-haiku-4-5"; +export const JUDGE_MODEL_OPUS = "claude-opus-4-6"; export const JUDGE_TIMEOUT_MS = 30_000; export const JUDGE_MAX_TOKENS = 4096; diff --git a/src/index.ts b/src/index.ts index 4a25539..47fd660 100644 --- a/src/index.ts +++ b/src/index.ts @@ -37,7 +37,7 @@ import { PeerHealthMonitor } from "./mcp/peer-health.ts"; import { PeerManager } from "./mcp/peers.ts"; import { PhantomMcpServer } from "./mcp/server.ts"; import { loadMemoryConfig } from "./memory/config.ts"; -import { type SessionData, consolidateSession } from "./memory/consolidation.ts"; +import { type SessionData, consolidateSession, consolidateSessionWithLLM } from "./memory/consolidation.ts"; import { MemoryContextBuilder } from "./memory/context-builder.ts"; import { MemorySystem } from "./memory/system.ts"; import { isFirstRun, isOnboardingInProgress } from "./onboarding/detection.ts"; @@ -99,7 +99,8 @@ async function main(): Promise { try { evolution = new EvolutionEngine(); const currentVersion = evolution.getCurrentVersion(); - console.log(`[evolution] Engine initialized (v${currentVersion})`); + const judgeMode = evolution.usesLLMJudges() ? "LLM judges" : "heuristic"; + console.log(`[evolution] Engine initialized (v${currentVersion}, ${judgeMode})`); setEvolutionVersionProvider(() => evolution?.getCurrentVersion() ?? 0); } catch (err: unknown) { const msg = err instanceof Error ? err.message : String(err); @@ -491,19 +492,41 @@ async function main(): Promise { outcome: response.text.startsWith("Error:") ? "failure" : "success", }; - consolidateSession(memory, sessionData) - .then((result) => { - if (result.episodesCreated > 0 || result.factsExtracted > 0) { - console.log( - `[memory] Consolidated: ${result.episodesCreated} episodes, ` + - `${result.factsExtracted} facts (${result.durationMs}ms)`, - ); - } - }) - .catch((err: unknown) => { - const errMsg = err instanceof Error ? 
err.message : String(err); - console.warn(`[memory] Consolidation failed: ${errMsg}`); - }); + const useLLMConsolidation = evolution?.usesLLMJudges() && evolution.isWithinCostCap(); + if (useLLMConsolidation) { + const evolvedConfig = evolution?.getConfig(); + const existingFacts = evolvedConfig ? `${evolvedConfig.userProfile}\n${evolvedConfig.domainKnowledge}` : ""; + consolidateSessionWithLLM(memory, sessionData, existingFacts) + .then(({ result, judgeCost }) => { + if (judgeCost) { + evolution?.trackExternalJudgeCost(judgeCost); + } + if (result.episodesCreated > 0 || result.factsExtracted > 0) { + console.log( + `[memory] Consolidated (LLM): ${result.episodesCreated} episodes, ` + + `${result.factsExtracted} facts (${result.durationMs}ms)`, + ); + } + }) + .catch((err: unknown) => { + const errMsg = err instanceof Error ? err.message : String(err); + console.warn(`[memory] LLM consolidation failed: ${errMsg}`); + }); + } else { + consolidateSession(memory, sessionData) + .then((result) => { + if (result.episodesCreated > 0 || result.factsExtracted > 0) { + console.log( + `[memory] Consolidated: ${result.episodesCreated} episodes, ` + + `${result.factsExtracted} facts (${result.durationMs}ms)`, + ); + } + }) + .catch((err: unknown) => { + const errMsg = err instanceof Error ? 
err.message : String(err); + console.warn(`[memory] Consolidation failed: ${errMsg}`); + }); + } } // Evolution pipeline (non-blocking) diff --git a/src/mcp/__tests__/dynamic-handlers.test.ts b/src/mcp/__tests__/dynamic-handlers.test.ts index 1c5376b..86aa33b 100644 --- a/src/mcp/__tests__/dynamic-handlers.test.ts +++ b/src/mcp/__tests__/dynamic-handlers.test.ts @@ -1,6 +1,6 @@ import { describe, expect, test } from "bun:test"; -import type { DynamicToolDef } from "../dynamic-tools.ts"; import { buildSafeEnv, executeDynamicHandler } from "../dynamic-handlers.ts"; +import type { DynamicToolDef } from "../dynamic-tools.ts"; describe("buildSafeEnv", () => { test("includes only safe environment variables", () => { @@ -24,7 +24,7 @@ describe("buildSafeEnv", () => { if (origKey !== undefined) { process.env.ANTHROPIC_API_KEY = origKey; } else { - delete process.env.ANTHROPIC_API_KEY; + process.env.ANTHROPIC_API_KEY = undefined; } } }); @@ -40,7 +40,7 @@ describe("buildSafeEnv", () => { if (origToken !== undefined) { process.env.SLACK_BOT_TOKEN = origToken; } else { - delete process.env.SLACK_BOT_TOKEN; + process.env.SLACK_BOT_TOKEN = undefined; } } }); @@ -82,7 +82,7 @@ describe("executeDynamicHandler", () => { if (origKey !== undefined) { process.env.ANTHROPIC_API_KEY = origKey; } else { - delete process.env.ANTHROPIC_API_KEY; + process.env.ANTHROPIC_API_KEY = undefined; } } }); @@ -154,7 +154,7 @@ describe("executeDynamicHandler", () => { if (origKey !== undefined) { process.env.ANTHROPIC_API_KEY = origKey; } else { - delete process.env.ANTHROPIC_API_KEY; + process.env.ANTHROPIC_API_KEY = undefined; } } });