From 4a2d3be560b2e8dea9c24b5457fca26ebc34ab64 Mon Sep 17 00:00:00 2001 From: Muhammad Ahmed Cheema Date: Mon, 30 Mar 2026 21:06:19 -0700 Subject: [PATCH 1/3] feat: activate LLM judges for self-evolution engine The self-evolution LLM judges were built and tested but never activated. The heuristic regex fallback was running as the primary path, which violates the Cardinal Rule (TypeScript doing reasoning work that should be delegated to the LLM). This change auto-detects ANTHROPIC_API_KEY at construction time and enables Sonnet-powered judges when available. The heuristic path remains as a fallback for environments without an API key. What changed: - EvolutionEngine constructor resolves judge mode at startup via config setting (auto/always/never) + API key detection - Removed enableLLMJudges() and disableLLMJudges() runtime toggles that were never called and could cause inconsistent state - Added judges config section to evolution.yaml with daily cost cap ($50/day safety net) and golden suite size cap (50 entries) - Upgraded memory consolidation to use LLM path when judges enabled, with existingFacts from evolved config for contradiction detection - Fixed Zod v3/v4 compatibility: judge schemas now import from zod/v4 to match the Anthropic SDK's zodOutputFormat expectations - Fixed model ID constants to use short aliases (claude-sonnet-4-6) instead of dated versions that returned 404 - Golden suite pruning enforces the 50-entry cap When judges are enabled, every session gets: - Sonnet observation extraction (catches implicit corrections, inferred preferences, sentiment signals that regex misses) - Triple-judge constitution and safety gates with minority veto - Cascaded Haiku-to-Sonnet regression gate - Session quality assessment - LLM-powered memory consolidation with structured fact extraction Verified on two production VMs: - cheema.ghostwright.dev: judges correctly rejected an unsafe evolution change ("never suggest anything else") based on constitutional analysis 
of the Honesty principle - cheem.ghostwright.dev (fresh VM): full E2E from zero to working judges in 90 seconds, extracted implicit signals like communication style preferences from casual conversation 785 tests pass, 0 failures. Typecheck clean. Lint clean. --- config/evolution.yaml | 11 + src/evolution/__tests__/application.test.ts | 1 + src/evolution/__tests__/constitution.test.ts | 1 + src/evolution/__tests__/cost-cap.test.ts | 199 +++++++++++++++++ src/evolution/__tests__/engine.test.ts | 2 + .../__tests__/golden-suite-cap.test.ts | 116 ++++++++++ .../__tests__/judge-activation.test.ts | 205 ++++++++++++++++++ src/evolution/__tests__/metrics.test.ts | 1 + src/evolution/__tests__/validation.test.ts | 1 + src/evolution/__tests__/versioning.test.ts | 1 + src/evolution/config.ts | 7 + src/evolution/engine.ts | 80 +++++-- src/evolution/golden-suite.ts | 16 ++ src/evolution/judges/client.ts | 11 +- src/evolution/judges/schemas.ts | 3 +- src/evolution/judges/types.ts | 6 +- src/index.ts | 50 +++-- 17 files changed, 674 insertions(+), 37 deletions(-) create mode 100644 src/evolution/__tests__/cost-cap.test.ts create mode 100644 src/evolution/__tests__/golden-suite-cap.test.ts create mode 100644 src/evolution/__tests__/judge-activation.test.ts diff --git a/config/evolution.yaml b/config/evolution.yaml index dcb325b..89cc4fd 100644 --- a/config/evolution.yaml +++ b/config/evolution.yaml @@ -28,6 +28,17 @@ reflection: effort: "high" max_budget_usd: 0.50 +# LLM Judge Configuration +judges: + # "auto" enables when ANTHROPIC_API_KEY is available + # "always" enables unconditionally + # "never" disables unconditionally + enabled: "auto" + # Safety net against runaway costs (daily reset) + cost_cap_usd_per_day: 50.0 + # Maximum golden suite entries (prune oldest when exceeded) + max_golden_suite_size: 50 + # Directory paths (relative to project root) paths: config_dir: "phantom-config" diff --git a/src/evolution/__tests__/application.test.ts 
b/src/evolution/__tests__/application.test.ts index b1b33b0..9ac42b9 100644 --- a/src/evolution/__tests__/application.test.ts +++ b/src/evolution/__tests__/application.test.ts @@ -12,6 +12,7 @@ function testConfig(): EvolutionConfig { cadence: { reflection_interval: 1, consolidation_interval: 10, full_review_interval: 50, drift_check_interval: 20 }, gates: { drift_threshold: 0.7, max_file_lines: 200, auto_rollback_threshold: 0.1, auto_rollback_window: 5 }, reflection: { model: "claude-sonnet-4-20250514", effort: "high", max_budget_usd: 0.5 }, + judges: { enabled: "auto", cost_cap_usd_per_day: 50.0, max_golden_suite_size: 50 }, paths: { config_dir: TEST_DIR, constitution: `${TEST_DIR}/constitution.md`, diff --git a/src/evolution/__tests__/constitution.test.ts b/src/evolution/__tests__/constitution.test.ts index 3e5c0a5..1cdfc71 100644 --- a/src/evolution/__tests__/constitution.test.ts +++ b/src/evolution/__tests__/constitution.test.ts @@ -11,6 +11,7 @@ function testConfig(): EvolutionConfig { cadence: { reflection_interval: 1, consolidation_interval: 10, full_review_interval: 50, drift_check_interval: 20 }, gates: { drift_threshold: 0.7, max_file_lines: 200, auto_rollback_threshold: 0.1, auto_rollback_window: 5 }, reflection: { model: "claude-sonnet-4-20250514", effort: "high", max_budget_usd: 0.5 }, + judges: { enabled: "auto", cost_cap_usd_per_day: 50.0, max_golden_suite_size: 50 }, paths: { config_dir: TEST_DIR, constitution: `${TEST_DIR}/constitution.md`, diff --git a/src/evolution/__tests__/cost-cap.test.ts b/src/evolution/__tests__/cost-cap.test.ts new file mode 100644 index 0000000..24135db --- /dev/null +++ b/src/evolution/__tests__/cost-cap.test.ts @@ -0,0 +1,199 @@ +import { afterEach, beforeEach, describe, expect, test } from "bun:test"; +import { mkdirSync, readFileSync, rmSync, writeFileSync } from "node:fs"; +import { EvolutionEngine } from "../engine.ts"; +import type { SessionSummary } from "../types.ts"; + +const TEST_DIR = 
"/tmp/phantom-test-cost-cap"; +const CONFIG_PATH = `${TEST_DIR}/config/evolution.yaml`; + +let savedApiKey: string | undefined; + +function setupTestEnv(costCap: number): void { + mkdirSync(`${TEST_DIR}/config`, { recursive: true }); + mkdirSync(`${TEST_DIR}/phantom-config/meta`, { recursive: true }); + mkdirSync(`${TEST_DIR}/phantom-config/strategies`, { recursive: true }); + mkdirSync(`${TEST_DIR}/phantom-config/memory`, { recursive: true }); + + writeFileSync( + CONFIG_PATH, + [ + "cadence:", + " reflection_interval: 1", + " consolidation_interval: 10", + "gates:", + " drift_threshold: 0.7", + " max_file_lines: 200", + " auto_rollback_threshold: 0.1", + " auto_rollback_window: 5", + "judges:", + ' enabled: "never"', + ` cost_cap_usd_per_day: ${costCap}`, + " max_golden_suite_size: 50", + "paths:", + ` config_dir: "${TEST_DIR}/phantom-config"`, + ` constitution: "${TEST_DIR}/phantom-config/constitution.md"`, + ` version_file: "${TEST_DIR}/phantom-config/meta/version.json"`, + ` metrics_file: "${TEST_DIR}/phantom-config/meta/metrics.json"`, + ` evolution_log: "${TEST_DIR}/phantom-config/meta/evolution-log.jsonl"`, + ` golden_suite: "${TEST_DIR}/phantom-config/meta/golden-suite.jsonl"`, + ` session_log: "${TEST_DIR}/phantom-config/memory/session-log.jsonl"`, + ].join("\n"), + "utf-8", + ); + + writeFileSync(`${TEST_DIR}/phantom-config/constitution.md`, "# Constitution\n1. 
Be honest.\n", "utf-8"); + writeFileSync(`${TEST_DIR}/phantom-config/persona.md`, "", "utf-8"); + writeFileSync(`${TEST_DIR}/phantom-config/user-profile.md`, "# User Profile\n", "utf-8"); + writeFileSync(`${TEST_DIR}/phantom-config/domain-knowledge.md`, "", "utf-8"); + writeFileSync(`${TEST_DIR}/phantom-config/strategies/task-patterns.md`, "", "utf-8"); + writeFileSync(`${TEST_DIR}/phantom-config/strategies/tool-preferences.md`, "", "utf-8"); + writeFileSync(`${TEST_DIR}/phantom-config/strategies/error-recovery.md`, "", "utf-8"); + writeFileSync(`${TEST_DIR}/phantom-config/memory/session-log.jsonl`, "", "utf-8"); + writeFileSync( + `${TEST_DIR}/phantom-config/meta/version.json`, + JSON.stringify({ + version: 0, + parent: null, + timestamp: new Date().toISOString(), + changes: [], + metrics_at_change: { session_count: 0, success_rate_7d: 0, correction_rate_7d: 0 }, + }), + "utf-8", + ); + writeFileSync( + `${TEST_DIR}/phantom-config/meta/metrics.json`, + JSON.stringify({ + session_count: 0, + success_count: 0, + failure_count: 0, + correction_count: 0, + evolution_count: 0, + rollback_count: 0, + last_session_at: null, + last_evolution_at: null, + success_rate_7d: 0, + correction_rate_7d: 0, + sessions_since_consolidation: 0, + }), + "utf-8", + ); + writeFileSync(`${TEST_DIR}/phantom-config/meta/evolution-log.jsonl`, "", "utf-8"); + writeFileSync(`${TEST_DIR}/phantom-config/meta/golden-suite.jsonl`, "", "utf-8"); +} + +function makeSession(overrides: Partial = {}): SessionSummary { + return { + session_id: `session-${Date.now()}`, + session_key: "cli:main", + user_id: "user-1", + user_messages: ["No, use TypeScript not JavaScript"], + assistant_messages: ["Got it."], + tools_used: [], + files_tracked: [], + outcome: "success", + cost_usd: 0.05, + started_at: "2026-03-25T10:00:00Z", + ended_at: "2026-03-25T10:05:00Z", + ...overrides, + }; +} + +describe("Cost Cap", () => { + beforeEach(() => { + savedApiKey = process.env.ANTHROPIC_API_KEY; + }); + + afterEach(() => { 
+ if (savedApiKey !== undefined) { + process.env.ANTHROPIC_API_KEY = savedApiKey; + } else { + process.env.ANTHROPIC_API_KEY = undefined; + } + rmSync(TEST_DIR, { recursive: true, force: true }); + }); + + test("cost cap config is parsed from YAML", () => { + setupTestEnv(10.0); + const engine = new EvolutionEngine(CONFIG_PATH); + const config = engine.getEvolutionConfig(); + expect(config.judges.cost_cap_usd_per_day).toBe(10.0); + }); + + test("cost cap defaults to 50 when not configured", () => { + mkdirSync(`${TEST_DIR}/config`, { recursive: true }); + mkdirSync(`${TEST_DIR}/phantom-config/meta`, { recursive: true }); + mkdirSync(`${TEST_DIR}/phantom-config/strategies`, { recursive: true }); + mkdirSync(`${TEST_DIR}/phantom-config/memory`, { recursive: true }); + + writeFileSync( + CONFIG_PATH, + [ + "paths:", + ` config_dir: "${TEST_DIR}/phantom-config"`, + ` constitution: "${TEST_DIR}/phantom-config/constitution.md"`, + ` version_file: "${TEST_DIR}/phantom-config/meta/version.json"`, + ` metrics_file: "${TEST_DIR}/phantom-config/meta/metrics.json"`, + ` evolution_log: "${TEST_DIR}/phantom-config/meta/evolution-log.jsonl"`, + ` golden_suite: "${TEST_DIR}/phantom-config/meta/golden-suite.jsonl"`, + ` session_log: "${TEST_DIR}/phantom-config/memory/session-log.jsonl"`, + ].join("\n"), + "utf-8", + ); + writeFileSync(`${TEST_DIR}/phantom-config/constitution.md`, "# Constitution\n", "utf-8"); + writeFileSync(`${TEST_DIR}/phantom-config/persona.md`, "", "utf-8"); + writeFileSync(`${TEST_DIR}/phantom-config/user-profile.md`, "", "utf-8"); + writeFileSync(`${TEST_DIR}/phantom-config/domain-knowledge.md`, "", "utf-8"); + writeFileSync(`${TEST_DIR}/phantom-config/strategies/task-patterns.md`, "", "utf-8"); + writeFileSync(`${TEST_DIR}/phantom-config/strategies/tool-preferences.md`, "", "utf-8"); + writeFileSync(`${TEST_DIR}/phantom-config/strategies/error-recovery.md`, "", "utf-8"); + writeFileSync(`${TEST_DIR}/phantom-config/memory/session-log.jsonl`, "", "utf-8"); + 
writeFileSync( + `${TEST_DIR}/phantom-config/meta/version.json`, + JSON.stringify({ + version: 0, + parent: null, + timestamp: new Date().toISOString(), + changes: [], + metrics_at_change: { session_count: 0, success_rate_7d: 0, correction_rate_7d: 0 }, + }), + "utf-8", + ); + writeFileSync( + `${TEST_DIR}/phantom-config/meta/metrics.json`, + JSON.stringify({ + session_count: 0, + success_count: 0, + failure_count: 0, + correction_count: 0, + evolution_count: 0, + rollback_count: 0, + last_session_at: null, + last_evolution_at: null, + success_rate_7d: 0, + correction_rate_7d: 0, + sessions_since_consolidation: 0, + }), + "utf-8", + ); + writeFileSync(`${TEST_DIR}/phantom-config/meta/evolution-log.jsonl`, "", "utf-8"); + writeFileSync(`${TEST_DIR}/phantom-config/meta/golden-suite.jsonl`, "", "utf-8"); + + const engine = new EvolutionEngine(CONFIG_PATH); + expect(engine.getEvolutionConfig().judges.cost_cap_usd_per_day).toBe(50.0); + }); + + test("engine uses heuristic path when judges are disabled", async () => { + setupTestEnv(50.0); + const engine = new EvolutionEngine(CONFIG_PATH); + + // judges.enabled: "never" means heuristics + expect(engine.usesLLMJudges()).toBe(false); + + const result = await engine.afterSession(makeSession()); + // Should still work with heuristics + expect(result.changes_applied.length).toBeGreaterThan(0); + + const userProfile = readFileSync(`${TEST_DIR}/phantom-config/user-profile.md`, "utf-8"); + expect(userProfile).toContain("TypeScript"); + }); +}); diff --git a/src/evolution/__tests__/engine.test.ts b/src/evolution/__tests__/engine.test.ts index 47fef28..7af4012 100644 --- a/src/evolution/__tests__/engine.test.ts +++ b/src/evolution/__tests__/engine.test.ts @@ -27,6 +27,8 @@ function setupTestEnvironment(): void { " auto_rollback_window: 5", "reflection:", ' model: "claude-sonnet-4-20250514"', + "judges:", + ' enabled: "never"', "paths:", ` config_dir: "${TEST_DIR}/phantom-config"`, ` constitution: 
"${TEST_DIR}/phantom-config/constitution.md"`, diff --git a/src/evolution/__tests__/golden-suite-cap.test.ts b/src/evolution/__tests__/golden-suite-cap.test.ts new file mode 100644 index 0000000..1741f83 --- /dev/null +++ b/src/evolution/__tests__/golden-suite-cap.test.ts @@ -0,0 +1,116 @@ +import { afterEach, beforeEach, describe, expect, test } from "bun:test"; +import { mkdirSync, rmSync, writeFileSync } from "node:fs"; +import type { EvolutionConfig } from "../config.ts"; +import { addCase, loadSuite, pruneSuite } from "../golden-suite.ts"; +import type { GoldenCase } from "../types.ts"; + +const TEST_DIR = "/tmp/phantom-test-golden-cap"; + +function testConfig(): EvolutionConfig { + return { + cadence: { reflection_interval: 1, consolidation_interval: 10, full_review_interval: 50, drift_check_interval: 20 }, + gates: { drift_threshold: 0.7, max_file_lines: 200, auto_rollback_threshold: 0.1, auto_rollback_window: 5 }, + reflection: { model: "claude-sonnet-4-20250514", effort: "high", max_budget_usd: 0.5 }, + judges: { enabled: "auto", cost_cap_usd_per_day: 50.0, max_golden_suite_size: 50 }, + paths: { + config_dir: TEST_DIR, + constitution: `${TEST_DIR}/constitution.md`, + version_file: `${TEST_DIR}/meta/version.json`, + metrics_file: `${TEST_DIR}/meta/metrics.json`, + evolution_log: `${TEST_DIR}/meta/evolution-log.jsonl`, + golden_suite: `${TEST_DIR}/meta/golden-suite.jsonl`, + session_log: `${TEST_DIR}/memory/session-log.jsonl`, + }, + }; +} + +function makeGoldenCase(index: number, daysAgo = 0): GoldenCase { + const date = new Date(); + date.setDate(date.getDate() - daysAgo); + return { + id: `golden-${index}`, + description: `Correction ${index}`, + lesson: `Lesson for correction ${index}`, + session_id: `session-${index}`, + created_at: date.toISOString(), + }; +} + +describe("Golden Suite Cap", () => { + beforeEach(() => { + mkdirSync(`${TEST_DIR}/meta`, { recursive: true }); + writeFileSync(`${TEST_DIR}/meta/golden-suite.jsonl`, "", "utf-8"); + }); + + 
afterEach(() => { + rmSync(TEST_DIR, { recursive: true, force: true }); + }); + + test("pruneSuite is a no-op when suite is under the cap", () => { + const config = testConfig(); + for (let i = 0; i < 5; i++) { + addCase(config, makeGoldenCase(i)); + } + const removed = pruneSuite(config, 50); + expect(removed).toBe(0); + expect(loadSuite(config)).toHaveLength(5); + }); + + test("pruneSuite removes oldest entries when suite exceeds cap", () => { + const config = testConfig(); + // Add 10 cases of increasing age: case i is i days old (0 = newest, 9 = oldest) + for (let i = 0; i < 10; i++) { + addCase(config, makeGoldenCase(i, i)); + } + expect(loadSuite(config)).toHaveLength(10); + + const removed = pruneSuite(config, 5); + expect(removed).toBe(5); + + const remaining = loadSuite(config); + expect(remaining).toHaveLength(5); + + // Remaining should be the 5 newest (days ago 0-4) + for (const entry of remaining) { + const id = Number.parseInt(entry.id.replace("golden-", ""), 10); + expect(id).toBeLessThan(5); + } + }); + + test("pruneSuite with max_golden_suite_size defaults to 50", () => { + const config = testConfig(); + // Default cap is 50 from the config + expect(config.judges.max_golden_suite_size).toBe(50); + }); + + test("pruneSuite handles empty suite", () => { + const config = testConfig(); + const removed = pruneSuite(config, 50); + expect(removed).toBe(0); + }); + + test("pruneSuite handles suite at exactly the cap", () => { + const config = testConfig(); + for (let i = 0; i < 5; i++) { + addCase(config, makeGoldenCase(i)); + } + const removed = pruneSuite(config, 5); + expect(removed).toBe(0); + expect(loadSuite(config)).toHaveLength(5); + }); + + test("pruneSuite keeps newest entries when exceeding cap by 1", () => { + const config = testConfig(); + // oldest first, then newest + addCase(config, makeGoldenCase(0, 10)); + addCase(config, makeGoldenCase(1, 0)); + + const removed = pruneSuite(config, 1); + expect(removed).toBe(1); + + const remaining = loadSuite(config); + 
expect(remaining).toHaveLength(1); + // The newest entry (days ago 0) should remain + expect(remaining[0].id).toBe("golden-1"); + }); +}); diff --git a/src/evolution/__tests__/judge-activation.test.ts b/src/evolution/__tests__/judge-activation.test.ts new file mode 100644 index 0000000..59935ce --- /dev/null +++ b/src/evolution/__tests__/judge-activation.test.ts @@ -0,0 +1,205 @@ +import { afterEach, beforeEach, describe, expect, test } from "bun:test"; +import { mkdirSync, rmSync, writeFileSync } from "node:fs"; +import { EvolutionEngine } from "../engine.ts"; + +const TEST_DIR = "/tmp/phantom-test-judge-activation"; +const CONFIG_PATH = `${TEST_DIR}/config/evolution.yaml`; + +function setupWithJudgeMode(enabled: "auto" | "always" | "never"): void { + mkdirSync(`${TEST_DIR}/config`, { recursive: true }); + mkdirSync(`${TEST_DIR}/phantom-config/meta`, { recursive: true }); + mkdirSync(`${TEST_DIR}/phantom-config/strategies`, { recursive: true }); + mkdirSync(`${TEST_DIR}/phantom-config/memory`, { recursive: true }); + + writeFileSync( + CONFIG_PATH, + [ + "cadence:", + " reflection_interval: 1", + " consolidation_interval: 10", + "gates:", + " drift_threshold: 0.7", + " max_file_lines: 200", + " auto_rollback_threshold: 0.1", + " auto_rollback_window: 5", + "reflection:", + ' model: "claude-sonnet-4-20250514"', + "judges:", + ` enabled: "${enabled}"`, + "paths:", + ` config_dir: "${TEST_DIR}/phantom-config"`, + ` constitution: "${TEST_DIR}/phantom-config/constitution.md"`, + ` version_file: "${TEST_DIR}/phantom-config/meta/version.json"`, + ` metrics_file: "${TEST_DIR}/phantom-config/meta/metrics.json"`, + ` evolution_log: "${TEST_DIR}/phantom-config/meta/evolution-log.jsonl"`, + ` golden_suite: "${TEST_DIR}/phantom-config/meta/golden-suite.jsonl"`, + ` session_log: "${TEST_DIR}/phantom-config/memory/session-log.jsonl"`, + ].join("\n"), + "utf-8", + ); + + writeFileSync(`${TEST_DIR}/phantom-config/constitution.md`, "# Constitution\n1. 
Be honest.\n", "utf-8"); + writeFileSync(`${TEST_DIR}/phantom-config/persona.md`, "", "utf-8"); + writeFileSync(`${TEST_DIR}/phantom-config/user-profile.md`, "", "utf-8"); + writeFileSync(`${TEST_DIR}/phantom-config/domain-knowledge.md`, "", "utf-8"); + writeFileSync(`${TEST_DIR}/phantom-config/strategies/task-patterns.md`, "", "utf-8"); + writeFileSync(`${TEST_DIR}/phantom-config/strategies/tool-preferences.md`, "", "utf-8"); + writeFileSync(`${TEST_DIR}/phantom-config/strategies/error-recovery.md`, "", "utf-8"); + writeFileSync(`${TEST_DIR}/phantom-config/memory/session-log.jsonl`, "", "utf-8"); + writeFileSync( + `${TEST_DIR}/phantom-config/meta/version.json`, + JSON.stringify({ + version: 0, + parent: null, + timestamp: new Date().toISOString(), + changes: [], + metrics_at_change: { session_count: 0, success_rate_7d: 0, correction_rate_7d: 0 }, + }), + "utf-8", + ); + writeFileSync( + `${TEST_DIR}/phantom-config/meta/metrics.json`, + JSON.stringify({ + session_count: 0, + success_count: 0, + failure_count: 0, + correction_count: 0, + evolution_count: 0, + rollback_count: 0, + last_session_at: null, + last_evolution_at: null, + success_rate_7d: 0, + correction_rate_7d: 0, + sessions_since_consolidation: 0, + }), + "utf-8", + ); + writeFileSync(`${TEST_DIR}/phantom-config/meta/evolution-log.jsonl`, "", "utf-8"); + writeFileSync(`${TEST_DIR}/phantom-config/meta/golden-suite.jsonl`, "", "utf-8"); +} + +let savedApiKey: string | undefined; + +describe("Judge Activation", () => { + beforeEach(() => { + savedApiKey = process.env.ANTHROPIC_API_KEY; + }); + + afterEach(() => { + if (savedApiKey !== undefined) { + process.env.ANTHROPIC_API_KEY = savedApiKey; + } else { + process.env.ANTHROPIC_API_KEY = undefined; + } + rmSync(TEST_DIR, { recursive: true, force: true }); + }); + + test("auto mode enables judges when ANTHROPIC_API_KEY is set", () => { + process.env.ANTHROPIC_API_KEY = "sk-test-key"; + setupWithJudgeMode("auto"); + const engine = new 
EvolutionEngine(CONFIG_PATH); + expect(engine.usesLLMJudges()).toBe(true); + }); + + test("auto mode disables judges when ANTHROPIC_API_KEY is missing", () => { + process.env.ANTHROPIC_API_KEY = undefined; + setupWithJudgeMode("auto"); + const engine = new EvolutionEngine(CONFIG_PATH); + expect(engine.usesLLMJudges()).toBe(false); + }); + + test("never mode disables judges even when API key is set", () => { + process.env.ANTHROPIC_API_KEY = "sk-test-key"; + setupWithJudgeMode("never"); + const engine = new EvolutionEngine(CONFIG_PATH); + expect(engine.usesLLMJudges()).toBe(false); + }); + + test("always mode enables judges regardless of API key", () => { + process.env.ANTHROPIC_API_KEY = undefined; + setupWithJudgeMode("always"); + const engine = new EvolutionEngine(CONFIG_PATH); + expect(engine.usesLLMJudges()).toBe(true); + }); + + test("usesLLMJudges accessor matches resolved state", () => { + process.env.ANTHROPIC_API_KEY = "sk-test-key"; + setupWithJudgeMode("auto"); + const engine = new EvolutionEngine(CONFIG_PATH); + expect(engine.usesLLMJudges()).toBe(true); + + process.env.ANTHROPIC_API_KEY = undefined; + setupWithJudgeMode("auto"); + const engine2 = new EvolutionEngine(CONFIG_PATH); + expect(engine2.usesLLMJudges()).toBe(false); + }); + + test("missing judges section defaults to auto mode", () => { + process.env.ANTHROPIC_API_KEY = undefined; + mkdirSync(`${TEST_DIR}/config`, { recursive: true }); + mkdirSync(`${TEST_DIR}/phantom-config/meta`, { recursive: true }); + mkdirSync(`${TEST_DIR}/phantom-config/strategies`, { recursive: true }); + mkdirSync(`${TEST_DIR}/phantom-config/memory`, { recursive: true }); + + // Config without judges section + writeFileSync( + CONFIG_PATH, + [ + "cadence:", + " reflection_interval: 1", + "paths:", + ` config_dir: "${TEST_DIR}/phantom-config"`, + ` constitution: "${TEST_DIR}/phantom-config/constitution.md"`, + ` version_file: "${TEST_DIR}/phantom-config/meta/version.json"`, + ` metrics_file: 
"${TEST_DIR}/phantom-config/meta/metrics.json"`, + ` evolution_log: "${TEST_DIR}/phantom-config/meta/evolution-log.jsonl"`, + ` golden_suite: "${TEST_DIR}/phantom-config/meta/golden-suite.jsonl"`, + ` session_log: "${TEST_DIR}/phantom-config/memory/session-log.jsonl"`, + ].join("\n"), + "utf-8", + ); + + writeFileSync(`${TEST_DIR}/phantom-config/constitution.md`, "# Constitution\n", "utf-8"); + writeFileSync(`${TEST_DIR}/phantom-config/persona.md`, "", "utf-8"); + writeFileSync(`${TEST_DIR}/phantom-config/user-profile.md`, "", "utf-8"); + writeFileSync(`${TEST_DIR}/phantom-config/domain-knowledge.md`, "", "utf-8"); + writeFileSync(`${TEST_DIR}/phantom-config/strategies/task-patterns.md`, "", "utf-8"); + writeFileSync(`${TEST_DIR}/phantom-config/strategies/tool-preferences.md`, "", "utf-8"); + writeFileSync(`${TEST_DIR}/phantom-config/strategies/error-recovery.md`, "", "utf-8"); + writeFileSync(`${TEST_DIR}/phantom-config/memory/session-log.jsonl`, "", "utf-8"); + writeFileSync( + `${TEST_DIR}/phantom-config/meta/version.json`, + JSON.stringify({ + version: 0, + parent: null, + timestamp: new Date().toISOString(), + changes: [], + metrics_at_change: { session_count: 0, success_rate_7d: 0, correction_rate_7d: 0 }, + }), + "utf-8", + ); + writeFileSync( + `${TEST_DIR}/phantom-config/meta/metrics.json`, + JSON.stringify({ + session_count: 0, + success_count: 0, + failure_count: 0, + correction_count: 0, + evolution_count: 0, + rollback_count: 0, + last_session_at: null, + last_evolution_at: null, + success_rate_7d: 0, + correction_rate_7d: 0, + sessions_since_consolidation: 0, + }), + "utf-8", + ); + writeFileSync(`${TEST_DIR}/phantom-config/meta/evolution-log.jsonl`, "", "utf-8"); + writeFileSync(`${TEST_DIR}/phantom-config/meta/golden-suite.jsonl`, "", "utf-8"); + + // No API key + auto = disabled + const engine = new EvolutionEngine(CONFIG_PATH); + expect(engine.usesLLMJudges()).toBe(false); + }); +}); diff --git a/src/evolution/__tests__/metrics.test.ts 
b/src/evolution/__tests__/metrics.test.ts index 2a640b4..fcd24e4 100644 --- a/src/evolution/__tests__/metrics.test.ts +++ b/src/evolution/__tests__/metrics.test.ts @@ -18,6 +18,7 @@ function testConfig(): EvolutionConfig { cadence: { reflection_interval: 1, consolidation_interval: 10, full_review_interval: 50, drift_check_interval: 20 }, gates: { drift_threshold: 0.7, max_file_lines: 200, auto_rollback_threshold: 0.1, auto_rollback_window: 5 }, reflection: { model: "claude-sonnet-4-20250514", effort: "high", max_budget_usd: 0.5 }, + judges: { enabled: "auto", cost_cap_usd_per_day: 50.0, max_golden_suite_size: 50 }, paths: { config_dir: TEST_DIR, constitution: `${TEST_DIR}/constitution.md`, diff --git a/src/evolution/__tests__/validation.test.ts b/src/evolution/__tests__/validation.test.ts index 9c42ba4..9eae6ce 100644 --- a/src/evolution/__tests__/validation.test.ts +++ b/src/evolution/__tests__/validation.test.ts @@ -20,6 +20,7 @@ function testConfig(): EvolutionConfig { cadence: { reflection_interval: 1, consolidation_interval: 10, full_review_interval: 50, drift_check_interval: 20 }, gates: { drift_threshold: 0.7, max_file_lines: 200, auto_rollback_threshold: 0.1, auto_rollback_window: 5 }, reflection: { model: "claude-sonnet-4-20250514", effort: "high", max_budget_usd: 0.5 }, + judges: { enabled: "auto", cost_cap_usd_per_day: 50.0, max_golden_suite_size: 50 }, paths: { config_dir: TEST_DIR, constitution: `${TEST_DIR}/constitution.md`, diff --git a/src/evolution/__tests__/versioning.test.ts b/src/evolution/__tests__/versioning.test.ts index 2703f35..72868ca 100644 --- a/src/evolution/__tests__/versioning.test.ts +++ b/src/evolution/__tests__/versioning.test.ts @@ -11,6 +11,7 @@ function testConfig(): EvolutionConfig { cadence: { reflection_interval: 1, consolidation_interval: 10, full_review_interval: 50, drift_check_interval: 20 }, gates: { drift_threshold: 0.7, max_file_lines: 200, auto_rollback_threshold: 0.1, auto_rollback_window: 5 }, reflection: { model: 
"claude-sonnet-4-20250514", effort: "high", max_budget_usd: 0.5 }, + judges: { enabled: "auto", cost_cap_usd_per_day: 50.0, max_golden_suite_size: 50 }, paths: { config_dir: TEST_DIR, constitution: `${TEST_DIR}/constitution.md`, diff --git a/src/evolution/config.ts b/src/evolution/config.ts index abbcd89..ea2f6d2 100644 --- a/src/evolution/config.ts +++ b/src/evolution/config.ts @@ -26,6 +26,13 @@ export const EvolutionConfigSchema = z.object({ max_budget_usd: z.number().positive().default(0.5), }) .default({}), + judges: z + .object({ + enabled: z.enum(["auto", "always", "never"]).default("auto"), + cost_cap_usd_per_day: z.number().positive().default(50.0), + max_golden_suite_size: z.number().int().positive().default(50), + }) + .default({}), paths: z .object({ config_dir: z.string().default("phantom-config"), diff --git a/src/evolution/engine.ts b/src/evolution/engine.ts index 87bd38d..270b266 100644 --- a/src/evolution/engine.ts +++ b/src/evolution/engine.ts @@ -4,8 +4,7 @@ import { applyApproved } from "./application.ts"; import { type EvolutionConfig, loadEvolutionConfig } from "./config.ts"; import { recordObservations, runConsolidation } from "./consolidation.ts"; import { ConstitutionChecker } from "./constitution.ts"; -import { addCase } from "./golden-suite.ts"; -import { loadSuite } from "./golden-suite.ts"; +import { addCase, loadSuite, pruneSuite } from "./golden-suite.ts"; import { runQualityJudge } from "./judges/quality-judge.ts"; import { type JudgeCosts, emptyJudgeCosts } from "./judges/types.ts"; import { @@ -30,24 +29,34 @@ import { getHistory, readVersion, rollback as versionRollback } from "./versioni export class EvolutionEngine { private config: EvolutionConfig; private checker: ConstitutionChecker; - private useLLMJudges: boolean; + private llmJudgesEnabled: boolean; + private dailyCostUsd = 0; + private dailyCostResetDate = ""; - constructor(configPath?: string, useLLMJudges = false) { + constructor(configPath?: string) { this.config = 
loadEvolutionConfig(configPath); this.checker = new ConstitutionChecker(this.config); - this.useLLMJudges = useLLMJudges; + this.llmJudgesEnabled = this.resolveJudgeMode(); + if (this.llmJudgesEnabled) { + console.log("[evolution] LLM judges enabled (API key detected)"); + } else { + console.log("[evolution] LLM judges disabled (no API key or config override)"); + } } - getEvolutionConfig(): EvolutionConfig { - return this.config; + private resolveJudgeMode(): boolean { + const setting = this.config.judges?.enabled ?? "auto"; + if (setting === "never") return false; + if (setting === "always") return true; + return !!process.env.ANTHROPIC_API_KEY; } - enableLLMJudges(): void { - this.useLLMJudges = true; + usesLLMJudges(): boolean { + return this.llmJudgesEnabled; } - disableLLMJudges(): void { - this.useLLMJudges = false; + getEvolutionConfig(): EvolutionConfig { + return this.config; } /** @@ -62,7 +71,7 @@ export class EvolutionEngine { // Step 1: Observation Extraction (LLM or heuristic) let observations: import("./types.ts").SessionObservation[]; - if (this.useLLMJudges) { + if (this.llmJudgesEnabled && !this.isDailyCostCapReached()) { const currentConfig = this.getConfig(); const result = await extractObservationsWithLLM(session, currentConfig); observations = result.observations; @@ -98,7 +107,7 @@ export class EvolutionEngine { const goldenSuite = loadSuite(this.config); let validationResults: import("./types.ts").ValidationResult[]; - if (this.useLLMJudges) { + if (this.llmJudgesEnabled && !this.isDailyCostCapReached()) { const judgeResult = await validateAllWithJudges(deltas, this.checker, goldenSuite, this.config, currentConfig); validationResults = judgeResult.results; mergeCosts(judgeCosts, judgeResult.judgeCosts); @@ -139,7 +148,7 @@ export class EvolutionEngine { } // Quality Assessment (LLM only, non-blocking) - if (this.useLLMJudges) { + if (this.llmJudgesEnabled && !this.isDailyCostCapReached()) { try { const qualityResult = await 
runQualityJudge(session, currentConfig); judgeCosts.quality_assessment.calls++; @@ -184,11 +193,15 @@ export class EvolutionEngine { this.rollback(this.getCurrentVersion() - 1); } - // Record judge costs - if (this.useLLMJudges) { + // Record judge costs and update daily tracking + if (this.llmJudgesEnabled) { this.recordJudgeCosts(judgeCosts); + this.trackDailyCost(judgeCosts); } + // Enforce golden suite cap + this.pruneGoldenSuite(); + return { version: this.getCurrentVersion(), changes_applied: applied, @@ -236,6 +249,41 @@ export class EvolutionEngine { console.log(`[evolution] Rolled back to version ${toVersion}`); } + private resetDailyCostIfNewDay(): void { + const today = new Date().toISOString().slice(0, 10); + if (this.dailyCostResetDate !== today) { + this.dailyCostUsd = 0; + this.dailyCostResetDate = today; + } + } + + private isDailyCostCapReached(): boolean { + this.resetDailyCostIfNewDay(); + const cap = this.config.judges?.cost_cap_usd_per_day ?? 50.0; + if (this.dailyCostUsd >= cap) { + console.warn( + `[evolution] Daily cost cap reached ($${this.dailyCostUsd.toFixed(2)} >= $${cap}), using heuristics`, + ); + return true; + } + return false; + } + + private trackDailyCost(costs: JudgeCosts): void { + this.resetDailyCostIfNewDay(); + for (const key of Object.keys(costs) as Array) { + this.dailyCostUsd += costs[key].totalUsd; + } + } + + private pruneGoldenSuite(): void { + const maxSize = this.config.judges?.max_golden_suite_size ?? 
50; + const removed = pruneSuite(this.config, maxSize); + if (removed > 0) { + console.log(`[evolution] Pruned ${removed} oldest golden suite entries (cap: ${maxSize})`); + } + } + private recordJudgeCosts(costs: JudgeCosts): void { const metricsPath = this.config.paths.metrics_file; try { diff --git a/src/evolution/golden-suite.ts b/src/evolution/golden-suite.ts index 22f780f..5643a9a 100644 --- a/src/evolution/golden-suite.ts +++ b/src/evolution/golden-suite.ts @@ -45,6 +45,22 @@ export function addCase(config: EvolutionConfig, goldenCase: GoldenCase): void { } } +/** + * Prune the golden suite to the given max size, removing oldest entries. + * No-op if the suite is within the limit. + */ +export function pruneSuite(config: EvolutionConfig, maxSize: number): number { + const suite = loadSuite(config); + if (suite.length <= maxSize) return 0; + + const sorted = suite.sort((a, b) => b.created_at.localeCompare(a.created_at)); + const pruned = sorted.slice(0, maxSize); + const content = pruned.map((c) => JSON.stringify(c)).join("\n"); + writeFileSync(config.paths.golden_suite, `${content}\n`, "utf-8"); + + return suite.length - maxSize; +} + /** * Run the golden suite against a proposed change description. * Returns cases that might be affected. 
diff --git a/src/evolution/judges/client.ts b/src/evolution/judges/client.ts index 088f54d..6254e99 100644 --- a/src/evolution/judges/client.ts +++ b/src/evolution/judges/client.ts @@ -1,6 +1,7 @@ import Anthropic from "@anthropic-ai/sdk"; import { zodOutputFormat } from "@anthropic-ai/sdk/helpers/zod"; -import type { z } from "zod"; +// zod/v4 required: matches schemas.ts for zodOutputFormat compatibility +import type { z } from "zod/v4"; import { JUDGE_MAX_TOKENS, JUDGE_TEMPERATURE, @@ -23,6 +24,10 @@ export function setClient(client: Anthropic | null): void { _client = client; } +export function isJudgeAvailable(): boolean { + return !!process.env.ANTHROPIC_API_KEY; +} + /** * Call a single LLM judge with structured output. * Uses the raw Anthropic SDK (not the Agent SDK). @@ -46,7 +51,9 @@ export async function callJudge(options: { system: options.systemPrompt, messages: [{ role: "user", content: options.userMessage }], output_config: { - format: zodOutputFormat(options.schema), + // Cast needed: SDK .d.ts references zod v3 types but runtime uses zod/v4 + // biome-ignore lint/suspicious/noExplicitAny: bridging zod v3/v4 type mismatch + format: zodOutputFormat(options.schema as any), }, }); diff --git a/src/evolution/judges/schemas.ts b/src/evolution/judges/schemas.ts index b9fb0ef..a96f69d 100644 --- a/src/evolution/judges/schemas.ts +++ b/src/evolution/judges/schemas.ts @@ -1,4 +1,5 @@ -import { z } from "zod"; +// zod/v4 required: the Anthropic SDK's zodOutputFormat reads schema._zod.def (v4 only) +import { z } from "zod/v4"; // -- Observation Extraction -- diff --git a/src/evolution/judges/types.ts b/src/evolution/judges/types.ts index 6402013..755972d 100644 --- a/src/evolution/judges/types.ts +++ b/src/evolution/judges/types.ts @@ -1,6 +1,6 @@ -export const JUDGE_MODEL_SONNET = "claude-sonnet-4-6-20250514"; -export const JUDGE_MODEL_HAIKU = "claude-haiku-4-5-20250929"; -export const JUDGE_MODEL_OPUS = "claude-opus-4-6-20250918"; +export const 
JUDGE_MODEL_SONNET = "claude-sonnet-4-6"; +export const JUDGE_MODEL_HAIKU = "claude-haiku-4-5"; +export const JUDGE_MODEL_OPUS = "claude-opus-4-6"; export const JUDGE_TIMEOUT_MS = 30_000; export const JUDGE_MAX_TOKENS = 4096; diff --git a/src/index.ts b/src/index.ts index 4a25539..7c8e6e7 100644 --- a/src/index.ts +++ b/src/index.ts @@ -37,7 +37,7 @@ import { PeerHealthMonitor } from "./mcp/peer-health.ts"; import { PeerManager } from "./mcp/peers.ts"; import { PhantomMcpServer } from "./mcp/server.ts"; import { loadMemoryConfig } from "./memory/config.ts"; -import { type SessionData, consolidateSession } from "./memory/consolidation.ts"; +import { type SessionData, consolidateSession, consolidateSessionWithLLM } from "./memory/consolidation.ts"; import { MemoryContextBuilder } from "./memory/context-builder.ts"; import { MemorySystem } from "./memory/system.ts"; import { isFirstRun, isOnboardingInProgress } from "./onboarding/detection.ts"; @@ -99,7 +99,8 @@ async function main(): Promise { try { evolution = new EvolutionEngine(); const currentVersion = evolution.getCurrentVersion(); - console.log(`[evolution] Engine initialized (v${currentVersion})`); + const judgeMode = evolution.usesLLMJudges() ? "LLM judges" : "heuristic"; + console.log(`[evolution] Engine initialized (v${currentVersion}, ${judgeMode})`); setEvolutionVersionProvider(() => evolution?.getCurrentVersion() ?? 0); } catch (err: unknown) { const msg = err instanceof Error ? err.message : String(err); @@ -491,19 +492,38 @@ async function main(): Promise { outcome: response.text.startsWith("Error:") ? "failure" : "success", }; - consolidateSession(memory, sessionData) - .then((result) => { - if (result.episodesCreated > 0 || result.factsExtracted > 0) { - console.log( - `[memory] Consolidated: ${result.episodesCreated} episodes, ` + - `${result.factsExtracted} facts (${result.durationMs}ms)`, - ); - } - }) - .catch((err: unknown) => { - const errMsg = err instanceof Error ? 
err.message : String(err); - console.warn(`[memory] Consolidation failed: ${errMsg}`); - }); + const useLLMConsolidation = evolution?.usesLLMJudges() ?? false; + if (useLLMConsolidation) { + const evolvedConfig = evolution?.getConfig(); + const existingFacts = evolvedConfig ? `${evolvedConfig.userProfile}\n${evolvedConfig.domainKnowledge}` : ""; + consolidateSessionWithLLM(memory, sessionData, existingFacts) + .then(({ result }) => { + if (result.episodesCreated > 0 || result.factsExtracted > 0) { + console.log( + `[memory] Consolidated (LLM): ${result.episodesCreated} episodes, ` + + `${result.factsExtracted} facts (${result.durationMs}ms)`, + ); + } + }) + .catch((err: unknown) => { + const errMsg = err instanceof Error ? err.message : String(err); + console.warn(`[memory] LLM consolidation failed: ${errMsg}`); + }); + } else { + consolidateSession(memory, sessionData) + .then((result) => { + if (result.episodesCreated > 0 || result.factsExtracted > 0) { + console.log( + `[memory] Consolidated: ${result.episodesCreated} episodes, ` + + `${result.factsExtracted} facts (${result.durationMs}ms)`, + ); + } + }) + .catch((err: unknown) => { + const errMsg = err instanceof Error ? err.message : String(err); + console.warn(`[memory] Consolidation failed: ${errMsg}`); + }); + } } // Evolution pipeline (non-blocking) From 2bca8252515d3af4f71aabf40fc4dd45dfdfbcf8 Mon Sep 17 00:00:00 2001 From: Muhammad Ahmed Cheema Date: Mon, 30 Mar 2026 21:09:36 -0700 Subject: [PATCH 2/3] fix: resolve pre-existing lint errors in dynamic-handlers test Replace `delete process.env.X` with `process.env.X = undefined` to satisfy Biome's noDelete rule, and fix import ordering. These were pre-existing lint failures unrelated to the judge activation work. 
--- src/mcp/__tests__/dynamic-handlers.test.ts | 10 +++++----- 1 file changed, 5 insertions(+), 5 deletions(-) diff --git a/src/mcp/__tests__/dynamic-handlers.test.ts b/src/mcp/__tests__/dynamic-handlers.test.ts index 1c5376b..86aa33b 100644 --- a/src/mcp/__tests__/dynamic-handlers.test.ts +++ b/src/mcp/__tests__/dynamic-handlers.test.ts @@ -1,6 +1,6 @@ import { describe, expect, test } from "bun:test"; -import type { DynamicToolDef } from "../dynamic-tools.ts"; import { buildSafeEnv, executeDynamicHandler } from "../dynamic-handlers.ts"; +import type { DynamicToolDef } from "../dynamic-tools.ts"; describe("buildSafeEnv", () => { test("includes only safe environment variables", () => { @@ -24,7 +24,7 @@ describe("buildSafeEnv", () => { if (origKey !== undefined) { process.env.ANTHROPIC_API_KEY = origKey; } else { - delete process.env.ANTHROPIC_API_KEY; + process.env.ANTHROPIC_API_KEY = undefined; } } }); @@ -40,7 +40,7 @@ describe("buildSafeEnv", () => { if (origToken !== undefined) { process.env.SLACK_BOT_TOKEN = origToken; } else { - delete process.env.SLACK_BOT_TOKEN; + process.env.SLACK_BOT_TOKEN = undefined; } } }); @@ -82,7 +82,7 @@ describe("executeDynamicHandler", () => { if (origKey !== undefined) { process.env.ANTHROPIC_API_KEY = origKey; } else { - delete process.env.ANTHROPIC_API_KEY; + process.env.ANTHROPIC_API_KEY = undefined; } } }); @@ -154,7 +154,7 @@ describe("executeDynamicHandler", () => { if (origKey !== undefined) { process.env.ANTHROPIC_API_KEY = origKey; } else { - delete process.env.ANTHROPIC_API_KEY; + process.env.ANTHROPIC_API_KEY = undefined; } } }); From 9484ddb02508b4c712f7fbe60c3b487bbc35c202 Mon Sep 17 00:00:00 2001 From: Muhammad Ahmed Cheema Date: Mon, 30 Mar 2026 21:18:31 -0700 Subject: [PATCH 3/3] fix: respect daily cost cap in memory consolidation, track costs incrementally Addresses two review findings: 1. 
Memory consolidation now checks the daily cost cap before invoking the LLM judge, and tracks the returned cost toward the daily total. Added isWithinCostCap() and trackExternalJudgeCost() to the engine. 2. Cost tracking within afterSession() is now incremental. Each LLM stage updates the daily counter immediately, so later stages see prior costs and fall back to heuristics when the cap is reached. --- src/evolution/engine.ts | 31 +++++++++++++++++++++++++------ src/index.ts | 7 +++++-- 2 files changed, 30 insertions(+), 8 deletions(-) diff --git a/src/evolution/engine.ts b/src/evolution/engine.ts index 270b266..c32b4df 100644 --- a/src/evolution/engine.ts +++ b/src/evolution/engine.ts @@ -55,6 +55,17 @@ export class EvolutionEngine { return this.llmJudgesEnabled; } + /** Memory consolidation runs outside afterSession() but still needs to respect the daily cap. */ + isWithinCostCap(): boolean { + return !this.isDailyCostCapReached(); + } + + /** Consolidation judge costs happen outside the evolution pipeline but count toward the daily cap. 
*/ + trackExternalJudgeCost(cost: { totalUsd: number }): void { + this.resetDailyCostIfNewDay(); + this.dailyCostUsd += cost.totalUsd; + } + getEvolutionConfig(): EvolutionConfig { return this.config; } @@ -77,6 +88,7 @@ export class EvolutionEngine { observations = result.observations; if (result.judgeCost) { addCost(judgeCosts.observation_extraction, result.judgeCost); + this.incrementDailyCost(result.judgeCost.totalUsd); } } else { observations = extractObservations(session); @@ -111,6 +123,7 @@ export class EvolutionEngine { const judgeResult = await validateAllWithJudges(deltas, this.checker, goldenSuite, this.config, currentConfig); validationResults = judgeResult.results; mergeCosts(judgeCosts, judgeResult.judgeCosts); + this.incrementDailyCost(totalCostFromJudgeCosts(judgeResult.judgeCosts)); } else { validationResults = validateAll(deltas, this.checker, goldenSuite, this.config); } @@ -155,6 +168,7 @@ export class EvolutionEngine { judgeCosts.quality_assessment.totalUsd += qualityResult.costUsd; judgeCosts.quality_assessment.totalInputTokens += qualityResult.inputTokens; judgeCosts.quality_assessment.totalOutputTokens += qualityResult.outputTokens; + this.incrementDailyCost(qualityResult.costUsd); if (qualityResult.data.regression_signal) { console.warn( @@ -193,10 +207,9 @@ export class EvolutionEngine { this.rollback(this.getCurrentVersion() - 1); } - // Record judge costs and update daily tracking + // Record judge costs to persistent metrics (daily tracking already done incrementally above) if (this.llmJudgesEnabled) { this.recordJudgeCosts(judgeCosts); - this.trackDailyCost(judgeCosts); } // Enforce golden suite cap @@ -269,11 +282,9 @@ export class EvolutionEngine { return false; } - private trackDailyCost(costs: JudgeCosts): void { + private incrementDailyCost(usd: number): void { this.resetDailyCostIfNewDay(); - for (const key of Object.keys(costs) as Array) { - this.dailyCostUsd += costs[key].totalUsd; - } + this.dailyCostUsd += usd; } private 
pruneGoldenSuite(): void { @@ -325,3 +336,11 @@ function mergeCosts(target: JudgeCosts, source: JudgeCosts): void { addCost(target[key], source[key]); } } + +function totalCostFromJudgeCosts(costs: JudgeCosts): number { + let total = 0; + for (const key of Object.keys(costs) as Array) { + total += costs[key].totalUsd; + } + return total; +} diff --git a/src/index.ts b/src/index.ts index 7c8e6e7..47fd660 100644 --- a/src/index.ts +++ b/src/index.ts @@ -492,12 +492,15 @@ async function main(): Promise { outcome: response.text.startsWith("Error:") ? "failure" : "success", }; - const useLLMConsolidation = evolution?.usesLLMJudges() ?? false; + const useLLMConsolidation = evolution?.usesLLMJudges() && evolution.isWithinCostCap(); if (useLLMConsolidation) { const evolvedConfig = evolution?.getConfig(); const existingFacts = evolvedConfig ? `${evolvedConfig.userProfile}\n${evolvedConfig.domainKnowledge}` : ""; consolidateSessionWithLLM(memory, sessionData, existingFacts) - .then(({ result }) => { + .then(({ result, judgeCost }) => { + if (judgeCost) { + evolution?.trackExternalJudgeCost(judgeCost); + } if (result.episodesCreated > 0 || result.factsExtracted > 0) { console.log( `[memory] Consolidated (LLM): ${result.episodesCreated} episodes, ` +