diff --git a/src/__tests__/agent-runner.test.ts b/src/__tests__/agent-runner.test.ts index 6cbf505..f0d7f48 100644 --- a/src/__tests__/agent-runner.test.ts +++ b/src/__tests__/agent-runner.test.ts @@ -1,6 +1,6 @@ import { describe, it, before, after } from "node:test"; import assert from "node:assert/strict"; -import { AgentRunner } from "../agent-runner.js"; +import { AgentRunner, type ReviewResult } from "../agent-runner.js"; import { createTask } from "../types.js"; import { mkdtempSync, writeFileSync, mkdirSync, rmSync } from "node:fs"; import { join } from "node:path"; @@ -345,4 +345,97 @@ describe("AgentRunner", () => { assert.strictEqual(task.events[0].type, "evt-5"); assert.strictEqual(task.events[199].type, "evt-204"); }); + + // ── pickReviewAgent ── + + it("pickReviewAgent selects codex to review claude's work", () => { + assert.strictEqual(AgentRunner.pickReviewAgent("claude"), "codex"); + }); + + it("pickReviewAgent selects claude to review codex's work", () => { + assert.strictEqual(AgentRunner.pickReviewAgent("codex"), "claude"); + }); + + it("pickReviewAgent selects codex to review claude-sdk's work", () => { + assert.strictEqual(AgentRunner.pickReviewAgent("claude-sdk"), "codex"); + }); + + it("pickReviewAgent selects claude for generic agents", () => { + assert.strictEqual(AgentRunner.pickReviewAgent("aider --yes"), "claude"); + assert.strictEqual(AgentRunner.pickReviewAgent("custom-agent"), "claude"); + }); + + // ── parseReviewResponse ── + + it("parseReviewResponse parses clean JSON", () => { + const runner = new AgentRunner(); + const parse = (runner as unknown as { parseReviewResponse: (s: string) => ReviewResult | null }).parseReviewResponse.bind(runner); + const result = parse('{"approve": true, "score": 85, "issues": [], "suggestions": ["looks good"]}'); + assert.ok(result); + assert.strictEqual(result.approve, true); + assert.strictEqual(result.score, 85); + assert.deepStrictEqual(result.suggestions, ["looks good"]); + }); + + it("parseReviewResponse extracts JSON from surrounding text", () => { + const runner = new AgentRunner(); + const parse = (runner as unknown as { parseReviewResponse: (s: string) => ReviewResult | null }).parseReviewResponse.bind(runner); + const result = parse('Here is my review:\n{"approve": false, "score": 30, "issues": ["bug found"], "suggestions": []}\nDone.'); + assert.ok(result); + assert.strictEqual(result.approve, false); + assert.strictEqual(result.score, 30); + assert.deepStrictEqual(result.issues, ["bug found"]); + }); + + it("parseReviewResponse returns null for unparseable output", () => { + const runner = new AgentRunner(); + const parse = (runner as unknown as { parseReviewResponse: (s: string) => ReviewResult | null }).parseReviewResponse.bind(runner); + assert.strictEqual(parse("I can't produce JSON right now"), null); + }); + + it("parseReviewResponse clamps score to 0-100", () => { + const runner = new AgentRunner(); + const parse = (runner as unknown as { parseReviewResponse: (s: string) => ReviewResult | null }).parseReviewResponse.bind(runner); + const result = parse('{"approve": true, "score": 150, "issues": [], "suggestions": []}'); + assert.ok(result); + assert.strictEqual(result.score, 100); + }); + + it("parseReviewResponse returns null when approve or score is missing/wrong type", () => { + const runner = new AgentRunner(); + const parse = (runner as unknown as { parseReviewResponse: (s: string) => ReviewResult | null }).parseReviewResponse.bind(runner); + assert.strictEqual(parse('{"score": 80, "issues": []}'), null); // missing approve + assert.strictEqual(parse('{"approve": "yes", "score": 80}'), null); // approve is string + assert.strictEqual(parse('{"approve": true, "issues": []}'), null); // missing score + }); + + // ── reviewDiff approve field ── + + it("reviewDiff includes approve field based on score threshold", () => { + const runner = new AgentRunner(); + const clean = runner.reviewDiff("diff --git a/foo.ts\n+const x = 1;"); + assert.strictEqual(typeof clean.approve, "boolean"); + assert.strictEqual(clean.approve, true); // score=50 >= 40 + + const bad = runner.reviewDiff("diff --git a/foo.ts\n+console.log('x');\n+console.log('y');\n+console.log('z');"); + // console.log penalty takes score below 40 + // Actually score=50-10=40, still >=40 + assert.strictEqual(typeof bad.approve, "boolean"); + }); + + // ── reviewDiffWithAgent fallback ── + + it("reviewDiffWithAgent falls back to heuristic when agent fails", async () => { + const runner = new AgentRunner(); + // codex not available in test env → will fail → should fall back to heuristic + const result = await runner.reviewDiffWithAgent( + "diff --git a/foo.test.ts b/foo.test.ts\n+it('works', () => {});", + "claude", // task was by claude → review by codex → codex fails → fallback + 2, // 2 second timeout to keep test fast + ); + assert.strictEqual(typeof result.approve, "boolean"); + assert.strictEqual(typeof result.score, "number"); + assert.ok(Array.isArray(result.issues)); + assert.ok(Array.isArray(result.suggestions)); + }); }); diff --git a/src/__tests__/scheduler.test.ts b/src/__tests__/scheduler.test.ts index 2815f2b..0da3e3c 100644 --- a/src/__tests__/scheduler.test.ts +++ b/src/__tests__/scheduler.test.ts @@ -21,6 +21,7 @@ function makeRunner(): AgentRunner { return { run: async (task: Task) => { task.status = "success"; task.durationMs = 100; return task; }, getRunningTasks: () => [], + reviewDiffWithAgent: async () => ({ approve: true, score: 80, issues: [], suggestions: [] }), } as unknown as AgentRunner; } @@ -845,4 +846,208 @@ describe("Scheduler", () => { assert.ok(result); }); }); + + // ─── 11. Cross-agent review gate ─── + + describe("cross-agent review gate", () => { + it("review gate blocks merge when review rejects", async () => { + let mergeCalledWith = false; + const pool = { + ...makePool(), + release: async (_name: string, merge: boolean) => { + mergeCalledWith = merge; + return { merged: merge }; + }, + } as unknown as WorktreePool; + + const runner = { + run: async (task: Task) => { + task.status = "success"; + task.durationMs = 100; + // Simulate agent creating a git diff event + task.events.push({ + type: "git_diff", + timestamp: new Date().toISOString(), + data: { diff: "diff --git a/foo.ts\n+console.log('debug');" }, + }); + return task; + }, + getRunningTasks: () => [], + reviewDiffWithAgent: async () => ({ + approve: false, + score: 25, + issues: ["console.log found"], + suggestions: [], + }), + } as unknown as AgentRunner; + + const events: Record[] = []; + const s = new Scheduler(pool, runner, makeStore(), (ev) => events.push(ev)); + + // Manually invoke executeAndRelease + const task = s.submit("test review gate"); + const exec = (s as any).executeAndRelease.bind(s); + await exec(task, "w0", "/tmp/w0"); + + // Task should still be "success" (review doesn't change status) + assert.strictEqual(task.status, "success"); + // But merge should have been blocked + assert.strictEqual(mergeCalledWith, false); + // Review should be attached to task + assert.ok(task.review); + assert.strictEqual(task.review!.approve, false); + assert.strictEqual(task.review!.score, 25); + // Error should explain why merge was blocked + assert.ok(task.error.includes("review rejected"), "error should explain rejection"); + // Should have emitted review_started and review_rejected events + assert.ok(events.some(e => e.type === "review_started"), "should emit review_started"); + assert.ok(events.some(e => e.type === "review_rejected"), "should emit review_rejected"); + }); + + it("review gate allows merge when review approves", async () => { + let mergeCalledWith = false; + const pool = { + ...makePool(), + release: async (_name: string, merge: boolean) => { + mergeCalledWith = merge; + return { merged: true }; + }, + } as unknown as WorktreePool; + + const runner = { + run: async (task: Task) => { + task.status = "success"; + task.durationMs = 100; + task.events.push({ + type: "git_diff", + timestamp: new Date().toISOString(), + data: { diff: "diff --git a/foo.ts\n+const x = 1;" }, + }); + return task; + }, + getRunningTasks: () => [], + reviewDiffWithAgent: async () => ({ + approve: true, + score: 85, + issues: [], + suggestions: ["looks clean"], + }), + } as unknown as AgentRunner; + + const events: Record[] = []; + const s = new Scheduler(pool, runner, makeStore(), (ev) => events.push(ev)); + + const task = s.submit("test approve"); + const exec = (s as any).executeAndRelease.bind(s); + await exec(task, "w0", "/tmp/w0"); + + assert.strictEqual(task.status, "success"); + assert.strictEqual(mergeCalledWith, true); + assert.ok(task.review); + assert.strictEqual(task.review!.approve, true); + assert.ok(events.some(e => e.type === "review_started"), "should emit review_started"); + assert.ok(events.some(e => e.type === "review_approved"), "should emit review_approved"); + }); + + it("skips review when task has no diff", async () => { + let mergeCalledWith = false; + const pool = { + ...makePool(), + release: async (_name: string, merge: boolean) => { + mergeCalledWith = merge; + return { merged: true }; + }, + } as unknown as WorktreePool; + + const runner = { + run: async (task: Task) => { + task.status = "success"; + task.durationMs = 100; + // No git_diff event + return task; + }, + getRunningTasks: () => [], + reviewDiffWithAgent: async () => { throw new Error("should not be called"); }, + } as unknown as AgentRunner; + + const s = new Scheduler(pool, runner, makeStore()); + + const task = s.submit("no diff task"); + const exec = (s as any).executeAndRelease.bind(s); + await exec(task, "w0", "/tmp/w0"); + + // Should merge without review since there's no diff + assert.strictEqual(mergeCalledWith, true); + assert.strictEqual(task.review, undefined); + }); + + it("skips review when task failed", async () => { + let mergeCalledWith = false; + const pool = { + ...makePool(), + release: async (_name: string, merge: boolean) => { + mergeCalledWith = merge; + return { merged: false }; + }, + } as unknown as WorktreePool; + + const runner = { + run: async (task: Task) => { + task.status = "failed"; + task.error = "compilation error"; + task.durationMs = 50; + return task; + }, + getRunningTasks: () => [], + reviewDiffWithAgent: async () => { throw new Error("should not be called"); }, + } as unknown as AgentRunner; + + const s = new Scheduler(pool, runner, makeStore()); + + const task = s.submit("failed task"); + const exec = (s as any).executeAndRelease.bind(s); + await exec(task, "w0", "/tmp/w0"); + + assert.strictEqual(mergeCalledWith, false); + assert.strictEqual(task.review, undefined); + }); + + it("falls back gracefully when reviewDiffWithAgent throws", async () => { + let mergeCalledWith = false; + const pool = { + ...makePool(), + release: async (_name: string, merge: boolean) => { + mergeCalledWith = merge; + return { merged: false }; + }, + } as unknown as WorktreePool; + + const runner = { + run: async (task: Task) => { + task.status = "success"; + task.durationMs = 100; + task.events.push({ + type: "git_diff", + timestamp: new Date().toISOString(), + data: { diff: "diff --git a/foo.ts\n+const x = 1;" }, + }); + return task; + }, + getRunningTasks: () => [], + reviewDiffWithAgent: async () => { throw new Error("network error"); }, + } as unknown as AgentRunner; + + const s = new Scheduler(pool, runner, makeStore()); + + const task = s.submit("review throws"); + task.maxRetries = 0; // prevent retry so we can assert final status + const exec = (s as any).executeAndRelease.bind(s); + await exec(task, "w0", "/tmp/w0"); + + // The outer catch should mark the task as failed + assert.strictEqual(task.status, "failed"); + assert.ok(task.error.includes("network error")); + assert.strictEqual(mergeCalledWith, false); + }); + }); }); diff --git a/src/agent-runner.ts b/src/agent-runner.ts index 2f0f117..d2b7545 100644 --- a/src/agent-runner.ts +++ b/src/agent-runner.ts @@ -1,4 +1,4 @@ -import type { Task, TaskEvent } from "./types.js"; +import { type Task, type TaskEvent, type ReviewResult, createTask } from "./types.js"; import { log } from "./logger.js"; import { spawn, type ChildProcess } from "node:child_process"; import { exec as execCb } from "node:child_process"; @@ -50,11 +50,7 @@ export interface RunningTaskInfo { costUsd: number; } -export interface ReviewResult { - score: number; - issues: string[]; - suggestions: string[]; -} +export type { ReviewResult }; export class AgentRunner { private readonly _runningTasks = new Map(); @@ -91,7 +87,122 @@ export class AgentRunner { score -= 10; } - return { score, issues, suggestions }; + return { approve: score >= 40, score, issues, suggestions }; + } + + /** + * Select the cross-review agent: the reviewer should be a different agent + * than the one that wrote the code. + */ + static pickReviewAgent(taskAgent: string): string { + if (taskAgent === "codex") return "claude"; + if (taskAgent === "claude" || taskAgent === "claude-sdk") return "codex"; + return "claude"; // generic agents reviewed by claude + } + + /** + * Spawn a cross-agent to review a git diff. Returns a structured ReviewResult. + * Falls back to the heuristic reviewDiff() if the agent fails or times out. + */ + async reviewDiffWithAgent( + diff: string, + taskAgent: string, + timeout: number = 60, + ): Promise { + const reviewAgent = AgentRunner.pickReviewAgent(taskAgent); + const truncatedDiff = diff.length > 50_000 ? diff.slice(0, 50_000) + "\n... (truncated)" : diff; + + const reviewPrompt = [ + "You are a code reviewer. Review the following git diff and respond with ONLY a JSON object (no markdown, no explanation).", + "", + "IMPORTANT: Do NOT modify any files. Do NOT run any commands. Only output your JSON review.", + "", + "Evaluate:", + "1. Correctness: Are there bugs, logic errors, or missing edge cases?", + "2. Quality: Is the code clean, readable, and maintainable?", + "3. Safety: Are there security issues, hardcoded secrets, or dangerous patterns?", + "", + "Respond with exactly this JSON format:", + '{"approve": true/false, "score": 0-100, "issues": ["issue1", ...], "suggestions": ["suggestion1", ...]}', + "", + "approve=true means the code is ready to merge. approve=false means it needs fixes.", + "Be pragmatic — minor style issues should not block merge. Focus on correctness and safety.", + "", + "```diff", + truncatedDiff, + "```", + ].join("\n"); + + const reviewTask = createTask(reviewPrompt, { + agent: reviewAgent, + timeout, + maxBudget: 1, + }); + + try { + log("info", "cross-agent review started", { taskAgent, reviewAgent }); + // Run review in /tmp to prevent the reviewer from modifying the worktree + await this.run(reviewTask, "/tmp"); + + if (reviewTask.status !== "success" || !reviewTask.output) { + log("warn", "cross-agent review did not succeed, falling back to heuristic", { + status: reviewTask.status, + error: reviewTask.error, + }); + return this.reviewDiff(diff); + } + + // Parse the JSON response from the review agent + const parsed = this.parseReviewResponse(reviewTask.output); + if (parsed) { + parsed.reviewAgent = reviewAgent; + log("info", "cross-agent review complete", { + reviewAgent, + approve: parsed.approve, + score: parsed.score, + issueCount: parsed.issues.length, + }); + return parsed; + } + + log("warn", "cross-agent review response unparseable, falling back to heuristic"); + return this.reviewDiff(diff); + } catch (err: unknown) { + const msg = (err as Error).message ?? String(err); + log("warn", "cross-agent review failed, falling back to heuristic", { error: msg }); + return this.reviewDiff(diff); + } + } + + /** Try to extract a ReviewResult JSON from agent output (may contain surrounding text). */ + private parseReviewResponse(output: string): ReviewResult | null { + // Try direct JSON parse first + try { + const result = this.normalizeReviewObj(JSON.parse(output.trim())); + if (result) return result; + } catch { /* not pure JSON */ } + + // Try to extract JSON by finding the outermost { } + const start = output.indexOf("{"); + const end = output.lastIndexOf("}"); + if (start !== -1 && end > start) { + try { + const result = this.normalizeReviewObj(JSON.parse(output.slice(start, end + 1))); + if (result) return result; + } catch { /* not valid JSON */ } + } + + return null; + } + + private normalizeReviewObj(obj: Record): ReviewResult | null { + if (typeof obj.approve !== "boolean" || typeof obj.score !== "number") return null; + return { + approve: obj.approve, + score: Math.max(0, Math.min(100, obj.score)), + issues: Array.isArray(obj.issues) ? obj.issues.map(String) : [], + suggestions: Array.isArray(obj.suggestions) ? obj.suggestions.map(String) : [], + }; } /** Returns info about all tasks currently being executed by this runner. */ diff --git a/src/scheduler.ts b/src/scheduler.ts index 62d854f..18e4742 100644 --- a/src/scheduler.ts +++ b/src/scheduler.ts @@ -471,7 +471,31 @@ export class Scheduler { log("info", "task started", { taskId: task.id, worker: workerName }); await this.runner.run(task, workerPath, this.onEvent); - const shouldMerge = task.status === "success"; + // Cross-agent review gate: if the task succeeded, have a different agent review the diff + let shouldMerge = task.status === "success"; + if (shouldMerge) { + const diffEvent = task.events.find((e) => e.type === "git_diff"); + const diff = (diffEvent?.data as { diff?: string } | undefined)?.diff; + if (diff) { + const taskAgent = task.agent ?? "claude"; + this.onEvent?.({ type: "review_started", taskId: task.id, reviewAgent: AgentRunner.pickReviewAgent(taskAgent) }); + const review = await this.runner.reviewDiffWithAgent(diff, taskAgent); + task.review = review; + if (!review.approve) { + shouldMerge = false; + task.error = `review rejected (score ${review.score}): ${review.issues.join("; ")}`; + log("info", "cross-agent review rejected merge", { + taskId: task.id, + score: review.score, + issues: review.issues, + }); + this.onEvent?.({ type: "review_rejected", taskId: task.id, score: review.score, issues: review.issues }); + } else { + log("info", "cross-agent review approved merge", { taskId: task.id, score: review.score }); + this.onEvent?.({ type: "review_approved", taskId: task.id, score: review.score }); + } + } + } const mergeResult = await this.pool.release(workerName, shouldMerge, task.id); if (shouldMerge && !mergeResult.merged) {