diff --git a/README.md b/README.md index ce60a9b..993c0a2 100644 --- a/README.md +++ b/README.md @@ -54,6 +54,26 @@ bun run codex-harness/index.ts "Build a personal task manager with a REST API, i Both harnesses write their output to `workspace/claude/` and `workspace/codex/` respectively. The built application lives in `workspace/{sdk}/app/`. +### Resume an Existing Harness Run + +You can resume from an existing `workspace/{sdk}/progress.json` state: + +```bash +# strict resume (default when no value is provided) +bun run claude-harness/index.ts --resume + +# resume current sprint with retry counter reset +bun run claude-harness/index.ts --resume=reset-retries + +# resume current sprint with a newly negotiated contract +bun run claude-harness/index.ts --resume=reset-contract + +# opt into strict retry behavior (re-evaluate every regression immediately) +bun run claude-harness/index.ts --resume --retry-strategy=strict +``` + +Same flags are supported for `codex-harness/index.ts`. + ## Configuration Defaults are in `shared/config.ts`: @@ -63,7 +83,10 @@ Defaults are in `shared/config.ts`: | `maxSprints` | 10 | Maximum number of sprints | | `maxRetriesPerSprint` | 3 | Max evaluation retries before failing a sprint | | `passThreshold` | 7 | Minimum score (out of 10) for each criterion | +| `retryStrategy` | `stabilized` | Retry behavior: `stabilized` keeps previously verified criteria locked unless regressions persist | +| `hardFailUnlockStreak` | 2 | Number of consecutive hard fails required to unlock a previously passed criterion | | `CLAUDE_MODEL` | `claude-sonnet-4-6` | Model for Claude harness | +| `CLAUDE_MAX_TURNS` | 80 | Max Claude turns per agent run (higher improves long evaluation completion reliability) | | `CODEX_MODEL` | `gpt-5.4` | Model for Codex harness | ## How It Works @@ -82,8 +105,10 @@ The generator reads the spec and contract, then implements features one at a tim ### 4. 
Evaluation Phase (per sprint) The evaluator reads the contract criteria, examines the code, **runs the application**, and tries to break it. It scores each criterion on a 1-10 scale. If all criteria pass (score >= 7/10), the sprint survives. If any fail, detailed feedback goes back to the generator -- with file paths, line numbers, and exact failure descriptions. +Evaluator parsing is hardened in every retry mode: if the first evaluator response is not valid JSON, the harness automatically retries the evaluator once with a strict JSON-only instruction before recording the attempt as a failed evaluation. + ### 5. Retry Loop -The generator reads the adversarial feedback, decides whether to refine or pivot, and rebuilds. This cycles up to 3 times per sprint. If a sprint can't survive the evaluator after all retries, the harness stops. +The generator reads the adversarial feedback, decides whether to refine or pivot, and rebuilds. This cycles up to 3 times per sprint. In `stabilized` retry mode, criteria that have already passed are "locked" and are only unlocked after repeated hard regressions, which reduces flaky fail/pass oscillations in long sprints. ### 6. Completion Once all sprints pass, you have a working application built incrementally with quality gates at every step -- every feature tested by an agent whose job was to break it. @@ -126,6 +151,7 @@ Agents communicate through files, not shared conversation history. 
This keeps ea - `spec.md` -- Product specification from the planner - `contracts/sprint-{n}.json` -- Sprint contracts - `feedback/sprint-{n}-round-{m}.json` -- Evaluator feedback per attempt +- `feedback/sprint-{n}-stability.json` -- Locked-pass stability state for retry stabilization - `progress.json` -- Harness state tracking ## The GAN Connection diff --git a/claude-harness/evaluator.ts b/claude-harness/evaluator.ts index 77730b2..492d84f 100644 --- a/claude-harness/evaluator.ts +++ b/claude-harness/evaluator.ts @@ -2,6 +2,7 @@ import { query, type Options } from "@anthropic-ai/claude-agent-sdk"; import { EVALUATOR_SYSTEM_PROMPT } from "../shared/prompts.ts"; import { CLAUDE_MODEL, CLAUDE_MAX_TURNS } from "../shared/config.ts"; import { log, logError } from "../shared/logger.ts"; +import { getCriterionThreshold } from "../shared/evaluation.ts"; import type { SprintContract, EvalResult } from "../shared/types.ts"; export async function runEvaluator( @@ -20,7 +21,8 @@ ${JSON.stringify(contract, null, 2)} ## Pass Threshold -Each criterion must score at least ${passThreshold}/10 to pass. +Each criterion must satisfy its own \ +\`threshold\` from the sprint contract. If a criterion has no threshold, use ${passThreshold}/10. ## Instructions @@ -37,43 +39,50 @@ Examine the application in the \`app/\` directory. 
Read the code, run it if poss persistSession: false, }; - let fullResponse = ""; + const fullResponse = await runEvaluationTurn(prompt, options, sprint); - for await (const msg of query({ prompt, options })) { - if (msg.type === "assistant") { - const message = msg as { message: { content: Array<{ type: string; text?: string; name?: string }> } }; - for (const block of message.message.content) { - if (block.type === "text" && block.text) { - fullResponse += block.text; - } else if (block.type === "tool_use" && block.name) { - log("EVALUATOR", ` Tool: ${block.name}`); - } - } - } else if (msg.type === "result") { - log("EVALUATOR", `Evaluation complete for sprint ${sprint}`); - } + const invalidThresholds = contract.criteria + .filter((criterion) => !Number.isInteger(criterion.threshold) || criterion.threshold < 1 || criterion.threshold > 10) + .map((criterion) => `${criterion.name}=${criterion.threshold}`); + + if (invalidThresholds.length > 0) { + log( + "EVALUATOR", + `Ignoring ${invalidThresholds.length} invalid contract thresholds (expected integer 1-10): ${invalidThresholds.join(", ")}`, + ); + } + + let evalResult = tryParseEvalResult(fullResponse, contract, passThreshold); + if (!evalResult) { + logError("EVALUATOR", "Failed to parse evaluation JSON from first attempt; retrying evaluator once..."); + const recoveryPrompt = `${prompt}\n\nCRITICAL RETRY INSTRUCTION: Your previous response was not valid JSON. 
Re-run any checks you need, then output ONLY a valid JSON object matching the required schema.`; + const recoveryResponse = await runEvaluationTurn(recoveryPrompt, { ...options, maxTurns: Math.max(CLAUDE_MAX_TURNS, 80) }, sprint); + evalResult = tryParseEvalResult(recoveryResponse, contract, passThreshold); } - const evalResult = parseEvalResult(fullResponse, contract, passThreshold); + if (!evalResult) { + evalResult = buildParseFailureEvalResult(contract, fullResponse); + } - const passedCount = evalResult.feedback.filter((f) => f.score >= passThreshold).length; + const passedCount = evalResult.feedback.filter((f) => f.score >= getCriterionThreshold(contract, f.criterion, passThreshold)).length; const totalCount = evalResult.feedback.length; const verdict = evalResult.passed ? "PASSED" : "FAILED"; log("EVALUATOR", `Sprint ${sprint}: ${verdict} (${passedCount}/${totalCount} criteria passed)`); for (const item of evalResult.feedback) { - const status = item.score >= passThreshold ? "\x1b[32mPASS\x1b[0m" : "\x1b[31mFAIL\x1b[0m"; - log("EVALUATOR", ` [${status}] ${item.criterion}: ${item.score}/10 - ${item.details.slice(0, 100)}`); + const threshold = getCriterionThreshold(contract, item.criterion, passThreshold); + const status = item.score >= threshold ? 
"\x1b[32mPASS\x1b[0m" : "\x1b[31mFAIL\x1b[0m"; + log("EVALUATOR", ` [${status}] ${item.criterion}: ${item.score}/10 (threshold ${threshold}) - ${item.details.slice(0, 100)}`); } return evalResult; } -function parseEvalResult( +function tryParseEvalResult( response: string, contract: SprintContract, passThreshold: number, -): EvalResult { +): EvalResult | null { // Try multiple strategies to extract JSON from the response const candidates: string[] = []; @@ -94,8 +103,7 @@ function parseEvalResult( try { const parsed = JSON.parse(candidate) as EvalResult; if (parsed.feedback && Array.isArray(parsed.feedback)) { - // Recalculate passed based on threshold - parsed.passed = parsed.feedback.every((f) => f.score >= passThreshold); + parsed.passed = parsed.feedback.every((f) => f.score >= getCriterionThreshold(contract, f.criterion, passThreshold)); return parsed; } } catch { @@ -103,6 +111,10 @@ function parseEvalResult( } } + return null; +} + +function buildParseFailureEvalResult(contract: SprintContract, response: string): EvalResult { logError("EVALUATOR", "Failed to parse evaluation JSON from any extraction strategy"); return { passed: false, @@ -115,3 +127,59 @@ function parseEvalResult( overallSummary: "Evaluation parsing failed. 
Raw response: " + response.slice(0, 500), }; } + +async function runEvaluationTurn(prompt: string, options: Options, sprint: number): Promise { + let fullResponse = ""; + + for await (const msg of query({ prompt, options })) { + if (msg.type === "assistant") { + const message = msg as { message: { content: Array<{ type: string; text?: string; name?: string }> } }; + for (const block of message.message.content) { + if (block.type === "text" && block.text) { + fullResponse += block.text + "\n"; + } else if (block.type === "tool_use" && block.name) { + log("EVALUATOR", ` Tool: ${block.name}`); + } + } + } else if (msg.type === "result") { + const resultText = extractResultText(msg); + if (resultText) { + fullResponse += resultText + "\n"; + } + log("EVALUATOR", `Evaluation complete for sprint ${sprint}`); + } + } + + return fullResponse.trim(); +} + +function extractResultText(resultMsg: unknown): string { + const chunks: string[] = []; + + const visit = (value: unknown, depth: number): void => { + if (depth > 3 || value === null || value === undefined) return; + + if (typeof value === "string") { + const trimmed = value.trim(); + if (trimmed.startsWith("{") || trimmed.startsWith("```")) { + chunks.push(trimmed); + } + return; + } + + if (Array.isArray(value)) { + for (const item of value) visit(item, depth + 1); + return; + } + + if (typeof value === "object") { + for (const [key, child] of Object.entries(value as Record)) { + if (key === "type") continue; + visit(child, depth + 1); + } + } + }; + + visit(resultMsg, 0); + return chunks.join("\n"); +} diff --git a/claude-harness/generator.ts b/claude-harness/generator.ts index 38ea8e4..9d45ffa 100644 --- a/claude-harness/generator.ts +++ b/claude-harness/generator.ts @@ -9,6 +9,7 @@ export async function runGenerator( spec: string, contract: SprintContract, previousFeedback?: EvalResult, + retryFocusCriteria: string[] = [], ): Promise<{ response: string; sessionId?: string }> { const sprint = contract.sprintNumber; 
const attempt = previousFeedback ? "retry" : "initial"; @@ -18,6 +19,10 @@ export async function runGenerator( if (previousFeedback) { prompt += `\n\n## Evaluation Feedback (MUST ADDRESS)\n\n${JSON.stringify(previousFeedback, null, 2)}`; + if (retryFocusCriteria.length > 0) { + prompt += `\n\n## Retry Focus (Scope Control)\n\nOnly these criteria are still failing and must be fixed now:\n${retryFocusCriteria.map((name) => `- ${name}`).join("\n")}`; + prompt += "\n\nMinimize changes outside the failing criteria. Preserve behavior for criteria that already pass unless a dependency forces a shared fix."; + } prompt += `\n\nThe previous attempt failed evaluation. Address every issue in the feedback above.`; } else { prompt += `\n\nImplement the features listed in this sprint contract. Work in the \`app/\` directory.`; diff --git a/claude-harness/harness.ts b/claude-harness/harness.ts index d947913..b6cd060 100644 --- a/claude-harness/harness.ts +++ b/claude-harness/harness.ts @@ -11,16 +11,24 @@ import { readSpec, writeContract, readContract, + readFeedback, writeFeedback, + readProgress, writeProgress, + findLatestFeedbackRound, + readSprintStabilityState, + writeSprintStabilityState, } from "../shared/files.ts"; +import { stabilizeEvaluation, buildStabilityStateFromEval, getFailedCriteria } from "../shared/evaluation.ts"; import type { HarnessConfig, + ResumeMode, SprintContract, EvalResult, HarnessProgress, HarnessResult, SprintResult, + SprintStabilityState, } from "../shared/types.ts"; import { runPlanner } from "./planner.ts"; @@ -30,52 +38,108 @@ import { runEvaluator } from "./evaluator.ts"; export async function runHarness(config: HarnessConfig): Promise { const startTime = Date.now(); const results: SprintResult[] = []; + const isResume = config.resumeMode !== undefined; + const resumeMode: ResumeMode = config.resumeMode ?? 
"strict"; log("HARNESS", "Initializing Claude Agent SDK harness"); log("HARNESS", `Work directory: ${config.workDir}`); log("HARNESS", `Max sprints: ${config.maxSprints} | Max retries: ${config.maxRetriesPerSprint} | Threshold: ${config.passThreshold}/10`); + log("HARNESS", `Retry strategy: ${config.retryStrategy} (unlock streak: ${config.hardFailUnlockStreak})`); + if (isResume) { + log("HARNESS", `Resume mode: ${resumeMode}`); + } - await initWorkspace(config.workDir); + await initWorkspace(config.workDir, { clean: !isResume }); - // Phase 1: Planning - logDivider(); - log("HARNESS", "PHASE 1: PLANNING"); - logDivider(); + let spec: string; + let totalSprints = 0; + let startSprint = 1; + let initialRetryForSprint = 0; + let reuseExistingContractOnStartSprint = false; + let lastEvalForStartSprint: EvalResult | undefined; + let stabilityStateForStartSprint: SprintStabilityState | undefined; + + const progress: HarnessProgress = isResume + ? await readProgress(config.workDir) + : { + status: "planning", + currentSprint: 0, + totalSprints: 0, + completedSprints: 0, + retryCount: 0, + }; - const progress: HarnessProgress = { - status: "planning", - currentSprint: 0, - totalSprints: 0, - completedSprints: 0, - retryCount: 0, - }; - await writeProgress(config.workDir, progress); + if (!isResume) { + // Phase 1: Planning + logDivider(); + log("HARNESS", "PHASE 1: PLANNING"); + logDivider(); - const plannerResponse = await runPlanner(config.userPrompt, config.workDir); + await writeProgress(config.workDir, progress); - // Planner may have written spec.md via Write tool, or returned it as text - let spec: string; - try { + const plannerResponse = await runPlanner(config.userPrompt, config.workDir); + + // Planner may have written spec.md via Write tool, or returned it as text + try { + spec = await readSpec(config.workDir); + } catch { + log("HARNESS", "Planner returned spec as text, writing to spec.md"); + await writeSpec(config.workDir, plannerResponse); + spec = 
plannerResponse; + } + + // Parse sprint count from spec - look for "Sprint N" patterns + totalSprints = deriveTotalSprints(spec, config.maxSprints); + progress.totalSprints = totalSprints; + log("HARNESS", `Planner produced ${totalSprints} sprints`); + } else { spec = await readSpec(config.workDir); - } catch { - log("HARNESS", "Planner returned spec as text, writing to spec.md"); - await writeSpec(config.workDir, plannerResponse); - spec = plannerResponse; - } + totalSprints = progress.totalSprints > 0 ? progress.totalSprints : deriveTotalSprints(spec, config.maxSprints); + progress.totalSprints = totalSprints; - // Parse sprint count from spec - look for "Sprint N" patterns - const sprintNumbers = Array.from(spec.matchAll(/sprint\s+(\d+)/gi)) - .map((m) => parseInt(m[1]!, 10)) - .filter((n) => n > 0 && n <= config.maxSprints); - const totalSprints = sprintNumbers.length > 0 - ? Math.min(Math.max(...sprintNumbers), config.maxSprints) - : 3; // Default to 3 if no sprint numbers found + if (progress.status === "complete") { + log("HARNESS", "Resume requested but harness is already complete."); + return { success: true, sprints: [], totalDurationMs: Date.now() - startTime }; + } + + if (progress.currentSprint <= 0) { + throw new Error("Cannot resume: progress.json does not contain a valid currentSprint"); + } - progress.totalSprints = totalSprints; - log("HARNESS", `Planner produced ${totalSprints} sprints`); + startSprint = progress.currentSprint; + const latestRound = await findLatestFeedbackRound(config.workDir, startSprint); + if (latestRound !== null) { + lastEvalForStartSprint = await readFeedback(config.workDir, startSprint, latestRound); + try { + stabilityStateForStartSprint = await readSprintStabilityState(config.workDir, startSprint); + } catch { + // Backward compatibility: older runs do not have stability snapshots + } + } + + if (resumeMode === "strict") { + if (progress.status === "failed" && latestRound !== null && latestRound >= 
config.maxRetriesPerSprint) { + throw new Error( + `Cannot strictly resume sprint ${startSprint}: retry budget exhausted (last round ${latestRound})`, + ); + } + initialRetryForSprint = latestRound === null ? 0 : latestRound + 1; + reuseExistingContractOnStartSprint = true; + } else if (resumeMode === "reset-retries") { + initialRetryForSprint = 0; + reuseExistingContractOnStartSprint = true; + } else { + initialRetryForSprint = 0; + reuseExistingContractOnStartSprint = false; + lastEvalForStartSprint = undefined; + stabilityStateForStartSprint = undefined; + } + + log("HARNESS", `Resuming at sprint ${startSprint}/${totalSprints} from retry ${initialRetryForSprint}`); + } // Phase 2-4: Sprint Loop - for (let sprint = 1; sprint <= totalSprints; sprint++) { + for (let sprint = startSprint; sprint <= totalSprints; sprint++) { logDivider(); log("HARNESS", `SPRINT ${sprint}/${totalSprints}`); logDivider(); @@ -83,20 +147,30 @@ export async function runHarness(config: HarnessConfig): Promise // Phase 2: Contract Negotiation progress.status = "negotiating"; progress.currentSprint = sprint; - progress.retryCount = 0; + progress.retryCount = sprint === startSprint ? 
initialRetryForSprint : 0; await writeProgress(config.workDir, progress); - log("HARNESS", "Negotiating sprint contract..."); - const contract = await negotiateContract(config.workDir, spec, sprint); - await writeContract(config.workDir, contract); + let contract: SprintContract; + const shouldReuseContract = sprint === startSprint && reuseExistingContractOnStartSprint; + if (shouldReuseContract) { + log("HARNESS", "Reusing existing sprint contract..."); + contract = await readContract(config.workDir, sprint); + } else { + log("HARNESS", "Negotiating sprint contract..."); + contract = await negotiateContract(config.workDir, spec, sprint); + await writeContract(config.workDir, contract); + } log("HARNESS", `Contract agreed: ${contract.criteria.length} criteria for ${contract.features.length} features`); // Phase 3-4: Build-Evaluate Loop let passed = false; - let lastEval: EvalResult | undefined; + let lastEval: EvalResult | undefined = sprint === startSprint ? lastEvalForStartSprint : undefined; + let sprintStabilityState: SprintStabilityState | undefined = sprint === startSprint ? stabilityStateForStartSprint : undefined; let attempts = 0; - for (let retry = 0; retry <= config.maxRetriesPerSprint; retry++) { + const retryStart = sprint === startSprint ? initialRetryForSprint : 0; + + for (let retry = retryStart; retry <= config.maxRetriesPerSprint; retry++) { attempts = retry + 1; // Build @@ -104,13 +178,36 @@ export async function runHarness(config: HarnessConfig): Promise progress.retryCount = retry; await writeProgress(config.workDir, progress); - await runGenerator(config.workDir, spec, contract, lastEval); + if (!sprintStabilityState && lastEval) { + sprintStabilityState = buildStabilityStateFromEval(contract, lastEval, config.passThreshold); + } + + const retryFocusCriteria = lastEval + ? 
getFailedCriteria(contract, lastEval, config.passThreshold) + : []; + + await runGenerator(config.workDir, spec, contract, lastEval, retryFocusCriteria); // Evaluate progress.status = "evaluating"; await writeProgress(config.workDir, progress); - lastEval = await runEvaluator(config.workDir, contract, config.passThreshold); + const rawEval = await runEvaluator(config.workDir, contract, config.passThreshold); + const stabilized = stabilizeEvaluation(contract, rawEval, config, sprintStabilityState); + lastEval = stabilized.result; + sprintStabilityState = stabilized.state; + + if (config.retryStrategy === "stabilized") { + await writeSprintStabilityState(config.workDir, sprint, sprintStabilityState); + const { lockedPassRetained, unlockedRegressions, inconclusiveRetained } = stabilized.summary; + if (lockedPassRetained > 0 || unlockedRegressions > 0) { + log( + "HARNESS", + `Stabilized retry: retained ${lockedPassRetained} locked pass(es) (${inconclusiveRetained} inconclusive), unlocked ${unlockedRegressions} regression(s)`, + ); + } + } + await writeFeedback(config.workDir, sprint, retry, lastEval); if (lastEval.passed) { @@ -156,6 +253,16 @@ export async function runHarness(config: HarnessConfig): Promise return { success: allPassed, sprints: results, totalDurationMs: totalDuration }; } +function deriveTotalSprints(spec: string, maxSprints: number): number { + const sprintNumbers = Array.from(spec.matchAll(/sprint\s+(\d+)/gi)) + .map((m) => parseInt(m[1]!, 10)) + .filter((n) => n > 0 && n <= maxSprints); + + return sprintNumbers.length > 0 + ? 
Math.min(Math.max(...sprintNumbers), maxSprints) + : 3; +} + async function negotiateContract( workDir: string, spec: string, diff --git a/claude-harness/index.ts b/claude-harness/index.ts index 78c7fec..2519c64 100644 --- a/claude-harness/index.ts +++ b/claude-harness/index.ts @@ -3,38 +3,100 @@ import { readFile } from "fs/promises"; import { runHarness } from "./harness.ts"; import { DEFAULT_CONFIG } from "../shared/config.ts"; import { log, logError, logDivider } from "../shared/logger.ts"; -import type { HarnessConfig } from "../shared/types.ts"; +import type { HarnessConfig, ResumeMode, RetryStrategy } from "../shared/types.ts"; let userPrompt: string | undefined; +let promptFilePath: string | undefined; +let resumeMode: ResumeMode | undefined; +let retryStrategy: RetryStrategy | undefined; +let hardFailUnlockStreak: number | undefined; -const arg = process.argv[2]; -if (arg === "--file" || arg === "-f") { - const filePath = process.argv[3]; - if (!filePath) { - console.error("Error: --file requires a path argument"); +const args = process.argv.slice(2); + +for (let i = 0; i < args.length; i++) { + const arg = args[i]!; + + if (arg === "--file" || arg === "-f") { + promptFilePath = args[i + 1]; + if (!promptFilePath) { + console.error("Error: --file requires a path argument"); + process.exit(1); + } + i += 1; + continue; + } + + if (arg === "--resume") { + resumeMode = "strict"; + continue; + } + + if (arg.startsWith("--resume=")) { + const mode = arg.split("=")[1]; + if (mode === "strict" || mode === "reset-retries" || mode === "reset-contract") { + resumeMode = mode; + continue; + } + console.error(`Error: invalid resume mode '${mode}'. Expected strict, reset-retries, or reset-contract.`); + process.exit(1); + } + + if (arg.startsWith("--retry-strategy=")) { + const mode = arg.split("=")[1]; + if (mode === "strict" || mode === "stabilized") { + retryStrategy = mode; + continue; + } + console.error(`Error: invalid retry strategy '${mode}'. 
Expected strict or stabilized.`); + process.exit(1); + } + + if (arg.startsWith("--hard-fail-unlock-streak=")) { + const raw = arg.split("=")[1]; + const parsed = raw ? parseInt(raw, 10) : NaN; + if (Number.isInteger(parsed) && parsed >= 1) { + hardFailUnlockStreak = parsed; + continue; + } + console.error(`Error: invalid hard fail unlock streak '${raw}'. Expected integer >= 1.`); process.exit(1); } - userPrompt = await readFile(resolve(filePath), "utf-8"); -} else { - userPrompt = arg; + + userPrompt = userPrompt ? `${userPrompt} ${arg}` : arg; +} + +if (promptFilePath) { + userPrompt = await readFile(resolve(promptFilePath), "utf-8"); } -if (!userPrompt) { +if (!userPrompt && !resumeMode) { console.error("Usage: bun run claude-harness/index.ts "); console.error(' bun run claude-harness/index.ts --file '); + console.error(' bun run claude-harness/index.ts --resume[=strict|reset-retries|reset-contract]'); + console.error(' bun run claude-harness/index.ts --retry-strategy=strict|stabilized '); + console.error(' bun run claude-harness/index.ts --hard-fail-unlock-streak=2 '); + console.error(' bun run claude-harness/index.ts --resume=reset-retries "optional prompt"'); console.error('Example: bun run claude-harness/index.ts "Build a task manager with REST API and dashboard"'); process.exit(1); } const config: HarnessConfig = { ...DEFAULT_CONFIG, - userPrompt, + userPrompt: userPrompt ?? "RESUME", workDir: resolve("workspace/claude"), + resumeMode, + retryStrategy: retryStrategy ?? DEFAULT_CONFIG.retryStrategy, + hardFailUnlockStreak: hardFailUnlockStreak ?? 
DEFAULT_CONFIG.hardFailUnlockStreak, }; logDivider(); log("HARNESS", "ADVERSARIAL DEV - Claude Agent SDK Harness"); -log("HARNESS", `Prompt: "${userPrompt}"`); +log("HARNESS", `Prompt: "${config.userPrompt}"`); +if (resumeMode) { + log("HARNESS", `Resume: ${resumeMode}`); +} +log("HARNESS", `Retry strategy: ${config.retryStrategy}`); +log("HARNESS", `Hard fail unlock streak: ${config.hardFailUnlockStreak}`); logDivider(); try { diff --git a/codex-harness/evaluator.ts b/codex-harness/evaluator.ts index 4ae40fb..777a91a 100644 --- a/codex-harness/evaluator.ts +++ b/codex-harness/evaluator.ts @@ -2,6 +2,7 @@ import { Codex } from "@openai/codex-sdk"; import { EVALUATOR_SYSTEM_PROMPT } from "../shared/prompts.ts"; import { CODEX_MODEL, CODEX_NETWORK_ACCESS } from "../shared/config.ts"; import { log, logError } from "../shared/logger.ts"; +import { getCriterionThreshold } from "../shared/evaluation.ts"; import type { SprintContract, EvalResult } from "../shared/types.ts"; export async function runEvaluator( @@ -18,7 +19,8 @@ ${JSON.stringify(contract, null, 2)} ## Pass Threshold -Each criterion must score at least ${passThreshold}/10 to pass. +Each criterion must satisfy its own \ +\`threshold\` from the sprint contract. If a criterion has no threshold, use ${passThreshold}/10. ## Instructions @@ -40,26 +42,49 @@ Examine the application in the \`app/\` directory. 
Read the code, run it if poss log("EVALUATOR", `Evaluation complete for sprint ${sprint}`); - const evalResult = parseEvalResult(response, contract, passThreshold); + const invalidThresholds = contract.criteria + .filter((criterion) => !Number.isInteger(criterion.threshold) || criterion.threshold < 1 || criterion.threshold > 10) + .map((criterion) => `${criterion.name}=${criterion.threshold}`); - const passedCount = evalResult.feedback.filter((f) => f.score >= passThreshold).length; + if (invalidThresholds.length > 0) { + log( + "EVALUATOR", + `Ignoring ${invalidThresholds.length} invalid contract thresholds (expected integer 1-10): ${invalidThresholds.join(", ")}`, + ); + } + + let evalResult = tryParseEvalResult(response, contract, passThreshold); + if (!evalResult) { + logError("EVALUATOR", "Failed to parse evaluation JSON from first attempt; retrying evaluator once..."); + const recoveryPrompt = `${fullPrompt}\n\nCRITICAL RETRY INSTRUCTION: Your previous response was not valid JSON. Re-run any checks you need, then output ONLY a valid JSON object matching the required schema.`; + const recoveryTurn = await thread.run(recoveryPrompt); + const recoveryResponse = recoveryTurn.finalResponse ?? ""; + evalResult = tryParseEvalResult(recoveryResponse, contract, passThreshold); + } + + if (!evalResult) { + evalResult = buildParseFailureEvalResult(contract, response); + } + + const passedCount = evalResult.feedback.filter((f) => f.score >= getCriterionThreshold(contract, f.criterion, passThreshold)).length; const totalCount = evalResult.feedback.length; const verdict = evalResult.passed ? "PASSED" : "FAILED"; log("EVALUATOR", `Sprint ${sprint}: ${verdict} (${passedCount}/${totalCount} criteria passed)`); for (const item of evalResult.feedback) { - const status = item.score >= passThreshold ? 
"\x1b[32mPASS\x1b[0m" : "\x1b[31mFAIL\x1b[0m"; - log("EVALUATOR", ` [${status}] ${item.criterion}: ${item.score}/10 - ${item.details.slice(0, 100)}`); + const threshold = getCriterionThreshold(contract, item.criterion, passThreshold); + const status = item.score >= threshold ? "\x1b[32mPASS\x1b[0m" : "\x1b[31mFAIL\x1b[0m"; + log("EVALUATOR", ` [${status}] ${item.criterion}: ${item.score}/10 (threshold ${threshold}) - ${item.details.slice(0, 100)}`); } return evalResult; } -function parseEvalResult( +function tryParseEvalResult( response: string, contract: SprintContract, passThreshold: number, -): EvalResult { +): EvalResult | null { // Try multiple strategies to extract JSON from the response const candidates: string[] = []; @@ -80,7 +105,7 @@ function parseEvalResult( try { const parsed = JSON.parse(candidate) as EvalResult; if (parsed.feedback && Array.isArray(parsed.feedback)) { - parsed.passed = parsed.feedback.every((f) => f.score >= passThreshold); + parsed.passed = parsed.feedback.every((f) => f.score >= getCriterionThreshold(contract, f.criterion, passThreshold)); return parsed; } } catch { @@ -88,6 +113,10 @@ function parseEvalResult( } } + return null; +} + +function buildParseFailureEvalResult(contract: SprintContract, response: string): EvalResult { logError("EVALUATOR", "Failed to parse evaluation JSON from any extraction strategy"); return { passed: false, diff --git a/codex-harness/generator.ts b/codex-harness/generator.ts index d4622e8..86cea7a 100644 --- a/codex-harness/generator.ts +++ b/codex-harness/generator.ts @@ -9,6 +9,7 @@ export async function runGenerator( spec: string, contract: SprintContract, previousFeedback?: EvalResult, + retryFocusCriteria: string[] = [], ): Promise<{ response: string }> { const sprint = contract.sprintNumber; const attempt = previousFeedback ? 
"retry" : "initial"; @@ -18,6 +19,10 @@ export async function runGenerator( if (previousFeedback) { taskPrompt += `\n\n## Evaluation Feedback (MUST ADDRESS)\n\n${JSON.stringify(previousFeedback, null, 2)}`; + if (retryFocusCriteria.length > 0) { + taskPrompt += `\n\n## Retry Focus (Scope Control)\n\nOnly these criteria are still failing and must be fixed now:\n${retryFocusCriteria.map((name) => `- ${name}`).join("\n")}`; + taskPrompt += "\n\nMinimize changes outside the failing criteria. Preserve behavior for criteria that already pass unless a dependency forces a shared fix."; + } taskPrompt += `\n\nThe previous attempt failed evaluation. Address every issue in the feedback above.`; } else { taskPrompt += `\n\nImplement the features listed in this sprint contract. Work in the \`app/\` directory.`; diff --git a/codex-harness/harness.ts b/codex-harness/harness.ts index 105c64d..6365fdf 100644 --- a/codex-harness/harness.ts +++ b/codex-harness/harness.ts @@ -10,16 +10,25 @@ import { writeSpec, readSpec, writeContract, + readContract, + readFeedback, writeFeedback, + readProgress, writeProgress, + findLatestFeedbackRound, + readSprintStabilityState, + writeSprintStabilityState, } from "../shared/files.ts"; +import { stabilizeEvaluation, buildStabilityStateFromEval, getFailedCriteria } from "../shared/evaluation.ts"; import type { HarnessConfig, + ResumeMode, SprintContract, EvalResult, HarnessProgress, HarnessResult, SprintResult, + SprintStabilityState, } from "../shared/types.ts"; import { runPlanner } from "./planner.ts"; @@ -29,52 +38,108 @@ import { runEvaluator } from "./evaluator.ts"; export async function runHarness(config: HarnessConfig): Promise { const startTime = Date.now(); const results: SprintResult[] = []; + const isResume = config.resumeMode !== undefined; + const resumeMode: ResumeMode = config.resumeMode ?? 
"strict"; log("HARNESS", "Initializing Codex SDK harness"); log("HARNESS", `Work directory: ${config.workDir}`); log("HARNESS", `Max sprints: ${config.maxSprints} | Max retries: ${config.maxRetriesPerSprint} | Threshold: ${config.passThreshold}/10`); + log("HARNESS", `Retry strategy: ${config.retryStrategy} (unlock streak: ${config.hardFailUnlockStreak})`); + if (isResume) { + log("HARNESS", `Resume mode: ${resumeMode}`); + } - await initWorkspace(config.workDir); + await initWorkspace(config.workDir, { clean: !isResume }); - // Phase 1: Planning - logDivider(); - log("HARNESS", "PHASE 1: PLANNING"); - logDivider(); + let spec: string; + let totalSprints = 0; + let startSprint = 1; + let initialRetryForSprint = 0; + let reuseExistingContractOnStartSprint = false; + let lastEvalForStartSprint: EvalResult | undefined; + let stabilityStateForStartSprint: SprintStabilityState | undefined; + + const progress: HarnessProgress = isResume + ? await readProgress(config.workDir) + : { + status: "planning", + currentSprint: 0, + totalSprints: 0, + completedSprints: 0, + retryCount: 0, + }; - const progress: HarnessProgress = { - status: "planning", - currentSprint: 0, - totalSprints: 0, - completedSprints: 0, - retryCount: 0, - }; - await writeProgress(config.workDir, progress); + if (!isResume) { + // Phase 1: Planning + logDivider(); + log("HARNESS", "PHASE 1: PLANNING"); + logDivider(); - const plannerResponse = await runPlanner(config.userPrompt, config.workDir); + await writeProgress(config.workDir, progress); - // Planner may have written spec.md via its tools, or returned it as text - let spec: string; - try { + const plannerResponse = await runPlanner(config.userPrompt, config.workDir); + + // Planner may have written spec.md via its tools, or returned it as text + try { + spec = await readSpec(config.workDir); + } catch { + log("HARNESS", "Planner returned spec as text, writing to spec.md"); + await writeSpec(config.workDir, plannerResponse); + spec = 
plannerResponse; + } + + // Parse sprint count from spec - look for "Sprint N" patterns + totalSprints = deriveTotalSprints(spec, config.maxSprints); + progress.totalSprints = totalSprints; + log("HARNESS", `Planner produced ${totalSprints} sprints`); + } else { spec = await readSpec(config.workDir); - } catch { - log("HARNESS", "Planner returned spec as text, writing to spec.md"); - await writeSpec(config.workDir, plannerResponse); - spec = plannerResponse; - } + totalSprints = progress.totalSprints > 0 ? progress.totalSprints : deriveTotalSprints(spec, config.maxSprints); + progress.totalSprints = totalSprints; - // Parse sprint count from spec - look for "Sprint N" patterns - const sprintNumbers = Array.from(spec.matchAll(/sprint\s+(\d+)/gi)) - .map((m) => parseInt(m[1]!, 10)) - .filter((n) => n > 0 && n <= config.maxSprints); - const totalSprints = sprintNumbers.length > 0 - ? Math.min(Math.max(...sprintNumbers), config.maxSprints) - : 3; // Default to 3 if no sprint numbers found + if (progress.status === "complete") { + log("HARNESS", "Resume requested but harness is already complete."); + return { success: true, sprints: [], totalDurationMs: Date.now() - startTime }; + } + + if (progress.currentSprint <= 0) { + throw new Error("Cannot resume: progress.json does not contain a valid currentSprint"); + } - progress.totalSprints = totalSprints; - log("HARNESS", `Planner produced ${totalSprints} sprints`); + startSprint = progress.currentSprint; + const latestRound = await findLatestFeedbackRound(config.workDir, startSprint); + if (latestRound !== null) { + lastEvalForStartSprint = await readFeedback(config.workDir, startSprint, latestRound); + try { + stabilityStateForStartSprint = await readSprintStabilityState(config.workDir, startSprint); + } catch { + // Backward compatibility: older runs do not have stability snapshots + } + } + + if (resumeMode === "strict") { + if (progress.status === "failed" && latestRound !== null && latestRound >= 
config.maxRetriesPerSprint) { + throw new Error( + `Cannot strictly resume sprint ${startSprint}: retry budget exhausted (last round ${latestRound})`, + ); + } + initialRetryForSprint = latestRound === null ? 0 : latestRound + 1; + reuseExistingContractOnStartSprint = true; + } else if (resumeMode === "reset-retries") { + initialRetryForSprint = 0; + reuseExistingContractOnStartSprint = true; + } else { + initialRetryForSprint = 0; + reuseExistingContractOnStartSprint = false; + lastEvalForStartSprint = undefined; + stabilityStateForStartSprint = undefined; + } + + log("HARNESS", `Resuming at sprint ${startSprint}/${totalSprints} from retry ${initialRetryForSprint}`); + } // Phase 2-4: Sprint Loop - for (let sprint = 1; sprint <= totalSprints; sprint++) { + for (let sprint = startSprint; sprint <= totalSprints; sprint++) { logDivider(); log("HARNESS", `SPRINT ${sprint}/${totalSprints}`); logDivider(); @@ -82,20 +147,30 @@ export async function runHarness(config: HarnessConfig): Promise // Phase 2: Contract Negotiation progress.status = "negotiating"; progress.currentSprint = sprint; - progress.retryCount = 0; + progress.retryCount = sprint === startSprint ? 
initialRetryForSprint : 0; await writeProgress(config.workDir, progress); - log("HARNESS", "Negotiating sprint contract..."); - const contract = await negotiateContract(config.workDir, spec, sprint); - await writeContract(config.workDir, contract); + let contract: SprintContract; + const shouldReuseContract = sprint === startSprint && reuseExistingContractOnStartSprint; + if (shouldReuseContract) { + log("HARNESS", "Reusing existing sprint contract..."); + contract = await readContract(config.workDir, sprint); + } else { + log("HARNESS", "Negotiating sprint contract..."); + contract = await negotiateContract(config.workDir, spec, sprint); + await writeContract(config.workDir, contract); + } log("HARNESS", `Contract agreed: ${contract.criteria.length} criteria for ${contract.features.length} features`); // Phase 3-4: Build-Evaluate Loop let passed = false; - let lastEval: EvalResult | undefined; + let lastEval: EvalResult | undefined = sprint === startSprint ? lastEvalForStartSprint : undefined; + let sprintStabilityState: SprintStabilityState | undefined = sprint === startSprint ? stabilityStateForStartSprint : undefined; let attempts = 0; - for (let retry = 0; retry <= config.maxRetriesPerSprint; retry++) { + const retryStart = sprint === startSprint ? initialRetryForSprint : 0; + + for (let retry = retryStart; retry <= config.maxRetriesPerSprint; retry++) { attempts = retry + 1; // Build @@ -103,13 +178,36 @@ export async function runHarness(config: HarnessConfig): Promise progress.retryCount = retry; await writeProgress(config.workDir, progress); - await runGenerator(config.workDir, spec, contract, lastEval); + if (!sprintStabilityState && lastEval) { + sprintStabilityState = buildStabilityStateFromEval(contract, lastEval, config.passThreshold); + } + + const retryFocusCriteria = lastEval + ? 
getFailedCriteria(contract, lastEval, config.passThreshold) + : []; + + await runGenerator(config.workDir, spec, contract, lastEval, retryFocusCriteria); // Evaluate progress.status = "evaluating"; await writeProgress(config.workDir, progress); - lastEval = await runEvaluator(config.workDir, contract, config.passThreshold); + const rawEval = await runEvaluator(config.workDir, contract, config.passThreshold); + const stabilized = stabilizeEvaluation(contract, rawEval, config, sprintStabilityState); + lastEval = stabilized.result; + sprintStabilityState = stabilized.state; + + if (config.retryStrategy === "stabilized") { + await writeSprintStabilityState(config.workDir, sprint, sprintStabilityState); + const { lockedPassRetained, unlockedRegressions, inconclusiveRetained } = stabilized.summary; + if (lockedPassRetained > 0 || unlockedRegressions > 0) { + log( + "HARNESS", + `Stabilized retry: retained ${lockedPassRetained} locked pass(es) (${inconclusiveRetained} inconclusive), unlocked ${unlockedRegressions} regression(s)`, + ); + } + } + await writeFeedback(config.workDir, sprint, retry, lastEval); if (lastEval.passed) { @@ -154,6 +252,16 @@ export async function runHarness(config: HarnessConfig): Promise return { success: allPassed, sprints: results, totalDurationMs: totalDuration }; } +function deriveTotalSprints(spec: string, maxSprints: number): number { + const sprintNumbers = Array.from(spec.matchAll(/sprint\s+(\d+)/gi)) + .map((m) => parseInt(m[1]!, 10)) + .filter((n) => n > 0 && n <= maxSprints); + + return sprintNumbers.length > 0 + ? 
Math.min(Math.max(...sprintNumbers), maxSprints) + : 3; +} + async function negotiateContract( workDir: string, spec: string, diff --git a/codex-harness/index.ts b/codex-harness/index.ts index 16cb310..70f819c 100644 --- a/codex-harness/index.ts +++ b/codex-harness/index.ts @@ -3,38 +3,100 @@ import { readFile } from "fs/promises"; import { runHarness } from "./harness.ts"; import { DEFAULT_CONFIG } from "../shared/config.ts"; import { log, logError, logDivider } from "../shared/logger.ts"; -import type { HarnessConfig } from "../shared/types.ts"; +import type { HarnessConfig, ResumeMode, RetryStrategy } from "../shared/types.ts"; let userPrompt: string | undefined; +let promptFilePath: string | undefined; +let resumeMode: ResumeMode | undefined; +let retryStrategy: RetryStrategy | undefined; +let hardFailUnlockStreak: number | undefined; -const arg = process.argv[2]; -if (arg === "--file" || arg === "-f") { - const filePath = process.argv[3]; - if (!filePath) { - console.error("Error: --file requires a path argument"); +const args = process.argv.slice(2); + +for (let i = 0; i < args.length; i++) { + const arg = args[i]!; + + if (arg === "--file" || arg === "-f") { + promptFilePath = args[i + 1]; + if (!promptFilePath) { + console.error("Error: --file requires a path argument"); + process.exit(1); + } + i += 1; + continue; + } + + if (arg === "--resume") { + resumeMode = "strict"; + continue; + } + + if (arg.startsWith("--resume=")) { + const mode = arg.split("=")[1]; + if (mode === "strict" || mode === "reset-retries" || mode === "reset-contract") { + resumeMode = mode; + continue; + } + console.error(`Error: invalid resume mode '${mode}'. Expected strict, reset-retries, or reset-contract.`); + process.exit(1); + } + + if (arg.startsWith("--retry-strategy=")) { + const mode = arg.split("=")[1]; + if (mode === "strict" || mode === "stabilized") { + retryStrategy = mode; + continue; + } + console.error(`Error: invalid retry strategy '${mode}'. 
Expected strict or stabilized.`); + process.exit(1); + } + + if (arg.startsWith("--hard-fail-unlock-streak=")) { + const raw = arg.split("=")[1]; + const parsed = raw ? parseInt(raw, 10) : NaN; + if (Number.isInteger(parsed) && parsed >= 1) { + hardFailUnlockStreak = parsed; + continue; + } + console.error(`Error: invalid hard fail unlock streak '${raw}'. Expected integer >= 1.`); process.exit(1); } - userPrompt = await readFile(resolve(filePath), "utf-8"); -} else { - userPrompt = arg; + + userPrompt = userPrompt ? `${userPrompt} ${arg}` : arg; +} + +if (promptFilePath) { + userPrompt = await readFile(resolve(promptFilePath), "utf-8"); } -if (!userPrompt) { +if (!userPrompt && !resumeMode) { console.error("Usage: bun run codex-harness/index.ts "); console.error(' bun run codex-harness/index.ts --file '); + console.error(' bun run codex-harness/index.ts --resume[=strict|reset-retries|reset-contract]'); + console.error(' bun run codex-harness/index.ts --retry-strategy=strict|stabilized '); + console.error(' bun run codex-harness/index.ts --hard-fail-unlock-streak=2 '); + console.error(' bun run codex-harness/index.ts --resume=reset-retries "optional prompt"'); console.error('Example: bun run codex-harness/index.ts "Build a task manager with REST API and dashboard"'); process.exit(1); } const config: HarnessConfig = { ...DEFAULT_CONFIG, - userPrompt, + userPrompt: userPrompt ?? "RESUME", workDir: resolve("workspace/codex"), + resumeMode, + retryStrategy: retryStrategy ?? DEFAULT_CONFIG.retryStrategy, + hardFailUnlockStreak: hardFailUnlockStreak ?? 
DEFAULT_CONFIG.hardFailUnlockStreak, }; logDivider(); log("HARNESS", "ADVERSARIAL DEV - Codex SDK Harness"); -log("HARNESS", `Prompt: "${userPrompt}"`); +log("HARNESS", `Prompt: "${config.userPrompt}"`); +if (resumeMode) { + log("HARNESS", `Resume: ${resumeMode}`); +} +log("HARNESS", `Retry strategy: ${config.retryStrategy}`); +log("HARNESS", `Hard fail unlock streak: ${config.hardFailUnlockStreak}`); logDivider(); try { diff --git a/shared/config.ts b/shared/config.ts index 821c963..1499621 100644 --- a/shared/config.ts +++ b/shared/config.ts @@ -4,10 +4,12 @@ export const DEFAULT_CONFIG: Omit = { maxSprints: 10, maxRetriesPerSprint: 3, passThreshold: 7, + retryStrategy: "stabilized", + hardFailUnlockStreak: 2, }; export const CLAUDE_MODEL = "claude-sonnet-4-6"; export const CODEX_MODEL = "gpt-5.4"; -export const CLAUDE_MAX_TURNS = 50; +export const CLAUDE_MAX_TURNS = 80; export const CODEX_NETWORK_ACCESS = true; diff --git a/shared/evaluation.ts b/shared/evaluation.ts new file mode 100644 index 0000000..a65438d --- /dev/null +++ b/shared/evaluation.ts @@ -0,0 +1,175 @@ +import type { + SprintContract, + EvalResult, + SprintStabilityState, + StabilizationSummary, + CriterionOutcome, + HarnessConfig, +} from "./types.ts"; + +const INCONCLUSIVE_PATTERN = /(cannot|can't|unable|not available|unavailable|not possible|missing|not installed|environment|could not run|chrome not available|permission denied|tooling unavailable|sdk unavailable)/i; + +export function getCriterionThreshold(contract: SprintContract, criterion: string, fallback: number): number { + const rawThreshold = contract.criteria.find((c) => c.name === criterion)?.threshold; + if (typeof rawThreshold !== "number" || !Number.isInteger(rawThreshold)) { + return fallback; + } + if (rawThreshold < 1 || rawThreshold > 10) { + return fallback; + } + return rawThreshold; +} + +function classifyOutcome(score: number, threshold: number, details: string): CriterionOutcome { + if (score >= threshold) { + return 
"pass"; + } + + return INCONCLUSIVE_PATTERN.test(details) ? "inconclusive" : "hard_fail"; +} + +export function buildStabilityStateFromEval( + contract: SprintContract, + evalResult: EvalResult, + passThreshold: number, +): SprintStabilityState { + const criteria: SprintStabilityState["criteria"] = {}; + + for (const criterion of contract.criteria) { + const threshold = getCriterionThreshold(contract, criterion.name, passThreshold); + const feedback = evalResult.feedback.find((f) => f.criterion === criterion.name); + const score = feedback?.score ?? 0; + const details = feedback?.details ?? "No evaluator feedback"; + const outcome = classifyOutcome(score, threshold, details); + + criteria[criterion.name] = { + locked: outcome === "pass", + bestScore: outcome === "pass" ? score : 0, + consecutiveHardFails: outcome === "hard_fail" ? 1 : 0, + lastObservedScore: score, + lastObservedOutcome: outcome, + }; + } + + return { + sprintNumber: contract.sprintNumber, + criteria, + }; +} + +export function stabilizeEvaluation( + contract: SprintContract, + rawEvalResult: EvalResult, + config: Pick, + previousState?: SprintStabilityState, +): { result: EvalResult; state: SprintStabilityState; summary: StabilizationSummary } { + const summary: StabilizationSummary = { + lockedPassRetained: 0, + unlockedRegressions: 0, + inconclusiveRetained: 0, + }; + + const stateCriteria: SprintStabilityState["criteria"] = {}; + const scores: Record = {}; + const feedback = contract.criteria.map((criterion) => { + const threshold = getCriterionThreshold(contract, criterion.name, config.passThreshold); + const rawItem = rawEvalResult.feedback.find((f) => f.criterion === criterion.name) ?? 
{ + criterion: criterion.name, + score: 0, + details: "No evaluator feedback returned for this criterion", + }; + + const rawOutcome = classifyOutcome(rawItem.score, threshold, rawItem.details); + const prev = previousState?.criteria[criterion.name]; + + let effectiveScore = rawItem.score; + let effectiveDetails = rawItem.details; + let locked = prev?.locked ?? false; + let bestScore = prev?.bestScore ?? 0; + let consecutiveHardFails = prev?.consecutiveHardFails ?? 0; + + if (rawOutcome === "pass") { + locked = true; + bestScore = Math.max(bestScore, rawItem.score); + consecutiveHardFails = 0; + } else if (config.retryStrategy === "stabilized" && prev?.locked) { + if (rawOutcome === "inconclusive") { + effectiveScore = Math.max(bestScore, threshold); + effectiveDetails = `${rawItem.details} [stabilized: retained previous verified pass because this check was inconclusive in the current environment]`; + summary.lockedPassRetained += 1; + summary.inconclusiveRetained += 1; + consecutiveHardFails = 0; + } else { + const nextHardFailCount = consecutiveHardFails + 1; + if (nextHardFailCount < config.hardFailUnlockStreak) { + effectiveScore = Math.max(bestScore, threshold); + effectiveDetails = `${rawItem.details} [stabilized: retained previous verified pass; hard fail ${nextHardFailCount}/${config.hardFailUnlockStreak} before unlock]`; + summary.lockedPassRetained += 1; + consecutiveHardFails = nextHardFailCount; + } else { + locked = false; + summary.unlockedRegressions += 1; + consecutiveHardFails = nextHardFailCount; + } + } + } else if (rawOutcome === "hard_fail") { + consecutiveHardFails += 1; + } + + const effectiveOutcome = classifyOutcome(effectiveScore, threshold, effectiveDetails); + if (effectiveOutcome === "pass") { + locked = true; + bestScore = Math.max(bestScore, effectiveScore); + consecutiveHardFails = 0; + } + + scores[criterion.name] = effectiveScore; + stateCriteria[criterion.name] = { + locked, + bestScore, + consecutiveHardFails, + 
lastObservedScore: rawItem.score, + lastObservedOutcome: rawOutcome, + }; + + return { + criterion: criterion.name, + score: effectiveScore, + details: effectiveDetails, + }; + }); + + const passed = contract.criteria.every((criterion) => { + const threshold = getCriterionThreshold(contract, criterion.name, config.passThreshold); + const score = scores[criterion.name] ?? 0; + return score >= threshold; + }); + + return { + result: { + passed, + scores, + feedback, + overallSummary: rawEvalResult.overallSummary, + }, + state: { + sprintNumber: contract.sprintNumber, + criteria: stateCriteria, + }, + summary, + }; +} + +export function getFailedCriteria( + contract: SprintContract, + evalResult: EvalResult, + passThreshold: number, +): string[] { + return contract.criteria + .filter((criterion) => { + const threshold = getCriterionThreshold(contract, criterion.name, passThreshold); + const score = evalResult.feedback.find((f) => f.criterion === criterion.name)?.score ?? 0; + return score < threshold; + }) + .map((criterion) => criterion.name); +} diff --git a/shared/files.ts b/shared/files.ts index 23263af..a0c90b8 100644 --- a/shared/files.ts +++ b/shared/files.ts @@ -1,23 +1,29 @@ import { mkdir, readFile, writeFile, access, rm, readdir, unlink } from "fs/promises"; import { join } from "path"; import { execSync } from "child_process"; -import type { SprintContract, EvalResult, HarnessProgress } from "./types.ts"; +import type { SprintContract, EvalResult, HarnessProgress, SprintStabilityState } from "./types.ts"; -export async function initWorkspace(workDir: string): Promise { +export async function initWorkspace( + workDir: string, + options: { clean?: boolean } = {}, +): Promise { + const clean = options.clean ?? 
true; await mkdir(join(workDir, "contracts"), { recursive: true }); await mkdir(join(workDir, "feedback"), { recursive: true }); await mkdir(join(workDir, "app"), { recursive: true }); // Clean stale artifacts from previous runs - try { await unlink(join(workDir, "spec.md")); } catch {} - try { await unlink(join(workDir, "progress.json")); } catch {} - for (const dir of ["contracts", "feedback"]) { - try { - const files = await readdir(join(workDir, dir)); - for (const f of files) { - await unlink(join(workDir, dir, f)); - } - } catch {} + if (clean) { + try { await unlink(join(workDir, "spec.md")); } catch {} + try { await unlink(join(workDir, "progress.json")); } catch {} + for (const dir of ["contracts", "feedback"]) { + try { + const files = await readdir(join(workDir, dir)); + for (const f of files) { + await unlink(join(workDir, dir, f)); + } + } catch {} + } } // Initialize app/ as its own git repo so agent commits stay isolated @@ -37,6 +43,29 @@ export async function initWorkspace(workDir: string): Promise { } } +export async function findLatestFeedbackRound( + workDir: string, + sprintNumber: number, +): Promise { + const feedbackDir = join(workDir, "feedback"); + const pattern = new RegExp(`^sprint-${sprintNumber}-round-(\\d+)\\.json$`); + + try { + const files = await readdir(feedbackDir); + const rounds = files + .map((file) => { + const match = file.match(pattern); + return match ? parseInt(match[1]!, 10) : null; + }) + .filter((round): round is number => round !== null) + .sort((a, b) => b - a); + + return rounds.length > 0 ? rounds[0]! 
: null; + } catch { + return null; + } +} + export async function writeSpec(workDir: string, spec: string): Promise { await writeFile(join(workDir, "spec.md"), spec, "utf-8"); } @@ -84,6 +113,28 @@ export async function readFeedback( } } +export async function writeSprintStabilityState( + workDir: string, + sprintNumber: number, + state: SprintStabilityState, +): Promise { + const path = join(workDir, "feedback", `sprint-${sprintNumber}-stability.json`); + await writeFile(path, JSON.stringify(state, null, 2), "utf-8"); +} + +export async function readSprintStabilityState( + workDir: string, + sprintNumber: number, +): Promise { + const path = join(workDir, "feedback", `sprint-${sprintNumber}-stability.json`); + const raw = await readFile(path, "utf-8"); + try { + return JSON.parse(raw) as SprintStabilityState; + } catch { + throw new Error(`Invalid JSON in stability state file: ${path}`); + } +} + export async function writeProgress(workDir: string, progress: HarnessProgress): Promise { await writeFile(join(workDir, "progress.json"), JSON.stringify(progress, null, 2), "utf-8"); } diff --git a/shared/prompts.ts b/shared/prompts.ts index 138fb7c..70c1312 100644 --- a/shared/prompts.ts +++ b/shared/prompts.ts @@ -127,7 +127,7 @@ You MUST output your evaluation as a JSON object (and nothing else) with this ex } \`\`\` -A sprint PASSES only if ALL criteria score at or above the threshold (default: 7). +A sprint PASSES only if ALL criteria score at or above the criterion threshold (integer 1-10, default: 7). If ANY criterion falls below the threshold, the sprint FAILS and work goes back to the generator.`; export const CONTRACT_NEGOTIATION_GENERATOR_PROMPT = `You are proposing a sprint contract. Based on the product spec and the sprint number, propose what you will build and how success should be measured. 
@@ -152,6 +152,7 @@ Rules: - Each criterion must be SPECIFIC and TESTABLE (not vague like "works well") - Include 5-15 criteria per sprint depending on complexity - Criteria should cover: functionality, error handling, code quality, and user experience +- \`threshold\` MUST be an integer score threshold on a 1-10 scale (typically 6-9), not a raw metric target like milliseconds or bytes. Put raw targets in the description text. - Output ONLY the JSON, no other text`; export const CONTRACT_NEGOTIATION_EVALUATOR_PROMPT = `You are reviewing a proposed sprint contract. Evaluate whether the criteria are specific enough, testable, and comprehensive. @@ -164,4 +165,5 @@ Rules: - Criteria must be testable by reading code and running the app - Vague criteria like "works well" or "looks good" must be made specific - Ensure coverage of error handling and edge cases, not just happy paths +- Ensure every criterion uses an integer \`threshold\` on the 1-10 score scale; move raw targets (ms, bytes, ratios) into the description text - Output either "APPROVED" or the revised JSON contract, nothing else`; diff --git a/shared/types.ts b/shared/types.ts index 6e31c1e..504acfe 100644 --- a/shared/types.ts +++ b/shared/types.ts @@ -4,8 +4,14 @@ export interface HarnessConfig { maxSprints: number; maxRetriesPerSprint: number; passThreshold: number; + resumeMode?: ResumeMode; + retryStrategy: RetryStrategy; + hardFailUnlockStreak: number; } +export type ResumeMode = "strict" | "reset-retries" | "reset-contract"; +export type RetryStrategy = "strict" | "stabilized"; + export interface SprintContract { sprintNumber: number; features: string[]; @@ -51,3 +57,24 @@ export interface HarnessResult { sprints: SprintResult[]; totalDurationMs: number; } + +export type CriterionOutcome = "pass" | "inconclusive" | "hard_fail"; + +export interface CriterionStabilityState { + locked: boolean; + bestScore: number; + consecutiveHardFails: number; + lastObservedScore: number; + lastObservedOutcome: 
CriterionOutcome; +} + +export interface SprintStabilityState { + sprintNumber: number; + criteria: Record; +} + +export interface StabilizationSummary { + lockedPassRetained: number; + unlockedRegressions: number; + inconclusiveRetained: number; +}