coleam00 · romanstark · Apr 3, 2026 · Apr 9, 2026 · Apr 9, 2026 · Apr 9, 2026
diff --git a/README.md b/README.md
@@ -54,6 +54,26 @@ bun run codex-harness/index.ts "Build a personal task manager with a REST API, i
 
 Both harnesses write their output to `workspace/claude/` and `workspace/codex/` respectively. The built application lives in `workspace/{sdk}/app/`.
 
+### Resume an Existing Harness Run
+
+You can resume from an existing `workspace/{sdk}/progress.json` state:
+
+```bash
+# strict resume (default when no value is provided)
+bun run claude-harness/index.ts --resume
+
+# resume current sprint with retry counter reset
+bun run claude-harness/index.ts --resume=reset-retries
+
+# resume current sprint with a newly negotiated contract
+bun run claude-harness/index.ts --resume=reset-contract
+
+# opt into strict retry behavior (re-evaluate every regression immediately)
+bun run claude-harness/index.ts --resume --retry-strategy=strict
+```
+
+The same flags are supported by `codex-harness/index.ts`.
+
 ## Configuration
 
 Defaults are in `shared/config.ts`:
@@ -63,7 +83,10 @@ Defaults are in `shared/config.ts`:
 | `maxSprints` | 10 | Maximum number of sprints |
 | `maxRetriesPerSprint` | 3 | Max evaluation retries before failing a sprint |
 | `passThreshold` | 7 | Minimum score (out of 10) for each criterion |
+| `retryStrategy` | `stabilized` | Retry behavior: `stabilized` keeps previously verified criteria locked unless regressions persist; `strict` re-evaluates every regression immediately |
+| `hardFailUnlockStreak` | 2 | Number of consecutive hard fails required to unlock a previously passed criterion |
 | `CLAUDE_MODEL` | `claude-sonnet-4-6` | Model for Claude harness |
+| `CLAUDE_MAX_TURNS` | 80 | Max Claude turns per agent run (higher values make long evaluations more likely to complete) |
 | `CODEX_MODEL` | `gpt-5.4` | Model for Codex harness |
 
 ## How It Works
@@ -82,8 +105,10 @@ The generator reads the spec and contract, then implements features one at a tim
 ### 4. Evaluation Phase (per sprint)
 The evaluator reads the contract criteria, examines the code, **runs the application**, and tries to break it. It scores each criterion on a 1-10 scale. If all criteria pass (score >= 7/10), the sprint survives. If any fail, detailed feedback goes back to the generator -- with file paths, line numbers, and exact failure descriptions.
 
+When `stabilized` retry mode is enabled, evaluator parsing is hardened: if the first evaluator response is not valid JSON, the harness automatically retries the evaluator once with a strict JSON-only instruction before failing the sprint.
+
 ### 5. Retry Loop
-The generator reads the adversarial feedback, decides whether to refine or pivot, and rebuilds. This cycles up to 3 times per sprint. If a sprint can't survive the evaluator after all retries, the harness stops.
+The generator reads the adversarial feedback, decides whether to refine or pivot, and rebuilds. This cycles up to 3 times per sprint. In `stabilized` retry mode, criteria that have already passed are "locked" and are unlocked only after repeated hard regressions, which reduces flaky fail/pass oscillations in long sprints.
 
 ### 6. Completion
 Once all sprints pass, you have a working application built incrementally with quality gates at every step -- every feature tested by an agent whose job was to break it.
@@ -126,6 +151,7 @@ Agents communicate through files, not shared conversation history. This keeps ea
 - `spec.md` -- Product specification from the planner
 - `contracts/sprint-{n}.json` -- Sprint contracts
 - `feedback/sprint-{n}-round-{m}.json` -- Evaluator feedback per attempt
+- `feedback/sprint-{n}-stability.json` -- Locked-pass stability state for retry stabilization
 - `progress.json` -- Harness state tracking
 
 ## The GAN Connection

diff --git a/claude-harness/evaluator.ts b/claude-harness/evaluator.ts
@@ -2,6 +2,7 @@ import { query, type Options } from "@anthropic-ai/claude-agent-sdk";
 import { EVALUATOR_SYSTEM_PROMPT } from "../shared/prompts.ts";
 import { CLAUDE_MODEL, CLAUDE_MAX_TURNS } from "../shared/config.ts";
 import { log, logError } from "../shared/logger.ts";
+import { getCriterionThreshold } from "../shared/evaluation.ts";
 import type { SprintContract, EvalResult } from "../shared/types.ts";
 
 export async function runEvaluator(
@@ -20,7 +21,8 @@ ${JSON.stringify(contract, null, 2)}
 
 ## Pass Threshold
 
-Each criterion must score at least ${passThreshold}/10 to pass.
+Each criterion must satisfy its own \
+\`threshold\` from the sprint contract. If a criterion has no threshold, use ${passThreshold}/10.
 
 ## Instructions
 
@@ -37,43 +39,50 @@ Examine the application in the \`app/\` directory. Read the code, run it if poss
     persistSession: false,
   };
 
-  let fullResponse = "";
+  const fullResponse = await runEvaluationTurn(prompt, options, sprint);
 
-  for await (const msg of query({ prompt, options })) {
-    if (msg.type === "assistant") {
-      const message = msg as { message: { content: Array<{ type: string; text?: string; name?: string }> } };
-      for (const block of message.message.content) {
-        if (block.type === "text" && block.text) {
-          fullResponse += block.text;
-        } else if (block.type === "tool_use" && block.name) {
-          log("EVALUATOR", `  Tool: ${block.name}`);
-        }
-      }
-    } else if (msg.type === "result") {
-      log("EVALUATOR", `Evaluation complete for sprint ${sprint}`);
-    }
+  const invalidThresholds = contract.criteria
+    .filter((criterion) => !Number.isInteger(criterion.threshold) || criterion.threshold < 1 || criterion.threshold > 10)
+    .map((criterion) => `${criterion.name}=${criterion.threshold}`);
+
+  if (invalidThresholds.length > 0) {
+    log(
+      "EVALUATOR",
+      `Ignoring ${invalidThresholds.length} invalid contract thresholds (expected integer 1-10): ${invalidThresholds.join(", ")}`,
+    );
+  }
+
+  let evalResult = tryParseEvalResult(fullResponse, contract, passThreshold);
+  if (!evalResult) {
+    logError("EVALUATOR", "Failed to parse evaluation JSON from first attempt; retrying evaluator once...");
+    const recoveryPrompt = `${prompt}\n\nCRITICAL RETRY INSTRUCTION: Your previous response was not valid JSON. Re-run any checks you need, then output ONLY a valid JSON object matching the required schema.`;
+    const recoveryResponse = await runEvaluationTurn(recoveryPrompt, { ...options, maxTurns: Math.max(CLAUDE_MAX_TURNS, 80) }, sprint);
+    evalResult = tryParseEvalResult(recoveryResponse, contract, passThreshold);
   }
 
-  const evalResult = parseEvalResult(fullResponse, contract, passThreshold);
+  if (!evalResult) {
+    evalResult = buildParseFailureEvalResult(contract, fullResponse);
+  }
 
-  const passedCount = evalResult.feedback.filter((f) => f.score >= passThreshold).length;
+  const passedCount = evalResult.feedback.filter((f) => f.score >= getCriterionThreshold(contract, f.criterion, passThreshold)).length;
   const totalCount = evalResult.feedback.length;
   const verdict = evalResult.passed ? "PASSED" : "FAILED";
   log("EVALUATOR", `Sprint ${sprint}: ${verdict} (${passedCount}/${totalCount} criteria passed)`);
 
   for (const item of evalResult.feedback) {
-    const status = item.score >= passThreshold ? "\x1b[32mPASS\x1b[0m" : "\x1b[31mFAIL\x1b[0m";
-    log("EVALUATOR", `  [${status}] ${item.criterion}: ${item.score}/10 - ${item.details.slice(0, 100)}`);
+    const threshold = getCriterionThreshold(contract, item.criterion, passThreshold);
+    const status = item.score >= threshold ? "\x1b[32mPASS\x1b[0m" : "\x1b[31mFAIL\x1b[0m";
+    log("EVALUATOR", `  [${status}] ${item.criterion}: ${item.score}/10 (threshold ${threshold}) - ${item.details.slice(0, 100)}`);
   }
 
   return evalResult;
 }
 
-function parseEvalResult(
+function tryParseEvalResult(
   response: string,
   contract: SprintContract,
   passThreshold: number,
-): EvalResult {
+): EvalResult | null {
   // Try multiple strategies to extract JSON from the response
   const candidates: string[] = [];
 
@@ -94,15 +103,18 @@ function parseEvalResult(
     try {
       const parsed = JSON.parse(candidate) as EvalResult;
       if (parsed.feedback && Array.isArray(parsed.feedback)) {
-        // Recalculate passed based on threshold
-        parsed.passed = parsed.feedback.every((f) => f.score >= passThreshold);
+        parsed.passed = parsed.feedback.every((f) => f.score >= getCriterionThreshold(contract, f.criterion, passThreshold));
         return parsed;
       }
     } catch {
       // Try next candidate
     }
   }
 
+  return null;
+}
+
+function buildParseFailureEvalResult(contract: SprintContract, response: string): EvalResult {
   logError("EVALUATOR", "Failed to parse evaluation JSON from any extraction strategy");
   return {
     passed: false,
@@ -115,3 +127,59 @@ function parseEvalResult(
     overallSummary: "Evaluation parsing failed. Raw response: " + response.slice(0, 500),
   };
 }
+
+async function runEvaluationTurn(prompt: string, options: Options, sprint: number): Promise<string> {
+  let fullResponse = "";
+
+  for await (const msg of query({ prompt, options })) {
+    if (msg.type === "assistant") {
+      const message = msg as { message: { content: Array<{ type: string; text?: string; name?: string }> } };
+      for (const block of message.message.content) {
+        if (block.type === "text" && block.text) {
+          fullResponse += block.text + "\n";
+        } else if (block.type === "tool_use" && block.name) {
+          log("EVALUATOR", `  Tool: ${block.name}`);
+        }
+      }
+    } else if (msg.type === "result") {
+      const resultText = extractResultText(msg);
+      if (resultText) {
+        fullResponse += resultText + "\n";
+      }
+      log("EVALUATOR", `Evaluation complete for sprint ${sprint}`);
+    }
+  }
+
+  return fullResponse.trim();
+}
+
+function extractResultText(resultMsg: unknown): string {
+  const chunks: string[] = [];
+
+  const visit = (value: unknown, depth: number): void => {
+    if (depth > 3 || value === null || value === undefined) return;
+
+    if (typeof value === "string") {
+      const trimmed = value.trim();
+      if (trimmed.startsWith("{") || trimmed.startsWith("```")) {
+        chunks.push(trimmed);
+      }
+      return;
+    }
+
+    if (Array.isArray(value)) {
+      for (const item of value) visit(item, depth + 1);
+      return;
+    }
+
+    if (typeof value === "object") {
+      for (const [key, child] of Object.entries(value as Record<string, unknown>)) {
+        if (key === "type") continue;
+        visit(child, depth + 1);
+      }
+    }
+  };
+
+  visit(resultMsg, 0);
+  return chunks.join("\n");
+}
diff --git a/claude-harness/generator.ts b/claude-harness/generator.ts
@@ -9,6 +9,7 @@ export async function runGenerator(
   spec: string,
   contract: SprintContract,
   previousFeedback?: EvalResult,
+  retryFocusCriteria: string[] = [],
 ): Promise<{ response: string; sessionId?: string }> {
   const sprint = contract.sprintNumber;
   const attempt = previousFeedback ? "retry" : "initial";
@@ -18,6 +19,10 @@ export async function runGenerator(
 
   if (previousFeedback) {
     prompt += `\n\n## Evaluation Feedback (MUST ADDRESS)\n\n${JSON.stringify(previousFeedback, null, 2)}`;
+    if (retryFocusCriteria.length > 0) {
+      prompt += `\n\n## Retry Focus (Scope Control)\n\nOnly these criteria are still failing and must be fixed now:\n${retryFocusCriteria.map((name) => `- ${name}`).join("\n")}`;
+      prompt += "\n\nMinimize changes outside the failing criteria. Preserve behavior for criteria that already pass unless a dependency forces a shared fix.";
+    }
     prompt += `\n\nThe previous attempt failed evaluation. Address every issue in the feedback above.`;
   } else {
     prompt += `\n\nImplement the features listed in this sprint contract. Work in the \`app/\` directory.`;