37 changes: 37 additions & 0 deletions shared/lib/eval-schema.ts
@@ -27,6 +27,8 @@
* fields to track cost budget constraint violations during routing (HOK-1350)
* - **1.8.0**: Added optional `manifestRef` for per-run resource manifest
* attribution (HOK-1378)
* - **1.9.0**: Added `PlanCritique` interface and optional `planCritique`
* field to `EvalRecord.metadata` for explicit plan quality evaluation (HOK-1391)
*
* @module eval-schema
*/
Expand Down Expand Up @@ -605,6 +607,41 @@ export type Stratum = string;
// Per-Stage Outcomes (HOK-1004)
// ────────────────────────────────────────────────────────────────

/**
* A single dimension of plan quality critique.
*
* Each dimension captures a specific aspect of plan quality with
* a normalized score and brief rationale.
*/
export interface PlanCritiqueDimension {
/** Quality score 0.0–1.0 for this dimension */
score: number;
/** 1-2 sentence rationale for the score */
rationale: string;
}

/**
* Explicit plan quality critique across multiple dimensions.
*
* Captures concrete plan evaluation criteria like component boundaries,
* invariant coverage, and approach soundness. Enables direct plan quality
* measurement rather than inferring it from downstream diff cleanliness.
*
* @since 1.9.0
*/
export interface PlanCritique {
/** Did the plan identify the correct component boundaries? */
component_boundaries: PlanCritiqueDimension;
/** Were key invariants and constraints identified in the plan? */
invariant_coverage: PlanCritiqueDimension;
/** Was the proposed approach viable and technically sound? */
approach_soundness: PlanCritiqueDimension;
/** Did the implementation avoid patching around plan gaps? (higher = fewer workarounds needed) */
missed_patches: PlanCritiqueDimension;
/** Overall aggregate plan quality score */
overall: PlanCritiqueDimension;
}
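
// A minimal sketch of a conforming value, for reference only. Scores and
// rationales are illustrative (they mirror the test fixture in eval.test.js),
// not produced by a real judge run.
const examplePlanCritique: PlanCritique = {
  component_boundaries: { score: 0.9, rationale: 'Correctly identified all target files.' },
  invariant_coverage: { score: 0.75, rationale: 'Missed one edge case constraint.' },
  approach_soundness: { score: 0.85, rationale: 'Approach was technically sound.' },
  missed_patches: { score: 0.8, rationale: 'Implementation needed minor additions.' },
  overall: { score: 0.82, rationale: 'Strong plan with minor gaps.' },
};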

/**
* Per-stage quality attribution from the eval judge.
*
106 changes: 105 additions & 1 deletion shared/lib/eval.test.js
@@ -26,7 +26,7 @@ describe('evaluateTask', () => {

// Core EvalRecord fields from eval-schema.ts
assert.ok(result.id, 'should have a UUID id');
assert.equal(result.schemaVersion, '1.8.0');
assert.equal(result.schemaVersion, '1.9.0');
assert.equal(result.originalPrompt, 'Add a loading spinner');
assert.ok(result.modelId);
assert.ok(result.modelVersion);
@@ -365,4 +365,108 @@ describe('evaluateTask', () => {
'filled prompt hash should be deterministic for same inputs'
);
});

it('includes valid planCritique in metadata when present', async () => {
const validResponse = JSON.stringify({
score: 0.85,
rationale: 'Task completed successfully.',
interventionFlags: [],
planCritique: {
component_boundaries: { score: 0.9, rationale: 'Correctly identified all target files.' },
invariant_coverage: { score: 0.75, rationale: 'Missed one edge case constraint.' },
approach_soundness: { score: 0.85, rationale: 'Approach was technically sound.' },
missed_patches: { score: 0.8, rationale: 'Implementation needed minor additions.' },
overall: { score: 0.82, rationale: 'Strong plan with minor gaps.' },
},
});

const result = await evaluateTask(
{
taskPrompt: 'Add a new feature',
prReviewOutput: 'Clean implementation',
planContent: 'Implementation plan content here',
},
undefined,
{ _callFn: mockCallFn(validResponse) }
);

assert.ok(result.metadata.planCritique, 'should include planCritique in metadata');
assert.equal(result.metadata.planCritique.component_boundaries.score, 0.9);
assert.equal(result.metadata.planCritique.invariant_coverage.score, 0.75);
assert.equal(result.metadata.planCritique.approach_soundness.score, 0.85);
assert.equal(result.metadata.planCritique.missed_patches.score, 0.8);
assert.equal(result.metadata.planCritique.overall.score, 0.82);
assert.ok(result.metadata.planCritique.component_boundaries.rationale.includes('target files'));
});

it('omits planCritique when not present in judge response', async () => {
const validResponse = JSON.stringify({
score: 0.8,
rationale: 'Task completed with minor issues.',
interventionFlags: [],
});

const result = await evaluateTask(
{
taskPrompt: 'Fix a bug',
prReviewOutput: 'Bug fixed correctly',
},
undefined,
{ _callFn: mockCallFn(validResponse) }
);

assert.equal(result.metadata.planCritique, undefined, 'should not include planCritique when absent');
});

it('gracefully handles planCritique with invalid dimension scores', async () => {
const invalidResponse = JSON.stringify({
score: 0.85,
rationale: 'Task completed successfully.',
interventionFlags: [],
planCritique: {
component_boundaries: { score: 1.5, rationale: 'Invalid score out of range.' },
invariant_coverage: { score: 0.75, rationale: 'Valid dimension.' },
approach_soundness: { score: 0.85, rationale: 'Valid dimension.' },
missed_patches: { score: -0.1, rationale: 'Invalid negative score.' },
overall: { score: 0.82, rationale: 'Overall score.' },
},
});

const result = await evaluateTask(
{
taskPrompt: 'Add a feature',
prReviewOutput: 'Feature added',
},
undefined,
{ _callFn: mockCallFn(invalidResponse) }
);

// planCritique should be omitted entirely if any dimension is invalid
assert.equal(result.metadata.planCritique, undefined, 'should omit planCritique when dimensions are invalid');
});

it('gracefully handles planCritique with missing dimensions', async () => {
const incompleteResponse = JSON.stringify({
score: 0.85,
rationale: 'Task completed successfully.',
interventionFlags: [],
planCritique: {
component_boundaries: { score: 0.9, rationale: 'Good boundaries.' },
invariant_coverage: { score: 0.75, rationale: 'Valid dimension.' },
// Missing approach_soundness, missed_patches, and overall
},
});

const result = await evaluateTask(
{
taskPrompt: 'Add a feature',
prReviewOutput: 'Feature added',
},
undefined,
{ _callFn: mockCallFn(incompleteResponse) }
);

// planCritique should be omitted if not all required dimensions are present
assert.equal(result.metadata.planCritique, undefined, 'should omit planCritique when incomplete');
});
});
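Assuming these specs run under Node's built-in test runner (consistent with the `describe`/`it` and `node:assert` usage above), the new cases can be exercised with `node --test shared/lib/eval.test.js`.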
41 changes: 37 additions & 4 deletions shared/lib/eval.ts
@@ -10,7 +10,7 @@ import { readFile } from "node:fs/promises";
import { randomUUID } from 'crypto';
import { fileURLToPath } from "node:url";
import { dirname, join } from "node:path";
import { getScoreBand, type EvalRecord, type InterventionRecord, type Outcomes, type RoutingDecision } from './eval-schema.ts';
import { getScoreBand, type EvalRecord, type InterventionRecord, type Outcomes, type RoutingDecision, type PlanCritique } from './eval-schema.ts';
import { callClaude, parseJsonFromLLM } from './llm-cli.ts';
import { getEvalConfig } from './config.ts';
import { loadPricingTable } from './workflow-cost.ts';
@@ -25,7 +25,7 @@ const __dirname = dirname(__filename);
const DEFAULT_MODEL = 'claude-sonnet-4-6';
const DEFAULT_PROVIDER = 'claude-cli';
const SUPPORTED_PROVIDERS = ['claude-cli', 'anthropic'] as const;
const SCHEMA_VERSION = '1.8.0';
const SCHEMA_VERSION = '1.9.0';
const MAX_RETRIES = 2;
const TIMEOUT_MS = 120_000;

@@ -91,6 +91,7 @@ interface JudgeResponse {
rationale: string;
interventionFlags: string[];
stageScores?: Record<string, { score: number; rationale: string }>;
planCritique?: PlanCritique;
}

/**
@@ -238,6 +239,7 @@ function parseJudgeResponse(raw: string): JudgeResponse {
rationale?: string;
interventionFlags?: string[];
stageScores?: Record<string, { score?: number; rationale?: string }>;
planCritique?: Record<string, { score?: number; rationale?: string }>;
};

if (typeof parsed.score !== 'number' || parsed.score < 0 || parsed.score > 1) {
@@ -275,11 +277,42 @@
}
}

// Parse and validate planCritique (optional)
let planCritique: PlanCritique | undefined;
if (parsed.planCritique && typeof parsed.planCritique === 'object') {
const requiredDimensions = ['component_boundaries', 'invariant_coverage', 'approach_soundness', 'missed_patches', 'overall'];
const validatedDimensions: Partial<PlanCritique> = {};

for (const dimension of requiredDimensions) {
const dimData = parsed.planCritique[dimension];
if (
dimData &&
typeof dimData === 'object' &&
typeof dimData.score === 'number' &&
dimData.score >= 0 &&
dimData.score <= 1 &&
typeof dimData.rationale === 'string' &&
dimData.rationale.trim().length > 0
) {
(validatedDimensions as any)[dimension] = {
score: dimData.score,
rationale: dimData.rationale.trim(),
};
}
}

// Only include if all required dimensions are present and valid
if (Object.keys(validatedDimensions).length === requiredDimensions.length) {
planCritique = validatedDimensions as PlanCritique;
}
}

return {
score: parsed.score,
rationale: parsed.rationale.trim(),
interventionFlags: parsed.interventionFlags,
...(stageScores && { stageScores }),
...(planCritique && { planCritique }),
};
}

@@ -373,7 +406,7 @@ export async function evaluateTask(
const response = await callFn(prompt, model);

// Parse response
const { score, rationale, interventionFlags, stageScores } = parseJudgeResponse(response.text);
const { score, rationale, interventionFlags, stageScores, planCritique } = parseJudgeResponse(response.text);
const band = getScoreBand(score);

const tokenUsage = response.usage || undefined;
@@ -408,7 +441,7 @@
...(outcomes && { outcomes }),
...(routingDecision && { routingDecision }),
...(promptArtifacts.length > 0 && { promptArtifacts }),
metadata: { ...metadata, interventionFlags, ...(stageScores && { stageScores }) },
metadata: { ...metadata, interventionFlags, ...(stageScores && { stageScores }), ...(planCritique && { planCritique }) },
};
const activeSessionId = process.env.WAVEMILL_SESSION || (await getLatestSession())?.sessionId;
attachManifestRef(record, activeSessionId);
2 changes: 1 addition & 1 deletion tests/startup-handoff.test.sh
@@ -303,7 +303,7 @@ write_plan "$SUCCESS_PLAN" "$TEST_REPO" "$STATE_DIR" "$STATE_FILE" "startup-succ
SUCCESS_OUTPUT="$TMP_ROOT/success-output.txt"
bash "$RUNNER_SCRIPT" "$SUCCESS_PLAN" > "$SUCCESS_OUTPUT" 2>&1

if jq -e '.tasks["HOK-1001"].phase == "planning"' "$STATE_FILE" >/dev/null 2>&1; then
if jq -e '.tasks["HOK-1001"].phase == "coding"' "$STATE_FILE" >/dev/null 2>&1; then
pass "startup runner writes workflow state only after in-tmux startup succeeds"
else
fail "startup runner did not persist workflow state for the launched task"
43 changes: 43 additions & 0 deletions tools/prompts/eval-judge.md
@@ -105,6 +105,41 @@ In addition to the overall score, attribute quality to **all four workflow stages**
- *When Implementation Plan is provided*: Score based on whether the plan was sound and the implementation followed it successfully.
- *When Implementation Plan is NOT provided*: Infer from the PR diff and commit history. Was the approach well-structured (logical file changes, good decomposition)? Or are there signs of rework, wrong-direction commits, or thrashing? A clean, well-organized diff with no rework suggests good planning (score 0.7–0.9). Multiple reverts or approach changes suggest poor planning (score 0.3–0.6). For single-shot autonomous workflows with no plan artifact, score based on whether the agent chose a reasonable approach.

### Plan Critique (when Implementation Plan is provided)

When `{{PLAN_CONTENT}}` is available (not "Not available"), you **must** include a `planCritique` field in your JSON response. This field provides explicit plan quality evaluation across four critique dimensions plus an overall aggregate:

**1. component_boundaries** — Did the plan correctly identify which components, modules, or files needed to be modified?
- Score 1.0 if the plan identified exactly the right components with no false positives or missed areas
- Score 0.7–0.9 if the plan identified most key components but missed 1-2 secondary areas
- Score 0.4–0.6 if the plan had significant gaps (missed major components) or false positives (targeted wrong areas)
- Compare planned file changes to actual PR diff changes for precise assessment

**2. invariant_coverage** — Did the plan identify critical constraints, edge cases, and invariants?
- Score 1.0 if the plan documented all critical constraints with no surprises during implementation
- Score 0.7–0.9 if the plan captured most key constraints but missed 1-2 non-obvious edge cases
- Score 0.4–0.6 if the plan missed several important constraints that impacted correctness
- Check if the implementation had to add defensive checks, validation, or error handling not mentioned in the plan

**3. approach_soundness** — Was the proposed implementation approach technically viable and correct?
- Score 1.0 if the approach was optimal and implementation followed it directly
- Score 0.7–0.9 if the approach was sound but suboptimal, requiring minor adjustments
- Score 0.4–0.6 if the approach had notable issues (inefficient, brittle) requiring rework
- Check if the implementation followed the planned approach or had to deviate significantly

**4. missed_patches** — Did the implementation have to work around gaps or errors in the plan?
- Score 1.0 if the implementation followed the plan directly with no patches or workarounds
- Score 0.7–0.9 if implementation needed 1-2 minor additions not covered by the plan
- Score 0.4–0.6 if implementation had to patch around multiple plan gaps or fix plan errors
- Analyze commit messages and diff hunks for signs of unplanned work

**5. overall** — Aggregate plan quality score (weighted, not a simple average)
- Weight `component_boundaries` and `invariant_coverage` heavily (strong foundation)
- Reward low `missed_patches` (plan was actionable)
- Let `approach_soundness` gate the overall score (low soundness caps overall regardless of other dimensions)
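
For concreteness, here is a minimal sketch of one aggregation consistent with this guidance; the specific weights and the gating slack are illustrative assumptions, not part of the judging contract:

```ts
// Illustrative sketch only: the prompt leaves the exact weighting to the judge.
function aggregateOverall(d: {
  component_boundaries: number;
  invariant_coverage: number;
  approach_soundness: number;
  missed_patches: number;
}): number {
  // Foundation dimensions dominate; a high missed_patches score
  // (few workarounds needed) rewards an actionable plan.
  const weighted =
    0.35 * d.component_boundaries +
    0.35 * d.invariant_coverage +
    0.1 * d.approach_soundness +
    0.2 * d.missed_patches;
  // approach_soundness gates the result: a shaky approach caps the
  // overall score no matter how strong the other dimensions are.
  return Math.min(weighted, d.approach_soundness + 0.1);
}
```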

**When to omit `planCritique`**: Only when `{{PLAN_CONTENT}}` is literally "Not available" in the input. If a plan artifact exists, you must score it.

- **implementation** (always scored): Given the spec and plan, did the code correctly implement what was asked? Score 1.0 if the code is correct, complete, and production-ready. Score lower for bugs, missing edge cases, or poor code quality — but only penalize the implementation for issues that were NOT caused by a bad spec or plan.

- **review** (always scored): Did the review process catch issues before human review?
@@ -129,6 +164,13 @@ Respond with **only** a JSON object (no markdown fences, no preamble):
"plan": { "score": <0.0-1.0>, "rationale": "<1-2 sentences>" },
"implementation": { "score": <0.0-1.0>, "rationale": "<1-2 sentences>" },
"review": { "score": <0.0-1.0>, "rationale": "<1-2 sentences>" }
},
"planCritique": {
"component_boundaries": { "score": <0.0-1.0>, "rationale": "<1-2 sentences>" },
"invariant_coverage": { "score": <0.0-1.0>, "rationale": "<1-2 sentences>" },
"approach_soundness": { "score": <0.0-1.0>, "rationale": "<1-2 sentences>" },
"missed_patches": { "score": <0.0-1.0>, "rationale": "<1-2 sentences>" },
"overall": { "score": <0.0-1.0>, "rationale": "<2-3 sentences>" }
}
}
```
@@ -137,5 +179,6 @@ Respond with **only** a JSON object (no markdown fences, no preamble):
- `rationale`: A concise, human-readable explanation justifying the score. **Must reference specific intervention events if any are present.**
- `interventionFlags`: Array of strings describing notable interventions (empty array if none). Use the format `"type:description"` (e.g., `"review_comment:missing error handling"`, `"post_pr_commit:fixed lint errors"`)
- `stageScores`: Object with per-stage attribution scores. **Always include all four stages** (expansion, plan, implementation, review). When artifacts are not available for a stage, infer quality from the PR diff, intervention patterns, and overall outcome.
- `planCritique`: **Optional** object with explicit plan quality scores across four critique dimensions plus an `overall` aggregate. **Only include when `{{PLAN_CONTENT}}` is available** (not "Not available"). Omit this field entirely when no plan artifact exists.

Output ONLY the JSON object. No other text.