Skip to content
Merged
Show file tree
Hide file tree
Changes from all commits
Commits
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
9 changes: 7 additions & 2 deletions shared/lib/agent-adapters.sh
Original file line number Diff line number Diff line change
Expand Up @@ -1606,6 +1606,8 @@ agent_pane_is_ready() {
sleep 0.5
fi

local current_command
current_command=$(_pane_current_command "$target")
local pane_pid
pane_pid=$(tmux display-message -t "$target" -p '#{pane_pid}' 2>/dev/null || echo "")
if [[ -z "$pane_pid" ]]; then
Expand All @@ -1615,8 +1617,6 @@ agent_pane_is_ready() {
return 0
fi

local current_command
current_command=$(_pane_current_command "$target")
if _pane_command_is_shell "$current_command"; then
local children
children=$(_pane_child_count "$target")
Expand Down Expand Up @@ -1650,6 +1650,7 @@ agent_verify_launch() {
attempts=$(awk "BEGIN { v = $max_wait / $poll_interval; if (v < 1) v = 1; printf \"%d\", (v == int(v) ? v : int(v) + 1) }")

local attempt=1
local introspection_available=0
while (( attempt <= attempts )); do
local current_command children state_changed=0
current_command=$(_pane_current_command "$target")
Expand All @@ -1660,6 +1661,10 @@ agent_verify_launch() {
return 0
fi

if [[ -n "$current_command" || -n "$children" ]]; then
introspection_available=1
fi

if [[ -n "$baseline_command" ]] || [[ -n "$baseline_children" ]]; then
if [[ "$current_command" != "$baseline_command" ]] || [[ "$children" != "${baseline_children:-}" ]]; then
state_changed=1
Expand Down
41 changes: 41 additions & 0 deletions shared/lib/eval-record-builder.test.ts
Original file line number Diff line number Diff line change
Expand Up @@ -231,6 +231,47 @@ describe('eval-record-builder', () => {
});

describe('enrichEvalRecord', () => {
// Verifies that enrichEvalRecord lifts metadata.planCritique into the
// normalized plan stage outcome, alongside the plan stage score/rationale
// taken from metadata.stageScores.plan.
it('attaches planCritique to the normalized plan stage outcome', () => {
baseRecord.metadata = {
stageScores: {
plan: {
score: 0.81,
rationale: 'The plan covered the right implementation areas.',
},
},
planCritique: {
component_boundaries: {
score: 0.9,
rationale: 'The plan identified the correct component boundary.',
},
invariant_coverage: {
score: 0.7,
rationale: 'It captured the main compatibility invariant.',
},
approach_soundness: {
score: 0.8,
rationale: 'The proposed approach was viable.',
},
missed_patches: {
score: 0.78,
rationale: 'Implementation needed only minor follow-up fixes.',
},
overall: {
score: 0.8,
rationale: 'Overall the plan was a useful guide.',
},
},
};

// Empty second argument: no additional metadata overrides for this case.
enrichEvalRecord(baseRecord, {});

// The critique object must be attached verbatim (same reference/shape)
// next to the plan stage's score and rationale.
expect(baseRecord.stageOutcomes?.plan).toEqual({
score: 0.81,
rationale: 'The plan covered the right implementation areas.',
planCritique: baseRecord.metadata.planCritique,
});
});

it('should attach all metadata when provided', () => {
const metadata = {
agentType: 'codex',
Expand Down
8 changes: 6 additions & 2 deletions shared/lib/eval-record-builder.ts
Original file line number Diff line number Diff line change
Expand Up @@ -15,6 +15,7 @@

import type {
EvalRecord,
PlanCritique,
TaskContext,
RepoContext,
StageOutcomes,
Expand Down Expand Up @@ -157,7 +158,8 @@ export function attachWorkflowCostMetadata(
*/
export function attachStageOutcomes(
record: EvalRecord,
stageScores?: Record<string, { score: number; rationale: string }>
stageScores?: Record<string, { score: number; rationale: string }>,
planCritique?: PlanCritique,
): void {
if (!stageScores || Object.keys(stageScores).length === 0) {
return;
Expand All @@ -177,6 +179,7 @@ export function attachStageOutcomes(
stageOutcomes.plan = {
score: stageScores.plan.score,
rationale: stageScores.plan.rationale,
...(planCritique && { planCritique }),
};
}

Expand Down Expand Up @@ -345,5 +348,6 @@ export function enrichEvalRecord(record: EvalRecord, metadata: EvalRecordMetadat
const stageScores = record.metadata?.stageScores as
| Record<string, { score: number; rationale: string }>
| undefined;
attachStageOutcomes(record, stageScores);
const planCritique = record.metadata?.planCritique as PlanCritique | undefined;
attachStageOutcomes(record, stageScores, planCritique);
}
2 changes: 1 addition & 1 deletion shared/lib/eval-schema.test.ts
Original file line number Diff line number Diff line change
Expand Up @@ -1000,7 +1000,7 @@ test('Record with minimal outcomes (only required fields) validates', () => {
test('Record with fallbackEvent validates and round-trips through JSON serialization', () => {
const record: EvalRecord = {
...scenarios[0].record,
schemaVersion: '1.8.0',
schemaVersion: '1.9.0',
fallbackEvent: {
schema_version: '1.0',
preferred_model: 'model-a',
Expand Down
33 changes: 33 additions & 0 deletions shared/lib/eval-schema.ts
Original file line number Diff line number Diff line change
Expand Up @@ -27,6 +27,8 @@
* fields to track cost budget constraint violations during routing (HOK-1350)
* - **1.8.0**: Added optional `manifestRef` for per-run resource manifest
* attribution (HOK-1378)
* - **1.9.0**: Added optional `planCritique` to capture explicit planning
* quality dimensions from the eval judge (HOK-1391)
*
* @module eval-schema
*/
Expand Down Expand Up @@ -616,6 +618,37 @@ export interface StageScore {
score: number;
/** 1-2 sentence attribution rationale */
rationale: string;
/** Detailed plan critique when available for the plan stage */
planCritique?: PlanCritique;
}

/**
 * A single plan-critique rubric dimension.
 *
 * Captures both the normalized score and the judge's rationale so plan
 * quality can be compared directly across models.
 *
 * Both fields are required. NOTE(review): the eval parser appears to treat
 * a dimension score outside 0.0–1.0 as invalidating the entire critique
 * object rather than just this dimension — confirm against the parser.
 */
export interface PlanCritiqueDimension {
/** Quality score 0.0–1.0 for this planning dimension */
score: number;
/** 1-2 sentence rationale for the score */
rationale: string;
}

/**
 * Explicit critique of plan quality across the key planning dimensions.
 *
 * Introduced in schema version 1.9.0 (HOK-1391). All five dimensions are
 * required; an incomplete or partially invalid critique is expected to be
 * dropped as a whole rather than stored partially.
 */
export interface PlanCritique {
/** Whether the plan chose the right component/service boundaries */
component_boundaries: PlanCritiqueDimension;
/** Whether the plan surfaced key invariants and constraints */
invariant_coverage: PlanCritiqueDimension;
/** Whether the proposed approach was viable and correct */
approach_soundness: PlanCritiqueDimension;
/** Whether implementation had to patch around plan gaps */
missed_patches: PlanCritiqueDimension;
/** Aggregate plan-quality assessment */
overall: PlanCritiqueDimension;
}

/**
Expand Down
138 changes: 137 additions & 1 deletion shared/lib/eval.test.js
Original file line number Diff line number Diff line change
Expand Up @@ -26,7 +26,7 @@ describe('evaluateTask', () => {

// Core EvalRecord fields from eval-schema.ts
assert.ok(result.id, 'should have a UUID id');
assert.equal(result.schemaVersion, '1.8.0');
assert.equal(result.schemaVersion, '1.9.0');
assert.equal(result.originalPrompt, 'Add a loading spinner');
assert.ok(result.modelId);
assert.ok(result.modelVersion);
Expand Down Expand Up @@ -91,6 +91,142 @@ describe('evaluateTask', () => {
assert.equal(result.scoreBand, 'Assisted Success');
});

// When the judge's JSON response includes a well-formed planCritique with all
// five dimensions, evaluateTask stores it verbatim in result.metadata and the
// stage scores remain untouched.
it('stores planCritique in metadata when the judge returns it', async () => {
const validResponse = JSON.stringify({
score: 0.88,
rationale: 'Implementation succeeded and the plan was mostly strong.',
interventionFlags: [],
stageScores: {
expansion: { score: 0.9, rationale: 'Spec was clear.' },
plan: { score: 0.84, rationale: 'Plan mostly identified the right work.' },
implementation: { score: 0.9, rationale: 'Code landed cleanly.' },
review: { score: 0.85, rationale: 'Review coverage was good.' },
},
planCritique: {
component_boundaries: {
score: 0.9,
rationale: 'The plan targeted the correct eval prompt, parser, and schema layers.',
},
invariant_coverage: {
score: 0.75,
rationale: 'It identified the key optional-field compatibility constraint.',
},
approach_soundness: {
score: 0.85,
rationale: 'The approach was viable without adding extra judge calls.',
},
missed_patches: {
score: 0.8,
rationale: 'Only minor parser cleanup was needed during implementation.',
},
overall: {
score: 0.82,
rationale: 'The plan provided a solid implementation guide.',
},
},
});

const result = await evaluateTask(
{
taskPrompt: 'Add plan critique to evals',
prReviewOutput: 'Changes are correct',
planContent: 'Implement schema, prompt, parser, and tests.',
},
undefined,
// Inject a stubbed model call so no real API request is made.
{ _callFn: mockCallFn(validResponse) }
);

// The critique must round-trip into metadata exactly as the judge sent it.
assert.deepEqual(result.metadata.planCritique, {
component_boundaries: {
score: 0.9,
rationale: 'The plan targeted the correct eval prompt, parser, and schema layers.',
},
invariant_coverage: {
score: 0.75,
rationale: 'It identified the key optional-field compatibility constraint.',
},
approach_soundness: {
score: 0.85,
rationale: 'The approach was viable without adding extra judge calls.',
},
missed_patches: {
score: 0.8,
rationale: 'Only minor parser cleanup was needed during implementation.',
},
overall: {
score: 0.82,
rationale: 'The plan provided a solid implementation guide.',
},
});
// Stage scores are stored independently of the critique.
assert.equal(result.metadata.stageScores.plan.score, 0.84);
});

// When the judge response has stage scores but no planCritique key at all,
// the metadata must not contain a planCritique property (absent, not null).
it('omits planCritique when the judge does not return it', async () => {
const validResponse = JSON.stringify({
score: 0.8,
rationale: 'Good execution.',
interventionFlags: [],
stageScores: {
expansion: { score: 0.8, rationale: 'Adequate.' },
plan: { score: 0.78, rationale: 'Reasonable inferred planning.' },
implementation: { score: 0.82, rationale: 'Correct result.' },
review: { score: 0.79, rationale: 'No major misses.' },
},
});

const result = await evaluateTask(
{
taskPrompt: 'Task without saved plan artifact',
prReviewOutput: 'Clean',
},
undefined,
{ _callFn: mockCallFn(validResponse) }
);

// 'in' check distinguishes a missing key from one set to undefined/null.
assert.equal('planCritique' in result.metadata, false);
});

// A planCritique containing any invalid dimension (here component_boundaries
// has score 1.2, outside the 0.0–1.0 range) must be dropped as a whole:
// the other, individually valid dimensions are discarded along with it.
it('ignores invalid planCritique dimensions gracefully', async () => {
const validResponse = JSON.stringify({
score: 0.73,
rationale: 'Task completed.',
interventionFlags: [],
planCritique: {
component_boundaries: {
score: 1.2,
rationale: 'Out of range score should invalidate the object.',
},
invariant_coverage: {
score: 0.7,
rationale: 'Valid but should be dropped with the invalid object.',
},
approach_soundness: {
score: 0.75,
rationale: 'Valid but incomplete overall object.',
},
missed_patches: {
score: 0.8,
rationale: 'Valid but incomplete overall object.',
},
overall: {
score: 0.74,
rationale: 'Valid but incomplete overall object.',
},
},
});

const result = await evaluateTask(
{
taskPrompt: 'Task with malformed plan critique',
prReviewOutput: 'Completed',
},
undefined,
{ _callFn: mockCallFn(validResponse) }
);

// All-or-nothing: no planCritique key survives validation failure.
assert.equal('planCritique' in result.metadata, false);
});

it('throws immediately on malformed JSON response', async () => {
const callFn = mockCallFn('not json at all');

Expand Down
Loading
Loading