Skip to content
Merged
Show file tree
Hide file tree
Changes from all commits
Commits
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
9 changes: 7 additions & 2 deletions shared/lib/agent-adapters.sh
Original file line number Diff line number Diff line change
Expand Up @@ -1606,6 +1606,8 @@ agent_pane_is_ready() {
sleep 0.5
fi

local current_command
current_command=$(_pane_current_command "$target")
local pane_pid
pane_pid=$(tmux display-message -t "$target" -p '#{pane_pid}' 2>/dev/null || echo "")
if [[ -z "$pane_pid" ]]; then
Expand All @@ -1615,8 +1617,6 @@ agent_pane_is_ready() {
return 0
fi

local current_command
current_command=$(_pane_current_command "$target")
if _pane_command_is_shell "$current_command"; then
local children
children=$(_pane_child_count "$target")
Expand Down Expand Up @@ -1650,6 +1650,7 @@ agent_verify_launch() {
attempts=$(awk "BEGIN { v = $max_wait / $poll_interval; if (v < 1) v = 1; printf \"%d\", (v == int(v) ? v : int(v) + 1) }")

local attempt=1
local introspection_available=0
while (( attempt <= attempts )); do
local current_command children state_changed=0
current_command=$(_pane_current_command "$target")
Expand All @@ -1660,6 +1661,10 @@ agent_verify_launch() {
return 0
fi

if [[ -n "$current_command" || -n "$children" ]]; then
introspection_available=1
fi

if [[ -n "$baseline_command" ]] || [[ -n "$baseline_children" ]]; then
if [[ "$current_command" != "$baseline_command" ]] || [[ "$children" != "${baseline_children:-}" ]]; then
state_changed=1
Expand Down
41 changes: 41 additions & 0 deletions shared/lib/eval-record-builder.test.ts
Original file line number Diff line number Diff line change
Expand Up @@ -231,6 +231,47 @@ describe('eval-record-builder', () => {
});

describe('enrichEvalRecord', () => {
// Verifies that enrichEvalRecord lifts metadata.planCritique into the
// normalized plan stage outcome, alongside the plan stage score/rationale
// taken from metadata.stageScores.plan.
it('attaches planCritique to the normalized plan stage outcome', () => {
baseRecord.metadata = {
stageScores: {
plan: {
score: 0.81,
rationale: 'The plan covered the right implementation areas.',
},
},
planCritique: {
component_boundaries: {
score: 0.9,
rationale: 'The plan identified the correct component boundary.',
},
invariant_coverage: {
score: 0.7,
rationale: 'It captured the main compatibility invariant.',
},
approach_soundness: {
score: 0.8,
rationale: 'The proposed approach was viable.',
},
missed_patches: {
score: 0.78,
rationale: 'Implementation needed only minor follow-up fixes.',
},
overall: {
score: 0.8,
rationale: 'Overall the plan was a useful guide.',
},
},
};

// Empty second argument: no additional metadata overrides for this case.
enrichEvalRecord(baseRecord, {});

// The critique object must be attached verbatim (same reference/shape)
// next to the plan stage's score and rationale.
expect(baseRecord.stageOutcomes?.plan).toEqual({
score: 0.81,
rationale: 'The plan covered the right implementation areas.',
planCritique: baseRecord.metadata.planCritique,
});
});

it('should attach all metadata when provided', () => {
const metadata = {
agentType: 'codex',
Expand Down
8 changes: 6 additions & 2 deletions shared/lib/eval-record-builder.ts
Original file line number Diff line number Diff line change
Expand Up @@ -15,6 +15,7 @@

import type {
EvalRecord,
PlanCritique,
TaskContext,
RepoContext,
StageOutcomes,
Expand Down Expand Up @@ -157,7 +158,8 @@ export function attachWorkflowCostMetadata(
*/
export function attachStageOutcomes(
record: EvalRecord,
stageScores?: Record<string, { score: number; rationale: string }>
stageScores?: Record<string, { score: number; rationale: string }>,
planCritique?: PlanCritique,
): void {
if (!stageScores || Object.keys(stageScores).length === 0) {
return;
Expand All @@ -177,6 +179,7 @@ export function attachStageOutcomes(
stageOutcomes.plan = {
score: stageScores.plan.score,
rationale: stageScores.plan.rationale,
...(planCritique && { planCritique }),
};
}

Expand Down Expand Up @@ -345,5 +348,6 @@ export function enrichEvalRecord(record: EvalRecord, metadata: EvalRecordMetadat
const stageScores = record.metadata?.stageScores as
| Record<string, { score: number; rationale: string }>
| undefined;
attachStageOutcomes(record, stageScores);
const planCritique = record.metadata?.planCritique as PlanCritique | undefined;
attachStageOutcomes(record, stageScores, planCritique);
}
2 changes: 1 addition & 1 deletion shared/lib/eval-schema.test.ts
Original file line number Diff line number Diff line change
Expand Up @@ -1000,7 +1000,7 @@ test('Record with minimal outcomes (only required fields) validates', () => {
test('Record with fallbackEvent validates and round-trips through JSON serialization', () => {
const record: EvalRecord = {
...scenarios[0].record,
schemaVersion: '1.8.0',
schemaVersion: '1.9.0',
fallbackEvent: {
schema_version: '1.0',
preferred_model: 'model-a',
Expand Down
33 changes: 33 additions & 0 deletions shared/lib/eval-schema.ts
Original file line number Diff line number Diff line change
Expand Up @@ -27,6 +27,8 @@
* fields to track cost budget constraint violations during routing (HOK-1350)
* - **1.8.0**: Added optional `manifestRef` for per-run resource manifest
* attribution (HOK-1378)
* - **1.9.0**: Added optional `planCritique` to capture explicit planning
* quality dimensions from the eval judge (HOK-1391)
*
* @module eval-schema
*/
Expand Down Expand Up @@ -616,6 +618,37 @@ export interface StageScore {
score: number;
/** 1-2 sentence attribution rationale */
rationale: string;
/** Detailed plan critique when available for the plan stage */
planCritique?: PlanCritique;
}

/**
 * A single plan-critique rubric dimension.
 *
 * Captures both the normalized score and the judge's rationale so plan
 * quality can be compared directly across models.
 *
 * Both fields are required. NOTE(review): the eval parser appears to treat
 * a dimension score outside 0.0–1.0 as invalidating the entire critique
 * object rather than just this dimension — confirm against the parser.
 */
export interface PlanCritiqueDimension {
/** Quality score 0.0–1.0 for this planning dimension */
score: number;
/** 1-2 sentence rationale for the score */
rationale: string;
}

/**
 * Explicit critique of plan quality across the key planning dimensions.
 *
 * Introduced in schema version 1.9.0 (HOK-1391). All five dimensions are
 * required; an incomplete or partially invalid critique is expected to be
 * dropped as a whole rather than stored partially.
 */
export interface PlanCritique {
/** Whether the plan chose the right component/service boundaries */
component_boundaries: PlanCritiqueDimension;
/** Whether the plan surfaced key invariants and constraints */
invariant_coverage: PlanCritiqueDimension;
/** Whether the proposed approach was viable and correct */
approach_soundness: PlanCritiqueDimension;
/** Whether implementation had to patch around plan gaps */
missed_patches: PlanCritiqueDimension;
/** Aggregate plan-quality assessment */
overall: PlanCritiqueDimension;
}

/**
Expand Down
138 changes: 137 additions & 1 deletion shared/lib/eval.test.js
Original file line number Diff line number Diff line change
Expand Up @@ -26,7 +26,7 @@ describe('evaluateTask', () => {

// Core EvalRecord fields from eval-schema.ts
assert.ok(result.id, 'should have a UUID id');
assert.equal(result.schemaVersion, '1.8.0');
assert.equal(result.schemaVersion, '1.9.0');
assert.equal(result.originalPrompt, 'Add a loading spinner');
assert.ok(result.modelId);
assert.ok(result.modelVersion);
Expand Down Expand Up @@ -91,6 +91,142 @@ describe('evaluateTask', () => {
assert.equal(result.scoreBand, 'Assisted Success');
});

// When the judge's JSON response includes a well-formed planCritique with all
// five dimensions, evaluateTask stores it verbatim in result.metadata and the
// stage scores remain untouched.
it('stores planCritique in metadata when the judge returns it', async () => {
const validResponse = JSON.stringify({
score: 0.88,
rationale: 'Implementation succeeded and the plan was mostly strong.',
interventionFlags: [],
stageScores: {
expansion: { score: 0.9, rationale: 'Spec was clear.' },
plan: { score: 0.84, rationale: 'Plan mostly identified the right work.' },
implementation: { score: 0.9, rationale: 'Code landed cleanly.' },
review: { score: 0.85, rationale: 'Review coverage was good.' },
},
planCritique: {
component_boundaries: {
score: 0.9,
rationale: 'The plan targeted the correct eval prompt, parser, and schema layers.',
},
invariant_coverage: {
score: 0.75,
rationale: 'It identified the key optional-field compatibility constraint.',
},
approach_soundness: {
score: 0.85,
rationale: 'The approach was viable without adding extra judge calls.',
},
missed_patches: {
score: 0.8,
rationale: 'Only minor parser cleanup was needed during implementation.',
},
overall: {
score: 0.82,
rationale: 'The plan provided a solid implementation guide.',
},
},
});

const result = await evaluateTask(
{
taskPrompt: 'Add plan critique to evals',
prReviewOutput: 'Changes are correct',
planContent: 'Implement schema, prompt, parser, and tests.',
},
undefined,
// Inject a stubbed model call so no real API request is made.
{ _callFn: mockCallFn(validResponse) }
);

// The critique must round-trip into metadata exactly as the judge sent it.
assert.deepEqual(result.metadata.planCritique, {
component_boundaries: {
score: 0.9,
rationale: 'The plan targeted the correct eval prompt, parser, and schema layers.',
},
invariant_coverage: {
score: 0.75,
rationale: 'It identified the key optional-field compatibility constraint.',
},
approach_soundness: {
score: 0.85,
rationale: 'The approach was viable without adding extra judge calls.',
},
missed_patches: {
score: 0.8,
rationale: 'Only minor parser cleanup was needed during implementation.',
},
overall: {
score: 0.82,
rationale: 'The plan provided a solid implementation guide.',
},
});
// Stage scores are stored independently of the critique.
assert.equal(result.metadata.stageScores.plan.score, 0.84);
});

// When the judge response has stage scores but no planCritique key at all,
// the metadata must not contain a planCritique property (absent, not null).
it('omits planCritique when the judge does not return it', async () => {
const validResponse = JSON.stringify({
score: 0.8,
rationale: 'Good execution.',
interventionFlags: [],
stageScores: {
expansion: { score: 0.8, rationale: 'Adequate.' },
plan: { score: 0.78, rationale: 'Reasonable inferred planning.' },
implementation: { score: 0.82, rationale: 'Correct result.' },
review: { score: 0.79, rationale: 'No major misses.' },
},
});

const result = await evaluateTask(
{
taskPrompt: 'Task without saved plan artifact',
prReviewOutput: 'Clean',
},
undefined,
{ _callFn: mockCallFn(validResponse) }
);

// 'in' check distinguishes a missing key from one set to undefined/null.
assert.equal('planCritique' in result.metadata, false);
});

// A planCritique containing any invalid dimension (here component_boundaries
// has score 1.2, outside the 0.0–1.0 range) must be dropped as a whole:
// the other, individually valid dimensions are discarded along with it.
it('ignores invalid planCritique dimensions gracefully', async () => {
const validResponse = JSON.stringify({
score: 0.73,
rationale: 'Task completed.',
interventionFlags: [],
planCritique: {
component_boundaries: {
score: 1.2,
rationale: 'Out of range score should invalidate the object.',
},
invariant_coverage: {
score: 0.7,
rationale: 'Valid but should be dropped with the invalid object.',
},
approach_soundness: {
score: 0.75,
rationale: 'Valid but incomplete overall object.',
},
missed_patches: {
score: 0.8,
rationale: 'Valid but incomplete overall object.',
},
overall: {
score: 0.74,
rationale: 'Valid but incomplete overall object.',
},
},
});

const result = await evaluateTask(
{
taskPrompt: 'Task with malformed plan critique',
prReviewOutput: 'Completed',
},
undefined,
{ _callFn: mockCallFn(validResponse) }
);

// All-or-nothing: no planCritique key survives validation failure.
assert.equal('planCritique' in result.metadata, false);
});

it('throws immediately on malformed JSON response', async () => {
const callFn = mockCallFn('not json at all');

Expand Down
Loading
Loading