37 changes: 37 additions & 0 deletions shared/lib/eval-schema.ts
@@ -27,6 +27,8 @@
* fields to track cost budget constraint violations during routing (HOK-1350)
* - **1.8.0**: Added optional `manifestRef` for per-run resource manifest
* attribution (HOK-1378)
* - **1.9.0**: Added `PlanCritique` interface and optional `planCritique`
* field to `EvalRecord.metadata` for explicit plan quality evaluation (HOK-1391)
*
* @module eval-schema
*/
Expand Down Expand Up @@ -605,6 +607,41 @@ export type Stratum = string;
// Per-Stage Outcomes (HOK-1004)
// ────────────────────────────────────────────────────────────────

/**
* A single dimension of plan quality critique.
*
* Each dimension captures a specific aspect of plan quality with
* a normalized score and brief rationale.
*/
export interface PlanCritiqueDimension {
/** Quality score 0.0–1.0 for this dimension */
score: number;
/** 1-2 sentence rationale for the score */
rationale: string;
}

/**
* Explicit plan quality critique across multiple dimensions.
*
* Captures concrete plan evaluation criteria like component boundaries,
* invariant coverage, and approach soundness. Enables direct plan quality
* measurement rather than inferring it from downstream diff cleanliness.
*
* @since 1.9.0
*/
export interface PlanCritique {
/** Did the plan identify the correct component boundaries? */
component_boundaries: PlanCritiqueDimension;
/** Were key invariants and constraints identified in the plan? */
invariant_coverage: PlanCritiqueDimension;
/** Was the proposed approach viable and technically sound? */
approach_soundness: PlanCritiqueDimension;
/** Did the implementation avoid patching around plan gaps? (higher = fewer workarounds needed) */
missed_patches: PlanCritiqueDimension;
/** Overall aggregate plan quality score */
overall: PlanCritiqueDimension;
}
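
// A minimal sketch of a conforming value, for reference only. Scores and
// rationales are illustrative (they mirror the test fixture in eval.test.js),
// not produced by a real judge run.
const examplePlanCritique: PlanCritique = {
  component_boundaries: { score: 0.9, rationale: 'Correctly identified all target files.' },
  invariant_coverage: { score: 0.75, rationale: 'Missed one edge case constraint.' },
  approach_soundness: { score: 0.85, rationale: 'Approach was technically sound.' },
  missed_patches: { score: 0.8, rationale: 'Implementation needed minor additions.' },
  overall: { score: 0.82, rationale: 'Strong plan with minor gaps.' },
};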

/**
* Per-stage quality attribution from the eval judge.
*
106 changes: 105 additions & 1 deletion shared/lib/eval.test.js
@@ -26,7 +26,7 @@ describe('evaluateTask', () => {

// Core EvalRecord fields from eval-schema.ts
assert.ok(result.id, 'should have a UUID id');
assert.equal(result.schemaVersion, '1.8.0');
assert.equal(result.schemaVersion, '1.9.0');
assert.equal(result.originalPrompt, 'Add a loading spinner');
assert.ok(result.modelId);
assert.ok(result.modelVersion);
@@ -365,4 +365,108 @@ describe('evaluateTask', () => {
'filled prompt hash should be deterministic for same inputs'
);
});

it('includes valid planCritique in metadata when present', async () => {
const validResponse = JSON.stringify({
score: 0.85,
rationale: 'Task completed successfully.',
interventionFlags: [],
planCritique: {
component_boundaries: { score: 0.9, rationale: 'Correctly identified all target files.' },
invariant_coverage: { score: 0.75, rationale: 'Missed one edge case constraint.' },
approach_soundness: { score: 0.85, rationale: 'Approach was technically sound.' },
missed_patches: { score: 0.8, rationale: 'Implementation needed minor additions.' },
overall: { score: 0.82, rationale: 'Strong plan with minor gaps.' },
},
});

const result = await evaluateTask(
{
taskPrompt: 'Add a new feature',
prReviewOutput: 'Clean implementation',
planContent: 'Implementation plan content here',
},
undefined,
{ _callFn: mockCallFn(validResponse) }
);

assert.ok(result.metadata.planCritique, 'should include planCritique in metadata');
assert.equal(result.metadata.planCritique.component_boundaries.score, 0.9);
assert.equal(result.metadata.planCritique.invariant_coverage.score, 0.75);
assert.equal(result.metadata.planCritique.approach_soundness.score, 0.85);
assert.equal(result.metadata.planCritique.missed_patches.score, 0.8);
assert.equal(result.metadata.planCritique.overall.score, 0.82);
assert.ok(result.metadata.planCritique.component_boundaries.rationale.includes('target files'));
});

it('omits planCritique when not present in judge response', async () => {
const validResponse = JSON.stringify({
score: 0.8,
rationale: 'Task completed with minor issues.',
interventionFlags: [],
});

const result = await evaluateTask(
{
taskPrompt: 'Fix a bug',
prReviewOutput: 'Bug fixed correctly',
},
undefined,
{ _callFn: mockCallFn(validResponse) }
);

assert.equal(result.metadata.planCritique, undefined, 'should not include planCritique when absent');
});

it('gracefully handles planCritique with invalid dimension scores', async () => {
const invalidResponse = JSON.stringify({
score: 0.85,
rationale: 'Task completed successfully.',
interventionFlags: [],
planCritique: {
component_boundaries: { score: 1.5, rationale: 'Invalid score out of range.' },
invariant_coverage: { score: 0.75, rationale: 'Valid dimension.' },
approach_soundness: { score: 0.85, rationale: 'Valid dimension.' },
missed_patches: { score: -0.1, rationale: 'Invalid negative score.' },
overall: { score: 0.82, rationale: 'Overall score.' },
},
});

const result = await evaluateTask(
{
taskPrompt: 'Add a feature',
prReviewOutput: 'Feature added',
},
undefined,
{ _callFn: mockCallFn(invalidResponse) }
);

// planCritique should be omitted entirely if any dimension is invalid
assert.equal(result.metadata.planCritique, undefined, 'should omit planCritique when dimensions are invalid');
});

it('gracefully handles planCritique with missing dimensions', async () => {
const incompleteResponse = JSON.stringify({
score: 0.85,
rationale: 'Task completed successfully.',
interventionFlags: [],
planCritique: {
component_boundaries: { score: 0.9, rationale: 'Good boundaries.' },
invariant_coverage: { score: 0.75, rationale: 'Valid dimension.' },
// Missing approach_soundness, missed_patches, and overall
},
});

const result = await evaluateTask(
{
taskPrompt: 'Add a feature',
prReviewOutput: 'Feature added',
},
undefined,
{ _callFn: mockCallFn(incompleteResponse) }
);

// planCritique should be omitted if not all required dimensions are present
assert.equal(result.metadata.planCritique, undefined, 'should omit planCritique when incomplete');
});
});
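Assuming these specs run under Node's built-in test runner (consistent with the `describe`/`it` and `node:assert` usage above), the new cases can be exercised with `node --test shared/lib/eval.test.js`.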
41 changes: 37 additions & 4 deletions shared/lib/eval.ts
@@ -10,7 +10,7 @@ import { readFile } from "node:fs/promises";
import { randomUUID } from 'crypto';
import { fileURLToPath } from "node:url";
import { dirname, join } from "node:path";
import { getScoreBand, type EvalRecord, type InterventionRecord, type Outcomes, type RoutingDecision } from './eval-schema.ts';
import { getScoreBand, type EvalRecord, type InterventionRecord, type Outcomes, type RoutingDecision, type PlanCritique } from './eval-schema.ts';
import { callClaude, parseJsonFromLLM } from './llm-cli.ts';
import { getEvalConfig } from './config.ts';
import { loadPricingTable } from './workflow-cost.ts';
@@ -25,7 +25,7 @@ const __dirname = dirname(__filename);
const DEFAULT_MODEL = 'claude-sonnet-4-6';
const DEFAULT_PROVIDER = 'claude-cli';
const SUPPORTED_PROVIDERS = ['claude-cli', 'anthropic'] as const;
const SCHEMA_VERSION = '1.8.0';
const SCHEMA_VERSION = '1.9.0';
const MAX_RETRIES = 2;
const TIMEOUT_MS = 120_000;

@@ -91,6 +91,7 @@ interface JudgeResponse {
rationale: string;
interventionFlags: string[];
stageScores?: Record<string, { score: number; rationale: string }>;
planCritique?: PlanCritique;
}

/**
@@ -238,6 +239,7 @@ function parseJudgeResponse(raw: string): JudgeResponse {
rationale?: string;
interventionFlags?: string[];
stageScores?: Record<string, { score?: number; rationale?: string }>;
planCritique?: Record<string, { score?: number; rationale?: string }>;
};

if (typeof parsed.score !== 'number' || parsed.score < 0 || parsed.score > 1) {
@@ -275,11 +277,42 @@
}
}

// Parse and validate planCritique (optional)
let planCritique: PlanCritique | undefined;
if (parsed.planCritique && typeof parsed.planCritique === 'object') {
const requiredDimensions = ['component_boundaries', 'invariant_coverage', 'approach_soundness', 'missed_patches', 'overall'];
const validatedDimensions: Partial<PlanCritique> = {};

for (const dimension of requiredDimensions) {
const dimData = parsed.planCritique[dimension];
if (
dimData &&
typeof dimData === 'object' &&
typeof dimData.score === 'number' &&
dimData.score >= 0 &&
dimData.score <= 1 &&
typeof dimData.rationale === 'string' &&
dimData.rationale.trim().length > 0
) {
(validatedDimensions as any)[dimension] = {
score: dimData.score,
rationale: dimData.rationale.trim(),
};
}
}

// Only include if all required dimensions are present and valid
if (Object.keys(validatedDimensions).length === requiredDimensions.length) {
planCritique = validatedDimensions as PlanCritique;
}
}

return {
score: parsed.score,
rationale: parsed.rationale.trim(),
interventionFlags: parsed.interventionFlags,
...(stageScores && { stageScores }),
...(planCritique && { planCritique }),
};
}

@@ -373,7 +406,7 @@ export async function evaluateTask(
const response = await callFn(prompt, model);

// Parse response
const { score, rationale, interventionFlags, stageScores } = parseJudgeResponse(response.text);
const { score, rationale, interventionFlags, stageScores, planCritique } = parseJudgeResponse(response.text);
const band = getScoreBand(score);

const tokenUsage = response.usage || undefined;
@@ -408,7 +441,7 @@
...(outcomes && { outcomes }),
...(routingDecision && { routingDecision }),
...(promptArtifacts.length > 0 && { promptArtifacts }),
metadata: { ...metadata, interventionFlags, ...(stageScores && { stageScores }) },
metadata: { ...metadata, interventionFlags, ...(stageScores && { stageScores }), ...(planCritique && { planCritique }) },
};
const activeSessionId = process.env.WAVEMILL_SESSION || (await getLatestSession())?.sessionId;
attachManifestRef(record, activeSessionId);
2 changes: 1 addition & 1 deletion tests/startup-handoff.test.sh
@@ -303,7 +303,7 @@ write_plan "$SUCCESS_PLAN" "$TEST_REPO" "$STATE_DIR" "$STATE_FILE" "startup-succ
SUCCESS_OUTPUT="$TMP_ROOT/success-output.txt"
bash "$RUNNER_SCRIPT" "$SUCCESS_PLAN" > "$SUCCESS_OUTPUT" 2>&1

if jq -e '.tasks["HOK-1001"].phase == "planning"' "$STATE_FILE" >/dev/null 2>&1; then
if jq -e '.tasks["HOK-1001"].phase == "coding"' "$STATE_FILE" >/dev/null 2>&1; then
pass "startup runner writes workflow state only after in-tmux startup succeeds"
else
fail "startup runner did not persist workflow state for the launched task"
43 changes: 43 additions & 0 deletions tools/prompts/eval-judge.md
@@ -105,6 +105,41 @@ In addition to the overall score, attribute quality to **all four workflow stages**
- *When Implementation Plan is provided*: Score based on whether the plan was sound and the implementation followed it successfully.
- *When Implementation Plan is NOT provided*: Infer from the PR diff and commit history. Was the approach well-structured (logical file changes, good decomposition)? Or are there signs of rework, wrong-direction commits, or thrashing? A clean, well-organized diff with no rework suggests good planning (score 0.7–0.9). Multiple reverts or approach changes suggest poor planning (score 0.3–0.6). For single-shot autonomous workflows with no plan artifact, score based on whether the agent chose a reasonable approach.

### Plan Critique (when Implementation Plan is provided)

When `{{PLAN_CONTENT}}` is available (not "Not available"), you **must** include a `planCritique` field in your JSON response. This field provides explicit plan quality evaluation across four critique dimensions plus an overall aggregate:

**1. component_boundaries** — Did the plan correctly identify which components, modules, or files needed to be modified?
- Score 1.0 if the plan identified exactly the right components with no false positives or missed areas
- Score 0.7–0.9 if the plan identified most key components but missed 1-2 secondary areas
- Score 0.4–0.6 if the plan had significant gaps (missed major components) or false positives (targeted wrong areas)
- Compare planned file changes to actual PR diff changes for precise assessment

**2. invariant_coverage** — Did the plan identify critical constraints, edge cases, and invariants?
- Score 1.0 if the plan documented all critical constraints with no surprises during implementation
- Score 0.7–0.9 if the plan captured most key constraints but missed 1-2 non-obvious edge cases
- Score 0.4–0.6 if the plan missed several important constraints that impacted correctness
- Check if the implementation had to add defensive checks, validation, or error handling not mentioned in the plan

**3. approach_soundness** — Was the proposed implementation approach technically viable and correct?
- Score 1.0 if the approach was optimal and implementation followed it directly
- Score 0.7–0.9 if the approach was sound but suboptimal, requiring minor adjustments
- Score 0.4–0.6 if the approach had notable issues (inefficient, brittle) requiring rework
- Check if the implementation followed the planned approach or had to deviate significantly

**4. missed_patches** — Did the implementation have to work around gaps or errors in the plan?
- Score 1.0 if the implementation followed the plan directly with no patches or workarounds
- Score 0.7–0.9 if implementation needed 1-2 minor additions not covered by the plan
- Score 0.4–0.6 if implementation had to patch around multiple plan gaps or fix plan errors
- Analyze commit messages and diff hunks for signs of unplanned work

**5. overall** — Aggregate plan quality score (weighted, not a simple average)
- Weight `component_boundaries` and `invariant_coverage` heavily (strong foundation)
- Reward low `missed_patches` (plan was actionable)
- Let `approach_soundness` gate the overall score (low soundness caps overall regardless of other dimensions)
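
For concreteness, here is a minimal sketch of one aggregation consistent with this guidance; the specific weights and the gating slack are illustrative assumptions, not part of the judging contract:

```ts
// Illustrative sketch only: the prompt leaves the exact weighting to the judge.
function aggregateOverall(d: {
  component_boundaries: number;
  invariant_coverage: number;
  approach_soundness: number;
  missed_patches: number;
}): number {
  // Foundation dimensions dominate; a high missed_patches score
  // (few workarounds needed) rewards an actionable plan.
  const weighted =
    0.35 * d.component_boundaries +
    0.35 * d.invariant_coverage +
    0.1 * d.approach_soundness +
    0.2 * d.missed_patches;
  // approach_soundness gates the result: a shaky approach caps the
  // overall score no matter how strong the other dimensions are.
  return Math.min(weighted, d.approach_soundness + 0.1);
}
```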

**When to omit `planCritique`**: Only when `{{PLAN_CONTENT}}` is literally "Not available" in the input. If a plan artifact exists, you must score it.

- **implementation** (always scored): Given the spec and plan, did the code correctly implement what was asked? Score 1.0 if the code is correct, complete, and production-ready. Score lower for bugs, missing edge cases, or poor code quality — but only penalize the implementation for issues that were NOT caused by a bad spec or plan.

- **review** (always scored): Did the review process catch issues before human review?
@@ -129,6 +164,13 @@ Respond with **only** a JSON object (no markdown fences, no preamble):
"plan": { "score": <0.0-1.0>, "rationale": "<1-2 sentences>" },
"implementation": { "score": <0.0-1.0>, "rationale": "<1-2 sentences>" },
"review": { "score": <0.0-1.0>, "rationale": "<1-2 sentences>" }
},
"planCritique": {
"component_boundaries": { "score": <0.0-1.0>, "rationale": "<1-2 sentences>" },
"invariant_coverage": { "score": <0.0-1.0>, "rationale": "<1-2 sentences>" },
"approach_soundness": { "score": <0.0-1.0>, "rationale": "<1-2 sentences>" },
"missed_patches": { "score": <0.0-1.0>, "rationale": "<1-2 sentences>" },
"overall": { "score": <0.0-1.0>, "rationale": "<2-3 sentences>" }
}
}
```
@@ -137,5 +179,6 @@ Respond with **only** a JSON object (no markdown fences, no preamble):
- `rationale`: A concise, human-readable explanation justifying the score. **Must reference specific intervention events if any are present.**
- `interventionFlags`: Array of strings describing notable interventions (empty array if none). Use the format `"type:description"` (e.g., `"review_comment:missing error handling"`, `"post_pr_commit:fixed lint errors"`)
- `stageScores`: Object with per-stage attribution scores. **Always include all four stages** (expansion, plan, implementation, review). When artifacts are not available for a stage, infer quality from the PR diff, intervention patterns, and overall outcome.
- `planCritique`: **Optional** object with explicit plan quality scores across four critique dimensions plus an `overall` aggregate. **Only include when `{{PLAN_CONTENT}}` is available** (not "Not available"). Omit this field entirely when no plan artifact exists.

Output ONLY the JSON object. No other text.