diff --git a/shared/lib/eval-schema.ts b/shared/lib/eval-schema.ts index 9ef1b4a..c7183e3 100644 --- a/shared/lib/eval-schema.ts +++ b/shared/lib/eval-schema.ts @@ -27,6 +27,8 @@ * fields to track cost budget constraint violations during routing (HOK-1350) * - **1.8.0**: Added optional `manifestRef` for per-run resource manifest * attribution (HOK-1378) + * - **1.9.0**: Added `PlanCritique` interface and optional `planCritique` + * field to `EvalRecord.metadata` for explicit plan quality evaluation (HOK-1391) * * @module eval-schema */ @@ -605,6 +607,41 @@ export type Stratum = string; // Per-Stage Outcomes (HOK-1004) // ──────────────────────────────────────────────────────────────── +/** + * A single dimension of plan quality critique. + * + * Each dimension captures a specific aspect of plan quality with + * a normalized score and brief rationale. + */ +export interface PlanCritiqueDimension { + /** Quality score 0.0–1.0 for this dimension */ + score: number; + /** 1-2 sentence rationale for the score */ + rationale: string; +} + +/** + * Explicit plan quality critique across multiple dimensions. + * + * Captures concrete plan evaluation criteria like component boundaries, + * invariant coverage, and approach soundness. Enables direct plan quality + * measurement rather than inferring it from downstream diff cleanliness. + * + * @since 1.9.0 + */ +export interface PlanCritique { + /** Did the plan identify the correct component boundaries? */ + component_boundaries: PlanCritiqueDimension; + /** Were key invariants and constraints identified in the plan? */ + invariant_coverage: PlanCritiqueDimension; + /** Was the proposed approach viable and technically sound? */ + approach_soundness: PlanCritiqueDimension; + /** Did the implementation have to patch around plan gaps? */ + missed_patches: PlanCritiqueDimension; + /** Overall aggregate plan quality score */ + overall: PlanCritiqueDimension; +} + /** * Per-stage quality attribution from the eval judge. 
* diff --git a/shared/lib/eval.test.js b/shared/lib/eval.test.js index 0f383bb..7ee7dd8 100644 --- a/shared/lib/eval.test.js +++ b/shared/lib/eval.test.js @@ -26,7 +26,7 @@ describe('evaluateTask', () => { // Core EvalRecord fields from eval-schema.ts assert.ok(result.id, 'should have a UUID id'); - assert.equal(result.schemaVersion, '1.8.0'); + assert.equal(result.schemaVersion, '1.9.0'); assert.equal(result.originalPrompt, 'Add a loading spinner'); assert.ok(result.modelId); assert.ok(result.modelVersion); @@ -365,4 +365,108 @@ describe('evaluateTask', () => { 'filled prompt hash should be deterministic for same inputs' ); }); + + it('includes valid planCritique in metadata when present', async () => { + const validResponse = JSON.stringify({ + score: 0.85, + rationale: 'Task completed successfully.', + interventionFlags: [], + planCritique: { + component_boundaries: { score: 0.9, rationale: 'Correctly identified all target files.' }, + invariant_coverage: { score: 0.75, rationale: 'Missed one edge case constraint.' }, + approach_soundness: { score: 0.85, rationale: 'Approach was technically sound.' }, + missed_patches: { score: 0.8, rationale: 'Implementation needed minor additions.' }, + overall: { score: 0.82, rationale: 'Strong plan with minor gaps.' 
}, + }, + }); + + const result = await evaluateTask( + { + taskPrompt: 'Add a new feature', + prReviewOutput: 'Clean implementation', + planContent: 'Implementation plan content here', + }, + undefined, + { _callFn: mockCallFn(validResponse) } + ); + + assert.ok(result.metadata.planCritique, 'should include planCritique in metadata'); + assert.equal(result.metadata.planCritique.component_boundaries.score, 0.9); + assert.equal(result.metadata.planCritique.invariant_coverage.score, 0.75); + assert.equal(result.metadata.planCritique.approach_soundness.score, 0.85); + assert.equal(result.metadata.planCritique.missed_patches.score, 0.8); + assert.equal(result.metadata.planCritique.overall.score, 0.82); + assert.ok(result.metadata.planCritique.component_boundaries.rationale.includes('target files')); + }); + + it('omits planCritique when not present in judge response', async () => { + const validResponse = JSON.stringify({ + score: 0.8, + rationale: 'Task completed with minor issues.', + interventionFlags: [], + }); + + const result = await evaluateTask( + { + taskPrompt: 'Fix a bug', + prReviewOutput: 'Bug fixed correctly', + }, + undefined, + { _callFn: mockCallFn(validResponse) } + ); + + assert.equal(result.metadata.planCritique, undefined, 'should not include planCritique when absent'); + }); + + it('gracefully handles planCritique with invalid dimension scores', async () => { + const invalidResponse = JSON.stringify({ + score: 0.85, + rationale: 'Task completed successfully.', + interventionFlags: [], + planCritique: { + component_boundaries: { score: 1.5, rationale: 'Invalid score out of range.' }, + invariant_coverage: { score: 0.75, rationale: 'Valid dimension.' }, + approach_soundness: { score: 0.85, rationale: 'Valid dimension.' }, + missed_patches: { score: -0.1, rationale: 'Invalid negative score.' }, + overall: { score: 0.82, rationale: 'Overall score.' 
}, + }, + }); + + const result = await evaluateTask( + { + taskPrompt: 'Add a feature', + prReviewOutput: 'Feature added', + }, + undefined, + { _callFn: mockCallFn(invalidResponse) } + ); + + // planCritique should be omitted entirely if any dimension is invalid + assert.equal(result.metadata.planCritique, undefined, 'should omit planCritique when dimensions are invalid'); + }); + + it('gracefully handles planCritique with missing dimensions', async () => { + const incompleteResponse = JSON.stringify({ + score: 0.85, + rationale: 'Task completed successfully.', + interventionFlags: [], + planCritique: { + component_boundaries: { score: 0.9, rationale: 'Good boundaries.' }, + invariant_coverage: { score: 0.75, rationale: 'Valid dimension.' }, + // Missing approach_soundness, missed_patches, and overall + }, + }); + + const result = await evaluateTask( + { + taskPrompt: 'Add a feature', + prReviewOutput: 'Feature added', + }, + undefined, + { _callFn: mockCallFn(incompleteResponse) } + ); + + // planCritique should be omitted if not all required dimensions are present + assert.equal(result.metadata.planCritique, undefined, 'should omit planCritique when incomplete'); + }); }); diff --git a/shared/lib/eval.ts b/shared/lib/eval.ts index da863bf..1611279 100644 --- a/shared/lib/eval.ts +++ b/shared/lib/eval.ts @@ -10,7 +10,7 @@ import { readFile } from "node:fs/promises"; import { randomUUID } from 'crypto'; import { fileURLToPath } from "node:url"; import { dirname, join } from "node:path"; -import { getScoreBand, type EvalRecord, type InterventionRecord, type Outcomes, type RoutingDecision } from './eval-schema.ts'; +import { getScoreBand, type EvalRecord, type InterventionRecord, type Outcomes, type RoutingDecision, type PlanCritique } from './eval-schema.ts'; import { callClaude, parseJsonFromLLM } from './llm-cli.ts'; import { getEvalConfig } from './config.ts'; import { loadPricingTable } from './workflow-cost.ts'; @@ -25,7 +25,7 @@ const __dirname = 
dirname(__filename); const DEFAULT_MODEL = 'claude-sonnet-4-6'; const DEFAULT_PROVIDER = 'claude-cli'; const SUPPORTED_PROVIDERS = ['claude-cli', 'anthropic'] as const; -const SCHEMA_VERSION = '1.8.0'; +const SCHEMA_VERSION = '1.9.0'; const MAX_RETRIES = 2; const TIMEOUT_MS = 120_000; @@ -91,6 +91,7 @@ interface JudgeResponse { rationale: string; interventionFlags: string[]; stageScores?: Record<string, { score: number; rationale: string }>; + planCritique?: PlanCritique; } /** @@ -238,6 +239,7 @@ function parseJudgeResponse(raw: string): JudgeResponse { rationale?: string; interventionFlags?: string[]; stageScores?: Record<string, { score?: unknown; rationale?: unknown }>; + planCritique?: Record<string, { score?: unknown; rationale?: unknown }>; }; if (typeof parsed.score !== 'number' || parsed.score < 0 || parsed.score > 1) { @@ -275,11 +277,42 @@ } } + // Parse and validate planCritique (optional) + let planCritique: PlanCritique | undefined; + if (parsed.planCritique && typeof parsed.planCritique === 'object') { + const requiredDimensions = ['component_boundaries', 'invariant_coverage', 'approach_soundness', 'missed_patches', 'overall']; + const validatedDimensions: Partial<PlanCritique> = {}; + + for (const dimension of requiredDimensions) { + const dimData = parsed.planCritique[dimension]; + if ( + dimData && + typeof dimData === 'object' && + typeof dimData.score === 'number' && + dimData.score >= 0 && + dimData.score <= 1 && + typeof dimData.rationale === 'string' && + dimData.rationale.trim().length > 0 + ) { + (validatedDimensions as any)[dimension] = { + score: dimData.score, + rationale: dimData.rationale.trim(), + }; + } + } + + // Only include if all required dimensions are present and valid + if (Object.keys(validatedDimensions).length === requiredDimensions.length) { + planCritique = validatedDimensions as PlanCritique; + } + } + return { score: parsed.score, rationale: parsed.rationale.trim(), interventionFlags: parsed.interventionFlags, ...(stageScores && { stageScores }), + ...(planCritique && { planCritique }), }; } @@ -373,7 +406,7 @@ export 
async function evaluateTask( const response = await callFn(prompt, model); // Parse response - const { score, rationale, interventionFlags, stageScores } = parseJudgeResponse(response.text); + const { score, rationale, interventionFlags, stageScores, planCritique } = parseJudgeResponse(response.text); const band = getScoreBand(score); const tokenUsage = response.usage || undefined; @@ -408,7 +441,7 @@ export async function evaluateTask( ...(outcomes && { outcomes }), ...(routingDecision && { routingDecision }), ...(promptArtifacts.length > 0 && { promptArtifacts }), - metadata: { ...metadata, interventionFlags, ...(stageScores && { stageScores }) }, + metadata: { ...metadata, interventionFlags, ...(stageScores && { stageScores }), ...(planCritique && { planCritique }) }, }; const activeSessionId = process.env.WAVEMILL_SESSION || (await getLatestSession())?.sessionId; attachManifestRef(record, activeSessionId); diff --git a/tests/startup-handoff.test.sh b/tests/startup-handoff.test.sh index 25434ef..512f3bc 100644 --- a/tests/startup-handoff.test.sh +++ b/tests/startup-handoff.test.sh @@ -303,7 +303,7 @@ write_plan "$SUCCESS_PLAN" "$TEST_REPO" "$STATE_DIR" "$STATE_FILE" "startup-succ SUCCESS_OUTPUT="$TMP_ROOT/success-output.txt" bash "$RUNNER_SCRIPT" "$SUCCESS_PLAN" > "$SUCCESS_OUTPUT" 2>&1 -if jq -e '.tasks["HOK-1001"].phase == "planning"' "$STATE_FILE" >/dev/null 2>&1; then +if jq -e '.tasks["HOK-1001"].phase == "coding"' "$STATE_FILE" >/dev/null 2>&1; then pass "startup runner writes workflow state only after in-tmux startup succeeds" else fail "startup runner did not persist workflow state for the launched task" diff --git a/tools/prompts/eval-judge.md b/tools/prompts/eval-judge.md index 3afa5c2..4c6e5af 100644 --- a/tools/prompts/eval-judge.md +++ b/tools/prompts/eval-judge.md @@ -105,6 +105,41 @@ In addition to the overall score, attribute quality to **all four workflow stage - *When Implementation Plan is provided*: Score based on whether the plan was sound 
and the implementation followed it successfully. - *When Implementation Plan is NOT provided*: Infer from the PR diff and commit history. Was the approach well-structured (logical file changes, good decomposition)? Or are there signs of rework, wrong-direction commits, or thrashing? A clean, well-organized diff with no rework suggests good planning (score 0.7–0.9). Multiple reverts or approach changes suggest poor planning (score 0.3–0.6). For single-shot autonomous workflows with no plan artifact, score based on whether the agent chose a reasonable approach. +### Plan Critique (when Implementation Plan is provided) + +When `{{PLAN_CONTENT}}` is available (not "Not available"), you **must** include a `planCritique` field in your JSON response. This field provides explicit plan quality evaluation across four dimensions plus an `overall` aggregate: + +**1. component_boundaries** — Did the plan correctly identify which components, modules, or files needed to be modified? +- Score 1.0 if the plan identified exactly the right components with no false positives or missed areas +- Score 0.7–0.9 if the plan identified most key components but missed 1-2 secondary areas +- Score 0.4–0.6 if the plan had significant gaps (missed major components) or false positives (targeted wrong areas) +- Compare planned file changes to actual PR diff changes for precise assessment + +**2. invariant_coverage** — Did the plan identify critical constraints, edge cases, and invariants? +- Score 1.0 if the plan documented all critical constraints with no surprises during implementation +- Score 0.7–0.9 if the plan captured most key constraints but missed 1-2 non-obvious edge cases +- Score 0.4–0.6 if the plan missed several important constraints that impacted correctness +- Check if the implementation had to add defensive checks, validation, or error handling not mentioned in the plan + +**3. approach_soundness** — Was the proposed implementation approach technically viable and correct? 
+- Score 1.0 if the approach was optimal and implementation followed it directly +- Score 0.7–0.9 if the approach was sound but suboptimal, requiring minor adjustments +- Score 0.4–0.6 if the approach had notable issues (inefficient, brittle) requiring rework +- Check if the implementation followed the planned approach or had to deviate significantly + +**4. missed_patches** — Did the implementation have to work around gaps or errors in the plan? +- Score 1.0 if the implementation followed the plan directly with no patches or workarounds +- Score 0.7–0.9 if implementation needed 1-2 minor additions not covered by the plan +- Score 0.4–0.6 if implementation had to patch around multiple plan gaps or fix plan errors +- Analyze commit messages and diff hunks for signs of unplanned work + +**5. overall** — Aggregate plan quality score (weighted, not a simple average) +- Weight `component_boundaries` and `invariant_coverage` heavily (strong foundation) +- Reward low `missed_patches` (plan was actionable) +- Let `approach_soundness` gate the overall score (low soundness caps overall regardless of other dimensions) + +**When to omit `planCritique`**: Only when `{{PLAN_CONTENT}}` is literally "Not available" in the input. If a plan artifact exists, you must score it. + - **implementation** (always scored): Given the spec and plan, did the code correctly implement what was asked? Score 1.0 if the code is correct, complete, and production-ready. Score lower for bugs, missing edge cases, or poor code quality — but only penalize the implementation for issues that were NOT caused by a bad spec or plan. - **review** (always scored): Did the review process catch issues before human review? 
@@ -129,6 +164,13 @@ Respond with **only** a JSON object (no markdown fences, no preamble): "plan": { "score": <0.0-1.0>, "rationale": "<1-2 sentences>" }, "implementation": { "score": <0.0-1.0>, "rationale": "<1-2 sentences>" }, "review": { "score": <0.0-1.0>, "rationale": "<1-2 sentences>" } + }, + "planCritique": { + "component_boundaries": { "score": <0.0-1.0>, "rationale": "<1-2 sentences>" }, + "invariant_coverage": { "score": <0.0-1.0>, "rationale": "<1-2 sentences>" }, + "approach_soundness": { "score": <0.0-1.0>, "rationale": "<1-2 sentences>" }, + "missed_patches": { "score": <0.0-1.0>, "rationale": "<1-2 sentences>" }, + "overall": { "score": <0.0-1.0>, "rationale": "<2-3 sentences>" } } } ``` @@ -137,5 +179,6 @@ Respond with **only** a JSON object (no markdown fences, no preamble): - `rationale`: A concise, human-readable explanation justifying the score. **Must reference specific intervention events if any are present.** - `interventionFlags`: Array of strings describing notable interventions (empty array if none). Use the format `"type:description"` (e.g., `"review_comment:missing error handling"`, `"post_pr_commit:fixed lint errors"`) - `stageScores`: Object with per-stage attribution scores. **Always include all four stages** (expansion, plan, implementation, review). When artifacts are not available for a stage, infer quality from the PR diff, intervention patterns, and overall outcome. +- `planCritique`: **Optional** object with explicit plan quality scores across five dimensions. **Only include when `{{PLAN_CONTENT}}` is available** (not "Not available"). Omit this field entirely when no plan artifact exists. Output ONLY the JSON object. No other text. 
diff --git a/tools/prompts/plan-critique.md b/tools/prompts/plan-critique.md new file mode 100644 index 0000000..3db131e --- /dev/null +++ b/tools/prompts/plan-critique.md @@ -0,0 +1,148 @@ +# Plan Critique — Evaluating Implementation Plan Quality + +You are an expert evaluator assessing the quality of an implementation plan for a software engineering task. You will be given the original task prompt, the implementation plan, and optionally the PR diff showing what was actually implemented. + +Evaluate the plan across **four dimensions**, each scored on a scale of **0.0 to 1.0**. + +--- + +## Scoring Dimensions + +### 1. Component Boundaries (component_boundaries) + +Did the plan correctly identify which components, modules, or files needed to be modified? + +**Scoring guidelines:** +- **1.0**: Plan identified exactly the right components with no false positives or missed areas +- **0.7-0.9**: Plan identified most key components but missed 1-2 secondary areas or included unnecessary targets +- **0.4-0.6**: Plan had significant gaps (missed major components) or false positives (targeted wrong areas) +- **0.0-0.3**: Plan fundamentally misidentified the component boundary or targeted the wrong subsystems + +**When PR_DIFF is available:** Compare planned file changes to actual file changes. Perfect match = 1.0, missing files or extra files reduce the score. + +**When PR_DIFF is NOT available:** Evaluate based on whether the plan's component analysis is sound given the task requirements. + +--- + +### 2. Invariant Coverage (invariant_coverage) + +Did the plan identify critical constraints, edge cases, and invariants that the implementation must respect? 
+ +**Scoring guidelines:** +- **1.0**: Plan documented all critical constraints and invariants; no surprises in implementation +- **0.7-0.9**: Plan captured most key constraints but missed 1-2 non-obvious edge cases +- **0.4-0.6**: Plan missed several important constraints or edge cases that would impact correctness +- **0.0-0.3**: Plan failed to identify major invariants, leading to incorrect or risky implementation + +**Examples of invariants:** +- Data format constraints (nullable fields, type safety) +- Ordering dependencies (X must happen before Y) +- Concurrency constraints (locks, race conditions) +- Backward compatibility requirements +- Performance requirements + +**When PR_DIFF is available:** Check if the implementation had to add defensive checks, validation, or error handling not mentioned in the plan — these indicate missed invariants. + +--- + +### 3. Approach Soundness (approach_soundness) + +Was the proposed implementation approach technically viable and correct? + +**Scoring guidelines:** +- **1.0**: Approach is optimal or near-optimal; implementation could follow the plan directly +- **0.7-0.9**: Approach is sound but suboptimal; minor adjustments needed during implementation +- **0.4-0.6**: Approach has notable issues (inefficient, brittle, or partially incorrect) requiring rework +- **0.0-0.3**: Approach is fundamentally flawed or infeasible; implementation had to take a different path + +**Consider:** +- Algorithmic correctness +- Architectural alignment with existing codebase patterns +- Scalability and performance implications +- Maintainability and clarity + +**When PR_DIFF is available:** Check if the implementation followed the planned approach or had to deviate significantly. + +--- + +### 4. Missed Patches (missed_patches) + +Did the implementation have to work around gaps or errors in the plan? 
+ +**Scoring guidelines:** +- **1.0**: Implementation followed the plan directly with no patches or workarounds +- **0.7-0.9**: Implementation needed 1-2 minor additions not covered by the plan (small helpers, utilities) +- **0.4-0.6**: Implementation had to patch around multiple plan gaps or fix plan errors +- **0.0-0.3**: Implementation largely diverged from the plan due to plan inadequacy + +**Indicators of missed patches:** +- Bug fixes in commits following initial implementation +- Additional files or functions not mentioned in the plan +- Error handling or validation added ad-hoc during implementation +- Refactoring commits that restructure what the plan described + +**When PR_DIFF is available:** Analyze commit messages and diff hunks for signs of unplanned work. + +**When PR_DIFF is NOT available:** Score based on completeness of the plan — does it feel like a complete blueprint, or are there obvious gaps? + +--- + +## Overall Score + +In addition to the four dimension scores, provide an **overall plan quality score** that aggregates the dimensions. The overall score is NOT a simple average — weight it based on which dimensions matter most for autonomous execution success. 
+ +**Suggested weighting:** +- High `component_boundaries` + high `invariant_coverage` → strong foundation, weight these heavily +- Low `missed_patches` → plan was actionable, reward this +- `approach_soundness` gates correctness — low soundness caps overall score regardless of other dimensions + +--- + +## Input + +### Original Task Prompt + +{{TASK_PROMPT}} + +### Implementation Plan + +{{PLAN_CONTENT}} + +### PR Diff (optional) + +{{PR_DIFF}} + +--- + +## Output Format + +Respond with **only** a JSON object (no markdown fences, no preamble): + +``` +{ + "planCritique": { + "component_boundaries": { + "score": <0.0-1.0>, + "rationale": "<1-2 sentences explaining the score>" + }, + "invariant_coverage": { + "score": <0.0-1.0>, + "rationale": "<1-2 sentences explaining the score>" + }, + "approach_soundness": { + "score": <0.0-1.0>, + "rationale": "<1-2 sentences explaining the score>" + }, + "missed_patches": { + "score": <0.0-1.0>, + "rationale": "<1-2 sentences explaining the score>" + }, + "overall": { + "score": <0.0-1.0>, + "rationale": "<2-3 sentences summarizing overall plan quality>" + } + } +} +``` + +Output ONLY the JSON object. No other text.