diff --git a/shared/lib/agent-adapters.sh b/shared/lib/agent-adapters.sh
index 4d39318..6a9a5fe 100755
--- a/shared/lib/agent-adapters.sh
+++ b/shared/lib/agent-adapters.sh
@@ -1606,6 +1606,8 @@ agent_pane_is_ready() {
         sleep 0.5
     fi
 
+    local current_command
+    current_command=$(_pane_current_command "$target")
     local pane_pid
     pane_pid=$(tmux display-message -t "$target" -p '#{pane_pid}' 2>/dev/null || echo "")
     if [[ -z "$pane_pid" ]]; then
@@ -1615,8 +1617,6 @@ agent_pane_is_ready() {
         return 0
     fi
 
-    local current_command
-    current_command=$(_pane_current_command "$target")
     if _pane_command_is_shell "$current_command"; then
         local children
         children=$(_pane_child_count "$target")
@@ -1650,6 +1650,7 @@ agent_verify_launch() {
     attempts=$(awk "BEGIN { v = $max_wait / $poll_interval; if (v < 1) v = 1; printf \"%d\", (v == int(v) ? v : int(v) + 1) }")
 
     local attempt=1
+    local introspection_available=0
     while (( attempt <= attempts )); do
         local current_command children state_changed=0
         current_command=$(_pane_current_command "$target")
@@ -1660,6 +1661,10 @@ agent_verify_launch() {
             return 0
         fi
 
+        if [[ -n "$current_command" || -n "$children" ]]; then
+            introspection_available=1
+        fi
+
         if [[ -n "$baseline_command" ]] || [[ -n "$baseline_children" ]]; then
             if [[ "$current_command" != "$baseline_command" ]] || [[ "$children" != "${baseline_children:-}" ]]; then
                 state_changed=1
diff --git a/shared/lib/eval-record-builder.test.ts b/shared/lib/eval-record-builder.test.ts
index 5e4cc75..dac4029 100644
--- a/shared/lib/eval-record-builder.test.ts
+++ b/shared/lib/eval-record-builder.test.ts
@@ -231,6 +231,47 @@ describe('eval-record-builder', () => {
   });
 
   describe('enrichEvalRecord', () => {
+    it('attaches planCritique to the normalized plan stage outcome', () => {
+      baseRecord.metadata = {
+        stageScores: {
+          plan: {
+            score: 0.81,
+            rationale: 'The plan covered the right implementation areas.',
+          },
+        },
+        planCritique: {
+          component_boundaries: {
+            score: 0.9,
+            rationale: 'The plan identified the correct component boundary.',
+          },
+          invariant_coverage: {
+            score: 0.7,
+            rationale: 'It captured the main compatibility invariant.',
+          },
+          approach_soundness: {
+            score: 0.8,
+            rationale: 'The proposed approach was viable.',
+          },
+          missed_patches: {
+            score: 0.78,
+            rationale: 'Implementation needed only minor follow-up fixes.',
+          },
+          overall: {
+            score: 0.8,
+            rationale: 'Overall the plan was a useful guide.',
+          },
+        },
+      };
+
+      enrichEvalRecord(baseRecord, {});
+
+      expect(baseRecord.stageOutcomes?.plan).toEqual({
+        score: 0.81,
+        rationale: 'The plan covered the right implementation areas.',
+        planCritique: baseRecord.metadata.planCritique,
+      });
+    });
+
     it('should attach all metadata when provided', () => {
       const metadata = {
         agentType: 'codex',
diff --git a/shared/lib/eval-record-builder.ts b/shared/lib/eval-record-builder.ts
index a22a833..5b82fc1 100644
--- a/shared/lib/eval-record-builder.ts
+++ b/shared/lib/eval-record-builder.ts
@@ -15,6 +15,7 @@
 
 import type {
   EvalRecord,
+  PlanCritique,
   TaskContext,
   RepoContext,
   StageOutcomes,
@@ -157,7 +158,8 @@ export function attachWorkflowCostMetadata(
  */
 export function attachStageOutcomes(
   record: EvalRecord,
-  stageScores?: Record<string, StageScore>
+  stageScores?: Record<string, StageScore>,
+  planCritique?: PlanCritique,
 ): void {
   if (!stageScores || Object.keys(stageScores).length === 0) {
     return;
@@ -177,6 +179,7 @@
     stageOutcomes.plan = {
       score: stageScores.plan.score,
       rationale: stageScores.plan.rationale,
+      ...(planCritique && { planCritique }),
     };
   }
 
@@ -345,5 +348,6 @@ export function enrichEvalRecord(record: EvalRecord, metadata: EvalRecordMetadat
   const stageScores = record.metadata?.stageScores as
     | Record<string, StageScore>
     | undefined;
-  attachStageOutcomes(record, stageScores);
+  const planCritique = record.metadata?.planCritique as PlanCritique | undefined;
+  attachStageOutcomes(record, stageScores, planCritique);
 }
diff --git a/shared/lib/eval-schema.test.ts b/shared/lib/eval-schema.test.ts
index 2281d9f..4ab1627 100644
--- a/shared/lib/eval-schema.test.ts
+++ b/shared/lib/eval-schema.test.ts
@@ -1000,7 +1000,7 @@ test('Record with minimal outcomes (only required fields) validates', () => {
 test('Record with fallbackEvent validates and round-trips through JSON serialization', () => {
   const record: EvalRecord = {
     ...scenarios[0].record,
-    schemaVersion: '1.8.0',
+    schemaVersion: '1.9.0',
     fallbackEvent: {
       schema_version: '1.0',
       preferred_model: 'model-a',
diff --git a/shared/lib/eval-schema.ts b/shared/lib/eval-schema.ts
index 9ef1b4a..abcea3e 100644
--- a/shared/lib/eval-schema.ts
+++ b/shared/lib/eval-schema.ts
@@ -27,6 +27,8 @@
  *   fields to track cost budget constraint violations during routing (HOK-1350)
  * - **1.8.0**: Added optional `manifestRef` for per-run resource manifest
  *   attribution (HOK-1378)
+ * - **1.9.0**: Added optional `planCritique` to capture explicit planning
+ *   quality dimensions from the eval judge (HOK-1391)
  *
  * @module eval-schema
  */
@@ -616,6 +618,37 @@ export interface StageScore {
   score: number;
   /** 1-2 sentence attribution rationale */
   rationale: string;
+  /** Detailed plan critique when available for the plan stage */
+  planCritique?: PlanCritique;
+}
+
+/**
+ * A single plan-critique rubric dimension.
+ *
+ * Captures both the normalized score and the judge's rationale so plan
+ * quality can be compared directly across models.
+ */
+export interface PlanCritiqueDimension {
+  /** Quality score 0.0–1.0 for this planning dimension */
+  score: number;
+  /** 1-2 sentence rationale for the score */
+  rationale: string;
+}
+
+/**
+ * Explicit critique of plan quality across the key planning dimensions.
+ */
+export interface PlanCritique {
+  /** Whether the plan chose the right component/service boundaries */
+  component_boundaries: PlanCritiqueDimension;
+  /** Whether the plan surfaced key invariants and constraints */
+  invariant_coverage: PlanCritiqueDimension;
+  /** Whether the proposed approach was viable and correct */
+  approach_soundness: PlanCritiqueDimension;
+  /** Whether implementation had to patch around plan gaps */
+  missed_patches: PlanCritiqueDimension;
+  /** Aggregate plan-quality assessment */
+  overall: PlanCritiqueDimension;
 }
 
 /**
diff --git a/shared/lib/eval.test.js b/shared/lib/eval.test.js
index 0f383bb..7c4877e 100644
--- a/shared/lib/eval.test.js
+++ b/shared/lib/eval.test.js
@@ -26,7 +26,7 @@ describe('evaluateTask', () => {
 
     // Core EvalRecord fields from eval-schema.ts
     assert.ok(result.id, 'should have a UUID id');
-    assert.equal(result.schemaVersion, '1.8.0');
+    assert.equal(result.schemaVersion, '1.9.0');
     assert.equal(result.originalPrompt, 'Add a loading spinner');
     assert.ok(result.modelId);
     assert.ok(result.modelVersion);
@@ -91,6 +91,142 @@ describe('evaluateTask', () => {
     assert.equal(result.scoreBand, 'Assisted Success');
   });
 
+  it('stores planCritique in metadata when the judge returns it', async () => {
+    const validResponse = JSON.stringify({
+      score: 0.88,
+      rationale: 'Implementation succeeded and the plan was mostly strong.',
+      interventionFlags: [],
+      stageScores: {
+        expansion: { score: 0.9, rationale: 'Spec was clear.' },
+        plan: { score: 0.84, rationale: 'Plan mostly identified the right work.' },
+        implementation: { score: 0.9, rationale: 'Code landed cleanly.' },
+        review: { score: 0.85, rationale: 'Review coverage was good.' },
+      },
+      planCritique: {
+        component_boundaries: {
+          score: 0.9,
+          rationale: 'The plan targeted the correct eval prompt, parser, and schema layers.',
+        },
+        invariant_coverage: {
+          score: 0.75,
+          rationale: 'It identified the key optional-field compatibility constraint.',
+        },
+        approach_soundness: {
+          score: 0.85,
+          rationale: 'The approach was viable without adding extra judge calls.',
+        },
+        missed_patches: {
+          score: 0.8,
+          rationale: 'Only minor parser cleanup was needed during implementation.',
+        },
+        overall: {
+          score: 0.82,
+          rationale: 'The plan provided a solid implementation guide.',
+        },
+      },
+    });
+
+    const result = await evaluateTask(
+      {
+        taskPrompt: 'Add plan critique to evals',
+        prReviewOutput: 'Changes are correct',
+        planContent: 'Implement schema, prompt, parser, and tests.',
+      },
+      undefined,
+      { _callFn: mockCallFn(validResponse) }
+    );
+
+    assert.deepEqual(result.metadata.planCritique, {
+      component_boundaries: {
+        score: 0.9,
+        rationale: 'The plan targeted the correct eval prompt, parser, and schema layers.',
+      },
+      invariant_coverage: {
+        score: 0.75,
+        rationale: 'It identified the key optional-field compatibility constraint.',
+      },
+      approach_soundness: {
+        score: 0.85,
+        rationale: 'The approach was viable without adding extra judge calls.',
+      },
+      missed_patches: {
+        score: 0.8,
+        rationale: 'Only minor parser cleanup was needed during implementation.',
+      },
+      overall: {
+        score: 0.82,
+        rationale: 'The plan provided a solid implementation guide.',
+      },
+    });
+    assert.equal(result.metadata.stageScores.plan.score, 0.84);
+  });
+
+  it('omits planCritique when the judge does not return it', async () => {
+    const validResponse = JSON.stringify({
+      score: 0.8,
+      rationale: 'Good execution.',
+      interventionFlags: [],
+      stageScores: {
+        expansion: { score: 0.8, rationale: 'Adequate.' },
+        plan: { score: 0.78, rationale: 'Reasonable inferred planning.' },
+        implementation: { score: 0.82, rationale: 'Correct result.' },
+        review: { score: 0.79, rationale: 'No major misses.' },
+      },
+    });
+
+    const result = await evaluateTask(
+      {
+        taskPrompt: 'Task without saved plan artifact',
+        prReviewOutput: 'Clean',
+      },
+      undefined,
+      { _callFn: mockCallFn(validResponse) }
+    );
+
+    assert.equal('planCritique' in result.metadata, false);
+  });
+
+  it('ignores invalid planCritique dimensions gracefully', async () => {
+    const validResponse = JSON.stringify({
+      score: 0.73,
+      rationale: 'Task completed.',
+      interventionFlags: [],
+      planCritique: {
+        component_boundaries: {
+          score: 1.2,
+          rationale: 'Out of range score should invalidate the object.',
+        },
+        invariant_coverage: {
+          score: 0.7,
+          rationale: 'Valid but should be dropped with the invalid object.',
+        },
+        approach_soundness: {
+          score: 0.75,
+          rationale: 'Valid but incomplete overall object.',
+        },
+        missed_patches: {
+          score: 0.8,
+          rationale: 'Valid but incomplete overall object.',
+        },
+        overall: {
+          score: 0.74,
+          rationale: 'Valid but incomplete overall object.',
+        },
+      },
+    });
+
+    const result = await evaluateTask(
+      {
+        taskPrompt: 'Task with malformed plan critique',
+        prReviewOutput: 'Completed',
+      },
+      undefined,
+      { _callFn: mockCallFn(validResponse) }
+    );
+
+    assert.equal('planCritique' in result.metadata, false);
+  });
+
   it('throws immediately on malformed JSON response', async () => {
     const callFn = mockCallFn('not json at all');
 
diff --git a/shared/lib/eval.ts b/shared/lib/eval.ts
index da863bf..8b94e41 100644
--- a/shared/lib/eval.ts
+++ b/shared/lib/eval.ts
@@ -10,7 +10,15 @@ import { readFile } from "node:fs/promises";
 import { randomUUID } from 'crypto';
 import { fileURLToPath } from "node:url";
 import { dirname, join } from "node:path";
-import { getScoreBand, type EvalRecord, type InterventionRecord, type Outcomes, type RoutingDecision } from './eval-schema.ts';
+import {
+  getScoreBand,
+  type EvalRecord,
+  type InterventionRecord,
+  type Outcomes,
+  type PlanCritique,
+  type PlanCritiqueDimension,
+  type RoutingDecision,
+} from './eval-schema.ts';
 import { callClaude, parseJsonFromLLM } from './llm-cli.ts';
 import { getEvalConfig } from './config.ts';
 import { loadPricingTable } from './workflow-cost.ts';
@@ -25,7 +33,7 @@ const __dirname = dirname(__filename);
 const DEFAULT_MODEL = 'claude-sonnet-4-6';
 const DEFAULT_PROVIDER = 'claude-cli';
 const SUPPORTED_PROVIDERS = ['claude-cli', 'anthropic'] as const;
-const SCHEMA_VERSION = '1.8.0';
+const SCHEMA_VERSION = '1.9.0';
 const MAX_RETRIES = 2;
 const TIMEOUT_MS = 120_000;
 
@@ -91,6 +99,7 @@ interface JudgeResponse {
   rationale: string;
   interventionFlags: string[];
   stageScores?: Record<string, { score: number; rationale: string }>;
+  planCritique?: PlanCritique;
 }
 
 /**
@@ -232,12 +241,78 @@ function computeCost(
   return inputCost + outputCost;
 }
 
+const PLAN_CRITIQUE_DIMENSIONS = [
+  'component_boundaries',
+  'invariant_coverage',
+  'approach_soundness',
+  'missed_patches',
+  'overall',
+] as const satisfies readonly (keyof PlanCritique)[];
+
+function parsePlanCritiqueDimension(
+  value: unknown,
+): PlanCritiqueDimension | undefined {
+  if (!value || typeof value !== 'object') {
+    return undefined;
+  }
+
+  const dimension = value as {
+    score?: number;
+    rationale?: string;
+  };
+
+  if (
+    typeof dimension.score !== 'number' ||
+    dimension.score < 0 ||
+    dimension.score > 1 ||
+    typeof dimension.rationale !== 'string'
+  ) {
+    return undefined;
+  }
+
+  const rationale = dimension.rationale.trim();
+  if (rationale.length === 0) {
+    return undefined;
+  }
+
+  return {
+    score: dimension.score,
+    rationale,
+  };
+}
+
+function parsePlanCritique(value: unknown): PlanCritique | undefined {
+  if (!value || typeof value !== 'object') {
+    return undefined;
+  }
+
+  const rawPlanCritique = value as Partial<Record<keyof PlanCritique, unknown>>;
+  const parsedDimensions = PLAN_CRITIQUE_DIMENSIONS.reduce<
+    Partial<Record<keyof PlanCritique, PlanCritiqueDimension>>
+  >((acc, dimensionName) => {
+    const parsedDimension = parsePlanCritiqueDimension(
+      rawPlanCritique[dimensionName],
+    );
+    if (parsedDimension) {
+      acc[dimensionName] = parsedDimension;
+    }
+    return acc;
+  }, {});
+
+  if (PLAN_CRITIQUE_DIMENSIONS.every((dimensionName) => parsedDimensions[dimensionName])) {
+    return parsedDimensions as PlanCritique;
+  }
+
+  return undefined;
+}
+
 function parseJudgeResponse(raw: string): JudgeResponse {
   const parsed = parseJsonFromLLM(raw) as {
     score?: number;
     rationale?: string;
     interventionFlags?: string[];
     stageScores?: Record<string, { score: number; rationale: string }>;
+    planCritique?: unknown;
   };
 
   if (typeof parsed.score !== 'number' || parsed.score < 0 || parsed.score > 1) {
@@ -275,11 +350,14 @@ function parseJudgeResponse(raw: string): JudgeResponse {
     }
   }
 
+  const planCritique = parsePlanCritique(parsed.planCritique);
+
   return {
     score: parsed.score,
     rationale: parsed.rationale.trim(),
     interventionFlags: parsed.interventionFlags,
     ...(stageScores && { stageScores }),
+    ...(planCritique && { planCritique }),
   };
 }
 
@@ -373,7 +451,7 @@ export async function evaluateTask(
   const response = await callFn(prompt, model);
 
   // Parse response
-  const { score, rationale, interventionFlags, stageScores } = parseJudgeResponse(response.text);
+  const { score, rationale, interventionFlags, stageScores, planCritique } = parseJudgeResponse(response.text);
   const band = getScoreBand(score);
 
   const tokenUsage = response.usage || undefined;
@@ -408,7 +486,12 @@ export async function evaluateTask(
     ...(outcomes && { outcomes }),
     ...(routingDecision && { routingDecision }),
     ...(promptArtifacts.length > 0 && { promptArtifacts }),
-    metadata: { ...metadata, interventionFlags, ...(stageScores && { stageScores }) },
+    metadata: {
+      ...metadata,
+      interventionFlags,
+      ...(stageScores && { stageScores }),
+      ...(planCritique && { planCritique }),
+    },
   };
 
   const activeSessionId = process.env.WAVEMILL_SESSION || (await getLatestSession())?.sessionId;
   attachManifestRef(record, activeSessionId);
diff --git a/shared/lib/llm-cli.test.ts b/shared/lib/llm-cli.test.ts
index e9c283b..b76a072 100644
--- a/shared/lib/llm-cli.test.ts
+++ b/shared/lib/llm-cli.test.ts
@@ -292,7 +292,7 @@ describe('quota fallback', () => {
 
     const records = readFallbackRecords();
     assert.equal(records.length, 1);
-    assert.equal(records[0].schemaVersion, '1.8.0');
+    assert.equal(records[0].schemaVersion, '1.9.0');
     assert.equal(records[0].modelId, 'model-b');
     assert.equal(records[0].score, 1);
     assert.equal(records[0].fallbackEvent?.preferred_model, 'model-a');
diff --git a/shared/lib/llm-cli.ts b/shared/lib/llm-cli.ts
index 760f809..3b47f0d 100644
--- a/shared/lib/llm-cli.ts
+++ b/shared/lib/llm-cli.ts
@@ -180,7 +180,7 @@ const DEFAULT_MAX_RETRIES = 2;
 const SLOW_CALL_WARNING_MS = 30_000;
 const SLOW_CALL_REPEAT_MS = 15_000;
 const FALLBACK_DEFAULT_TASK_TYPE: RegistryTaskType = 'classify';
-const FALLBACK_EVAL_SCHEMA_VERSION = '1.8.0';
+const FALLBACK_EVAL_SCHEMA_VERSION = '1.9.0';
 const FALLBACK_EVENT_SCHEMA_VERSION = '1.0';
 
 const warnedMissingTaskType = new Set();
diff --git a/shared/lib/wavemill-startup-runner.sh b/shared/lib/wavemill-startup-runner.sh
index ec622ab..ef0c1d3 100755
--- a/shared/lib/wavemill-startup-runner.sh
+++ b/shared/lib/wavemill-startup-runner.sh
@@ -110,7 +110,7 @@ EOF
 save_task_state() {
     local issue="$1" slug="$2" branch="$3" worktree="$4" pr="${5:-}" status="${6:-}" agent="${7:-}"
     local linear_issue="${8:-$issue}" challenge="${9:-}" challenge_pair="${10:-}" challenge_role="${11:-}" challenge_model="${12:-}"
-    local planner_model="${13:-}" coder_model="${14:-}" reviewer_model="${15:-}" plan_depth="${16:-}" code_depth="${17:-}" review_mode="${18:-}"
+    local planner_model="${13:-}" coder_model="${14:-}" reviewer_model="${15:-}" plan_depth="${16:-}" code_depth="${17:-}" review_mode="${18:-}" phase="${19:-}"
     local tmp
     tmp=$(mktemp) || return 1
     if jq --arg issue "$issue" --arg slug "$slug" --arg branch "$branch" \
@@ -118,7 +118,7 @@ save_task_state() {
        --arg linearIssue "$linear_issue" --arg challenge "$challenge" --arg challengePair "$challenge_pair" \
        --arg challengeRole "$challenge_role" --arg challengeModel "$challenge_model" \
        --arg plannerModel "$planner_model" --arg coderModel "$coder_model" --arg reviewerModel "$reviewer_model" \
-       --arg planDepth "$plan_depth" --arg codeDepth "$code_depth" --arg reviewMode "$review_mode" \
+       --arg planDepth "$plan_depth" --arg codeDepth "$code_depth" --arg reviewMode "$review_mode" --arg phase "$phase" \
        '.tasks[$issue] = (.tasks[$issue] // {}) + {
            slug: $slug,
            branch: $branch,
@@ -127,7 +127,7 @@ save_task_state() {
            status: $status,
            linearIssueId: $linearIssue,
            updated: (now | todate)
-        }
+        } + (if $phase != "" then {phase: $phase} else {} end)
         | if $agent != "" then .tasks[$issue].agent = $agent else . end
         | if $challenge != "" then .tasks[$issue].challenge = ($challenge == "true") else . end
         | if $challengePair != "" then .tasks[$issue].challengePairId = $challengePair else . end
@@ -165,7 +165,10 @@ set_task_phase_local() {
     local tmp
     tmp=$(mktemp) || return 1
     if jq --arg issue "$issue" --arg phase "$phase" \
-        '.tasks[$issue].phase = $phase | .tasks[$issue].updated = (now | todate)' \
+        '.tasks[$issue] = ((.tasks[$issue] // {}) + {
+            phase: $phase,
+            updated: (now | todate)
+        })' \
         "$STATE_FILE" > "$tmp" 2>/dev/null; then
         mv "$tmp" "$STATE_FILE"
         return 0
@@ -415,13 +418,16 @@ $details_context"
     write_stage_result_local "$feature_dir" "coding" "running" "$task_agent" "${coder_model:-}" "Startup handoff launched coding" || true
     startup_step "[4/7] Writing task artifacts... ✓"
 
+    # Persist launched tasks as active coding work in the initial state write so
+    # downstream startup checks do not depend on a second jq update succeeding.
+    local persisted_phase="coding"
+
     if ! save_task_state "$issue" "$slug" "$branch" "$wt_dir" "" "" "$task_agent" "$linear_issue" "$challenge" "$challenge_pair" "$challenge_role" "$challenge_model" \
-        "$planner_model" "$coder_model" "$reviewer_model" "$plan_depth" "$code_depth" "$review_mode"; then
+        "$planner_model" "$coder_model" "$reviewer_model" "$plan_depth" "$code_depth" "$review_mode" "$persisted_phase"; then
         startup_log "✗ $issue FAILED at step [5/7]: saving workflow state"
         [[ -n "${created_window:-}" ]] && tmux kill-window -t "$SESSION:$win" >/dev/null 2>&1 || true
         return 1
     fi
-    local persisted_phase="coding"
 
     if ! set_task_phase_local "$issue" "$persisted_phase"; then
         remove_task_state "$issue" >/dev/null 2>&1 || true
@@ -451,6 +457,16 @@ $details_context"
             return 1
         fi
     fi
+
+    # Reassert the launched phase after agent dispatch and Linear updates so the
+    # final persisted state reflects active coding work even if a helper touched
+    # workflow-state during startup.
+    if ! set_task_phase_local "$issue" "$persisted_phase"; then
+        [[ -n "${state_written:-}" ]] && remove_task_state "$issue" >/dev/null 2>&1 || true
+        tmux kill-window -t "$SESSION:$win" >/dev/null 2>&1 || true
+        startup_log "✗ $issue FAILED at step [7/7]: finalizing workflow state"
+        return 1
+    fi
     startup_step "[7/7] Setting Linear → In Progress... ✓"
 
     printf '%s\n' "$issue" >> "$LAUNCHED_ISSUES_FILE"
diff --git a/tests/startup-handoff.test.sh b/tests/startup-handoff.test.sh
index 25434ef..b94a235 100644
--- a/tests/startup-handoff.test.sh
+++ b/tests/startup-handoff.test.sh
@@ -12,6 +12,35 @@ FAIL=0
 pass() { echo " PASS $1"; PASS=$((PASS + 1)); }
 fail() { echo " FAIL $1"; FAIL=$((FAIL + 1)); }
 
+dump_file_on_failure() {
+    local label="$1"
+    local path="$2"
+    echo " --- $label: $path ---"
+    if [[ -f "$path" ]]; then
+        sed 's/^/ /' "$path"
+    else
+        echo " (missing)"
+    fi
+    echo " --- end $label ---"
+}
+
+wait_for_jq_match() {
+    local expr="$1"
+    local path="$2"
+    local attempts="${3:-20}"
+    local delay="${4:-0.1}"
+    local i
+
+    for ((i = 1; i <= attempts; i++)); do
+        if jq -e "$expr" "$path" >/dev/null 2>&1; then
+            return 0
+        fi
+        sleep "$delay"
+    done
+
+    return 1
+}
+
 make_mock_bin() {
     local dir="$1"
     mkdir -p "$dir"
@@ -303,10 +332,13 @@ write_plan "$SUCCESS_PLAN" "$TEST_REPO" "$STATE_DIR" "$STATE_FILE" "startup-succ
 SUCCESS_OUTPUT="$TMP_ROOT/success-output.txt"
 bash "$RUNNER_SCRIPT" "$SUCCESS_PLAN" > "$SUCCESS_OUTPUT" 2>&1
 
-if jq -e '.tasks["HOK-1001"].phase == "planning"' "$STATE_FILE" >/dev/null 2>&1; then
+if wait_for_jq_match '.tasks["HOK-1001"].phase == "coding"' "$STATE_FILE"; then
     pass "startup runner writes workflow state only after in-tmux startup succeeds"
 else
     fail "startup runner did not persist workflow state for the launched task"
+    dump_file_on_failure "workflow-state" "$STATE_FILE"
+    dump_file_on_failure "startup-output" "$SUCCESS_OUTPUT"
+    dump_file_on_failure "tmux-log" "$MOCK_TMUX_LOG"
 fi
 
 if grep -q 'HOK-1001|In Progress' "$MOCK_LINEAR_LOG"; then
diff --git a/tools/prompts/eval-judge.md b/tools/prompts/eval-judge.md
index 3afa5c2..cd167ed 100644
--- a/tools/prompts/eval-judge.md
+++ b/tools/prompts/eval-judge.md
@@ -113,6 +113,22 @@ In addition to the overall score, attribute quality to **all four workflow stage
 
 **Key attribution principle**: The stage scores should help identify WHERE in the pipeline quality was lost. If the overall score is 0.7, the stage scores should make it clear whether the spec was the problem (low expansion, higher implementation) or the code was the problem (high expansion, low implementation). Stage scores must sum to a coherent story — they should explain the overall score, not just repeat it.
 
+### Plan Critique
+
+When `Implementation Plan` is available, also produce a `planCritique` object that evaluates the plan directly rather than inferring plan quality only from the downstream diff.
+
+Score each dimension from 0.0 to 1.0 and provide a 1-2 sentence rationale:
+
+- `component_boundaries`: Did the plan identify the correct files, modules, or component boundaries for the work?
+- `invariant_coverage`: Did the plan surface key constraints, assumptions, or invariants the implementation had to respect?
+- `approach_soundness`: Was the proposed implementation approach viable, correct, and appropriately scoped?
+- `missed_patches`: Did the implementation need to patch around plan gaps, omissions, or mistaken assumptions?
+- `overall`: Aggregate assessment of planning quality based on the four dimensions above.
+
+For `missed_patches`, use a high score when the implementation flowed cleanly from the plan and a low score when the implementation had to compensate for planning gaps.
+
+If `Implementation Plan` is "Not available for this workflow.", omit `planCritique` entirely.
+
 ---
 
 ## Output Format
@@ -129,6 +145,13 @@ Respond with **only** a JSON object (no markdown fences, no preamble):
     "plan": { "score": <0.0-1.0>, "rationale": "<1-2 sentences>" },
     "implementation": { "score": <0.0-1.0>, "rationale": "<1-2 sentences>" },
     "review": { "score": <0.0-1.0>, "rationale": "<1-2 sentences>" }
-  }
+  },
+  "planCritique": {
+    "component_boundaries": { "score": <0.0-1.0>, "rationale": "<1-2 sentences>" },
+    "invariant_coverage": { "score": <0.0-1.0>, "rationale": "<1-2 sentences>" },
+    "approach_soundness": { "score": <0.0-1.0>, "rationale": "<1-2 sentences>" },
+    "missed_patches": { "score": <0.0-1.0>, "rationale": "<1-2 sentences>" },
+    "overall": { "score": <0.0-1.0>, "rationale": "<1-2 sentences>" }
+  }
 }
 ```
@@ -137,5 +160,6 @@ Respond with **only** a JSON object (no markdown fences, no preamble):
 - `rationale`: A concise, human-readable explanation justifying the score. **Must reference specific intervention events if any are present.**
 - `interventionFlags`: Array of strings describing notable interventions (empty array if none). Use the format `"type:description"` (e.g., `"review_comment:missing error handling"`, `"post_pr_commit:fixed lint errors"`)
 - `stageScores`: Object with per-stage attribution scores. **Always include all four stages** (expansion, plan, implementation, review). When artifacts are not available for a stage, infer quality from the PR diff, intervention patterns, and overall outcome.
+- `planCritique`: Optional object. **Include it only when an Implementation Plan is available.** Omit it entirely when the plan artifact is not available.
 
 Output ONLY the JSON object. No other text.
diff --git a/tools/prompts/plan-critique.md b/tools/prompts/plan-critique.md
new file mode 100644
index 0000000..bb30132
--- /dev/null
+++ b/tools/prompts/plan-critique.md
@@ -0,0 +1,60 @@
+# Plan Critique
+
+You are evaluating the quality of a software implementation plan. Judge the plan directly rather than inferring quality only from whether the final implementation succeeded.
+
+You will receive:
+
+- The original task prompt
+- The implementation plan
+- Optionally, the final PR diff or review summary for evidence about whether the implementation had to patch around planning gaps
+
+Score the plan on the following dimensions from 0.0 to 1.0. For each dimension, provide a concise 1-2 sentence rationale grounded in the provided materials.
+
+## Dimensions
+
+- `component_boundaries`: Did the plan identify the correct files, modules, systems, or ownership boundaries for the work?
+- `invariant_coverage`: Did the plan identify important constraints, invariants, edge cases, or compatibility requirements that the implementation needed to respect?
+- `approach_soundness`: Was the proposed approach technically viable, correctly scoped, and likely to solve the stated task?
+- `missed_patches`: Did the implementation evidence suggest the plan had gaps that later required patching around, extra fixes, or directional corrections?
+- `overall`: Aggregate quality of the plan based on the dimensions above.
+
+## Scoring Guidance
+
+- `1.0`: Clear, correct, and actionable. The implementation could proceed with little ambiguity or patch-up work.
+- `0.7-0.9`: Mostly strong plan with minor omissions or uncertainty, but still a good guide for implementation.
+- `0.4-0.6`: Mixed quality. Some useful direction, but important boundaries, constraints, or approach details were missing or shaky.
+- `0.0-0.3`: Poor plan. Misidentified the work, missed critical invariants, or proposed an unsound approach.
+
+For `missed_patches`, assign a high score when the implementation evidence shows the plan held up well. Assign a low score when the implementation had to compensate for plan mistakes or omissions.
+
+## Input
+
+### Original Task Prompt
+
+{{TASK_PROMPT}}
+
+### Implementation Plan
+
+{{PLAN_CONTENT}}
+
+### PR Diff / Review Output (optional)
+
+{{PR_DIFF}}
+
+## Output Format
+
+Respond with only a JSON object:
+
+```json
+{
+  "planCritique": {
+    "component_boundaries": { "score": 0.0, "rationale": "" },
+    "invariant_coverage": { "score": 0.0, "rationale": "" },
+    "approach_soundness": { "score": 0.0, "rationale": "" },
+    "missed_patches": { "score": 0.0, "rationale": "" },
+    "overall": { "score": 0.0, "rationale": "" }
+  }
+}
+```
+
+Output only valid JSON. No markdown fences or extra commentary.
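
For reviewers who want to sanity-check the validation behavior this patch describes without applying it, here is a self-contained TypeScript sketch of the `parsePlanCritique` rule: all five dimensions must carry a score in [0, 1] and a non-empty rationale, otherwise the whole critique is dropped. The sketch is not part of the diff; the dimension names are copied from the patch, but the types are simplified stand-ins for the real eval-schema exports.

```typescript
// Standalone sketch of the parsePlanCritique validation rule from the patch above.
// Not part of the diff; types are simplified stand-ins for the real eval-schema exports.
interface PlanCritiqueDimension {
  score: number;
  rationale: string;
}

const DIMENSIONS = [
  'component_boundaries',
  'invariant_coverage',
  'approach_soundness',
  'missed_patches',
  'overall',
] as const;

type PlanCritique = Record<(typeof DIMENSIONS)[number], PlanCritiqueDimension>;

function parsePlanCritique(value: unknown): PlanCritique | undefined {
  if (!value || typeof value !== 'object') return undefined;

  const raw = value as Record<string, { score?: unknown; rationale?: unknown } | undefined>;
  const parsed: Partial<PlanCritique> = {};

  for (const name of DIMENSIONS) {
    const score = raw[name]?.score;
    const rationale = raw[name]?.rationale;
    if (
      typeof score !== 'number' || score < 0 || score > 1 ||
      typeof rationale !== 'string' || rationale.trim().length === 0
    ) {
      return undefined; // one bad or missing dimension invalidates the whole critique
    }
    parsed[name] = { score, rationale: rationale.trim() };
  }

  return parsed as PlanCritique;
}

// Matches the "ignores invalid planCritique dimensions gracefully" test:
// a single out-of-range score discards the entire object.
console.log(parsePlanCritique({
  component_boundaries: { score: 1.2, rationale: 'out of range' },
  invariant_coverage: { score: 0.7, rationale: 'ok' },
  approach_soundness: { score: 0.75, rationale: 'ok' },
  missed_patches: { score: 0.8, rationale: 'ok' },
  overall: { score: 0.74, rationale: 'ok' },
})); // -> undefined
```

The real implementation splits the per-dimension check into `parsePlanCritiqueDimension`, but the observable behavior is the same: a partially valid critique never reaches `record.metadata`.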
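The enrichment half of the change can be pictured the same way. The sketch below is again standalone and assumes a deliberately simplified record shape rather than the real `EvalRecord`; it only illustrates how the patch threads `metadata.planCritique` onto the normalized plan stage outcome.

```typescript
// Standalone sketch of how planCritique flows from metadata into stageOutcomes.plan.
// The record shape here is a simplified stand-in, not the real EvalRecord.
interface PlanStageOutcome {
  score: number;
  rationale: string;
  planCritique?: unknown;
}

interface MiniRecord {
  metadata?: {
    stageScores?: { plan?: { score: number; rationale: string } };
    planCritique?: unknown;
  };
  stageOutcomes?: { plan?: PlanStageOutcome };
}

function enrichEvalRecord(record: MiniRecord): void {
  const plan = record.metadata?.stageScores?.plan;
  if (!plan) return;

  const planCritique = record.metadata?.planCritique;
  record.stageOutcomes = {
    ...record.stageOutcomes,
    plan: {
      score: plan.score,
      rationale: plan.rationale,
      // Attached only when the judge actually returned a critique.
      ...(planCritique ? { planCritique } : {}),
    },
  };
}

const record: MiniRecord = {
  metadata: {
    stageScores: { plan: { score: 0.81, rationale: 'Plan covered the right areas.' } },
    planCritique: { overall: { score: 0.8, rationale: 'Useful guide.' } },
  },
};

enrichEvalRecord(record);
console.log(record.stageOutcomes?.plan);
// -> { score: 0.81, rationale: '...', planCritique: { overall: { ... } } }
```

In the actual patch this wiring is split across `enrichEvalRecord` and `attachStageOutcomes`, and the critique rides along only for the plan stage, matching the eval-record-builder test added above.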