zbigniewsobiecki · zbigniewsobiecki · Apr 11, 2026 · Apr 7, 2026 · Apr 7, 2026 · Apr 7, 2026
diff --git a/.gitignore b/.gitignore
@@ -33,3 +33,11 @@ npm-debug.log*
 
 # CASCADE tooling metadata
 .cascade-progress-comment-id
+
+# Eval harness — per-run artifacts and judge cache (keep .gitkeep)
+evals/results/*
+!evals/results/.gitkeep
+evals/.judge-cache.json
+evals/fixtures/*/node_modules/
+evals/fixtures/*/.squint.db
+evals/fixtures/*/dist/
diff --git a/bin/dev.js b/bin/dev.js
@@ -1,5 +1,6 @@
 #!/usr/bin/env node
 
+import 'dotenv/config';
 import { execute } from '@oclif/core';
 
 await execute({ development: true, dir: import.meta.url });
diff --git a/bin/run.js b/bin/run.js
@@ -1,5 +1,6 @@
 #!/usr/bin/env node
 
+import 'dotenv/config';
 import { execute } from '@oclif/core';
 
 try {

diff --git a/evals/README.md b/evals/README.md
@@ -0,0 +1,60 @@
+# Squint Evaluation Harness
+
+End-to-end evaluation of the squint ingestion pipeline against hand-authored ground truth.
+
+## How it works
+
+1. **Fixture**: a small, real, runnable TypeScript repo at `evals/fixtures/<name>/`
+2. **Ground truth**: typed declarative records at `evals/ground-truth/<name>/` describing what squint *should* produce
+3. **Harness**: shared code at `evals/harness/` that builds, runs, compares, and reports
+4. **Eval test**: `evals/<name>.eval.ts` — a Vitest test that wires it all together
+5. **Baseline**: a committed scoreboard at `evals/baselines/<name>.json` tracking progress per stage
+
+## Running
+
+```bash
+# Run all evals (costs LLM credits!)
+npm run eval
+
+# Run a specific eval
+npm run eval -- todo-api.eval.ts
+
+# Run a specific stage's tests within an eval
+npm run eval -- todo-api.eval.ts -t "parse stage"
+
+# Watch mode for harness development
+npm run eval:watch
+```
+
+## Cost guardrails
+
+- All LLM calls are scoped per-stage via `--from-stage`/`--to-stage`, so the full pipeline is never run by accident
+- Per-run cost budget enforced via `EVAL_COST_BUDGET_USD` (default `0.50`)
+- Prose-judge results cached at `evals/.judge-cache.json` (gitignored)
+
+## Environment variables
+
+| Var | Default | Purpose |
+|---|---|---|
+| `EVAL_JUDGE_MODEL` | `openrouter:anthropic/claude-haiku-4` | LLM used to score prose similarity |
+| `EVAL_COST_BUDGET_USD` | `0.50` | Hard fail if a single run exceeds this |
+| `EVAL_RUNS_PER_STAGE` | `1` | Re-run LLM stages N times to detect non-determinism |
+| `EVAL_KEEP_ALL` | unset | Keep all historical results instead of rotating |
+
+## Iteration plan
+
+The harness is built up one pipeline stage at a time. Each iteration adds exactly one
+LLM stage on top of a known-passing base, so when iteration N fails the bug is in stage N.
+
+The full plan lives in the author's local notes at `/home/zbigniew/.claude/plans/validated-sprouting-mochi.md`, which is outside this repository; the table below summarizes it.
+
+| Iter | Stages | Cost/run |
+|---|---|---|
+| 1 | parse | $0 |
+| 2 | + symbols | ~$0.05 |
+| 3 | + relationships | ~$0.10 |
+| 4 | + modules | ~$0.15 |
+| 5 | + contracts | ~$0.20 |
+| 6 | + interactions | ~$0.25 |
+| 7 | + flows | ~$0.30 |
+| 8 | + features | ~$0.35 |
diff --git a/evals/baselines/bookstore-api.json b/evals/baselines/bookstore-api.json
@@ -0,0 +1,87 @@
+{
+  "fixture": "bookstore-api",
+  "lastRun": "2026-04-11T12:04:05.560Z",
+  "squintCommit": "b8e0f70",
+  "tableScores": {
+    "files": {
+      "passed": true,
+      "expected": 18,
+      "produced": 18,
+      "critical": 0,
+      "major": 0,
+      "minor": 0
+    },
+    "definitions": {
+      "passed": true,
+      "expected": 97,
+      "produced": 97,
+      "critical": 0,
+      "major": 0,
+      "minor": 0
+    },
+    "imports": {
+      "passed": true,
+      "expected": 15,
+      "produced": 15,
+      "critical": 0,
+      "major": 0,
+      "minor": 0
+    },
+    "definition_metadata": {
+      "passed": true,
+      "expected": 95,
+      "produced": 305,
+      "critical": 0,
+      "major": 0,
+      "minor": 0
+    },
+    "relationship_annotations": {
+      "passed": true,
+      "expected": 9,
+      "produced": 89,
+      "critical": 0,
+      "major": 0,
+      "minor": 0
+    },
+    "module_cohesion": {
+      "passed": true,
+      "expected": 11,
+      "produced": 97,
+      "critical": 0,
+      "major": 0,
+      "minor": 0
+    },
+    "contracts": {
+      "passed": true,
+      "expected": 11,
+      "produced": 11,
+      "critical": 0,
+      "major": 0,
+      "minor": 0
+    },
+    "interaction_rubric": {
+      "passed": true,
+      "expected": 5,
+      "produced": 24,
+      "critical": 0,
+      "major": 0,
+      "minor": 1
+    },
+    "flow_rubric": {
+      "passed": true,
+      "expected": 2,
+      "produced": 19,
+      "critical": 0,
+      "major": 0,
+      "minor": 0
+    },
+    "feature_cohesion": {
+      "passed": true,
+      "expected": 2,
+      "produced": 5,
+      "critical": 0,
+      "major": 0,
+      "minor": 0
+    }
+  }
+}
diff --git a/evals/baselines/todo-api.json b/evals/baselines/todo-api.json
@@ -0,0 +1,87 @@
+{
+  "fixture": "todo-api",
+  "lastRun": "2026-04-10T17:44:42.211Z",
+  "squintCommit": "8b7ad46",
+  "tableScores": {
+    "files": {
+      "passed": true,
+      "expected": 14,
+      "produced": 14,
+      "critical": 0,
+      "major": 0,
+      "minor": 0
+    },
+    "definitions": {
+      "passed": true,
+      "expected": 50,
+      "produced": 50,
+      "critical": 0,
+      "major": 0,
+      "minor": 0
+    },
+    "imports": {
+      "passed": true,
+      "expected": 25,
+      "produced": 25,
+      "critical": 0,
+      "major": 0,
+      "minor": 0
+    },
+    "definition_metadata": {
+      "passed": true,
+      "expected": 122,
+      "produced": 161,
+      "critical": 0,
+      "major": 0,
+      "minor": 0
+    },
+    "relationship_annotations": {
+      "passed": true,
+      "expected": 35,
+      "produced": 69,
+      "critical": 0,
+      "major": 0,
+      "minor": 0
+    },
+    "module_cohesion": {
+      "passed": true,
+      "expected": 12,
+      "produced": 50,
+      "critical": 0,
+      "major": 0,
+      "minor": 0
+    },
+    "contracts": {
+      "passed": true,
+      "expected": 11,
+      "produced": 11,
+      "critical": 0,
+      "major": 0,
+      "minor": 0
+    },
+    "interaction_rubric": {
+      "passed": true,
+      "expected": 4,
+      "produced": 25,
+      "critical": 0,
+      "major": 0,
+      "minor": 0
+    },
+    "flow_rubric": {
+      "passed": true,
+      "expected": 2,
+      "produced": 14,
+      "critical": 0,
+      "major": 0,
+      "minor": 0
+    },
+    "feature_cohesion": {
+      "passed": true,
+      "expected": 2,
+      "produced": 4,
+      "critical": 0,
+      "major": 0,
+      "minor": 0
+    }
+  }
+}